from app.services.llm.llm_connector import LLMConnector


class NormalizeForm:
    """Normalize citation markers in a report by delegating to an LLM.

    The class builds a system prompt, a user prompt containing the citation
    rules and the report text, and a JSON schema for the reply, then asks the
    injected connector for a structured response.
    """

    def __init__(self, llm: LLMConnector) -> None:
        """Store the connector used for the structured-output call.

        Args:
            llm: Connector exposing an awaitable
                ``call_with_structured_output(prompt=..., system_prompt=...,
                schema=...)``.
        """
        self.llm = llm

    def _system_prompt(self) -> str:
        """Return the system prompt that defines the LLM's role."""
        # Adjacent literals are concatenated into one single-line string.
        return (
            "You are a text processing expert specializing in citation "
            "formatting. Your sole purpose is to receive a piece of text and "
            "correct its citation style according to a strict set of rules, "
            "without altering any other part of the content. You must return "
            "only the corrected text."
        )

    def _prompt(self, report: str) -> str:
        """Build the user prompt: citation rules, the report, and examples.

        Args:
            report: Text to normalize; interpolated verbatim between the
                ``---`` delimiter lines.

        Returns:
            The complete user prompt.
        """
        # The literal starts at column 0 on purpose so that no source-code
        # indentation leaks into the prompt sent to the model.
        return f"""Please correct the citation formatting in the following text.

==================================================
CRITICAL CITATION RULES (MUST FOLLOW):

1. Single Citation:
   - CORRECT: [Source 1]
   - CORRECT: [Image 1]
   - INCORRECT: [1], (Source 1), Source[1], (1), ...

2. Multiple Citations (SAME TYPE) -> COMBINE:
   - CORRECT: [Source 1, 2]
   - CORRECT: [Image 1, 3]
   - INCORRECT: [Source 1][Source 2]

3. Multiple Citations (DIFFERENT TYPES) -> SEPARATE:
   - CORRECT: [Source 1], [Image 2]
   - INCORRECT (NEVER DO THIS): [Source 1, Image 2]
   - INCORRECT: [Source 1; Image 2]

4. Forbidden Characters:
   - Do NOT use parentheses like (Source 1).
   - Do NOT use standalone numbers like [1].

5. Enforcement Example:
   - "According to [Source 1], the tank is visible in [Image 2, 3]."
   - "This is confirmed by [Source 1, 2] and shown in [Image 1]."

6. No Citation Case (STRICT)
   - Never invent, assume, or auto-append citations.
   - Citations must appear only if they already exist in the original text.
   - Adding fake or guessed citations is considered a critical error.

   Example (CORRECT):
   "The system validates citation formatting."

   Example (INCORRECT):
   "The system validates citation formatting.
   [Source 1]"
==================================================

Original Text to correct:
---
{report}
---

Example:
"The data was collected from an official report Source(1)"
"This information is supported by multiple references Source[1, 2, 3]"

CORRECT:
"The data was collected from an official report [Source 1]"
"This information is supported by multiple references [Source 1, 2, 3]"

INCORRECT:
"The data was collected from an official report [1]"
"The data was collected from an official report (Source 1)"
"The data was collected from an official report Source[1]"
"This information is supported by multiple references [Source 1][Source 2]"
"This information is supported by multiple references [Source 1, Source 2]"

Return only the corrected text, with no preamble or explanation.
If no citation exists in the text, do not add any citation.
"""

    def _schema(self) -> dict:
        """Return the JSON schema describing the expected LLM output.

        A new dict is built on every call, so callers may mutate the result
        without affecting later calls.
        """
        return {
            "type": "object",
            "properties": {
                "normalized_text": {
                    "type": "string",
                    "description": "The text with all citation formats corrected according to the provided rules. If no citations are present, this will be the original text.",
                },
            },
            "required": ["normalized_text"],
        }

    async def normalize(self, report: str) -> str:
        """Normalize the citation format of ``report`` with one LLM call.

        Args:
            report: Text whose citations should be normalized.

        Returns:
            The normalized text from the LLM. ``report`` is returned
            unchanged when it is empty or not a string, or when the response
            does not carry a usable ``normalized_text`` string.

        Note:
            Exceptions raised by the connector call itself are not caught
            here and propagate to the caller.
        """
        # Nothing to normalize: hand back falsy or non-string input as-is.
        if not report or not isinstance(report, str):
            return report

        structured_response = await self.llm.call_with_structured_output(
            prompt=self._prompt(report),
            system_prompt=self._system_prompt(),
            schema=self._schema(),
        )

        # EAFP lookup: a missing key, a None response, or a non-mapping
        # response (e.g. a raw string) all fall back to the original text
        # instead of raising.
        try:
            normalized = structured_response["normalized_text"]
        except (KeyError, IndexError, TypeError):
            return report

        # Only accept a non-blank string; anything else would break the
        # ``-> str`` contract or silently discard the report content.
        if isinstance(normalized, str) and normalized.strip():
            return normalized
        return report