Spaces:
Sleeping
Sleeping
| from app.services.llm.llm_connector import LLMConnector | |
class NormalizeForm:
    """Normalize citation markers in a report by delegating to an LLM.

    The class builds a system prompt, a user prompt containing the citation
    rules plus the report text, and a JSON schema for the expected reply,
    then asks the injected connector for a structured response.
    """

    # The annotation is quoted so defining the class does not require the
    # connector type to be resolvable at definition time.
    def __init__(self, llm: "LLMConnector") -> None:
        """Store the connector used for the structured-output call.

        Args:
            llm: Object exposing an awaitable
                ``call_with_structured_output(prompt=, system_prompt=, schema=)``.
        """
        self.llm = llm

    def _system_prompt(self) -> str:
        """Return the system message that fixes the LLM's role."""
        return """You are a text processing expert specializing in citation formatting. Your sole purpose is to receive a piece of text and correct its citation style according to a strict set of rules, without altering any other part of the content. You must return only the corrected text."""

    def _prompt(self, report: str) -> str:
        """Return the user prompt: citation rules followed by the report.

        Args:
            report: Text whose citations should be corrected; it is
                interpolated verbatim between the ``---`` delimiters.
        """
        return f"""Please correct the citation formatting in the following text.
==================================================
CRITICAL CITATION RULES (MUST FOLLOW):
1. Single Citation:
- CORRECT: [Source 1]
- CORRECT: [Image 1]
- INCORRECT: [1], (Source 1), Source[1], (1), ...
2. Multiple Citations (SAME TYPE) -> COMBINE:
- CORRECT: [Source 1, 2]
- CORRECT: [Image 1, 3]
- INCORRECT: [Source 1][Source 2]
3. Multiple Citations (DIFFERENT TYPES) -> SEPARATE:
- CORRECT: [Source 1], [Image 2]
- INCORRECT (NEVER DO THIS): [Source 1, Image 2]
- INCORRECT: [Source 1; Image 2]
4. Forbidden Characters:
- Do NOT use parentheses like (Source 1).
- Do NOT use standalone numbers like [1].
5. Enforcement Example:
- "According to [Source 1], the tank is visible in [Image 2, 3]."
- "This is confirmed by [Source 1, 2] and shown in [Image 1]."
6. No Citation Case (STRICT)
- Never invent, assume, or auto-append citations.
- Citations must appear only if they already exist in the original text.
- Adding fake or guessed citations is considered a critical error.
Example (CORRECT):
"The system validates citation formatting."
Example (INCORRECT):
"The system validates citation formatting. [Source 1]"
==================================================
Original Text to correct:
---
{report}
---
Example:
"The data was collected from an official report Source(1)"
"This information is supported by multiple references Source[1, 2, 3]"
CORRECT:
"The data was collected from an official report [Source 1]"
"This information is supported by multiple references [Source 1, 2, 3]"
INCORRECT:
"The data was collected from an official report [1]"
"The data was collected from an official report (Source 1)"
"The data was collected from an official report Source[1]"
"This information is supported by multiple references [Source 1][Source 2]"
"This information is supported by multiple references [Source 1, Source 2]"
Return only the corrected text, with no preamble or explanation.
If no citation exists in the text, do not add any citation.
"""

    def _schema(self) -> dict:
        """Return the JSON schema describing the expected LLM reply.

        The reply is an object with one required string property,
        ``normalized_text``.
        """
        return {
            "type": "object",
            "properties": {
                "normalized_text": {
                    "type": "string",
                    "description": "The text with all citation formats corrected according to the provided rules. If no citations are present, this will be the original text."
                }
            },
            "required": ["normalized_text"]
        }

    async def normalize(self, report: str) -> str:
        """Return ``report`` with its citation markers normalized by the LLM.

        Args:
            report: The text to normalize.

        Returns:
            The ``normalized_text`` value of the LLM reply when it is a
            non-blank string. In every other case (falsy or non-string
            ``report``, a reply without that key, a reply that is not
            subscriptable by key, or a blank/non-string value) ``report`` is
            returned unchanged.

        Raises:
            Whatever ``call_with_structured_output`` raises; errors from the
            connector are deliberately not swallowed here.
        """
        # Nothing to normalize: hand falsy or non-string input straight back
        # without spending an LLM call.
        if not report or not isinstance(report, str):
            return report

        structured_response = await self.llm.call_with_structured_output(
            prompt=self._prompt(report),
            system_prompt=self._system_prompt(),
            schema=self._schema(),
        )

        # EAFP lookup: None, lists, plain strings and objects without
        # item access all fall back to the original text instead of raising.
        try:
            normalized = structured_response["normalized_text"]
        except (KeyError, IndexError, TypeError):
            return report

        # A blank or non-string value would silently replace a non-empty
        # report with nothing, so it is treated as a failed normalization.
        if isinstance(normalized, str) and normalized.strip():
            return normalized
        return report