Spaces:

tokyotechlab
/

report

Sleeping

File size: 4,418 Bytes

a128e26

from app.services.llm.llm_connector import LLMConnector

class NormalizeForm:
    """Rewrite the citation markers of a report into one canonical format.

    The actual rewriting is delegated to an LLM: this class only builds the
    system prompt, the user prompt and the output schema, sends them through
    the injected connector, and extracts the ``normalized_text`` field from
    the reply.
    """

    def __init__(self, llm: LLMConnector) -> None:
        """Store the connector used for the structured-output call.

        Args:
            llm: Object exposing an awaitable ``call_with_structured_output``
                that accepts ``prompt``, ``system_prompt`` and ``schema``
                keyword arguments.
        """
        self.llm = llm

    def _system_prompt(self) -> str:
        """Return the system message that fixes the model's role.

        The message restricts the model to citation formatting only and asks
        it to leave every other part of the text untouched.
        """
        return """You are a text processing expert specializing in citation formatting. Your sole purpose is to receive a piece of text and correct its citation style according to a strict set of rules, without altering any other part of the content. You must return only the corrected text."""

    def _prompt(self, report: str) -> str:
        """Build the user prompt: citation rules, the report, and examples.

        Args:
            report: Text whose citations should be corrected. It is
                interpolated verbatim between the ``---`` delimiters.

        Returns:
            The full prompt string sent to the model.
        """
        return f"""Please correct the citation formatting in the following text.

        ==================================================
        CRITICAL CITATION RULES (MUST FOLLOW):

        1. Single Citation:
        - CORRECT: [Source 1]
        - CORRECT: [Image 1]
        - INCORRECT: [1], (Source 1), Source[1], (1), ...

        2. Multiple Citations (SAME TYPE) -> COMBINE:
        - CORRECT: [Source 1, 2]
        - CORRECT: [Image 1, 3]
        - INCORRECT: [Source 1][Source 2]

        3. Multiple Citations (DIFFERENT TYPES) -> SEPARATE:
        - CORRECT: [Source 1], [Image 2]
        - INCORRECT (NEVER DO THIS): [Source 1, Image 2]
        - INCORRECT: [Source 1; Image 2]

        4. Forbidden Characters:
        - Do NOT use parentheses like (Source 1).
        - Do NOT use standalone numbers like [1].

        5. Enforcement Example:
        - "According to [Source 1], the tank is visible in [Image 2, 3]."
        - "This is confirmed by [Source 1, 2] and shown in [Image 1]."

        6. No Citation Case (STRICT)

        - Never invent, assume, or auto-append citations.

        - Citations must appear only if they already exist in the original text.

        - Adding fake or guessed citations is considered a critical error.

        Example (CORRECT):

            "The system validates citation formatting."

        Example (INCORRECT):

            "The system validates citation formatting. [Source 1]"
        ==================================================

        Original Text to correct:
        ---
        {report}
        ---

        Example: 

        "The data was collected from an official report Source(1)"
        "This information is supported by multiple references  Source[1, 2, 3]"

        CORRECT:

        "The data was collected from an official report [Source 1]"
        "This information is supported by multiple references [Source 1, 2, 3]"

        INCORRECT:
        "The data was collected from an official report [1]"
        "The data was collected from an official report (Source 1)"
        "The data was collected from an official report Source[1]"
        "This information is supported by multiple references [Source 1][Source 2]"
        "This information is supported by multiple references [Source 1, Source 2]"

        Return only the corrected text, with no preamble or explanation.
        If no citation exists in the text, do not add any citation.

        """

    def _schema(self) -> dict:
        """Return the JSON schema describing the expected model output.

        The schema is an object with a single required string property,
        ``normalized_text``, which carries the corrected report.
        """
        return {
            "type": "object",
            "properties": {
                "normalized_text": {
                    "type": "string",
                    "description": "The text with all citation formats corrected according to the provided rules. If no citations are present, this will be the original text."
                }
            },
            "required": ["normalized_text"]
        }

    async def normalize(self, report: str) -> str:
        """Normalize the citation format of ``report`` with one LLM call.

        Args:
            report: The report text to normalize.

        Returns:
            The ``normalized_text`` value from the model's reply when it is a
            non-blank string. In every other case the input is returned
            unchanged: a falsy or non-string ``report`` (no LLM call is made),
            a reply that cannot be indexed by ``"normalized_text"``, or a
            value that is missing, not a string, or blank.

        Exceptions raised by the connector call itself are not caught here.
        """
        # Nothing to normalize: hand back empty / non-string input as-is.
        if not report or not isinstance(report, str):
            return report

        structured_response = await self.llm.call_with_structured_output(
            prompt=self._prompt(report),
            system_prompt=self._system_prompt(),
            schema=self._schema(),
        )

        # NOTE(review): the reply is presumably a dict matching _schema() --
        # confirm against the connector. EAFP keeps any mapping-like reply
        # working while a None / str / list reply falls back instead of
        # raising (a str reply used to pass the `in` check as a substring
        # match and then fail on indexing).
        try:
            normalized = structured_response["normalized_text"]
        except (KeyError, IndexError, TypeError):
            return report

        # A null, non-string or blank value is a failed normalization, not a
        # result: returning it would silently drop the caller's report.
        if isinstance(normalized, str) and normalized.strip():
            return normalized

        return report