report / app /services /analysis /normalizeForm.py
3v324v23's picture
[D] update prompt
a128e26
from app.services.llm.llm_connector import LLMConnector
class NormalizeForm:
    """Rewrite the citation markers of a report into one canonical form.

    The rewriting itself is delegated to an LLM: this class only builds the
    system prompt, the user prompt and the JSON schema of the expected answer,
    and validates what comes back before handing it to the caller.
    """

    def __init__(self, llm: "LLMConnector") -> None:
        """Store the LLM connector used by :meth:`normalize`.

        Args:
            llm: Connector whose awaitable ``call_with_structured_output``
                accepts ``prompt``, ``system_prompt`` and ``schema`` keyword
                arguments.
        """
        # The annotation is a forward reference so that defining the class
        # does not require ``LLMConnector`` to be evaluated at that moment.
        self.llm = llm

    def _system_prompt(self) -> str:
        """Return the system message that fixes the LLM's role."""
        return """You are a text processing expert specializing in citation formatting. Your sole purpose is to receive a piece of text and correct its citation style according to a strict set of rules, without altering any other part of the content. You must return only the corrected text."""

    def _prompt(self, report: str) -> str:
        """Return the user prompt: the citation rules followed by ``report``.

        Args:
            report: Text whose citations must be normalized. It is inserted
                verbatim between the two ``---`` delimiter lines.
        """
        return f"""Please correct the citation formatting in the following text.
==================================================
CRITICAL CITATION RULES (MUST FOLLOW):
1. Single Citation:
- CORRECT: [Source 1]
- CORRECT: [Image 1]
- INCORRECT: [1], (Source 1), Source[1], (1), ...
2. Multiple Citations (SAME TYPE) -> COMBINE:
- CORRECT: [Source 1, 2]
- CORRECT: [Image 1, 3]
- INCORRECT: [Source 1][Source 2]
3. Multiple Citations (DIFFERENT TYPES) -> SEPARATE:
- CORRECT: [Source 1], [Image 2]
- INCORRECT (NEVER DO THIS): [Source 1, Image 2]
- INCORRECT: [Source 1; Image 2]
4. Forbidden Characters:
- Do NOT use parentheses like (Source 1).
- Do NOT use standalone numbers like [1].
5. Enforcement Example:
- "According to [Source 1], the tank is visible in [Image 2, 3]."
- "This is confirmed by [Source 1, 2] and shown in [Image 1]."
6. No Citation Case (STRICT)
- Never invent, assume, or auto-append citations.
- Citations must appear only if they already exist in the original text.
- Adding fake or guessed citations is considered a critical error.
Example (CORRECT):
"The system validates citation formatting."
Example (INCORRECT):
"The system validates citation formatting. [Source 1]"
==================================================
Original Text to correct:
---
{report}
---
Example:
"The data was collected from an official report Source(1)"
"This information is supported by multiple references Source[1, 2, 3]"
CORRECT:
"The data was collected from an official report [Source 1]"
"This information is supported by multiple references [Source 1, 2, 3]"
INCORRECT:
"The data was collected from an official report [1]"
"The data was collected from an official report (Source 1)"
"The data was collected from an official report Source[1]"
"This information is supported by multiple references [Source 1][Source 2]"
"This information is supported by multiple references [Source 1, Source 2]"
Return only the corrected text, with no preamble or explanation.
If no citation exists in the text, do not add any citation.
"""

    def _schema(self) -> dict:
        """Return the JSON schema of the structured answer requested from the LLM.

        The answer is an object with a single required string property,
        ``normalized_text``.
        """
        return {
            "type": "object",
            "properties": {
                "normalized_text": {
                    "type": "string",
                    "description": "The text with all citation formats corrected according to the provided rules. If no citations are present, this will be the original text."
                }
            },
            "required": ["normalized_text"],
        }

    async def normalize(self, report: str) -> str:
        """Return ``report`` with its citations rewritten by the LLM.

        The original ``report`` is returned unchanged whenever the LLM answer
        cannot be used, so a bad answer never replaces the caller's text.

        Args:
            report: Text to normalize.

        Returns:
            The ``normalized_text`` produced by the LLM when it is a
            non-blank string; otherwise ``report`` itself. Falsy or
            non-string input is returned as-is without calling the LLM.
        """
        # Nothing to normalize: hand falsy or non-string input straight back.
        if not report or not isinstance(report, str):
            return report

        structured_response = await self.llm.call_with_structured_output(
            prompt=self._prompt(report),
            system_prompt=self._system_prompt(),
            schema=self._schema(),
        )
        if not structured_response:
            return report

        # EAFP lookup: a reply that is not subscriptable by key (e.g. a raw
        # string or a list) or that lacks the field falls back to the input
        # instead of raising.
        try:
            normalized = structured_response["normalized_text"]
        except (KeyError, IndexError, TypeError):
            return report

        # A null, non-string or blank value cannot be the normalization of a
        # non-empty report; returning it would break the ``str`` contract or
        # silently discard the caller's text.
        if isinstance(normalized, str) and normalized.strip():
            return normalized
        return report