Spaces:
Sleeping
Sleeping
File size: 4,418 Bytes
a128e26 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
from app.services.llm.llm_connector import LLMConnector
class NormalizeForm:
    """Rewrite the citation markers of a report into one bracketed style.

    The rewriting itself is delegated to an LLM. This class only assembles
    the system prompt, the user prompt and the output schema, then unwraps
    the structured reply.
    """

    # The annotation is quoted so it is not evaluated when the class is
    # defined; type checkers still resolve it through the module import.
    def __init__(self, llm: "LLMConnector") -> None:
        """Keep a reference to the connector used for the LLM call.

        Args:
            llm: Connector whose awaitable ``call_with_structured_output``
                is invoked with ``prompt``, ``system_prompt`` and ``schema``
                keyword arguments.
        """
        self.llm = llm

    def _system_prompt(self) -> str:
        """Return the fixed system prompt that sets the LLM's role."""
        return """You are a text processing expert specializing in citation formatting. Your sole purpose is to receive a piece of text and correct its citation style according to a strict set of rules, without altering any other part of the content. You must return only the corrected text."""

    def _prompt(self, report: str) -> str:
        """Build the user prompt: the citation rules followed by the text.

        Args:
            report: Text to embed in the prompt between the ``---`` markers.

        Returns:
            The complete prompt string.
        """
        # The prompt body is kept flush-left on purpose: every character
        # inside the literal, including leading whitespace, is sent to the LLM.
        return f"""Please correct the citation formatting in the following text.
==================================================
CRITICAL CITATION RULES (MUST FOLLOW):
1. Single Citation:
- CORRECT: [Source 1]
- CORRECT: [Image 1]
- INCORRECT: [1], (Source 1), Source[1], (1), ...
2. Multiple Citations (SAME TYPE) -> COMBINE:
- CORRECT: [Source 1, 2]
- CORRECT: [Image 1, 3]
- INCORRECT: [Source 1][Source 2]
3. Multiple Citations (DIFFERENT TYPES) -> SEPARATE:
- CORRECT: [Source 1], [Image 2]
- INCORRECT (NEVER DO THIS): [Source 1, Image 2]
- INCORRECT: [Source 1; Image 2]
4. Forbidden Characters:
- Do NOT use parentheses like (Source 1).
- Do NOT use standalone numbers like [1].
5. Enforcement Example:
- "According to [Source 1], the tank is visible in [Image 2, 3]."
- "This is confirmed by [Source 1, 2] and shown in [Image 1]."
6. No Citation Case (STRICT)
- Never invent, assume, or auto-append citations.
- Citations must appear only if they already exist in the original text.
- Adding fake or guessed citations is considered a critical error.
Example (CORRECT):
"The system validates citation formatting."
Example (INCORRECT):
"The system validates citation formatting. [Source 1]"
==================================================
Original Text to correct:
---
{report}
---
Example:
"The data was collected from an official report Source(1)"
"This information is supported by multiple references Source[1, 2, 3]"
CORRECT:
"The data was collected from an official report [Source 1]"
"This information is supported by multiple references [Source 1, 2, 3]"
INCORRECT:
"The data was collected from an official report [1]"
"The data was collected from an official report (Source 1)"
"The data was collected from an official report Source[1]"
"This information is supported by multiple references [Source 1][Source 2]"
"This information is supported by multiple references [Source 1, Source 2]"
Return only the corrected text, with no preamble or explanation.
If no citation exists in the text, do not add any citation.
"""

    def _schema(self) -> dict:
        """Return the JSON schema describing the expected LLM output.

        The schema is an object with a single required string property,
        ``normalized_text``.
        """
        return {
            "type": "object",
            "properties": {
                "normalized_text": {
                    "type": "string",
                    "description": "The text with all citation formats corrected according to the provided rules. If no citations are present, this will be the original text."
                }
            },
            "required": ["normalized_text"]
        }

    async def normalize(self, report: str) -> str:
        """Normalize the citation format of ``report`` through one LLM call.

        Args:
            report: Text whose citations should be reformatted.

        Returns:
            The ``normalized_text`` string from the structured reply. The
            input is returned unchanged when it is empty or not a string
            (no LLM call is made in that case), or when the reply does not
            provide a string under ``normalized_text``.
        """
        # Nothing to normalize: hand empty or non-string input straight back.
        if not report or not isinstance(report, str):
            return report

        structured_response = await self.llm.call_with_structured_output(
            prompt=self._prompt(report),
            system_prompt=self._system_prompt(),
            schema=self._schema(),
        )

        # Look the key up directly instead of testing with ``in``: on a str
        # or list reply ``in`` is a substring/element test, which could pass
        # and then fail on the subscript. Any reply that cannot be indexed
        # by the key (None, str, list, dict without the key) falls back to
        # the untouched report.
        try:
            normalized_text = structured_response["normalized_text"]
        except (KeyError, TypeError):
            return report

        # Honour the ``-> str`` contract even if the reply carries a
        # non-string value (e.g. null) under the expected key.
        return normalized_text if isinstance(normalized_text, str) else report