Spaces:
Sleeping
Sleeping
further cleaning of citations
Browse files- utils/generator.py +62 -40
utils/generator.py
CHANGED
|
@@ -105,30 +105,48 @@ def _extract_sources(processed_results: List[Dict[str, Any]], cited_numbers: Lis
|
|
| 105 |
|
| 106 |
return cited_sources
|
| 107 |
|
| 108 |
-
def
|
| 109 |
-
"""
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
return response
|
| 117 |
-
|
| 118 |
-
def clean_response(response: str) -> str:
|
| 119 |
-
"""Remove unwanted reference sections"""
|
| 120 |
-
# Split by common reference section headers
|
| 121 |
-
patterns = [
|
| 122 |
-
r'\n\s*References?\s*:',
|
| 123 |
-
r'\n\s*Sources?\s*:',
|
| 124 |
-
r'\n\s*Bibliography\s*:',
|
| 125 |
-
r'\n\s*Citations?\s*:',
|
| 126 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
|
|
|
|
|
|
|
|
|
| 132 |
|
| 133 |
return response.strip()
|
| 134 |
|
|
@@ -191,16 +209,24 @@ Guidelines:
|
|
| 191 |
- You do not need to use every passage. Only use the ones that help answer the question.
|
| 192 |
- Answer the USER question using only the CONTEXT provided.
|
| 193 |
|
| 194 |
-
CITATION FORMAT
|
| 195 |
-
-
|
| 196 |
-
- Place citations at the end of sentences
|
| 197 |
-
- For multiple sources
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
- If the context is insufficient, say "I don't have sufficient information to answer the question. Please try rephrasing your query."
|
| 206 |
"""
|
|
@@ -263,10 +289,8 @@ async def generate(query: str, context: Union[str, List[Dict[str, Any]]], chatui
|
|
| 263 |
messages = _build_messages(query, formatted_context)
|
| 264 |
answer = await _call_llm(messages)
|
| 265 |
|
| 266 |
-
#
|
| 267 |
-
answer =
|
| 268 |
-
# Clean response to remove unwanted reference sections
|
| 269 |
-
answer = clean_response(answer)
|
| 270 |
|
| 271 |
if chatui_format:
|
| 272 |
result = {"answer": answer}
|
|
@@ -306,10 +330,8 @@ async def generate_streaming(query: str, context: Union[str, List[Dict[str, Any]
|
|
| 306 |
else:
|
| 307 |
yield chunk
|
| 308 |
|
| 309 |
-
#
|
| 310 |
-
|
| 311 |
-
# Clean response to remove unwanted reference sections
|
| 312 |
-
cleaned_response = clean_response(normalized_response)
|
| 313 |
|
| 314 |
# Send sources at the end if available and in ChatUI format
|
| 315 |
if chatui_format and processed_results:
|
|
|
|
| 105 |
|
| 106 |
return cited_sources
|
| 107 |
|
| 108 |
+
# Trailing "References:" / "Sources:" / "Bibliography:" section: matches from
# the newline before the header through the end of the text (DOTALL lets
# ``.*`` cross line breaks).
_REF_SECTION_RE = re.compile(
    r'\n\s*(?:References?|Sources?|Bibliography)\s*:.*$',
    flags=re.IGNORECASE | re.DOTALL,
)
# Bracket citation that starts with "Document N", e.g.
# [Document 1, Page 295, Year 2021].
_BRACKET_DOC_RE = re.compile(r'\[Document\s+\d+[^\]]*\]', flags=re.IGNORECASE)
# A single "Document N" mention; used to pull every number out of one bracket.
_DOC_NUMBER_RE = re.compile(r'Document\s+(\d+)', flags=re.IGNORECASE)
# Section-number style citation, e.g. [2.2.2].
_SECTION_NUMBER_RE = re.compile(r'\[(\d+)\.[\d\.]+\]')
# Parenthesised citation, e.g. (Document 3).
_PAREN_DOC_RE = re.compile(r'\(Document\s+(\d+)\)', flags=re.IGNORECASE)
# Narrative citation, e.g. "Document 5 states". Word boundaries keep the
# pattern from firing inside a longer word.
_NARRATIVE_DOC_RE = re.compile(
    r'\bDocument\s+(\d+)\s+(?:states|says|mentions|reports|indicates)\b',
    flags=re.IGNORECASE,
)


def _bracket_citations(match: re.Match) -> str:
    """Return one ``[n]`` marker per distinct document number in a bracket citation."""
    # dict.fromkeys de-duplicates while keeping first-seen order.
    numbers = dict.fromkeys(_DOC_NUMBER_RE.findall(match.group(0)))
    return ''.join(f'[{number}]' for number in numbers)


def clean_citations(response: str) -> str:
    """Normalize all citation formats to [x].

    Steps, applied in order:

    1. Drop a trailing "References:", "Sources:" or "Bibliography:" section
       (header through end of text).
    2. ``[Document X, Page Y, Year Z]`` -> ``[X]``. A bracket that names
       several documents yields one marker per document, e.g.
       ``[Document 1, Page 3; Document 3, Page 9]`` -> ``[1][3]``, so no
       cited document is dropped.
    3. ``[2.2.2]`` (section-number style) -> ``[2]``.
    4. ``(Document X)`` -> ``[X]``.
    5. ``Document X states/says/mentions/reports/indicates`` -> ``[X]``.

    Args:
        response: Raw text to clean.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # Remove References/Sources sections
    response = _REF_SECTION_RE.sub('', response)

    # Fix [Document X, Page Y, Year Z] -> [X] (or [X][Y] for several documents)
    response = _BRACKET_DOC_RE.sub(_bracket_citations, response)

    # Fix [2.2.2] style (section numbers) -> [2]
    response = _SECTION_NUMBER_RE.sub(r'[\1]', response)

    # Fix (Document X) -> [X]
    response = _PAREN_DOC_RE.sub(r'[\1]', response)

    # Fix "Document X states/says" -> [X]
    response = _NARRATIVE_DOC_RE.sub(r'[\1]', response)

    return response.strip()
|
| 152 |
|
|
|
|
| 209 |
- You do not need to use every passage. Only use the ones that help answer the question.
|
| 210 |
- Answer the USER question using only the CONTEXT provided.
|
| 211 |
|
| 212 |
+
CITATION FORMAT - FOLLOW EXACTLY:
|
| 213 |
+
- Citations MUST be in this format: [1], [2], [3], etc. - ONLY the document number in square brackets.
|
| 214 |
+
- Place citations at the end of relevant sentences.
|
| 215 |
+
- For multiple sources: [1][2].
|
| 216 |
+
|
| 217 |
+
CORRECT EXAMPLES:
|
| 218 |
+
✓ "The budget was UGX.284bn [2]."
|
| 219 |
+
✓ "Funding was approved by Parliament [1][3]."
|
| 220 |
+
|
| 221 |
+
INCORRECT EXAMPLES - NEVER USE THESE:
|
| 222 |
+
✗ [2.2.2] - NO section numbers
|
| 223 |
+
✗ [Document 1, Page 295, Year 2021] - NO page numbers, years, or document names
|
| 224 |
+
✗ (Document 3) - NO parentheses
|
| 225 |
+
✗ "Document 5 states" - NO narrative references
|
| 226 |
+
|
| 227 |
+
CRITICAL: Use ONLY [number] format. Never include page numbers, years, document names, or section numbers in citations.
|
| 228 |
+
|
| 229 |
+
DO NOT add a "References" section, bibliography, or sources list at the end.
|
| 230 |
|
| 231 |
- If the context is insufficient, say "I don't have sufficient information to answer the question. Please try rephrasing your query."
|
| 232 |
"""
|
|
|
|
| 289 |
messages = _build_messages(query, formatted_context)
|
| 290 |
answer = await _call_llm(messages)
|
| 291 |
|
| 292 |
+
# Clean citations to ensure proper format and remove unwanted sections
|
| 293 |
+
answer = clean_citations(answer)
|
|
|
|
|
|
|
| 294 |
|
| 295 |
if chatui_format:
|
| 296 |
result = {"answer": answer}
|
|
|
|
| 330 |
else:
|
| 331 |
yield chunk
|
| 332 |
|
| 333 |
+
# Clean citations in the complete response
|
| 334 |
+
cleaned_response = clean_citations(accumulated_response)
|
|
|
|
|
|
|
| 335 |
|
| 336 |
# Send sources at the end if available and in ChatUI format
|
| 337 |
if chatui_format and processed_results:
|