mtyrrell committed on
Commit
b85478b
·
1 Parent(s): 7c59563

further cleaning of citations

Browse files
Files changed (1) hide show
  1. utils/generator.py +62 -40
utils/generator.py CHANGED
@@ -105,30 +105,48 @@ def _extract_sources(processed_results: List[Dict[str, Any]], cited_numbers: Lis
105
 
106
  return cited_sources
107
 
108
- def normalize_citations(response: str) -> str:
109
- """Convert non-compliant citation formats to [x] format"""
110
- # Convert (Document X) to [X]
111
- response = re.sub(r'\(Document\s+(\d+)\)', r'[\1]', response, flags=re.IGNORECASE)
112
- # Convert (Doc X) to [X]
113
- response = re.sub(r'\(Doc\s+(\d+)\)', r'[\1]', response, flags=re.IGNORECASE)
114
- # Convert "Document X says" to [X]
115
- response = re.sub(r'Document\s+(\d+)\s+(?:says|states|mentions)', r'[\1]', response, flags=re.IGNORECASE)
116
- return response
117
-
118
- def clean_response(response: str) -> str:
119
- """Remove unwanted reference sections"""
120
- # Split by common reference section headers
121
- patterns = [
122
- r'\n\s*References?\s*:',
123
- r'\n\s*Sources?\s*:',
124
- r'\n\s*Bibliography\s*:',
125
- r'\n\s*Citations?\s*:',
126
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
- for pattern in patterns:
129
- if re.search(pattern, response, re.IGNORECASE):
130
- response = re.split(pattern, response, flags=re.IGNORECASE)[0]
131
- break
 
 
 
132
 
133
  return response.strip()
134
 
@@ -191,16 +209,24 @@ Guidelines:
191
  - You do not need to use every passage. Only use the ones that help answer the question.
192
  - Answer the USER question using only the CONTEXT provided.
193
 
194
- CITATION FORMAT (CRITICAL):
195
- - ALWAYS use inline citations in square brackets like [1], [2], etc. to reference document numbers.
196
- - Place citations at the end of sentences or claims: "The audit found compliance issues [1]."
197
- - For multiple sources, use [1][2].
198
- - CORRECT: "Revenue increased by 15% [3]."
199
- - INCORRECT: "(Document 3)", "(Doc 3)", "Document 3 states", "according to document 3"
200
- - NEVER use phrases like "Doc x says" or "(Document x)" - ONLY use [x] format.
201
- - DO NOT add a "References" section at the end of your response.
202
- - DO NOT list out the full document names, page numbers, or years at the end.
203
- - Your response should END after your answer - no bibliography, no references list, no sources section.
 
 
 
 
 
 
 
 
204
 
205
  - If the context is insufficient, say "I don't have sufficient information to answer the question. Please try rephrasing your query."
206
  """
@@ -263,10 +289,8 @@ async def generate(query: str, context: Union[str, List[Dict[str, Any]]], chatui
263
  messages = _build_messages(query, formatted_context)
264
  answer = await _call_llm(messages)
265
 
266
- # Normalize citations to ensure proper format
267
- answer = normalize_citations(answer)
268
- # Clean response to remove unwanted reference sections
269
- answer = clean_response(answer)
270
 
271
  if chatui_format:
272
  result = {"answer": answer}
@@ -306,10 +330,8 @@ async def generate_streaming(query: str, context: Union[str, List[Dict[str, Any]
306
  else:
307
  yield chunk
308
 
309
- # Normalize citations in the complete response
310
- normalized_response = normalize_citations(accumulated_response)
311
- # Clean response to remove unwanted reference sections
312
- cleaned_response = clean_response(normalized_response)
313
 
314
  # Send sources at the end if available and in ChatUI format
315
  if chatui_format and processed_results:
 
105
 
106
  return cited_sources
107
 
108
+ def clean_citations(response: str) -> str:
109
+ """Normalize all citation formats to [x]"""
110
+
111
+ # Remove References/Sources sections
112
+ ref_patterns = [
113
+ r'\n\s*References?\s*:.*$',
114
+ r'\n\s*Sources?\s*:.*$',
115
+ r'\n\s*Bibliography\s*:.*$',
 
 
 
 
 
 
 
 
 
 
116
  ]
117
+ for pattern in ref_patterns:
118
+ response = re.sub(pattern, '', response, flags=re.IGNORECASE | re.DOTALL)
119
+
120
+ # Fix [Document X, Page Y, Year Z] -> [X]
121
+ response = re.sub(
122
+ r'\[Document\s+(\d+)[^\]]*\]',
123
+ r'[\1]',
124
+ response,
125
+ flags=re.IGNORECASE
126
+ )
127
+
128
+ # Fix [2.2.2] style (section numbers) -> [2]
129
+ response = re.sub(
130
+ r'\[(\d+)\.[\d\.]+\]',
131
+ r'[\1]',
132
+ response
133
+ )
134
+
135
+ # Fix (Document X) -> [X]
136
+ response = re.sub(
137
+ r'\(Document\s+(\d+)\)',
138
+ r'[\1]',
139
+ response,
140
+ flags=re.IGNORECASE
141
+ )
142
 
143
+ # Fix "Document X states/says" -> [X]
144
+ response = re.sub(
145
+ r'Document\s+(\d+)\s+(?:states|says|mentions|reports|indicates)',
146
+ r'[\1]',
147
+ response,
148
+ flags=re.IGNORECASE
149
+ )
150
 
151
  return response.strip()
152
 
 
209
  - You do not need to use every passage. Only use the ones that help answer the question.
210
  - Answer the USER question using only the CONTEXT provided.
211
 
212
+ CITATION FORMAT - FOLLOW EXACTLY:
213
+ - Citations MUST be in this format: [1], [2], [3], etc. - ONLY the document number in square brackets.
214
+ - Place citations at the end of relevant sentences.
215
+ - For multiple sources: [1][2].
216
+
217
+ CORRECT EXAMPLES:
218
+ "The budget was UGX.284bn [2]."
219
+ "Funding was approved by Parliament [1][3]."
220
+
221
+ INCORRECT EXAMPLES - NEVER USE THESE:
222
+ ✗ [2.2.2] - NO section numbers
223
+ ✗ [Document 1, Page 295, Year 2021] - NO page numbers, years, or document names
224
+ ✗ (Document 3) - NO parentheses
225
+ ✗ "Document 5 states" - NO narrative references
226
+
227
+ CRITICAL: Use ONLY [number] format. Never include page numbers, years, document names, or section numbers in citations.
228
+
229
+ DO NOT add a "References" section, bibliography, or sources list at the end.
230
 
231
  - If the context is insufficient, say "I don't have sufficient information to answer the question. Please try rephrasing your query."
232
  """
 
289
  messages = _build_messages(query, formatted_context)
290
  answer = await _call_llm(messages)
291
 
292
+ # Clean citations to ensure proper format and remove unwanted sections
293
+ answer = clean_citations(answer)
 
 
294
 
295
  if chatui_format:
296
  result = {"answer": answer}
 
330
  else:
331
  yield chunk
332
 
333
+ # Clean citations in the complete response
334
+ cleaned_response = clean_citations(accumulated_response)
 
 
335
 
336
  # Send sources at the end if available and in ChatUI format
337
  if chatui_format and processed_results: