mtyrrell committed on
Commit
d384965
·
1 Parent(s): 42934a1

further steering for citation format

Browse files
Files changed (1) hide show
  1. utils/generator.py +63 -34
utils/generator.py CHANGED
@@ -81,10 +81,13 @@ def _extract_sources(processed_results: List[Dict[str, Any]], cited_numbers: Lis
81
  return cited_sources
82
 
83
  def clean_citations(response: str) -> str:
84
- """Normalize all citation formats to [x]"""
85
 
86
- # Remove References/Sources sections
87
  ref_patterns = [
 
 
 
88
  r'\n\s*References?\s*:.*$',
89
  r'\n\s*Sources?\s*:.*$',
90
  r'\n\s*Bibliography\s*:.*$',
@@ -92,15 +95,31 @@ def clean_citations(response: str) -> str:
92
  for pattern in ref_patterns:
93
  response = re.sub(pattern, '', response, flags=re.IGNORECASE | re.DOTALL)
94
 
 
 
 
 
 
 
 
 
95
  # Fix [Document X, Page Y, Year Z] -> [X]
96
  response = re.sub(
97
- r'\[Document\s+(\d+)[^\]]*\]',
98
  r'[\1]',
99
  response,
100
  flags=re.IGNORECASE
101
  )
102
 
103
- # Fix [2.2.2] style (section numbers) -> [2]
 
 
 
 
 
 
 
 
104
  response = re.sub(
105
  r'\[(\d+)\.[\d\.]+\]',
106
  r'[\1]',
@@ -115,14 +134,28 @@ def clean_citations(response: str) -> str:
115
  flags=re.IGNORECASE
116
  )
117
 
118
- # Fix "Document X states/says" -> [X]
 
 
 
 
 
 
 
 
119
  response = re.sub(
120
- r'Document\s+(\d+)\s+(?:states|says|mentions|reports|indicates)',
121
  r'[\1]',
122
  response,
123
  flags=re.IGNORECASE
124
  )
125
 
 
 
 
 
 
 
126
  return response.strip()
127
 
128
  def _process_context(context: Union[str, List[Dict[str, Any]]]) -> tuple[str, List[Dict[str, Any]]]:
@@ -149,16 +182,11 @@ def _process_context(context: Union[str, List[Dict[str, Any]]]) -> tuple[str, Li
149
  }
150
  processed_results.append(doc_info)
151
 
152
- # Format context string
153
  context_parts = []
154
  for i, result in enumerate(processed_results, 1):
155
- doc_ref = f"[Document {i}: {result['filename']}"
156
- if result['page'] != 'Unknown':
157
- doc_ref += f", Page {result['page']}"
158
- if result['year'] != 'Unknown':
159
- doc_ref += f", Year {result['year']}"
160
- doc_ref += "]"
161
- context_parts.append(f"{doc_ref}\n{result['answer']}\n")
162
 
163
  formatted_context = "\n".join(context_parts)
164
 
@@ -174,7 +202,7 @@ def _process_context(context: Union[str, List[Dict[str, Any]]]) -> tuple[str, Li
174
  def _build_messages(question: str, context: str) -> list:
175
  """Build messages for LLM call"""
176
  system_content = """You are AuditQ&A, an AI Assistant created by Auditors and Data Scientists.
177
- You are given a question and extracted passages of the consolidated/departmental/thematic focus audit reports.
178
  Provide a clear and structured answer based on the passages/context provided and the guidelines.
179
 
180
  Guidelines:
@@ -182,34 +210,35 @@ Guidelines:
182
  - Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
183
  - If it makes sense, use bullet points and lists to make your answers easier to understand.
184
  - You do not need to use every passage. Only use the ones that help answer the question.
185
- - Answer the USER question using only the CONTEXT provided.
 
186
 
187
- CITATION FORMAT - FOLLOW EXACTLY:
188
- - Citations MUST be in this format: [1], [2], [3], etc. - ONLY the document number in square brackets.
189
- - Place citations at the end of relevant sentences.
190
- - For multiple sources: [1][2].
 
191
 
192
- CORRECT EXAMPLES:
193
  βœ“ "The budget was UGX.284bn [2]."
194
  βœ“ "Funding was approved by Parliament [1][3]."
195
 
196
- INCORRECT EXAMPLES - NEVER USE THESE:
197
- βœ— [2.2.2] - NO section numbers
198
- βœ— [Document 1, Page 295, Year 2021] - NO page numbers, years, or document names
199
- βœ— (Document 3) - NO parentheses
200
- βœ— "Document 5 states" - NO narrative references
201
-
202
- CRITICAL: Use ONLY [number] format. Never include page numbers, years, document names, or section numbers in citations.
203
 
204
- DO NOT add a "References" section, bibliography, or sources list at the end.
205
 
206
  FOLLOW-UP QUESTIONS:
207
- - If the context contains related information beyond what you included in your answer, suggest 1 relevant follow-up question the user might want to explore.
208
- - Base the question on related information you found in the context or natural extensions of the user's query.
209
- - Format the follow-up question clearly at the end of your response under "You might also want to know:"
210
- - Keep the follow-up question concise and directly related to the audit reports.
211
 
212
- - If the context is insufficient, say "I don't have sufficient information to answer the question. Please try rephrasing your query."
213
  """
214
  user_content = f"### CONTEXT\n{context}\n\n### USER QUESTION\n{question}"
215
  return [SystemMessage(content=system_content), HumanMessage(content=user_content)]
 
81
  return cited_sources
82
 
83
  def clean_citations(response: str) -> str:
84
+ """Normalize all citation formats to [x] and remove unwanted sections"""
85
 
86
+ # Remove References/Sources/Bibliography sections
87
  ref_patterns = [
88
+ r'\n\s*#+\s*References?\s*:?.*$',
89
+ r'\n\s*#+\s*Sources?\s*:?.*$',
90
+ r'\n\s*#+\s*Bibliography\s*:?.*$',
91
  r'\n\s*References?\s*:.*$',
92
  r'\n\s*Sources?\s*:.*$',
93
  r'\n\s*Bibliography\s*:.*$',
 
95
  for pattern in ref_patterns:
96
  response = re.sub(pattern, '', response, flags=re.IGNORECASE | re.DOTALL)
97
 
98
+ # Fix (Document X, Page Y, Year Z) -> [X]
99
+ response = re.sub(
100
+ r'\(Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?\)',
101
+ r'[\1]',
102
+ response,
103
+ flags=re.IGNORECASE
104
+ )
105
+
106
  # Fix [Document X, Page Y, Year Z] -> [X]
107
  response = re.sub(
108
+ r'\[Document\s+(\d+)(?:[^\]]*)\]',
109
  r'[\1]',
110
  response,
111
  flags=re.IGNORECASE
112
  )
113
 
114
+ # Fix [Document X: filename, Page Y, Year Z] -> [X]
115
+ response = re.sub(
116
+ r'\[Document\s+(\d+):[^\]]+\]',
117
+ r'[\1]',
118
+ response,
119
+ flags=re.IGNORECASE
120
+ )
121
+
122
+ # Fix [X.Y.Z] style (section numbers) -> [X]
123
  response = re.sub(
124
  r'\[(\d+)\.[\d\.]+\]',
125
  r'[\1]',
 
134
  flags=re.IGNORECASE
135
  )
136
 
137
+ # Fix "Document X, Page Y, Year Z" (no brackets) -> [X]
138
+ response = re.sub(
139
+ r'Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?(?=\s|[,.])',
140
+ r'[\1]',
141
+ response,
142
+ flags=re.IGNORECASE
143
+ )
144
+
145
+ # Fix "Document X states/says/mentions" -> [X]
146
  response = re.sub(
147
+ r'Document\s+(\d+)\s+(?:states|says|mentions|reports|indicates|notes|shows)',
148
  r'[\1]',
149
  response,
150
  flags=re.IGNORECASE
151
  )
152
 
153
+ # Clean up any double citations [[1]] -> [1]
154
+ response = re.sub(r'\[\[(\d+)\]\]', r'[\1]', response)
155
+
156
+ # Clean up multiple spaces
157
+ response = re.sub(r'\s+', ' ', response)
158
+
159
  return response.strip()
160
 
161
  def _process_context(context: Union[str, List[Dict[str, Any]]]) -> tuple[str, List[Dict[str, Any]]]:
 
182
  }
183
  processed_results.append(doc_info)
184
 
185
+ # Format context string - SIMPLIFIED TO ONLY USE [1], [2], [3]
186
  context_parts = []
187
  for i, result in enumerate(processed_results, 1):
188
+ # Simple format: [1], [2], etc.
189
+ context_parts.append(f"[{i}]\n{result['answer']}\n")
 
 
 
 
 
190
 
191
  formatted_context = "\n".join(context_parts)
192
 
 
202
  def _build_messages(question: str, context: str) -> list:
203
  """Build messages for LLM call"""
204
  system_content = """You are AuditQ&A, an AI Assistant created by Auditors and Data Scientists.
205
+ You are given a question and extracted passages from consolidated/departmental/thematic focus audit reports.
206
  Provide a clear and structured answer based on the passages/context provided and the guidelines.
207
 
208
  Guidelines:
 
210
  - Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
211
  - If it makes sense, use bullet points and lists to make your answers easier to understand.
212
  - You do not need to use every passage. Only use the ones that help answer the question.
213
+ - Answer the USER question using ONLY the CONTEXT provided. Do not add information from outside the context.
214
+ - Stay focused on the user's question. Do not add unrelated sections or topics.
215
 
216
+ CRITICAL - CITATION FORMAT:
217
+ Citations MUST be in this exact format: [1], [2], [3], etc.
218
+ - ONLY the number in square brackets
219
+ - Place at the end of relevant sentences
220
+ - For multiple sources: [1][2]
221
 
222
+ CORRECT:
223
  βœ“ "The budget was UGX.284bn [2]."
224
  βœ“ "Funding was approved by Parliament [1][3]."
225
 
226
+ NEVER USE:
227
+ βœ— [Document 1, Page 295, Year 2021]
228
+ βœ— (Document 3, Page 23, 2021)
229
+ βœ— Document 5, Page 295, 2021
230
+ βœ— [2.2.2]
231
+ βœ— "Document 5 states"
 
232
 
233
+ DO NOT add a "References", "Sources", or "Bibliography" section at the end.
234
 
235
  FOLLOW-UP QUESTIONS:
236
+ - If the context contains related information beyond what you included, suggest 1 relevant follow-up question.
237
+ - Base the question on information found in the context or natural extensions of the user's query.
238
+ - Format: "You might also want to know:"
239
+ - Keep it concise and directly related to the audit reports.
240
 
241
+ If the context is insufficient, say: "I don't have sufficient information to answer the question. Please try rephrasing your query."
242
  """
243
  user_content = f"### CONTEXT\n{context}\n\n### USER QUESTION\n{question}"
244
  return [SystemMessage(content=system_content), HumanMessage(content=user_content)]