Spaces:
Sleeping
Sleeping
further steering for citation format
Browse files- utils/generator.py +63 -34
utils/generator.py
CHANGED
|
@@ -81,10 +81,13 @@ def _extract_sources(processed_results: List[Dict[str, Any]], cited_numbers: Lis
|
|
| 81 |
return cited_sources
|
| 82 |
|
| 83 |
def clean_citations(response: str) -> str:
|
| 84 |
-
"""Normalize all citation formats to [x]"""
|
| 85 |
|
| 86 |
-
# Remove References/Sources sections
|
| 87 |
ref_patterns = [
|
|
|
|
|
|
|
|
|
|
| 88 |
r'\n\s*References?\s*:.*$',
|
| 89 |
r'\n\s*Sources?\s*:.*$',
|
| 90 |
r'\n\s*Bibliography\s*:.*$',
|
|
@@ -92,15 +95,31 @@ def clean_citations(response: str) -> str:
|
|
| 92 |
for pattern in ref_patterns:
|
| 93 |
response = re.sub(pattern, '', response, flags=re.IGNORECASE | re.DOTALL)
|
| 94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
# Fix [Document X, Page Y, Year Z] -> [X]
|
| 96 |
response = re.sub(
|
| 97 |
-
r'\[Document\s+(\d+)[^\]]*\]',
|
| 98 |
r'[\1]',
|
| 99 |
response,
|
| 100 |
flags=re.IGNORECASE
|
| 101 |
)
|
| 102 |
|
| 103 |
-
# Fix [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
response = re.sub(
|
| 105 |
r'\[(\d+)\.[\d\.]+\]',
|
| 106 |
r'[\1]',
|
|
@@ -115,14 +134,28 @@ def clean_citations(response: str) -> str:
|
|
| 115 |
flags=re.IGNORECASE
|
| 116 |
)
|
| 117 |
|
| 118 |
-
# Fix "Document X
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
response = re.sub(
|
| 120 |
-
r'Document\s+(\d+)\s+(?:states|says|mentions|reports|indicates)',
|
| 121 |
r'[\1]',
|
| 122 |
response,
|
| 123 |
flags=re.IGNORECASE
|
| 124 |
)
|
| 125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
return response.strip()
|
| 127 |
|
| 128 |
def _process_context(context: Union[str, List[Dict[str, Any]]]) -> tuple[str, List[Dict[str, Any]]]:
|
|
@@ -149,16 +182,11 @@ def _process_context(context: Union[str, List[Dict[str, Any]]]) -> tuple[str, Li
|
|
| 149 |
}
|
| 150 |
processed_results.append(doc_info)
|
| 151 |
|
| 152 |
-
# Format context string
|
| 153 |
context_parts = []
|
| 154 |
for i, result in enumerate(processed_results, 1):
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
doc_ref += f", Page {result['page']}"
|
| 158 |
-
if result['year'] != 'Unknown':
|
| 159 |
-
doc_ref += f", Year {result['year']}"
|
| 160 |
-
doc_ref += "]"
|
| 161 |
-
context_parts.append(f"{doc_ref}\n{result['answer']}\n")
|
| 162 |
|
| 163 |
formatted_context = "\n".join(context_parts)
|
| 164 |
|
|
@@ -174,7 +202,7 @@ def _process_context(context: Union[str, List[Dict[str, Any]]]) -> tuple[str, Li
|
|
| 174 |
def _build_messages(question: str, context: str) -> list:
|
| 175 |
"""Build messages for LLM call"""
|
| 176 |
system_content = """You are AuditQ&A, an AI Assistant created by Auditors and Data Scientists.
|
| 177 |
-
You are given a question and extracted passages
|
| 178 |
Provide a clear and structured answer based on the passages/context provided and the guidelines.
|
| 179 |
|
| 180 |
Guidelines:
|
|
@@ -182,34 +210,35 @@ Guidelines:
|
|
| 182 |
- Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
|
| 183 |
- If it makes sense, use bullet points and lists to make your answers easier to understand.
|
| 184 |
- You do not need to use every passage. Only use the ones that help answer the question.
|
| 185 |
-
- Answer the USER question using
|
|
|
|
| 186 |
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
-
|
| 190 |
-
-
|
|
|
|
| 191 |
|
| 192 |
-
CORRECT
|
| 193 |
✅ "The budget was UGX.284bn [2]."
|
| 194 |
✅ "Funding was approved by Parliament [1][3]."
|
| 195 |
|
| 196 |
-
|
| 197 |
-
β [
|
| 198 |
-
β
|
| 199 |
-
β
|
| 200 |
-
β
|
| 201 |
-
|
| 202 |
-
CRITICAL: Use ONLY [number] format. Never include page numbers, years, document names, or section numbers in citations.
|
| 203 |
|
| 204 |
-
DO NOT add a "References"
|
| 205 |
|
| 206 |
FOLLOW-UP QUESTIONS:
|
| 207 |
-
- If the context contains related information beyond what you included
|
| 208 |
-
- Base the question on
|
| 209 |
-
- Format
|
| 210 |
-
- Keep
|
| 211 |
|
| 212 |
-
|
| 213 |
"""
|
| 214 |
user_content = f"### CONTEXT\n{context}\n\n### USER QUESTION\n{question}"
|
| 215 |
return [SystemMessage(content=system_content), HumanMessage(content=user_content)]
|
|
|
|
| 81 |
return cited_sources
|
| 82 |
|
| 83 |
def clean_citations(response: str) -> str:
|
| 84 |
+
"""Normalize all citation formats to [x] and remove unwanted sections"""
|
| 85 |
|
| 86 |
+
# Remove References/Sources/Bibliography sections
|
| 87 |
ref_patterns = [
|
| 88 |
+
r'\n\s*#+\s*References?\s*:?.*$',
|
| 89 |
+
r'\n\s*#+\s*Sources?\s*:?.*$',
|
| 90 |
+
r'\n\s*#+\s*Bibliography\s*:?.*$',
|
| 91 |
r'\n\s*References?\s*:.*$',
|
| 92 |
r'\n\s*Sources?\s*:.*$',
|
| 93 |
r'\n\s*Bibliography\s*:.*$',
|
|
|
|
| 95 |
for pattern in ref_patterns:
|
| 96 |
response = re.sub(pattern, '', response, flags=re.IGNORECASE | re.DOTALL)
|
| 97 |
|
| 98 |
+
# Fix (Document X, Page Y, Year Z) -> [X]
|
| 99 |
+
response = re.sub(
|
| 100 |
+
r'\(Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?\)',
|
| 101 |
+
r'[\1]',
|
| 102 |
+
response,
|
| 103 |
+
flags=re.IGNORECASE
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
# Fix [Document X, Page Y, Year Z] -> [X]
|
| 107 |
response = re.sub(
|
| 108 |
+
r'\[Document\s+(\d+)(?:[^\]]*)\]',
|
| 109 |
r'[\1]',
|
| 110 |
response,
|
| 111 |
flags=re.IGNORECASE
|
| 112 |
)
|
| 113 |
|
| 114 |
+
# Fix [Document X: filename, Page Y, Year Z] -> [X]
|
| 115 |
+
response = re.sub(
|
| 116 |
+
r'\[Document\s+(\d+):[^\]]+\]',
|
| 117 |
+
r'[\1]',
|
| 118 |
+
response,
|
| 119 |
+
flags=re.IGNORECASE
|
| 120 |
+
)
|
| 121 |
+
|
| 122 |
+
# Fix [X.Y.Z] style (section numbers) -> [X]
|
| 123 |
response = re.sub(
|
| 124 |
r'\[(\d+)\.[\d\.]+\]',
|
| 125 |
r'[\1]',
|
|
|
|
| 134 |
flags=re.IGNORECASE
|
| 135 |
)
|
| 136 |
|
| 137 |
+
# Fix "Document X, Page Y, Year Z" (no brackets) -> [X]
|
| 138 |
+
response = re.sub(
|
| 139 |
+
r'Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?(?=\s|[,.])',
|
| 140 |
+
r'[\1]',
|
| 141 |
+
response,
|
| 142 |
+
flags=re.IGNORECASE
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
# Fix "Document X states/says/mentions" -> [X]
|
| 146 |
response = re.sub(
|
| 147 |
+
r'Document\s+(\d+)\s+(?:states|says|mentions|reports|indicates|notes|shows)',
|
| 148 |
r'[\1]',
|
| 149 |
response,
|
| 150 |
flags=re.IGNORECASE
|
| 151 |
)
|
| 152 |
|
| 153 |
+
# Clean up any double citations [[1]] -> [1]
|
| 154 |
+
response = re.sub(r'\[\[(\d+)\]\]', r'[\1]', response)
|
| 155 |
+
|
| 156 |
+
# Clean up multiple spaces
|
| 157 |
+
response = re.sub(r'\s+', ' ', response)
|
| 158 |
+
|
| 159 |
return response.strip()
|
| 160 |
|
| 161 |
def _process_context(context: Union[str, List[Dict[str, Any]]]) -> tuple[str, List[Dict[str, Any]]]:
|
|
|
|
| 182 |
}
|
| 183 |
processed_results.append(doc_info)
|
| 184 |
|
| 185 |
+
# Format context string - SIMPLIFIED TO ONLY USE [1], [2], [3]
|
| 186 |
context_parts = []
|
| 187 |
for i, result in enumerate(processed_results, 1):
|
| 188 |
+
# Simple format: [1], [2], etc.
|
| 189 |
+
context_parts.append(f"[{i}]\n{result['answer']}\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
|
| 191 |
formatted_context = "\n".join(context_parts)
|
| 192 |
|
|
|
|
| 202 |
def _build_messages(question: str, context: str) -> list:
|
| 203 |
"""Build messages for LLM call"""
|
| 204 |
system_content = """You are AuditQ&A, an AI Assistant created by Auditors and Data Scientists.
|
| 205 |
+
You are given a question and extracted passages from consolidated/departmental/thematic focus audit reports.
|
| 206 |
Provide a clear and structured answer based on the passages/context provided and the guidelines.
|
| 207 |
|
| 208 |
Guidelines:
|
|
|
|
| 210 |
- Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
|
| 211 |
- If it makes sense, use bullet points and lists to make your answers easier to understand.
|
| 212 |
- You do not need to use every passage. Only use the ones that help answer the question.
|
| 213 |
+
- Answer the USER question using ONLY the CONTEXT provided. Do not add information from outside the context.
|
| 214 |
+
- Stay focused on the user's question. Do not add unrelated sections or topics.
|
| 215 |
|
| 216 |
+
CRITICAL - CITATION FORMAT:
|
| 217 |
+
Citations MUST be in this exact format: [1], [2], [3], etc.
|
| 218 |
+
- ONLY the number in square brackets
|
| 219 |
+
- Place at the end of relevant sentences
|
| 220 |
+
- For multiple sources: [1][2]
|
| 221 |
|
| 222 |
+
CORRECT:
|
| 223 |
✅ "The budget was UGX.284bn [2]."
|
| 224 |
✅ "Funding was approved by Parliament [1][3]."
|
| 225 |
|
| 226 |
+
NEVER USE:
|
| 227 |
+
❌ [Document 1, Page 295, Year 2021]
|
| 228 |
+
❌ (Document 3, Page 23, 2021)
|
| 229 |
+
❌ Document 5, Page 295, 2021
|
| 230 |
+
❌ [2.2.2]
|
| 231 |
+
❌ "Document 5 states"
|
|
|
|
| 232 |
|
| 233 |
+
DO NOT add a "References", "Sources", or "Bibliography" section at the end.
|
| 234 |
|
| 235 |
FOLLOW-UP QUESTIONS:
|
| 236 |
+
- If the context contains related information beyond what you included, suggest 1 relevant follow-up question.
|
| 237 |
+
- Base the question on information found in the context or natural extensions of the user's query.
|
| 238 |
+
- Format: "You might also want to know:"
|
| 239 |
+
- Keep it concise and directly related to the audit reports.
|
| 240 |
|
| 241 |
+
If the context is insufficient, say: "I don't have sufficient information to answer the question. Please try rephrasing your query."
|
| 242 |
"""
|
| 243 |
user_content = f"### CONTEXT\n{context}\n\n### USER QUESTION\n{question}"
|
| 244 |
return [SystemMessage(content=system_content), HumanMessage(content=user_content)]
|