Spaces:

mtyrrell
/

chatfed_generator

Sleeping

App Files Files Community

mtyrrell committed on Sep 30, 2025

Commit

d384965

1 Parent(s): 42934a1

further steering for citation format

Browse files

Files changed (1) hide show

utils/generator.py +63 -34

utils/generator.py CHANGED Viewed

@@ -81,10 +81,13 @@ def _extract_sources(processed_results: List[Dict[str, Any]], cited_numbers: Lis
     return cited_sources
 def clean_citations(response: str) -> str:
-    """Normalize all citation formats to [x]"""
-    # Remove References/Sources sections
     ref_patterns = [
         r'\n\s*References?\s*:.*$',
         r'\n\s*Sources?\s*:.*$',
         r'\n\s*Bibliography\s*:.*$',
@@ -92,15 +95,31 @@ def clean_citations(response: str) -> str:
     for pattern in ref_patterns:
         response = re.sub(pattern, '', response, flags=re.IGNORECASE | re.DOTALL)
     # Fix [Document X, Page Y, Year Z] -> [X]
     response = re.sub(
-        r'\[Document\s+(\d+)[^\]]*\]',
         r'[\1]',
         response,
         flags=re.IGNORECASE
     )
-    # Fix [2.2.2] style (section numbers) -> [2]
     response = re.sub(
         r'\[(\d+)\.[\d\.]+\]',
         r'[\1]',
@@ -115,14 +134,28 @@ def clean_citations(response: str) -> str:
         flags=re.IGNORECASE
     )
-    # Fix "Document X states/says" -> [X]
     response = re.sub(
-        r'Document\s+(\d+)\s+(?:states|says|mentions|reports|indicates)',
         r'[\1]',
         response,
         flags=re.IGNORECASE
     )
     return response.strip()
 def _process_context(context: Union[str, List[Dict[str, Any]]]) -> tuple[str, List[Dict[str, Any]]]:
@@ -149,16 +182,11 @@ def _process_context(context: Union[str, List[Dict[str, Any]]]) -> tuple[str, Li
             }
             processed_results.append(doc_info)
-        # Format context string
         context_parts = []
         for i, result in enumerate(processed_results, 1):
-            doc_ref = f"[Document {i}: {result['filename']}"
-            if result['page'] != 'Unknown':
-                doc_ref += f", Page {result['page']}"
-            if result['year'] != 'Unknown':
-                doc_ref += f", Year {result['year']}"
-            doc_ref += "]"
-            context_parts.append(f"{doc_ref}\n{result['answer']}\n")
         formatted_context = "\n".join(context_parts)
@@ -174,7 +202,7 @@ def _process_context(context: Union[str, List[Dict[str, Any]]]) -> tuple[str, Li
 def _build_messages(question: str, context: str) -> list:
     """Build messages for LLM call"""
     system_content = """You are AuditQ&A, an AI Assistant created by Auditors and Data Scientists.
-You are given a question and extracted passages of the consolidated/departmental/thematic focus audit reports.
 Provide a clear and structured answer based on the passages/context provided and the guidelines.
 Guidelines:
@@ -182,34 +210,35 @@ Guidelines:
 - Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
 - If it makes sense, use bullet points and lists to make your answers easier to understand.
 - You do not need to use every passage. Only use the ones that help answer the question.
-- Answer the USER question using only the CONTEXT provided.
-CITATION FORMAT - FOLLOW EXACTLY:
-- Citations MUST be in this format: [1], [2], [3], etc. - ONLY the document number in square brackets.
-- Place citations at the end of relevant sentences.
-- For multiple sources: [1][2].
-CORRECT EXAMPLES:
 ✓ "The budget was UGX.284bn [2]."
 ✓ "Funding was approved by Parliament [1][3]."
-INCORRECT EXAMPLES - NEVER USE THESE:
-✗ [2.2.2] - NO section numbers
-✗ [Document 1, Page 295, Year 2021] - NO page numbers, years, or document names
-✗ (Document 3) - NO parentheses
-✗ "Document 5 states" - NO narrative references
-CRITICAL: Use ONLY [number] format. Never include page numbers, years, document names, or section numbers in citations.
-DO NOT add a "References" section, bibliography, or sources list at the end.
 FOLLOW-UP QUESTIONS:
-- If the context contains related information beyond what you included in your answer, suggest 1 relevant follow-up question the user might want to explore.
-- Base the question on related information you found in the context or natural extensions of the user's query.
-- Format the follow-up question clearly at the end of your response under "You might also want to know:"
-- Keep the follow-up question concise and directly related to the audit reports.
-- If the context is insufficient, say "I don't have sufficient information to answer the question. Please try rephrasing your query."
 """
     user_content = f"### CONTEXT\n{context}\n\n### USER QUESTION\n{question}"
     return [SystemMessage(content=system_content), HumanMessage(content=user_content)]

     return cited_sources
 def clean_citations(response: str) -> str:
+    """Normalize all citation formats to [x] and remove unwanted sections"""
+    # Remove References/Sources/Bibliography sections
     ref_patterns = [
+        r'\n\s*#+\s*References?\s*:?.*$',
+        r'\n\s*#+\s*Sources?\s*:?.*$',
+        r'\n\s*#+\s*Bibliography\s*:?.*$',
         r'\n\s*References?\s*:.*$',
         r'\n\s*Sources?\s*:.*$',
         r'\n\s*Bibliography\s*:.*$',
     for pattern in ref_patterns:
         response = re.sub(pattern, '', response, flags=re.IGNORECASE | re.DOTALL)
+    # Fix (Document X, Page Y, Year Z) -> [X]
+    response = re.sub(
+        r'\(Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?\)',
+        r'[\1]',
+        response,
+        flags=re.IGNORECASE
+    )
     # Fix [Document X, Page Y, Year Z] -> [X]
     response = re.sub(
+        r'\[Document\s+(\d+)(?:[^\]]*)\]',
         r'[\1]',
         response,
         flags=re.IGNORECASE
     )
+    # Fix [Document X: filename, Page Y, Year Z] -> [X]
+    response = re.sub(
+        r'\[Document\s+(\d+):[^\]]+\]',
+        r'[\1]',
+        response,
+        flags=re.IGNORECASE
+    )
+    # Fix [X.Y.Z] style (section numbers) -> [X]
     response = re.sub(
         r'\[(\d+)\.[\d\.]+\]',
         r'[\1]',
         flags=re.IGNORECASE
     )
+    # Fix "Document X, Page Y, Year Z" (no brackets) -> [X]
+    response = re.sub(
+        r'Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?(?=\s|[,.])',
+        r'[\1]',
+        response,
+        flags=re.IGNORECASE
+    )
+    # Fix "Document X states/says/mentions" -> [X]
     response = re.sub(
+        r'Document\s+(\d+)\s+(?:states|says|mentions|reports|indicates|notes|shows)',
         r'[\1]',
         response,
         flags=re.IGNORECASE
     )
+    # Clean up any double citations [[1]] -> [1]
+    response = re.sub(r'\[\[(\d+)\]\]', r'[\1]', response)
+    # Clean up multiple spaces
+    response = re.sub(r'\s+', ' ', response)
     return response.strip()
 def _process_context(context: Union[str, List[Dict[str, Any]]]) -> tuple[str, List[Dict[str, Any]]]:
             }
             processed_results.append(doc_info)
+        # Format context string - SIMPLIFIED TO ONLY USE [1], [2], [3]
         context_parts = []
         for i, result in enumerate(processed_results, 1):
+            # Simple format: [1], [2], etc.
+            context_parts.append(f"[{i}]\n{result['answer']}\n")
         formatted_context = "\n".join(context_parts)
 def _build_messages(question: str, context: str) -> list:
     """Build messages for LLM call"""
     system_content = """You are AuditQ&A, an AI Assistant created by Auditors and Data Scientists.
+You are given a question and extracted passages from consolidated/departmental/thematic focus audit reports.
 Provide a clear and structured answer based on the passages/context provided and the guidelines.
 Guidelines:
 - Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
 - If it makes sense, use bullet points and lists to make your answers easier to understand.
 - You do not need to use every passage. Only use the ones that help answer the question.
+- Answer the USER question using ONLY the CONTEXT provided. Do not add information from outside the context.
+- Stay focused on the user's question. Do not add unrelated sections or topics.
+CRITICAL - CITATION FORMAT:
+Citations MUST be in this exact format: [1], [2], [3], etc.
+- ONLY the number in square brackets
+- Place at the end of relevant sentences
+- For multiple sources: [1][2]
+CORRECT:
 ✓ "The budget was UGX.284bn [2]."
 ✓ "Funding was approved by Parliament [1][3]."
+NEVER USE:
+✗ [Document 1, Page 295, Year 2021]
+✗ (Document 3, Page 23, 2021)
+✗ Document 5, Page 295, 2021
+✗ [2.2.2]
+✗ "Document 5 states"
+DO NOT add a "References", "Sources", or "Bibliography" section at the end.
 FOLLOW-UP QUESTIONS:
+- If the context contains related information beyond what you included, suggest 1 relevant follow-up question.
+- Base the question on information found in the context or natural extensions of the user's query.
+- Format: "You might also want to know:"
+- Keep it concise and directly related to the audit reports.
+If the context is insufficient, say: "I don't have sufficient information to answer the question. Please try rephrasing your query."
 """
     user_content = f"### CONTEXT\n{context}\n\n### USER QUESTION\n{question}"
     return [SystemMessage(content=system_content), HumanMessage(content=user_content)]