Spaces:

mtyrrell
/

chatfed_generator

Sleeping

App Files Files Community

mtyrrell committed on Sep 29, 2025

Commit

b85478b

1 Parent(s): 7c59563

further cleaning of citations

Browse files

Files changed (1) hide show

utils/generator.py +62 -40

utils/generator.py CHANGED Viewed

@@ -105,30 +105,48 @@ def _extract_sources(processed_results: List[Dict[str, Any]], cited_numbers: Lis
     return cited_sources
-def normalize_citations(response: str) -> str:
-    """Convert non-compliant citation formats to [x] format"""
-    # Convert (Document X) to [X]
-    response = re.sub(r'\(Document\s+(\d+)\)', r'[\1]', response, flags=re.IGNORECASE)
-    # Convert (Doc X) to [X]
-    response = re.sub(r'\(Doc\s+(\d+)\)', r'[\1]', response, flags=re.IGNORECASE)
-    # Convert "Document X says" to [X]
-    response = re.sub(r'Document\s+(\d+)\s+(?:says|states|mentions)', r'[\1]', response, flags=re.IGNORECASE)
-    return response
-def clean_response(response: str) -> str:
-    """Remove unwanted reference sections"""
-    # Split by common reference section headers
-    patterns = [
-        r'\n\s*References?\s*:',
-        r'\n\s*Sources?\s*:',
-        r'\n\s*Bibliography\s*:',
-        r'\n\s*Citations?\s*:',
     ]
-    for pattern in patterns:
-        if re.search(pattern, response, re.IGNORECASE):
-            response = re.split(pattern, response, flags=re.IGNORECASE)[0]
-            break
     return response.strip()
@@ -191,16 +209,24 @@ Guidelines:
 - You do not need to use every passage. Only use the ones that help answer the question.
 - Answer the USER question using only the CONTEXT provided.
-CITATION FORMAT (CRITICAL):
-- ALWAYS use inline citations in square brackets like [1], [2], etc. to reference document numbers.
-- Place citations at the end of sentences or claims: "The audit found compliance issues [1]."
-- For multiple sources, use [1][2].
-- CORRECT: "Revenue increased by 15% [3]."
-- INCORRECT: "(Document 3)", "(Doc 3)", "Document 3 states", "according to document 3"
-- NEVER use phrases like "Doc x says" or "(Document x)" - ONLY use [x] format.
-- DO NOT add a "References" section at the end of your response.
-- DO NOT list out the full document names, page numbers, or years at the end.
-- Your response should END after your answer - no bibliography, no references list, no sources section.
 - If the context is insufficient, say "I don't have sufficient information to answer the question. Please try rephrasing your query."
 """
@@ -263,10 +289,8 @@ async def generate(query: str, context: Union[str, List[Dict[str, Any]]], chatui
         messages = _build_messages(query, formatted_context)
         answer = await _call_llm(messages)
-        # Normalize citations to ensure proper format
-        answer = normalize_citations(answer)
-        # Clean response to remove unwanted reference sections
-        answer = clean_response(answer)
         if chatui_format:
             result = {"answer": answer}
@@ -306,10 +330,8 @@ async def generate_streaming(query: str, context: Union[str, List[Dict[str, Any]
             else:
                 yield chunk
-        # Normalize citations in the complete response
-        normalized_response = normalize_citations(accumulated_response)
-        # Clean response to remove unwanted reference sections
-        cleaned_response = clean_response(normalized_response)
         # Send sources at the end if available and in ChatUI format
         if chatui_format and processed_results:

     return cited_sources
# Trailing reference list: a line starting with "References:", "Sources:",
# "Bibliography:" or "Citations:" and everything after it (DOTALL lets `.*`
# run across newlines to the end of the response).
_REFERENCE_SECTION_RE = re.compile(
    r'\n\s*(?:References?|Sources?|Bibliography|Citations?)\s*:.*$',
    re.IGNORECASE | re.DOTALL,
)
# [Document X, Page Y, Year Z] / [Doc X ...] -> captures X.
_BRACKET_DOC_RE = re.compile(
    r'\[(?:Document|Doc)\.?\s+(\d+)[^\]]*\]',
    re.IGNORECASE,
)
# [2.2.2] style section numbers -> captures the leading number.
_SECTION_NUMBER_RE = re.compile(r'\[(\d+)\.[\d\.]+\]')
# (Document X) / (Doc X), optionally followed by ", Page Y, ..." details.
_PAREN_DOC_RE = re.compile(
    r'\((?:Document|Doc)\.?\s+(\d+)(?:\s*,[^)]*)?\)',
    re.IGNORECASE,
)
# Narrative form: "Document X states/says/..." -> captures X.
_NARRATIVE_DOC_RE = re.compile(
    r'\b(?:Document|Doc)\s+(\d+)\s+(?:states|says|mentions|reports|indicates)\b',
    re.IGNORECASE,
)


def clean_citations(response: str) -> str:
    """Normalize all citation formats to [x] and drop trailing reference lists.

    Steps, applied in order:
      1. Remove a trailing "References:", "Sources:", "Bibliography:" or
         "Citations:" section (the header line and everything after it).
      2. Rewrite "[Document X, Page Y, Year Z]" (or "[Doc X ...]") to "[X]".
      3. Rewrite section-number style citations such as "[2.2.2]" to "[2]".
      4. Rewrite "(Document X)" / "(Doc X)", with or without trailing
         ", Page Y" details, to "[X]".
      5. Rewrite narrative references such as "Document X states" to "[X]".

    Args:
        response: Raw model answer. ``None`` or an empty string is accepted.

    Returns:
        The cleaned answer with surrounding whitespace stripped; an empty
        string when ``response`` is empty or ``None``.
    """
    if not response:
        return ""

    # One alternation removes from the earliest matching header onward, which
    # is the same result as truncating on each header in turn.
    response = _REFERENCE_SECTION_RE.sub('', response)

    # Bracketed document references must be collapsed before the section-number
    # rule so that details inside the brackets are never misread.
    response = _BRACKET_DOC_RE.sub(r'[\1]', response)
    response = _SECTION_NUMBER_RE.sub(r'[\1]', response)
    response = _PAREN_DOC_RE.sub(r'[\1]', response)
    response = _NARRATIVE_DOC_RE.sub(r'[\1]', response)

    return response.strip()
 - You do not need to use every passage. Only use the ones that help answer the question.
 - Answer the USER question using only the CONTEXT provided.
+CITATION FORMAT - FOLLOW EXACTLY:
+- Citations MUST be in this format: [1], [2], [3], etc. - ONLY the document number in square brackets.
+- Place citations at the end of relevant sentences.
+- For multiple sources: [1][2].
+CORRECT EXAMPLES:
+✓ "The budget was UGX.284bn [2]."
+✓ "Funding was approved by Parliament [1][3]."
+INCORRECT EXAMPLES - NEVER USE THESE:
+✗ [2.2.2] - NO section numbers
+✗ [Document 1, Page 295, Year 2021] - NO page numbers, years, or document names
+✗ (Document 3) - NO parentheses
+✗ "Document 5 states" - NO narrative references
+CRITICAL: Use ONLY [number] format. Never include page numbers, years, document names, or section numbers in citations.
+DO NOT add a "References" section, bibliography, or sources list at the end.
 - If the context is insufficient, say "I don't have sufficient information to answer the question. Please try rephrasing your query."
 """
         messages = _build_messages(query, formatted_context)
         answer = await _call_llm(messages)
+        # Clean citations to ensure proper format and remove unwanted sections
+        answer = clean_citations(answer)
         if chatui_format:
             result = {"answer": answer}
             else:
                 yield chunk
+        # Clean citations in the complete response
+        cleaned_response = clean_citations(accumulated_response)
         # Send sources at the end if available and in ChatUI format
         if chatui_format and processed_results: