Spaces:

mtyrrell
/

chatfed_generator

Sleeping

App Files Files Community

mtyrrell committed on Sep 29, 2025

Commit

f2a3674

1 Parent(s): 4a1d809

added validation guardrails

Browse files

Files changed (1) hide show

utils/generator.py +41 -1

utils/generator.py CHANGED Viewed

@@ -72,6 +72,33 @@ def _extract_sources(processed_results: List[Dict[str, Any]], cited_numbers: Lis
     return cited_sources
 def _process_context(context: Union[str, List[Dict[str, Any]]]) -> tuple[str, List[Dict[str, Any]]]:
     """Process context and return formatted context string and processed results"""
     processed_results = []
@@ -138,6 +165,9 @@ CITATION FORMAT (CRITICAL):
 - CORRECT: "Revenue increased by 15% [3]."
 - INCORRECT: "(Document 3)", "(Doc 3)", "Document 3 states", "according to document 3"
 - NEVER use phrases like "Doc x says" or "(Document x)" - ONLY use [x] format.
 - If the context is insufficient, say "I don't have sufficient information to answer the question. Please try rephrasing your query."
 """
@@ -200,6 +230,11 @@ async def generate(query: str, context: Union[str, List[Dict[str, Any]]], chatui
         messages = _build_messages(query, formatted_context)
         answer = await _call_llm(messages)
         if chatui_format:
             result = {"answer": answer}
             if processed_results:
@@ -238,9 +273,14 @@ async def generate_streaming(query: str, context: Union[str, List[Dict[str, Any]
             else:
                 yield chunk
         # Send sources at the end if available and in ChatUI format
         if chatui_format and processed_results:
-            cited_numbers = _parse_citations(accumulated_response)
             cited_sources = _extract_sources(processed_results, cited_numbers)
             sources = _create_sources_list(cited_sources)
             yield {"event": "sources", "data": {"sources": sources}}

     return cited_sources
def normalize_citations(response: str) -> str:
    """Rewrite non-compliant document references into the ``[x]`` citation format.

    The following forms are handled (case-insensitively, "Doc"/"Doc."/"Document"
    are treated alike):

    * ``(Document 3)`` / ``(Doc 3)``            -> ``[3]``
    * ``Document 3 says|states|mentions``       -> ``[3]``
    * ``according to document 3``               -> ``according to [3]``

    Text that already uses ``[x]`` citations is returned unchanged.

    Args:
        response: Raw model output.

    Returns:
        The response with the recognised reference forms replaced by ``[x]``.
    """
    doc_word = r'(?:Document|Doc)\.?'

    # Parenthesised references: "(Document 3)", "(Doc 3)".
    response = re.sub(
        r'\(' + doc_word + r'\s+(\d+)\)',
        r'[\1]',
        response,
        flags=re.IGNORECASE,
    )
    # Attributive references: "Document 3 states", "Doc 3 says".
    # Word boundaries keep the match from firing inside longer words.
    response = re.sub(
        r'\b' + doc_word + r'\s+(\d+)\s+(?:says|states|mentions)\b',
        r'[\1]',
        response,
        flags=re.IGNORECASE,
    )
    # "according to document 3": keep the lead-in, swap only the reference.
    response = re.sub(
        r'\b(according\s+to\s+)' + doc_word + r'\s+(\d+)\b',
        r'\1[\2]',
        response,
        flags=re.IGNORECASE,
    )
    return response
def clean_response(response: str) -> str:
    """Strip a trailing reference-style section from a model response.

    A section starts at a line whose first non-blank text is one of
    ``Reference(s):``, ``Source(s):``, ``Bibliography:`` or ``Citation(s):``
    (case-insensitive). Everything from the earliest such header onward is
    dropped, so stacked sections (e.g. "Sources:" followed by "References:")
    are all removed rather than only the one whose pattern is tried first.

    Args:
        response: Model output, possibly ending in a reference list.

    Returns:
        The text before the first reference-style header, with surrounding
        whitespace stripped. If no header is present, the stripped input.
    """
    # A single alternation lets the regex engine find the earliest header in
    # the text; the header must follow a newline so inline uses are untouched.
    header = re.search(
        r'\n\s*(?:References?|Sources?|Bibliography|Citations?)\s*:',
        response,
        flags=re.IGNORECASE,
    )
    if header:
        response = response[:header.start()]
    return response.strip()
 def _process_context(context: Union[str, List[Dict[str, Any]]]) -> tuple[str, List[Dict[str, Any]]]:
     """Process context and return formatted context string and processed results"""
     processed_results = []
 - CORRECT: "Revenue increased by 15% [3]."
 - INCORRECT: "(Document 3)", "(Doc 3)", "Document 3 states", "according to document 3"
 - NEVER use phrases like "Doc x says" or "(Document x)" - ONLY use [x] format.
+- DO NOT add a "References" section at the end of your response.
+- DO NOT list out the full document names, page numbers, or years at the end.
+- Your response should END after your answer - no bibliography, no references list, no sources section.
 - If the context is insufficient, say "I don't have sufficient information to answer the question. Please try rephrasing your query."
 """
         messages = _build_messages(query, formatted_context)
         answer = await _call_llm(messages)
+        # Normalize citations to ensure proper format
+        answer = normalize_citations(answer)
+        # Clean response to remove unwanted reference sections
+        answer = clean_response(answer)
         if chatui_format:
             result = {"answer": answer}
             if processed_results:
             else:
                 yield chunk
+        # Normalize citations in the complete response
+        normalized_response = normalize_citations(accumulated_response)
+        # Clean response to remove unwanted reference sections
+        cleaned_response = clean_response(normalized_response)
         # Send sources at the end if available and in ChatUI format
         if chatui_format and processed_results:
+            cited_numbers = _parse_citations(cleaned_response)
             cited_sources = _extract_sources(processed_results, cited_numbers)
             sources = _create_sources_list(cited_sources)
             yield {"event": "sources", "data": {"sources": sources}}