mtyrrell committed on
Commit
d049b68
·
1 Parent(s): f852f01

citation filtering

Browse files
Files changed (2) hide show
  1. .gitignore +2 -1
  2. utils/generator.py +78 -11
.gitignore CHANGED
@@ -1 +1,2 @@
1
- .DS_Store
 
 
1
+ .DS_Store
2
+ .env
utils/generator.py CHANGED
@@ -2,6 +2,7 @@ import logging
2
  import asyncio
3
  import json
4
  import ast
 
5
  from typing import List, Dict, Any, Union, Generator, AsyncGenerator
6
  from dotenv import load_dotenv
7
 
@@ -73,6 +74,51 @@ def get_chat_model():
73
  # Initialize provider-agnostic chat model
74
  chat_model = get_chat_model()
75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  # ---------------------------------------------------------------------
77
  # Context processing - may need further refinement (i.e. to manage other data sources)
78
  # ---------------------------------------------------------------------
@@ -189,14 +235,21 @@ def build_messages(question: str, context: str) -> list:
189
  Returns:
190
  List of LangChain message objects
191
  """
192
- system_content = (
193
- "You are an expert assistant. Answer the USER question using only the "
194
- "CONTEXT provided. When referencing information from the context, use inline "
195
- "citations in square brackets like [1], [2], etc. to reference the document "
196
- "numbers shown in the context. Use multiple citations when information comes "
197
- "from multiple documents, like [1][2]. If the context is insufficient, say "
198
- "'I don't know.'"
199
- )
 
 
 
 
 
 
 
200
 
201
  user_content = f"### CONTEXT\n{context}\n\n### USER QUESTION\n{question}"
202
 
@@ -253,9 +306,15 @@ async def generate(query: str, context: Union[str, List[Dict[str, Any]]], chatui
253
  # Return ChatUI format
254
  result = {"answer": answer}
255
  if processed_results:
 
 
 
 
 
 
256
  # Extract sources for ChatUI
257
  sources = []
258
- for result_item in processed_results:
259
  filename = result_item.get('filename', 'Unknown')
260
  page = result_item.get('page', 'Unknown')
261
  year = result_item.get('year', 'Unknown')
@@ -349,8 +408,10 @@ async def generate_streaming(query: str, context: Union[str, List[Dict[str, Any]
349
  try:
350
  messages = build_messages(query, formatted_context)
351
 
352
- # Stream the text response
 
353
  async for chunk in _call_llm_streaming(messages):
 
354
  if chatui_format:
355
  yield {"event": "data", "data": chunk}
356
  else:
@@ -358,8 +419,14 @@ async def generate_streaming(query: str, context: Union[str, List[Dict[str, Any]
358
 
359
  # Send sources at the end if available and in ChatUI format
360
  if chatui_format and processed_results:
 
 
 
 
 
 
361
  sources = []
362
- for result in processed_results:
363
  filename = result.get('filename', 'Unknown')
364
  page = result.get('page', 'Unknown')
365
  year = result.get('year', 'Unknown')
 
2
  import asyncio
3
  import json
4
  import ast
5
+ import re
6
  from typing import List, Dict, Any, Union, Generator, AsyncGenerator
7
  from dotenv import load_dotenv
8
 
 
74
  # Initialize provider-agnostic chat model
75
  chat_model = get_chat_model()
76
 
77
+ # ---------------------------------------------------------------------
78
+ # Citation parsing and source filtering
79
+ # ---------------------------------------------------------------------
80
def parse_citations_from_response(response: str) -> List[int]:
    """
    Parse citation numbers from the generated response.

    Scans for inline citations written as bracketed integers ([1], [2],
    [1][2], ...) and returns the distinct numbers in ascending order.

    Args:
        response: The generated response text.

    Returns:
        Sorted list of unique citation numbers found in the response.
    """
    # Bracketed integers such as [1]; adjacent citations like [1][2]
    # match individually, one group per bracket pair.
    citation_pattern = r'\[(\d+)\]'
    matches = re.findall(citation_pattern, response)

    # De-duplicate via a set comprehension; sorted() consumes the set
    # directly, so no intermediate list() is needed.
    return sorted({int(match) for match in matches})
97
+
98
def filter_sources_by_citations(processed_results: List[Dict[str, Any]], cited_numbers: List[int]) -> List[Dict[str, Any]]:
    """
    Filter sources to only include those that were cited in the response.

    Citation numbers are 1-indexed positions into processed_results;
    numbers outside the valid range (e.g. hallucinated citations that
    point past the retrieved results) are silently dropped.

    Args:
        processed_results: All processed retrieval results.
        cited_numbers: Citation numbers found in the response.

    Returns:
        The cited sources, in the order the citation numbers are given.
        Empty when cited_numbers is empty or nothing is in range.
    """
    # A comprehension replaces the manual append loop; the bounds check
    # (1-indexed: 1 <= num <= len) is equivalent to the original
    # 0 <= num - 1 < len(processed_results) guard.
    return [
        processed_results[num - 1]
        for num in cited_numbers
        if 1 <= num <= len(processed_results)
    ]
121
+
122
  # ---------------------------------------------------------------------
123
  # Context processing - may need further refinement (i.e. to manage other data sources)
124
  # ---------------------------------------------------------------------
 
235
  Returns:
236
  List of LangChain message objects
237
  """
238
+ system_content = """
239
+ You are AuditQ&A, an AI Assistant created by Auditors and Data Scientist. \
240
+ You are given a question and extracted passages of the consolidated/departmental/thematic focus audit reports.\
241
+ Provide a clear and structured answer based on the passages/context provided and the guidelines.
242
+ Guidelines:
243
+ - If the passages have useful facts or numbers, use them in your answer.
244
+ - Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
245
+ - If it makes sense, use bullet points and lists to make your answers easier to understand.
246
+ - You do not need to use every passage. Only use the ones that help answer the question.
247
+ - Answer the USER question using only the CONTEXT provided.
248
+ - When referencing information from the context, use inline citations in square brackets like [1], [2], etc. to reference the document numbers shown in the context.
249
+ - Use multiple citations when information comes from multiple documents, like [1][2].
250
+ - Do not use the sentence 'Doc x says ...' to say where information came from, but rather just include the citation at the end of the sentence.
251
+ - If the context is insufficient, say "I don't have sufficient information to answer the question. Please try rephrasing your query."
252
+ """
253
 
254
  user_content = f"### CONTEXT\n{context}\n\n### USER QUESTION\n{question}"
255
 
 
306
  # Return ChatUI format
307
  result = {"answer": answer}
308
  if processed_results:
309
+ # Parse citations from the response
310
+ cited_numbers = parse_citations_from_response(answer)
311
+
312
+ # Filter sources to only include cited ones
313
+ cited_sources = filter_sources_by_citations(processed_results, cited_numbers)
314
+
315
  # Extract sources for ChatUI
316
  sources = []
317
+ for result_item in cited_sources: # Only cited sources
318
  filename = result_item.get('filename', 'Unknown')
319
  page = result_item.get('page', 'Unknown')
320
  year = result_item.get('year', 'Unknown')
 
408
  try:
409
  messages = build_messages(query, formatted_context)
410
 
411
+ # Stream the text response and accumulate it for citation parsing
412
+ accumulated_response = ""
413
  async for chunk in _call_llm_streaming(messages):
414
+ accumulated_response += chunk
415
  if chatui_format:
416
  yield {"event": "data", "data": chunk}
417
  else:
 
419
 
420
  # Send sources at the end if available and in ChatUI format
421
  if chatui_format and processed_results:
422
+ # Parse citations from the complete response
423
+ cited_numbers = parse_citations_from_response(accumulated_response)
424
+
425
+ # Filter sources to only include cited ones
426
+ cited_sources = filter_sources_by_citations(processed_results, cited_numbers)
427
+
428
  sources = []
429
+ for result in cited_sources: # Only cited sources
430
  filename = result.get('filename', 'Unknown')
431
  page = result.get('page', 'Unknown')
432
  year = result.get('year', 'Unknown')