Spaces:

mtyrrell
/

chatfed_generator

Sleeping

App Files Files Community

mtyrrell committed on Sep 30, 2025

Commit

d60bab3

1 Parent(s): d384965

refactor

Browse files

Files changed (3) hide show

utils/generator.py +1 -213
utils/prompts.py +50 -0
utils/sources.py +176 -0

utils/generator.py CHANGED Viewed

@@ -1,10 +1,5 @@
 import logging
-import asyncio
-import json
-import ast
-import re
 from typing import List, Dict, Any, Union, AsyncGenerator
-from dotenv import load_dotenv
 # LangChain imports
 from langchain_openai import ChatOpenAI
@@ -15,6 +10,7 @@ from langchain_core.messages import SystemMessage, HumanMessage
 # Local imports
 from .utils import getconfig, get_auth
 # Set up logger
 logger = logging.getLogger(__name__)
@@ -53,214 +49,6 @@ def _get_chat_model():
 # Initialize chat model
 chat_model = _get_chat_model()
-# ---------------------------------------------------------------------
-# Core Processing Functions
-# ---------------------------------------------------------------------
-def _parse_citations(response: str) -> List[int]:
-    """Parse citation numbers from response text"""
-    citation_pattern = r'\[(\d+)\]'
-    matches = re.findall(citation_pattern, response)
-    citation_numbers = sorted(list(set(int(match) for match in matches)))
-    return citation_numbers
-def _extract_sources(processed_results: List[Dict[str, Any]], cited_numbers: List[int]) -> List[Dict[str, Any]]:
-    """Extract sources that were cited in the response"""
-    if not cited_numbers:
-        return []
-    cited_sources = []
-    for citation_num in cited_numbers:
-        source_index = citation_num - 1
-        if 0 <= source_index < len(processed_results):
-            source = processed_results[source_index].copy()  # Make copy to avoid modifying original
-            source['_citation_number'] = citation_num  # Preserve original citation number
-            cited_sources.append(source)
-    return cited_sources
-def clean_citations(response: str) -> str:
-    """Normalize all citation formats to [x] and remove unwanted sections"""
-    # Remove References/Sources/Bibliography sections
-    ref_patterns = [
-        r'\n\s*#+\s*References?\s*:?.*$',
-        r'\n\s*#+\s*Sources?\s*:?.*$',
-        r'\n\s*#+\s*Bibliography\s*:?.*$',
-        r'\n\s*References?\s*:.*$',
-        r'\n\s*Sources?\s*:.*$',
-        r'\n\s*Bibliography\s*:.*$',
-    ]
-    for pattern in ref_patterns:
-        response = re.sub(pattern, '', response, flags=re.IGNORECASE | re.DOTALL)
-    # Fix (Document X, Page Y, Year Z) -> [X]
-    response = re.sub(
-        r'\(Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?\)',
-        r'[\1]',
-        response,
-        flags=re.IGNORECASE
-    )
-    # Fix [Document X, Page Y, Year Z] -> [X]
-    response = re.sub(
-        r'\[Document\s+(\d+)(?:[^\]]*)\]',
-        r'[\1]',
-        response,
-        flags=re.IGNORECASE
-    )
-    # Fix [Document X: filename, Page Y, Year Z] -> [X]
-    response = re.sub(
-        r'\[Document\s+(\d+):[^\]]+\]',
-        r'[\1]',
-        response,
-        flags=re.IGNORECASE
-    )
-    # Fix [X.Y.Z] style (section numbers) -> [X]
-    response = re.sub(
-        r'\[(\d+)\.[\d\.]+\]',
-        r'[\1]',
-        response
-    )
-    # Fix (Document X) -> [X]
-    response = re.sub(
-        r'\(Document\s+(\d+)\)',
-        r'[\1]',
-        response,
-        flags=re.IGNORECASE
-    )
-    # Fix "Document X, Page Y, Year Z" (no brackets) -> [X]
-    response = re.sub(
-        r'Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?(?=\s|[,.])',
-        r'[\1]',
-        response,
-        flags=re.IGNORECASE
-    )
-    # Fix "Document X states/says/mentions" -> [X]
-    response = re.sub(
-        r'Document\s+(\d+)\s+(?:states|says|mentions|reports|indicates|notes|shows)',
-        r'[\1]',
-        response,
-        flags=re.IGNORECASE
-    )
-    # Clean up any double citations [[1]] -> [1]
-    response = re.sub(r'\[\[(\d+)\]\]', r'[\1]', response)
-    # Clean up multiple spaces
-    response = re.sub(r'\s+', ' ', response)
-    return response.strip()
-def _process_context(context: Union[str, List[Dict[str, Any]]]) -> tuple[str, List[Dict[str, Any]]]:
-    """Process context and return formatted context string and processed results"""
-    processed_results = []
-    if isinstance(context, list):
-        if not context:
-            raise ValueError("No retrieval results provided")
-        # Extract relevant fields from retrieval results
-        for result in context:
-            if isinstance(result, str):
-                result = ast.literal_eval(result)
-            metadata = result.get('answer_metadata', {})
-            doc_info = {
-                'answer': result.get('answer', ''),
-                'filename': metadata.get('filename', 'Unknown'),
-                'page': metadata.get('page', 'Unknown'),
-                'year': metadata.get('year', 'Unknown'),
-                'source': metadata.get('source', 'Unknown'),
-                'document_id': metadata.get('_id', 'Unknown')
-            }
-            processed_results.append(doc_info)
-        # Format context string - SIMPLIFIED TO ONLY USE [1], [2], [3]
-        context_parts = []
-        for i, result in enumerate(processed_results, 1):
-            # Simple format: [1], [2], etc.
-            context_parts.append(f"[{i}]\n{result['answer']}\n")
-        formatted_context = "\n".join(context_parts)
-    elif isinstance(context, str):
-        if not context.strip():
-            raise ValueError("Context cannot be empty")
-        formatted_context = context
-    else:
-        raise ValueError("Context must be either a string or list of retrieval results")
-    return formatted_context, processed_results
-def _build_messages(question: str, context: str) -> list:
-    """Build messages for LLM call"""
-    system_content = """You are AuditQ&A, an AI Assistant created by Auditors and Data Scientists.
-You are given a question and extracted passages from consolidated/departmental/thematic focus audit reports.
-Provide a clear and structured answer based on the passages/context provided and the guidelines.
-Guidelines:
-- If the passages have useful facts or numbers, use them in your answer.
-- Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
-- If it makes sense, use bullet points and lists to make your answers easier to understand.
-- You do not need to use every passage. Only use the ones that help answer the question.
-- Answer the USER question using ONLY the CONTEXT provided. Do not add information from outside the context.
-- Stay focused on the user's question. Do not add unrelated sections or topics.
-CRITICAL - CITATION FORMAT:
-Citations MUST be in this exact format: [1], [2], [3], etc.
-- ONLY the number in square brackets
-- Place at the end of relevant sentences
-- For multiple sources: [1][2]
-CORRECT:
-✓ "The budget was UGX.284bn [2]."
-✓ "Funding was approved by Parliament [1][3]."
-NEVER USE:
-✗ [Document 1, Page 295, Year 2021]
-✗ (Document 3, Page 23, 2021)
-✗ Document 5, Page 295, 2021
-✗ [2.2.2]
-✗ "Document 5 states"
-DO NOT add a "References", "Sources", or "Bibliography" section at the end.
-FOLLOW-UP QUESTIONS:
-- If the context contains related information beyond what you included, suggest 1 relevant follow-up question.
-- Base the question on information found in the context or natural extensions of the user's query.
-- Format: "You might also want to know:"
-- Keep it concise and directly related to the audit reports.
-If the context is insufficient, say: "I don't have sufficient information to answer the question. Please try rephrasing your query."
-"""
-    user_content = f"### CONTEXT\n{context}\n\n### USER QUESTION\n{question}"
-    return [SystemMessage(content=system_content), HumanMessage(content=user_content)]
-def _create_sources_list(cited_sources: List[Dict[str, Any]]) -> List[Dict[str, str]]:
-    """Create sources list for ChatUI format"""
-    sources = []
-    for result in cited_sources:
-        filename = result.get('filename', 'Unknown')
-        page = result.get('page', 'Unknown')
-        year = result.get('year', 'Unknown')
-        link = f"doc://{filename}"
-        title_parts = [filename]
-        if page != 'Unknown':
-            title_parts.append(f"Page {page}")
-        if year != 'Unknown':
-            title_parts.append(f"({year})")
-        sources.append({"link": link, "title": " - ".join(title_parts)})
-    return sources
 # ---------------------------------------------------------------------
 # LLM Call Functions

 import logging
 from typing import List, Dict, Any, Union, AsyncGenerator
 # LangChain imports
 from langchain_openai import ChatOpenAI
 # Local imports
 from .utils import getconfig, get_auth
+from .sources import _process_context, _build_messages, _parse_citations, _extract_sources, _create_sources_list, clean_citations
 # Set up logger
 logger = logging.getLogger(__name__)
 # Initialize chat model
 chat_model = _get_chat_model()
 # ---------------------------------------------------------------------
 # LLM Call Functions

utils/prompts.py ADDED Viewed

	@@ -0,0 +1,50 @@

# System prompt for the answer-generation model.
# It tells the model to answer only from the supplied context, to cite every
# factual statement with bare bracketed numbers ([1], [2], ...), to avoid a
# trailing References/Sources/Bibliography section, and to optionally suggest
# one follow-up question. The text is sent to the model verbatim, so edits
# here change model behavior.
system_prompt = """You are AuditQ&A, an AI Assistant created by Auditors and Data Scientists.
You are given a question and extracted passages from consolidated/departmental/thematic focus audit reports.
Provide a clear and structured answer based on the passages/context provided and the guidelines.
Guidelines:
- If the passages have useful facts or numbers, use them in your answer.
- Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
- If it makes sense, use bullet points and lists to make your answers easier to understand.
- You do not need to use every passage. Only use the ones that help answer the question.
- Answer the USER question using ONLY the CONTEXT provided. Do not add information from outside the context.
- Stay focused on the user's question. Do not add unrelated sections or topics.
CRITICAL - CITATION REQUIREMENTS:
EVERY factual statement, description, or claim MUST be cited. This includes:
- Numerical data and statistics
- Descriptions of what things are or how they work
- Background information about concepts, systems, or datasets
- Suggested applications or use cases based on context information
- ANY information derived from the passages
CRITICAL - CITATION FORMAT:
Citations MUST be in this exact format: [1], [2], [3], etc.
- ONLY the number in square brackets
- Place at the end of relevant sentences
- For multiple sources: [1][2]
- If an entire paragraph is based on one source, cite it at the end of the paragraph
CORRECT:
✓ "The budget was UGX.284bn [2]."
✓ "Funding was approved by Parliament [1][3]."
✓ "The dataset is designed to bolster analytical capabilities [1]."
NEVER USE:
✗ [Document 1, Page 295, Year 2021]
✗ (Document 3, Page 23, 2021)
✗ Document 5, Page 295, 2021
✗ [2.2.2]
✗ "Document 5 states"
DO NOT add a "References", "Sources", or "Bibliography" section at the end.
FOLLOW-UP QUESTIONS:
- If the context contains related information beyond what you included, suggest 1 relevant follow-up question.
- Base the question on information found in the context or natural extensions of the user's query.
- Format: "You might also want to know:"
- Keep it concise and directly related to the audit reports.
If the context is insufficient, say: "I don't have sufficient information to answer the question. Please try rephrasing your query."
"""

utils/sources.py ADDED Viewed

	@@ -0,0 +1,176 @@

+import re
+from typing import List, Dict, Any, Union
+import ast
+from langchain_core.messages import SystemMessage, HumanMessage
+from .prompts import system_prompt
+# ---------------------------------------------------------------------
+# Core Processing Functions
+# ---------------------------------------------------------------------
+def _parse_citations(response: str) -> List[int]:
+    """Parse citation numbers from response text"""
+    citation_pattern = r'\[(\d+)\]'
+    matches = re.findall(citation_pattern, response)
+    citation_numbers = sorted(list(set(int(match) for match in matches)))
+    return citation_numbers
+def _extract_sources(processed_results: List[Dict[str, Any]], cited_numbers: List[int]) -> List[Dict[str, Any]]:
+    """Extract sources that were cited in the response"""
+    if not cited_numbers:
+        return []
+    cited_sources = []
+    for citation_num in cited_numbers:
+        source_index = citation_num - 1
+        if 0 <= source_index < len(processed_results):
+            source = processed_results[source_index].copy()  # Make copy to avoid modifying original
+            source['_citation_number'] = citation_num  # Preserve original citation number
+            cited_sources.append(source)
+    return cited_sources
def clean_citations(response: str) -> str:
    """Normalize citation formats to ``[x]`` and drop trailing reference lists.

    Steps, in order:

    1. Remove everything from a ``References``/``Sources``/``Bibliography``
       heading (markdown ``#`` heading, or a plain line ending in ``:``) to
       the end of the text.
    2. Rewrite ``(Document X, Page Y, Year Z)``, ``[Document X ...]``,
       ``[X.Y.Z]``, bare ``Document X, Page Y, Year Z`` and ``[[X]]`` to
       ``[X]``.
    3. Tidy whitespace while keeping line structure, so bullet lists and
       paragraphs in the answer survive.

    Args:
        response: Raw model output. ``None`` or an empty string yields ``''``.

    Returns:
        The cleaned text with surrounding whitespace stripped.
    """
    if not response:
        return ''

    # DOTALL lets ``.*$`` swallow the heading and every line after it.
    reference_sections = (
        r'\n\s*#+\s*References?\s*:?.*$',
        r'\n\s*#+\s*Sources?\s*:?.*$',
        r'\n\s*#+\s*Bibliography\s*:?.*$',
        r'\n\s*References?\s*:.*$',
        r'\n\s*Sources?\s*:.*$',
        r'\n\s*Bibliography\s*:.*$',
    )
    for pattern in reference_sections:
        response = re.sub(pattern, '', response, flags=re.IGNORECASE | re.DOTALL)

    # Each pattern captures the document number in group 1. The more specific
    # forms "[Document X: ...]", "(Document X)" and "Document X states" need
    # no rule of their own: the general patterns below already match them.
    citation_forms = (
        # (Document X), (Document X, Page Y), (Document X, Page Y, Year Z)
        (r'\(Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?\)', re.IGNORECASE),
        # [Document X], [Document X, Page Y, ...], [Document X: filename, ...]
        (r'\[Document\s+(\d+)[^\]]*\]', re.IGNORECASE),
        # [X.Y.Z] section-number style
        (r'\[(\d+)\.[\d\.]+\]', 0),
        # Bare "Document X, Page Y, Year Z" followed by whitespace, ',' or '.'
        (r'Document\s+(\d+)(?:,\s*Page\s+\d+)?(?:,\s*(?:Year\s+)?\d+)?(?=\s|[,.])', re.IGNORECASE),
        # [[X]] left behind by nested brackets
        (r'\[\[(\d+)\]\]', 0),
    )
    for pattern, flags in citation_forms:
        response = re.sub(pattern, r'[\1]', response, flags=flags)

    # Whitespace clean-up. Newlines are kept on purpose: collapsing them
    # would merge bullet points and paragraphs into one line.
    response = re.sub(r'\r\n?', '\n', response)                        # unify line endings
    response = re.sub(r'[ \t]+$', '', response, flags=re.MULTILINE)    # trailing blanks
    response = re.sub(r'(?<=\S)[ \t]+', ' ', response)                 # runs inside a line
    response = re.sub(r'\n{3,}', '\n\n', response)                     # at most one blank line
    return response.strip()
+def _process_context(context: Union[str, List[Dict[str, Any]]]) -> tuple[str, List[Dict[str, Any]]]:
+    """Process context and return formatted context string and processed results"""
+    processed_results = []
+    if isinstance(context, list):
+        if not context:
+            raise ValueError("No retrieval results provided")
+        # Extract relevant fields from retrieval results
+        for result in context:
+            if isinstance(result, str):
+                result = ast.literal_eval(result)
+            metadata = result.get('answer_metadata', {})
+            doc_info = {
+                'answer': result.get('answer', ''),
+                'filename': metadata.get('filename', 'Unknown'),
+                'page': metadata.get('page', 'Unknown'),
+                'year': metadata.get('year', 'Unknown'),
+                'source': metadata.get('source', 'Unknown'),
+                'document_id': metadata.get('_id', 'Unknown')
+            }
+            processed_results.append(doc_info)
+        # Format context string - SIMPLIFIED TO ONLY USE [1], [2], [3]
+        context_parts = []
+        for i, result in enumerate(processed_results, 1):
+            # Simple format: [1], [2], etc.
+            context_parts.append(f"[{i}]\n{result['answer']}\n")
+        formatted_context = "\n".join(context_parts)
+    elif isinstance(context, str):
+        if not context.strip():
+            raise ValueError("Context cannot be empty")
+        formatted_context = context
+    else:
+        raise ValueError("Context must be either a string or list of retrieval results")
+    return formatted_context, processed_results
def _build_messages(system_prompt: str, question: str, context: str) -> list:
    """Assemble the two-message conversation sent to the chat model.

    Args:
        system_prompt: Instruction text placed in the system message. This
            parameter shadows the ``system_prompt`` imported at module level,
            so the caller's value is the one used.
        question: The user's question.
        context: Already-formatted context passages.

    Returns:
        ``[SystemMessage, HumanMessage]``; the human message carries the
        context followed by the question under markdown-style headings.
    """
    human_turn = HumanMessage(
        content=f"### CONTEXT\n{context}\n\n### USER QUESTION\n{question}"
    )
    system_turn = SystemMessage(content=system_prompt)
    return [system_turn, human_turn]
+def _create_sources_list(cited_sources: List[Dict[str, Any]]) -> List[Dict[str, str]]:
+    """Create sources list for ChatUI format"""
+    sources = []
+    for result in cited_sources:
+        filename = result.get('filename', 'Unknown')
+        page = result.get('page', 'Unknown')
+        year = result.get('year', 'Unknown')
+        link = f"doc://{filename}"
+        title_parts = [filename]
+        if page != 'Unknown':
+            title_parts.append(f"Page {page}")
+        if year != 'Unknown':
+            title_parts.append(f"({year})")
+        sources.append({"link": link, "title": " - ".join(title_parts)})
+    return sources