Ara Yeroyan committed on
Commit
06faccd
·
1 Parent(s): 6f5999e

fix gemini chunk extraction

Browse files
src/agents/gemini_chatbot.py CHANGED
@@ -145,11 +145,11 @@ class GeminiRAGChatbot:
145
  return state
146
 
147
  def _enhance_response_with_references(self, answer: str, sources: List[Any], query: str) -> str:
148
- """Enhance Gemini response to include document references"""
149
  if not sources or not answer:
150
  return answer
151
 
152
- # Use LLM to intelligently add document references
153
  try:
154
  from src.llm.adapters import get_llm_client
155
  llm = get_llm_client()
@@ -163,8 +163,17 @@ class GeminiRAGChatbot:
163
  filename = metadata.get('filename', 'Unknown') if isinstance(metadata, dict) else 'Unknown'
164
  year = metadata.get('year', 'Unknown') if isinstance(metadata, dict) else 'Unknown'
165
  source = metadata.get('source', 'Unknown') if isinstance(metadata, dict) else 'Unknown'
 
166
 
167
- doc_summaries.append(f"[Doc {idx}] {filename} ({year}, {source}): {content[:300]}...")
 
 
 
 
 
 
 
 
168
 
169
  prompt = f"""You are enhancing a response from a document search system. The original response is:
170
 
@@ -175,34 +184,42 @@ The following documents were retrieved and used to generate this response:
175
  {chr(10).join(doc_summaries)}
176
 
177
  CRITICAL RULES:
178
- 1. The response should ONLY contain information from the retrieved documents listed above
179
- 2. If the response mentions information NOT found in the retrieved documents, you must REMOVE or CORRECT that information
180
- 3. Add document references [Doc i] at the end of sentences that use information from specific documents
181
- 4. Only reference documents that are actually used in the response
182
- 5. If the response mentions years, sources, or data that don't match the retrieved documents, you must correct it
183
- 6. Keep the response natural and conversational
184
- 7. Don't change the core content that matches the documents, just add references where appropriate
185
- 8. If multiple documents support the same claim, use [Doc i, Doc j] format
186
- 9. If the response contains information that cannot be verified in the retrieved documents, add a note like: "Note: This information may not be in the retrieved documents."
 
 
187
 
188
- Return ONLY the enhanced response with references added and any corrections made. Do not include any explanation or meta-commentary."""
189
 
190
  enhanced = llm.invoke(prompt).content if hasattr(llm.invoke(prompt), 'content') else str(llm.invoke(prompt))
191
 
192
- # Fallback: if LLM fails, just return original
193
  if not enhanced or len(enhanced) < len(answer) * 0.5:
194
- logger.warning("LLM enhancement failed, using original response")
195
- return answer
 
 
 
 
 
196
 
197
  return enhanced
198
 
199
  except Exception as e:
200
  logger.warning(f"Failed to enhance response with references: {e}")
201
- # Fallback: add basic references at the end
 
202
  if sources:
203
  ref_list = ", ".join([f"[Doc {i+1}]" for i in range(min(len(sources), 5))])
204
- return f"{answer}\n\n*Based on documents: {ref_list}*"
205
- return answer
206
 
207
  def _extract_ui_filters(self, query: str) -> Dict[str, List[str]]:
208
  """Extract UI filters from query if present"""
@@ -303,8 +320,10 @@ Return ONLY the enhanced response with references added and any corrections made
303
 
304
  # Format sources for display
305
  sources = []
306
- if final_state.get("gemini_result"):
307
- sources = self.gemini_client.format_sources_for_display(final_state["gemini_result"])
 
 
308
 
309
  return {
310
  'response': final_state["final_response"] or "I apologize, but I couldn't process your request.",
@@ -313,7 +332,8 @@ Return ONLY the enhanced response with references added and any corrections made
313
  'answer': final_state["final_response"]
314
  },
315
  'agent_logs': final_state["agent_logs"],
316
- 'actual_rag_query': final_state["current_query"]
 
317
  }
318
 
319
  def _load_conversation(self, conversation_file: Path) -> Dict[str, Any]:
 
145
  return state
146
 
147
  def _enhance_response_with_references(self, answer: str, sources: List[Any], query: str) -> str:
148
+ """Enhance Gemini response to include document references and format nicely"""
149
  if not sources or not answer:
150
  return answer
151
 
152
+ # Use LLM to intelligently add document references and format nicely
153
  try:
154
  from src.llm.adapters import get_llm_client
155
  llm = get_llm_client()
 
163
  filename = metadata.get('filename', 'Unknown') if isinstance(metadata, dict) else 'Unknown'
164
  year = metadata.get('year', 'Unknown') if isinstance(metadata, dict) else 'Unknown'
165
  source = metadata.get('source', 'Unknown') if isinstance(metadata, dict) else 'Unknown'
166
+ district = metadata.get('district', '') if isinstance(metadata, dict) else ''
167
 
168
+ doc_info = f"{filename}"
169
+ if year and year != 'Unknown':
170
+ doc_info += f" ({year})"
171
+ if source and source != 'Unknown':
172
+ doc_info += f" - {source}"
173
+ if district:
174
+ doc_info += f" - {district}"
175
+
176
+ doc_summaries.append(f"[Doc {idx}] {doc_info}: {content[:300]}...")
177
 
178
  prompt = f"""You are enhancing a response from a document search system. The original response is:
179
 
 
184
  {chr(10).join(doc_summaries)}
185
 
186
  CRITICAL RULES:
187
+ 1. Format the response nicely with proper paragraphs, bullet points, or structured sections where appropriate
188
+ 2. The response should ONLY contain information from the retrieved documents listed above
189
+ 3. If the response mentions information NOT found in the retrieved documents, you must REMOVE or CORRECT that information
190
+ 4. Add document references [Doc i] at the end of sentences that use information from specific documents
191
+ 5. Only reference documents that are actually used in the response
192
+ 6. If the response mentions years, sources, or data that don't match the retrieved documents, you must correct it
193
+ 7. Keep the response natural, conversational, and well-formatted
194
+ 8. Use proper formatting: paragraphs, line breaks, and structure for readability
195
+ 9. Don't change the core content that matches the documents, just add references where appropriate and improve formatting
196
+ 10. If multiple documents support the same claim, use [Doc i, Doc j] format
197
+ 11. If the response contains information that cannot be verified in the retrieved documents, add a note like: "Note: This information may not be in the retrieved documents."
198
 
199
+ Return ONLY the enhanced, well-formatted response with references added and any corrections made. Do not include any explanation or meta-commentary."""
200
 
201
  enhanced = llm.invoke(prompt).content if hasattr(llm.invoke(prompt), 'content') else str(llm.invoke(prompt))
202
 
203
+ # Fallback: if LLM fails, just return original with basic formatting
204
  if not enhanced or len(enhanced) < len(answer) * 0.5:
205
+ logger.warning("LLM enhancement failed, using original response with basic formatting")
206
+ # Basic formatting: add line breaks after periods for readability
207
+ formatted = answer.replace('. ', '.\n\n')
208
+ if sources:
209
+ ref_list = ", ".join([f"[Doc {i+1}]" for i in range(min(len(sources), 5))])
210
+ formatted += f"\n\n*Based on documents: {ref_list}*"
211
+ return formatted
212
 
213
  return enhanced
214
 
215
  except Exception as e:
216
  logger.warning(f"Failed to enhance response with references: {e}")
217
+ # Fallback: add basic formatting and references at the end
218
+ formatted = answer.replace('. ', '.\n\n') # Basic paragraph formatting
219
  if sources:
220
  ref_list = ", ".join([f"[Doc {i+1}]" for i in range(min(len(sources), 5))])
221
+ formatted += f"\n\n*Based on documents: {ref_list}*"
222
+ return formatted
223
 
224
  def _extract_ui_filters(self, query: str) -> Dict[str, List[str]]:
225
  """Extract UI filters from query if present"""
 
320
 
321
  # Format sources for display
322
  sources = []
323
+ gemini_result = final_state.get("gemini_result")
324
+ if gemini_result:
325
+ sources = self.gemini_client.format_sources_for_display(gemini_result)
326
+ logger.info(f"📋 GEMINI CHAT: Formatted {len(sources)} sources for display")
327
 
328
  return {
329
  'response': final_state["final_response"] or "I apologize, but I couldn't process your request.",
 
332
  'answer': final_state["final_response"]
333
  },
334
  'agent_logs': final_state["agent_logs"],
335
+ 'actual_rag_query': final_state["current_query"],
336
+ 'gemini_result': gemini_result # Include raw result for tracking
337
  }
338
 
339
  def _load_conversation(self, conversation_file: Path) -> Dict[str, Any]:
src/agents/multi_agent_chatbot.py CHANGED
@@ -1222,7 +1222,7 @@ Generate a conversational response with proper document references:""")
1222
  doc_sources.add(str(metadata['source']))
1223
 
1224
  # Correct misspellings in response using correct names from documents
1225
- response = self._correct_misspellings_in_response(response, doc_districts, doc_sources)
1226
 
1227
  # Check if response mentions years not in documents
1228
  year_pattern = r'\b(20\d{2})\b'
@@ -1252,27 +1252,6 @@ Generate a conversational response with proper document references:""")
1252
 
1253
  return response
1254
 
1255
- def _correct_misspellings_in_response(self, response: str, correct_districts: set, correct_sources: set) -> str:
1256
- """Correct common misspellings in response using correct names from documents."""
1257
- # Common misspelling mappings (e.g., "Kalagala" -> "Kalangala")
1258
- # We'll use fuzzy matching if needed, but first try direct corrections
1259
-
1260
- corrected = response
1261
-
1262
- # Correct district names
1263
- for correct_district in correct_districts:
1264
- # Try common misspellings
1265
- if correct_district.lower() == "kalangala":
1266
- # Replace "Kalagala" (missing 'n') with "Kalangala"
1267
- corrected = re.sub(r'\bKalagala\b', 'Kalangala', corrected, flags=re.IGNORECASE)
1268
- # Add more common misspellings as needed
1269
- # For now, we rely on the LLM to use correct names from the prompt
1270
-
1271
- # Correct source names if needed
1272
- # Add source corrections as needed in the future
1273
-
1274
- return corrected
1275
-
1276
  def _generate_conversational_response_without_docs(self, query: str, messages: List[Any]) -> str:
1277
  """Generate conversational response using only LLM knowledge and conversation history"""
1278
  logger.info("💬 RESPONSE GENERATION (NO DOCS): Starting response generation without documents")
 
1222
  doc_sources.add(str(metadata['source']))
1223
 
1224
  # Correct misspellings in response using correct names from documents
1225
+ # response = self._correct_misspellings_in_response(response, doc_districts, doc_sources)
1226
 
1227
  # Check if response mentions years not in documents
1228
  year_pattern = r'\b(20\d{2})\b'
 
1252
 
1253
  return response
1254
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1255
  def _generate_conversational_response_without_docs(self, query: str, messages: List[Any]) -> str:
1256
  """Generate conversational response using only LLM knowledge and conversation history"""
1257
  logger.info("💬 RESPONSE GENERATION (NO DOCS): Starting response generation without documents")
src/gemini/file_search.py CHANGED
@@ -46,10 +46,20 @@ class GeminiFileSearchClient:
46
  if not self.api_key:
47
  raise ValueError("GEMINI_API_KEY not found. Set it in .env file or pass as argument.")
48
 
49
- self.store_name = store_name or os.getenv("GEMINI_FILESTORE_NAME")
50
- if not self.store_name:
51
  raise ValueError("GEMINI_FILESTORE_NAME not found. Set it in .env file or pass as argument.")
52
 
 
 
 
 
 
 
 
 
 
 
53
  self.client = genai.Client(api_key=self.api_key)
54
  self.model = "gemini-2.5-flash" # or "gemini-2.5-pro"
55
 
@@ -95,15 +105,32 @@ class GeminiFileSearchClient:
95
  filter_context = f"\n\nPlease focus on documents matching these criteria: {', '.join(filter_parts)}"
96
 
97
  # Combine query with filter context
98
- # Add explicit instruction to only use information from retrieved documents
99
- instruction = "\n\nIMPORTANT: Only use information from the retrieved documents. Do not use information from your training data unless it's explicitly mentioned in the retrieved documents. If the retrieved documents don't contain the requested information, clearly state that.\n\n"
100
- full_query = query + filter_context + instruction
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
  try:
103
  # Generate content with file search
104
  # Based on Gemini API docs: https://ai.google.dev/gemini-api/docs/file-search
 
 
 
105
  try:
106
- # Try the documented format first
107
  response = self.client.models.generate_content(
108
  model=model,
109
  contents=full_query,
@@ -111,27 +138,53 @@ class GeminiFileSearchClient:
111
  tools=[
112
  types.Tool(
113
  file_search=types.FileSearch(
114
- file_search_store_names=[self.store_name]
115
  )
116
  )
117
  ]
118
  )
119
  )
120
- except (AttributeError, TypeError) as e:
121
- # Fallback: try alternative format
122
- logger.warning(f"Primary API format failed, trying alternative: {e}")
123
- try:
124
- response = self.client.models.generate_content(
125
- model=model,
126
- contents=full_query,
127
- tools=[{
128
- "file_search": {
129
- "file_search_store_names": [self.store_name]
130
- }
131
- }]
132
- )
133
- except Exception as e2:
134
- raise Exception(f"Failed to call Gemini API: {e2}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
  # Extract answer
137
  answer = ""
@@ -156,23 +209,36 @@ class GeminiFileSearchClient:
156
  sources = []
157
  grounding_metadata = None
158
 
 
 
159
  if hasattr(response, 'candidates') and response.candidates:
160
  candidate = response.candidates[0]
 
161
 
162
  # Get grounding metadata
163
  if hasattr(candidate, 'grounding_metadata'):
164
  grounding_metadata = candidate.grounding_metadata
 
165
 
166
  # Extract source documents from grounding metadata
167
  # Handle different response formats
168
  grounding_chunks = None
169
  if hasattr(grounding_metadata, 'grounding_chunks'):
170
  grounding_chunks = grounding_metadata.grounding_chunks
 
171
  elif isinstance(grounding_metadata, dict) and 'grounding_chunks' in grounding_metadata:
172
  grounding_chunks = grounding_metadata['grounding_chunks']
 
 
 
 
 
 
 
173
 
174
  if grounding_chunks:
175
- for chunk in grounding_chunks:
 
176
  # Handle both object and dict formats
177
  try:
178
  if isinstance(chunk, dict):
@@ -196,6 +262,46 @@ class GeminiFileSearchClient:
196
  text = chunk_info.get('text', '') if isinstance(chunk_info, dict) else ''
197
  file_name = chunk_info.get('file_name', '') if isinstance(chunk_info, dict) else ''
198
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  score_data = chunk_data.get('relevance_score', {})
200
  score = score_data.get('score', 0.0) if isinstance(score_data, dict) else 0.0
201
 
@@ -204,11 +310,43 @@ class GeminiFileSearchClient:
204
  "content": text,
205
  "filename": file_name,
206
  "score": score,
 
207
  }
208
  sources.append(source_info)
 
209
  except Exception as e:
210
- logger.warning(f"Error extracting chunk info: {e}")
 
 
211
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
 
213
  return GeminiFileSearchResult(
214
  answer=answer,
@@ -236,21 +374,54 @@ class GeminiFileSearchClient:
236
  formatted_sources = []
237
 
238
  for i, source in enumerate(result.sources):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  # Create a Document object compatible with existing code
240
  doc = Document(
241
  page_content=source.get("content", ""),
242
  metadata={
243
- "filename": source.get("filename", "Unknown"),
244
- "source": "Gemini File Search",
245
  "score": source.get("score"),
246
  "chunk_index": i,
247
- # Add default fields that might be expected
248
- "page": None,
249
- "year": None,
250
- "district": None,
 
251
  }
252
  )
253
  formatted_sources.append(doc)
 
254
 
 
255
  return formatted_sources
256
 
 
46
  if not self.api_key:
47
  raise ValueError("GEMINI_API_KEY not found. Set it in .env file or pass as argument.")
48
 
49
+ store_name_raw = store_name or os.getenv("GEMINI_FILESTORE_NAME")
50
+ if not store_name_raw:
51
  raise ValueError("GEMINI_FILESTORE_NAME not found. Set it in .env file or pass as argument.")
52
 
53
+ # Normalize store name: API expects the FULL path format (fileSearchStores/xxx)
54
+ # If just the ID is provided, construct the full path
55
+ if store_name_raw.startswith("fileSearchStores/"):
56
+ self.store_name = store_name_raw # Already full path
57
+ else:
58
+ # Just the ID provided, construct full path
59
+ self.store_name = f"fileSearchStores/{store_name_raw}"
60
+
61
+ logger.info(f"📦 Using file search store: {self.store_name}")
62
+
63
  self.client = genai.Client(api_key=self.api_key)
64
  self.model = "gemini-2.5-flash" # or "gemini-2.5-pro"
65
 
 
105
  filter_context = f"\n\nPlease focus on documents matching these criteria: {', '.join(filter_parts)}"
106
 
107
  # Combine query with filter context
108
+ # Add comprehensive system instructions similar to multi-agent system
109
+ system_instructions = """You are a helpful audit report assistant specialized in analyzing government audit reports from Uganda's Office of the Auditor General.
110
+
111
+ CRITICAL RULES:
112
+ 1. **NO HALLUCINATION**: Only use information that is explicitly stated in the retrieved documents. Do not make up facts, numbers, or details.
113
+ 2. **Document References**: Always cite which documents you're using with [Doc i] references at the end of sentences that use specific information.
114
+ 3. **Formatting**: Structure your response with clear paragraphs, bullet points, or sections for readability.
115
+ 4. **Accuracy**: If the retrieved documents don't contain the requested information, explicitly state "The retrieved documents do not contain information about [topic]."
116
+ 5. **Years and Data**: Pay careful attention to years mentioned in documents. If a user asks about a specific year but documents show different years, explicitly state this.
117
+ 6. **District/Source Names**: Use the exact district and source names as they appear in the document metadata (e.g., "Kalangala" not "Kalagala").
118
+ 7. **Financial Data**: When providing financial figures, include the currency (UGX) and be precise about amounts.
119
+ 8. **Conversational Tone**: Be helpful, clear, and conversational while maintaining accuracy.
120
+
121
+ IMPORTANT: Only use information from the retrieved documents. Do not use information from your training data unless it's explicitly mentioned in the retrieved documents."""
122
+
123
+ # Combine system instructions with query
124
+ full_query = f"{system_instructions}\n\nUser Question: {query}{filter_context}\n\nPlease provide a detailed, well-formatted response with proper document references."
125
 
126
  try:
127
  # Generate content with file search
128
  # Based on Gemini API docs: https://ai.google.dev/gemini-api/docs/file-search
129
+ # Try with full path format first, then fallback to just ID if needed
130
+ store_name_to_try = self.store_name
131
+
132
  try:
133
+ # Try the documented format first with full path
134
  response = self.client.models.generate_content(
135
  model=model,
136
  contents=full_query,
 
138
  tools=[
139
  types.Tool(
140
  file_search=types.FileSearch(
141
+ file_search_store_names=[store_name_to_try]
142
  )
143
  )
144
  ]
145
  )
146
  )
147
+ except Exception as api_error:
148
+ error_str = str(api_error).lower()
149
+ # If format error, try with just the ID (without fileSearchStores/ prefix)
150
+ if 'format' in error_str or 'invalid' in error_str or 'too long' in error_str:
151
+ logger.warning(f"Full path format failed, trying with just store ID: {api_error}")
152
+ # Extract just the ID part
153
+ if store_name_to_try.startswith("fileSearchStores/"):
154
+ store_id = store_name_to_try.split("/", 1)[1]
155
+ store_name_to_try = store_id
156
+
157
+ try:
158
+ response = self.client.models.generate_content(
159
+ model=model,
160
+ contents=full_query,
161
+ config=types.GenerateContentConfig(
162
+ tools=[
163
+ types.Tool(
164
+ file_search=types.FileSearch(
165
+ file_search_store_names=[store_name_to_try]
166
+ )
167
+ )
168
+ ]
169
+ )
170
+ )
171
+ except Exception as e2:
172
+ raise Exception(f"Failed to call Gemini API with both formats. Full path error: {api_error}, ID-only error: {e2}")
173
+ else:
174
+ # Try alternative dict format
175
+ logger.warning(f"Primary API format failed, trying alternative: {api_error}")
176
+ try:
177
+ response = self.client.models.generate_content(
178
+ model=model,
179
+ contents=full_query,
180
+ tools=[{
181
+ "file_search": {
182
+ "file_search_store_names": [store_name_to_try]
183
+ }
184
+ }]
185
+ )
186
+ except Exception as e2:
187
+ raise Exception(f"Failed to call Gemini API: {e2}")
188
 
189
  # Extract answer
190
  answer = ""
 
209
  sources = []
210
  grounding_metadata = None
211
 
212
+ logger.info(f"πŸ” Extracting sources from Gemini response...")
213
+
214
  if hasattr(response, 'candidates') and response.candidates:
215
  candidate = response.candidates[0]
216
+ logger.info(f" Found candidate, checking for grounding_metadata...")
217
 
218
  # Get grounding metadata
219
  if hasattr(candidate, 'grounding_metadata'):
220
  grounding_metadata = candidate.grounding_metadata
221
+ logger.info(f" Found grounding_metadata: {type(grounding_metadata)}")
222
 
223
  # Extract source documents from grounding metadata
224
  # Handle different response formats
225
  grounding_chunks = None
226
  if hasattr(grounding_metadata, 'grounding_chunks'):
227
  grounding_chunks = grounding_metadata.grounding_chunks
228
+ logger.info(f" Found grounding_chunks (attr): {len(grounding_chunks) if grounding_chunks else 0}")
229
  elif isinstance(grounding_metadata, dict) and 'grounding_chunks' in grounding_metadata:
230
  grounding_chunks = grounding_metadata['grounding_chunks']
231
+ logger.info(f" Found grounding_chunks (dict): {len(grounding_chunks) if grounding_chunks else 0}")
232
+ elif hasattr(grounding_metadata, '__dict__'):
233
+ # Try to access as object attributes
234
+ metadata_dict = grounding_metadata.__dict__
235
+ if 'grounding_chunks' in metadata_dict:
236
+ grounding_chunks = metadata_dict['grounding_chunks']
237
+ logger.info(f" Found grounding_chunks (__dict__): {len(grounding_chunks) if grounding_chunks else 0}")
238
 
239
  if grounding_chunks:
240
+ logger.info(f" Processing {len(grounding_chunks)} grounding chunks...")
241
+ for idx, chunk in enumerate(grounding_chunks):
242
  # Handle both object and dict formats
243
  try:
244
  if isinstance(chunk, dict):
 
262
  text = chunk_info.get('text', '') if isinstance(chunk_info, dict) else ''
263
  file_name = chunk_info.get('file_name', '') if isinstance(chunk_info, dict) else ''
264
 
265
+ # Try to extract file URI and parse metadata from it
266
+ file_uri = chunk_info.get('file_uri', '') if isinstance(chunk_info, dict) else ''
267
+
268
+ # Also check for 'web' attribute (GroundingChunkData structure)
269
+ if hasattr(chunk, 'web') and chunk.web:
270
+ web_data = chunk.web
271
+ file_uri = getattr(web_data, 'file_uri', '') or file_uri
272
+ file_name = getattr(web_data, 'title', '') or getattr(web_data, 'filename', '') or file_name
273
+ text = getattr(web_data, 'text', '') or getattr(web_data, 'content', '') or text
274
+
275
+ # Check retrieved_context - this is where the actual data seems to be!
276
+ if hasattr(chunk, 'retrieved_context') and chunk.retrieved_context:
277
+ rc = chunk.retrieved_context
278
+ # Get text content
279
+ if hasattr(rc, 'text'):
280
+ text = getattr(rc, 'text', '') or text
281
+ # Get document name
282
+ if hasattr(rc, 'document_name'):
283
+ doc_name = getattr(rc, 'document_name', '')
284
+ if doc_name:
285
+ file_name = doc_name or file_name
286
+
287
+ # Fallback: Parse from string representation if we still don't have filename
288
+ if not file_name:
289
+ chunk_str = str(chunk)
290
+ import re
291
+ # Look for PDF filenames
292
+ pdf_match = re.search(r"([A-Za-z0-9\s_-]+\.pdf)", chunk_str)
293
+ if pdf_match:
294
+ file_name = pdf_match.group(1)
295
+ # Or look for title= pattern
296
+ if not file_name and 'title=' in chunk_str:
297
+ title_match = re.search(r"title=['\"]([^'\"]+)['\"]", chunk_str)
298
+ if title_match:
299
+ file_name = title_match.group(1)
300
+
301
+ if not file_name and file_uri:
302
+ # Extract filename from URI if available
303
+ file_name = file_uri.split('/')[-1] if '/' in file_uri else file_uri
304
+
305
  score_data = chunk_data.get('relevance_score', {})
306
  score = score_data.get('score', 0.0) if isinstance(score_data, dict) else 0.0
307
 
 
310
  "content": text,
311
  "filename": file_name,
312
  "score": score,
313
+ "file_uri": file_uri,
314
  }
315
  sources.append(source_info)
316
+ logger.info(f"📄 Extracted source {idx+1}: {file_name} (score: {score:.3f}, content length: {len(text)})")
317
  except Exception as e:
318
+ logger.warning(f"Error extracting chunk {idx+1} info: {e}")
319
+ import traceback
320
+ logger.debug(traceback.format_exc())
321
  continue
322
+ else:
323
+ logger.warning(f" No grounding_chunks found in grounding_metadata")
324
+ else:
325
+ logger.warning(f" Candidate does not have grounding_metadata attribute")
326
+
327
+ # Also try to get file references from other parts of the response
328
+ # Sometimes Gemini includes file references in the response itself
329
+ if not sources or len(sources) == 0:
330
+ logger.info(f" No sources from grounding_metadata, trying alternative extraction...")
331
+ # Check if response has file references in other attributes
332
+ if hasattr(candidate, 'content') and candidate.content:
333
+ if hasattr(candidate.content, 'parts'):
334
+ for part in candidate.content.parts:
335
+ if hasattr(part, 'file_data'):
336
+ file_data = part.file_data
337
+ if hasattr(file_data, 'file_uri') or (isinstance(file_data, dict) and 'file_uri' in file_data):
338
+ file_uri = getattr(file_data, 'file_uri', None) or (file_data.get('file_uri') if isinstance(file_data, dict) else None)
339
+ if file_uri:
340
+ file_name = file_uri.split('/')[-1] if '/' in file_uri else file_uri
341
+ sources.append({
342
+ "content": "",
343
+ "filename": file_name,
344
+ "score": 0.0,
345
+ "file_uri": file_uri,
346
+ })
347
+ logger.info(f"📄 Extracted source from file_data: {file_name}")
348
+
349
+ logger.info(f"✅ Total sources extracted: {len(sources)}")
350
 
351
  return GeminiFileSearchResult(
352
  answer=answer,
 
374
  formatted_sources = []
375
 
376
  for i, source in enumerate(result.sources):
377
+ filename = source.get("filename", "Unknown")
378
+
379
+ # Try to extract metadata from filename (e.g., "Kalangala DLG Report of Auditor General 2021.pdf")
380
+ year = None
381
+ district = None
382
+ source_name = "Gemini File Search"
383
+
384
+ # Parse filename for year
385
+ import re
386
+ year_match = re.search(r'\b(20\d{2})\b', filename)
387
+ if year_match:
388
+ year = int(year_match.group(1))
389
+
390
+ # Parse filename for district/source
391
+ if "Kalangala" in filename:
392
+ district = "Kalangala"
393
+ source_name = "Kalangala DLG"
394
+ elif "Gulu" in filename:
395
+ district = "Gulu"
396
+ source_name = "Gulu DLG"
397
+ elif "KCCA" in filename:
398
+ district = "Kampala"
399
+ source_name = "KCCA"
400
+ elif "MAAIF" in filename:
401
+ source_name = "MAAIF"
402
+ elif "MWTS" in filename:
403
+ source_name = "MWTS"
404
+ elif "Consolidated" in filename:
405
+ source_name = "Consolidated"
406
+
407
  # Create a Document object compatible with existing code
408
  doc = Document(
409
  page_content=source.get("content", ""),
410
  metadata={
411
+ "filename": filename,
412
+ "source": source_name,
413
  "score": source.get("score"),
414
  "chunk_index": i,
415
+ "page": None, # Gemini doesn't provide page numbers
416
+ "year": year,
417
+ "district": district,
418
+ "chunk_id": f"gemini_{i}",
419
+ "_id": f"gemini_{i}",
420
  }
421
  )
422
  formatted_sources.append(doc)
423
+ logger.info(f"📋 Formatted source {i+1}: {filename} ({year}, {source_name})")
424
 
425
+ logger.info(f"✅ Formatted {len(formatted_sources)} sources for display")
426
  return formatted_sources
427
 
src/ui_components/components.py CHANGED
@@ -57,7 +57,7 @@ def display_chunk_statistics_charts(stats: Dict[str, Any], title: str = "Retriev
57
  color_continuous_scale='viridis'
58
  )
59
  fig_source.update_layout(height=400, showlegend=False)
60
- st.plotly_chart(fig_source, use_container_width=True)
61
 
62
  with col2:
63
  # Year distribution chart
@@ -84,7 +84,7 @@ def display_chunk_statistics_charts(stats: Dict[str, Any], title: str = "Retriev
84
  # Ensure years are treated as categorical (discrete) not continuous
85
  fig_year.update_xaxes(type='category')
86
  fig_year.update_layout(height=400, showlegend=False)
87
- st.plotly_chart(fig_year, use_container_width=True)
88
  else:
89
  st.info("No valid years found in the results")
90
 
@@ -109,7 +109,7 @@ def display_chunk_statistics_charts(stats: Dict[str, Any], title: str = "Retriev
109
  color_continuous_scale='blues'
110
  )
111
  fig_district.update_layout(height=400, showlegend=False)
112
- st.plotly_chart(fig_district, use_container_width=True)
113
  else:
114
  st.info("No valid districts found in the results")
115
 
@@ -144,7 +144,7 @@ def display_chunk_statistics_table(stats: Dict[str, Any], title: str = "Retrieva
144
  "Count": list(district_dist_filtered.values())
145
  }
146
  district_df = pd.DataFrame(district_data).sort_values('Count', ascending=False)
147
- st.dataframe(district_df, hide_index=True, use_container_width=True)
148
  else:
149
  st.write("No district data")
150
  else:
@@ -158,7 +158,7 @@ def display_chunk_statistics_table(stats: Dict[str, Any], title: str = "Retrieva
158
  "Count": list(stats['source_distribution'].values())
159
  }
160
  source_df = pd.DataFrame(source_data).sort_values('Count', ascending=False)
161
- st.dataframe(source_df, hide_index=True, use_container_width=True)
162
  else:
163
  st.write("No source data")
164
 
@@ -175,7 +175,7 @@ def display_chunk_statistics_table(stats: Dict[str, Any], title: str = "Retrieva
175
  # Sort by year as integer but display as string
176
  year_df['Year_Int'] = year_df['Year'].astype(int)
177
  year_df = year_df.sort_values('Year_Int')[['Year', 'Count']]
178
- st.dataframe(year_df, hide_index=True, use_container_width=True)
179
  else:
180
  st.write("No year data")
181
  else:
@@ -193,7 +193,7 @@ def display_chunk_statistics_table(stats: Dict[str, Any], title: str = "Retrieva
193
  "Count": [c for f, c in filename_items[:5]]
194
  }
195
  file_df = pd.DataFrame(file_data)
196
- st.dataframe(file_df, hide_index=True, use_container_width=True)
197
  else:
198
  st.write("No file data")
199
 
 
57
  color_continuous_scale='viridis'
58
  )
59
  fig_source.update_layout(height=400, showlegend=False)
60
+ st.plotly_chart(fig_source, use_container_width=True) # Note: plotly_chart still uses use_container_width
61
 
62
  with col2:
63
  # Year distribution chart
 
84
  # Ensure years are treated as categorical (discrete) not continuous
85
  fig_year.update_xaxes(type='category')
86
  fig_year.update_layout(height=400, showlegend=False)
87
+ st.plotly_chart(fig_year, use_container_width=True) # Note: plotly_chart still uses use_container_width
88
  else:
89
  st.info("No valid years found in the results")
90
 
 
109
  color_continuous_scale='blues'
110
  )
111
  fig_district.update_layout(height=400, showlegend=False)
112
+ st.plotly_chart(fig_district, use_container_width=True) # Note: plotly_chart still uses use_container_width
113
  else:
114
  st.info("No valid districts found in the results")
115
 
 
144
  "Count": list(district_dist_filtered.values())
145
  }
146
  district_df = pd.DataFrame(district_data).sort_values('Count', ascending=False)
147
+ st.dataframe(district_df, hide_index=True, width='stretch')
148
  else:
149
  st.write("No district data")
150
  else:
 
158
  "Count": list(stats['source_distribution'].values())
159
  }
160
  source_df = pd.DataFrame(source_data).sort_values('Count', ascending=False)
161
+ st.dataframe(source_df, hide_index=True, width='stretch')
162
  else:
163
  st.write("No source data")
164
 
 
175
  # Sort by year as integer but display as string
176
  year_df['Year_Int'] = year_df['Year'].astype(int)
177
  year_df = year_df.sort_values('Year_Int')[['Year', 'Count']]
178
+ st.dataframe(year_df, hide_index=True, width='stretch')
179
  else:
180
  st.write("No year data")
181
  else:
 
193
  "Count": [c for f, c in filename_items[:5]]
194
  }
195
  file_df = pd.DataFrame(file_data)
196
+ st.dataframe(file_df, hide_index=True, width='stretch')
197
  else:
198
  st.write("No file data")
199