.gitignore ADDED
@@ -0,0 +1,113 @@
+ # ==========================================
+ # PYTHON
+ # ==========================================
+ __pycache__/
+ *.py[cod]
+ *.pyo
+ *.pyd
+ *$py.class
+ 
+ # Virtual environments
+ .venv/
+ venv/
+ env/
+ ENV/
+ .conda/
+ .venv*/
+ 
+ # Byte-compiled / optimized / DLL files
+ *.so
+ *.dll
+ *.dylib
+ 
+ # Logs and debug
+ *.log
+ *.out
+ *.err
+ logs/
+ debug/
+ *.sqlite3
+ 
+ # ==========================================
+ # BUILD / PACKAGING
+ # ==========================================
+ build/
+ dist/
+ *.egg-info/
+ .eggs/
+ pip-wheel-metadata/
+ .wheels/
+ 
+ # ==========================================
+ # JUPYTER / NOTEBOOKS
+ # ==========================================
+ .ipynb_checkpoints/
+ *.ipynb_convert/
+ 
+ # ==========================================
+ # DATA / MODELS / CACHE
+ # ==========================================
+ data/
+ datasets/
+ .cache/
+ *.ckpt
+ *.h5
+ *.hdf5
+ *.tflite
+ *.onnx
+ *.pth
+ *.pt
+ *.joblib
+ *.pkl
+ *.pickle
+ *.npz
+ *.npy
+ outputs/
+ artifacts/
+ checkpoints/
+ runs/
+ wandb/
+ mlruns/
+ lightning_logs/
+ 
+ # Hugging Face (repo-local caches only; user-level ~/.cache lives outside
+ # the repository, so it never needs ignore rules here)
+ huggingface/
+ 
+ # ==========================================
+ # EDITORS / TOOLS
+ # ==========================================
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *.bak
+ .DS_Store
+ Thumbs.db
+ 
+ # ==========================================
+ # ENV FILES / CREDENTIALS
+ # ==========================================
+ .env
+ .env.*
+ *.env.local
+ secrets.*
+ config.json
+ token.json
+ 
+ # ==========================================
+ # TESTS / TEMP FILES
+ # ==========================================
+ __tests__/
+ .tox/
+ .coverage
+ .pytest_cache/
+ tmp/
+ temp/
+ *.tmp
+ *.temp
+ 
+ 
+ local_*
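+ 
+ # Tip: `git check-ignore -v <path>` reports which rule above matches a path.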
Dockerfile CHANGED
@@ -59,6 +59,9 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
  CMD curl --fail http://localhost:8501/_stcore/health || exit 1
  
  # temp development commands
+ RUN pip3 install plotly
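+ # NOTE: installing plotly ad hoc here is a stopgap; pinning it in
+ # requirements.txt would keep the image build reproducible.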
  # RUN mkdir /app/conversations && chmod -R 777 conversations
  # RUN mkdir /app/feedback && chmod -R 777 feedback
  
add_district_metadata.py ADDED
@@ -0,0 +1,400 @@
+ #!/usr/bin/env python3
+ """
+ Script to add District metadata to Qdrant chunks based on filename analysis.
+ Handles Uganda districts, ministry mappings, and LLM inference for ambiguous cases.
+ """
+ import re
+ import yaml
+ import logging
+ from dataclasses import dataclass
+ from typing import Dict, List, Optional
+ 
+ 
+ from qdrant_client import QdrantClient
+ 
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+ 
+ 
+ @dataclass
+ class DistrictMapping:
+     """Mapping for district-related entities"""
+     name: str
+     aliases: List[str]
+     is_district: bool = True
+ 
+ 
+ class DistrictMetadataProcessor:
+     def __init__(self, config_path: str = "src/config/settings.yaml"):
+         # Load config manually
+         with open(config_path, 'r') as f:
+             self.config = yaml.safe_load(f)
+ 
+         # Qdrant and LLM clients are created lazily, on first use
+         self.llm_client = None
+         self.qdrant_client = None
+         self.collection_name = self.config["qdrant"]["collection_name"]
+ 
+         # Initialize district mappings
+         self.district_mappings = self._initialize_district_mappings()
+         self.ministry_mappings = self._initialize_ministry_mappings()
+ 
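+     # Expected settings.yaml shape (inferred from the lookups above):
+     #   qdrant:
+     #     url: "<qdrant endpoint>"
+     #     api_key: "<key>"
+     #     collection_name: "<collection>"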
+     def _initialize_district_mappings(self) -> Dict[str, DistrictMapping]:
+         """Initialize Uganda districts and their aliases"""
+         districts = [
+             # Central Region
+             DistrictMapping("Kampala", ["KCCA", "Kampala Capital City Authority"]),
+             DistrictMapping("Wakiso", ["Wakiso"]),
+             DistrictMapping("Mukono", ["Mukono"]),
+             DistrictMapping("Luweero", ["Luweero"]),
+             DistrictMapping("Nakaseke", ["Nakaseke"]),
+             DistrictMapping("Nakasongola", ["Nakasongola"]),
+             DistrictMapping("Kayunga", ["Kayunga"]),
+             DistrictMapping("Buikwe", ["Buikwe"]),
+             DistrictMapping("Buvuma", ["Buvuma"]),
+ 
+             # Northern Region
+             DistrictMapping("Gulu", ["Gulu", "Gulu DLG"]),
+             DistrictMapping("Kitgum", ["Kitgum"]),
+             DistrictMapping("Pader", ["Pader"]),
+             DistrictMapping("Agago", ["Agago"]),
+             DistrictMapping("Lamwo", ["Lamwo"]),
+             DistrictMapping("Nwoya", ["Nwoya"]),
+             DistrictMapping("Amuru", ["Amuru"]),
+             DistrictMapping("Omoro", ["Omoro"]),
+             DistrictMapping("Oyam", ["Oyam"]),
+             DistrictMapping("Kole", ["Kole"]),
+             DistrictMapping("Apac", ["Apac", "Apac District"]),
+             DistrictMapping("Lira", ["Lira"]),
+             DistrictMapping("Alebtong", ["Alebtong"]),
+             DistrictMapping("Amolatar", ["Amolatar"]),
+             DistrictMapping("Dokolo", ["Dokolo"]),
+             DistrictMapping("Otuke", ["Otuke"]),
+             DistrictMapping("Kwania", ["Kwania"]),
+ 
+             # Eastern Region
+             DistrictMapping("Jinja", ["Jinja"]),
+             DistrictMapping("Kamuli", ["Kamuli"]),
+             DistrictMapping("Iganga", ["Iganga"]),
+             DistrictMapping("Bugiri", ["Bugiri"]),
+             DistrictMapping("Mayuge", ["Mayuge"]),
+             DistrictMapping("Namayingo", ["Namayingo"]),
+             DistrictMapping("Busia", ["Busia"]),
+             DistrictMapping("Tororo", ["Tororo"]),
+             DistrictMapping("Pallisa", ["Pallisa"]),
+             DistrictMapping("Kumi", ["Kumi"]),
+             DistrictMapping("Bukedea", ["Bukedea"]),
+             DistrictMapping("Soroti", ["Soroti"]),
+             DistrictMapping("Serere", ["Serere"]),
+             DistrictMapping("Ngora", ["Ngora"]),
+             DistrictMapping("Kaberamaido", ["Kaberamaido"]),
+             DistrictMapping("Kalaki", ["Kalaki"]),
+             DistrictMapping("Kapelebyong", ["Kapelebyong"]),
+             DistrictMapping("Amuria", ["Amuria"]),
+             DistrictMapping("Katakwi", ["Katakwi"]),
+             DistrictMapping("Kotido", ["Kotido"]),
+             DistrictMapping("Abim", ["Abim"]),
+             DistrictMapping("Kaabong", ["Kaabong", "Kaabong District"]),
+             DistrictMapping("Karenga", ["Karenga"]),
+             DistrictMapping("Moroto", ["Moroto"]),
+             DistrictMapping("Napak", ["Napak"]),
+             DistrictMapping("Nabilatuk", ["Nabilatuk"]),
+             DistrictMapping("Amudat", ["Amudat"]),
+             DistrictMapping("Nakapiripirit", ["Nakapiripirit"]),
+             DistrictMapping("Bukwo", ["Bukwo"]),
+             DistrictMapping("Kween", ["Kween"]),
+             DistrictMapping("Kapchorwa", ["Kapchorwa"]),
+             DistrictMapping("Sironko", ["Sironko"]),
+             DistrictMapping("Manafwa", ["Manafwa"]),
+             DistrictMapping("Bududa", ["Bududa"]),
+             DistrictMapping("Mbale", ["Mbale"]),
+             DistrictMapping("Butaleja", ["Butaleja"]),
+             DistrictMapping("Namisindwa", ["Namisindwa"]),
+             DistrictMapping("Bulambuli", ["Bulambuli"]),
+ 
+             # Western Region
+             DistrictMapping("Masaka", ["Masaka"]),
+             DistrictMapping("Kalungu", ["Kalungu"]),
+             DistrictMapping("Bukomansimbi", ["Bukomansimbi"]),
+             DistrictMapping("Lwengo", ["Lwengo"]),
+             DistrictMapping("Sembabule", ["Sembabule"]),
+             DistrictMapping("Rakai", ["Rakai"]),
+             DistrictMapping("Kyotera", ["Kyotera"]),
+             DistrictMapping("Mpigi", ["Mpigi"]),
+             DistrictMapping("Butambala", ["Butambala"]),
+             DistrictMapping("Gomba", ["Gomba"]),
+             DistrictMapping("Mityana", ["Mityana"]),
+             DistrictMapping("Mubende", ["Mubende"]),
+             DistrictMapping("Kassanda", ["Kassanda"]),
+             DistrictMapping("Kiboga", ["Kiboga"]),
+             DistrictMapping("Kyankwanzi", ["Kyankwanzi"]),
+             DistrictMapping("Hoima", ["Hoima"]),
+             DistrictMapping("Kikuube", ["Kikuube"]),
+             DistrictMapping("Kakumiro", ["Kakumiro"]),
+             DistrictMapping("Kibaale", ["Kibaale"]),
+             DistrictMapping("Kagadi", ["Kagadi"]),
+             DistrictMapping("Buliisa", ["Buliisa"]),
+             DistrictMapping("Masindi", ["Masindi"]),
+             DistrictMapping("Kiryandongo", ["Kiryandongo"]),
+             DistrictMapping("Pakwach", ["Pakwach"]),
+             DistrictMapping("Nebbi", ["Nebbi"]),
+             DistrictMapping("Zombo", ["Zombo"]),
+             DistrictMapping("Arua", ["Arua"]),
+             DistrictMapping("Terego", ["Terego"]),
+             DistrictMapping("Madi-Okollo", ["Madi-Okollo"]),
+             DistrictMapping("Obongi", ["Obongi"]),
+             DistrictMapping("Moyo", ["Moyo"]),
+             DistrictMapping("Yumbe", ["Yumbe"]),
+             DistrictMapping("Koboko", ["Koboko"]),
+             DistrictMapping("Maracha", ["Maracha"]),
+             DistrictMapping("Adjumani", ["Adjumani"]),
+ 
+             # South Western Region
+             DistrictMapping("Mbarara", ["Mbarara"]),
+             DistrictMapping("Ibanda", ["Ibanda"]),
+             DistrictMapping("Isingiro", ["Isingiro"]),
+             DistrictMapping("Kiruhura", ["Kiruhura"]),
+             DistrictMapping("Kazo", ["Kazo"]),
+             DistrictMapping("Ntungamo", ["Ntungamo"]),
+             DistrictMapping("Rwampara", ["Rwampara"]),
+             DistrictMapping("Rubanda", ["Rubanda"]),
+             DistrictMapping("Rukiga", ["Rukiga"]),
+             DistrictMapping("Kanungu", ["Kanungu"]),
+             DistrictMapping("Rukungiri", ["Rukungiri"]),
+             DistrictMapping("Kisoro", ["Kisoro"]),
+             DistrictMapping("Bundibugyo", ["Bundibugyo"]),
+             DistrictMapping("Ntoroko", ["Ntoroko"]),
+             DistrictMapping("Kasese", ["Kasese"]),
+             DistrictMapping("Bunyangabu", ["Bunyangabu"]),
+             DistrictMapping("Fort Portal", ["Fort Portal"]),
+             DistrictMapping("Kabarole", ["Kabarole"]),
+             DistrictMapping("Kyenjojo", ["Kyenjojo"]),
+             DistrictMapping("Kamwenge", ["Kamwenge"]),
+             DistrictMapping("Kitagwenda", ["Kitagwenda"]),
+             DistrictMapping("Kyegegwa", ["Kyegegwa"]),
+             DistrictMapping("Mitooma", ["Mitooma"]),
+             DistrictMapping("Rubirizi", ["Rubirizi"]),
+             DistrictMapping("Sheema", ["Sheema"]),
+             DistrictMapping("Bushenyi", ["Bushenyi"]),
+ 
+             # Special cases
+             DistrictMapping("Kalangala", ["Kalangala", "Kalangala DLG"]),
+         ]
+ 
+         # Create mapping dictionary
+         mapping_dict = {}
+         for district in districts:
+             mapping_dict[district.name.lower()] = district
+             for alias in district.aliases:
+                 mapping_dict[alias.lower()] = district
+         return mapping_dict
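+     # Lookup keys are lowercased, so matching is case-insensitive:
+     # e.g. mapping_dict["kcca"] resolves to DistrictMapping("Kampala", ...).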
+ 
+     def _initialize_ministry_mappings(self) -> Dict[str, str]:
+         """Initialize ministry and organization mappings"""
+         return {
+             "maaif": "Ministry of Agriculture, Animal Industry and Fisheries",
+             "mwts": "Ministry of Works and Transport",
+             "kcca": "Kampala Capital City Authority",
+             "oag": "Office of the Auditor General",
+             "arsdp": "Albertine Regional Sustainable Development Project",
+             "avcdp": "Agriculture Value Chain Development Project",
+             "ida": "International Development Association",
+             "dlg": "District Local Government",
+             "lg": "Local Government",
+         }
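+     # When no district name matched first, any of these keys in a filename
+     # marks it as a ministry/organization report (district=None); e.g. a
+     # hypothetical "OAG Report 2019.pdf" matches "oag".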
+ 
+     def _extract_district_from_filename(self, filename: str) -> Optional[str]:
+         """Extract district from filename using pattern matching"""
+         filename_lower = filename.lower()
+ 
+         # Check for explicit district mentions
+         for key, district_mapping in self.district_mappings.items():
+             if key in filename_lower:
+                 return district_mapping.name
+ 
+         # Check for ministry/organization patterns that are NOT districts
+         for ministry_key in self.ministry_mappings.keys():
+             if ministry_key in filename_lower:
+                 return None  # This is a ministry, not a district
+ 
+         # Check for patterns like "District Local Government"
+         district_pattern = r'(\w+)\s+district\s+local\s+government'
+         match = re.search(district_pattern, filename_lower)
+         if match:
+             district_name = match.group(1).title()
+             if district_name.lower() in self.district_mappings:
+                 return self.district_mappings[district_name.lower()].name
+ 
+         # Check for patterns like "DLG Report"
+         dlg_pattern = r'(\w+)\s+dlg\s+report'
+         match = re.search(dlg_pattern, filename_lower)
+         if match:
+             district_name = match.group(1).title()
+             if district_name.lower() in self.district_mappings:
+                 return self.district_mappings[district_name.lower()].name
+ 
+         return None
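+     # Example resolutions (hypothetical filenames):
+     #   "Gulu DLG Report 2021.pdf"     -> "Gulu"  (district/alias substring hit)
+     #   "MAAIF Annual Report 2020.pdf" -> None    (ministry key, not a district)
+     # Note: bare substring matching can produce false positives (e.g. "apac"
+     # inside "capacity"), so treat results as best-effort; the regex fallbacks
+     # above only fire for names already present in the mapping table.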
+ 
+     def _infer_district_with_llm(self, filename: str) -> Optional[str]:
+         """Use LLM to infer district from filename when pattern matching fails"""
+         # For now, return None - LLM integration can be added later
+         logger.info(f"LLM inference needed for filename: {filename}")
+         return None
+ 
+     def infer_district(self, filename: str) -> Optional[str]:
+         """Main method to infer district from filename"""
+         # First try pattern matching
+         district = self._extract_district_from_filename(filename)
+         if district:
+             return district
+ 
+         # If pattern matching fails, use LLM
+         return self._infer_district_with_llm(filename)
+ 
+     def fetch_chunks_batch(self, batch_size: int = 100, offset=None):
+         """Fetch one page of chunks from Qdrant (payload only, no vectors).
+ 
+         Returns (points, next_offset); next_offset is None after the last page.
+         """
+         try:
+             # Create the Qdrant client on first use
+             if self.qdrant_client is None:
+                 self.qdrant_client = QdrantClient(
+                     url=self.config["qdrant"]["url"],
+                     api_key=self.config["qdrant"]["api_key"]
+                 )
+ 
+             # scroll() pages with an opaque cursor (a point ID), not a numeric
+             # offset, so we pass along the cursor from the previous call
+             points, next_offset = self.qdrant_client.scroll(
+                 collection_name=self.collection_name,
+                 limit=batch_size,
+                 offset=offset,
+                 with_payload=True,
+                 with_vectors=False
+             )
+             return points, next_offset
+         except Exception as e:
+             logger.error(f"Failed to fetch batch: {e}")
+             return [], None
+ 
+     def update_chunks_with_district(self, points: List) -> int:
+         """Update chunks with district metadata"""
+         updated_count = 0
+ 
+         # Create the Qdrant client on first use
+         if self.qdrant_client is None:
+             self.qdrant_client = QdrantClient(
+                 url=self.config["qdrant"]["url"],
+                 api_key=self.config["qdrant"]["api_key"]
+             )
+ 
+         for point in points:
+             point_id = getattr(point, "id", None)
+             try:
+                 metadata = point.payload.get("metadata", {})
+                 filename = metadata.get("filename", "")
+ 
+                 if not filename:
+                     logger.warning(f"Point {point_id} has no filename")
+                     continue
+ 
+                 # Infer district
+                 district = self.infer_district(filename)
+ 
+                 # Update metadata
+                 updated_metadata = metadata.copy()
+                 updated_metadata["district"] = district
+ 
+                 # Update point in Qdrant
+                 self.qdrant_client.set_payload(
+                     collection_name=self.collection_name,
+                     payload={"metadata": updated_metadata},
+                     points=[point_id]
+                 )
+ 
+                 updated_count += 1
+                 logger.info(f"Updated point {point_id}: {filename} -> {district}")
+ 
+             except Exception as e:
+                 logger.error(f"Failed to update point {point_id}: {e}")
+ 
+         return updated_count
+ 
+     def process_all_chunks(self, batch_size: int = 100):
+         """Process all chunks in batches"""
+         total_updated = 0
+         offset = None  # scroll cursor; None means start from the beginning
+ 
+         logger.info(f"Starting to process chunks in batches of {batch_size}")
+ 
+         while True:
+             # Fetch batch
+             points, next_offset = self.fetch_chunks_batch(batch_size, offset)
+             if not points:
+                 break
+ 
+             logger.info(f"Processing batch: {len(points)} points")
+ 
+             # Update batch
+             updated_count = self.update_chunks_with_district(points)
+             total_updated += updated_count
+ 
+             logger.info(f"Updated {updated_count} points in this batch")
+ 
+             # Move to the next page; the cursor comes back None after the last one
+             if next_offset is None:
+                 break
+             offset = next_offset
+ 
+         logger.info(f"Total updated: {total_updated} points")
+         return total_updated
+ 
+ 
+ def main():
+     """Main function to run the district metadata processor"""
+     try:
+         processor = DistrictMetadataProcessor()
+ 
+         # Test with a small batch first
+         logger.info("Testing with first 10 chunks...")
+         test_points, _ = processor.fetch_chunks_batch(10)
+ 
+         if test_points:
+             logger.info("Test batch fetched successfully. Processing...")
+             for point in test_points:
+                 filename = point.payload.get("metadata", {}).get("filename", "")
+                 district = processor.infer_district(filename)
+                 logger.info(f"Test: {filename} -> {district}")
+ 
+             # Ask user if they want to proceed with full processing
+             response = input("\nProceed with full processing? (y/n): ")
+             if response.lower() == 'y':
+                 processor.process_all_chunks(batch_size=100)
+             else:
+                 logger.info("Processing cancelled by user")
+ 
+     except Exception as e:
+         logger.error(f"Error in main: {e}")
+         raise
+ 
+ 
+ if __name__ == "__main__":
+     main()
app.py CHANGED
@@ -3,7 +3,36 @@ Intelligent Audit Report Chatbot UI
 """
 
 import os
- import sys
+ 
+ import time
+ import json
+ import uuid
+ import logging
+ import traceback
+ from pathlib import Path
+ 
+ from collections import Counter
+ from typing import List, Dict, Any, Optional
+ 
+ 
+ import pandas as pd
+ import streamlit as st
+ import plotly.express as px
+ from langchain_core.messages import HumanMessage, AIMessage
+ 
+ 
+ from src.agents import get_multi_agent_chatbot, get_smart_chatbot, get_gemini_chatbot
+ from src.feedback import FeedbackManager
+ from src.ui_components import get_custom_css, display_chunk_statistics_charts, display_chunk_statistics_table, extract_chunk_statistics
+ 
+ from src.config.paths import (
+     IS_DEPLOYED,
+     PROJECT_DIR,
+     HF_CACHE_DIR,
+     FEEDBACK_DIR,
+     CONVERSATIONS_DIR,
+ )
+ 
 
 # ===== CRITICAL: Fix OMP_NUM_THREADS FIRST, before ANY other imports =====
 # Some libraries load at import time and will fail if OMP_NUM_THREADS is invalid
@@ -29,42 +58,33 @@ except (ValueError, TypeError):
 
 # ===== Setup HuggingFace cache directories BEFORE any model imports =====
 # CRITICAL: Set these before any imports that might use HuggingFace (like sentence-transformers)
- # This ensures models downloaded during Docker build are found at runtime
- cache_dir = "/app/.cache/huggingface"
- os.environ["HF_HOME"] = cache_dir
- os.environ["TRANSFORMERS_CACHE"] = cache_dir
- os.environ["HF_DATASETS_CACHE"] = cache_dir
- os.environ["HF_HUB_CACHE"] = cache_dir
- os.environ["SENTENCE_TRANSFORMERS_HOME"] = cache_dir
- 
- # Ensure cache directory exists (created in Dockerfile, but ensure it's there)
- try:
-     os.makedirs(cache_dir, mode=0o755, exist_ok=True)
- except (PermissionError, OSError) as e:
-     # If we can't create it, log but continue (might already exist from Dockerfile)
-     # HuggingFace will try to create subdirectories, but we need parent to exist
-     pass
- 
- import time
- import json
- import uuid
- import logging
- from pathlib import Path
- 
- import argparse
- import streamlit as st
- from langchain_core.messages import HumanMessage, AIMessage
- 
- from multi_agent_chatbot import get_multi_agent_chatbot
- from smart_chatbot import get_chatbot as get_smart_chatbot
- from src.reporting.feedback_schema import create_feedback_from_dict
+ # Only override cache directories in deployed environment (local uses defaults)
+ if IS_DEPLOYED and HF_CACHE_DIR:
+     cache_dir = str(HF_CACHE_DIR)
+     os.environ["HF_HOME"] = cache_dir
+     os.environ["TRANSFORMERS_CACHE"] = cache_dir
+     os.environ["HF_DATASETS_CACHE"] = cache_dir
+     os.environ["HF_HUB_CACHE"] = cache_dir
+     os.environ["SENTENCE_TRANSFORMERS_HOME"] = cache_dir
+ 
+     # Ensure cache directory exists (created in Dockerfile, but ensure it's there)
+     try:
+         os.makedirs(cache_dir, mode=0o755, exist_ok=True)
+     except (PermissionError, OSError):
+         # If we can't create it, log but continue (might already exist from Dockerfile)
+         pass
+ else:
+     from dotenv import load_dotenv
+     load_dotenv()
 
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 
 # Log environment setup for debugging
- logger.info(f"πŸ“ HuggingFace cache directory: {os.environ.get('HF_HOME', 'NOT SET')}")
+ logger.info(f"🌍 Environment: {'DEPLOYED' if IS_DEPLOYED else 'LOCAL'}")
+ logger.info(f"πŸ“ PROJECT_DIR: {PROJECT_DIR}")
+ logger.info(f"πŸ“ HuggingFace cache: {os.environ.get('HF_HOME', 'DEFAULT (not overridden)')}")
 logger.info(f"πŸ”§ OMP_NUM_THREADS: {os.environ.get('OMP_NUM_THREADS', 'NOT SET')}")
 
 
@@ -76,84 +96,9 @@ st.set_page_config(
     page_title="Intelligent Audit Report Chatbot"
 )
 
- # Custom CSS
- st.markdown("""
- <style>
-     .main-header {
-         font-size: 2.5rem;
-         font-weight: bold;
-         color: #1f77b4;
-         text-align: center;
-         margin-bottom: 1rem;
-     }
- 
-     .subtitle {
-         font-size: 1.2rem;
-         color: #666;
-         text-align: center;
-         margin-bottom: 2rem;
-     }
- 
-     .session-info {
-         background-color: #f0f2f6;
-         padding: 10px;
-         border-radius: 5px;
-         margin-bottom: 20px;
-         font-size: 0.9rem;
-     }
- 
-     .user-message {
-         background-color: #007bff;
-         color: white;
-         padding: 12px 16px;
-         border-radius: 18px 18px 4px 18px;
-         margin: 8px 0;
-         margin-left: 20%;
-         word-wrap: break-word;
-     }
- 
-     .bot-message {
-         background-color: #f1f3f4;
-         color: #333;
-         padding: 12px 16px;
-         border-radius: 18px 18px 18px 4px;
-         margin: 8px 0;
-         margin-right: 20%;
-         word-wrap: break-word;
-         border: 1px solid #e0e0e0;
-     }
- 
-     .filter-section {
-         margin-bottom: 20px;
-         padding: 15px;
-         background-color: #f8f9fa;
-         border-radius: 8px;
-         border: 1px solid #e9ecef;
-     }
- 
-     .filter-title {
-         font-weight: bold;
-         margin-bottom: 10px;
-         color: #495057;
-     }
- 
-     .feedback-section {
-         background-color: #f8f9fa;
-         padding: 20px;
-         border-radius: 10px;
-         margin-top: 30px;
-         border: 2px solid #dee2e6;
-     }
- 
-     .retrieval-history {
-         background-color: #ffffff;
-         padding: 15px;
-         border-radius: 5px;
-         margin: 10px 0;
-         border-left: 4px solid #007bff;
-     }
- </style>
- """, unsafe_allow_html=True)
+ 
+ st.markdown(get_custom_css(), unsafe_allow_html=True)
+ 
 
 def get_system_type():
     """Get the current system type"""
@@ -163,14 +108,17 @@ def get_system_type():
     else:
         return "Multi-Agent System"
 
- def get_chatbot():
-     """Initialize and return the chatbot based on system type"""
-     # Check environment variable for system type
-     system = os.environ.get('CHATBOT_SYSTEM', 'multi-agent')
-     if system == 'smart':
-         return get_smart_chatbot()
+ def get_chatbot(version: str = "v1"):
+     """Initialize and return the chatbot based on version"""
+     if version == "beta":
+         return get_gemini_chatbot()
     else:
-         return get_multi_agent_chatbot()
+         # Check environment variable for system type (v1)
+         system = os.environ.get('CHATBOT_SYSTEM', 'multi-agent')
+         if system == 'smart':
+             return get_smart_chatbot()
+         else:
+             return get_multi_agent_chatbot()
 
 def serialize_messages(messages):
     """Serialize LangChain messages to dictionaries"""
@@ -215,13 +163,18 @@ def serialize_documents(sources):
 
     return serialized
 
+ 
+ feedback_manager = FeedbackManager()
+ 
+ 
 @st.cache_data
 def load_filter_options():
     try:
-         with open("src/config/filter_options.json", "r") as f:
+         filter_options_path = PROJECT_DIR / "src" / "config" / "filter_options.json"
+         with open(filter_options_path, "r") as f:
             return json.load(f)
     except FileNotFoundError:
-         st.info([x for x in os.listdir() if x.endswith('.json')])
+         st.info(f"Looking for filter_options.json in: {PROJECT_DIR / 'src' / 'config'}")
         st.error("filter_options.json not found. Please run the metadata analysis script.")
         return {"sources": [], "years": [], "districts": [], 'filenames': []}
 
@@ -238,11 +191,48 @@ def main():
     # Track RAG retrieval history for feedback
     if 'rag_retrieval_history' not in st.session_state:
         st.session_state.rag_retrieval_history = []
-     # Initialize chatbot only once per app session (cached)
-     if 'chatbot' not in st.session_state:
-         with st.spinner("πŸ”„ Loading AI models and connecting to database..."):
-             st.session_state.chatbot = get_chatbot()
-             st.success("βœ… AI system ready!")
+     # Version selection (v1 or beta)
+     if 'chatbot_version' not in st.session_state:
+         st.session_state.chatbot_version = "v1"
+ 
+     # Initialize chatbot based on version (only if not already initialized for this version)
+     chatbot_version_key = f"chatbot_{st.session_state.chatbot_version}"
+ 
+     # Check if we need to initialize: chatbot doesn't exist OR version changed
+     needs_init = (
+         chatbot_version_key not in st.session_state or
+         st.session_state.get('_last_version') != st.session_state.chatbot_version
+     )
+ 
+     if needs_init:
+         try:
+             # Different spinner messages for different versions
+             if st.session_state.chatbot_version == "beta":
+                 spinner_msg = "πŸ”„ Initializing Gemini FSA"
+             else:
+                 spinner_msg = "πŸ”„ Loading AI models and connecting to database..."
+ 
+             with st.spinner(spinner_msg):
+                 st.session_state[chatbot_version_key] = get_chatbot(st.session_state.chatbot_version)
+                 st.session_state['_last_version'] = st.session_state.chatbot_version
+                 st.session_state.chatbot = st.session_state[chatbot_version_key]
+             print("βœ… AI system ready!")
+         except Exception as e:
+             st.error(f"❌ Failed to initialize chatbot: {str(e)}")
+             # Only show Gemini-specific error message for beta version
+             if st.session_state.chatbot_version == "beta":
+                 st.error("Please check your environment variables (GEMINI_API_KEY, GEMINI_FILESTORE_NAME for beta)")
+             else:
+                 st.error("Please check your configuration and ensure all required models and databases are accessible.")
+             # Reset to v1 to prevent infinite loop
+             st.session_state.chatbot_version = "v1"
+             st.session_state['_last_version'] = "v1"
+             if 'chatbot' in st.session_state:
+                 del st.session_state['chatbot']
+             st.stop()  # Stop execution to prevent infinite loop
+     else:
+         # Chatbot already initialized for this version, just use it
+         st.session_state.chatbot = st.session_state[chatbot_version_key]
 
     # Reset conversation history if needed (but keep chatbot cached)
     if 'reset_conversation' in st.session_state and st.session_state.reset_conversation:
@@ -254,17 +244,43 @@
         st.session_state.reset_conversation = False
         st.rerun()
 
-     # Header with system indicator
+     # Version selection radio button (top right)
     col1, col2 = st.columns([3, 1])
     with col1:
-         st.markdown('<h1 class="main-header">πŸ€– Intelligent Audit Report Chatbot</h1>', unsafe_allow_html=True)
+         st.markdown('<p class="subtitle">Ask questions about audit reports. Use the sidebar filters to narrow down your search!</p>', unsafe_allow_html=True)
     with col2:
-         system_type = get_system_type()
-         if "Multi-Agent" in system_type:
-             st.success(f"πŸ”§ {system_type}")
-         else:
-             st.info(f"πŸ”§ {system_type}")
-     st.markdown('<p class="subtitle">Ask questions about audit reports. Use the sidebar filters to narrow down your search!</p>', unsafe_allow_html=True)
+         st.markdown("<br>", unsafe_allow_html=True)  # Add some spacing
+         selected_version = st.radio(
+             "**Version:**",
+             options=["v1", "beta"],
+             index=0 if st.session_state.chatbot_version == "v1" else 1,
+             horizontal=True,
+             key="version_selector",
+             help="Select v1 (default RAG system) or beta (Gemini FSA)"
+         )
+ 
+         # Update version if changed
+         if selected_version != st.session_state.chatbot_version:
+             # Store the old version to check if we need to switch
+             old_version = st.session_state.chatbot_version
+             st.session_state.chatbot_version = selected_version
+ 
+             # If chatbot for new version already exists, just switch to it
+             new_chatbot_key = f"chatbot_{selected_version}"
+             if new_chatbot_key in st.session_state:
+                 # Chatbot already exists, just switch
+                 st.session_state.chatbot = st.session_state[new_chatbot_key]
+                 st.session_state['_last_version'] = selected_version
+             else:
+                 # Need to initialize new version - will be handled by initialization logic above
+                 st.session_state['_last_version'] = old_version  # Set to old to trigger init check
+ 
+             st.rerun()
+ 
+     # Show version info
+     if st.session_state.chatbot_version == "beta":
+         st.info("πŸ”¬ **Beta Mode**: Using Google Gemini FSA")
 
     # Session info
     duration = int(time.time() - st.session_state.session_start_time)
@@ -280,6 +296,34 @@
 
     # Sidebar for filters
     with st.sidebar:
+         # Instructions section (collapsible)
+         with st.expander("πŸ“– How to Use", expanded=False):
+             st.markdown("""
+             #### 🎯 Using Filters
+ 
+             1. **Select filters** from the sidebar to narrow your search
+ 
+             2. **Leave filters empty** to search across all data
+ 
+             3. **Type your question** in the chat input at the bottom
+ 
+             4. **Click "Send"** to submit your question
+ 
+             #### πŸ’‘ Tips
+ 
+             - Use specific questions for better results
+             - Combine multiple filters for precise searches
+             - Check the "Retrieved Documents" tab to see source material
+ 
+             #### ⚠️ Important
+ 
+             **When finished, please close the browser window** to free up computational resources.
+ 
+             ---
+ 
+             For more detailed help, see the example questions at the bottom of the page.
+             """)
+ 
         st.markdown("### πŸ” Search Filters")
         st.markdown("Select filters to narrow down your search. Leave empty to search all data.")
 
@@ -298,7 +342,7 @@
         # Determine if filename filter is active
         filename_mode = len(selected_filenames) > 0
         # Sources filter
-         st.markdown('<div class="filter-section">', unsafe_allow_html=True)
+         # st.markdown('<div class="filter-section">', unsafe_allow_html=True)
         st.markdown('<div class="filter-title">πŸ“Š Sources</div>', unsafe_allow_html=True)
         selected_sources = st.multiselect(
             "Select sources:",
@@ -311,7 +355,7 @@
         st.markdown('</div>', unsafe_allow_html=True)
 
         # Years filter
-         st.markdown('<div class="filter-section">', unsafe_allow_html=True)
+         # st.markdown('<div class="filter-section">', unsafe_allow_html=True)
         st.markdown('<div class="filter-title">πŸ“… Years</div>', unsafe_allow_html=True)
         selected_years = st.multiselect(
             "Select years:",
@@ -324,7 +368,7 @@
         st.markdown('</div>', unsafe_allow_html=True)
 
         # Districts filter
-         st.markdown('<div class="filter-section">', unsafe_allow_html=True)
+         # st.markdown('<div class="filter-section">', unsafe_allow_html=True)
         st.markdown('<div class="filter-title">🏘️ Districts</div>', unsafe_allow_html=True)
         selected_districts = st.multiselect(
             "Select districts:",
@@ -375,26 +419,37 @@
         if 'input_counter' not in st.session_state:
             st.session_state.input_counter = 0
 
+         # Handle pending question from example questions section
+         if 'pending_question' in st.session_state and st.session_state.pending_question:
+             default_value = st.session_state.pending_question
+             # Increment counter to force new input widget
+             st.session_state.input_counter = (st.session_state.get('input_counter', 0) + 1) % 1000
+             del st.session_state.pending_question
+             key_suffix = st.session_state.input_counter
+         else:
+             default_value = ""
+             key_suffix = st.session_state.input_counter
+ 
         user_input = st.text_input(
             "Type your message here...",
             placeholder="Ask about budget allocations, expenditures, or audit findings...",
-             key=f"user_input_{st.session_state.input_counter}",
-             label_visibility="collapsed"
+             key=f"user_input_{key_suffix}",
+             label_visibility="collapsed",
+             value=default_value if default_value else None
         )
 
     with col2:
-         send_button = st.button("Send", key="send_button", use_container_width=True)
+         send_button = st.button("Send", key="send_button", width='stretch')
 
     # Clear chat button
     if st.button("πŸ—‘οΈ Clear Chat", key="clear_chat_button"):
         st.session_state.reset_conversation = True
         # Clear all conversation files
-         import os
-         conversations_dir = "conversations"
-         if os.path.exists(conversations_dir):
-             for file in os.listdir(conversations_dir):
-                 if file.endswith('.json'):
-                     os.remove(os.path.join(conversations_dir, file))
+         conversations_path = CONVERSATIONS_DIR
+         if conversations_path.exists():
+             for file in conversations_path.iterdir():
+                 if file.suffix == '.json':
+                     file.unlink()
         st.rerun()
 
     # Handle user input
@@ -436,6 +491,36 @@
             if rag_result:
                 sources = rag_result.get('sources', []) if isinstance(rag_result, dict) else (rag_result.sources if hasattr(rag_result, 'sources') else [])
 
+                 # For Gemini, also check gemini_result for sources
+                 if not sources or len(sources) == 0:
+                     gemini_result = chat_result.get('gemini_result')
+                     print(f"πŸ” DEBUG: Checking gemini_result for sources...")
+                     print(f"   gemini_result exists: {gemini_result is not None}")
+                     if gemini_result:
+                         print(f"   gemini_result type: {type(gemini_result)}")
+                         print(f"   has sources attr: {hasattr(gemini_result, 'sources')}")
+                         if hasattr(gemini_result, 'sources'):
+                             print(f"   sources length: {len(gemini_result.sources) if gemini_result.sources else 0}")
+ 
+                     if gemini_result and hasattr(gemini_result, 'sources'):
+                         # Format Gemini sources for display
+                         if hasattr(st.session_state.chatbot, 'gemini_client'):
+                             sources = st.session_state.chatbot.gemini_client.format_sources_for_display(gemini_result)
+                             print(f"βœ… Formatted {len(sources)} sources from gemini_client")
+                         elif hasattr(st.session_state.chatbot, '_format_gemini_sources'):
+                             sources = st.session_state.chatbot._format_gemini_sources(gemini_result)
+                             print(f"βœ… Formatted {len(sources)} sources from _format_gemini_sources")
+ 
+                     # Update rag_result with sources if we found them
+                     if sources and len(sources) > 0:
+                         if isinstance(rag_result, dict):
+                             rag_result['sources'] = sources
+                         elif hasattr(rag_result, 'sources'):
+                             rag_result.sources = sources
+                         # Update last_rag_result with sources
+                         st.session_state.last_rag_result = rag_result
+                         print(f"βœ… Updated rag_result with {len(sources)} sources")
+ 
                 # Get the actual RAG query
                 actual_rag_query = chat_result.get('actual_rag_query', '')
                 if actual_rag_query:
@@ -445,12 +530,25 @@
                 else:
                     formatted_query = "No RAG query available"
 
+                 # Extract filters from active filters
+                 filters_used = {
+                     "sources": st.session_state.active_filters.get('sources', []),
+                     "years": st.session_state.active_filters.get('years', []),
+                     "districts": st.session_state.active_filters.get('districts', []),
+                     "filenames": st.session_state.active_filters.get('filenames', [])
+                 }
+ 
                 retrieval_entry = {
                     "conversation_up_to": serialize_messages(st.session_state.messages),
                     "rag_query_expansion": formatted_query,
-                     "docs_retrieved": serialize_documents(sources)
+                     "docs_retrieved": serialize_documents(sources),
+                     "filters_applied": filters_used,
+                     "timestamp": time.time()
                 }
                 st.session_state.rag_retrieval_history.append(retrieval_entry)
+ 
+                 # Debug logging
+                 print(f"πŸ“Š RETRIEVAL TRACKING: {len(sources)} sources stored in retrieval history")
             else:
                 response = chat_result
                 st.session_state.last_rag_result = None
@@ -480,6 +578,16 @@
             # Dictionary format from multi-agent system
             sources = rag_result['sources']
 
+             # For Gemini, also check if we need to format sources from gemini_result
+             if (not sources or len(sources) == 0) and isinstance(rag_result, dict):
+                 gemini_result = rag_result.get('gemini_result')
+                 if gemini_result and hasattr(gemini_result, 'sources'):
+                     # Format Gemini sources for display
+                     if hasattr(st.session_state.chatbot, 'gemini_client'):
+                         sources = st.session_state.chatbot.gemini_client.format_sources_for_display(gemini_result)
+                     elif hasattr(st.session_state.chatbot, '_format_gemini_sources'):
+                         sources = st.session_state.chatbot._format_gemini_sources(gemini_result)
+ 
             if sources and len(sources) > 0:
                 # Count unique filenames
                 unique_filenames = set()
@@ -487,16 +595,40 @@
                     filename = getattr(doc, 'metadata', {}).get('filename', 'Unknown')
                     unique_filenames.add(filename)
 
-                 st.markdown(f"**Found {len(sources)} document chunks from {len(unique_filenames)} unique documents (showing top 10):**")
+                 st.markdown(f"**Found {len(sources)} document chunks from {len(unique_filenames)} unique documents (showing top 20):**")
                 if len(unique_filenames) < len(sources):
                     st.info(f"πŸ’‘ **Note**: Each document is split into multiple chunks. You're seeing {len(sources)} chunks from {len(unique_filenames)} documents.")
 
-                 for i, doc in enumerate(sources[:10]):  # Show top 10
+                 # Extract and display statistics
+                 stats = extract_chunk_statistics(sources)
+ 
+                 # Show charts for 10+ results, tables for fewer
+                 if len(sources) >= 10:
+                     display_chunk_statistics_charts(stats, "Retrieval Statistics")
+                     # Also show tables below charts for detailed view
+                     st.markdown("---")
+                     display_chunk_statistics_table(stats, "Retrieval Distribution")
+                 else:
+                     display_chunk_statistics_table(stats, "Retrieval Distribution")
+ 
+                 st.markdown("---")
+                 st.markdown("### πŸ“„ Document Details")
+ 
+                 for i, doc in enumerate(sources):  # Show all documents
                     # Get relevance score and ID if available
                     metadata = getattr(doc, 'metadata', {})
-                     score = metadata.get('reranked_score', metadata.get('original_score', None))
-                     chunk_id = metadata.get('_id', 'Unknown')
-                     score_text = f" (Score: {score:.3f}, ID: {chunk_id[:8]}...)" if score is not None else f" (ID: {chunk_id[:8]}...)"
+                     # Handle both standard RAG scores and Gemini scores
+                     score = metadata.get('reranked_score') or metadata.get('original_score') or metadata.get('score')
+                     chunk_id = metadata.get('_id') or metadata.get('chunk_id', 'Unknown')
+                     if score is not None:
+                         try:
+                             score_text = f" (Score: {float(score):.3f})"
+                         except (ValueError, TypeError):
+                             score_text = ""
+                     else:
+                         score_text = ""
+                     if chunk_id and chunk_id != 'Unknown':
+                         score_text += f" (ID: {str(chunk_id)[:8]}...)" if score_text else f" (ID: {str(chunk_id)[:8]}...)"
 
                     with st.expander(f"πŸ“„ Document {i+1}: {getattr(doc, 'metadata', {}).get('filename', 'Unknown')[:50]}...{score_text}"):
                         # Display document metadata with emojis
@@ -543,200 +675,409 @@ def main():
543
  if 'feedback_submitted' not in st.session_state:
544
  st.session_state.feedback_submitted = False
545
 
546
- # Feedback form
547
- with st.form("feedback_form", clear_on_submit=False):
548
- col1, col2 = st.columns([1, 1])
549
-
550
- with col1:
551
- feedback_score = st.slider(
552
- "Rate this conversation (1-5)",
553
- min_value=1,
554
- max_value=5,
555
- help="How satisfied are you with the conversation?"
556
- )
557
-
558
- with col2:
559
- is_feedback_about_last_retrieval = st.checkbox(
560
- "Feedback about last retrieval only",
561
- value=True,
562
- help="If checked, feedback applies to the most recent document retrieval"
563
- )
564
-
565
- open_ended_feedback = st.text_area(
566
- "Your feedback (optional)",
567
- placeholder="Tell us what went well or what could be improved...",
568
- height=100
569
- )
570
-
571
- # Disable submit if no score selected
572
- submit_disabled = feedback_score is None
573
-
574
- submitted = st.form_submit_button(
575
- "πŸ“€ Submit Feedback",
576
- use_container_width=True,
577
- disabled=submit_disabled
578
- )
579
-
580
- if submitted and not st.session_state.feedback_submitted:
581
- # Log the feedback data being submitted
582
- print("=" * 80)
583
- print("πŸ”„ FEEDBACK SUBMISSION: Starting...")
584
- print("=" * 80)
585
- st.write("πŸ” **Debug: Feedback Data Being Submitted:**")
586
 
587
- # Create feedback data dictionary
588
- feedback_dict = {
589
- "open_ended_feedback": open_ended_feedback,
590
- "score": feedback_score,
591
- "is_feedback_about_last_retrieval": is_feedback_about_last_retrieval,
592
- "retrieved_data": st.session_state.rag_retrieval_history.copy() if st.session_state.rag_retrieval_history else [],
593
- "conversation_id": st.session_state.conversation_id,
594
- "timestamp": time.time(),
595
- "message_count": len(st.session_state.messages),
596
- "has_retrievals": has_retrievals,
597
- "retrieval_count": len(st.session_state.rag_retrieval_history)
598
- }
599
 
600
- print(f"πŸ“ FEEDBACK SUBMISSION: Score={feedback_score}, Retrievals={len(st.session_state.rag_retrieval_history) if st.session_state.rag_retrieval_history else 0}")
 
 
 
 
 
601
 
602
- # Create UserFeedback dataclass instance
603
- feedback_obj = None # Initialize outside try block
604
- try:
605
- feedback_obj = create_feedback_from_dict(feedback_dict)
606
- print(f"βœ… FEEDBACK SUBMISSION: Feedback object created - ID={feedback_obj.feedback_id}")
607
- st.write(f"βœ… **Feedback Object Created**")
608
- st.write(f"- Feedback ID: {feedback_obj.feedback_id}")
609
- st.write(f"- Score: {feedback_obj.score}/5")
610
- st.write(f"- Has Retrievals: {feedback_obj.has_retrievals}")
611
-
612
- # Convert back to dict for JSON serialization
613
- feedback_data = feedback_obj.to_dict()
614
- except Exception as e:
615
- print(f"❌ FEEDBACK SUBMISSION: Failed to create feedback object: {e}")
616
- st.error(f"Failed to create feedback object: {e}")
617
- feedback_data = feedback_dict
618
-
619
- # Display the data being submitted
620
- st.json(feedback_data)
621
 
622
- # Save feedback to file - use absolute path in /app to ensure writability
623
- feedback_dir = Path("/app/feedback")
624
- try:
625
- # Ensure directory exists with write permissions (777 for compatibility)
626
- feedback_dir.mkdir(parents=True, mode=0o777, exist_ok=True)
627
- except (PermissionError, OSError) as e:
628
- logger.warning(f"Could not create feedback directory at {feedback_dir}: {e}")
629
- # Fallback to relative path
630
- feedback_dir = Path("feedback")
631
- feedback_dir.mkdir(parents=True, mode=0o777, exist_ok=True)
632
 
633
- feedback_file = feedback_dir / f"feedback_{st.session_state.conversation_id}_{int(time.time())}.json"
 
 
 
 
634
 
635
- try:
636
- # Ensure parent directory exists before writing
637
- feedback_file.parent.mkdir(parents=True, mode=0o777, exist_ok=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
638
 
639
- # Save to local file
640
- print(f"πŸ’Ύ FEEDBACK SAVE: Saving to local file: {feedback_file}")
641
- with open(feedback_file, 'w') as f:
642
- json.dump(feedback_data, f, indent=2, default=str)
643
 
644
- print(f"βœ… FEEDBACK SAVE: Local file saved successfully")
645
- st.success("βœ… Thank you for your feedback! It has been saved locally.")
646
- st.balloons()
 
 
 
 
 
 
 
 
 
 
 
 
647
 
648
- # Save to Snowflake if enabled and credentials available
649
- logger.info("πŸ”„ FEEDBACK SAVE: Starting Snowflake save process...")
650
- logger.info(f"πŸ“Š FEEDBACK SAVE: feedback_obj={'exists' if feedback_obj else 'None'}")
651
 
 
 
652
  try:
653
- import os
654
- snowflake_enabled = os.getenv("SNOWFLAKE_ENABLED", "false").lower() == "true"
655
- logger.info(f"πŸ” SNOWFLAKE CHECK: enabled={snowflake_enabled}")
 
 
 
656
 
657
- if snowflake_enabled:
658
- if feedback_obj:
659
- try:
660
- from src.reporting.snowflake_connector import save_to_snowflake
661
- logger.info("πŸ“€ SNOWFLAKE UI: Attempting to save feedback to Snowflake...")
662
- print("πŸ“€ SNOWFLAKE UI: Attempting to save feedback to Snowflake...") # Also print to terminal
663
-
664
- if save_to_snowflake(feedback_obj):
665
- logger.info("βœ… SNOWFLAKE UI: Successfully saved to Snowflake")
666
- print("βœ… SNOWFLAKE UI: Successfully saved to Snowflake") # Also print to terminal
667
- st.success("βœ… Feedback also saved to Snowflake!")
668
- else:
669
- logger.warning("⚠️ SNOWFLAKE UI: Save failed")
670
- print("⚠️ SNOWFLAKE UI: Save failed") # Also print to terminal
671
- st.warning("⚠️ Snowflake save failed, but local save succeeded")
672
- except Exception as e:
673
- logger.error(f"❌ SNOWFLAKE UI ERROR: {e}")
674
- print(f"❌ SNOWFLAKE UI ERROR: {e}") # Also print to terminal
675
- import traceback
676
- traceback.print_exc() # Print full traceback to terminal
677
- st.warning(f"⚠️ Could not save to Snowflake: {e}")
678
- else:
679
- logger.warning("⚠️ SNOWFLAKE UI: Skipping (feedback object not created)")
680
- print("⚠️ SNOWFLAKE UI: Skipping (feedback object not created)") # Also print to terminal
681
- st.warning("⚠️ Skipping Snowflake save (feedback object not created)")
682
- else:
683
- logger.info("πŸ’‘ SNOWFLAKE UI: Integration disabled")
684
- print("πŸ’‘ SNOWFLAKE UI: Integration disabled") # Also print to terminal
685
- st.info("πŸ’‘ Snowflake integration disabled (set SNOWFLAKE_ENABLED=true to enable)")
686
- except NameError as e:
687
- import traceback
688
- traceback.print_exc()
689
- logger.error(f"❌ NameError in Snowflake save: {e}")
690
- print(f"❌ NameError in Snowflake save: {e}") # Also print to terminal
691
- st.warning(f"⚠️ Snowflake save error: {e}")
692
  except Exception as e:
693
- logger.error(f"❌ Exception in Snowflake save: {type(e).__name__}: {e}")
694
- print(f"❌ Exception in Snowflake save: {type(e).__name__}: {e}") # Also print to terminal
695
- st.warning(f"⚠️ Snowflake save error: {e}")
696
 
697
- # Mark feedback as submitted to prevent resubmission
698
- st.session_state.feedback_submitted = True
699
 
700
- print("=" * 80)
701
- print(f"βœ… FEEDBACK SUBMISSION: Completed successfully")
702
- print("=" * 80)
 
 
 
 
 
 
 
703
 
704
- # Log file location
705
- st.info(f"πŸ“ Feedback saved to: {feedback_file}")
706
 
707
- except Exception as e:
708
- print(f"❌ FEEDBACK SUBMISSION: Error saving feedback: {e}")
709
- print(f"❌ FEEDBACK SUBMISSION: Error type: {type(e).__name__}")
710
- import traceback
711
- traceback.print_exc()
712
- st.error(f"❌ Error saving feedback: {e}")
713
- st.write(f"Debug error: {str(e)}")
714
-
715
- elif st.session_state.feedback_submitted:
716
- st.success("βœ… Feedback already submitted for this conversation!")
717
- if st.button("πŸ”„ Submit New Feedback", key="new_feedback_button"):
718
- st.session_state.feedback_submitted = False
719
- st.rerun()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
720
 
721
  # Display retrieval history stats
722
  if st.session_state.rag_retrieval_history:
723
  st.markdown("---")
724
  st.markdown("#### πŸ“Š Retrieval History")
725
 
726
- with st.expander(f"View {len(st.session_state.rag_retrieval_history)} retrieval entries", expanded=False):
727
  for idx, entry in enumerate(st.session_state.rag_retrieval_history, 1):
728
- st.markdown(f"**Retrieval #{idx}**")
 
 
 
 
 
729
 
730
  # Display the actual RAG query
731
  rag_query_expansion = entry.get("rag_query_expansion", "No query available")
 
732
  st.code(rag_query_expansion, language="text")
733
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
734
  # Display summary stats
 
735
  st.json({
736
- "conversation_length": len(entry.get("conversation_up_to", [])),
737
- "documents_retrieved": len(entry.get("docs_retrieved", []))
738
  })
739
- st.markdown("---")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
740
 
741
  # Auto-scroll to bottom
742
  st.markdown("""
@@ -745,5 +1086,32 @@ def main():
745
  </script>
746
  """, unsafe_allow_html=True)
747
 
 
748
  if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
749
  main()
 
3
  """
4
 
5
  import os
6
+
7
+ import time
8
+ import json
9
+ import uuid
10
+ import logging
11
+ import traceback
12
+ from pathlib import Path
13
+
14
+ from collections import Counter
15
+ from typing import List, Dict, Any, Optional
16
+
17
+
18
+ import pandas as pd
19
+ import streamlit as st
20
+ import plotly.express as px
21
+ from langchain_core.messages import HumanMessage, AIMessage
22
+
23
+
24
+ from src.agents import get_multi_agent_chatbot, get_smart_chatbot, get_gemini_chatbot
25
+ from src.feedback import FeedbackManager
26
+ from src.ui_components import get_custom_css, display_chunk_statistics_charts, display_chunk_statistics_table, extract_chunk_statistics
27
+
28
+ from src.config.paths import (
29
+ IS_DEPLOYED,
30
+ PROJECT_DIR,
31
+ HF_CACHE_DIR,
32
+ FEEDBACK_DIR,
33
+ CONVERSATIONS_DIR,
34
+ )
35
+
36
 
37
  # ===== CRITICAL: Fix OMP_NUM_THREADS FIRST, before ANY other imports =====
38
  # Some libraries load at import time and will fail if OMP_NUM_THREADS is invalid
 
58
 
59
  # ===== Setup HuggingFace cache directories BEFORE any model imports =====
60
  # CRITICAL: Set these before any imports that might use HuggingFace (like sentence-transformers)
61
+ # Only override cache directories in deployed environment (local uses defaults)
62
+ if IS_DEPLOYED and HF_CACHE_DIR:
63
+ cache_dir = str(HF_CACHE_DIR)
64
+ os.environ["HF_HOME"] = cache_dir
65
+ os.environ["TRANSFORMERS_CACHE"] = cache_dir
66
+ os.environ["HF_DATASETS_CACHE"] = cache_dir
67
+ os.environ["HF_HUB_CACHE"] = cache_dir
68
+ os.environ["SENTENCE_TRANSFORMERS_HOME"] = cache_dir
69
+
70
+ # Ensure cache directory exists (created in Dockerfile, but ensure it's there)
71
+ try:
72
+ os.makedirs(cache_dir, mode=0o755, exist_ok=True)
73
+ except (PermissionError, OSError):
74
+ # If we can't create it, log but continue (might already exist from Dockerfile)
75
+ pass
76
+ else:
77
+ from dotenv import load_dotenv
78
+ load_dotenv()
 
 
 
 
 
 
 
 
 
 
 
79
 
80
  # Configure logging
81
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
82
  logger = logging.getLogger(__name__)
83
 
84
  # Log environment setup for debugging
85
+ logger.info(f"🌍 Environment: {'DEPLOYED' if IS_DEPLOYED else 'LOCAL'}")
86
+ logger.info(f"πŸ“ PROJECT_DIR: {PROJECT_DIR}")
87
+ logger.info(f"πŸ“ HuggingFace cache: {os.environ.get('HF_HOME', 'DEFAULT (not overridden)')}")
88
  logger.info(f"πŸ”§ OMP_NUM_THREADS: {os.environ.get('OMP_NUM_THREADS', 'NOT SET')}")
89
 
90
 
 
96
  page_title="Intelligent Audit Report Chatbot"
97
  )
98
 
99
+
100
+ st.markdown(get_custom_css(), unsafe_allow_html=True)
101
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
  def get_system_type():
104
  """Get the current system type"""
 
108
  else:
109
  return "Multi-Agent System"
110
 
111
+ def get_chatbot(version: str = "v1"):
112
+ """Initialize and return the chatbot based on version"""
113
+ if version == "beta":
114
+ return get_gemini_chatbot()
 
 
115
  else:
116
+ # Check environment variable for system type (v1)
117
+ system = os.environ.get('CHATBOT_SYSTEM', 'multi-agent')
118
+ if system == 'smart':
119
+ return get_smart_chatbot()
120
+ else:
121
+ return get_multi_agent_chatbot()
122
 
123
  def serialize_messages(messages):
124
  """Serialize LangChain messages to dictionaries"""
 
163
 
164
  return serialized
165
 
166
+
167
+ feedback_manager = FeedbackManager()
168
+
169
+
170
  @st.cache_data
171
  def load_filter_options():
172
  try:
173
+ filter_options_path = PROJECT_DIR / "src" / "config" / "filter_options.json"
174
+ with open(filter_options_path, "r") as f:
175
  return json.load(f)
176
  except FileNotFoundError:
177
+ st.info(f"Looking for filter_options.json in: {PROJECT_DIR / 'src' / 'config'}")
178
  st.error("filter_options.json not found. Please run the metadata analysis script.")
179
  return {"sources": [], "years": [], "districts": [], 'filenames': []}
180
 
 
191
  # Track RAG retrieval history for feedback
192
  if 'rag_retrieval_history' not in st.session_state:
193
  st.session_state.rag_retrieval_history = []
194
+ # Version selection (v1 or beta)
195
+ if 'chatbot_version' not in st.session_state:
196
+ st.session_state.chatbot_version = "v1"
197
+
198
+ # Initialize chatbot based on version (only if not already initialized for this version)
199
+ chatbot_version_key = f"chatbot_{st.session_state.chatbot_version}"
200
+
201
+ # Check if we need to initialize: chatbot doesn't exist OR version changed
202
+ needs_init = (
203
+ chatbot_version_key not in st.session_state or
204
+ st.session_state.get('_last_version') != st.session_state.chatbot_version
205
+ )
206
+
207
+ if needs_init:
208
+ try:
209
+ # Different spinner messages for different versions
210
+ if st.session_state.chatbot_version == "beta":
211
+ spinner_msg = "πŸ”„ Initializing Gemini FSA"
212
+ else:
213
+ spinner_msg = "πŸ”„ Loading AI models and connecting to database..."
214
+
215
+ with st.spinner(spinner_msg):
216
+ st.session_state[chatbot_version_key] = get_chatbot(st.session_state.chatbot_version)
217
+ st.session_state['_last_version'] = st.session_state.chatbot_version
218
+ st.session_state.chatbot = st.session_state[chatbot_version_key]
219
+ print("βœ… AI system ready!")
220
+ except Exception as e:
221
+ st.error(f"❌ Failed to initialize chatbot: {str(e)}")
222
+ # Only show Gemini-specific error message for beta version
223
+ if st.session_state.chatbot_version == "beta":
224
+ st.error("Please check your environment variables (GEMINI_API_KEY, GEMINI_FILESTORE_NAME for beta)")
225
+ else:
226
+ st.error("Please check your configuration and ensure all required models and databases are accessible.")
227
+ # Reset to v1 to prevent infinite loop
228
+ st.session_state.chatbot_version = "v1"
229
+ st.session_state['_last_version'] = "v1"
230
+ if 'chatbot' in st.session_state:
231
+ del st.session_state['chatbot']
232
+ st.stop() # Stop execution to prevent infinite loop
233
+ else:
234
+ # Chatbot already initialized for this version, just use it
235
+ st.session_state.chatbot = st.session_state[chatbot_version_key]
236
 
237
  # Reset conversation history if needed (but keep chatbot cached)
238
  if 'reset_conversation' in st.session_state and st.session_state.reset_conversation:
 
244
  st.session_state.reset_conversation = False
245
  st.rerun()
246
 
247
+
248
+ # Version selection radio button (top right)
249
  col1, col2 = st.columns([3, 1])
250
  with col1:
251
+ st.markdown('<p class="subtitle">Ask questions about audit reports. Use the sidebar filters to narrow down your search!</p>', unsafe_allow_html=True)
252
  with col2:
253
+ st.markdown("<br>", unsafe_allow_html=True) # Add some spacing
254
+ selected_version = st.radio(
255
+ "**Version:**",
256
+ options=["v1", "beta"],
257
+ index=0 if st.session_state.chatbot_version == "v1" else 1,
258
+ horizontal=True,
259
+ key="version_selector",
260
+ help="Select v1 (default RAG system) or beta (Gemini FSA)"
261
+ )
262
+
263
+ # Update version if changed
264
+ if selected_version != st.session_state.chatbot_version:
265
+ # Store the old version to check if we need to switch
266
+ old_version = st.session_state.chatbot_version
267
+ st.session_state.chatbot_version = selected_version
268
+
269
+ # If chatbot for new version already exists, just switch to it
270
+ new_chatbot_key = f"chatbot_{selected_version}"
271
+ if new_chatbot_key in st.session_state:
272
+ # Chatbot already exists, just switch
273
+ st.session_state.chatbot = st.session_state[new_chatbot_key]
274
+ st.session_state['_last_version'] = selected_version
275
+ else:
276
+ # Need to initialize new version - will be handled by initialization logic above
277
+ st.session_state['_last_version'] = old_version # Set to old to trigger init check
278
+
279
+ st.rerun()
280
+
281
+ # Show version info
282
+ if st.session_state.chatbot_version == "beta":
283
+ st.info("πŸ”¬ **Beta Mode**: Using Google Gemini FSA")
284
 
285
  # Session info
286
  duration = int(time.time() - st.session_state.session_start_time)
 
296
 
297
  # Sidebar for filters
298
  with st.sidebar:
299
+ # Instructions section (collapsible)
300
+ with st.expander("πŸ“– How to Use", expanded=False):
301
+ st.markdown("""
302
+ #### 🎯 Using Filters
303
+
304
+ 1. **Select filters** from the sidebar to narrow your search:
305
+
306
+ 2. **Leave filters empty** to search across all data
307
+
308
+ 3. **Type your question** in the chat input at the bottom
309
+
310
+ 4. **Click "Send"** to submit your question
311
+
312
+ #### πŸ’‘ Tips
313
+
314
+ - Use specific questions for better results
315
+ - Combine multiple filters for precise searches
316
+ - Check the "Retrieved Documents" tab to see source material
317
+
318
+ #### ⚠️ Important
319
+
320
+ **When finished, please close the browser window** to free up computational resources.
321
+
322
+ ---
323
+
324
+ For more detailed help, see the example questions at the bottom of the page.
325
+ """)
326
+
327
  st.markdown("### πŸ” Search Filters")
328
  st.markdown("Select filters to narrow down your search. Leave empty to search all data.")
329
 
 
342
  # Determine if filename filter is active
343
  filename_mode = len(selected_filenames) > 0
344
  # Sources filter
345
+ # st.markdown('<div class="filter-section">', unsafe_allow_html=True)
346
  st.markdown('<div class="filter-title">πŸ“Š Sources</div>', unsafe_allow_html=True)
347
  selected_sources = st.multiselect(
348
  "Select sources:",
 
355
  st.markdown('</div>', unsafe_allow_html=True)
356
 
357
  # Years filter
358
+ # st.markdown('<div class="filter-section">', unsafe_allow_html=True)
359
  st.markdown('<div class="filter-title">πŸ“… Years</div>', unsafe_allow_html=True)
360
  selected_years = st.multiselect(
361
  "Select years:",
 
368
  st.markdown('</div>', unsafe_allow_html=True)
369
 
370
  # Districts filter
371
+ # st.markdown('<div class="filter-section">', unsafe_allow_html=True)
372
  st.markdown('<div class="filter-title">🏘️ Districts</div>', unsafe_allow_html=True)
373
  selected_districts = st.multiselect(
374
  "Select districts:",
 
419
  if 'input_counter' not in st.session_state:
420
  st.session_state.input_counter = 0
421
 
422
+ # Handle pending question from example questions section
423
+ if 'pending_question' in st.session_state and st.session_state.pending_question:
424
+ default_value = st.session_state.pending_question
425
+ # Increment counter to force new input widget
426
+ st.session_state.input_counter = (st.session_state.get('input_counter', 0) + 1) % 1000
427
+ del st.session_state.pending_question
428
+ key_suffix = st.session_state.input_counter
429
+ else:
430
+ default_value = ""
431
+ key_suffix = st.session_state.input_counter
432
+
433
  user_input = st.text_input(
434
  "Type your message here...",
435
  placeholder="Ask about budget allocations, expenditures, or audit findings...",
436
+ key=f"user_input_{key_suffix}",
437
+ label_visibility="collapsed",
438
+ value=default_value if default_value else None
439
  )
440
 
441
  with col2:
442
+ send_button = st.button("Send", key="send_button", width='stretch')
443
 
444
  # Clear chat button
445
  if st.button("πŸ—‘οΈ Clear Chat", key="clear_chat_button"):
446
  st.session_state.reset_conversation = True
447
  # Clear all conversation files
448
+ conversations_path = CONVERSATIONS_DIR
449
+ if conversations_path.exists():
450
+ for file in conversations_path.iterdir():
451
+ if file.suffix == '.json':
452
+ file.unlink()
 
453
  st.rerun()
454
 
455
  # Handle user input
 
491
  if rag_result:
492
  sources = rag_result.get('sources', []) if isinstance(rag_result, dict) else (rag_result.sources if hasattr(rag_result, 'sources') else [])
493
 
494
+ # For Gemini, also check gemini_result for sources
495
+ if not sources or len(sources) == 0:
496
+ gemini_result = chat_result.get('gemini_result')
497
+ print(f"πŸ” DEBUG: Checking gemini_result for sources...")
498
+ print(f" gemini_result exists: {gemini_result is not None}")
499
+ if gemini_result:
500
+ print(f" gemini_result type: {type(gemini_result)}")
501
+ print(f" has sources attr: {hasattr(gemini_result, 'sources')}")
502
+ if hasattr(gemini_result, 'sources'):
503
+ print(f" sources length: {len(gemini_result.sources) if gemini_result.sources else 0}")
504
+
505
+ if gemini_result and hasattr(gemini_result, 'sources'):
506
+ # Format Gemini sources for display
507
+ if hasattr(st.session_state.chatbot, 'gemini_client'):
508
+ sources = st.session_state.chatbot.gemini_client.format_sources_for_display(gemini_result)
509
+ print(f"βœ… Formatted {len(sources)} sources from gemini_client")
510
+ elif hasattr(st.session_state.chatbot, '_format_gemini_sources'):
511
+ sources = st.session_state.chatbot._format_gemini_sources(gemini_result)
512
+ print(f"βœ… Formatted {len(sources)} sources from _format_gemini_sources")
513
+
514
+ # Update rag_result with sources if we found them
515
+ if sources and len(sources) > 0:
516
+ if isinstance(rag_result, dict):
517
+ rag_result['sources'] = sources
518
+ elif hasattr(rag_result, 'sources'):
519
+ rag_result.sources = sources
520
+ # Update last_rag_result with sources
521
+ st.session_state.last_rag_result = rag_result
522
+ print(f"βœ… Updated rag_result with {len(sources)} sources")
523
+
524
  # Get the actual RAG query
525
  actual_rag_query = chat_result.get('actual_rag_query', '')
526
  if actual_rag_query:
 
530
  else:
531
  formatted_query = "No RAG query available"
532
 
533
+ # Extract filters from active filters
534
+ filters_used = {
535
+ "sources": st.session_state.active_filters.get('sources', []),
536
+ "years": st.session_state.active_filters.get('years', []),
537
+ "districts": st.session_state.active_filters.get('districts', []),
538
+ "filenames": st.session_state.active_filters.get('filenames', [])
539
+ }
540
+
541
  retrieval_entry = {
542
  "conversation_up_to": serialize_messages(st.session_state.messages),
543
  "rag_query_expansion": formatted_query,
544
+ "docs_retrieved": serialize_documents(sources),
545
+ "filters_applied": filters_used,
546
+ "timestamp": time.time()
547
  }
548
  st.session_state.rag_retrieval_history.append(retrieval_entry)
549
+
550
+ # Debug logging
551
+ print(f"πŸ“Š RETRIEVAL TRACKING: {len(sources)} sources stored in retrieval history")
552
  else:
553
  response = chat_result
554
  st.session_state.last_rag_result = None
 
578
  # Dictionary format from multi-agent system
579
  sources = rag_result['sources']
580
 
581
+ # For Gemini, also check if we need to format sources from gemini_result
582
+ if (not sources or len(sources) == 0) and isinstance(rag_result, dict):
583
+ gemini_result = rag_result.get('gemini_result')
584
+ if gemini_result and hasattr(gemini_result, 'sources'):
585
+ # Format Gemini sources for display
586
+ if hasattr(st.session_state.chatbot, 'gemini_client'):
587
+ sources = st.session_state.chatbot.gemini_client.format_sources_for_display(gemini_result)
588
+ elif hasattr(st.session_state.chatbot, '_format_gemini_sources'):
589
+ sources = st.session_state.chatbot._format_gemini_sources(gemini_result)
590
+
591
  if sources and len(sources) > 0:
592
  # Count unique filenames
593
  unique_filenames = set()
 
595
  filename = getattr(doc, 'metadata', {}).get('filename', 'Unknown')
596
  unique_filenames.add(filename)
597
 
598
+ st.markdown(f"**Found {len(sources)} document chunks from {len(unique_filenames)} unique documents (showing top 20):**")
599
  if len(unique_filenames) < len(sources):
600
  st.info(f"πŸ’‘ **Note**: Each document is split into multiple chunks. You're seeing {len(sources)} chunks from {len(unique_filenames)} documents.")
601
 
602
+ # Extract and display statistics
603
+ stats = extract_chunk_statistics(sources)
604
+
605
+ # Show charts for 10+ results, tables for fewer
606
+ if len(sources) >= 10:
607
+ display_chunk_statistics_charts(stats, "Retrieval Statistics")
608
+ # Also show tables below charts for detailed view
609
+ st.markdown("---")
610
+ display_chunk_statistics_table(stats, "Retrieval Distribution")
611
+ else:
612
+ display_chunk_statistics_table(stats, "Retrieval Distribution")
613
+
614
+ st.markdown("---")
615
+ st.markdown("### πŸ“„ Document Details")
616
+
617
+ for i, doc in enumerate(sources): # Show all documents
618
  # Get relevance score and ID if available
619
  metadata = getattr(doc, 'metadata', {})
620
+ # Handle both standard RAG scores and Gemini scores
621
+ score = metadata.get('reranked_score') or metadata.get('original_score') or metadata.get('score')
622
+ chunk_id = metadata.get('_id') or metadata.get('chunk_id', 'Unknown')
623
+ if score is not None:
624
+ try:
625
+ score_text = f" (Score: {float(score):.3f})"
626
+ except (ValueError, TypeError):
627
+ score_text = ""
628
+ else:
629
+ score_text = ""
630
+ if chunk_id and chunk_id != 'Unknown':
631
+ score_text += f" (ID: {str(chunk_id)[:8]}...)" if score_text else f" (ID: {str(chunk_id)[:8]}...)"
632
 
633
  with st.expander(f"πŸ“„ Document {i+1}: {getattr(doc, 'metadata', {}).get('filename', 'Unknown')[:50]}...{score_text}"):
634
  # Display document metadata with emojis
 
675
  if 'feedback_submitted' not in st.session_state:
676
  st.session_state.feedback_submitted = False
677
 
678
+ # Feedback form - only show if feedback not already submitted
679
+ if not st.session_state.feedback_submitted:
680
+ with st.form("feedback_form", clear_on_submit=False):
681
+ col1, col2 = st.columns([1, 1])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
682
 
683
+ with col1:
684
+ feedback_score = st.slider(
685
+ "Rate this conversation (1-5)",
686
+ min_value=1,
687
+ max_value=5,
688
+ help="How satisfied are you with the conversation?"
689
+ )
 
 
 
 
 
690
 
691
+ with col2:
692
+ is_feedback_about_last_retrieval = st.checkbox(
693
+ "Feedback about last retrieval only",
694
+ value=True,
695
+ help="If checked, feedback applies to the most recent document retrieval"
696
+ )
697
 
698
+ open_ended_feedback = st.text_area(
699
+ "Your feedback (optional)",
700
+ placeholder="Tell us what went well or what could be improved...",
701
+ height=100
702
+ )
703
 
704
+ # st.slider always returns a value, so this never actually disables the button; kept as a guard in case the score input becomes optional
705
+ submit_disabled = feedback_score is None
706
 
707
+ submitted = st.form_submit_button(
708
+ "πŸ“€ Submit Feedback",
709
+ width='stretch',
710
+ disabled=submit_disabled
711
+ )
712
 
713
+ if submitted:
714
+ # Log the feedback data being submitted
715
+ print("=" * 80)
716
+ print("πŸ”„ FEEDBACK SUBMISSION: Starting...")
717
+ print("=" * 80)
718
+ st.write("πŸ” **Debug: Feedback Data Being Submitted:**")
719
+
720
+ # Extract transcript from messages
721
+ transcript = feedback_manager.extract_transcript(st.session_state.messages)
722
+
723
+ # Build retrievals structure
724
+ retrievals = feedback_manager.build_retrievals_structure(
725
+
726
+ st.session_state.rag_retrieval_history.copy() if st.session_state.rag_retrieval_history else [],
727
+ st.session_state.messages
728
+ )
729
+
730
+ # Build feedback_score_related_retrieval_docs
731
+
732
+ feedback_score_related_retrieval_docs = feedback_manager.build_feedback_score_related_retrieval_docs(
733
+ is_feedback_about_last_retrieval,
734
+ st.session_state.messages,
735
+ st.session_state.rag_retrieval_history.copy() if st.session_state.rag_retrieval_history else []
736
+ )
737
 
738
+ # Preserve old retrieved_data format for backward compatibility
739
+ retrieved_data_old_format = st.session_state.rag_retrieval_history.copy() if st.session_state.rag_retrieval_history else []
 
 
740
 
741
+ # Create feedback data dictionary
742
+ feedback_dict = {
743
+ "open_ended_feedback": open_ended_feedback,
744
+ "score": feedback_score,
745
+ "is_feedback_about_last_retrieval": is_feedback_about_last_retrieval,
746
+ "conversation_id": st.session_state.conversation_id,
747
+ "timestamp": time.time(),
748
+ "message_count": len(st.session_state.messages),
749
+ "has_retrievals": has_retrievals,
750
+ "retrieval_count": len(st.session_state.rag_retrieval_history) if st.session_state.rag_retrieval_history else 0,
751
+ "transcript": transcript,
752
+ "retrievals": retrievals,
753
+ "feedback_score_related_retrieval_docs": feedback_score_related_retrieval_docs,
754
+ "retrieved_data": retrieved_data_old_format # Preserved old column
755
+ }
756
 
757
+ print(f"πŸ“ FEEDBACK SUBMISSION: Score={feedback_score}, Retrievals={len(st.session_state.rag_retrieval_history) if st.session_state.rag_retrieval_history else 0}")
 
 
758
 
759
+ # Create UserFeedback dataclass instance
760
+ feedback_obj = None # Initialize outside try block
761
  try:
762
+ feedback_obj = feedback_manager.create_feedback_from_dict(feedback_dict)
763
+ print(f"βœ… FEEDBACK SUBMISSION: Feedback object created - ID={feedback_obj.feedback_id}")
764
+ st.write(f"βœ… **Feedback Object Created**")
765
+ st.write(f"- Feedback ID: {feedback_obj.feedback_id}")
766
+ st.write(f"- Score: {feedback_obj.score}/5")
767
+ st.write(f"- Has Retrievals: {feedback_obj.has_retrievals}")
768
 
769
+ # Convert back to dict for JSON serialization
770
+ feedback_data = feedback_obj.to_dict()
771
  except Exception as e:
772
+ print(f"❌ FEEDBACK SUBMISSION: Failed to create feedback object: {e}")
773
+ st.error(f"Failed to create feedback object: {e}")
774
+ feedback_data = feedback_dict
775
 
776
+ # Display the data being submitted
777
+ st.json(feedback_data)
778
 
779
+ # Save feedback to file - use FEEDBACK_DIR to ensure writability
780
+ feedback_dir = FEEDBACK_DIR
781
+ try:
782
+ # Ensure directory exists with write permissions (777 for compatibility)
783
+ feedback_dir.mkdir(parents=True, mode=0o777, exist_ok=True)
784
+ except (PermissionError, OSError) as e:
785
+ logger.warning(f"Could not create feedback directory at {feedback_dir}: {e}")
786
+ # Fallback to relative path
787
+ feedback_dir = Path("feedback")
788
+ feedback_dir.mkdir(parents=True, mode=0o777, exist_ok=True)
789
 
790
+ feedback_file = feedback_dir / f"feedback_{st.session_state.conversation_id}_{int(time.time())}.json"
 
791
 
792
+ try:
793
+ # Ensure parent directory exists before writing
794
+ feedback_file.parent.mkdir(parents=True, mode=0o777, exist_ok=True)
795
+
796
+ # Save to local file first
797
+ print(f"πŸ’Ύ FEEDBACK SAVE: Saving to local file: {feedback_file}")
798
+ with open(feedback_file, 'w') as f:
799
+ json.dump(feedback_data, f, indent=2, default=str)
800
+
801
+ print(f"βœ… FEEDBACK SAVE: Local file saved successfully")
802
+
803
+ # Save to Snowflake if enabled and credentials available
804
+ logger.info("πŸ”„ FEEDBACK SAVE: Starting Snowflake save process...")
805
+ logger.info(f"πŸ“Š FEEDBACK SAVE: feedback_obj={'exists' if feedback_obj else 'None'}")
806
+
807
+ snowflake_success = False
808
+ try:
809
+ snowflake_enabled = os.getenv("SNOWFLAKE_ENABLED", "false").lower() == "true"
810
+ logger.info(f"πŸ” SNOWFLAKE CHECK: enabled={snowflake_enabled}")
811
+
812
+ if snowflake_enabled:
813
+ if feedback_obj:
814
+ try:
815
+ logger.info("πŸ“€ SNOWFLAKE UI: Attempting to save feedback to Snowflake...")
816
+ print("πŸ“€ SNOWFLAKE UI: Attempting to save feedback to Snowflake...")
817
+
818
+ # Show spinner while saving to Snowflake (can take 10-15 seconds)
819
+ # This includes: connection establishment (~5s), data preparation, and SQL execution (~5s)
820
+ with st.spinner("πŸ’Ύ Saving feedback to Snowflake... This may take 10-15 seconds (connecting to database, preparing data, and executing query)"):
821
+ snowflake_success = feedback_manager.save_to_snowflake(feedback_obj)
822
+
823
+ if snowflake_success:
824
+ logger.info("βœ… SNOWFLAKE UI: Successfully saved to Snowflake")
825
+ print("βœ… SNOWFLAKE UI: Successfully saved to Snowflake")
826
+ else:
827
+ logger.warning("⚠️ SNOWFLAKE UI: Save failed")
828
+ print("⚠️ SNOWFLAKE UI: Save failed")
829
+ except Exception as e:
830
+ logger.error(f"❌ SNOWFLAKE UI ERROR: {e}")
831
+ print(f"❌ SNOWFLAKE UI ERROR: {e}")
832
+ traceback.print_exc()
833
+ snowflake_success = False
834
+ else:
835
+ logger.warning("⚠️ SNOWFLAKE UI: Skipping (feedback object not created)")
836
+ print("⚠️ SNOWFLAKE UI: Skipping (feedback object not created)")
837
+ snowflake_success = False
838
+ else:
839
+ logger.info("πŸ’‘ SNOWFLAKE UI: Integration disabled")
840
+ print("πŸ’‘ SNOWFLAKE UI: Integration disabled")
841
+ # If Snowflake is disabled, consider it successful (local save only)
842
+ snowflake_success = True
843
+
844
+ except Exception as e:
845
+ logger.error(f"❌ Exception in Snowflake save: {type(e).__name__}: {e}")
846
+ print(f"❌ Exception in Snowflake save: {type(e).__name__}: {e}")
847
+ snowflake_success = False
848
+
849
+ # Only show success if Snowflake save succeeded (or if Snowflake is disabled)
850
+ if snowflake_success:
851
+ st.success("βœ… Thank you for your feedback! It has been saved successfully.")
852
+ st.balloons()
853
+ else:
854
+ st.warning("⚠️ Feedback saved locally, but Snowflake save failed. Please check logs.")
855
+
856
+ # Mark feedback as submitted to prevent resubmission
857
+ st.session_state.feedback_submitted = True
858
+
859
+ print("=" * 80)
860
+ print(f"βœ… FEEDBACK SUBMISSION: Completed successfully")
861
+ print("=" * 80)
862
+
863
+ # Log file location
864
+ st.info(f"πŸ“ Feedback saved to: {feedback_file}")
865
+
866
+ except Exception as e:
867
+ print(f"❌ FEEDBACK SUBMISSION: Error saving feedback: {e}")
868
+ print(f"❌ FEEDBACK SUBMISSION: Error type: {type(e).__name__}")
869
+ traceback.print_exc()
870
+ st.error(f"❌ Error saving feedback: {e}")
871
+ st.write(f"Debug error: {str(e)}")
872
+ else:
873
+ # Feedback already submitted - show success message and reset option
874
+ st.success("βœ… Feedback already submitted for this conversation!")
875
+ col1, col2 = st.columns([1, 1])
876
+ with col1:
877
+ if st.button("πŸ”„ Submit New Feedback", key="new_feedback_button", width='stretch'):
878
+ try:
879
+ st.session_state.feedback_submitted = False
880
+ st.rerun()
881
+ except Exception as e:
882
+ # Handle any Streamlit API exceptions gracefully
883
+ logger.error(f"Error resetting feedback state: {e}")
884
+ st.error(f"Error resetting feedback. Please refresh the page.")
885
+ with col2:
886
+ if st.button("πŸ“‹ View Conversation", key="view_conversation_button", width='stretch'):
887
+ # Scroll to conversation - this is handled by the auto-scroll at bottom
888
+ pass
889
 
890
  # Display retrieval history stats
891
  if st.session_state.rag_retrieval_history:
892
  st.markdown("---")
893
  st.markdown("#### πŸ“Š Retrieval History")
894
 
895
+ with st.expander(f"View {len(st.session_state.rag_retrieval_history)} retrieval entries", expanded=True):
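+ # Each history entry is assumed to carry (keys inferred from the reads below):
+ # {"timestamp": float, "rag_query_expansion": str, "filters_applied": dict,
+ #  "conversation_up_to": [...], "docs_retrieved": [...]}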
896
  for idx, entry in enumerate(st.session_state.rag_retrieval_history, 1):
897
+ st.markdown(f"### **Retrieval #{idx}**")
898
+
899
+ # Display timestamp if available
900
+ if entry.get("timestamp"):
901
+ timestamp_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(entry["timestamp"]))
902
+ st.caption(f"πŸ• {timestamp_str}")
903
 
904
  # Display the actual RAG query
905
  rag_query_expansion = entry.get("rag_query_expansion", "No query available")
906
+ st.markdown("**πŸ” RAG Query:**")
907
  st.code(rag_query_expansion, language="text")
908
 
909
+ # Display filters used
910
+ filters_applied = entry.get("filters_applied", {})
911
+ if filters_applied and any(filters_applied.values()):
912
+ st.markdown("**🎯 Filters Applied:**")
913
+ filter_display = {}
914
+ if filters_applied.get("sources"):
915
+ filter_display["Sources"] = filters_applied["sources"]
916
+ if filters_applied.get("years"):
917
+ filter_display["Years"] = filters_applied["years"]
918
+ if filters_applied.get("districts"):
919
+ filter_display["Districts"] = filters_applied["districts"]
920
+ if filters_applied.get("filenames"):
921
+ filter_display["Filenames"] = filters_applied["filenames"]
922
+
923
+ if filter_display:
924
+ st.json(filter_display)
925
+ else:
926
+ st.info("No filters applied")
927
+ else:
928
+ st.info("No filters applied")
929
+
930
+ # Display conversation history up to retrieval point
931
+ conversation_up_to = entry.get("conversation_up_to", [])
932
+ if conversation_up_to:
933
+ st.markdown("**πŸ’¬ Conversation History (up to retrieval point):**")
934
+ with st.expander(f"View {len(conversation_up_to)} messages", expanded=False):
935
+ for msg_idx, msg in enumerate(conversation_up_to, 1):
936
+ role = msg.get("type", "unknown")
937
+ content = msg.get("content", "")
938
+
939
+ if role == "HumanMessage" or role == "human":
940
+ st.markdown(f"**πŸ‘€ User {msg_idx}:** {content[:200]}{'...' if len(content) > 200 else ''}")
941
+ elif role == "AIMessage" or role == "ai":
942
+ st.markdown(f"**πŸ€– Assistant {msg_idx}:** {content[:200]}{'...' if len(content) > 200 else ''}")
943
+ else:
944
+ st.info("No conversation history available")
945
+
946
+ # Display documents retrieved
947
+ docs_retrieved = entry.get("docs_retrieved", [])
948
+ if docs_retrieved:
949
+ st.markdown(f"**πŸ“„ Documents Retrieved ({len(docs_retrieved)}):**")
950
+ with st.expander(f"View {len(docs_retrieved)} documents", expanded=False):
951
+ for doc_idx, doc in enumerate(docs_retrieved, 1):
952
+ st.markdown(f"**Document {doc_idx}:**")
953
+
954
+ # Display metadata
955
+ metadata = doc.get("metadata", {})
956
+ if metadata:
957
+ col1, col2, col3 = st.columns(3)
958
+ with col1:
959
+ st.write(f"πŸ“„ **File:** {metadata.get('filename', 'Unknown')}")
960
+ with col2:
961
+ st.write(f"πŸ›οΈ **Source:** {metadata.get('source', 'Unknown')}")
962
+ with col3:
963
+ st.write(f"πŸ“… **Year:** {metadata.get('year', 'Unknown')}")
964
+
965
+ # Additional metadata
966
+ if metadata.get('district'):
967
+ st.write(f"πŸ“ **District:** {metadata.get('district')}")
968
+ if metadata.get('page'):
969
+ st.write(f"πŸ“– **Page:** {metadata.get('page')}")
970
+ if metadata.get('score') is not None:
971
+ st.write(f"⭐ **Score:** {metadata.get('score'):.3f}" if isinstance(metadata.get('score'), (int, float)) else f"⭐ **Score:** {metadata.get('score')}")
972
+
973
+ # Display content preview (first 200 chars)
974
+ content = doc.get("content", doc.get("page_content", ""))
975
+ if content:
976
+ st.markdown("**Content Preview:**")
977
+ st.text_area(
978
+ "Content Preview",
979
+ value=content[:200] + ("..." if len(content) > 200 else ""),
980
+ height=100,
981
+ disabled=True,
982
+ label_visibility="collapsed",
983
+ key=f"retrieval_{idx}_doc_{doc_idx}_preview"
984
+ )
985
+
986
+ if doc_idx < len(docs_retrieved):
987
+ st.markdown("---")
988
+ else:
989
+ st.info("No documents retrieved")
990
+
991
  # Display summary stats
992
+ st.markdown("**πŸ“Š Summary:**")
993
  st.json({
994
+ "conversation_length": len(conversation_up_to),
995
+ "documents_retrieved": len(docs_retrieved)
996
  })
997
+
998
+ if idx < len(st.session_state.rag_retrieval_history):
999
+ st.markdown("---")
1000
+
1001
+ # Example Questions Section
1002
+ st.markdown("---")
1003
+ st.markdown("### πŸ’‘ Example Questions")
1004
+ st.markdown("Click on any question below to use it, or modify the editable examples:")
1005
+
1006
+ # Initialize example question state
1007
+ if 'custom_question_1' not in st.session_state:
1008
+ st.session_state.custom_question_1 = "How were administrative costs managed in the PDM implementation, and what issues arose with budget execution regarding staff salaries?"
1009
+ if 'custom_question_2' not in st.session_state:
1010
+ st.session_state.custom_question_2 = "What did the National Coordinator say about the release of funds for PDM administrative costs in the letter dated 29th September 2022 and how did the funding received affect the activities of the PDCs and PDM SACCOs in the FY 2022/23?"
1011
+
1012
+ # Question 1: Filename insights (fixed, clickable)
1013
+ st.markdown("#### πŸ“„ Question 1: List insights from a specific file")
1014
+ col1, col2 = st.columns([3, 1])
1015
+ with col1:
1016
+ example_q1 = "List a couple of insights from the selected file."
1017
+ st.markdown(f"**Example:** `{example_q1}`")
1018
+ st.info("πŸ’‘ **Filter to apply:** Select a Filename from the sidebar panel before asking this question.")
1019
+ with col2:
1020
+ if st.button("πŸ“‹ Use This Question", key="use_example_1", width='stretch'):
1021
+ st.session_state.pending_question = example_q1
1022
+ st.session_state.input_counter = (st.session_state.get('input_counter', 0) + 1) % 1000
1023
+ st.rerun()
1024
+
1025
+ st.markdown("---")
1026
+
1027
+ # Questions 2 & 3: Editable examples
1028
+ st.markdown("#### ✏️ Customizable Questions (Edit and use)")
1029
+
1030
+ # Question 2
1031
+ # st.markdown("**Question 2:**")
1032
+ custom_q1 = st.text_area(
1033
+ "Edit question 2:",
1034
+ value=st.session_state.custom_question_1,
1035
+ height=80,
1036
+ key="edit_question_2",
1037
+ help="Modify this question to fit your needs, then click 'Use This Question'"
1038
+ )
1039
+ col1, col2 = st.columns([1, 4])
1040
+ with col1:
1041
+ if st.button("πŸ“‹ Use Question 2", key="use_custom_1", width='stretch'):
1042
+ if custom_q1.strip():
1043
+ st.session_state.pending_question = custom_q1.strip()
1044
+ st.session_state.custom_question_1 = custom_q1.strip()
1045
+ st.session_state.input_counter = (st.session_state.get('input_counter', 0) + 1) % 1000
1046
+ st.rerun()
1047
+ else:
1048
+ st.warning("Please enter a question first!")
1049
+ with col2:
1050
+ st.caption("πŸ’‘ Tip: Add specific details like dates, names, or amounts to get more precise answers")
1051
+
1052
+ st.info("πŸ’‘ **Filter to apply:** Select District(s) and Year(s) sidebar panel before asking this question.")
1053
+
1054
+ st.markdown("---")
1055
+
1056
+ # Question 3
1057
+ # st.markdown("**Question 3:**")
1058
+ custom_q2 = st.text_area(
1059
+ "Edit question 3:",
1060
+ value=st.session_state.custom_question_2,
1061
+ height=80,
1062
+ key="edit_question_3",
1063
+ help="Modify this question to fit your needs, then click 'Use This Question'"
1064
+ )
1065
+ col1, col2 = st.columns([1, 4])
1066
+ with col1:
1067
+ if st.button("πŸ“‹ Use Question 3", key="use_custom_2", width='stretch'):
1068
+ if custom_q2.strip():
1069
+ st.session_state.pending_question = custom_q2.strip()
1070
+ st.session_state.custom_question_2 = custom_q2.strip()
1071
+ st.session_state.input_counter = (st.session_state.get('input_counter', 0) + 1) % 1000
1072
+ st.rerun()
1073
+ else:
1074
+ st.warning("Please enter a question first!")
1075
+ with col2:
1076
+ st.caption("πŸ’‘ Tip: Use specific terms from the documents (e.g., 'PDM', 'SACCOs', 'FY 2022/23')")
1077
+
1078
+
1079
+ # Store selected question for next render (handled in input section above)
1080
+ # This ensures the question populates the input field correctly
1081
 
1082
  # Auto-scroll to bottom
1083
  st.markdown("""
 
1086
  </script>
1087
  """, unsafe_allow_html=True)
1088
 
1089
+
1090
  if __name__ == "__main__":
1091
+ # Check if running in Streamlit context
1092
+ try:
1093
+ from streamlit.runtime.scriptrunner import get_script_run_ctx
1094
+ if get_script_run_ctx() is None:
1095
+ # Not in Streamlit runtime - show helpful message
1096
+ print("=" * 80)
1097
+ print("⚠️ WARNING: This is a Streamlit app!")
1098
+ print("=" * 80)
1099
+ print("\nPlease run this app using:")
1100
+ print(" streamlit run app.py")
1101
+ print("\nNot: python app.py")
1102
+ print("\nThe app will not function correctly when run with 'python app.py'")
1103
+ print("=" * 80)
1104
+ import sys
1105
+ sys.exit(1)
1106
+ except ImportError:
1107
+ # Streamlit not installed or not in Streamlit context
1108
+ print("=" * 80)
1109
+ print("⚠️ WARNING: This is a Streamlit app!")
1110
+ print("=" * 80)
1111
+ print("\nPlease run this app using:")
1112
+ print(" streamlit run app.py")
1113
+ print("\nNot: python app.py")
1114
+ print("=" * 80)
1115
+ import sys
1116
+ sys.exit(1)
1117
  main()
src/agents/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ """
2
+ Agent modules for chatbot implementations
3
+ """
4
+
5
+ from .smart_chatbot import get_chatbot as get_smart_chatbot
6
+ from .multi_agent_chatbot import get_multi_agent_chatbot
7
+ from .gemini_chatbot import get_gemini_chatbot
8
+
9
+ __all__ = ["get_smart_chatbot", "get_multi_agent_chatbot", "get_gemini_chatbot"]
10
+
src/agents/gemini_chatbot.py ADDED
@@ -0,0 +1,392 @@
1
+ """
2
+ Gemini File Search Chatbot (Beta Version)
3
+
4
+ This chatbot uses Google Gemini File Search API for RAG.
5
+ It provides a simpler architecture: Main Agent + Gemini Agent
6
+ """
7
+
8
+ import os
9
+ import json
10
+ import time
11
+ import logging
12
+ import traceback
13
+ from pathlib import Path
14
+ from typing import Dict, List, Any, Optional, TypedDict
15
+
16
+ from langgraph.graph import StateGraph, END
17
+ from langchain_core.prompts import ChatPromptTemplate
18
+ from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
19
+
20
+ from src.gemini.file_search import GeminiFileSearchClient, GeminiFileSearchResult
21
+ from src.config.paths import CONVERSATIONS_DIR
22
+
23
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ class GeminiState(TypedDict):
28
+ """State for Gemini chatbot conversation flow"""
29
+ conversation_id: str
30
+ messages: List[Any]
31
+ current_query: str
32
+ query_context: Optional[Dict[str, Any]]
33
+ gemini_result: Optional[GeminiFileSearchResult]
34
+ final_response: Optional[str]
35
+ agent_logs: List[str]
36
+ conversation_context: Dict[str, Any]
37
+ session_start_time: float
38
+ last_ai_message_time: float
39
+ filters: Optional[Dict[str, Any]]
40
+
41
+
42
+ class GeminiRAGChatbot:
43
+ """Gemini File Search RAG chatbot (Beta version)"""
44
+
45
+ def __init__(self):
46
+ """Initialize the Gemini chatbot"""
47
+ logger.info("πŸ€– INITIALIZING: Gemini File Search Chatbot (Beta)")
48
+
49
+ # Initialize Gemini File Search client
50
+ try:
51
+ self.gemini_client = GeminiFileSearchClient()
52
+ logger.info("βœ… Gemini File Search client initialized")
53
+ except Exception as e:
54
+ logger.error(f"❌ Failed to initialize Gemini client: {e}")
55
+ raise RuntimeError(f"Gemini client initialization failed: {e}")
56
+
57
+ # Build the LangGraph with LangSmith tracing if enabled
58
+ self.graph = self._build_graph()
59
+
60
+ # Enable LangSmith tracing if configured
61
+ langsmith_enabled = os.getenv("LANGCHAIN_TRACING_V2", "false").lower() == "true"
62
+ if langsmith_enabled:
63
+ logger.info("πŸ” LangSmith tracing enabled")
64
+ langsmith_project = os.getenv("LANGCHAIN_PROJECT", "gemini-chatbot")
65
+ logger.info(f"πŸ“Š LangSmith project: {langsmith_project}")
66
+
67
+ # Conversations directory
68
+ self.conversations_dir = CONVERSATIONS_DIR
69
+ try:
70
+ self.conversations_dir.mkdir(parents=True, mode=0o777, exist_ok=True)
71
+ except (PermissionError, OSError) as e:
72
+ logger.warning(f"Could not create conversations directory: {e}")
73
+ self.conversations_dir = Path("conversations")
74
+ self.conversations_dir.mkdir(parents=True, mode=0o777, exist_ok=True)
75
+
76
+ logger.info("βœ… Gemini File Search Chatbot initialized")
77
+
78
+ def _build_graph(self) -> StateGraph:
79
+ """Build the LangGraph for Gemini chatbot"""
80
+ graph = StateGraph(GeminiState)
81
+
82
+ # Add nodes
83
+ graph.add_node("main_agent", self._main_agent)
84
+ graph.add_node("gemini_agent", self._gemini_agent)
85
+
86
+ # Define the flow
87
+ graph.set_entry_point("main_agent")
88
+ graph.add_edge("main_agent", "gemini_agent")
89
+ graph.add_edge("gemini_agent", END)
90
+
91
+ return graph.compile()
92
+
93
+ def _main_agent(self, state: GeminiState) -> GeminiState:
94
+ """Main Agent: Extracts filters and prepares query"""
95
+ logger.info("🎯 MAIN AGENT: Processing query")
96
+
97
+ query = state["current_query"]
98
+ messages = state["messages"]
99
+
100
+ # Extract UI filters if present in query
101
+ ui_filters = self._extract_ui_filters(query)
102
+
103
+ # Extract context from conversation
104
+ context = self._extract_context_from_conversation(messages, ui_filters)
105
+
106
+ # Store context and filters
107
+ state["query_context"] = context
108
+ state["filters"] = context.get("filters", {})
109
+
110
+ logger.info(f"🎯 MAIN AGENT: Filters extracted: {state['filters']}")
111
+
112
+ return state
113
+
114
+ def _gemini_agent(self, state: GeminiState) -> GeminiState:
115
+ """Gemini Agent: Performs file search and generates response"""
116
+ logger.info("πŸ” GEMINI AGENT: Starting file search")
117
+
118
+ query = state["current_query"]
119
+ filters = state.get("filters", {})
120
+
121
+ # Perform Gemini file search
122
+ try:
123
+ result = self.gemini_client.search(query=query, filters=filters)
124
+ logger.info(f"βœ… GEMINI AGENT: Search completed, {len(result.sources)} sources found")
125
+
126
+ # Enhance response with document references
127
+ enhanced_response = self._enhance_response_with_references(
128
+ result.answer,
129
+ result.sources,
130
+ query
131
+ )
132
+
133
+ state["gemini_result"] = result
134
+ state["final_response"] = enhanced_response
135
+ state["last_ai_message_time"] = time.time()
136
+
137
+ state["agent_logs"].append(f"GEMINI AGENT: Found {len(result.sources)} sources")
138
+
139
+ except Exception as e:
140
+ logger.error(f"❌ GEMINI AGENT ERROR: {e}")
141
+ traceback.print_exc()
142
+ state["final_response"] = "I apologize, but I encountered an error while searching. Please try again."
143
+ state["last_ai_message_time"] = time.time()
144
+
145
+ return state
146
+
147
+ def _enhance_response_with_references(self, answer: str, sources: List[Any], query: str) -> str:
148
+ """Enhance Gemini response to include document references and format nicely"""
149
+ if not sources or not answer:
150
+ return answer
151
+
152
+ # Use LLM to intelligently add document references and format nicely
153
+ try:
154
+ from src.llm.adapters import get_llm_client
155
+ llm = get_llm_client()
156
+
157
+ # Prepare document summaries for the LLM
158
+ doc_summaries = []
159
+ for idx, doc in enumerate(sources, 1):
160
+ metadata = getattr(doc, 'metadata', {}) if hasattr(doc, 'metadata') else (doc if isinstance(doc, dict) else {})
161
+ content = getattr(doc, 'page_content', '') if hasattr(doc, 'page_content') else (doc.get('content', '') if isinstance(doc, dict) else '')
162
+
163
+ filename = metadata.get('filename', 'Unknown') if isinstance(metadata, dict) else 'Unknown'
164
+ year = metadata.get('year', 'Unknown') if isinstance(metadata, dict) else 'Unknown'
165
+ source = metadata.get('source', 'Unknown') if isinstance(metadata, dict) else 'Unknown'
166
+ district = metadata.get('district', '') if isinstance(metadata, dict) else ''
167
+
168
+ doc_info = f"{filename}"
169
+ if year and year != 'Unknown':
170
+ doc_info += f" ({year})"
171
+ if source and source != 'Unknown':
172
+ doc_info += f" - {source}"
173
+ if district:
174
+ doc_info += f" - {district}"
175
+
176
+ doc_summaries.append(f"[Doc {idx}] {doc_info}: {content[:300]}...")
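+ # Produces lines like "[Doc 1] report.pdf (2022) - Local Government - Gulu: <first 300 chars>..." (filename illustrative)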
177
+
178
+ prompt = f"""You are enhancing a response from a document search system. The original response is:
179
+
180
+ {answer}
181
+
182
+ The following documents were retrieved and used to generate this response:
183
+
184
+ {chr(10).join(doc_summaries)}
185
+
186
+ CRITICAL RULES:
187
+ 1. Format the response nicely with proper paragraphs, bullet points, or structured sections where appropriate
188
+ 2. The response should ONLY contain information from the retrieved documents listed above
189
+ 3. If the response mentions information NOT found in the retrieved documents, you must REMOVE or CORRECT that information
190
+ 4. Add document references [Doc i] at the end of sentences that use information from specific documents
191
+ 5. Only reference documents that are actually used in the response
192
+ 6. If the response mentions years, sources, or data that don't match the retrieved documents, you must correct it
193
+ 7. Keep the response natural, conversational, and well-formatted
194
+ 8. Use proper formatting: paragraphs, line breaks, and structure for readability
195
+ 9. Don't change the core content that matches the documents, just add references where appropriate and improve formatting
196
+ 10. If multiple documents support the same claim, use [Doc i, Doc j] format
197
+ 11. If the response contains information that cannot be verified in the retrieved documents, add a note like: "Note: This information may not be in the retrieved documents."
198
+
199
+ Return ONLY the enhanced, well-formatted response with references added and any corrections made. Do not include any explanation or meta-commentary."""
200
+
201
+ llm_response = llm.invoke(prompt)  # invoke once and reuse; the previous one-liner could call the LLM up to three times
+ enhanced = llm_response.content if hasattr(llm_response, 'content') else str(llm_response)
202
+
203
+ # Fallback: if LLM fails, just return original with basic formatting
204
+ if not enhanced or len(enhanced) < len(answer) * 0.5:
205
+ logger.warning("LLM enhancement failed, using original response with basic formatting")
206
+ # Basic formatting: add line breaks after periods for readability
207
+ formatted = answer.replace('. ', '.\n\n')
208
+ if sources:
209
+ ref_list = ", ".join([f"[Doc {i+1}]" for i in range(min(len(sources), 5))])
210
+ formatted += f"\n\n*Based on documents: {ref_list}*"
211
+ return formatted
212
+
213
+ return enhanced
214
+
215
+ except Exception as e:
216
+ logger.warning(f"Failed to enhance response with references: {e}")
217
+ # Fallback: add basic formatting and references at the end
218
+ formatted = answer.replace('. ', '.\n\n') # Basic paragraph formatting
219
+ if sources:
220
+ ref_list = ", ".join([f"[Doc {i+1}]" for i in range(min(len(sources), 5))])
221
+ formatted += f"\n\n*Based on documents: {ref_list}*"
222
+ return formatted
223
+
224
+ def _extract_ui_filters(self, query: str) -> Dict[str, List[str]]:
225
+ """Extract UI filters from query if present"""
226
+ filters = {}
227
+
228
+ if "FILTER CONTEXT:" in query:
229
+ filter_section = query.split("FILTER CONTEXT:")[1]
230
+ if "USER QUERY:" in filter_section:
231
+ filter_section = filter_section.split("USER QUERY:")[0]
232
+ filter_section = filter_section.strip()
233
+
234
+ if "Sources:" in filter_section:
235
+ sources_line = [line for line in filter_section.split('\n') if line.strip().startswith('Sources:')]
236
+ if sources_line:
237
+ sources_str = sources_line[0].split("Sources:")[1].strip()
238
+ if sources_str and sources_str != "None":
239
+ filters["sources"] = [s.strip() for s in sources_str.split(",")]
240
+
241
+ if "Years:" in filter_section:
242
+ years_line = [line for line in filter_section.split('\n') if line.strip().startswith('Years:')]
243
+ if years_line:
244
+ years_str = years_line[0].split("Years:")[1].strip()
245
+ if years_str and years_str != "None":
246
+ filters["year"] = [y.strip() for y in years_str.split(",")]
247
+
248
+ if "Districts:" in filter_section:
249
+ districts_line = [line for line in filter_section.split('\n') if line.strip().startswith('Districts:')]
250
+ if districts_line:
251
+ districts_str = districts_line[0].split("Districts:")[1].strip()
252
+ if districts_str and districts_str != "None":
253
+ filters["district"] = [d.strip() for d in districts_str.split(",")]
254
+
255
+ if "Filenames:" in filter_section:
256
+ filenames_line = [line for line in filter_section.split('\n') if line.strip().startswith('Filenames:')]
257
+ if filenames_line:
258
+ filenames_str = filenames_line[0].split("Filenames:")[1].strip()
259
+ if filenames_str and filenames_str != "None":
260
+ filters["filenames"] = [f.strip() for f in filenames_str.split(",")]
261
+
262
+ return filters
263
+
264
+ def _extract_context_from_conversation(
265
+ self,
266
+ messages: List[Any],
267
+ ui_filters: Dict[str, List[str]]
268
+ ) -> Dict[str, Any]:
269
+ """Extract context from conversation history"""
270
+ # Use UI filters if available
271
+ filters = ui_filters.copy() if ui_filters else {}
272
+
273
+ # For Gemini, we pass filters directly to the search function
274
+ # The filters will be used to add context to the query
275
+
276
+ return {
277
+ "filters": filters,
278
+ "has_filters": bool(filters)
279
+ }
280
+
281
+ def chat(self, user_input: str, conversation_id: str = "default") -> Dict[str, Any]:
282
+ """Main chat interface"""
283
+ logger.info(f"πŸ’¬ GEMINI CHAT: Processing '{user_input[:50]}...'")
284
+
285
+ # Load conversation
286
+ conversation_file = self.conversations_dir / f"{conversation_id}.json"
287
+ conversation = self._load_conversation(conversation_file)
288
+
289
+ # Add user message
290
+ conversation["messages"].append(HumanMessage(content=user_input))
291
+
292
+ # Prepare state
293
+ state = GeminiState(
294
+ conversation_id=conversation_id,
295
+ messages=conversation["messages"],
296
+ current_query=user_input,
297
+ query_context=None,
298
+ gemini_result=None,
299
+ final_response=None,
300
+ agent_logs=[],
301
+ conversation_context=conversation.get("context", {}),
302
+ session_start_time=conversation["session_start_time"],
303
+ last_ai_message_time=conversation["last_ai_message_time"],
304
+ filters=None
305
+ )
306
+
307
+ # Run graph
308
+ final_state = self.graph.invoke(state)
309
+
310
+ # Add AI response to conversation
311
+ if final_state["final_response"]:
312
+ conversation["messages"].append(AIMessage(content=final_state["final_response"]))
313
+
314
+ # Update conversation
315
+ conversation["last_ai_message_time"] = final_state["last_ai_message_time"]
316
+ conversation["context"] = final_state["conversation_context"]
317
+
318
+ # Save conversation
319
+ self._save_conversation(conversation_file, conversation)
320
+
321
+ # Format sources for display
322
+ sources = []
323
+ gemini_result = final_state.get("gemini_result")
324
+ if gemini_result:
325
+ sources = self.gemini_client.format_sources_for_display(gemini_result)
326
+ logger.info(f"πŸ“‹ GEMINI CHAT: Formatted {len(sources)} sources for display")
327
+
328
+ return {
329
+ 'response': final_state["final_response"] or "I apologize, but I couldn't process your request.",
330
+ 'rag_result': {
331
+ 'sources': sources,
332
+ 'answer': final_state["final_response"]
333
+ },
334
+ 'agent_logs': final_state["agent_logs"],
335
+ 'actual_rag_query': final_state["current_query"],
336
+ 'gemini_result': gemini_result # Include raw result for tracking
337
+ }
338
+
339
+ def _load_conversation(self, conversation_file: Path) -> Dict[str, Any]:
340
+ """Load conversation from file"""
341
+ if conversation_file.exists():
342
+ try:
343
+ with open(conversation_file) as f:
344
+ data = json.load(f)
345
+ messages = []
346
+ for msg_data in data.get("messages", []):
347
+ if msg_data["type"] == "human":
348
+ messages.append(HumanMessage(content=msg_data["content"]))
349
+ elif msg_data["type"] == "ai":
350
+ messages.append(AIMessage(content=msg_data["content"]))
351
+ data["messages"] = messages
352
+ return data
353
+ except Exception as e:
354
+ logger.warning(f"Could not load conversation: {e}")
355
+
356
+ return {
357
+ "messages": [],
358
+ "session_start_time": time.time(),
359
+ "last_ai_message_time": time.time(),
360
+ "context": {}
361
+ }
362
+
363
+ def _save_conversation(self, conversation_file: Path, conversation: Dict[str, Any]):
364
+ """Save conversation to file"""
365
+ try:
366
+ conversation_file.parent.mkdir(parents=True, mode=0o777, exist_ok=True)
367
+
368
+ messages_data = []
369
+ for msg in conversation["messages"]:
370
+ if isinstance(msg, HumanMessage):
371
+ messages_data.append({"type": "human", "content": msg.content})
372
+ elif isinstance(msg, AIMessage):
373
+ messages_data.append({"type": "ai", "content": msg.content})
374
+
375
+ conversation_data = {
376
+ "messages": messages_data,
377
+ "session_start_time": conversation["session_start_time"],
378
+ "last_ai_message_time": conversation["last_ai_message_time"],
379
+ "context": conversation.get("context", {})
380
+ }
381
+
382
+ with open(conversation_file, 'w') as f:
383
+ json.dump(conversation_data, f, indent=2)
384
+
385
+ except Exception as e:
386
+ logger.error(f"Could not save conversation: {e}")
387
+
388
+
389
+ def get_gemini_chatbot():
390
+ """Get Gemini chatbot instance"""
391
+ return GeminiRAGChatbot()
392
+
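+ # Usage sketch (assumes GeminiFileSearchClient reads its API key from the environment):
+ #   bot = get_gemini_chatbot()
+ #   result = bot.chat("Summarize the key audit findings for Gulu", conversation_id="demo")
+ #   print(result["response"], len(result["rag_result"]["sources"]))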
multi_agent_chatbot.py β†’ src/agents/multi_agent_chatbot.py RENAMED
@@ -8,24 +8,26 @@ This system implements a 3-agent architecture:
8
 
9
  Each agent has specialized prompts and responsibilities.
10
  """
 
11
  import json
12
  import time
13
  import logging
 
14
  from pathlib import Path
15
  from datetime import datetime
16
  from dataclasses import dataclass
17
  from typing import Dict, List, Any, Optional, TypedDict
18
 
19
-
20
  from langchain_core.tools import tool
21
  from langgraph.graph import StateGraph, END
22
- from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
23
  from langchain_core.prompts import ChatPromptTemplate
 
24
 
25
 
26
  from src.pipeline import PipelineManager
27
- from src.config.loader import load_config
28
  from src.llm.adapters import get_llm_client
 
 
29
 
30
 
31
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -46,6 +48,7 @@ class QueryContext:
46
  needs_follow_up: bool = False
47
  follow_up_question: Optional[str] = None
48
 
 
49
  class MultiAgentState(TypedDict):
50
  """State for the multi-agent conversation flow"""
51
  conversation_id: str
@@ -61,6 +64,7 @@ class MultiAgentState(TypedDict):
61
  session_start_time: float
62
  last_ai_message_time: float
63
 
 
64
  class MultiAgentRAGChatbot:
65
  """Multi-agent RAG chatbot with specialized agents"""
66
 
@@ -112,7 +116,6 @@ class MultiAgentRAGChatbot:
112
  logger.info("βœ… Pipeline manager initialized and models loaded")
113
  except Exception as e:
114
  logger.error(f"❌ Failed to initialize pipeline manager: {e}")
115
- import traceback
116
  traceback.print_exc()
117
  raise RuntimeError(f"Pipeline manager initialization failed: {e}")
118
 
@@ -129,7 +132,6 @@ class MultiAgentRAGChatbot:
129
  raise # Re-raise RuntimeError as-is
130
  except Exception as e:
131
  logger.error(f"❌ Error during vector store connection: {e}")
132
- import traceback
133
  traceback.print_exc()
134
  raise RuntimeError(f"Vector store connection failed: {e}")
135
 
@@ -139,8 +141,8 @@ class MultiAgentRAGChatbot:
139
  # Build the multi-agent graph
140
  self.graph = self._build_graph()
141
 
142
- # Conversations directory - use absolute path in /app to ensure writability
143
- self.conversations_dir = Path("/app/conversations")
144
  try:
145
  # Use 777 permissions for maximum compatibility (HF Spaces runs as different user)
146
  self.conversations_dir.mkdir(parents=True, mode=0o777, exist_ok=True)
@@ -158,9 +160,9 @@ class MultiAgentRAGChatbot:
158
 
159
  def _load_dynamic_data(self):
160
  """Load dynamic data from filter_options.json and add_district_metadata.py"""
161
- # Load filter options
162
  try:
163
- fo = Path("src/config/filter_options.json")
164
  if fo.exists():
165
  with open(fo) as f:
166
  data = json.load(f)
@@ -178,7 +180,7 @@ class MultiAgentRAGChatbot:
178
  self.source_whitelist = ['Consolidated', 'Local Government', 'Ministry, Department and Agency']
179
  self.district_whitelist = ['Kampala', 'Gulu', 'Kalangala']
180
 
181
- # Enrich district list from add_district_metadata.py
182
  try:
183
  from add_district_metadata import DistrictMetadataProcessor
184
  proc = DistrictMetadataProcessor()
@@ -206,6 +208,59 @@ class MultiAgentRAGChatbot:
206
  logger.info(f" Sources: {self.source_whitelist}")
207
  logger.info(f" Districts: {len(self.district_whitelist)} districts (first 10: {self.district_whitelist[:10]})")
208
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  def _build_graph(self) -> StateGraph:
210
  """Build the multi-agent LangGraph"""
211
  graph = StateGraph(MultiAgentState)
@@ -510,6 +565,10 @@ class MultiAgentRAGChatbot:
510
  - If user mentions "Lwengo, Kiboga and Namutumba" - extract ["Lwengo", "Kiboga", "Namutumba"] (as JSON array)
511
  - If user mentions "Lwengo District and Kiboga District" - extract ["Lwengo", "Kiboga"] (as JSON array, remove "District" suffix)
512
  - Always return districts as JSON arrays when multiple districts are mentioned
513
  - If no exact matches found, set extracted values to null
514
 
515
  4. **FILENAME FILTERING (MUTUALLY EXCLUSIVE)**:
@@ -590,7 +649,6 @@ Analyze this query using ONLY the exact values provided above:""")
590
  # Clean and parse JSON with better error handling
591
  try:
592
  # Remove comments (// and /* */) from JSON
593
- import re
594
  # Remove single-line comments
595
  content = re.sub(r'//.*?$', '', content, flags=re.MULTILINE)
596
  # Remove multi-line comments
@@ -603,7 +661,6 @@ Analyze this query using ONLY the exact values provided above:""")
603
  logger.error(f"❌ Raw content: {content[:200]}...")
604
 
605
  # Try to extract JSON from text if embedded
606
- import re
607
  json_match = re.search(r'\{.*\}', content, re.DOTALL)
608
  if json_match:
609
  try:
@@ -656,13 +713,9 @@ Analyze this query using ONLY the exact values provided above:""")
656
  # Validate each district in the array
657
  valid_districts = []
658
  for district in extracted_district:
659
- if district in self.district_whitelist:
660
- valid_districts.append(district)
661
- else:
662
- # Try removing "District" suffix
663
- district_name = district.replace(" District", "").replace(" district", "")
664
- if district_name in self.district_whitelist:
665
- valid_districts.append(district_name)
666
 
667
  if valid_districts:
668
  extracted_district = valid_districts[0] if len(valid_districts) == 1 else valid_districts
@@ -671,16 +724,15 @@ Analyze this query using ONLY the exact values provided above:""")
671
  logger.warning(f"⚠️ No valid districts found in: '{extracted_district}'")
672
  extracted_district = None
673
  else:
674
- # Single district validation
675
- if extracted_district not in self.district_whitelist:
676
- # Try removing "District" suffix
677
- district_name = extracted_district.replace(" District", "").replace(" district", "")
678
- if district_name in self.district_whitelist:
679
- logger.info(f"πŸ” QUERY ANALYSIS: Normalized district '{extracted_district}' to '{district_name}'")
680
- extracted_district = district_name
681
- else:
682
- logger.warning(f"⚠️ Invalid district extracted: '{extracted_district}' not in whitelist")
683
- extracted_district = None
684
 
685
  # Validate source (handle both single values and arrays)
686
  if extracted_source:
@@ -918,6 +970,23 @@ Rewrite the best retrieval query:""")
918
  logger.info(f"πŸ”§ FILTER BUILDING: Added districts filter from UI: {context.ui_filters['districts']} β†’ normalized: {normalized_districts}")
919
 
920
  # Merge with extracted context for missing filters
921
  if not filters.get("year") and context.extracted_year:
922
  # Handle both single values and arrays
923
  if isinstance(context.extracted_year, list):
@@ -926,16 +995,6 @@ Rewrite the best retrieval query:""")
926
  filters["year"] = [context.extracted_year]
927
  logger.info(f"πŸ”§ FILTER BUILDING: Added extracted year filter (UI missing): {context.extracted_year}")
928
 
929
- if not filters.get("district") and context.extracted_district:
930
- # Handle both single values and arrays
931
- if isinstance(context.extracted_district, list):
932
- # Normalize district names to title case (match Qdrant metadata format)
933
- normalized = [d.title() for d in context.extracted_district]
934
- filters["district"] = normalized
935
- else:
936
- filters["district"] = [context.extracted_district.title()]
937
- logger.info(f"πŸ”§ FILTER BUILDING: Added extracted district filter (UI missing): {context.extracted_district}")
938
-
939
  if not filters.get("sources") and context.extracted_source:
940
  # Handle both single values and arrays
941
  if isinstance(context.extracted_source, list):
@@ -963,12 +1022,21 @@ Rewrite the best retrieval query:""")
963
  logger.info(f"πŸ”§ FILTER BUILDING: Added extracted year filter: {context.extracted_year}")
964
 
965
  if context.extracted_district:
966
- # Handle both single values and arrays
967
  if isinstance(context.extracted_district, list):
968
- filters["district"] = context.extracted_district
 
 
 
 
 
 
 
969
  else:
970
- filters["district"] = [context.extracted_district]
971
- logger.info(f"πŸ”§ FILTER BUILDING: Added extracted district filter: {context.extracted_district}")
 
 
972
 
973
  logger.info(f"πŸ”§ FILTER BUILDING: Final filters: {filters}")
974
  return filters
@@ -978,49 +1046,212 @@ Rewrite the best retrieval query:""")
978
  logger.info("πŸ’¬ RESPONSE GENERATION: Starting conversational response generation")
979
  logger.info(f"πŸ’¬ RESPONSE GENERATION: Processing {len(documents)} documents")
980
  logger.info(f"πŸ’¬ RESPONSE GENERATION: Query: '{query[:50]}...'")
 
 
 
 
 
 
 
 
 
 
981
 
982
  # Create response prompt
983
  logger.info(f"πŸ’¬ RESPONSE GENERATION: Building response prompt")
984
  response_prompt = ChatPromptTemplate.from_messages([
985
  SystemMessage(content="""You are a helpful audit report assistant. Generate a natural, conversational response.
986
987
  RULES:
988
  1. Answer the user's question directly and clearly
989
- 2. Use the retrieved documents as evidence
990
  3. Be conversational, not technical
991
  4. Don't mention scores, retrieval details, or technical implementation
992
  5. If relevant documents were found, reference them naturally
993
- 6. If no relevant documents, explain based on your knowledge (if you have it) or just say you do not have enough information.
994
- 7. If the passages have useful facts or numbers, use them in your answer.
995
- 8. When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence. i stands for the number of the document.
996
  9. Do not use the sentence 'Doc i says ...' to say where information came from.
997
  10. If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]
998
  11. Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
999
  12. If it makes sense, use bullet points and lists to make your answers easier to understand.
1000
  13. You do not need to use every passage. Only use the ones that help answer the question.
1001
- 14. If the documents do not have the information needed to answer the question, just say you do not have enough information.
1002
-
 
1003
 
1004
  TONE: Professional but friendly, like talking to a colleague."""),
1005
- HumanMessage(content=f"""User Question: {query}
 
 
 
1006
 
1007
  Retrieved Documents: {len(documents)} documents found
1008
 
 
 
 
 
 
 
1009
  RAG Answer: {rag_answer}
1010
 
1011
- Generate a conversational response:""")
1012
  ])
1013
 
1014
  try:
1015
  logger.info(f"πŸ’¬ RESPONSE GENERATION: Calling LLM for final response")
1016
  response = self.llm.invoke(response_prompt.format_messages())
1017
  logger.info(f"πŸ’¬ RESPONSE GENERATION: LLM response received: {response.content[:100]}...")
1018
- return response.content.strip()
1019
  except Exception as e:
1020
  logger.error(f"❌ RESPONSE GENERATION: Error during generation: {e}")
1021
  logger.info(f"πŸ’¬ RESPONSE GENERATION: Using RAG answer as fallback")
1022
  return rag_answer # Fallback to RAG answer
1023
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1024
  def _generate_conversational_response_without_docs(self, query: str, messages: List[Any]) -> str:
1025
  """Generate conversational response using only LLM knowledge and conversation history"""
1026
  logger.info("πŸ’¬ RESPONSE GENERATION (NO DOCS): Starting response generation without documents")
@@ -1178,7 +1409,6 @@ Generate a conversational response based on your knowledge:""")
1178
 
1179
  except Exception as e:
1180
  logger.error(f"Could not save conversation: {e}")
1181
- import traceback
1182
  logger.error(f"Traceback: {traceback.format_exc()}")
1183
 
1184
 
 
8
 
9
  Each agent has specialized prompts and responsibilities.
10
  """
11
+ import re
12
  import json
13
  import time
14
  import logging
15
+ import traceback
16
  from pathlib import Path
17
  from datetime import datetime
18
  from dataclasses import dataclass
19
  from typing import Dict, List, Any, Optional, TypedDict
20
 
 
21
  from langchain_core.tools import tool
22
  from langgraph.graph import StateGraph, END
 
23
  from langchain_core.prompts import ChatPromptTemplate
24
+ from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
25
 
26
 
27
  from src.pipeline import PipelineManager
 
28
  from src.llm.adapters import get_llm_client
29
+ from src.config.paths import PROJECT_DIR, CONVERSATIONS_DIR
30
+ from src.config.loader import load_config, get_embedding_model_for_collection
31
 
32
 
33
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
48
  needs_follow_up: bool = False
49
  follow_up_question: Optional[str] = None
50
 
51
+
52
  class MultiAgentState(TypedDict):
53
  """State for the multi-agent conversation flow"""
54
  conversation_id: str
 
64
  session_start_time: float
65
  last_ai_message_time: float
66
 
67
+
68
  class MultiAgentRAGChatbot:
69
  """Multi-agent RAG chatbot with specialized agents"""
70
 
 
116
  logger.info("βœ… Pipeline manager initialized and models loaded")
117
  except Exception as e:
118
  logger.error(f"❌ Failed to initialize pipeline manager: {e}")
 
119
  traceback.print_exc()
120
  raise RuntimeError(f"Pipeline manager initialization failed: {e}")
121
 
 
132
  raise # Re-raise RuntimeError as-is
133
  except Exception as e:
134
  logger.error(f"❌ Error during vector store connection: {e}")
 
135
  traceback.print_exc()
136
  raise RuntimeError(f"Vector store connection failed: {e}")
137
 
 
141
  # Build the multi-agent graph
142
  self.graph = self._build_graph()
143
 
144
+ # Conversations directory - use PROJECT_DIR for local vs deployed compatibility
145
+ self.conversations_dir = CONVERSATIONS_DIR
146
  try:
147
  # Use 777 permissions for maximum compatibility (HF Spaces runs as different user)
148
  self.conversations_dir.mkdir(parents=True, mode=0o777, exist_ok=True)
 
160
 
161
  def _load_dynamic_data(self):
162
  """Load dynamic data from filter_options.json and add_district_metadata.py"""
163
+ # Load filter options - use PROJECT_DIR relative path
164
  try:
165
+ fo = PROJECT_DIR / "src" / "config" / "filter_options.json"
166
  if fo.exists():
167
  with open(fo) as f:
168
  data = json.load(f)
 
180
  self.source_whitelist = ['Consolidated', 'Local Government', 'Ministry, Department and Agency']
181
  self.district_whitelist = ['Kampala', 'Gulu', 'Kalangala']
182
 
183
+ # Enrich district list from add_district_metadata.py (if available)
184
  try:
185
  from add_district_metadata import DistrictMetadataProcessor
186
  proc = DistrictMetadataProcessor()
 
208
  logger.info(f" Sources: {self.source_whitelist}")
209
  logger.info(f" Districts: {len(self.district_whitelist)} districts (first 10: {self.district_whitelist[:10]})")
210
 
211
+ def _normalize_district_name(self, district: str) -> Optional[str]:
212
+ """Normalize district name with fuzzy matching for common misspellings."""
213
+ if not district:
214
+ return None
215
+
216
+ district = district.strip()
217
+
218
+ # Direct match
219
+ if district in self.district_whitelist:
220
+ return district
221
+
222
+ # Remove "District" suffix
223
+ district_name = district.replace(" District", "").replace(" district", "").strip()
224
+ if district_name in self.district_whitelist:
225
+ return district_name
226
+
227
+ # Common misspellings mapping (lowercase keys only: the lookup below lowercases the input first,
+ # so mixed-case duplicates would never be reached)
228
+ misspelling_map = {
229
+ "kalagala": "Kalangala",
230
+ "kalangala": "Kalangala",
231
+ "gulu": "Gulu",
232
+ "kampala": "Kampala",
233
+ }
238
+
239
+ # Check misspelling map (case-insensitive)
240
+ district_lower = district_name.lower()
241
+ if district_lower in misspelling_map:
242
+ corrected = misspelling_map[district_lower]
243
+ if corrected in self.district_whitelist:
244
+ return corrected
245
+
246
+ # Fuzzy matching for similar names (case-insensitive equality, then a length-gated substring check)
247
+ # Check if the district name is very similar to any whitelist entry
248
+ for whitelist_district in self.district_whitelist:
249
+ # Case-insensitive comparison
250
+ if district_name.lower() == whitelist_district.lower():
251
+ return whitelist_district
252
+
253
+ # Check if one is a substring of the other (for partial matches)
254
+ if len(district_name) >= 4 and len(whitelist_district) >= 4:
255
+ if district_name.lower() in whitelist_district.lower() or whitelist_district.lower() in district_name.lower():
256
+ # Treat substring containment as a strong match only when the two lengths are within 80% of each other
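+ # e.g. "Kalangal" (8) vs "Kalangala" (9): 8/9 = 0.89 -> accepted; "Kala" (4) vs "Kalangala" (9): 4/9 = 0.44 -> rejected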
257
+ min_len = min(len(district_name), len(whitelist_district))
258
+ max_len = max(len(district_name), len(whitelist_district))
259
+ if min_len / max_len >= 0.8:
260
+ return whitelist_district
261
+
262
+ return None
263
+
264
  def _build_graph(self) -> StateGraph:
265
  """Build the multi-agent LangGraph"""
266
  graph = StateGraph(MultiAgentState)
 
565
  - If user mentions "Lwengo, Kiboga and Namutumba" - extract ["Lwengo", "Kiboga", "Namutumba"] (as JSON array)
566
  - If user mentions "Lwengo District and Kiboga District" - extract ["Lwengo", "Kiboga"] (as JSON array, remove "District" suffix)
567
  - Always return districts as JSON arrays when multiple districts are mentioned
568
+ - **COMMON MISSPELLINGS**: Handle common misspellings intelligently:
569
+ * "Kalagala" (missing 'n') should be extracted as "Kalangala"
570
+ * "kalagala", "Kalagala", "KALAGALA" should all be normalized to "Kalangala"
571
+ * Similar case-insensitive variations should be normalized to the correct district name
572
  - If no exact matches found, set extracted values to null
573
 
574
  4. **FILENAME FILTERING (MUTUALLY EXCLUSIVE)**:
 
649
  # Clean and parse JSON with better error handling
650
  try:
651
  # Remove comments (// and /* */) from JSON
 
652
  # Remove single-line comments
653
  content = re.sub(r'//.*?$', '', content, flags=re.MULTILINE)
654
  # Remove multi-line comments
 
661
  logger.error(f"❌ Raw content: {content[:200]}...")
662
 
663
  # Try to extract JSON from text if embedded
 
664
  json_match = re.search(r'\{.*\}', content, re.DOTALL)
665
  if json_match:
666
  try:
 
713
  # Validate each district in the array
714
  valid_districts = []
715
  for district in extracted_district:
716
+ normalized = self._normalize_district_name(district)
717
+ if normalized:
718
+ valid_districts.append(normalized)
719
 
720
  if valid_districts:
721
  extracted_district = valid_districts[0] if len(valid_districts) == 1 else valid_districts
 
724
  logger.warning(f"⚠️ No valid districts found in: '{extracted_district}'")
725
  extracted_district = None
726
  else:
727
+ # Single district validation with fuzzy matching
728
+ normalized = self._normalize_district_name(extracted_district)
729
+ if normalized:
730
+ if normalized != extracted_district:
731
+ logger.info(f"πŸ” QUERY ANALYSIS: Normalized district '{extracted_district}' to '{normalized}'")
732
+ extracted_district = normalized
733
+ else:
734
+ logger.warning(f"⚠️ Invalid district extracted: '{extracted_district}' not in whitelist")
735
+ extracted_district = None
 
736
 
737
  # Validate source (handle both single values and arrays)
738
  if extracted_source:
 
970
  logger.info(f"πŸ”§ FILTER BUILDING: Added districts filter from UI: {context.ui_filters['districts']} β†’ normalized: {normalized_districts}")
971
 
972
  # Merge with extracted context for missing filters
973
+ if not filters.get("district") and context.extracted_district:
974
+ # Normalize district names using the normalization function
975
+ if isinstance(context.extracted_district, list):
976
+ normalized_districts = []
977
+ for d in context.extracted_district:
978
+ normalized = self._normalize_district_name(d)
979
+ if normalized:
980
+ normalized_districts.append(normalized)
981
+ if normalized_districts:
982
+ filters["district"] = normalized_districts
983
+ logger.info(f"πŸ”§ FILTER BUILDING: Added districts filter from context: {context.extracted_district} β†’ normalized: {normalized_districts}")
984
+ else:
985
+ normalized = self._normalize_district_name(context.extracted_district)
986
+ if normalized:
987
+ filters["district"] = [normalized]
988
+ logger.info(f"πŸ”§ FILTER BUILDING: Added district filter from context: {context.extracted_district} β†’ normalized: {normalized}")
989
+
990
  if not filters.get("year") and context.extracted_year:
991
  # Handle both single values and arrays
992
  if isinstance(context.extracted_year, list):
 
995
  filters["year"] = [context.extracted_year]
996
  logger.info(f"πŸ”§ FILTER BUILDING: Added extracted year filter (UI missing): {context.extracted_year}")
997
 
 
 
 
 
 
 
 
 
 
 
998
  if not filters.get("sources") and context.extracted_source:
999
  # Handle both single values and arrays
1000
  if isinstance(context.extracted_source, list):
 
1022
  logger.info(f"πŸ”§ FILTER BUILDING: Added extracted year filter: {context.extracted_year}")
1023
 
1024
  if context.extracted_district:
1025
+ # Normalize district names using the normalization function
1026
  if isinstance(context.extracted_district, list):
1027
+ normalized_districts = []
1028
+ for d in context.extracted_district:
1029
+ normalized = self._normalize_district_name(d)
1030
+ if normalized:
1031
+ normalized_districts.append(normalized)
1032
+ if normalized_districts:
1033
+ filters["district"] = normalized_districts
1034
+ logger.info(f"πŸ”§ FILTER BUILDING: Added districts filter from context: {context.extracted_district} β†’ normalized: {normalized_districts}")
1035
  else:
1036
+ normalized = self._normalize_district_name(context.extracted_district)
1037
+ if normalized:
1038
+ filters["district"] = [normalized]
1039
+ logger.info(f"πŸ”§ FILTER BUILDING: Added district filter from context: {context.extracted_district} β†’ normalized: {normalized}")
1040
 
1041
  logger.info(f"πŸ”§ FILTER BUILDING: Final filters: {filters}")
1042
  return filters
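For illustration: if the UI supplied no district filter and the extracted context held district "Kalagala" and year 2022, the merge above would produce a dict shaped like this (values hypothetical):

filters = {
    "district": ["Kalangala"],  # normalized from the extracted "Kalagala"
    "year": ["2022"],
}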
 
1046
  logger.info("πŸ’¬ RESPONSE GENERATION: Starting conversational response generation")
1047
  logger.info(f"πŸ’¬ RESPONSE GENERATION: Processing {len(documents)} documents")
1048
  logger.info(f"πŸ’¬ RESPONSE GENERATION: Query: '{query[:50]}...'")
1049
+ logger.info(f"πŸ’¬ RESPONSE GENERATION: Conversation history: {len(messages)} messages")
1050
+
1051
+ # Build conversation history context
1052
+ conversation_context = self._build_conversation_context(messages)
1053
+
1054
+ # Build detailed document information
1055
+ document_details = self._build_document_details(documents)
1056
+
1057
+ # Extract correct district/source/year names from documents (to correct misspellings)
1058
+ correct_names = self._extract_correct_names_from_documents(documents)
1059
 
1060
  # Create response prompt
1061
  logger.info(f"πŸ’¬ RESPONSE GENERATION: Building response prompt")
1062
  response_prompt = ChatPromptTemplate.from_messages([
1063
  SystemMessage(content="""You are a helpful audit report assistant. Generate a natural, conversational response.
1064
 
1065
+ CRITICAL RULES - NO HALLUCINATION:
1066
+ 1. **ONLY use information from the retrieved documents provided below**
1067
+ 2. **EVERY sentence with facts, numbers, or specific claims MUST have a [Doc i] reference**
1068
+ 3. **If a document doesn't contain the information, DO NOT make it up**
1069
+ 4. **If the user asks about a year/district that's NOT in the retrieved documents, explicitly state that**
1070
+ 5. **Check the document years/districts before making any claims about them**
1071
+ 6. **USE CORRECT NAMES**: If the conversation mentions a misspelled district/source name (e.g., "Kalagala"), use the CORRECT spelling from the document metadata (e.g., "Kalangala"). Always use the exact names from document metadata, not misspellings from conversation.
1072
+
1073
  RULES:
1074
  1. Answer the user's question directly and clearly
1075
+ 2. Use ONLY the retrieved documents as evidence - DO NOT use your training data
1076
  3. Be conversational, not technical
1077
  4. Don't mention scores, retrieval details, or technical implementation
1078
  5. If relevant documents were found, reference them naturally
1079
+ 6. If no relevant documents, say you do not have enough information - DO NOT hallucinate
1080
+ 7. If the passages have useful facts or numbers, use them in your answer WITH references
1081
+ 8. **MANDATORY**: When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence, where i is the document number.
1082
  9. Do not use phrasing like 'Doc i says ...' to attribute information.
1083
  10. If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]
1084
  11. Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
1085
  12. If it makes sense, use bullet points and lists to make your answers easier to understand.
1086
  13. You do not need to use every passage. Only use the ones that help answer the question.
1087
+ 14. **VERIFY**: Before mentioning any year, district, or number, check that it exists in the retrieved documents. If it doesn't, say "I don't have information about [year/district] in the retrieved documents."
1088
+ 15. **NO HALLUCINATION**: If documents show years 2021, 2022, 2023 but user asks about 2020, DO NOT provide 2020 data. Instead say "The retrieved documents cover 2021-2023, but I don't have information for 2020."
1089
+ 16. **USE CORRECT SPELLING**: Always use the district/source names exactly as they appear in the document metadata below, even if the conversation history has misspellings.
1090
 
1091
  TONE: Professional but friendly, like talking to a colleague."""),
1092
+ HumanMessage(content=f"""Conversation History:
1093
+ {conversation_context}
1094
+
1095
+ Current User Question: {query}
1096
 
1097
  Retrieved Documents: {len(documents)} documents found
1098
 
1099
+ CORRECT NAMES TO USE (from document metadata - use these exact spellings):
1100
+ {correct_names}
1101
+
1102
+ Full Document Details:
1103
+ {document_details}
1104
+
1105
  RAG Answer: {rag_answer}
1106
 
1107
+ CRITICAL:
1108
+ - Responses must be grounded in what is available in the retrieved documents
1109
+ - If the user asks about a specific year, district, or source that the retrieved documents do not cover, explicitly state "can't provide response on ... because ..."
1110
+ - Every factual claim MUST have [Doc i] reference
1111
+ - If information is not in documents, explicitly state it's not available
1112
+ - **USE THE CORRECT DISTRICT/SOURCE NAMES from the document metadata above, not misspellings from conversation**
1113
+
1114
+ Generate a conversational response with proper document references:""")
1115
  ])
1116
 
1117
  try:
1118
  logger.info(f"πŸ’¬ RESPONSE GENERATION: Calling LLM for final response")
1119
  response = self.llm.invoke(response_prompt.format_messages())
1120
  logger.info(f"πŸ’¬ RESPONSE GENERATION: LLM response received: {response.content[:100]}...")
1121
+
1122
+ # Post-process response to ensure no hallucination
1123
+ final_response = self._validate_and_enhance_response(
1124
+ response.content.strip(),
1125
+ documents,
1126
+ query
1127
+ )
1128
+
1129
+ return final_response
1130
  except Exception as e:
1131
  logger.error(f"❌ RESPONSE GENERATION: Error during generation: {e}")
1132
  logger.info(f"πŸ’¬ RESPONSE GENERATION: Using RAG answer as fallback")
1133
  return rag_answer # Fallback to RAG answer
1134
 
1135
+ def _build_conversation_context(self, messages: List[Any]) -> str:
1136
+ """Build conversation history context for response generation."""
1137
+ if not messages:
1138
+ return "No previous conversation."
1139
+
1140
+ context_lines = []
1141
+ # Show last 6 messages for context (to capture the current exchange)
1142
+ for msg in messages[-6:]:
1143
+ if isinstance(msg, HumanMessage):
1144
+ context_lines.append(f"User: {msg.content}")
1145
+ elif isinstance(msg, AIMessage):
1146
+ context_lines.append(f"Assistant: {msg.content}")
1147
+
1148
+ return "\n".join(context_lines) if context_lines else "No previous conversation."
1149
+
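A quick sketch of the resulting format, where `bot` stands in for a chatbot instance (hypothetical name):

from langchain_core.messages import HumanMessage, AIMessage

messages = [
    HumanMessage(content="What did the 2022 Kalangala audit find?"),
    AIMessage(content="It flagged unaccounted-for funds [Doc 1]."),
]
print(bot._build_conversation_context(messages))
# User: What did the 2022 Kalangala audit find?
# Assistant: It flagged unaccounted-for funds [Doc 1].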
1150
+ def _build_document_details(self, documents: List[Any]) -> str:
1151
+ """Build detailed document information for response generation."""
1152
+ if not documents:
1153
+ return "No documents retrieved."
1154
+
1155
+ details = []
1156
+ for i, doc in enumerate(documents[:15], 1): # Show up to 15 documents
1157
+ metadata = getattr(doc, 'metadata', {}) if hasattr(doc, 'metadata') else (doc if isinstance(doc, dict) else {})
1158
+ content = getattr(doc, 'page_content', '') if hasattr(doc, 'page_content') else (doc.get('content', '') if isinstance(doc, dict) else '')
1159
+
1160
+ if isinstance(metadata, dict):
1161
+ filename = metadata.get('filename', 'Unknown')
1162
+ year = metadata.get('year', 'Unknown')
1163
+ district = metadata.get('district', 'Unknown')
1164
+ source = metadata.get('source', 'Unknown')
1165
+ page = metadata.get('page', metadata.get('page_label', 'Unknown'))
1166
+
1167
+ doc_info = f"[Doc {i}]"
1168
+ doc_info += f"\n Filename: {filename}"
1169
+ doc_info += f"\n Year: {year}"
1170
+ doc_info += f"\n District: {district}"
1171
+ doc_info += f"\n Source: {source}"
1172
+ if page != 'Unknown':
1173
+ doc_info += f"\n Page: {page}"
1174
+ doc_info += f"\n Content: {content[:300]}{'...' if len(content) > 300 else ''}"
1175
+ details.append(doc_info)
1176
+
1177
+ return "\n\n".join(details) if details else "No document details available."
1178
+
1179
+ def _extract_correct_names_from_documents(self, documents: List[Any]) -> str:
1180
+ """Extract correct district/source names from documents to correct misspellings."""
1181
+ districts = set()
1182
+ sources = set()
1183
+ years = set()
1184
+
1185
+ for doc in documents:
1186
+ metadata = getattr(doc, 'metadata', {}) if hasattr(doc, 'metadata') else (doc if isinstance(doc, dict) else {})
1187
+ if isinstance(metadata, dict):
1188
+ if metadata.get('district'):
1189
+ districts.add(str(metadata['district']))
1190
+ if metadata.get('source'):
1191
+ sources.add(str(metadata['source']))
1192
+ if metadata.get('year'):
1193
+ years.add(str(metadata['year']))
1194
+
1195
+ result = []
1196
+ if districts:
1197
+ result.append(f"Districts: {', '.join(sorted(districts))}")
1198
+ if sources:
1199
+ result.append(f"Sources: {', '.join(sorted(sources))}")
1200
+ if years:
1201
+ result.append(f"Years: {', '.join(sorted(years))}")
1202
+
1203
+ if result:
1204
+ return "\n".join(result) + "\n\nIMPORTANT: Use these EXACT spellings in your response, even if the conversation history has misspellings."
1205
+ return "No metadata available."
1206
+
1207
+ def _validate_and_enhance_response(self, response: str, documents: List[Any], query: str) -> str:
1208
+ """Validate response and ensure all claims are referenced."""
1209
+ # Extract years and districts from documents
1210
+ doc_years = set()
1211
+ doc_districts = set()
1212
+ doc_sources = set()
1213
+
1214
+ for doc in documents:
1215
+ metadata = getattr(doc, 'metadata', {}) if hasattr(doc, 'metadata') else (doc if isinstance(doc, dict) else {})
1216
+ if isinstance(metadata, dict):
1217
+ if metadata.get('year'):
1218
+ doc_years.add(str(metadata['year']))
1219
+ if metadata.get('district'):
1220
+ doc_districts.add(str(metadata['district']))
1221
+ if metadata.get('source'):
1222
+ doc_sources.add(str(metadata['source']))
1223
+
1224
+ # Optionally correct misspellings in the response using names from the documents (currently disabled)
1225
+ # response = self._correct_misspellings_in_response(response, doc_districts, doc_sources)
1226
+
1227
+ # Check if response mentions years not in documents
1228
+ year_pattern = r'\b(20\d{2})\b'
1229
+ mentioned_years = set(re.findall(year_pattern, response))
1230
+
1231
+ # Check if user query mentions a year
1232
+ query_years = set(re.findall(year_pattern, query))
1233
+
1234
+ # If user asks about a year not in documents, add a warning
1235
+ missing_years = query_years - doc_years
1236
+ if missing_years and doc_years:
1237
+ warning = f"\n\n⚠️ Note: The retrieved documents cover years {', '.join(sorted(doc_years))}, but I don't have information for {', '.join(sorted(missing_years))} in the retrieved documents."
1238
+ if warning not in response:
1239
+ response = response + warning
1240
+
1241
+ # Check if response has document references
1242
+ doc_ref_pattern = r'\[Doc\s+\d+\]'
1243
+ has_refs = bool(re.search(doc_ref_pattern, response))
1244
+
1245
+ # If response has factual claims but no references, add a note
1246
+ if not has_refs and len(documents) > 0:
1247
+ # Check if response has numbers or specific claims (simple heuristic)
1248
+ has_numbers = bool(re.search(r'\d+', response))
1249
+ if has_numbers and len(response) > 50:
1250
+ logger.warning("⚠️ Response contains factual claims but no document references")
1251
+ # Don't modify response, but log the issue
1252
+
1253
+ return response
1254
+
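To make the year check concrete, here is the same heuristic as a standalone snippet (sample strings are illustrative):

import re

response = "Spending rose sharply [Doc 1]."
query = "What were the findings in 2020?"
doc_years = {"2021", "2022", "2023"}

query_years = set(re.findall(r"\b(20\d{2})\b", query))
missing = query_years - doc_years
if missing and doc_years:
    response += (
        f"\n\n⚠️ Note: The retrieved documents cover years "
        f"{', '.join(sorted(doc_years))}, but I don't have information for "
        f"{', '.join(sorted(missing))} in the retrieved documents."
    )
print(response)  # ends with the 2020 warning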
1255
  def _generate_conversational_response_without_docs(self, query: str, messages: List[Any]) -> str:
1256
  """Generate conversational response using only LLM knowledge and conversation history"""
1257
  logger.info("πŸ’¬ RESPONSE GENERATION (NO DOCS): Starting response generation without documents")
 
1409
 
1410
  except Exception as e:
1411
  logger.error(f"Could not save conversation: {e}")
 
1412
  logger.error(f"Traceback: {traceback.format_exc()}")
1413
 
1414
 
smart_chatbot.py β†’ src/agents/smart_chatbot.py RENAMED
@@ -26,6 +26,7 @@ from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
26
 
27
  from src.pipeline import PipelineManager
28
  from src.config.loader import load_config
 
29
 
30
 
31
  @dataclass
@@ -161,7 +162,7 @@ class IntelligentRAGChatbot:
161
 
162
  # Try to load district whitelist from filter_options.json
163
  try:
164
- fo = Path("filter_options.json")
165
  if fo.exists():
166
  with open(fo) as f:
167
  data = json.load(f)
@@ -174,7 +175,7 @@ class IntelligentRAGChatbot:
174
  except Exception:
175
  self.district_whitelist = self.available_metadata['districts']
176
 
177
- # Enrich whitelist from add_district_metadata.py if available
178
  try:
179
  from add_district_metadata import DistrictMetadataProcessor
180
  proc = DistrictMetadataProcessor()
@@ -195,7 +196,7 @@ class IntelligentRAGChatbot:
195
 
196
  # Get dynamic year list from filter_options.json
197
  try:
198
- fo = Path("filter_options.json")
199
  if fo.exists():
200
  with open(fo) as f:
201
  data = json.load(f)
 
26
 
27
  from src.pipeline import PipelineManager
28
  from src.config.loader import load_config
29
+ from src.config.paths import PROJECT_DIR
30
 
31
 
32
  @dataclass
 
162
 
163
  # Try to load district whitelist from filter_options.json
164
  try:
165
+ fo = PROJECT_DIR / "src" / "config" / "filter_options.json"
166
  if fo.exists():
167
  with open(fo) as f:
168
  data = json.load(f)
 
175
  except Exception:
176
  self.district_whitelist = self.available_metadata['districts']
177
 
178
+ # Enrich whitelist from add_district_metadata.py if available (optional module)
179
  try:
180
  from add_district_metadata import DistrictMetadataProcessor
181
  proc = DistrictMetadataProcessor()
 
196
 
197
  # Get dynamic year list from filter_options.json
198
  try:
199
+ fo = PROJECT_DIR / "src" / "config" / "filter_options.json"
200
  if fo.exists():
201
  with open(fo) as f:
202
  data = json.load(f)
src/config/paths.py ADDED
@@ -0,0 +1,59 @@
1
+ """
2
+ Path configuration for local vs deployed environments.
3
+
4
+ This module handles different paths for local development vs deployed (HF Spaces) environments.
5
+ """
6
+ import os
7
+ from pathlib import Path
8
+
9
+ # Determine if we're in a deployed environment (HF Spaces/Docker) or local
10
+ # Check for environment variable or Docker-like paths
11
+ IS_DEPLOYED = (
12
+ os.getenv("DEPLOYED", "false").lower() == "true" or
13
+ os.path.exists("/app") or
14
+ os.getenv("SPACES_ID") is not None or
15
+ os.path.exists("/.dockerenv")
16
+ )
17
+
18
+ # PROJECT_DIR: Base directory for application files
19
+ # In deployed: /app, in local: current working directory or project root
20
+ if IS_DEPLOYED:
21
+ PROJECT_DIR = Path("/app")
22
+ else:
23
+ # For local development, use current working directory or find project root
24
+ cwd = Path.cwd()
25
+ # Try to find project root (directory containing this src/ folder)
26
+ project_root = cwd
27
+ while project_root != project_root.parent:
28
+ if (project_root / "src" / "config").exists():
29
+ break
30
+ project_root = project_root.parent
31
+ PROJECT_DIR = project_root
32
+
33
+ # Cache directories - different for local vs deployed
34
+ # Local: Use default user cache locations (don't override)
35
+ # Deployed: Use PROJECT_DIR/.cache
36
+ if IS_DEPLOYED:
37
+ CACHE_DIR = PROJECT_DIR / ".cache"
38
+ HF_CACHE_DIR = CACHE_DIR / "huggingface"
39
+ STREAMLIT_CACHE_DIR = CACHE_DIR / "streamlit"
40
+ else:
41
+ # For local, use default user cache (let libraries use their defaults)
42
+ HF_CACHE_DIR = None # Will use HF defaults (~/.cache/huggingface)
43
+ STREAMLIT_CACHE_DIR = None # Will use Streamlit defaults
44
+
45
+ # Application directories
46
+ FEEDBACK_DIR = PROJECT_DIR / "feedback"
47
+ CONVERSATIONS_DIR = PROJECT_DIR / "conversations"
48
+ STREAMLIT_CONFIG_DIR = PROJECT_DIR / ".streamlit"
49
+
50
+ # Print the configuration when this module is run directly
51
+ if __name__ == "__main__":
52
+ print(f"IS_DEPLOYED: {IS_DEPLOYED}")
53
+ print(f"PROJECT_DIR: {PROJECT_DIR}")
54
+ print(f"HF_CACHE_DIR: {HF_CACHE_DIR}")
55
+ print(f"FEEDBACK_DIR: {FEEDBACK_DIR}")
56
+ print(f"CONVERSATIONS_DIR: {CONVERSATIONS_DIR}")
57
+
58
+
59
+
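Callers resolve files against PROJECT_DIR instead of the working directory (the smart_chatbot.py hunks above show the pattern), and the deployed layout can be forced locally with one environment variable. A usage sketch; note that IS_DEPLOYED is evaluated at import time, so the variable must be set before the first import:

import os
os.environ["DEPLOYED"] = "true"  # must precede the import below

from src.config.paths import PROJECT_DIR, FEEDBACK_DIR

filter_options = PROJECT_DIR / "src" / "config" / "filter_options.json"
FEEDBACK_DIR.mkdir(parents=True, exist_ok=True)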
src/feedback/__init__.py ADDED
@@ -0,0 +1,152 @@
1
+ """
2
+ Feedback Management Module
3
+
4
+ This module provides a unified interface for handling user feedback,
5
+ including data preparation, validation, and Snowflake storage.
6
+ """
7
+
8
+ from typing import Dict, Any, List, Optional
9
+ from langchain_core.messages import HumanMessage, AIMessage
10
+
11
+ from .feedback_schema import UserFeedback, create_feedback_from_dict, generate_snowflake_schema_sql
12
+ from .snowflake_connector import SnowflakeFeedbackConnector, save_to_snowflake, get_snowflake_connector_from_env
13
+
14
+
15
+ class FeedbackManager:
16
+ """
17
+ Unified manager for feedback operations.
18
+
19
+ This class provides a single interface for all feedback-related functionality,
20
+ including data preparation, validation, and storage.
21
+ """
22
+
23
+ def __init__(self):
24
+ """Initialize the FeedbackManager"""
25
+ pass
26
+
27
+ @staticmethod
28
+ def extract_transcript(messages: List[Any]) -> List[Dict[str, str]]:
29
+ """Extract transcript from messages - only user and bot messages, no extra metadata"""
30
+ transcript = []
31
+ for msg in messages:
32
+ if isinstance(msg, HumanMessage):
33
+ transcript.append({
34
+ "role": "user",
35
+ "content": str(msg.content) if hasattr(msg, 'content') else str(msg)
36
+ })
37
+ elif isinstance(msg, AIMessage):
38
+ transcript.append({
39
+ "role": "assistant",
40
+ "content": str(msg.content) if hasattr(msg, 'content') else str(msg)
41
+ })
42
+ return transcript
43
+
44
+ @staticmethod
45
+ def build_retrievals_structure(rag_retrieval_history: List[Dict[str, Any]], messages: List[Any]) -> List[Dict[str, Any]]:
46
+ """Build retrievals structure from retrieval history"""
47
+ retrievals = []
48
+
49
+ for entry in rag_retrieval_history:
50
+ # Get the user message that triggered this retrieval
51
+ # The entry has conversation_up_to which includes messages up to that point
52
+ conversation_up_to = entry.get("conversation_up_to", [])
53
+
54
+ # Find the last user message in conversation_up_to (this is the trigger)
55
+ user_message_trigger = ""
56
+ for msg_dict in reversed(conversation_up_to):
57
+ if msg_dict.get("type") == "HumanMessage":
58
+ user_message_trigger = msg_dict.get("content", "")
59
+ break
60
+
61
+ # Fallback: if not found in conversation_up_to, get from actual messages
62
+ # This handles edge cases where conversation_up_to might be incomplete
63
+ if not user_message_trigger:
64
+ # Find which retrieval this is (0-indexed)
65
+ retrieval_idx = rag_retrieval_history.index(entry)
66
+ # Retrievals happen once per user turn, so the user message that
67
+ # triggered this retrieval is the retrieval_idx-th user message
68
+ # when counting user messages only (0-indexed)
69
+ user_msgs = [msg for msg in messages if isinstance(msg, HumanMessage)]
70
+ if retrieval_idx < len(user_msgs):
71
+ user_message_trigger = str(user_msgs[retrieval_idx].content)
72
+ elif user_msgs:
73
+ # Fallback to last user message
74
+ user_message_trigger = str(user_msgs[-1].content)
75
+
76
+ # Get retrieved documents and truncate content to 100 chars
77
+ docs_retrieved = entry.get("docs_retrieved", [])
78
+ retrieved_docs = []
79
+ for doc in docs_retrieved:
80
+ doc_copy = doc.copy()
81
+ # Truncate content to 100 characters (keep all other fields)
82
+ if "content" in doc_copy:
83
+ doc_copy["content"] = doc_copy["content"][:100]
84
+ retrieved_docs.append(doc_copy)
85
+
86
+ retrievals.append({
87
+ "retrieved_docs": retrieved_docs,
88
+ "user_message_trigger": user_message_trigger
89
+ })
90
+
91
+ return retrievals
92
+
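The index fallback assumes one retrieval per user turn. A worked example with hypothetical data (the empty conversation_up_to entries force the fallback path, and the ts keys just keep the dicts distinct so list.index() resolves each entry correctly):

from langchain_core.messages import HumanMessage, AIMessage
from src.feedback import FeedbackManager

messages = [
    HumanMessage(content="Summarise the 2022 report"),    # triggers retrieval 0
    AIMessage(content="It flagged arrears [Doc 1]."),
    HumanMessage(content="And for Kalangala?"),           # triggers retrieval 1
    AIMessage(content="Two queries were raised [Doc 2]."),
]
history = [
    {"conversation_up_to": [], "docs_retrieved": [], "ts": 1},
    {"conversation_up_to": [], "docs_retrieved": [], "ts": 2},
]
retrievals = FeedbackManager.build_retrievals_structure(history, messages)
print([r["user_message_trigger"] for r in retrievals])
# ['Summarise the 2022 report', 'And for Kalangala?']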
93
+ @staticmethod
94
+ def build_feedback_score_related_retrieval_docs(
95
+ is_feedback_about_last_retrieval: bool,
96
+ messages: List[Any],
97
+ rag_retrieval_history: List[Dict[str, Any]]
98
+ ) -> Optional[Dict[str, Any]]:
99
+ """Build feedback_score_related_retrieval_docs structure"""
100
+ if not rag_retrieval_history:
101
+ return None
102
+
103
+ # Get the relevant retrieval entry
104
+ if is_feedback_about_last_retrieval:
105
+ relevant_entry = rag_retrieval_history[-1]
106
+ else:
107
+ # If feedback is about all retrievals, use the last one as default
108
+ relevant_entry = rag_retrieval_history[-1]
109
+
110
+ # Get conversation up to that point
111
+ conversation_up_to = relevant_entry.get("conversation_up_to", [])
112
+
113
+ # Convert to transcript format (role/content)
114
+ conversation_up_to_point = []
115
+ for msg_dict in conversation_up_to:
116
+ if msg_dict.get("type") == "HumanMessage":
117
+ conversation_up_to_point.append({
118
+ "role": "user",
119
+ "content": msg_dict.get("content", "")
120
+ })
121
+ elif msg_dict.get("type") == "AIMessage":
122
+ conversation_up_to_point.append({
123
+ "role": "assistant",
124
+ "content": msg_dict.get("content", "")
125
+ })
126
+
127
+ # Get retrieved docs with full content (not truncated)
128
+ retrieved_docs = relevant_entry.get("docs_retrieved", [])
129
+
130
+ return {
131
+ "conversation_up_to_point": conversation_up_to_point,
132
+ "retrieved_docs": retrieved_docs
133
+ }
134
+
135
+ @staticmethod
136
+ def create_feedback_from_dict(data: Dict[str, Any]) -> UserFeedback:
137
+ """Create UserFeedback instance from dictionary"""
138
+ return create_feedback_from_dict(data)
139
+
140
+ @staticmethod
141
+ def save_to_snowflake(feedback: UserFeedback, table_name: Optional[str] = None) -> bool:
142
+ """Save feedback to Snowflake"""
143
+ return save_to_snowflake(feedback, table_name)
144
+
145
+ @staticmethod
146
+ def generate_snowflake_schema_sql(table_name: Optional[str] = None) -> str:
147
+ """Generate Snowflake schema SQL"""
148
+ return generate_snowflake_schema_sql(table_name)
149
+
150
+
151
+ __all__ = ["FeedbackManager", "UserFeedback", "save_to_snowflake", "SnowflakeFeedbackConnector"]
152
+
src/feedback/feedback_schema.py ADDED
@@ -0,0 +1,161 @@
1
+ """
2
+ Feedback Schema for RAG Chatbot
3
+
4
+ This module defines dataclasses for feedback data structures
5
+ and provides Snowflake schema generation.
6
+ """
7
+ import os
8
+ from datetime import datetime
9
+ from dataclasses import dataclass, asdict, field
10
+ from typing import List, Optional, Dict, Any, Union
11
+
12
+
13
+
14
+ @dataclass
15
+ class RetrievedDocument:
16
+ """Single retrieved document metadata"""
17
+ doc_id: str
18
+ filename: str
19
+ page: int
20
+ score: float
21
+ content: str
22
+ metadata: Dict[str, Any]
23
+
24
+
25
+ @dataclass
26
+ class RetrievalEntry:
27
+ """Single retrieval operation metadata"""
28
+ rag_query: str
29
+ documents_retrieved: List[RetrievedDocument]
30
+ conversation_length: int
31
+ filters_applied: Optional[Dict[str, Any]] = None
32
+ timestamp: Optional[float] = None
33
+ _raw_data: Optional[Dict[str, Any]] = None
34
+
35
+
36
+ @dataclass
37
+ class UserFeedback:
38
+ """User feedback submission data"""
39
+ feedback_id: str
40
+ open_ended_feedback: Optional[str]
41
+ score: int
42
+ is_feedback_about_last_retrieval: bool
43
+ conversation_id: str
44
+ timestamp: float
45
+ message_count: int
46
+ has_retrievals: bool
47
+ retrieval_count: int
48
+ transcript: List[Dict[str, str]] # List of {"role": "user"/"assistant", "content": "..."}
49
+ retrievals: List[Dict[str, Any]] # List of retrieval objects with retrieved_docs and user_message_trigger
50
+ feedback_score_related_retrieval_docs: Optional[Dict[str, Any]] = None # Conversation subset + retrieved docs
51
+ retrieved_data: Optional[List[Dict[str, Any]]] = None # Preserved old column for backward compatibility
52
+ created_at: str = field(default_factory=lambda: datetime.now().isoformat())
53
+
54
+ def to_dict(self) -> Dict[str, Any]:
55
+ """Convert to dictionary with nested data structures"""
56
+ result = asdict(self)
57
+ return result
58
+
59
+ def to_snowflake_schema(self) -> Dict[str, Any]:
60
+ """Generate Snowflake schema for this dataclass"""
61
+ schema = {
62
+ "feedback_id": "VARCHAR(255)",
63
+ "open_ended_feedback": "VARCHAR(16777216)", # Large text
64
+ "score": "INTEGER",
65
+ "is_feedback_about_last_retrieval": "BOOLEAN",
66
+ "conversation_id": "VARCHAR(255)",
67
+ "timestamp": "NUMBER(20, 0)",
68
+ "message_count": "INTEGER",
69
+ "has_retrievals": "BOOLEAN",
70
+ "retrieval_count": "INTEGER",
71
+ "transcript": "VARCHAR(16777216)", # JSON string of ARRAY of {"role": "user"/"assistant", "content": "..."}
72
+ "retrievals": "VARCHAR(16777216)", # JSON string of ARRAY of retrieval objects
73
+ "feedback_score_related_retrieval_docs": "VARCHAR(16777216)", # JSON string of OBJECT with conversation subset + retrieved docs
74
+ "retrieved_data": "VARCHAR(16777216)", # JSON string - preserved old column for backward compatibility
75
+ "created_at": "TIMESTAMP_NTZ",
76
+ # transcript structure: [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}, ...]
77
+ # retrievals structure: [
78
+ # {
79
+ # "retrieved_docs": [{"content": "...", "metadata": {...}, ...}], # content truncated to 100 chars
80
+ # "user_message_trigger": "final user message that triggered this retrieval"
81
+ # },
82
+ # ...
83
+ # ]
84
+ # feedback_score_related_retrieval_docs structure: {
85
+ # "conversation_up_to_point": [{"role": "user", "content": "..."}, ...], # subset of transcript
86
+ # "retrieved_docs": [{"content": "...", "metadata": {...}, ...}] # full chunks with all info
87
+ # }
88
+ }
89
+ return schema
90
+
91
+ @classmethod
92
+ def get_snowflake_create_table_sql(cls, table_name: str = "USER_FEEDBACK_V3") -> str:
93
+ """Generate CREATE TABLE SQL for Snowflake"""
94
+ schema = cls.to_snowflake_schema(None)  # the method never touches self, so passing None is safe
95
+
96
+ columns = []
97
+ for col_name, col_type in schema.items():
98
+ nullable = "NULL" if col_name not in ["feedback_id", "score", "timestamp"] else "NOT NULL"
99
+ columns.append(f" {col_name} {col_type} {nullable}")
100
+
101
+ # Build SQL string properly
102
+ columns_str = ",\n".join(columns)
103
+
104
+ sql = f"""CREATE TABLE IF NOT EXISTS {table_name} (
105
+ {columns_str},
106
+ PRIMARY KEY (feedback_id)
107
+ )
108
+ CLUSTER BY (timestamp, conversation_id, score);
109
+ -- Note: Snowflake doesn't support traditional indexes on regular tables.
110
+ -- Instead, we use CLUSTER BY to optimize queries on these columns.
111
+ -- Snowflake automatically maintains clustering for efficient querying.
112
+ -- Note: transcript, retrievals, and feedback_score_related_retrieval_docs are stored as VARCHAR (JSON strings),
113
+ -- same approach as the old retrieved_data column. This allows easy storage and retrieval without VARIANT type complexity.
114
+ """
115
+ return sql
116
+
117
+
118
+ # Snowflake variant schema for retrieved_data array
119
+ RETRIEVAL_ENTRY_SCHEMA = {
120
+ "rag_query": "VARCHAR",
121
+ "documents_retrieved": "ARRAY", # Array of document objects
122
+ "conversation_length": "INTEGER",
123
+ "filters_applied": "OBJECT",
124
+ "timestamp": "NUMBER"
125
+ }
126
+
127
+ DOCUMENT_SCHEMA = {
128
+ "doc_id": "VARCHAR",
129
+ "filename": "VARCHAR",
130
+ "page": "INTEGER",
131
+ "score": "DOUBLE",
132
+ "content": "VARCHAR(16777216)",
133
+ "metadata": "OBJECT"
134
+ }
135
+
136
+
137
+ def generate_snowflake_schema_sql(table_name: Optional[str] = None) -> str:
138
+ """Generate complete Snowflake schema SQL for feedback system"""
139
+ if table_name is None:
140
+ table_name = os.getenv("SNOWFLAKE_FEEDBACK_TABLE", "USER_FEEDBACK_V3")
141
+ return UserFeedback.get_snowflake_create_table_sql(table_name)
142
+
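Bootstrapping the table is then a one-liner per environment (the first call uses the module default, the second an illustrative override):

from src.feedback.feedback_schema import generate_snowflake_schema_sql

print(generate_snowflake_schema_sql())                 # defaults to USER_FEEDBACK_V3
print(generate_snowflake_schema_sql("FEEDBACK_DEV"))   # hypothetical dev table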
143
+
144
+ def create_feedback_from_dict(data: Dict[str, Any]) -> UserFeedback:
145
+ """Create UserFeedback instance from dictionary"""
146
+ return UserFeedback(
147
+ feedback_id=data.get("feedback_id", f"feedback_{data.get('timestamp', 'unknown')}"),
148
+ open_ended_feedback=data.get("open_ended_feedback"),
149
+ score=data["score"],
150
+ is_feedback_about_last_retrieval=data["is_feedback_about_last_retrieval"],
151
+ conversation_id=data["conversation_id"],
152
+ timestamp=data["timestamp"],
153
+ message_count=data["message_count"],
154
+ has_retrievals=data["has_retrievals"],
155
+ retrieval_count=data["retrieval_count"],
156
+ transcript=data.get("transcript", []),
157
+ retrievals=data.get("retrievals", []),
158
+ feedback_score_related_retrieval_docs=data.get("feedback_score_related_retrieval_docs"),
159
+ retrieved_data=data.get("retrieved_data")
160
+ )
161
+
src/feedback/snowflake_connector.py ADDED
@@ -0,0 +1,331 @@
1
+ """
2
+ Snowflake Connector for Feedback System
3
+
4
+ This module handles inserting user feedback into Snowflake.
5
+ """
6
+
7
+ import os
8
+ import json
9
+ import logging
10
+ from typing import Dict, Any, Optional
11
+ from .feedback_schema import UserFeedback
12
+
13
+ # Try to import snowflake connector
14
+ try:
15
+ import snowflake.connector
16
+ SNOWFLAKE_AVAILABLE = True
17
+ except ImportError:
18
+ SNOWFLAKE_AVAILABLE = False
19
+ logging.warning("⚠️ snowflake-connector-python not installed. Install with: pip install snowflake-connector-python")
20
+
21
+ # Configure logging
22
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ class SnowflakeFeedbackConnector:
27
+ """Connector for inserting feedback into Snowflake"""
28
+
29
+ def __init__(
30
+ self,
31
+ user: str,
32
+ password: str,
33
+ account: str,
34
+ warehouse: str,
35
+ database: str = "SNOWFLAKE_LEARNING",
36
+ schema: str = "PUBLIC"
37
+ ):
38
+ self.user = user
39
+ self.password = password
40
+ self.account = account
41
+ self.warehouse = warehouse
42
+ self.database = database
43
+ self.schema = schema
44
+ self._connection = None
45
+
46
+ def connect(self):
47
+ """Establish Snowflake connection"""
48
+ if not SNOWFLAKE_AVAILABLE:
49
+ raise ImportError("snowflake-connector-python is not installed. Install with: pip install snowflake-connector-python")
50
+
51
+ logger.info("=" * 80)
52
+ logger.info("πŸ”Œ SNOWFLAKE CONNECTION: Attempting to connect...")
53
+ logger.info(f" - Account: {self.account}")
54
+ logger.info(f" - Warehouse: {self.warehouse}")
55
+ logger.info(f" - Database: {self.database}")
56
+ logger.info(f" - Schema: {self.schema}")
57
+ logger.info(f" - User: {self.user}")
58
+
59
+ try:
60
+ self._connection = snowflake.connector.connect(
61
+ user=self.user,
62
+ password=self.password,
63
+ account=self.account,
64
+ warehouse=self.warehouse
65
+ # Don't set database/schema in connection - we'll do it per query
66
+ )
67
+ logger.info("βœ… SNOWFLAKE CONNECTION: Successfully connected")
68
+ logger.info("=" * 80)
69
+ print(f"βœ… Connected to Snowflake: {self.database}.{self.schema}")
70
+ except Exception as e:
71
+ logger.error(f"❌ SNOWFLAKE CONNECTION FAILED: {e}")
72
+ logger.error("=" * 80)
73
+ print(f"❌ Failed to connect to Snowflake: {e}")
74
+ raise
75
+
76
+ def disconnect(self):
77
+ """Close Snowflake connection"""
78
+ if self._connection:
79
+ self._connection.close()
80
+ print("βœ… Disconnected from Snowflake")
81
+
82
+ def insert_feedback(self, feedback: UserFeedback, table_name: Optional[str] = None) -> bool:
83
+ """Insert a single feedback record into Snowflake"""
84
+ logger.info("=" * 80)
85
+ logger.info("πŸ”„ SNOWFLAKE INSERT: Starting feedback insertion process")
86
+ logger.info(f"πŸ“ Feedback ID: {feedback.feedback_id}")
87
+
88
+ # Get table name from parameter, env var, or default
89
+ if table_name is None:
90
+ table_name = os.getenv("SNOWFLAKE_FEEDBACK_TABLE", "USER_FEEDBACK_V3")
91
+
92
+ if not self._connection:
93
+ logger.error("❌ Not connected to Snowflake. Call connect() first.")
94
+ raise RuntimeError("Not connected to Snowflake. Call connect() first.")
95
+
96
+ try:
97
+ logger.info("πŸ“Š VALIDATION: Validating feedback data structure...")
98
+
99
+ # Validate feedback object
100
+ validation_errors = []
101
+ if not feedback.feedback_id:
102
+ validation_errors.append("Missing feedback_id")
103
+ if feedback.score is None:
104
+ validation_errors.append("Missing score")
105
+ if feedback.timestamp is None:
106
+ validation_errors.append("Missing timestamp")
107
+
108
+ if validation_errors:
109
+ logger.error(f"❌ VALIDATION FAILED: {validation_errors}")
110
+ return False
111
+ else:
112
+ logger.info("βœ… VALIDATION PASSED: All required fields present")
113
+
114
+ logger.info("πŸ“‹ Data Summary:")
115
+ logger.info(f" - Feedback ID: {feedback.feedback_id}")
116
+ logger.info(f" - Score: {feedback.score}")
117
+ logger.info(f" - Conversation ID: {feedback.conversation_id}")
118
+ logger.info(f" - Has Retrievals: {feedback.has_retrievals}")
119
+ logger.info(f" - Retrieval Count: {feedback.retrieval_count}")
120
+ logger.info(f" - Message Count: {feedback.message_count}")
121
+ logger.info(f" - Timestamp: {feedback.timestamp}")
122
+
123
+ cursor = self._connection.cursor()
124
+ logger.info("βœ… SNOWFLAKE CONNECTION: Cursor created")
125
+
126
+ # Set database and schema context
127
+ logger.info(f"πŸ”§ SETTING CONTEXT: Database={self.database}, Schema={self.schema}")
128
+ try:
129
+ cursor.execute(f'USE DATABASE "{self.database}"')
130
+ cursor.execute(f'USE SCHEMA "{self.schema}"')
131
+ cursor.execute("SELECT CURRENT_DATABASE(), CURRENT_SCHEMA()")
132
+ current_db, current_schema = cursor.fetchone()
133
+ logger.info(f"βœ… Current context verified: Database={current_db}, Schema={current_schema}")
134
+ except Exception as e:
135
+ logger.error(f"❌ Could not set context: {e}")
136
+ raise
137
+
138
+ # Prepare data - serialize nested structures to JSON strings for the VARCHAR columns (same approach as the old retrieved_data column)
139
+ logger.info("πŸ”§ DATA PREPARATION: Preparing JSON-string columns...")
140
+ feedback_dict = feedback.to_dict()
141
+
142
+ # Prepare transcript (ARRAY) - convert to JSON string
143
+ transcript_raw = feedback_dict.get('transcript', [])
144
+ if transcript_raw:
145
+ # Convert to JSON string (same approach as old retrieved_data)
146
+ transcript_for_db = json.dumps(transcript_raw)
147
+ logger.info(f" - Transcript: {len(transcript_raw)} messages, JSON length: {len(transcript_for_db)}")
148
+ else:
149
+ transcript_for_db = None
150
+ logger.info(" - Transcript: None")
151
+
152
+ # Prepare retrievals (ARRAY) - convert to JSON string
153
+ retrievals_raw = feedback_dict.get('retrievals', [])
154
+ if retrievals_raw:
155
+ # Convert to JSON string (same approach as old retrieved_data)
156
+ retrievals_for_db = json.dumps(retrievals_raw)
157
+ logger.info(f" - Retrievals: {len(retrievals_raw)} entries, JSON length: {len(retrievals_for_db)}")
158
+ else:
159
+ retrievals_for_db = None
160
+ logger.info(" - Retrievals: None")
161
+
162
+ # Prepare feedback_score_related_retrieval_docs (OBJECT) - convert to JSON string
163
+ feedback_score_related_raw = feedback_dict.get('feedback_score_related_retrieval_docs')
164
+ if feedback_score_related_raw:
165
+ # Convert to JSON string (same approach as old retrieved_data)
166
+ feedback_score_related_for_db = json.dumps(feedback_score_related_raw)
167
+ logger.info(f" - Feedback score related docs: present, JSON length: {len(feedback_score_related_for_db)}")
168
+ else:
169
+ feedback_score_related_for_db = None
170
+ logger.info(" - Feedback score related docs: None")
171
+
172
+ # Prepare retrieved_data (preserved old column) - convert to JSON string
173
+ retrieved_data_raw = feedback_dict.get('retrieved_data')
174
+ if retrieved_data_raw:
175
+ # Convert to JSON string (same approach as old retrieved_data)
176
+ retrieved_data_for_db = json.dumps(retrieved_data_raw)
177
+ logger.info(f" - Retrieved data (preserved): present, JSON length: {len(retrieved_data_for_db)}")
178
+ else:
179
+ retrieved_data_for_db = None
180
+ logger.info(" - Retrieved data (preserved): None")
181
+
182
+ # Build SQL with new column structure
183
+ # Columns are VARCHAR (storing JSON strings), same approach as old retrieved_data
184
+ sql = f"""INSERT INTO {table_name} (
185
+ feedback_id,
186
+ open_ended_feedback,
187
+ score,
188
+ is_feedback_about_last_retrieval,
189
+ conversation_id,
190
+ timestamp,
191
+ message_count,
192
+ has_retrievals,
193
+ retrieval_count,
194
+ transcript,
195
+ retrievals,
196
+ feedback_score_related_retrieval_docs,
197
+ retrieved_data,
198
+ created_at
199
+ ) VALUES (
200
+ %(feedback_id)s, %(open_ended_feedback)s, %(score)s, %(is_feedback_about_last_retrieval)s,
201
+ %(conversation_id)s, %(timestamp)s, %(message_count)s, %(has_retrievals)s,
202
+ %(retrieval_count)s, %(transcript)s, %(retrievals)s, %(feedback_score_related_retrieval_docs)s,
203
+ %(retrieved_data)s, %(created_at)s
204
+ )"""
205
+
206
+ logger.info("πŸ“ SQL PREPARATION: Building INSERT statement...")
207
+ logger.info(f" - Target table: {table_name}")
208
+ logger.info(f" - Database: {self.database}")
209
+ logger.info(f" - Schema: {self.schema}")
210
+
211
+ # Prepare parameters
212
+ # Pass JSON strings for the VARCHAR columns (same approach as the old retrieved_data column)
213
+ params = {
214
+ 'feedback_id': feedback.feedback_id,
215
+ 'open_ended_feedback': feedback.open_ended_feedback,
216
+ 'score': feedback.score,
217
+ 'is_feedback_about_last_retrieval': feedback.is_feedback_about_last_retrieval,
218
+ 'conversation_id': feedback.conversation_id,
219
+ 'timestamp': int(feedback.timestamp),
220
+ 'message_count': feedback.message_count,
221
+ 'has_retrievals': feedback.has_retrievals,
222
+ 'retrieval_count': feedback.retrieval_count,
223
+ 'transcript': transcript_for_db, # JSON string
224
+ 'retrievals': retrievals_for_db, # JSON string
225
+ 'feedback_score_related_retrieval_docs': feedback_score_related_for_db, # JSON string
226
+ 'retrieved_data': retrieved_data_for_db, # JSON string - preserved old column
227
+ 'created_at': feedback.created_at
228
+ }
229
+
230
+ # Execute insert
231
+ logger.info("πŸš€ SQL EXECUTION: Executing INSERT query...")
232
+ cursor.execute(sql, params)
233
+
234
+ logger.info("βœ… SQL EXECUTION: Query executed successfully")
235
+ logger.info(f" - Rows affected: 1")
236
+ logger.info(f" - Status: SUCCESS")
237
+
238
+ cursor.close()
239
+ logger.info("βœ… SNOWFLAKE INSERT: Feedback inserted successfully")
240
+ logger.info(f"πŸ“ Inserted feedback: {feedback.feedback_id}")
241
+ logger.info("=" * 80)
242
+ return True
243
+
244
+ except Exception as e:
245
+ # Check if it's a Snowflake error
246
+ if SNOWFLAKE_AVAILABLE and "ProgrammingError" in str(type(e)):
247
+ logger.error(f"❌ SQL EXECUTION ERROR: {e}")
248
+ logger.error(f" - Error code: {getattr(e, 'errno', 'Unknown')}")
249
+ logger.error(f" - SQL state: {getattr(e, 'sqlstate', 'Unknown')}")
250
+ else:
251
+ logger.error(f"❌ SNOWFLAKE INSERT FAILED: {type(e).__name__}")
252
+ logger.error(f" - Error: {e}")
253
+ logger.error("=" * 80)
254
+ return False
255
+
256
+ def __enter__(self):
257
+ """Context manager entry"""
258
+ self.connect()
259
+ return self
260
+
261
+ def __exit__(self, exc_type, exc_val, exc_tb):
262
+ """Context manager exit"""
263
+ self.disconnect()
264
+
265
+
266
+ def get_snowflake_connector_from_env() -> Optional[SnowflakeFeedbackConnector]:
267
+ """Create Snowflake connector from environment variables"""
268
+ user = os.getenv("SNOWFLAKE_USER")
269
+ password = os.getenv("SNOWFLAKE_PASSWORD")
270
+ account = os.getenv("SNOWFLAKE_ACCOUNT")
271
+ warehouse = os.getenv("SNOWFLAKE_WAREHOUSE")
272
+ database = os.getenv("SNOWFLAKE_DATABASE", "SNOWFLAKE_LEARN")
273
+ schema = os.getenv("SNOWFLAKE_SCHEMA", "PUBLIC")
274
+
275
+ if not all([user, password, account, warehouse]):
276
+ print("⚠️ Snowflake credentials not found in environment variables")
277
+ print("Required variables: SNOWFLAKE_USER, SNOWFLAKE_PASSWORD, SNOWFLAKE_ACCOUNT, SNOWFLAKE_WAREHOUSE")
278
+ return None
279
+
280
+ return SnowflakeFeedbackConnector(
281
+ user=user,
282
+ password=password,
283
+ account=account,
284
+ warehouse=warehouse,
285
+ database=database,
286
+ schema=schema
287
+ )
288
+
289
+
290
+ def save_to_snowflake(feedback: UserFeedback, table_name: Optional[str] = None) -> bool:
291
+ """Helper function to save feedback to Snowflake"""
292
+ logger.info("=" * 80)
293
+ logger.info("πŸ”΅ SNOWFLAKE SAVE: Starting save process")
294
+ logger.info(f"πŸ“ Feedback ID: {feedback.feedback_id}")
295
+
296
+ # Get table name from parameter or env var
297
+ if table_name is None:
298
+ table_name = os.getenv("SNOWFLAKE_FEEDBACK_TABLE", "USER_FEEDBACK_V3")
299
+
300
+ connector = get_snowflake_connector_from_env()
301
+
302
+ if not connector:
303
+ logger.warning("⚠️ SNOWFLAKE SAVE: Skipping insertion (credentials not configured)")
304
+ logger.warning(" Required variables: SNOWFLAKE_USER, SNOWFLAKE_PASSWORD, SNOWFLAKE_ACCOUNT, SNOWFLAKE_WAREHOUSE")
305
+ logger.info("=" * 80)
306
+ return False
307
+
308
+ try:
309
+ logger.info("πŸ“‘ SNOWFLAKE SAVE: Establishing connection...")
310
+ connector.connect()
311
+ logger.info("βœ… SNOWFLAKE SAVE: Connection established")
312
+
313
+ logger.info("πŸ“₯ SNOWFLAKE SAVE: Attempting to insert feedback...")
314
+ success = connector.insert_feedback(feedback, table_name=table_name)
315
+
316
+ logger.info("πŸ”Œ SNOWFLAKE SAVE: Disconnecting...")
317
+ connector.disconnect()
318
+
319
+ if success:
320
+ logger.info("βœ… SNOWFLAKE SAVE: Successfully saved feedback")
321
+ else:
322
+ logger.error("❌ SNOWFLAKE SAVE: Failed to save feedback")
323
+
324
+ logger.info("=" * 80)
325
+ return success
326
+ except Exception as e:
327
+ logger.error(f"❌ SNOWFLAKE SAVE ERROR: {type(e).__name__}")
328
+ logger.error(f" - Error: {e}")
329
+ logger.info("=" * 80)
330
+ return False
331
+
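End to end, a caller needs only the four credential variables plus a UserFeedback instance. A hedged sketch with illustrative field values:

import time
from src.feedback import FeedbackManager, save_to_snowflake

# Assumes SNOWFLAKE_USER / SNOWFLAKE_PASSWORD / SNOWFLAKE_ACCOUNT / SNOWFLAKE_WAREHOUSE are exported
feedback = FeedbackManager.create_feedback_from_dict({
    "score": 4,
    "is_feedback_about_last_retrieval": True,
    "conversation_id": "conv_123",
    "timestamp": time.time(),
    "message_count": 4,
    "has_retrievals": True,
    "retrieval_count": 2,
    "open_ended_feedback": "Citations were accurate.",
})
ok = save_to_snowflake(feedback)  # returns False (and logs why) if credentials are missing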
src/gemini/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ """
2
+ Gemini File Search Integration Module
3
+
4
+ This module provides integration with Google Gemini File Search API
5
+ for RAG functionality using Gemini's built-in file search capabilities.
6
+ """
7
+
8
+ from .file_search import GeminiFileSearchClient, GeminiFileSearchResult
9
+
10
+ __all__ = ["GeminiFileSearchClient", "GeminiFileSearchResult"]
11
+
src/gemini/file_search.py ADDED
@@ -0,0 +1,427 @@
1
+ """
2
+ Gemini File Search Client
3
+
4
+ Handles interaction with Google Gemini File Search API for RAG.
5
+ """
6
+
7
+ import os
8
+ import logging
9
+ from typing import List, Dict, Any, Optional
10
+ from dataclasses import dataclass
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ try:
15
+ from google import genai
16
+ from google.genai import types
17
+ GEMINI_AVAILABLE = True
18
+ except ImportError:
19
+ GEMINI_AVAILABLE = False
20
+
21
+
22
+ @dataclass
23
+ class GeminiFileSearchResult:
24
+ """Result from Gemini File Search query"""
25
+ answer: str
26
+ sources: List[Dict[str, Any]] # List of document references
27
+ grounding_metadata: Optional[Dict[str, Any]] = None
28
+ query: str = ""
29
+
30
+
31
+ class GeminiFileSearchClient:
32
+ """Client for interacting with Gemini File Search API"""
33
+
34
+ def __init__(self, api_key: Optional[str] = None, store_name: Optional[str] = None):
35
+ """
36
+ Initialize Gemini File Search client.
37
+
38
+ Args:
39
+ api_key: Gemini API key (defaults to GEMINI_API_KEY env var)
40
+ store_name: File search store name (defaults to GEMINI_FILESTORE_NAME env var)
41
+ """
42
+ if not GEMINI_AVAILABLE:
43
+ raise ImportError("google-genai package not installed. Install with: pip install google-genai")
44
+
45
+ self.api_key = api_key or os.getenv("GEMINI_API_KEY")
46
+ if not self.api_key:
47
+ raise ValueError("GEMINI_API_KEY not found. Set it in .env file or pass as argument.")
48
+
49
+ store_name_raw = store_name or os.getenv("GEMINI_FILESTORE_NAME")
50
+ if not store_name_raw:
51
+ raise ValueError("GEMINI_FILESTORE_NAME not found. Set it in .env file or pass as argument.")
52
+
53
+ # Normalize store name: API expects the FULL path format (fileSearchStores/xxx)
54
+ # If just the ID is provided, construct the full path
55
+ if store_name_raw.startswith("fileSearchStores/"):
56
+ self.store_name = store_name_raw # Already full path
57
+ else:
58
+ # Just the ID provided, construct full path
59
+ self.store_name = f"fileSearchStores/{store_name_raw}"
60
+
61
+ logger.info(f"πŸ“¦ Using file search store: {self.store_name}")
62
+
63
+ self.client = genai.Client(api_key=self.api_key)
64
+ self.model = "gemini-2.5-flash" # or "gemini-2.5-pro"
65
+
66
+ def search(
67
+ self,
68
+ query: str,
69
+ filters: Optional[Dict[str, Any]] = None,
70
+ model: Optional[str] = None
71
+ ) -> GeminiFileSearchResult:
72
+ """
73
+ Search using Gemini File Search.
74
+
75
+ Args:
76
+ query: User query
77
+ filters: Optional filters (year, source, district, etc.)
78
+ model: Model to use (defaults to gemini-2.5-flash)
79
+
80
+ Returns:
81
+ GeminiFileSearchResult with answer and sources
82
+ """
83
+ model = model or self.model
84
+
85
+ # Build filter context for the query if filters are provided
86
+ # Gemini File Search doesn't support explicit filters in the API,
87
+ # so we add them as context in the query
88
+ filter_context = ""
89
+ if filters:
90
+ filter_parts = []
91
+ if filters.get("year"):
92
+ years = filters["year"] if isinstance(filters["year"], list) else [filters["year"]]
93
+ filter_parts.append(f"Year: {', '.join(years)}")
94
+ if filters.get("sources"):
95
+ sources = filters["sources"] if isinstance(filters["sources"], list) else [filters["sources"]]
96
+ filter_parts.append(f"Source: {', '.join(sources)}")
97
+ if filters.get("district"):
98
+ districts = filters["district"] if isinstance(filters["district"], list) else [filters["district"]]
99
+ filter_parts.append(f"District: {', '.join(districts)}")
100
+ if filters.get("filenames"):
101
+ filenames = filters["filenames"] if isinstance(filters["filenames"], list) else [filters["filenames"]]
102
+ filter_parts.append(f"Filename: {', '.join(filenames)}")
103
+
104
+ if filter_parts:
105
+ filter_context = f"\n\nPlease focus on documents matching these criteria: {', '.join(filter_parts)}"
106
+
107
+ # Combine query with filter context
108
+ # Add comprehensive system instructions similar to multi-agent system
109
+ system_instructions = """You are a helpful audit report assistant specialized in analyzing government audit reports from Uganda's Office of the Auditor General.
110
+
111
+ CRITICAL RULES:
112
+ 1. **NO HALLUCINATION**: Only use information that is explicitly stated in the retrieved documents. Do not make up facts, numbers, or details.
113
+ 2. **Document References**: Always cite which documents you're using with [Doc i] references at the end of sentences that use specific information.
114
+ 3. **Formatting**: Structure your response with clear paragraphs, bullet points, or sections for readability.
115
+ 4. **Accuracy**: If the retrieved documents don't contain the requested information, explicitly state "The retrieved documents do not contain information about [topic]."
116
+ 5. **Years and Data**: Pay careful attention to years mentioned in documents. If a user asks about a specific year but documents show different years, explicitly state this.
117
+ 6. **District/Source Names**: Use the exact district and source names as they appear in the document metadata (e.g., "Kalangala" not "Kalagala").
118
+ 7. **Financial Data**: When providing financial figures, include the currency (UGX) and be precise about amounts.
119
+ 8. **Conversational Tone**: Be helpful, clear, and conversational while maintaining accuracy.
120
+
121
+ IMPORTANT: Only use information from the retrieved documents. Do not use information from your training data unless it's explicitly mentioned in the retrieved documents."""
122
+
123
+ # Combine system instructions with query
124
+ full_query = f"{system_instructions}\n\nUser Question: {query}{filter_context}\n\nPlease provide a detailed, well-formatted response with proper document references."
125
+
126
+ try:
127
+ # Generate content with file search
128
+ # Based on Gemini API docs: https://ai.google.dev/gemini-api/docs/file-search
129
+ # Try with full path format first, then fallback to just ID if needed
130
+ store_name_to_try = self.store_name
131
+
132
+ try:
133
+ # Try the documented format first with full path
134
+ response = self.client.models.generate_content(
135
+ model=model,
136
+ contents=full_query,
137
+ config=types.GenerateContentConfig(
138
+ tools=[
139
+ types.Tool(
140
+ file_search=types.FileSearch(
141
+ file_search_store_names=[store_name_to_try]
142
+ )
143
+ )
144
+ ]
145
+ )
146
+ )
147
+ except Exception as api_error:
148
+ error_str = str(api_error).lower()
149
+ # If format error, try with just the ID (without fileSearchStores/ prefix)
150
+ if 'format' in error_str or 'invalid' in error_str or 'too long' in error_str:
151
+ logger.warning(f"Full path format failed, trying with just store ID: {api_error}")
152
+ # Extract just the ID part
153
+ if store_name_to_try.startswith("fileSearchStores/"):
154
+ store_id = store_name_to_try.split("/", 1)[1]
155
+ store_name_to_try = store_id
156
+
157
+ try:
158
+ response = self.client.models.generate_content(
159
+ model=model,
160
+ contents=full_query,
161
+ config=types.GenerateContentConfig(
162
+ tools=[
163
+ types.Tool(
164
+ file_search=types.FileSearch(
165
+ file_search_store_names=[store_name_to_try]
166
+ )
167
+ )
168
+ ]
169
+ )
170
+ )
171
+ except Exception as e2:
172
+ raise Exception(f"Failed to call Gemini API with both formats. Full path error: {api_error}, ID-only error: {e2}")
173
+ else:
174
+ # Try alternative dict format
175
+ logger.warning(f"Primary API format failed, trying alternative: {api_error}")
176
+ try:
177
+ response = self.client.models.generate_content(
178
+ model=model,
179
+ contents=full_query,
180
+ tools=[{
181
+ "file_search": {
182
+ "file_search_store_names": [store_name_to_try]
183
+ }
184
+ }]
185
+ )
186
+ except Exception as e2:
187
+ raise Exception(f"Failed to call Gemini API: {e2}")
188
+
189
+ # Extract answer
190
+ answer = ""
191
+ if hasattr(response, 'text'):
192
+ answer = response.text
193
+ elif hasattr(response, 'candidates') and response.candidates:
194
+ # Try to get text from first candidate
195
+ candidate = response.candidates[0]
196
+ if hasattr(candidate, 'content') and candidate.content:
197
+ if hasattr(candidate.content, 'parts'):
198
+ text_parts = []
199
+ for part in candidate.content.parts:
200
+ if hasattr(part, 'text'):
201
+ text_parts.append(part.text)
202
+ answer = " ".join(text_parts)
203
+ elif isinstance(candidate.content, str):
204
+ answer = candidate.content
205
+ else:
206
+ answer = str(response)
207
+
208
+ # Extract grounding metadata (document references)
209
+ sources = []
210
+ grounding_metadata = None
211
+
212
+ logger.info(f"πŸ” Extracting sources from Gemini response...")
213
+
214
+ if hasattr(response, 'candidates') and response.candidates:
215
+ candidate = response.candidates[0]
216
+ logger.info(f" Found candidate, checking for grounding_metadata...")
217
+
218
+ # Get grounding metadata
219
+ if hasattr(candidate, 'grounding_metadata'):
220
+ grounding_metadata = candidate.grounding_metadata
221
+ logger.info(f" Found grounding_metadata: {type(grounding_metadata)}")
222
+
223
+ # Extract source documents from grounding metadata
224
+ # Handle different response formats
225
+ grounding_chunks = None
226
+ if hasattr(grounding_metadata, 'grounding_chunks'):
227
+ grounding_chunks = grounding_metadata.grounding_chunks
228
+ logger.info(f" Found grounding_chunks (attr): {len(grounding_chunks) if grounding_chunks else 0}")
229
+ elif isinstance(grounding_metadata, dict) and 'grounding_chunks' in grounding_metadata:
230
+ grounding_chunks = grounding_metadata['grounding_chunks']
231
+ logger.info(f" Found grounding_chunks (dict): {len(grounding_chunks) if grounding_chunks else 0}")
232
+ elif hasattr(grounding_metadata, '__dict__'):
233
+ # Try to access as object attributes
234
+ metadata_dict = grounding_metadata.__dict__
235
+ if 'grounding_chunks' in metadata_dict:
236
+ grounding_chunks = metadata_dict['grounding_chunks']
237
+ logger.info(f" Found grounding_chunks (__dict__): {len(grounding_chunks) if grounding_chunks else 0}")
238
+
239
+ if grounding_chunks:
240
+ logger.info(f" Processing {len(grounding_chunks)} grounding chunks...")
241
+ for idx, chunk in enumerate(grounding_chunks):
242
+ # Handle both object and dict formats
243
+ try:
244
+ if isinstance(chunk, dict):
245
+ chunk_data = chunk
246
+ else:
247
+ # Object format - convert to dict-like access
248
+ chunk_data = {}
249
+ if hasattr(chunk, 'chunk'):
250
+ chunk_obj = chunk.chunk
251
+ chunk_data['chunk'] = {
252
+ 'text': getattr(chunk_obj, 'text', ''),
253
+ 'file_name': getattr(chunk_obj, 'file_name', '')
254
+ }
255
+ if hasattr(chunk, 'relevance_score'):
256
+ score_obj = chunk.relevance_score
257
+ chunk_data['relevance_score'] = {
258
+ 'score': getattr(score_obj, 'score', 0.0)
259
+ }
260
+
261
+ chunk_info = chunk_data.get('chunk', {})
262
+ text = chunk_info.get('text', '') if isinstance(chunk_info, dict) else ''
263
+ file_name = chunk_info.get('file_name', '') if isinstance(chunk_info, dict) else ''
264
+
265
+ # Try to extract file URI and parse metadata from it
266
+ file_uri = chunk_info.get('file_uri', '') if isinstance(chunk_info, dict) else ''
267
+
268
+ # Also check for 'web' attribute (GroundingChunkData structure)
269
+ if hasattr(chunk, 'web') and chunk.web:
270
+ web_data = chunk.web
271
+ file_uri = getattr(web_data, 'file_uri', '') or file_uri
272
+ file_name = getattr(web_data, 'title', '') or getattr(web_data, 'filename', '') or file_name
273
+ text = getattr(web_data, 'text', '') or getattr(web_data, 'content', '') or text
274
+
275
+ # Check retrieved_context - this is where the actual data seems to be!
276
+ if hasattr(chunk, 'retrieved_context') and chunk.retrieved_context:
277
+ rc = chunk.retrieved_context
278
+ # Get text content
279
+ if hasattr(rc, 'text'):
280
+ text = getattr(rc, 'text', '') or text
281
+ # Get document name
282
+ if hasattr(rc, 'document_name'):
283
+ doc_name = getattr(rc, 'document_name', '')
284
+ if doc_name:
285
+ file_name = doc_name or file_name
286
+
287
+ # Fallback: parse the string representation if we still don't have a filename
288
+ if not file_name:
289
+ chunk_str = str(chunk)
290
+ import re
291
+ # Look for PDF filenames
292
+ pdf_match = re.search(r"([A-Za-z0-9\s_-]+\.pdf)", chunk_str)
293
+ if pdf_match:
294
+ file_name = pdf_match.group(1)
295
+ # Or look for title= pattern
296
+ if not file_name and 'title=' in chunk_str:
297
+ title_match = re.search(r"title=['\"]([^'\"]+)['\"]", chunk_str)
298
+ if title_match:
299
+ file_name = title_match.group(1)
300
+
301
+ if not file_name and file_uri:
302
+ # Extract filename from URI if available
303
+ file_name = file_uri.split('/')[-1] if '/' in file_uri else file_uri
304
+
305
+ score_data = chunk_data.get('relevance_score', {})
306
+ score = score_data.get('score', 0.0) if isinstance(score_data, dict) else 0.0
307
+
308
+ if text or file_name: # Only add if we have content
309
+ source_info = {
310
+ "content": text,
311
+ "filename": file_name,
312
+ "score": score,
313
+ "file_uri": file_uri,
314
+ }
315
+ sources.append(source_info)
316
+ logger.info(f"πŸ“„ Extracted source {idx+1}: {file_name} (score: {score:.3f}, content length: {len(text)})")
317
+ except Exception as e:
318
+ logger.warning(f"Error extracting chunk {idx+1} info: {e}")
319
+ import traceback
320
+ logger.debug(traceback.format_exc())
321
+ continue
322
+ else:
323
+ logger.warning(f" No grounding_chunks found in grounding_metadata")
324
+ else:
325
+ logger.warning(f" Candidate does not have grounding_metadata attribute")
326
+
327
+ # Also try to get file references from other parts of the response
328
+ # Sometimes Gemini includes file references in the response itself
329
+ if not sources:
330
+ logger.info(f" No sources from grounding_metadata, trying alternative extraction...")
331
+ # Check if response has file references in other attributes
332
+ if hasattr(candidate, 'content') and candidate.content:
333
+ if hasattr(candidate.content, 'parts'):
334
+ for part in candidate.content.parts:
335
+ if hasattr(part, 'file_data'):
336
+ file_data = part.file_data
337
+ if hasattr(file_data, 'file_uri') or (isinstance(file_data, dict) and 'file_uri' in file_data):
338
+ file_uri = getattr(file_data, 'file_uri', None) or (file_data.get('file_uri') if isinstance(file_data, dict) else None)
339
+ if file_uri:
340
+ file_name = file_uri.split('/')[-1] if '/' in file_uri else file_uri
341
+ sources.append({
342
+ "content": "",
343
+ "filename": file_name,
344
+ "score": 0.0,
345
+ "file_uri": file_uri,
346
+ })
347
+ logger.info(f"πŸ“„ Extracted source from file_data: {file_name}")
348
+
349
+ logger.info(f"βœ… Total sources extracted: {len(sources)}")
350
+
351
+ return GeminiFileSearchResult(
352
+ answer=answer,
353
+ sources=sources,
354
+ grounding_metadata=grounding_metadata,
355
+ query=query
356
+ )
357
+
358
+ except Exception as e:
359
+ # Return error result
360
+ return GeminiFileSearchResult(
361
+ answer=f"I apologize, but I encountered an error: {str(e)}",
362
+ sources=[],
363
+ query=query
364
+ )
365
+
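For orientation, `GeminiFileSearchResult` is not defined in this hunk; a minimal sketch of a compatible container, with fields inferred from the constructor calls above (defaults are assumptions):

```python
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

@dataclass
class GeminiFileSearchResult:
    """Result container assumed to be defined earlier in this module."""
    answer: str                                # generated answer text
    sources: List[Dict[str, Any]]              # extracted source dicts (content/filename/score/file_uri)
    query: str                                 # the original user query
    grounding_metadata: Optional[Any] = None   # raw grounding metadata, when available
```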
366
+ def format_sources_for_display(self, result: GeminiFileSearchResult) -> List[Any]:
367
+ """
368
+ Format Gemini sources to match the format expected by the UI.
369
+
370
+ Returns list of document-like objects compatible with existing display code.
371
+ """
372
+ from langchain.docstore.document import Document
373
+
374
+ formatted_sources = []
375
+
376
+ for i, source in enumerate(result.sources):
377
+ filename = source.get("filename", "Unknown")
378
+
379
+ # Try to extract metadata from filename (e.g., "Kalangala DLG Report of Auditor General 2021.pdf")
380
+ year = None
381
+ district = None
382
+ source_name = "Gemini File Search"
383
+
384
+ # Parse filename for year
385
+ import re
386
+ year_match = re.search(r'\b(20\d{2})\b', filename)
387
+ if year_match:
388
+ year = int(year_match.group(1))
389
+
390
+ # Parse filename for district/source
391
+ if "Kalangala" in filename:
392
+ district = "Kalangala"
393
+ source_name = "Kalangala DLG"
394
+ elif "Gulu" in filename:
395
+ district = "Gulu"
396
+ source_name = "Gulu DLG"
397
+ elif "KCCA" in filename:
398
+ district = "Kampala"
399
+ source_name = "KCCA"
400
+ elif "MAAIF" in filename:
401
+ source_name = "MAAIF"
402
+ elif "MWTS" in filename:
403
+ source_name = "MWTS"
404
+ elif "Consolidated" in filename:
405
+ source_name = "Consolidated"
406
+
407
+ # Create a Document object compatible with existing code
408
+ doc = Document(
409
+ page_content=source.get("content", ""),
410
+ metadata={
411
+ "filename": filename,
412
+ "source": source_name,
413
+ "score": source.get("score"),
414
+ "chunk_index": i,
415
+ "page": None, # Gemini doesn't provide page numbers
416
+ "year": year,
417
+ "district": district,
418
+ "chunk_id": f"gemini_{i}",
419
+ "_id": f"gemini_{i}",
420
+ }
421
+ )
422
+ formatted_sources.append(doc)
423
+ logger.info(f"πŸ“‹ Formatted source {i+1}: {filename} ({year}, {source_name})")
424
+
425
+ logger.info(f"βœ… Formatted {len(formatted_sources)} sources for display")
426
+ return formatted_sources
427
+
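The filename heuristics in `format_sources_for_display` can be exercised standalone; a minimal sketch, assuming filenames follow the "Kalangala DLG Report of Auditor General 2021.pdf" pattern (`parse_report_filename` is a hypothetical helper, not part of the module):

```python
import re

def parse_report_filename(filename: str) -> dict:
    """Mirror of the year/district parsing used in format_sources_for_display."""
    year_match = re.search(r'\b(20\d{2})\b', filename)
    year = int(year_match.group(1)) if year_match else None
    district = None
    if "Kalangala" in filename:
        district = "Kalangala"
    elif "Gulu" in filename:
        district = "Gulu"
    elif "KCCA" in filename:
        district = "Kampala"  # KCCA reports map to the Kampala district
    return {"year": year, "district": district}

print(parse_report_filename("Kalangala DLG Report of Auditor General 2021.pdf"))
# {'year': 2021, 'district': 'Kalangala'}
```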
src/{loader.py β†’ llm/loader.py} RENAMED
File without changes
src/pipeline.py CHANGED
@@ -1,5 +1,7 @@
1
  """Main pipeline orchestrator for the Audit QA system."""
 
2
  import time
 
3
  from pathlib import Path
4
  from dataclasses import dataclass
5
  from typing import Dict, Any, List, Optional
@@ -11,11 +13,21 @@ except ModuleNotFoundError as me:
11
  from langchain.schema import Document
12
 
13
  from .logging import log_error
14
- from .llm.adapters import LLMRegistry
15
- from .loader import chunks_to_documents
16
  from .vectorstore import VectorStoreManager
 
17
  from .retrieval.context import ContextRetriever
18
- from .config.loader import get_embedding_model_for_collection
 
 
 
 
 
 
 
 
 
19
 
20
 
21
 
@@ -41,12 +53,13 @@ class PipelineManager:
41
  """
42
  Initialize the pipeline manager.
43
  """
 
 
44
  self.config = config or {}
 
45
  self.vectorstore_manager = None
46
  self.context_retriever = None # Initialize as None
47
- self.llm_client = None
48
- self.report_service = None
49
- self.chunks = None
50
 
51
  # Initialize components
52
  self._initialize_components()
@@ -118,13 +131,7 @@ class PipelineManager:
118
  try:
119
  # Load config if not provided
120
  if not self.config:
121
- try:
122
- from src.config.loader import load_config
123
- self.config = load_config()
124
- except ImportError:
125
- # Try alternate import path
126
- from src.config.loader import load_config
127
- self.config = load_config()
128
 
129
  # Validate config structure
130
  if not isinstance(self.config, dict):
@@ -159,7 +166,6 @@ class PipelineManager:
159
  print("βœ… VectorStoreManager initialized successfully")
160
  except Exception as vs_error:
161
  print(f"❌ Error initializing VectorStoreManager: {vs_error}")
162
- import traceback
163
  traceback.print_exc()
164
  self.vectorstore_manager = None
165
  raise # Re-raise to be caught by outer try-except
@@ -175,40 +181,35 @@ class PipelineManager:
175
  except Exception as e:
176
  try:
177
  # Try direct instantiation with config
178
- from src.llm.adapters import get_llm_client
179
  self.llm_client = get_llm_client("openai", self.config)
180
  print("βœ… LLM CLIENT: Initialized using direct get_llm_client function with config")
181
  except Exception as e2:
182
  print(f"❌ LLM CLIENT: Registry methods failed - {e2}")
183
  # Try to create a simple LLM client directly
184
  try:
185
- from langchain_openai import ChatOpenAI
186
- import os
187
- api_key = os.getenv("OPENAI_API_KEY") or os.getenv("OPENROUTER_API_KEY")
188
- if api_key:
189
- self.llm_client = ChatOpenAI(
190
- model="gpt-3.5-turbo",
191
- api_key=api_key,
192
- temperature=0.1,
193
- max_tokens=1000
194
- )
195
- print("βœ… LLM CLIENT: Initialized using direct ChatOpenAI")
 
196
  else:
197
- print("❌ LLM CLIENT: No API key available")
198
  except Exception as e3:
199
  print(f"❌ LLM CLIENT: Direct instantiation also failed - {e3}")
200
  self.llm_client = None
201
 
202
  # Load system prompt
203
- from src.llm.templates import DEFAULT_AUDIT_SYSTEM_PROMPT
204
  self.system_prompt = DEFAULT_AUDIT_SYSTEM_PROMPT
205
 
206
  # Initialize report service
207
  try:
208
- try:
209
- from src.reporting.service import ReportService
210
- except ImportError:
211
- from src.reporting.service import ReportService
212
  self.report_service = ReportService()
213
  except Exception as e:
214
  print(f"Warning: Could not initialize report service: {e}")
@@ -216,7 +217,6 @@ class PipelineManager:
216
 
217
  except Exception as e:
218
  print(f"❌ Error initializing components: {e}")
219
- import traceback
220
  traceback.print_exc()
221
  # Don't set vectorstore_manager to None if it was already set
222
  if not hasattr(self, 'vectorstore_manager') or self.vectorstore_manager is None:
@@ -337,7 +337,6 @@ class PipelineManager:
337
  return False
338
  except Exception as init_error:
339
  print(f"❌ Error initializing vector store manager: {init_error}")
340
- import traceback
341
  traceback.print_exc()
342
  return False
343
 
@@ -352,7 +351,6 @@ class PipelineManager:
352
  except Exception as e:
353
  print(f"❌ Error connecting to vector store: {e}")
354
  log_error(e, {"component": "vectorstore_connection"})
355
- import traceback
356
  traceback.print_exc()
357
 
358
  # If it's a dimension mismatch error, try with force_recreate
@@ -541,9 +539,6 @@ Answer:"""
541
  if auto_infer_filters and not any([reports, sources, subtype]):
542
  print(f"πŸ€– AUTO-INFERRING FILTERS: No explicit filters provided, analyzing query...")
543
  try:
544
- # Import get_available_metadata here to avoid circular imports
545
- from src.retrieval.filter import get_available_metadata, infer_filters_from_query
546
-
547
  # Get available metadata
548
  available_metadata = get_available_metadata(self.vectorstore_manager.get_vectorstore())
549
 
 
1
  """Main pipeline orchestrator for the Audit QA system."""
2
+ import os
3
  import time
4
+ import traceback
5
  from pathlib import Path
6
  from dataclasses import dataclass
7
  from typing import Dict, Any, List, Optional
 
13
  from langchain.schema import Document
14
 
15
  from .logging import log_error
16
+
17
+ from .llm.loader import chunks_to_documents
18
  from .vectorstore import VectorStoreManager
19
+ from .reporting.service import ReportService
20
  from .retrieval.context import ContextRetriever
21
+ from .llm.adapters import LLMRegistry, get_llm_client
22
+ from .llm.templates import DEFAULT_AUDIT_SYSTEM_PROMPT
23
+ from .config.loader import load_config, get_embedding_model_for_collection
24
+ from .retrieval.filter import get_available_metadata, infer_filters_from_query
25
+
26
+ try:
27
+ from langchain_openai import ChatOpenAI
28
+ LANGCHAIN_OPENAI_AVAILABLE = True
29
+ except ImportError:
30
+ LANGCHAIN_OPENAI_AVAILABLE = False
31
 
32
 
33
 
 
53
  """
54
  Initialize the pipeline manager.
55
  """
56
+ self.chunks = None
57
+ self.llm_client = None
58
  self.config = config or {}
59
+ self.report_service = None
60
  self.vectorstore_manager = None
61
  self.context_retriever = None # Initialize as None
62
+
 
 
63
 
64
  # Initialize components
65
  self._initialize_components()
 
131
  try:
132
  # Load config if not provided
133
  if not self.config:
134
+ self.config = load_config()
 
 
 
 
 
 
135
 
136
  # Validate config structure
137
  if not isinstance(self.config, dict):
 
166
  print("βœ… VectorStoreManager initialized successfully")
167
  except Exception as vs_error:
168
  print(f"❌ Error initializing VectorStoreManager: {vs_error}")
 
169
  traceback.print_exc()
170
  self.vectorstore_manager = None
171
  raise # Re-raise to be caught by outer try-except
 
181
  except Exception as e:
182
  try:
183
  # Try direct instantiation with config
 
184
  self.llm_client = get_llm_client("openai", self.config)
185
  print("βœ… LLM CLIENT: Initialized using direct get_llm_client function with config")
186
  except Exception as e2:
187
  print(f"❌ LLM CLIENT: Registry methods failed - {e2}")
188
  # Try to create a simple LLM client directly
189
  try:
190
+ if LANGCHAIN_OPENAI_AVAILABLE:
191
+ api_key = os.getenv("OPENAI_API_KEY") or os.getenv("OPENROUTER_API_KEY")
192
+ if api_key:
193
+ self.llm_client = ChatOpenAI(
194
+ model="gpt-3.5-turbo",
195
+ api_key=api_key,
196
+ temperature=0.1,
197
+ max_tokens=1000
198
+ )
199
+ print("βœ… LLM CLIENT: Initialized using direct ChatOpenAI")
200
+ else:
201
+ print("❌ LLM CLIENT: No API key available")
202
  else:
203
+ print("❌ LLM CLIENT: langchain-openai not available")
204
  except Exception as e3:
205
  print(f"❌ LLM CLIENT: Direct instantiation also failed - {e3}")
206
  self.llm_client = None
207
 
208
  # Load system prompt
 
209
  self.system_prompt = DEFAULT_AUDIT_SYSTEM_PROMPT
210
 
211
  # Initialize report service
212
  try:
 
 
 
 
213
  self.report_service = ReportService()
214
  except Exception as e:
215
  print(f"Warning: Could not initialize report service: {e}")
 
217
 
218
  except Exception as e:
219
  print(f"❌ Error initializing components: {e}")
 
220
  traceback.print_exc()
221
  # Don't set vectorstore_manager to None if it was already set
222
  if not hasattr(self, 'vectorstore_manager') or self.vectorstore_manager is None:
 
337
  return False
338
  except Exception as init_error:
339
  print(f"❌ Error initializing vector store manager: {init_error}")
 
340
  traceback.print_exc()
341
  return False
342
 
 
351
  except Exception as e:
352
  print(f"❌ Error connecting to vector store: {e}")
353
  log_error(e, {"component": "vectorstore_connection"})
 
354
  traceback.print_exc()
355
 
356
  # If it's a dimension mismatch error, try with force_recreate
 
539
  if auto_infer_filters and not any([reports, sources, subtype]):
540
  print(f"πŸ€– AUTO-INFERRING FILTERS: No explicit filters provided, analyzing query...")
541
  try:
 
 
 
542
  # Get available metadata
543
  available_metadata = get_available_metadata(self.vectorstore_manager.get_vectorstore())
544
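The refactored pipeline hoists all imports to module level and guards the optional `langchain_openai` dependency behind an availability flag. A minimal standalone sketch of that fallback pattern (`build_fallback_llm` is a hypothetical name):

```python
import os

try:
    from langchain_openai import ChatOpenAI
    LANGCHAIN_OPENAI_AVAILABLE = True
except ImportError:
    LANGCHAIN_OPENAI_AVAILABLE = False

def build_fallback_llm():
    """Return a ChatOpenAI client when the package and an API key are present, else None."""
    if not LANGCHAIN_OPENAI_AVAILABLE:
        return None
    api_key = os.getenv("OPENAI_API_KEY") or os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        return None
    return ChatOpenAI(model="gpt-3.5-turbo", api_key=api_key,
                      temperature=0.1, max_tokens=1000)
```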
 
src/reporting/__init__.py CHANGED
@@ -1,4 +1,8 @@
1
- """Report metadata and utilities."""
 
 
 
 
2
 
3
  from .metadata import get_report_metadata, get_available_sources
4
  from .service import ReportService
 
1
+ """Report metadata and utilities.
2
+
3
+ This module is kept for backward compatibility with pipeline.py.
4
+ For feedback-related functionality, use src.feedback instead.
5
+ """
6
 
7
  from .metadata import get_report_metadata, get_available_sources
8
  from .service import ReportService
src/reporting/feedback_schema.py CHANGED
@@ -4,10 +4,12 @@ Feedback Schema for RAG Chatbot
4
  This module defines dataclasses for feedback data structures
5
  and provides Snowflake schema generation.
6
  """
7
-
 
8
  from dataclasses import dataclass, asdict, field
9
  from typing import List, Optional, Dict, Any, Union
10
- from datetime import datetime
 
11
 
12
 
13
  @dataclass
@@ -39,34 +41,20 @@ class UserFeedback:
39
  open_ended_feedback: Optional[str]
40
  score: int
41
  is_feedback_about_last_retrieval: bool
42
- retrieved_data: List[RetrievalEntry]
43
  conversation_id: str
44
  timestamp: float
45
  message_count: int
46
  has_retrievals: bool
47
  retrieval_count: int
48
- user_query: Optional[str] = None
49
- bot_response: Optional[str] = None
 
 
50
  created_at: str = field(default_factory=lambda: datetime.now().isoformat())
51
 
52
  def to_dict(self) -> Dict[str, Any]:
53
  """Convert to dictionary with nested data structures"""
54
  result = asdict(self)
55
- # Handle nested objects
56
- if self.retrieved_data:
57
- result['retrieved_data'] = [self._serialize_retrieval_entry(entry) for entry in self.retrieved_data]
58
- return result
59
-
60
- def _serialize_retrieval_entry(self, entry: RetrievalEntry) -> Dict[str, Any]:
61
- """Serialize retrieval entry to dict"""
62
- # If raw data exists, use it (it's already properly formatted)
63
- if hasattr(entry, '_raw_data') and entry._raw_data:
64
- return entry._raw_data
65
-
66
- # Otherwise, serialize the dataclass
67
- result = asdict(entry)
68
- if entry.documents_retrieved:
69
- result['documents_retrieved'] = [asdict(doc) for doc in entry.documents_retrieved]
70
  return result
71
 
72
  def to_snowflake_schema(self) -> Dict[str, Any]:
@@ -81,28 +69,28 @@ class UserFeedback:
81
  "message_count": "INTEGER",
82
  "has_retrievals": "BOOLEAN",
83
  "retrieval_count": "INTEGER",
84
- "user_query": "VARCHAR(16777216)",
85
- "bot_response": "VARCHAR(16777216)",
 
 
86
  "created_at": "TIMESTAMP_NTZ",
87
- "retrieved_data": "VARIANT", # Array of retrieval entries
88
- # retrieved_data structure:
89
- # [
90
  # {
91
- # "rag_query": "...",
92
- # "conversation_length": 5,
93
- # "timestamp": 1234567890,
94
- # "docs_retrieved": [
95
- # {"filename": "...", "page": 14, "score": 0.95, ...},
96
- # ...
97
- # ]
98
  # },
99
  # ...
100
  # ]
 
 
 
 
101
  }
102
  return schema
103
 
104
  @classmethod
105
- def get_snowflake_create_table_sql(cls, table_name: str = "user_feedback") -> str:
106
  """Generate CREATE TABLE SQL for Snowflake"""
107
  schema = cls.to_snowflake_schema(None)
108
 
@@ -117,16 +105,13 @@ class UserFeedback:
117
  sql = f"""CREATE TABLE IF NOT EXISTS {table_name} (
118
  {columns_str},
119
  PRIMARY KEY (feedback_id)
120
- );
121
-
122
- -- Create index on timestamp for querying by time
123
- CREATE INDEX IF NOT EXISTS idx_feedback_timestamp ON {table_name} (timestamp);
124
-
125
- -- Create index on conversation_id for querying by conversation
126
- CREATE INDEX IF NOT EXISTS idx_feedback_conversation ON {table_name} (conversation_id);
127
-
128
- -- Create index on score for feedback analysis
129
- CREATE INDEX IF NOT EXISTS idx_feedback_score ON {table_name} (score);
130
  """
131
  return sql
132
 
@@ -150,47 +135,27 @@ DOCUMENT_SCHEMA = {
150
  }
151
 
152
 
153
- def generate_snowflake_schema_sql() -> str:
154
  """Generate complete Snowflake schema SQL for feedback system"""
155
- return UserFeedback.get_snowflake_create_table_sql("user_feedback")
 
 
156
 
157
 
158
  def create_feedback_from_dict(data: Dict[str, Any]) -> UserFeedback:
159
  """Create UserFeedback instance from dictionary"""
160
- # Parse retrieved_data if present
161
- retrieved_data = []
162
- if "retrieved_data" in data and data["retrieved_data"]:
163
- for entry_dict in data.get("retrieved_data", []):
164
- # Map the actual structure from rag_retrieval_history
165
- # Entry has: conversation_up_to, rag_query_expansion, docs_retrieved
166
- try:
167
- # Try to map to expected structure
168
- entry = RetrievalEntry(
169
- rag_query=entry_dict.get("rag_query_expansion", ""),
170
- documents_retrieved=[], # Empty for now, will store as raw data
171
- conversation_length=len(entry_dict.get("conversation_up_to", [])),
172
- filters_applied=None,
173
- timestamp=entry_dict.get("timestamp", None)
174
- )
175
- # Store raw data in the entry
176
- entry._raw_data = entry_dict # Store original for preservation
177
- retrieved_data.append(entry)
178
- except Exception as e:
179
- # If mapping fails, store as-is without strict typing
180
- pass
181
-
182
  return UserFeedback(
183
  feedback_id=data.get("feedback_id", f"feedback_{data.get('timestamp', 'unknown')}"),
184
  open_ended_feedback=data.get("open_ended_feedback"),
185
  score=data["score"],
186
  is_feedback_about_last_retrieval=data["is_feedback_about_last_retrieval"],
187
- retrieved_data=retrieved_data,
188
  conversation_id=data["conversation_id"],
189
  timestamp=data["timestamp"],
190
  message_count=data["message_count"],
191
  has_retrievals=data["has_retrievals"],
192
  retrieval_count=data["retrieval_count"],
193
- user_query=data.get("user_query"),
194
- bot_response=data.get("bot_response")
 
 
195
  )
196
-
 
4
  This module defines dataclasses for feedback data structures
5
  and provides Snowflake schema generation.
6
  """
7
+ import os
8
+ from datetime import datetime
9
  from dataclasses import dataclass, asdict, field
10
  from typing import List, Optional, Dict, Any, Union
11
+
12
+
13
 
14
 
15
  @dataclass
 
41
  open_ended_feedback: Optional[str]
42
  score: int
43
  is_feedback_about_last_retrieval: bool
 
44
  conversation_id: str
45
  timestamp: float
46
  message_count: int
47
  has_retrievals: bool
48
  retrieval_count: int
49
+ transcript: List[Dict[str, str]] # List of {"role": "user"/"assistant", "content": "..."}
50
+ retrievals: List[Dict[str, Any]] # List of retrieval objects with retrieved_docs and user_message_trigger
51
+ feedback_score_related_retrieval_docs: Optional[Dict[str, Any]] = None # Conversation subset + retrieved docs
52
+ retrieved_data: Optional[List[Dict[str, Any]]] = None # Preserved old column for backward compatibility
53
  created_at: str = field(default_factory=lambda: datetime.now().isoformat())
54
 
55
  def to_dict(self) -> Dict[str, Any]:
56
  """Convert to dictionary with nested data structures"""
57
  result = asdict(self)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  return result
59
 
60
  def to_snowflake_schema(self) -> Dict[str, Any]:
 
69
  "message_count": "INTEGER",
70
  "has_retrievals": "BOOLEAN",
71
  "retrieval_count": "INTEGER",
72
+ "transcript": "VARCHAR(16777216)", # JSON string of ARRAY of {"role": "user"/"assistant", "content": "..."}
73
+ "retrievals": "VARCHAR(16777216)", # JSON string of ARRAY of retrieval objects
74
+ "feedback_score_related_retrieval_docs": "VARCHAR(16777216)", # JSON string of OBJECT with conversation subset + retrieved docs
75
+ "retrieved_data": "VARCHAR(16777216)", # JSON string - preserved old column for backward compatibility
76
  "created_at": "TIMESTAMP_NTZ",
77
+ # transcript structure: [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}, ...]
78
+ # retrievals structure: [
 
79
  # {
80
+ # "retrieved_docs": [{"content": "...", "metadata": {...}, ...}], # content truncated to 100 chars
81
+ # "user_message_trigger": "final user message that triggered this retrieval"
 
 
 
 
 
82
  # },
83
  # ...
84
  # ]
85
+ # feedback_score_related_retrieval_docs structure: {
86
+ # "conversation_up_to_point": [{"role": "user", "content": "..."}, ...], # subset of transcript
87
+ # "retrieved_docs": [{"content": "...", "metadata": {...}, ...}] # full chunks with all info
88
+ # }
89
  }
90
  return schema
91
 
92
  @classmethod
93
+ def get_snowflake_create_table_sql(cls, table_name: str = "USER_FEEDBACK_V3") -> str:
94
  """Generate CREATE TABLE SQL for Snowflake"""
95
  schema = cls.to_snowflake_schema(None)
96
 
 
105
  sql = f"""CREATE TABLE IF NOT EXISTS {table_name} (
106
  {columns_str},
107
  PRIMARY KEY (feedback_id)
108
+ )
109
+ CLUSTER BY (timestamp, conversation_id, score);
110
+ -- Note: Snowflake doesn't support traditional indexes on regular tables.
111
+ -- Instead, we use CLUSTER BY to optimize queries on these columns.
112
+ -- Snowflake automatically maintains clustering for efficient querying.
113
+ -- Note: transcript, retrievals, and feedback_score_related_retrieval_docs are stored as VARCHAR (JSON strings),
114
+ -- same approach as the old retrieved_data column. This allows easy storage and retrieval without VARIANT type complexity.
 
 
 
115
  """
116
  return sql
117
 
 
135
  }
136
 
137
 
138
+ def generate_snowflake_schema_sql(table_name: Optional[str] = None) -> str:
139
  """Generate complete Snowflake schema SQL for feedback system"""
140
+ if table_name is None:
141
+ table_name = os.getenv("SNOWFLAKE_FEEDBACK_TABLE", "USER_FEEDBACK_V3")
142
+ return UserFeedback.get_snowflake_create_table_sql(table_name)
143
 
144
 
145
  def create_feedback_from_dict(data: Dict[str, Any]) -> UserFeedback:
146
  """Create UserFeedback instance from dictionary"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  return UserFeedback(
148
  feedback_id=data.get("feedback_id", f"feedback_{data.get('timestamp', 'unknown')}"),
149
  open_ended_feedback=data.get("open_ended_feedback"),
150
  score=data["score"],
151
  is_feedback_about_last_retrieval=data["is_feedback_about_last_retrieval"],
 
152
  conversation_id=data["conversation_id"],
153
  timestamp=data["timestamp"],
154
  message_count=data["message_count"],
155
  has_retrievals=data["has_retrievals"],
156
  retrieval_count=data["retrieval_count"],
157
+ transcript=data.get("transcript", []),
158
+ retrievals=data.get("retrievals", []),
159
+ feedback_score_related_retrieval_docs=data.get("feedback_score_related_retrieval_docs"),
160
+ retrieved_data=data.get("retrieved_data")
161
  )
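A usage sketch for the schema helper above; the table name falls back to the `SNOWFLAKE_FEEDBACK_TABLE` env var, then to `USER_FEEDBACK_V3`:

```python
import os
from src.reporting.feedback_schema import generate_snowflake_schema_sql

# Explicit table name wins over the env var
print(generate_snowflake_schema_sql("USER_FEEDBACK_STAGING"))

os.environ["SNOWFLAKE_FEEDBACK_TABLE"] = "USER_FEEDBACK_V3"
print(generate_snowflake_schema_sql())  # emits CREATE TABLE ... CLUSTER BY (timestamp, conversation_id, score)
```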
 
src/reporting/snowflake_connector.py CHANGED
@@ -8,8 +8,11 @@ import os
8
  import json
9
  import logging
10
  from typing import Dict, Any, Optional
 
 
11
  from src.reporting.feedback_schema import UserFeedback
12
 
 
13
  # Try to import snowflake connector
14
  try:
15
  import snowflake.connector
@@ -79,12 +82,16 @@ class SnowflakeFeedbackConnector:
79
  self._connection.close()
80
  print("βœ… Disconnected from Snowflake")
81
 
82
- def insert_feedback(self, feedback: UserFeedback) -> bool:
83
  """Insert a single feedback record into Snowflake"""
84
  logger.info("=" * 80)
85
  logger.info("πŸ”„ SNOWFLAKE INSERT: Starting feedback insertion process")
86
  logger.info(f"πŸ“ Feedback ID: {feedback.feedback_id}")
87
 
 
 
 
 
88
  if not self._connection:
89
  logger.error("❌ Not connected to Snowflake. Call connect() first.")
90
  raise RuntimeError("Not connected to Snowflake. Call connect() first.")
@@ -131,38 +138,53 @@ class SnowflakeFeedbackConnector:
131
  logger.error(f"❌ Could not set context: {e}")
132
  raise
133
 
134
- # Prepare data
135
- logger.info("πŸ”§ DATA PREPARATION: Preparing retrieved_data...")
136
- retrieved_data_raw = feedback.to_dict()['retrieved_data']
137
 
138
- logger.info(f" - Retrieved data type (raw): {type(retrieved_data_raw).__name__}")
139
- logger.info(f" - Retrieved data: {repr(retrieved_data_raw)[:200]}")
 
 
 
 
 
 
 
140
 
141
- # If retrieved_data is already a string (from UI), parse it
142
- if isinstance(retrieved_data_raw, str):
143
- logger.info(" - Parsing string to Python object")
144
- retrieved_data = json.loads(retrieved_data_raw)
145
- elif retrieved_data_raw is None:
146
- retrieved_data = None
147
  else:
148
- # It's already a Python object (list/dict)
149
- logger.info(" - Data is already a Python object")
150
- retrieved_data = retrieved_data_raw
151
 
152
- logger.info(f" - Retrieved data size: {len(str(retrieved_data)) if retrieved_data else 0} characters")
153
- logger.info(f" - Retrieved data type: {type(retrieved_data).__name__}")
 
 
 
 
 
 
 
154
 
155
- # Convert to JSON string for TEXT column
156
- if retrieved_data:
157
- retrieved_data_for_db = json.dumps(retrieved_data)
158
- logger.info(f" - Converting to JSON string for TEXT column")
159
- logger.info(f" - JSON string length: {len(retrieved_data_for_db)}")
 
160
  else:
161
- logger.info(f" - Retrieved data is None, using NULL")
162
  retrieved_data_for_db = None
 
163
 
164
- # Build SQL with retrieved_data as a TEXT column parameter
165
- sql = f"""INSERT INTO user_feedback (
 
166
  feedback_id,
167
  open_ended_feedback,
168
  score,
@@ -172,23 +194,25 @@ class SnowflakeFeedbackConnector:
172
  message_count,
173
  has_retrievals,
174
  retrieval_count,
175
- user_query,
176
- bot_response,
177
- created_at,
178
- retrieved_data
 
179
  ) VALUES (
180
  %(feedback_id)s, %(open_ended_feedback)s, %(score)s, %(is_feedback_about_last_retrieval)s,
181
  %(conversation_id)s, %(timestamp)s, %(message_count)s, %(has_retrievals)s,
182
- %(retrieval_count)s, %(user_query)s, %(bot_response)s, %(created_at)s,
183
- %(retrieved_data)s
184
  )"""
185
 
186
  logger.info("πŸ“ SQL PREPARATION: Building INSERT statement...")
187
- logger.info(f" - Target table: user_feedback")
188
  logger.info(f" - Database: {self.database}")
189
  logger.info(f" - Schema: {self.schema}")
190
 
191
  # Prepare parameters
 
192
  params = {
193
  'feedback_id': feedback.feedback_id,
194
  'open_ended_feedback': feedback.open_ended_feedback,
@@ -199,10 +223,11 @@ class SnowflakeFeedbackConnector:
199
  'message_count': feedback.message_count,
200
  'has_retrievals': feedback.has_retrievals,
201
  'retrieval_count': feedback.retrieval_count,
202
- 'user_query': feedback.user_query,
203
- 'bot_response': feedback.bot_response,
204
- 'created_at': feedback.created_at,
205
- 'retrieved_data': retrieved_data_for_db
 
206
  }
207
 
208
  # Execute insert
@@ -265,12 +290,16 @@ def get_snowflake_connector_from_env() -> Optional[SnowflakeFeedbackConnector]:
265
  )
266
 
267
 
268
- def save_to_snowflake(feedback: UserFeedback) -> bool:
269
  """Helper function to save feedback to Snowflake"""
270
  logger.info("=" * 80)
271
  logger.info("πŸ”΅ SNOWFLAKE SAVE: Starting save process")
272
  logger.info(f"πŸ“ Feedback ID: {feedback.feedback_id}")
273
 
 
 
 
 
274
  connector = get_snowflake_connector_from_env()
275
 
276
  if not connector:
@@ -285,7 +314,7 @@ def save_to_snowflake(feedback: UserFeedback) -> bool:
285
  logger.info("βœ… SNOWFLAKE SAVE: Connection established")
286
 
287
  logger.info("πŸ“₯ SNOWFLAKE SAVE: Attempting to insert feedback...")
288
- success = connector.insert_feedback(feedback)
289
 
290
  logger.info("πŸ”Œ SNOWFLAKE SAVE: Disconnecting...")
291
  connector.disconnect()
@@ -302,4 +331,3 @@ def save_to_snowflake(feedback: UserFeedback) -> bool:
302
  logger.error(f" - Error: {e}")
303
  logger.info("=" * 80)
304
  return False
305
-
 
8
  import json
9
  import logging
10
  from typing import Dict, Any, Optional
11
+
12
+
13
  from src.reporting.feedback_schema import UserFeedback
14
 
15
+
16
  # Try to import snowflake connector
17
  try:
18
  import snowflake.connector
 
82
  self._connection.close()
83
  print("βœ… Disconnected from Snowflake")
84
 
85
+ def insert_feedback(self, feedback: UserFeedback, table_name: Optional[str] = None) -> bool:
86
  """Insert a single feedback record into Snowflake"""
87
  logger.info("=" * 80)
88
  logger.info("πŸ”„ SNOWFLAKE INSERT: Starting feedback insertion process")
89
  logger.info(f"πŸ“ Feedback ID: {feedback.feedback_id}")
90
 
91
+ # Get table name from parameter, env var, or default
92
+ if table_name is None:
93
+ table_name = os.getenv("SNOWFLAKE_FEEDBACK_TABLE", "USER_FEEDBACK_V3")
94
+
95
  if not self._connection:
96
  logger.error("❌ Not connected to Snowflake. Call connect() first.")
97
  raise RuntimeError("Not connected to Snowflake. Call connect() first.")
 
138
  logger.error(f"❌ Could not set context: {e}")
139
  raise
140
 
141
+ # Prepare data - convert nested fields to JSON strings for the VARCHAR columns (same approach as the old retrieved_data)
142
+ logger.info("πŸ”§ DATA PREPARATION: Preparing VARIANT columns...")
143
+ feedback_dict = feedback.to_dict()
144
 
145
+ # Prepare transcript (ARRAY) - convert to JSON string
146
+ transcript_raw = feedback_dict.get('transcript', [])
147
+ if transcript_raw:
148
+ # Convert to JSON string (same approach as old retrieved_data)
149
+ transcript_for_db = json.dumps(transcript_raw)
150
+ logger.info(f" - Transcript: {len(transcript_raw)} messages, JSON length: {len(transcript_for_db)}")
151
+ else:
152
+ transcript_for_db = None
153
+ logger.info(" - Transcript: None")
154
 
155
+ # Prepare retrievals (ARRAY) - convert to JSON string
156
+ retrievals_raw = feedback_dict.get('retrievals', [])
157
+ if retrievals_raw:
158
+ # Convert to JSON string (same approach as old retrieved_data)
159
+ retrievals_for_db = json.dumps(retrievals_raw)
160
+ logger.info(f" - Retrievals: {len(retrievals_raw)} entries, JSON length: {len(retrievals_for_db)}")
161
  else:
162
+ retrievals_for_db = None
163
+ logger.info(" - Retrievals: None")
 
164
 
165
+ # Prepare feedback_score_related_retrieval_docs (OBJECT) - convert to JSON string
166
+ feedback_score_related_raw = feedback_dict.get('feedback_score_related_retrieval_docs')
167
+ if feedback_score_related_raw:
168
+ # Convert to JSON string (same approach as old retrieved_data)
169
+ feedback_score_related_for_db = json.dumps(feedback_score_related_raw)
170
+ logger.info(f" - Feedback score related docs: present, JSON length: {len(feedback_score_related_for_db)}")
171
+ else:
172
+ feedback_score_related_for_db = None
173
+ logger.info(" - Feedback score related docs: None")
174
 
175
+ # Prepare retrieved_data (preserved old column) - convert to JSON string
176
+ retrieved_data_raw = feedback_dict.get('retrieved_data')
177
+ if retrieved_data_raw:
178
+ # Convert to JSON string (same approach as old retrieved_data)
179
+ retrieved_data_for_db = json.dumps(retrieved_data_raw)
180
+ logger.info(f" - Retrieved data (preserved): present, JSON length: {len(retrieved_data_for_db)}")
181
  else:
 
182
  retrieved_data_for_db = None
183
+ logger.info(" - Retrieved data (preserved): None")
184
 
185
+ # Build SQL with new column structure
186
+ # Columns are VARCHAR (storing JSON strings), same approach as old retrieved_data
187
+ sql = f"""INSERT INTO {table_name} (
188
  feedback_id,
189
  open_ended_feedback,
190
  score,
 
194
  message_count,
195
  has_retrievals,
196
  retrieval_count,
197
+ transcript,
198
+ retrievals,
199
+ feedback_score_related_retrieval_docs,
200
+ retrieved_data,
201
+ created_at
202
  ) VALUES (
203
  %(feedback_id)s, %(open_ended_feedback)s, %(score)s, %(is_feedback_about_last_retrieval)s,
204
  %(conversation_id)s, %(timestamp)s, %(message_count)s, %(has_retrievals)s,
205
+ %(retrieval_count)s, %(transcript)s, %(retrievals)s, %(feedback_score_related_retrieval_docs)s,
206
+ %(retrieved_data)s, %(created_at)s
207
  )"""
208
 
209
  logger.info("πŸ“ SQL PREPARATION: Building INSERT statement...")
210
+ logger.info(f" - Target table: {table_name}")
211
  logger.info(f" - Database: {self.database}")
212
  logger.info(f" - Schema: {self.schema}")
213
 
214
  # Prepare parameters
215
+ # Pass JSON strings for VARIANT columns (same approach as old retrieved_data)
216
  params = {
217
  'feedback_id': feedback.feedback_id,
218
  'open_ended_feedback': feedback.open_ended_feedback,
 
223
  'message_count': feedback.message_count,
224
  'has_retrievals': feedback.has_retrievals,
225
  'retrieval_count': feedback.retrieval_count,
226
+ 'transcript': transcript_for_db, # JSON string
227
+ 'retrievals': retrievals_for_db, # JSON string
228
+ 'feedback_score_related_retrieval_docs': feedback_score_related_for_db, # JSON string
229
+ 'retrieved_data': retrieved_data_for_db, # JSON string - preserved old column
230
+ 'created_at': feedback.created_at
231
  }
232
 
233
  # Execute insert
 
290
  )
291
 
292
 
293
+ def save_to_snowflake(feedback: UserFeedback, table_name: Optional[str] = None) -> bool:
294
  """Helper function to save feedback to Snowflake"""
295
  logger.info("=" * 80)
296
  logger.info("πŸ”΅ SNOWFLAKE SAVE: Starting save process")
297
  logger.info(f"πŸ“ Feedback ID: {feedback.feedback_id}")
298
 
299
+ # Get table name from parameter or env var
300
+ if table_name is None:
301
+ table_name = os.getenv("SNOWFLAKE_FEEDBACK_TABLE", "USER_FEEDBACK_V3")
302
+
303
  connector = get_snowflake_connector_from_env()
304
 
305
  if not connector:
 
314
  logger.info("βœ… SNOWFLAKE SAVE: Connection established")
315
 
316
  logger.info("πŸ“₯ SNOWFLAKE SAVE: Attempting to insert feedback...")
317
+ success = connector.insert_feedback(feedback, table_name=table_name)
318
 
319
  logger.info("πŸ”Œ SNOWFLAKE SAVE: Disconnecting...")
320
  connector.disconnect()
 
331
  logger.error(f" - Error: {e}")
332
  logger.info("=" * 80)
333
  return False
 
src/streamlit_app.py DELETED
@@ -1,40 +0,0 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
- import streamlit as st
5
-
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/ui_components/__init__.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ UI Components Module
3
+
4
+ This module contains UI-related components including styles, visualizations,
5
+ and utility functions for the Streamlit application.
6
+ """
7
+
8
+ from .styles import get_custom_css
9
+ from .components import (
10
+ display_chunk_statistics_charts,
11
+ display_chunk_statistics_table
12
+ )
13
+ from .utils import extract_chunk_statistics
14
+
15
+ __all__ = [
16
+ "get_custom_css",
17
+ "display_chunk_statistics_charts",
18
+ "display_chunk_statistics_table",
19
+ "extract_chunk_statistics"
20
+ ]
21
+
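A sketch of how these exports are expected to compose inside the Streamlit app; the 10-chunk threshold is an assumption taken from the display functions' docstrings:

```python
from src.ui_components import (
    extract_chunk_statistics,
    display_chunk_statistics_charts,
    display_chunk_statistics_table,
)

def display_retrieval_stats(sources) -> None:
    """Pick the chart view for larger result sets, the table view otherwise."""
    stats = extract_chunk_statistics(sources)
    if stats.get("total_chunks", 0) >= 10:  # threshold assumed from the docstrings
        display_chunk_statistics_charts(stats)
    else:
        display_chunk_statistics_table(stats)
```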
src/ui_components/components.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ UI components for displaying statistics and visualizations
3
+ """
4
+
5
+ import streamlit as st
6
+ import pandas as pd
7
+ import plotly.express as px
8
+ from typing import Dict, Any
9
+
10
+
11
+ def display_chunk_statistics_charts(stats: Dict[str, Any], title: str = "Retrieval Statistics"):
12
+ """Display statistics as interactive charts for 10+ results."""
13
+ if not stats or stats.get('total_chunks', 0) == 0:
14
+ return
15
+
16
+ # Wrap everything in one styled container - open it
17
+ st.markdown(f"""
18
+ <div class="retrieval-distribution-container">
19
+ <h3 style="margin-top: 0;">πŸ“Š {title}</h3>
20
+ <div style="display: flex; justify-content: space-around; align-items: center; padding: 15px 0; border-bottom: 1px solid #e0e0e0; margin-bottom: 20px;">
21
+ <div class="metric-container">
22
+ <div class="metric-label">Total Chunks</div>
23
+ <div class="metric-value">{stats['total_chunks']}</div>
24
+ </div>
25
+ <div class="metric-container">
26
+ <div class="metric-label">Unique Sources</div>
27
+ <div class="metric-value">{stats['unique_sources']}</div>
28
+ </div>
29
+ <div class="metric-container">
30
+ <div class="metric-label">Unique Years</div>
31
+ <div class="metric-value">{stats['unique_years']}</div>
32
+ </div>
33
+ <div class="metric-container">
34
+ <div class="metric-label">Unique Files</div>
35
+ <div class="metric-value">{stats['unique_filenames']}</div>
36
+ </div>
37
+ </div>
38
+ """, unsafe_allow_html=True)
39
+
40
+ # Charts - three columns to include Districts
41
+ col1, col2, col3 = st.columns(3)
42
+
43
+ with col1:
44
+ # Source distribution chart
45
+ if stats['source_distribution']:
46
+ source_df = pd.DataFrame(
47
+ list(stats['source_distribution'].items()),
48
+ columns=['Source', 'Count']
49
+ )
50
+ fig_source = px.bar(
51
+ source_df,
52
+ x='Count',
53
+ y='Source',
54
+ orientation='h',
55
+ title='Distribution by Source',
56
+ color='Count',
57
+ color_continuous_scale='viridis'
58
+ )
59
+ fig_source.update_layout(height=400, showlegend=False)
60
+ st.plotly_chart(fig_source, use_container_width=True) # Note: plotly_chart still uses use_container_width
61
+
62
+ with col2:
63
+ # Year distribution chart
64
+ if stats['year_distribution']:
65
+ # Filter out 'Unknown' years for the chart
66
+ year_dist_filtered = {k: v for k, v in stats['year_distribution'].items() if k != 'Unknown'}
67
+ if year_dist_filtered:
68
+ year_df = pd.DataFrame(
69
+ list(year_dist_filtered.items()),
70
+ columns=['Year', 'Count']
71
+ )
72
+ # Sort by year as integer but keep as string for categorical display
73
+ year_df['Year_Int'] = year_df['Year'].astype(int)
74
+ year_df = year_df.sort_values('Year_Int').drop('Year_Int', axis=1)
75
+
76
+ fig_year = px.bar(
77
+ year_df,
78
+ x='Year',
79
+ y='Count',
80
+ title='Distribution by Year',
81
+ color='Count',
82
+ color_continuous_scale='plasma'
83
+ )
84
+ # Ensure years are treated as categorical (discrete) not continuous
85
+ fig_year.update_xaxes(type='category')
86
+ fig_year.update_layout(height=400, showlegend=False)
87
+ st.plotly_chart(fig_year, use_container_width=True) # Note: plotly_chart still uses use_container_width
88
+ else:
89
+ st.info("No valid years found in the results")
90
+
91
+ with col3:
92
+ # District distribution chart
93
+ if stats.get('district_distribution'):
94
+ district_dist_filtered = {k: v for k, v in stats['district_distribution'].items() if k != 'Unknown'}
95
+ if district_dist_filtered:
96
+ district_df = pd.DataFrame(
97
+ list(district_dist_filtered.items()),
98
+ columns=['District', 'Count']
99
+ )
100
+ district_df = district_df.sort_values('Count', ascending=False)
101
+
102
+ fig_district = px.bar(
103
+ district_df,
104
+ x='Count',
105
+ y='District',
106
+ orientation='h',
107
+ title='Distribution by District',
108
+ color='Count',
109
+ color_continuous_scale='blues'
110
+ )
111
+ fig_district.update_layout(height=400, showlegend=False)
112
+ st.plotly_chart(fig_district, use_container_width=True) # Note: plotly_chart still uses use_container_width
113
+ else:
114
+ st.info("No valid districts found in the results")
115
+
116
+ # Close the container
117
+ st.markdown('</div>', unsafe_allow_html=True)
118
+
119
+
120
+ def display_chunk_statistics_table(stats: Dict[str, Any], title: str = "Retrieval Distribution"):
121
+ """Display statistics as tables for smaller results with fixed alignment."""
122
+ if not stats or stats.get('total_chunks', 0) == 0:
123
+ return
124
+
125
+ # Wrap in styled container
126
+ st.markdown('<div class="retrieval-distribution-container">', unsafe_allow_html=True)
127
+
128
+ st.subheader(f"πŸ“Š {title}")
129
+
130
+ # Create a container with fixed height for alignment
131
+ stats_container = st.container()
132
+
133
+ with stats_container:
134
+ # Create 4 equal columns for consistent alignment
135
+ col1, col2, col3, col4 = st.columns(4)
136
+
137
+ with col1:
138
+ st.markdown("**🏘️ Districts**")
139
+ if stats.get('district_distribution'):
140
+ district_dist_filtered = {k: v for k, v in stats['district_distribution'].items() if k != 'Unknown'}
141
+ if district_dist_filtered:
142
+ district_data = {
143
+ "District": list(district_dist_filtered.keys()),
144
+ "Count": list(district_dist_filtered.values())
145
+ }
146
+ district_df = pd.DataFrame(district_data).sort_values('Count', ascending=False)
147
+ st.dataframe(district_df, hide_index=True, width='stretch')
148
+ else:
149
+ st.write("No district data")
150
+ else:
151
+ st.write("No district data")
152
+
153
+ with col2:
154
+ st.markdown("**πŸ“‚ Sources**")
155
+ if stats['source_distribution']:
156
+ source_data = {
157
+ "Source": list(stats['source_distribution'].keys()),
158
+ "Count": list(stats['source_distribution'].values())
159
+ }
160
+ source_df = pd.DataFrame(source_data).sort_values('Count', ascending=False)
161
+ st.dataframe(source_df, hide_index=True, width='stretch')
162
+ else:
163
+ st.write("No source data")
164
+
165
+ with col3:
166
+ st.markdown("**πŸ“… Years**")
167
+ if stats['year_distribution']:
168
+ year_dist_filtered = {k: v for k, v in stats['year_distribution'].items() if k != 'Unknown'}
169
+ if year_dist_filtered:
170
+ year_data = {
171
+ "Year": list(year_dist_filtered.keys()),
172
+ "Count": list(year_dist_filtered.values())
173
+ }
174
+ year_df = pd.DataFrame(year_data)
175
+ # Sort by year as integer but display as string
176
+ year_df['Year_Int'] = year_df['Year'].astype(int)
177
+ year_df = year_df.sort_values('Year_Int')[['Year', 'Count']]
178
+ st.dataframe(year_df, hide_index=True, width='stretch')
179
+ else:
180
+ st.write("No year data")
181
+ else:
182
+ st.write("No year data")
183
+
184
+ with col4:
185
+ st.markdown("**πŸ“„ Files**")
186
+ if stats['filename_distribution']:
187
+ filename_items = list(stats['filename_distribution'].items())
188
+ filename_items.sort(key=lambda x: x[1], reverse=True)
189
+
190
+ # Show top files with truncated names
191
+ file_data = {
192
+ "File": [f[:30] + "..." if len(f) > 30 else f for f, c in filename_items[:5]],
193
+ "Count": [c for f, c in filename_items[:5]]
194
+ }
195
+ file_df = pd.DataFrame(file_data)
196
+ st.dataframe(file_df, hide_index=True, width='stretch')
197
+ else:
198
+ st.write("No file data")
199
+
200
+ # Close container
201
+ st.markdown('</div>', unsafe_allow_html=True)
202
+
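For reference, a minimal `stats` payload that satisfies both display functions (values are illustrative; this must run inside a Streamlit script):

```python
from src.ui_components.components import display_chunk_statistics_charts

stats = {
    "total_chunks": 12,
    "unique_sources": 3,
    "unique_years": 2,
    "unique_filenames": 2,
    "source_distribution": {"Gulu DLG": 7, "KCCA": 3, "MAAIF": 2},
    "year_distribution": {"2020": 5, "2021": 7},
    "district_distribution": {"Gulu": 7, "Kampala": 3, "Unknown": 2},
    "filename_distribution": {
        "Gulu DLG Report of Auditor General 2021.pdf": 7,
        "KCCA Report of Auditor General 2020.pdf": 5,
    },
}

display_chunk_statistics_charts(stats)  # 12 chunks -> the chart view is appropriate
```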
src/ui_components/styles.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Custom CSS styles for Streamlit application
3
+ """
4
+
5
+
6
+ def get_custom_css() -> str:
7
+ """Get custom CSS styles as a string"""
8
+ return """
9
+ <style>
10
+ .main-header {
11
+ font-size: 2.5rem;
12
+ font-weight: bold;
13
+ color: #1f77b4;
14
+ text-align: center;
15
+ margin-bottom: 1rem;
16
+ width: 100%;
17
+ display: block;
18
+ }
19
+
20
+ .subtitle {
21
+ font-size: 1.2rem;
22
+ color: #666;
23
+ text-align: center;
24
+ margin-bottom: 2rem;
25
+ width: 100%;
26
+ display: block;
27
+ }
28
+
29
+ .session-info {
30
+ background-color: #f0f2f6;
31
+ padding: 10px;
32
+ border-radius: 5px;
33
+ margin-bottom: 20px;
34
+ font-size: 0.9rem;
35
+ }
36
+
37
+ .user-message {
38
+ background-color: #007bff;
39
+ color: white;
40
+ padding: 12px 16px;
41
+ border-radius: 18px 18px 4px 18px;
42
+ margin: 8px 0;
43
+ margin-left: 20%;
44
+ word-wrap: break-word;
45
+ }
46
+
47
+ .bot-message {
48
+ background-color: #f1f3f4;
49
+ color: #333;
50
+ padding: 12px 16px;
51
+ border-radius: 18px 18px 18px 4px;
52
+ margin: 8px 0;
53
+ margin-right: 20%;
54
+ word-wrap: break-word;
55
+ border: 1px solid #e0e0e0;
56
+ }
57
+
58
+ .filter-section {
59
+ margin-bottom: 20px;
60
+ padding: 15px;
61
+ background-color: #f8f9fa;
62
+ border-radius: 8px;
63
+ border: 1px solid #e9ecef;
64
+ }
65
+
66
+ .filter-title {
67
+ font-weight: bold;
68
+ margin-bottom: 10px;
69
+ color: #495057;
70
+ }
71
+
72
+ .feedback-section {
73
+ background-color: #f8f9fa;
74
+ padding: 20px;
75
+ border-radius: 10px;
76
+ margin-top: 30px;
77
+ border: 2px solid #dee2e6;
78
+ }
79
+
80
+ .retrieval-history {
81
+ background-color: #ffffff;
82
+ padding: 15px;
83
+ border-radius: 5px;
84
+ margin: 10px 0;
85
+ border-left: 4px solid #007bff;
86
+ }
87
+
88
+ .retrieval-distribution-container {
89
+ background-color: #ffffff;
90
+ padding: 25px;
91
+ border-radius: 10px;
92
+ margin: 20px 0;
93
+ border: 2px solid #e0e0e0;
94
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1), 0 2px 4px rgba(0, 0, 0, 0.06);
95
+ }
96
+
97
+ .metric-label {
98
+ font-size: 0.9rem;
99
+ color: #555;
100
+ margin-bottom: 5px;
101
+ text-align: center;
102
+ }
103
+
104
+ .metric-value {
105
+ font-size: 1.8rem;
106
+ font-weight: bold;
107
+ color: #000000;
108
+ text-align: center;
109
+ }
110
+
111
+ .metric-container {
112
+ text-align: center;
113
+ padding: 10px;
114
+ }
115
+ </style>
116
+ """
117
+
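Usage sketch: inject the stylesheet once near the top of the app, then reference the classes from markdown (the header text is illustrative):

```python
import streamlit as st
from src.ui_components.styles import get_custom_css

st.markdown(get_custom_css(), unsafe_allow_html=True)  # inject once per page
st.markdown('<div class="main-header">Audit QA</div>', unsafe_allow_html=True)
```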
src/ui_components/utils.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ UI utility functions for data processing and statistics
3
+ """
4
+
5
+ from typing import Dict, Any, List
6
+ from collections import Counter
7
+
8
+
9
+ def extract_chunk_statistics(sources: List[Any]) -> Dict[str, Any]:
10
+ """Extract statistics from retrieved chunks."""
11
+ if not sources:
12
+ return {}
13
+
14
+ sources_list = []
15
+ years = []
16
+ filenames = []
17
+ districts = []
18
+
19
+ for doc in sources:
20
+ metadata = getattr(doc, 'metadata', {})
21
+
22
+ # Extract source
23
+ source = metadata.get('source', 'Unknown')
24
+ sources_list.append(source)
25
+
26
+ # Extract year
27
+ year = metadata.get('year', 'Unknown')
28
+ if year and year != 'Unknown':
29
+ try:
30
+ # Convert to int first, then back to string to ensure it's a proper year
31
+ year_int = int(float(year)) # Handle both int and float strings
32
+ if 1900 <= year_int <= 2030: # Reasonable year range
33
+ years.append(str(year_int))
34
+ else:
35
+ years.append('Unknown')
36
+ except (ValueError, TypeError):
37
+ years.append('Unknown')
38
+ else:
39
+ years.append('Unknown')
40
+
41
+ # Extract filename
42
+ filename = metadata.get('filename', 'Unknown')
43
+ filenames.append(filename)
44
+
45
+ # Extract district
46
+ district = metadata.get('district', 'Unknown')
47
+ if district and district != 'Unknown':
48
+ districts.append(district)
49
+ else:
50
+ districts.append('Unknown')
51
+
52
+ # Count occurrences
53
+ source_counts = Counter(sources_list)
54
+ year_counts = Counter(years)
55
+ filename_counts = Counter(filenames)
56
+ district_counts = Counter(districts)
57
+
58
+ return {
59
+ 'total_chunks': len(sources),
60
+ 'unique_sources': len(source_counts),
61
+ 'unique_years': len([y for y in year_counts.keys() if y != 'Unknown']),
62
+ 'unique_filenames': len(filename_counts),
63
+ 'unique_districts': len([d for d in district_counts.keys() if d != 'Unknown']),
64
+ 'source_distribution': dict(source_counts),
65
+ 'year_distribution': dict(year_counts),
66
+ 'filename_distribution': dict(filename_counts),
67
+ 'district_distribution': dict(district_counts),
68
+ 'sources': sources_list,
69
+ 'years': years,
70
+ 'filenames': filenames,
71
+ 'districts': districts
72
+ }
73
+
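A small sketch of the extractor on LangChain `Document` objects, showing how float-string years are normalized and how missing metadata buckets as `'Unknown'`:

```python
from langchain.docstore.document import Document
from src.ui_components.utils import extract_chunk_statistics

docs = [
    Document(page_content="...", metadata={
        "source": "Gulu DLG", "year": "2021.0", "district": "Gulu",
        "filename": "Gulu DLG Report of Auditor General 2021.pdf",
    }),
    Document(page_content="...", metadata={"source": "KCCA"}),  # missing fields bucket as 'Unknown'
]

stats = extract_chunk_statistics(docs)
print(stats["year_distribution"])   # {'2021': 1, 'Unknown': 1} - '2021.0' is normalized
print(stats["unique_districts"])    # 1 - 'Unknown' is excluded from the unique count
```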
utils.py β†’ src/utils.py RENAMED
File without changes
src/vectorstore.py CHANGED
@@ -1,9 +1,20 @@
1
  """Vector store management and operations."""
 
 
 
 
 
2
  from pathlib import Path
3
  from typing import Dict, Any, List, Optional
4
 
5
 
6
  import torch
 
 
 
 
 
 
7
  from langchain_qdrant import QdrantVectorStore
8
  from langchain.docstore.document import Document
9
  from langchain_core.embeddings import Embeddings
@@ -28,11 +39,23 @@ class MatryoshkaEmbeddings(Embeddings):
28
 
29
  if truncate_dim and "matryoshka" in model_name.lower():
30
  # Use SentenceTransformer directly for Matryoshka models
31
- device = "cuda" if torch.cuda.is_available() else "cpu"
32
- self.model = SentenceTransformer(model_name, truncate_dim=truncate_dim, device=device)
 
 
 
 
 
 
33
  print(f"πŸ”§ Matryoshka model configured for {truncate_dim} dimensions")
34
  else:
35
  # Use standard HuggingFaceEmbeddings
 
 
 
 
 
 
36
  self.model = HuggingFaceEmbeddings(model_name=model_name, **kwargs)
37
 
38
  def embed_documents(self, texts: List[str]) -> List[List[float]]:
@@ -76,12 +99,17 @@ class VectorStoreManager:
76
 
77
  def _create_embeddings(self) -> HuggingFaceEmbeddings:
78
  """Create embeddings model from configuration."""
79
- device = "cuda" if torch.cuda.is_available() else "cpu"
80
-
81
  model_name = self.config["retriever"]["model"]
82
  normalize = self.config["retriever"]["normalize"]
83
 
84
- model_kwargs = {"device": device}
 
 
 
 
 
 
 
85
  encode_kwargs = {
86
  "normalize_embeddings": normalize,
87
  "batch_size": 100,
@@ -108,6 +136,8 @@ class VectorStoreManager:
108
  return embeddings
109
 
110
  # Use standard HuggingFaceEmbeddings for non-Matryoshka models
 
 
111
  embeddings = HuggingFaceEmbeddings(
112
  model_name=model_name,
113
  model_kwargs=model_kwargs,
 
1
  """Vector store management and operations."""
2
+ import os
3
+ # Set MPS-related env vars before importing torch to mitigate meta tensor issues on Mac (MPS itself is patched off below)
4
+ os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
5
+ os.environ.setdefault("PYTORCH_MPS_HIGH_WATERMARK_RATIO", "0.0")
6
+
7
  from pathlib import Path
8
  from typing import Dict, Any, List, Optional
9
 
10
 
11
  import torch
12
+ # Disable MPS backend explicitly to prevent meta tensor issues
13
+ if hasattr(torch.backends, 'mps'):
14
+ # Monkey patch to disable MPS
15
+ original_mps_available = torch.backends.mps.is_available  # keep a handle to the original so the patch can be reverted
16
+ torch.backends.mps.is_available = lambda: False
17
+
18
  from langchain_qdrant import QdrantVectorStore
19
  from langchain.docstore.document import Document
20
  from langchain_core.embeddings import Embeddings
 
39
 
40
  if truncate_dim and "matryoshka" in model_name.lower():
41
  # Use SentenceTransformer directly for Matryoshka models
42
+ # Fix for meta tensor issue: Explicitly force CPU
43
+ # MPS is already disabled at module level
44
+ # Explicitly pass device="cpu" to prevent MPS/CUDA detection
45
+ self.model = SentenceTransformer(
46
+ model_name,
47
+ truncate_dim=truncate_dim,
48
+ device="cpu" # Force CPU to prevent meta tensor issues
49
+ )
50
  print(f"πŸ”§ Matryoshka model configured for {truncate_dim} dimensions")
51
  else:
52
  # Use standard HuggingFaceEmbeddings
53
+ # Don't pass device parameter - let it load naturally on CPU
54
+ # This prevents the meta tensor error
55
+ if "model_kwargs" not in kwargs:
56
+ kwargs["model_kwargs"] = {}
57
+ # Remove device from model_kwargs if present to prevent meta tensor issues
58
+ kwargs["model_kwargs"].pop("device", None)
59
  self.model = HuggingFaceEmbeddings(model_name=model_name, **kwargs)
60
 
61
  def embed_documents(self, texts: List[str]) -> List[List[float]]:
 
99
 
100
  def _create_embeddings(self) -> HuggingFaceEmbeddings:
101
  """Create embeddings model from configuration."""
 
 
102
  model_name = self.config["retriever"]["model"]
103
  normalize = self.config["retriever"]["normalize"]
104
 
105
+ # Fix for meta tensor issue: Force CPU usage to prevent MPS/CUDA detection
106
+ # The error occurs when SentenceTransformer detects MPS/CUDA and tries to move meta tensors
107
+ # MPS is already disabled at module level, now we explicitly force CPU in model_kwargs
108
+ model_kwargs = {
109
+ "device": "cpu", # Explicitly force CPU to prevent MPS/CUDA detection
110
+ "trust_remote_code": True, # Some models need this
111
+ }
112
+
113
  encode_kwargs = {
114
  "normalize_embeddings": normalize,
115
  "batch_size": 100,
 
136
  return embeddings
137
 
138
  # Use standard HuggingFaceEmbeddings for non-Matryoshka models
139
+ # model_kwargs already pins device="cpu" (set above), so HuggingFaceEmbeddings
140
+ # loads on CPU and never touches the meta device
141
  embeddings = HuggingFaceEmbeddings(
142
  model_name=model_name,
143
  model_kwargs=model_kwargs,
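The CPU-forcing workaround above can be smoke-tested in isolation; a minimal sketch (model name illustrative, `sentence-transformers` assumed installed):

```python
import os
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")

import torch
if hasattr(torch.backends, "mps"):
    torch.backends.mps.is_available = lambda: False  # hide MPS from device auto-detection

from sentence_transformers import SentenceTransformer

# device="cpu" sidesteps meta-tensor moves onto MPS/CUDA
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cpu")
print(model.encode(["smoke test"]).shape)  # (1, 384) for this model
```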