Spaces:
Sleeping
Sleeping
demo version
#5
by
Yeroyan
- opened
- .gitignore +112 -0
- add_district_metadata.py +379 -0
- app.py +804 -233
- multi_agent_chatbot.py +12 -13
- smart_chatbot.py +4 -3
- src/config/paths.py +59 -0
- src/pipeline.py +32 -37
.gitignore
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==========================================
|
| 2 |
+
# PYTHON
|
| 3 |
+
# ==========================================
|
| 4 |
+
__pycache__/
|
| 5 |
+
*.py[cod]
|
| 6 |
+
*.pyo
|
| 7 |
+
*.pyd
|
| 8 |
+
*$py.class
|
| 9 |
+
|
| 10 |
+
# Virtual environments
|
| 11 |
+
.venv/
|
| 12 |
+
venv/
|
| 13 |
+
env/
|
| 14 |
+
ENV/
|
| 15 |
+
.conda/
|
| 16 |
+
.venv*/
|
| 17 |
+
|
| 18 |
+
# Compiled extensions / shared libraries
|
| 19 |
+
*.so
|
| 20 |
+
*.dll
|
| 21 |
+
*.dylib
|
| 22 |
+
|
| 23 |
+
# Logs and debug
|
| 24 |
+
*.log
|
| 25 |
+
*.out
|
| 26 |
+
*.err
|
| 27 |
+
logs/
|
| 28 |
+
debug/
|
| 29 |
+
*.sqlite3
|
| 30 |
+
|
| 31 |
+
# ==========================================
|
| 32 |
+
# BUILD / PACKAGING
|
| 33 |
+
# ==========================================
|
| 34 |
+
build/
|
| 35 |
+
dist/
|
| 36 |
+
*.egg-info/
|
| 37 |
+
.eggs/
|
| 38 |
+
pip-wheel-metadata/
|
| 39 |
+
.wheels/
|
| 40 |
+
|
| 41 |
+
# ==========================================
|
| 42 |
+
# JUPYTER / NOTEBOOKS
|
| 43 |
+
# ==========================================
|
| 44 |
+
.ipynb_checkpoints/
|
| 45 |
+
*.ipynb_convert/
|
| 46 |
+
|
| 47 |
+
# ==========================================
|
| 48 |
+
# DATA / MODELS / CACHE
|
| 49 |
+
# ==========================================
|
| 50 |
+
data/
|
| 51 |
+
datasets/
|
| 52 |
+
.cache/
|
| 53 |
+
*.ckpt
|
| 54 |
+
*.h5
|
| 55 |
+
*.hdf5
|
| 56 |
+
*.tflite
|
| 57 |
+
*.onnx
|
| 58 |
+
*.pth
|
| 59 |
+
*.pt
|
| 60 |
+
*.joblib
|
| 61 |
+
*.pkl
|
| 62 |
+
*.pickle
|
| 63 |
+
*.npz
|
| 64 |
+
*.npy
|
| 65 |
+
outputs/
|
| 66 |
+
artifacts/
|
| 67 |
+
checkpoints/
|
| 68 |
+
runs/
|
| 69 |
+
wandb/
|
| 70 |
+
mlruns/
|
| 71 |
+
lightning_logs/
|
| 72 |
+
|
| 73 |
+
# Hugging Face
|
| 74 |
+
huggingface/
|
| 75 |
+
~/.cache/huggingface/
|
| 76 |
+
~/.cache/torch/
|
| 77 |
+
~/.cache/datasets/
|
| 78 |
+
~/.cache/transformers/
|
| 79 |
+
|
| 80 |
+
# ==========================================
|
| 81 |
+
# EDITORS / TOOLS
|
| 82 |
+
# ==========================================
|
| 83 |
+
.vscode/
|
| 84 |
+
.idea/
|
| 85 |
+
*.swp
|
| 86 |
+
*.swo
|
| 87 |
+
*.bak
|
| 88 |
+
.DS_Store
|
| 89 |
+
Thumbs.db
|
| 90 |
+
|
| 91 |
+
# ==========================================
|
| 92 |
+
# ENV FILES / CREDENTIALS
|
| 93 |
+
# ==========================================
|
| 94 |
+
.env
|
| 95 |
+
.env.*
|
| 96 |
+
*.env.local
|
| 97 |
+
secrets.*
|
| 98 |
+
config.json
|
| 99 |
+
token.json
|
| 100 |
+
|
| 101 |
+
# ==========================================
|
| 102 |
+
# TESTS / TEMP FILES
|
| 103 |
+
# ==========================================
|
| 104 |
+
__tests__/
|
| 105 |
+
.tox/
|
| 106 |
+
.coverage
|
| 107 |
+
.cache/
|
| 108 |
+
.pytest_cache/
|
| 109 |
+
tmp/
|
| 110 |
+
temp/
|
| 111 |
+
*.tmp
|
| 112 |
+
*.temp
|
add_district_metadata.py
ADDED
|
@@ -0,0 +1,379 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Script to add District metadata to Qdrant chunks based on filename analysis.
|
| 4 |
+
Handles Uganda districts and ministry mappings; LLM inference for ambiguous cases is a stub that is not implemented yet.
|
| 5 |
+
"""
|
| 6 |
+
import re
|
| 7 |
+
import yaml
|
| 8 |
+
import logging
|
| 9 |
+
from dataclasses import dataclass
|
| 10 |
+
from typing import Dict, List, Optional
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
from qdrant_client import QdrantClient
|
| 14 |
+
|
| 15 |
+
# Configure logging
|
| 16 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@dataclass
class DistrictMapping:
    """Mapping for district-related entities"""
    # Canonical district name; this is the value written into chunk metadata.
    name: str
    # Alternative spellings / abbreviations that resolve to ``name``.
    aliases: List[str]
    # Defaults to True; the processor below never reads this flag.
    is_district: bool = True


class DistrictMetadataProcessor:
    """Tag Qdrant chunks with a ``district`` metadata field inferred from filenames.

    The processor loads its settings from a YAML file, builds an alias table of
    Uganda districts, and walks the configured Qdrant collection in batches,
    writing the inferred district (or ``None``) into each chunk's ``metadata``.
    """

    # Splits camelCase / PascalCase boundaries ("GuluDLGReport" -> "Gulu DLG Report").
    _CAMEL_BOUNDARY = re.compile(r"(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])")
    # Any run of characters that is not a lowercase ASCII letter is a separator.
    _NON_LETTERS = re.compile(r"[^a-z]+")

    def __init__(self, config_path: str = "src/config/settings.yaml"):
        """Load the YAML config and build the lookup tables.

        Args:
            config_path: Path to a YAML file containing a ``qdrant`` section
                with ``collection_name``, ``url`` and ``api_key`` keys.

        Raises:
            OSError: If the config file cannot be opened.
            KeyError: If ``qdrant.collection_name`` is missing.
        """
        with open(config_path, 'r', encoding="utf-8") as f:
            self.config = yaml.safe_load(f)

        # Clients are created lazily on first use so that constructing the
        # processor does not open a network connection.
        self.llm_client = None
        self.qdrant_client = None
        self.collection_name = self.config["qdrant"]["collection_name"]

        # Lookup tables used by the filename matcher.
        self.district_mappings = self._initialize_district_mappings()
        self.ministry_mappings = self._initialize_ministry_mappings()

    def _initialize_district_mappings(self) -> Dict[str, DistrictMapping]:
        """Build a lowercase name/alias -> DistrictMapping lookup table.

        Returns:
            A dict keyed by every lowercased district name and alias. Insertion
            order follows the list below, and the matcher returns the first hit,
            so the order decides which district wins for ambiguous filenames.
        """
        districts = [
            # Central Region
            DistrictMapping("Kampala", ["KCCA", "Kampala Capital City Authority"]),
            DistrictMapping("Wakiso", ["Wakiso"]),
            DistrictMapping("Mukono", ["Mukono"]),
            DistrictMapping("Luweero", ["Luweero"]),
            DistrictMapping("Nakaseke", ["Nakaseke"]),
            DistrictMapping("Nakasongola", ["Nakasongola"]),
            DistrictMapping("Kayunga", ["Kayunga"]),
            DistrictMapping("Buikwe", ["Buikwe"]),
            DistrictMapping("Buvuma", ["Buvuma"]),

            # Northern Region
            DistrictMapping("Gulu", ["Gulu", "Gulu DLG"]),
            DistrictMapping("Kitgum", ["Kitgum"]),
            DistrictMapping("Pader", ["Pader"]),
            DistrictMapping("Agago", ["Agago"]),
            DistrictMapping("Lamwo", ["Lamwo"]),
            DistrictMapping("Nwoya", ["Nwoya"]),
            DistrictMapping("Amuru", ["Amuru"]),
            DistrictMapping("Omoro", ["Omoro"]),
            DistrictMapping("Oyam", ["Oyam"]),
            DistrictMapping("Kole", ["Kole"]),
            DistrictMapping("Apac", ["Apac", "Apac District"]),
            DistrictMapping("Lira", ["Lira"]),
            DistrictMapping("Alebtong", ["Alebtong"]),
            DistrictMapping("Amolatar", ["Amolatar"]),
            DistrictMapping("Dokolo", ["Dokolo"]),
            DistrictMapping("Otuke", ["Otuke"]),
            DistrictMapping("Kwania", ["Kwania"]),

            # Eastern Region
            DistrictMapping("Jinja", ["Jinja"]),
            DistrictMapping("Kamuli", ["Kamuli"]),
            DistrictMapping("Iganga", ["Iganga"]),
            DistrictMapping("Bugiri", ["Bugiri"]),
            DistrictMapping("Mayuge", ["Mayuge"]),
            DistrictMapping("Namayingo", ["Namayingo"]),
            DistrictMapping("Busia", ["Busia"]),
            DistrictMapping("Tororo", ["Tororo"]),
            DistrictMapping("Pallisa", ["Pallisa"]),
            DistrictMapping("Kumi", ["Kumi"]),
            DistrictMapping("Bukedea", ["Bukedea"]),
            DistrictMapping("Soroti", ["Soroti"]),
            DistrictMapping("Serere", ["Serere"]),
            DistrictMapping("Ngora", ["Ngora"]),
            DistrictMapping("Kaberamaido", ["Kaberamaido"]),
            DistrictMapping("Kalaki", ["Kalaki"]),
            DistrictMapping("Kapelebyong", ["Kapelebyong"]),
            DistrictMapping("Amuria", ["Amuria"]),
            DistrictMapping("Katakwi", ["Katakwi"]),
            DistrictMapping("Kotido", ["Kotido"]),
            DistrictMapping("Abim", ["Abim"]),
            DistrictMapping("Kaabong", ["Kaabong", "Kaabong District"]),
            DistrictMapping("Karenga", ["Karenga"]),
            DistrictMapping("Moroto", ["Moroto"]),
            DistrictMapping("Napak", ["Napak"]),
            DistrictMapping("Nabilatuk", ["Nabilatuk"]),
            DistrictMapping("Amudat", ["Amudat"]),
            DistrictMapping("Nakapiripirit", ["Nakapiripirit"]),
            DistrictMapping("Bukwo", ["Bukwo"]),
            DistrictMapping("Kween", ["Kween"]),
            DistrictMapping("Kapchorwa", ["Kapchorwa"]),
            DistrictMapping("Sironko", ["Sironko"]),
            DistrictMapping("Manafwa", ["Manafwa"]),
            DistrictMapping("Bududa", ["Bududa"]),
            DistrictMapping("Mbale", ["Mbale"]),
            DistrictMapping("Butaleja", ["Butaleja"]),
            DistrictMapping("Namisindwa", ["Namisindwa"]),
            DistrictMapping("Bulambuli", ["Bulambuli"]),

            # Western Region
            DistrictMapping("Masaka", ["Masaka"]),
            DistrictMapping("Kalungu", ["Kalungu"]),
            DistrictMapping("Bukomansimbi", ["Bukomansimbi"]),
            DistrictMapping("Lwengo", ["Lwengo"]),
            DistrictMapping("Sembabule", ["Sembabule"]),
            DistrictMapping("Rakai", ["Rakai"]),
            DistrictMapping("Kyotera", ["Kyotera"]),
            DistrictMapping("Mpigi", ["Mpigi"]),
            DistrictMapping("Butambala", ["Butambala"]),
            DistrictMapping("Gomba", ["Gomba"]),
            DistrictMapping("Mityana", ["Mityana"]),
            DistrictMapping("Mubende", ["Mubende"]),
            DistrictMapping("Kassanda", ["Kassanda"]),
            DistrictMapping("Kiboga", ["Kiboga"]),
            DistrictMapping("Kyankwanzi", ["Kyankwanzi"]),
            DistrictMapping("Hoima", ["Hoima"]),
            DistrictMapping("Kikuube", ["Kikuube"]),
            DistrictMapping("Kakumiro", ["Kakumiro"]),
            DistrictMapping("Kibaale", ["Kibaale"]),
            DistrictMapping("Kagadi", ["Kagadi"]),
            DistrictMapping("Buliisa", ["Buliisa"]),
            DistrictMapping("Masindi", ["Masindi"]),
            DistrictMapping("Kiryandongo", ["Kiryandongo"]),
            DistrictMapping("Pakwach", ["Pakwach"]),
            DistrictMapping("Nebbi", ["Nebbi"]),
            DistrictMapping("Zombo", ["Zombo"]),
            DistrictMapping("Arua", ["Arua"]),
            DistrictMapping("Terego", ["Terego"]),
            DistrictMapping("Madi-Okollo", ["Madi-Okollo"]),
            DistrictMapping("Obongi", ["Obongi"]),
            DistrictMapping("Moyo", ["Moyo"]),
            DistrictMapping("Yumbe", ["Yumbe"]),
            DistrictMapping("Koboko", ["Koboko"]),
            DistrictMapping("Maracha", ["Maracha"]),
            DistrictMapping("Adjumani", ["Adjumani"]),

            # South Western Region
            DistrictMapping("Mbarara", ["Mbarara"]),
            DistrictMapping("Ibanda", ["Ibanda"]),
            DistrictMapping("Isingiro", ["Isingiro"]),
            DistrictMapping("Kiruhura", ["Kiruhura"]),
            DistrictMapping("Kazo", ["Kazo"]),
            DistrictMapping("Ntungamo", ["Ntungamo"]),
            DistrictMapping("Rwampara", ["Rwampara"]),
            DistrictMapping("Rubanda", ["Rubanda"]),
            DistrictMapping("Rukiga", ["Rukiga"]),
            DistrictMapping("Kanungu", ["Kanungu"]),
            DistrictMapping("Rukungiri", ["Rukungiri"]),
            DistrictMapping("Kisoro", ["Kisoro"]),
            DistrictMapping("Bundibugyo", ["Bundibugyo"]),
            DistrictMapping("Ntoroko", ["Ntoroko"]),
            DistrictMapping("Kasese", ["Kasese"]),
            DistrictMapping("Bunyangabu", ["Bunyangabu"]),
            DistrictMapping("Fort Portal", ["Fort Portal"]),
            DistrictMapping("Kabarole", ["Kabarole"]),
            DistrictMapping("Kyenjojo", ["Kyenjojo"]),
            DistrictMapping("Kamwenge", ["Kamwenge"]),
            DistrictMapping("Kitagwenda", ["Kitagwenda"]),
            DistrictMapping("Kyegegwa", ["Kyegegwa"]),
            DistrictMapping("Mitooma", ["Mitooma"]),
            DistrictMapping("Rubirizi", ["Rubirizi"]),
            DistrictMapping("Sheema", ["Sheema"]),
            DistrictMapping("Bushenyi", ["Bushenyi"]),

            # Special cases
            DistrictMapping("Kalangala", ["Kalangala", "Kalangala DLG"]),
        ]

        # Index every district under its own name and under each alias.
        mapping_dict: Dict[str, DistrictMapping] = {}
        for district in districts:
            mapping_dict[district.name.lower()] = district
            for alias in district.aliases:
                mapping_dict[alias.lower()] = district
        return mapping_dict

    def _initialize_ministry_mappings(self) -> Dict[str, str]:
        """Return lowercase acronym -> full name for ministries and organisations."""
        return {
            "maaif": "Ministry of Agriculture, Animal Industry and Fisheries",
            "mwts": "Ministry of Works and Transport",
            "kcca": "Kampala Capital City Authority",
            "oag": "Office of the Auditor General",
            "arsdp": "Albertine Regional Sustainable Development Project",
            "avcdp": "Agriculture Value Chain Development Project",
            "ida": "International Development Association",
            "dlg": "District Local Government",
            "lg": "Local Government",
        }

    @classmethod
    def _normalize(cls, text: str) -> str:
        """Reduce *text* to lowercase letter-only words separated by single spaces.

        CamelCase boundaries, digits, underscores, hyphens, dots and any other
        non-letter characters all become word separators, so
        ``"Gulu_DLG-Report2021.pdf"`` becomes ``"gulu dlg report pdf"``.
        """
        spaced = cls._CAMEL_BOUNDARY.sub(" ", text)
        return cls._NON_LETTERS.sub(" ", spaced.lower()).strip()

    def _extract_district_from_filename(self, filename: str) -> Optional[str]:
        """Extract district from filename using whole-word alias matching.

        Matching is done on whole words of the normalised filename rather than
        on raw substrings, so a short district name is not found inside a
        longer word (e.g. "apac" inside "capacity"), while separators such as
        ``_`` and ``-`` no longer hide multi-word aliases ("fort_portal").

        Args:
            filename: Source document filename taken from chunk metadata.

        Returns:
            The canonical district name, or ``None`` when no district alias is
            present (including filenames that only name a ministry/organisation).
        """
        # Pad with spaces so every word, including the first and last, is
        # delimited on both sides and a plain ``in`` test is a whole-word test.
        haystack = f" {self._normalize(filename)} "

        # Explicit district names / aliases; the first hit in table order wins.
        for key, district_mapping in self.district_mappings.items():
            needle = self._normalize(key)
            if needle and f" {needle} " in haystack:
                return district_mapping.name

        # Ministry / organisation acronyms are NOT districts. The result is
        # None either way; the branch is kept explicit so "known non-district"
        # stays distinguishable from "nothing recognised" in this method.
        for ministry_key in self.ministry_mappings:
            needle = self._normalize(ministry_key)
            if needle and f" {needle} " in haystack:
                return None

        # The former "<name> district local government" / "<name> dlg report"
        # regex fallbacks were unreachable: they could only resolve names that
        # are already keys of district_mappings, which the scan above catches.
        return None

    def _infer_district_with_llm(self, filename: str) -> Optional[str]:
        """Placeholder for LLM-based inference; currently always returns ``None``."""
        # For now, return None - LLM integration can be added later
        logger.info(f"LLM inference needed for filename: {filename}")
        return None

    def infer_district(self, filename: str) -> Optional[str]:
        """Infer the district for *filename*.

        Tries alias matching first and falls back to the (currently stubbed)
        LLM step when matching yields nothing.
        """
        district = self._extract_district_from_filename(filename)
        if district:
            return district
        return self._infer_district_with_llm(filename)

    def _get_qdrant_client(self):
        """Return the shared Qdrant client, creating it on first use."""
        if self.qdrant_client is None:
            self.qdrant_client = QdrantClient(
                url=self.config["qdrant"]["url"],
                api_key=self.config["qdrant"]["api_key"]
            )
        return self.qdrant_client

    def _scroll_page(self, batch_size: int, offset):
        """Fetch one page of points (payload only, no vectors).

        Args:
            batch_size: Maximum number of points to return.
            offset: Qdrant scroll offset, i.e. the point ID to start from, or
                ``None`` to start at the beginning of the collection.

        Returns:
            The ``(points, next_page_offset)`` tuple returned by
            ``QdrantClient.scroll``; ``next_page_offset`` is ``None`` on the
            last page.
        """
        return self._get_qdrant_client().scroll(
            collection_name=self.collection_name,
            limit=batch_size,
            offset=offset,
            with_payload=True,
            with_vectors=False
        )

    def fetch_chunks_batch(self, batch_size: int = 100, offset: int = 0) -> List[Dict]:
        """Fetch a batch of chunks from Qdrant (metadata only).

        Args:
            batch_size: Maximum number of points to return.
            offset: Passed straight to ``QdrantClient.scroll``. NOTE: Qdrant
                interprets this as a point ID to start from, not as a count of
                points to skip, so it must not be used for ``offset +=
                batch_size`` style paging.

        Returns:
            The list of points for this page, or ``[]`` if the request fails.
            The elements are annotated as ``Dict`` for backward compatibility,
            but callers in this file access ``.id`` and ``.payload`` on them.
        """
        try:
            points, _next_offset = self._scroll_page(batch_size, offset)
            return points
        except Exception as e:
            logger.error(f"Failed to fetch batch: {e}")
            return []

    def update_chunks_with_district(self, points: List[Dict]) -> int:
        """Write the inferred district into each point's ``metadata`` payload.

        Points without a filename are skipped with a warning. A failure on one
        point is logged and does not stop the rest of the batch.

        Args:
            points: Points as returned by :meth:`fetch_chunks_batch`.

        Returns:
            Number of points whose payload was updated.
        """
        updated_count = 0
        client = self._get_qdrant_client()

        for point in points:
            # Resolved before the try block so the error log below can always
            # name the point, even if reading the payload is what failed.
            point_id = getattr(point, "id", None)
            try:
                metadata = point.payload.get("metadata", {})
                filename = metadata.get("filename", "")

                if not filename:
                    logger.warning(f"Point {point_id} has no filename")
                    continue

                district = self.infer_district(filename)

                # Copy so the fetched point object is left untouched.
                updated_metadata = metadata.copy()
                updated_metadata["district"] = district

                client.set_payload(
                    collection_name=self.collection_name,
                    payload={"metadata": updated_metadata},
                    points=[point_id]
                )

                updated_count += 1
                logger.info(f"Updated point {point_id}: {filename} -> {district}")

            except Exception as e:
                logger.error(f"Failed to update point {point_id}: {e}")

        return updated_count

    def process_all_chunks(self, batch_size: int = 100) -> int:
        """Process every chunk in the collection, one scroll page at a time.

        Paging follows the ``next_page_offset`` returned by Qdrant instead of
        a numeric counter, because the scroll offset is a point ID.

        Args:
            batch_size: Number of points to request per page.

        Returns:
            Total number of points updated.
        """
        total_updated = 0
        offset = None  # None = start from the first point in the collection

        logger.info(f"Starting to process chunks in batches of {batch_size}")

        while True:
            try:
                points, next_offset = self._scroll_page(batch_size, offset)
            except Exception as e:
                logger.error(f"Failed to fetch batch: {e}")
                break

            if not points:
                break

            logger.info(f"Processing batch: {len(points)} points (offset: {offset})")

            updated_count = self.update_chunks_with_district(points)
            total_updated += updated_count

            logger.info(f"Updated {updated_count} points in this batch")

            # Qdrant signals the last page with a None next offset.
            if next_offset is None:
                break
            offset = next_offset

        logger.info(f"Total updated: {total_updated} points")
        return total_updated
|
| 350 |
+
|
| 351 |
+
def main():
    """Run the district metadata processor interactively.

    Previews district inference on the first 10 chunks, then asks for
    confirmation on stdin before updating the whole collection.

    Raises:
        Exception: Any error is logged and re-raised so the script exits
            with a failure.
    """
    try:
        processor = DistrictMetadataProcessor()

        # Dry run on a small sample first; nothing is written to Qdrant here.
        logger.info("Testing with first 10 chunks...")
        test_points = processor.fetch_chunks_batch(10, 0)

        if not test_points:
            # fetch_chunks_batch returns [] both for an empty collection and
            # after a logged fetch error, so say why nothing happens next.
            logger.warning("No chunks fetched from Qdrant; nothing to process")
            return

        logger.info("Test batch fetched successfully. Processing...")
        for point in test_points:
            # Tolerate points that carry no payload at all.
            payload = point.payload or {}
            filename = payload.get("metadata", {}).get("filename", "")
            district = processor.infer_district(filename)
            logger.info(f"Test: {filename} -> {district}")

        # Ask user if they want to proceed with full processing
        response = input("\nProceed with full processing? (y/n): ")
        # Ignore surrounding whitespace and accept "yes" as well as "y".
        if response.strip().lower() in ("y", "yes"):
            processor.process_all_chunks(batch_size=100)
        else:
            logger.info("Processing cancelled by user")

    except Exception as e:
        logger.error(f"Error in main: {e}")
        raise


# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|
app.py
CHANGED
|
@@ -3,7 +3,32 @@ Intelligent Audit Report Chatbot UI
|
|
| 3 |
"""
|
| 4 |
|
| 5 |
import os
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
# ===== CRITICAL: Fix OMP_NUM_THREADS FIRST, before ANY other imports =====
|
| 9 |
# Some libraries load at import time and will fail if OMP_NUM_THREADS is invalid
|
|
@@ -29,42 +54,30 @@ except (ValueError, TypeError):
|
|
| 29 |
|
| 30 |
# ===== Setup HuggingFace cache directories BEFORE any model imports =====
|
| 31 |
# CRITICAL: Set these before any imports that might use HuggingFace (like sentence-transformers)
|
| 32 |
-
#
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
os.environ["
|
| 36 |
-
os.environ["
|
| 37 |
-
os.environ["
|
| 38 |
-
os.environ["
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
import time
|
| 49 |
-
import json
|
| 50 |
-
import uuid
|
| 51 |
-
import logging
|
| 52 |
-
from pathlib import Path
|
| 53 |
-
|
| 54 |
-
import argparse
|
| 55 |
-
import streamlit as st
|
| 56 |
-
from langchain_core.messages import HumanMessage, AIMessage
|
| 57 |
-
|
| 58 |
-
from multi_agent_chatbot import get_multi_agent_chatbot
|
| 59 |
-
from smart_chatbot import get_chatbot as get_smart_chatbot
|
| 60 |
-
from src.reporting.feedback_schema import create_feedback_from_dict
|
| 61 |
|
| 62 |
# Configure logging
|
| 63 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 64 |
logger = logging.getLogger(__name__)
|
| 65 |
|
| 66 |
# Log environment setup for debugging
|
| 67 |
-
logger.info(f"
|
|
|
|
|
|
|
| 68 |
logger.info(f"π§ OMP_NUM_THREADS: {os.environ.get('OMP_NUM_THREADS', 'NOT SET')}")
|
| 69 |
|
| 70 |
|
|
@@ -94,6 +107,54 @@ st.markdown("""
|
|
| 94 |
margin-bottom: 2rem;
|
| 95 |
}
|
| 96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
.session-info {
|
| 98 |
background-color: #f0f2f6;
|
| 99 |
padding: 10px;
|
|
@@ -152,6 +213,34 @@ st.markdown("""
|
|
| 152 |
margin: 10px 0;
|
| 153 |
border-left: 4px solid #007bff;
|
| 154 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
</style>
|
| 156 |
""", unsafe_allow_html=True)
|
| 157 |
|
|
@@ -215,13 +304,270 @@ def serialize_documents(sources):
|
|
| 215 |
|
| 216 |
return serialized
|
| 217 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
@st.cache_data
|
| 219 |
def load_filter_options():
|
| 220 |
try:
|
| 221 |
-
|
|
|
|
| 222 |
return json.load(f)
|
| 223 |
except FileNotFoundError:
|
| 224 |
-
st.info(
|
| 225 |
st.error("filter_options.json not found. Please run the metadata analysis script.")
|
| 226 |
return {"sources": [], "years": [], "districts": [], 'filenames': []}
|
| 227 |
|
|
@@ -254,16 +600,8 @@ def main():
|
|
| 254 |
st.session_state.reset_conversation = False
|
| 255 |
st.rerun()
|
| 256 |
|
| 257 |
-
# Header
|
| 258 |
-
|
| 259 |
-
with col1:
|
| 260 |
-
st.markdown('<h1 class="main-header">π€ Intelligent Audit Report Chatbot</h1>', unsafe_allow_html=True)
|
| 261 |
-
with col2:
|
| 262 |
-
system_type = get_system_type()
|
| 263 |
-
if "Multi-Agent" in system_type:
|
| 264 |
-
st.success(f"π§ {system_type}")
|
| 265 |
-
else:
|
| 266 |
-
st.info(f"π§ {system_type}")
|
| 267 |
st.markdown('<p class="subtitle">Ask questions about audit reports. Use the sidebar filters to narrow down your search!</p>', unsafe_allow_html=True)
|
| 268 |
|
| 269 |
# Session info
|
|
@@ -280,6 +618,40 @@ def main():
|
|
| 280 |
|
| 281 |
# Sidebar for filters
|
| 282 |
with st.sidebar:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
st.markdown("### π Search Filters")
|
| 284 |
st.markdown("Select filters to narrow down your search. Leave empty to search all data.")
|
| 285 |
|
|
@@ -294,11 +666,13 @@ def main():
|
|
| 294 |
help="Choose specific reports to search. When enabled, all other filters are ignored."
|
| 295 |
)
|
| 296 |
st.markdown('</div>', unsafe_allow_html=True)
|
|
|
|
|
|
|
| 297 |
|
| 298 |
# Determine if filename filter is active
|
| 299 |
filename_mode = len(selected_filenames) > 0
|
| 300 |
# Sources filter
|
| 301 |
-
st.markdown('<div class="filter-section">', unsafe_allow_html=True)
|
| 302 |
st.markdown('<div class="filter-title">π Sources</div>', unsafe_allow_html=True)
|
| 303 |
selected_sources = st.multiselect(
|
| 304 |
"Select sources:",
|
|
@@ -311,7 +685,7 @@ def main():
|
|
| 311 |
st.markdown('</div>', unsafe_allow_html=True)
|
| 312 |
|
| 313 |
# Years filter
|
| 314 |
-
st.markdown('<div class="filter-section">', unsafe_allow_html=True)
|
| 315 |
st.markdown('<div class="filter-title">π
Years</div>', unsafe_allow_html=True)
|
| 316 |
selected_years = st.multiselect(
|
| 317 |
"Select years:",
|
|
@@ -324,7 +698,7 @@ def main():
|
|
| 324 |
st.markdown('</div>', unsafe_allow_html=True)
|
| 325 |
|
| 326 |
# Districts filter
|
| 327 |
-
st.markdown('<div class="filter-section">', unsafe_allow_html=True)
|
| 328 |
st.markdown('<div class="filter-title">ποΈ Districts</div>', unsafe_allow_html=True)
|
| 329 |
selected_districts = st.multiselect(
|
| 330 |
"Select districts:",
|
|
@@ -375,12 +749,85 @@ def main():
|
|
| 375 |
if 'input_counter' not in st.session_state:
|
| 376 |
st.session_state.input_counter = 0
|
| 377 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
user_input = st.text_input(
|
| 379 |
"Type your message here...",
|
| 380 |
placeholder="Ask about budget allocations, expenditures, or audit findings...",
|
| 381 |
-
key=f"user_input_{
|
| 382 |
-
label_visibility="collapsed"
|
|
|
|
| 383 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 384 |
|
| 385 |
with col2:
|
| 386 |
send_button = st.button("Send", key="send_button", use_container_width=True)
|
|
@@ -389,12 +836,11 @@ def main():
|
|
| 389 |
if st.button("ποΈ Clear Chat", key="clear_chat_button"):
|
| 390 |
st.session_state.reset_conversation = True
|
| 391 |
# Clear all conversation files
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
os.remove(os.path.join(conversations_dir, file))
|
| 398 |
st.rerun()
|
| 399 |
|
| 400 |
# Handle user input
|
|
@@ -484,14 +930,30 @@ def main():
|
|
| 484 |
# Count unique filenames
|
| 485 |
unique_filenames = set()
|
| 486 |
for doc in sources:
|
| 487 |
-
|
|
|
|
| 488 |
unique_filenames.add(filename)
|
| 489 |
|
| 490 |
-
st.markdown(f"**Found {len(sources)} document chunks from {len(unique_filenames)} unique documents (showing top
|
| 491 |
if len(unique_filenames) < len(sources):
|
| 492 |
st.info(f"π‘ **Note**: Each document is split into multiple chunks. You're seeing {len(sources)} chunks from {len(unique_filenames)} documents.")
|
| 493 |
|
| 494 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
# Get relevance score and ID if available
|
| 496 |
metadata = getattr(doc, 'metadata', {})
|
| 497 |
score = metadata.get('reranked_score', metadata.get('original_score', None))
|
|
@@ -524,6 +986,44 @@ def main():
|
|
| 524 |
st.info("No documents were retrieved for the last query.")
|
| 525 |
else:
|
| 526 |
st.info("No documents have been retrieved yet. Start a conversation to see retrieved documents here.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 527 |
|
| 528 |
# Feedback Dashboard Section
|
| 529 |
st.markdown("---")
|
|
@@ -543,200 +1043,271 @@ def main():
|
|
| 543 |
if 'feedback_submitted' not in st.session_state:
|
| 544 |
st.session_state.feedback_submitted = False
|
| 545 |
|
| 546 |
-
# Feedback form
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
with col1:
|
| 551 |
-
feedback_score = st.slider(
|
| 552 |
-
"Rate this conversation (1-5)",
|
| 553 |
-
min_value=1,
|
| 554 |
-
max_value=5,
|
| 555 |
-
help="How satisfied are you with the conversation?"
|
| 556 |
-
)
|
| 557 |
-
|
| 558 |
-
with col2:
|
| 559 |
-
is_feedback_about_last_retrieval = st.checkbox(
|
| 560 |
-
"Feedback about last retrieval only",
|
| 561 |
-
value=True,
|
| 562 |
-
help="If checked, feedback applies to the most recent document retrieval"
|
| 563 |
-
)
|
| 564 |
-
|
| 565 |
-
open_ended_feedback = st.text_area(
|
| 566 |
-
"Your feedback (optional)",
|
| 567 |
-
placeholder="Tell us what went well or what could be improved...",
|
| 568 |
-
height=100
|
| 569 |
-
)
|
| 570 |
-
|
| 571 |
-
# Disable submit if no score selected
|
| 572 |
-
submit_disabled = feedback_score is None
|
| 573 |
-
|
| 574 |
-
submitted = st.form_submit_button(
|
| 575 |
-
"π€ Submit Feedback",
|
| 576 |
-
use_container_width=True,
|
| 577 |
-
disabled=submit_disabled
|
| 578 |
-
)
|
| 579 |
-
|
| 580 |
-
if submitted and not st.session_state.feedback_submitted:
|
| 581 |
-
# Log the feedback data being submitted
|
| 582 |
-
print("=" * 80)
|
| 583 |
-
print("π FEEDBACK SUBMISSION: Starting...")
|
| 584 |
-
print("=" * 80)
|
| 585 |
-
st.write("π **Debug: Feedback Data Being Submitted:**")
|
| 586 |
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
"timestamp": time.time(),
|
| 595 |
-
"message_count": len(st.session_state.messages),
|
| 596 |
-
"has_retrievals": has_retrievals,
|
| 597 |
-
"retrieval_count": len(st.session_state.rag_retrieval_history)
|
| 598 |
-
}
|
| 599 |
|
| 600 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 601 |
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
|
| 607 |
-
st.write(f"β
**Feedback Object Created**")
|
| 608 |
-
st.write(f"- Feedback ID: {feedback_obj.feedback_id}")
|
| 609 |
-
st.write(f"- Score: {feedback_obj.score}/5")
|
| 610 |
-
st.write(f"- Has Retrievals: {feedback_obj.has_retrievals}")
|
| 611 |
-
|
| 612 |
-
# Convert back to dict for JSON serialization
|
| 613 |
-
feedback_data = feedback_obj.to_dict()
|
| 614 |
-
except Exception as e:
|
| 615 |
-
print(f"β FEEDBACK SUBMISSION: Failed to create feedback object: {e}")
|
| 616 |
-
st.error(f"Failed to create feedback object: {e}")
|
| 617 |
-
feedback_data = feedback_dict
|
| 618 |
-
|
| 619 |
-
# Display the data being submitted
|
| 620 |
-
st.json(feedback_data)
|
| 621 |
|
| 622 |
-
#
|
| 623 |
-
|
| 624 |
-
try:
|
| 625 |
-
# Ensure directory exists with write permissions (777 for compatibility)
|
| 626 |
-
feedback_dir.mkdir(parents=True, mode=0o777, exist_ok=True)
|
| 627 |
-
except (PermissionError, OSError) as e:
|
| 628 |
-
logger.warning(f"Could not create feedback directory at {feedback_dir}: {e}")
|
| 629 |
-
# Fallback to relative path
|
| 630 |
-
feedback_dir = Path("feedback")
|
| 631 |
-
feedback_dir.mkdir(parents=True, mode=0o777, exist_ok=True)
|
| 632 |
|
| 633 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 634 |
|
| 635 |
-
|
| 636 |
-
#
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
with open(feedback_file, 'w') as f:
|
| 642 |
-
json.dump(feedback_data, f, indent=2, default=str)
|
| 643 |
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 647 |
|
| 648 |
-
|
| 649 |
-
logger.info("π FEEDBACK SAVE: Starting Snowflake save process...")
|
| 650 |
-
logger.info(f"π FEEDBACK SAVE: feedback_obj={'exists' if feedback_obj else 'None'}")
|
| 651 |
|
|
|
|
|
|
|
| 652 |
try:
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
|
|
|
|
|
|
|
|
|
| 656 |
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
try:
|
| 660 |
-
from src.reporting.snowflake_connector import save_to_snowflake
|
| 661 |
-
logger.info("π€ SNOWFLAKE UI: Attempting to save feedback to Snowflake...")
|
| 662 |
-
print("π€ SNOWFLAKE UI: Attempting to save feedback to Snowflake...") # Also print to terminal
|
| 663 |
-
|
| 664 |
-
if save_to_snowflake(feedback_obj):
|
| 665 |
-
logger.info("β
SNOWFLAKE UI: Successfully saved to Snowflake")
|
| 666 |
-
print("β
SNOWFLAKE UI: Successfully saved to Snowflake") # Also print to terminal
|
| 667 |
-
st.success("β
Feedback also saved to Snowflake!")
|
| 668 |
-
else:
|
| 669 |
-
logger.warning("β οΈ SNOWFLAKE UI: Save failed")
|
| 670 |
-
print("β οΈ SNOWFLAKE UI: Save failed") # Also print to terminal
|
| 671 |
-
st.warning("β οΈ Snowflake save failed, but local save succeeded")
|
| 672 |
-
except Exception as e:
|
| 673 |
-
logger.error(f"β SNOWFLAKE UI ERROR: {e}")
|
| 674 |
-
print(f"β SNOWFLAKE UI ERROR: {e}") # Also print to terminal
|
| 675 |
-
import traceback
|
| 676 |
-
traceback.print_exc() # Print full traceback to terminal
|
| 677 |
-
st.warning(f"β οΈ Could not save to Snowflake: {e}")
|
| 678 |
-
else:
|
| 679 |
-
logger.warning("β οΈ SNOWFLAKE UI: Skipping (feedback object not created)")
|
| 680 |
-
print("β οΈ SNOWFLAKE UI: Skipping (feedback object not created)") # Also print to terminal
|
| 681 |
-
st.warning("β οΈ Skipping Snowflake save (feedback object not created)")
|
| 682 |
-
else:
|
| 683 |
-
logger.info("π‘ SNOWFLAKE UI: Integration disabled")
|
| 684 |
-
print("π‘ SNOWFLAKE UI: Integration disabled") # Also print to terminal
|
| 685 |
-
st.info("π‘ Snowflake integration disabled (set SNOWFLAKE_ENABLED=true to enable)")
|
| 686 |
-
except NameError as e:
|
| 687 |
-
import traceback
|
| 688 |
-
traceback.print_exc()
|
| 689 |
-
logger.error(f"β NameError in Snowflake save: {e}")
|
| 690 |
-
print(f"β NameError in Snowflake save: {e}") # Also print to terminal
|
| 691 |
-
st.warning(f"β οΈ Snowflake save error: {e}")
|
| 692 |
except Exception as e:
|
| 693 |
-
|
| 694 |
-
|
| 695 |
-
|
| 696 |
|
| 697 |
-
#
|
| 698 |
-
st.
|
| 699 |
|
| 700 |
-
|
| 701 |
-
|
| 702 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 703 |
|
| 704 |
-
|
| 705 |
-
st.info(f"π Feedback saved to: {feedback_file}")
|
| 706 |
|
| 707 |
-
|
| 708 |
-
|
| 709 |
-
|
| 710 |
-
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
|
| 715 |
-
|
| 716 |
-
|
| 717 |
-
|
| 718 |
-
|
| 719 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 720 |
|
| 721 |
-
#
|
| 722 |
-
|
| 723 |
-
|
| 724 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 725 |
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
|
| 729 |
-
|
| 730 |
-
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
| 738 |
-
|
| 739 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 740 |
|
| 741 |
# Auto-scroll to bottom
|
| 742 |
st.markdown("""
|
|
|
|
| 3 |
"""
|
| 4 |
|
| 5 |
import os
|
| 6 |
+
|
| 7 |
+
import time
|
| 8 |
+
import json
|
| 9 |
+
import uuid
|
| 10 |
+
import logging
|
| 11 |
+
import traceback
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from typing import List, Dict, Any
|
| 14 |
+
from collections import Counter
|
| 15 |
+
|
| 16 |
+
import streamlit as st
|
| 17 |
+
from langchain_core.messages import HumanMessage, AIMessage
|
| 18 |
+
import pandas as pd
|
| 19 |
+
import plotly.express as px
|
| 20 |
+
|
| 21 |
+
from multi_agent_chatbot import get_multi_agent_chatbot
|
| 22 |
+
from smart_chatbot import get_chatbot as get_smart_chatbot
|
| 23 |
+
from src.reporting.feedback_schema import create_feedback_from_dict
|
| 24 |
+
from src.reporting.snowflake_connector import save_to_snowflake
|
| 25 |
+
from src.config.paths import (
|
| 26 |
+
IS_DEPLOYED,
|
| 27 |
+
PROJECT_DIR,
|
| 28 |
+
HF_CACHE_DIR,
|
| 29 |
+
FEEDBACK_DIR,
|
| 30 |
+
CONVERSATIONS_DIR,
|
| 31 |
+
)
|
| 32 |
|
| 33 |
# ===== CRITICAL: Fix OMP_NUM_THREADS FIRST, before ANY other imports =====
|
| 34 |
# Some libraries load at import time and will fail if OMP_NUM_THREADS is invalid
|
|
|
|
| 54 |
|
| 55 |
# ===== Setup HuggingFace cache directories BEFORE any model imports =====
|
| 56 |
# CRITICAL: Set these before any imports that might use HuggingFace (like sentence-transformers)
|
| 57 |
+
# Only override cache directories in deployed environment (local uses defaults)
|
| 58 |
+
# Point every HuggingFace-related cache at a single directory, but only in the
# deployed environment; local runs keep the libraries' default locations.
# NOTE(review): the chatbot imports at the top of this file
# (multi_agent_chatbot, smart_chatbot, ...) have already executed by the time
# these variables are set -- confirm that none of them reads the cache
# settings at import time.
if IS_DEPLOYED and HF_CACHE_DIR:
    cache_dir = str(HF_CACHE_DIR)
    os.environ["HF_HOME"] = cache_dir
    os.environ["TRANSFORMERS_CACHE"] = cache_dir
    os.environ["HF_DATASETS_CACHE"] = cache_dir
    os.environ["HF_HUB_CACHE"] = cache_dir
    os.environ["SENTENCE_TRANSFORMERS_HOME"] = cache_dir

    # Ensure cache directory exists (created in Dockerfile, but ensure it's there)
    try:
        os.makedirs(cache_dir, mode=0o755, exist_ok=True)
    except OSError as exc:  # PermissionError is a subclass of OSError
        # Best effort: keep going (the directory might already exist from the
        # Dockerfile), but leave a trace instead of failing silently. Logging
        # is not configured yet at this point, so this goes through the
        # logging module's fallback handler.
        logging.getLogger(__name__).warning(
            "Could not create HuggingFace cache directory %s: %s", cache_dir, exc
        )

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Log environment setup for debugging
logger.info(f"π Environment: {'DEPLOYED' if IS_DEPLOYED else 'LOCAL'}")
logger.info(f"π PROJECT_DIR: {PROJECT_DIR}")
logger.info(f"π HuggingFace cache: {os.environ.get('HF_HOME', 'DEFAULT (not overridden)')}")
logger.info(f"π§ OMP_NUM_THREADS: {os.environ.get('OMP_NUM_THREADS', 'NOT SET')}")
|
| 82 |
|
| 83 |
|
|
|
|
| 107 |
margin-bottom: 2rem;
|
| 108 |
}
|
| 109 |
|
| 110 |
+
.example-questions-header {
|
| 111 |
+
text-align: center;
|
| 112 |
+
margin-bottom: 1rem;
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
.example-questions-description {
|
| 116 |
+
text-align: center;
|
| 117 |
+
color: #666;
|
| 118 |
+
margin-bottom: 2rem;
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
/* Hide ALL default Streamlit text input help messages about Enter key */
|
| 122 |
+
/* This is the key one - hides "Press Enter to apply" message inside input field */
|
| 123 |
+
div[data-testid="InputInstructions"],
|
| 124 |
+
span[data-testid="InputInstructions"],
|
| 125 |
+
*[data-testid="InputInstructions"] {
|
| 126 |
+
display: none !important;
|
| 127 |
+
visibility: hidden !important;
|
| 128 |
+
opacity: 0 !important;
|
| 129 |
+
height: 0 !important;
|
| 130 |
+
width: 0 !important;
|
| 131 |
+
overflow: hidden !important;
|
| 132 |
+
position: absolute !important;
|
| 133 |
+
left: -9999px !important;
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
/* Also hide other potential locations */
|
| 137 |
+
div[data-testid="stTextInput"] + div > small,
|
| 138 |
+
div[data-testid="stTextInput"] ~ div > small,
|
| 139 |
+
div[data-testid="stTextInputContainer"] + div > small,
|
| 140 |
+
div[data-testid="stTextInputContainer"] ~ div > small,
|
| 141 |
+
div[data-baseweb="input"] + div > small,
|
| 142 |
+
div[data-baseweb="input"] ~ div > small {
|
| 143 |
+
display: none !important;
|
| 144 |
+
visibility: hidden !important;
|
| 145 |
+
opacity: 0 !important;
|
| 146 |
+
height: 0 !important;
|
| 147 |
+
overflow: hidden !important;
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
/* Custom help text for input */
|
| 151 |
+
.input-help-text {
|
| 152 |
+
font-size: 0.85rem;
|
| 153 |
+
color: #666;
|
| 154 |
+
margin-top: 0.25rem;
|
| 155 |
+
text-align: left;
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
.session-info {
|
| 159 |
background-color: #f0f2f6;
|
| 160 |
padding: 10px;
|
|
|
|
| 213 |
margin: 10px 0;
|
| 214 |
border-left: 4px solid #007bff;
|
| 215 |
}
|
| 216 |
+
|
| 217 |
+
.retrieval-distribution-container {
|
| 218 |
+
background-color: #ffffff;
|
| 219 |
+
padding: 25px;
|
| 220 |
+
border-radius: 10px;
|
| 221 |
+
margin: 20px 0;
|
| 222 |
+
border: 2px solid #e0e0e0;
|
| 223 |
+
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1), 0 2px 4px rgba(0, 0, 0, 0.06);
|
| 224 |
+
}
|
| 225 |
+
|
| 226 |
+
.metric-label {
|
| 227 |
+
font-size: 0.9rem;
|
| 228 |
+
color: #555;
|
| 229 |
+
margin-bottom: 5px;
|
| 230 |
+
text-align: center;
|
| 231 |
+
}
|
| 232 |
+
|
| 233 |
+
.metric-value {
|
| 234 |
+
font-size: 1.8rem;
|
| 235 |
+
font-weight: bold;
|
| 236 |
+
color: #000000;
|
| 237 |
+
text-align: center;
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
.metric-container {
|
| 241 |
+
text-align: center;
|
| 242 |
+
padding: 10px;
|
| 243 |
+
}
|
| 244 |
</style>
|
| 245 |
""", unsafe_allow_html=True)
|
| 246 |
|
|
|
|
| 304 |
|
| 305 |
return serialized
|
| 306 |
|
| 307 |
+
def _normalize_year(raw_year: Any) -> str:
    """Return *raw_year* as a plain year string, or 'Unknown' if it is unusable."""
    if not raw_year or raw_year == 'Unknown':
        return 'Unknown'
    try:
        # Go through float so that values such as "2021.0" are accepted.
        year_int = int(float(raw_year))
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int(float("inf")) -- treat it like any other bad value.
        return 'Unknown'
    # Only accept values inside a plausible year range.
    return str(year_int) if 1900 <= year_int <= 2030 else 'Unknown'


def extract_chunk_statistics(sources: List[Any]) -> Dict[str, Any]:
    """Extract statistics from retrieved chunks.

    Args:
        sources: Retrieved documents. Each item is expected to expose a
            ``metadata`` mapping; a missing or ``None`` ``metadata`` is
            treated as empty, so every field falls back to ``'Unknown'``.

    Returns:
        An empty dict when ``sources`` is empty. Otherwise a dict with:
        ``total_chunks``; the ``unique_*`` counts (years and districts do not
        count ``'Unknown'``); the ``*_distribution`` value -> count mappings;
        and the per-chunk lists ``sources``, ``years``, ``filenames`` and
        ``districts``, in the same order as the input.
    """
    if not sources:
        return {}

    sources_list = []
    years = []
    filenames = []
    districts = []

    for doc in sources:
        # `or {}` also covers documents whose metadata attribute is None.
        metadata = getattr(doc, 'metadata', None) or {}

        sources_list.append(metadata.get('source', 'Unknown'))
        years.append(_normalize_year(metadata.get('year', 'Unknown')))
        filenames.append(metadata.get('filename', 'Unknown'))
        # Empty / missing districts are reported as 'Unknown'.
        districts.append(metadata.get('district') or 'Unknown')

    # Count occurrences
    source_counts = Counter(sources_list)
    year_counts = Counter(years)
    filename_counts = Counter(filenames)
    district_counts = Counter(districts)

    return {
        'total_chunks': len(sources),
        'unique_sources': len(source_counts),
        'unique_years': sum(1 for year in year_counts if year != 'Unknown'),
        'unique_filenames': len(filename_counts),
        'unique_districts': sum(1 for district in district_counts if district != 'Unknown'),
        'source_distribution': dict(source_counts),
        'year_distribution': dict(year_counts),
        'filename_distribution': dict(filename_counts),
        'district_distribution': dict(district_counts),
        'sources': sources_list,
        'years': years,
        'filenames': filenames,
        'districts': districts,
    }
|
| 371 |
+
|
| 372 |
+
def display_chunk_statistics_charts(stats: Dict[str, Any], title: str = "Retrieval Statistics"):
    """Display statistics as interactive charts for 10+ results.

    Renders a styled header with four headline metrics, followed by three
    bar charts (sources, years, districts) laid out side by side.

    Args:
        stats: Statistics mapping with the keys produced by
            ``extract_chunk_statistics``. Nothing is rendered when it is
            empty or reports zero chunks.
        title: Heading shown above the metrics.
    """
    if not stats or stats.get('total_chunks', 0) == 0:
        return

    # Streamlit renders each st.markdown() call as its own element, so a <div>
    # opened in one call cannot enclose the output of later calls. The styled
    # container is therefore opened AND closed inside this single call.
    st.markdown(f"""
    <div class="retrieval-distribution-container">
        <h3 style="margin-top: 0;">π {title}</h3>
        <div style="display: flex; justify-content: space-around; align-items: center; padding: 15px 0; border-bottom: 1px solid #e0e0e0; margin-bottom: 20px;">
            <div class="metric-container">
                <div class="metric-label">Total Chunks</div>
                <div class="metric-value">{stats['total_chunks']}</div>
            </div>
            <div class="metric-container">
                <div class="metric-label">Unique Sources</div>
                <div class="metric-value">{stats.get('unique_sources', 0)}</div>
            </div>
            <div class="metric-container">
                <div class="metric-label">Unique Years</div>
                <div class="metric-value">{stats.get('unique_years', 0)}</div>
            </div>
            <div class="metric-container">
                <div class="metric-label">Unique Files</div>
                <div class="metric-value">{stats.get('unique_filenames', 0)}</div>
            </div>
        </div>
    </div>
    """, unsafe_allow_html=True)

    # Charts - three columns to include Districts
    col1, col2, col3 = st.columns(3)

    with col1:
        # Source distribution chart
        source_distribution = stats.get('source_distribution')
        if source_distribution:
            source_df = pd.DataFrame(
                list(source_distribution.items()),
                columns=['Source', 'Count']
            )
            fig_source = px.bar(
                source_df,
                x='Count',
                y='Source',
                orientation='h',
                title='Distribution by Source',
                color='Count',
                color_continuous_scale='viridis'
            )
            fig_source.update_layout(height=400, showlegend=False)
            st.plotly_chart(fig_source, use_container_width=True)

    with col2:
        # Year distribution chart
        year_distribution = stats.get('year_distribution')
        if year_distribution:
            # Filter out 'Unknown' years for the chart
            year_dist_filtered = {k: v for k, v in year_distribution.items() if k != 'Unknown'}
            if year_dist_filtered:
                year_df = pd.DataFrame(
                    list(year_dist_filtered.items()),
                    columns=['Year', 'Count']
                )
                # Sort by year as integer but keep as string for categorical display
                year_df['Year_Int'] = year_df['Year'].astype(int)
                year_df = year_df.sort_values('Year_Int').drop('Year_Int', axis=1)

                fig_year = px.bar(
                    year_df,
                    x='Year',
                    y='Count',
                    title='Distribution by Year',
                    color='Count',
                    color_continuous_scale='plasma'
                )
                # Ensure years are treated as categorical (discrete) not continuous
                fig_year.update_xaxes(type='category')
                fig_year.update_layout(height=400, showlegend=False)
                st.plotly_chart(fig_year, use_container_width=True)
            else:
                st.info("No valid years found in the results")

    with col3:
        # District distribution chart
        district_distribution = stats.get('district_distribution')
        if district_distribution:
            district_dist_filtered = {k: v for k, v in district_distribution.items() if k != 'Unknown'}
            if district_dist_filtered:
                district_df = pd.DataFrame(
                    list(district_dist_filtered.items()),
                    columns=['District', 'Count']
                )
                district_df = district_df.sort_values('Count', ascending=False)

                fig_district = px.bar(
                    district_df,
                    x='Count',
                    y='District',
                    orientation='h',
                    title='Distribution by District',
                    color='Count',
                    color_continuous_scale='blues'
                )
                fig_district.update_layout(height=400, showlegend=False)
                st.plotly_chart(fig_district, use_container_width=True)
            else:
                st.info("No valid districts found in the results")
|
| 479 |
+
|
| 480 |
+
def display_chunk_statistics_table(stats: Dict[str, Any], title: str = "Retrieval Distribution"):
    """Display statistics as tables for smaller results.

    Renders four equally sized columns (districts, sources, years, files),
    each holding a small value/count table or a "no data" note.

    Args:
        stats: Statistics mapping with the keys produced by
            ``extract_chunk_statistics``. Nothing is rendered when it is
            empty or reports zero chunks.
        title: Sub-header shown above the tables.
    """
    if not stats or stats.get('total_chunks', 0) == 0:
        return

    st.subheader(f"π {title}")

    # No HTML wrapper is opened for this view, so no closing </div> is
    # emitted either; the tables live in a plain Streamlit container.
    with st.container():
        # Create 4 equal columns for consistent alignment
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.markdown("**ποΈ Districts**")
            district_counts = {
                k: v
                for k, v in (stats.get('district_distribution') or {}).items()
                if k != 'Unknown'
            }
            if district_counts:
                district_df = pd.DataFrame({
                    "District": list(district_counts.keys()),
                    "Count": list(district_counts.values()),
                }).sort_values('Count', ascending=False)
                st.dataframe(district_df, hide_index=True, use_container_width=True)
            else:
                st.write("No district data")

        with col2:
            st.markdown("**π Sources**")
            source_counts = stats.get('source_distribution')
            if source_counts:
                source_df = pd.DataFrame({
                    "Source": list(source_counts.keys()),
                    "Count": list(source_counts.values()),
                }).sort_values('Count', ascending=False)
                st.dataframe(source_df, hide_index=True, use_container_width=True)
            else:
                st.write("No source data")

        with col3:
            st.markdown("**π Years**")
            year_counts = {
                k: v
                for k, v in (stats.get('year_distribution') or {}).items()
                if k != 'Unknown'
            }
            if year_counts:
                year_df = pd.DataFrame({
                    "Year": list(year_counts.keys()),
                    "Count": list(year_counts.values()),
                })
                # Sort by year as integer but display as string
                year_df['Year_Int'] = year_df['Year'].astype(int)
                year_df = year_df.sort_values('Year_Int')[['Year', 'Count']]
                st.dataframe(year_df, hide_index=True, use_container_width=True)
            else:
                st.write("No year data")

        with col4:
            st.markdown("**π Files**")
            filename_counts = stats.get('filename_distribution')
            if filename_counts:
                # Five most frequent files, most frequent first.
                top_files = sorted(filename_counts.items(), key=lambda item: item[1], reverse=True)[:5]

                # Show top files with truncated names
                file_df = pd.DataFrame({
                    "File": [name[:30] + "..." if len(name) > 30 else name for name, _ in top_files],
                    "Count": [count for _, count in top_files],
                })
                st.dataframe(file_df, hide_index=True, use_container_width=True)
            else:
                st.write("No file data")
|
| 562 |
+
|
| 563 |
@st.cache_data
def load_filter_options():
    """Load the sidebar filter options from ``src/config/filter_options.json``.

    Returns:
        The parsed JSON content. If the file does not exist, an info and an
        error message are shown in the UI and a mapping with empty
        ``sources``, ``years``, ``districts`` and ``filenames`` lists is
        returned instead.
    """
    filter_options_path = PROJECT_DIR / "src" / "config" / "filter_options.json"
    try:
        # Decode explicitly as UTF-8: open()'s default encoding depends on the
        # platform locale and could garble non-ASCII option values.
        with open(filter_options_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        st.info(f"Looking for filter_options.json in: {PROJECT_DIR / 'src' / 'config'}")
        st.error("filter_options.json not found. Please run the metadata analysis script.")
        return {"sources": [], "years": [], "districts": [], 'filenames': []}
|
| 573 |
|
|
|
|
| 600 |
st.session_state.reset_conversation = False
|
| 601 |
st.rerun()
|
| 602 |
|
| 603 |
+
# Header - centered
|
| 604 |
+
st.markdown('<h1 class="main-header">π€ Intelligent Audit Report Chatbot</h1>', unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 605 |
st.markdown('<p class="subtitle">Ask questions about audit reports. Use the sidebar filters to narrow down your search!</p>', unsafe_allow_html=True)
|
| 606 |
|
| 607 |
# Session info
|
|
|
|
| 618 |
|
| 619 |
# Sidebar for filters
|
| 620 |
with st.sidebar:
|
| 621 |
+
# Instructions section (collapsible)
|
| 622 |
+
with st.expander("π How to Use", expanded=False):
|
| 623 |
+
st.markdown("""
|
| 624 |
+
#### π― Using Filters
|
| 625 |
+
|
| 626 |
+
1. **Select filters** from the sidebar to narrow your search:
|
| 627 |
+
|
| 628 |
+
2. **Leave filters empty** to search across all data
|
| 629 |
+
|
| 630 |
+
3. **Type your question** in the chat and click "Send"
|
| 631 |
+
|
| 632 |
+
4. **Choose sample questions from the bottom of the page**
|
| 633 |
+
|
| 634 |
+
#### π‘ Tips
|
| 635 |
+
|
| 636 |
+
- Use specific questions for better results
|
| 637 |
+
- Combine multiple filters for precise searches
|
| 638 |
+
- Check the "Retrieved Documents" tab to get various insights
|
| 639 |
+
|
| 640 |
+
#### π¬ Feedback Section
|
| 641 |
+
|
| 642 |
+
- Rate your experience (1-5 stars)
|
| 643 |
+
- Provide optional text feedback
|
| 644 |
+
- Located at the bottom of the page
|
| 645 |
+
|
| 646 |
+
#### β οΈ Important
|
| 647 |
+
|
| 648 |
+
**When finished, please close the browser window** to free up computational resources.
|
| 649 |
+
|
| 650 |
+
---
|
| 651 |
+
|
| 652 |
+
For more detailed help, see the example questions at the bottom of the page.
|
| 653 |
+
""")
|
| 654 |
+
|
| 655 |
st.markdown("### π Search Filters")
|
| 656 |
st.markdown("Select filters to narrow down your search. Leave empty to search all data.")
|
| 657 |
|
|
|
|
| 666 |
help="Choose specific reports to search. When enabled, all other filters are ignored."
|
| 667 |
)
|
| 668 |
st.markdown('</div>', unsafe_allow_html=True)
|
| 669 |
+
|
| 670 |
+
st.markdown('---')
|
| 671 |
|
| 672 |
# Determine if filename filter is active
|
| 673 |
filename_mode = len(selected_filenames) > 0
|
| 674 |
# Sources filter
|
| 675 |
+
# st.markdown('<div class="filter-section">', unsafe_allow_html=True)
|
| 676 |
st.markdown('<div class="filter-title">π Sources</div>', unsafe_allow_html=True)
|
| 677 |
selected_sources = st.multiselect(
|
| 678 |
"Select sources:",
|
|
|
|
| 685 |
st.markdown('</div>', unsafe_allow_html=True)
|
| 686 |
|
| 687 |
# Years filter
|
| 688 |
+
# st.markdown('<div class="filter-section">', unsafe_allow_html=True)
|
| 689 |
st.markdown('<div class="filter-title">π
Years</div>', unsafe_allow_html=True)
|
| 690 |
selected_years = st.multiselect(
|
| 691 |
"Select years:",
|
|
|
|
| 698 |
st.markdown('</div>', unsafe_allow_html=True)
|
| 699 |
|
| 700 |
# Districts filter
|
| 701 |
+
# st.markdown('<div class="filter-section">', unsafe_allow_html=True)
|
| 702 |
st.markdown('<div class="filter-title">ποΈ Districts</div>', unsafe_allow_html=True)
|
| 703 |
selected_districts = st.multiselect(
|
| 704 |
"Select districts:",
|
|
|
|
| 749 |
if 'input_counter' not in st.session_state:
|
| 750 |
st.session_state.input_counter = 0
|
| 751 |
|
| 752 |
+
# Handle pending question from example questions section
|
| 753 |
+
if 'pending_question' in st.session_state and st.session_state.pending_question:
|
| 754 |
+
default_value = st.session_state.pending_question
|
| 755 |
+
# Increment counter to force new input widget
|
| 756 |
+
st.session_state.input_counter = (st.session_state.get('input_counter', 0) + 1) % 1000
|
| 757 |
+
del st.session_state.pending_question
|
| 758 |
+
key_suffix = st.session_state.input_counter
|
| 759 |
+
else:
|
| 760 |
+
default_value = ""
|
| 761 |
+
key_suffix = st.session_state.input_counter
|
| 762 |
+
|
| 763 |
user_input = st.text_input(
|
| 764 |
"Type your message here...",
|
| 765 |
placeholder="Ask about budget allocations, expenditures, or audit findings...",
|
| 766 |
+
key=f"user_input_{key_suffix}",
|
| 767 |
+
label_visibility="collapsed",
|
| 768 |
+
value=default_value if default_value else None
|
| 769 |
)
|
| 770 |
+
|
| 771 |
+
# Use JavaScript to specifically target and hide "Press Enter to apply" message
|
| 772 |
+
st.markdown("""
|
| 773 |
+
<script>
|
| 774 |
+
(function() {
|
| 775 |
+
// Hide InputInstructions element (contains "Press Enter to apply")
|
| 776 |
+
function hideInputInstructions() {
|
| 777 |
+
// Target the specific Streamlit element
|
| 778 |
+
const instructions = document.querySelector('[data-testid="InputInstructions"]');
|
| 779 |
+
if (instructions) {
|
| 780 |
+
instructions.style.display = 'none';
|
| 781 |
+
instructions.style.visibility = 'hidden';
|
| 782 |
+
instructions.style.opacity = '0';
|
| 783 |
+
instructions.style.height = '0';
|
| 784 |
+
instructions.style.width = '0';
|
| 785 |
+
instructions.style.overflow = 'hidden';
|
| 786 |
+
instructions.style.position = 'absolute';
|
| 787 |
+
instructions.style.left = '-9999px';
|
| 788 |
+
}
|
| 789 |
+
|
| 790 |
+
// Also search for any text containing "Press Enter" or "apply" inside input containers
|
| 791 |
+
const allElements = document.querySelectorAll('*');
|
| 792 |
+
allElements.forEach(el => {
|
| 793 |
+
const text = el.textContent || el.innerText || '';
|
| 794 |
+
if ((text.toLowerCase().includes('press enter') ||
|
| 795 |
+
text.toLowerCase().includes('enter to') ||
|
| 796 |
+
text.toLowerCase().includes('to apply')) &&
|
| 797 |
+
(el.tagName === 'SPAN' || el.tagName === 'DIV' || el.tagName === 'SMALL')) {
|
| 798 |
+
const style = window.getComputedStyle(el);
|
| 799 |
+
const fontSize = parseFloat(style.fontSize);
|
| 800 |
+
// Hide if it's small text (likely help text)
|
| 801 |
+
if (fontSize < 14 || el.hasAttribute('data-testid')) {
|
| 802 |
+
el.style.display = 'none';
|
| 803 |
+
el.style.visibility = 'hidden';
|
| 804 |
+
el.style.height = '0';
|
| 805 |
+
el.style.overflow = 'hidden';
|
| 806 |
+
}
|
| 807 |
+
}
|
| 808 |
+
});
|
| 809 |
+
}
|
| 810 |
+
|
| 811 |
+
// Run immediately and after delays to catch dynamic elements
|
| 812 |
+
hideInputInstructions();
|
| 813 |
+
setTimeout(hideInputInstructions, 50);
|
| 814 |
+
setTimeout(hideInputInstructions, 100);
|
| 815 |
+
setTimeout(hideInputInstructions, 500);
|
| 816 |
+
|
| 817 |
+
// Observe for new elements added by Streamlit
|
| 818 |
+
const observer = new MutationObserver(function(mutations) {
|
| 819 |
+
hideInputInstructions();
|
| 820 |
+
});
|
| 821 |
+
observer.observe(document.body, { childList: true, subtree: true, attributes: true });
|
| 822 |
+
})();
|
| 823 |
+
</script>
|
| 824 |
+
""", unsafe_allow_html=True)
|
| 825 |
+
|
| 826 |
+
# # Show custom help text below input - this replaces the default "Press Enter" message
|
| 827 |
+
# st.markdown(
|
| 828 |
+
# "<div class='input-help-text'>π‘ Press the <strong>Send</strong> button to submit your question</div>",
|
| 829 |
+
# unsafe_allow_html=True
|
| 830 |
+
# )
|
| 831 |
|
| 832 |
with col2:
|
| 833 |
send_button = st.button("Send", key="send_button", use_container_width=True)
|
|
|
|
| 836 |
if st.button("ποΈ Clear Chat", key="clear_chat_button"):
|
| 837 |
st.session_state.reset_conversation = True
|
| 838 |
# Clear all conversation files
|
| 839 |
+
conversations_path = CONVERSATIONS_DIR
|
| 840 |
+
if conversations_path.exists():
|
| 841 |
+
for file in conversations_path.iterdir():
|
| 842 |
+
if file.suffix == '.json':
|
| 843 |
+
file.unlink()
|
|
|
|
| 844 |
st.rerun()
|
| 845 |
|
| 846 |
# Handle user input
|
|
|
|
| 930 |
# Count unique filenames
|
| 931 |
unique_filenames = set()
|
| 932 |
for doc in sources:
|
| 933 |
+
metadata = getattr(doc, 'metadata', {})
|
| 934 |
+
filename = metadata.get('filename', 'Unknown')
|
| 935 |
unique_filenames.add(filename)
|
| 936 |
|
| 937 |
+
st.markdown(f"**Found {len(sources)} document chunks from {len(unique_filenames)} unique documents (showing top 20):**")
|
| 938 |
if len(unique_filenames) < len(sources):
|
| 939 |
st.info(f"π‘ **Note**: Each document is split into multiple chunks. You're seeing {len(sources)} chunks from {len(unique_filenames)} documents.")
|
| 940 |
|
| 941 |
+
# Extract and display statistics
|
| 942 |
+
stats = extract_chunk_statistics(sources)
|
| 943 |
+
|
| 944 |
+
# Show charts for 10+ results, tables for fewer
|
| 945 |
+
if len(sources) >= 10:
|
| 946 |
+
display_chunk_statistics_charts(stats, "Retrieval Statistics")
|
| 947 |
+
# Also show tables below charts for detailed view
|
| 948 |
+
st.markdown("---")
|
| 949 |
+
display_chunk_statistics_table(stats, "Retrieval Distribution")
|
| 950 |
+
else:
|
| 951 |
+
display_chunk_statistics_table(stats, "Retrieval Distribution")
|
| 952 |
+
|
| 953 |
+
st.markdown("---")
|
| 954 |
+
st.markdown("### π Document Details")
|
| 955 |
+
|
| 956 |
+
for i, doc in enumerate(sources): # Show all documents
|
| 957 |
# Get relevance score and ID if available
|
| 958 |
metadata = getattr(doc, 'metadata', {})
|
| 959 |
score = metadata.get('reranked_score', metadata.get('original_score', None))
|
|
|
|
| 986 |
st.info("No documents were retrieved for the last query.")
|
| 987 |
else:
|
| 988 |
st.info("No documents have been retrieved yet. Start a conversation to see retrieved documents here.")
|
| 989 |
+
|
| 990 |
+
# Display retrieval history stats
|
| 991 |
+
st.markdown("---")
|
| 992 |
+
if st.session_state.rag_retrieval_history:
|
| 993 |
+
st.markdown("#### π Retrieval History")
|
| 994 |
+
st.markdown(f"This conversation has **{len(st.session_state.rag_retrieval_history)}** retrieval entries.")
|
| 995 |
+
|
| 996 |
+
with st.expander(f"View {len(st.session_state.rag_retrieval_history)} retrieval entries", expanded=False):
|
| 997 |
+
for idx, entry in enumerate(st.session_state.rag_retrieval_history, 1):
|
| 998 |
+
with st.expander(f"Entry {idx}: {entry.get('rag_query_expansion', 'N/A')[:50]}...", expanded=False):
|
| 999 |
+
st.markdown(f"**Query:** {entry.get('rag_query_expansion', 'N/A')}")
|
| 1000 |
+
st.markdown(f"**Documents Retrieved:** {len(entry.get('docs_retrieved', []))}")
|
| 1001 |
+
|
| 1002 |
+
# Show conversation up to this point
|
| 1003 |
+
conversation = entry.get('conversation_up_to', [])
|
| 1004 |
+
if conversation:
|
| 1005 |
+
st.markdown("**Conversation Context:**")
|
| 1006 |
+
for msg in conversation[-3:]: # Show last 3 messages
|
| 1007 |
+
role = msg.get('type', 'unknown')
|
| 1008 |
+
content = msg.get('content', '')[:200] + "..." if len(msg.get('content', '')) > 200 else msg.get('content', '')
|
| 1009 |
+
if role == 'human':
|
| 1010 |
+
st.markdown(f"- **You:** {content}")
|
| 1011 |
+
elif role == 'ai':
|
| 1012 |
+
st.markdown(f"- **Bot:** {content}")
|
| 1013 |
+
|
| 1014 |
+
# Show retrieved documents summary
|
| 1015 |
+
docs = entry.get('docs_retrieved', [])
|
| 1016 |
+
if docs:
|
| 1017 |
+
st.markdown("**Retrieved Documents:**")
|
| 1018 |
+
for doc_idx, doc in enumerate(docs[:5], 1): # Show first 5
|
| 1019 |
+
doc_meta = doc.get('metadata', {})
|
| 1020 |
+
filename = doc_meta.get('filename', 'Unknown')[:50]
|
| 1021 |
+
st.markdown(f"{doc_idx}. {filename}")
|
| 1022 |
+
if len(docs) > 5:
|
| 1023 |
+
st.markdown(f"... and {len(docs) - 5} more documents")
|
| 1024 |
+
else:
|
| 1025 |
+
st.markdown("---")
|
| 1026 |
+
st.info("π Retrieval history will appear here after you start asking questions.")
|
| 1027 |
|
| 1028 |
# Feedback Dashboard Section
|
| 1029 |
st.markdown("---")
|
|
|
|
| 1043 |
if 'feedback_submitted' not in st.session_state:
|
| 1044 |
st.session_state.feedback_submitted = False
|
| 1045 |
|
| 1046 |
+
# Feedback form - only show if feedback not already submitted
|
| 1047 |
+
if not st.session_state.feedback_submitted:
|
| 1048 |
+
with st.form("feedback_form", clear_on_submit=False):
|
| 1049 |
+
col1, col2 = st.columns([1, 1])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1050 |
|
| 1051 |
+
with col1:
|
| 1052 |
+
feedback_score = st.slider(
|
| 1053 |
+
"Rate this conversation (1-5)",
|
| 1054 |
+
min_value=1,
|
| 1055 |
+
max_value=5,
|
| 1056 |
+
help="How satisfied are you with the conversation?"
|
| 1057 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1058 |
|
| 1059 |
+
with col2:
|
| 1060 |
+
is_feedback_about_last_retrieval = st.checkbox(
|
| 1061 |
+
"Feedback about last retrieval only",
|
| 1062 |
+
value=True,
|
| 1063 |
+
help="If checked, feedback applies to the most recent document retrieval"
|
| 1064 |
+
)
|
| 1065 |
|
| 1066 |
+
open_ended_feedback = st.text_area(
|
| 1067 |
+
"Your feedback (optional)",
|
| 1068 |
+
placeholder="Tell us what went well or what could be improved...",
|
| 1069 |
+
height=100
|
| 1070 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1071 |
|
| 1072 |
+
# Disable submit if no score selected
|
| 1073 |
+
submit_disabled = feedback_score is None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1074 |
|
| 1075 |
+
submitted = st.form_submit_button(
|
| 1076 |
+
"π€ Submit Feedback",
|
| 1077 |
+
use_container_width=True,
|
| 1078 |
+
disabled=submit_disabled
|
| 1079 |
+
)
|
| 1080 |
|
| 1081 |
+
if submitted:
|
| 1082 |
+
# Log the feedback data being submitted
|
| 1083 |
+
print("=" * 80)
|
| 1084 |
+
print("π FEEDBACK SUBMISSION: Starting...")
|
| 1085 |
+
print("=" * 80)
|
| 1086 |
+
st.write("π **Debug: Feedback Data Being Submitted:**")
|
|
|
|
|
|
|
| 1087 |
|
| 1088 |
+
# Create feedback data dictionary
|
| 1089 |
+
feedback_dict = {
|
| 1090 |
+
"open_ended_feedback": open_ended_feedback,
|
| 1091 |
+
"score": feedback_score,
|
| 1092 |
+
"is_feedback_about_last_retrieval": is_feedback_about_last_retrieval,
|
| 1093 |
+
"retrieved_data": st.session_state.rag_retrieval_history.copy() if st.session_state.rag_retrieval_history else [],
|
| 1094 |
+
"conversation_id": st.session_state.conversation_id,
|
| 1095 |
+
"timestamp": time.time(),
|
| 1096 |
+
"message_count": len(st.session_state.messages),
|
| 1097 |
+
"has_retrievals": has_retrievals,
|
| 1098 |
+
"retrieval_count": len(st.session_state.rag_retrieval_history)
|
| 1099 |
+
}
|
| 1100 |
|
| 1101 |
+
print(f"π FEEDBACK SUBMISSION: Score={feedback_score}, Retrievals={len(st.session_state.rag_retrieval_history) if st.session_state.rag_retrieval_history else 0}")
|
|
|
|
|
|
|
| 1102 |
|
| 1103 |
+
# Create UserFeedback dataclass instance
|
| 1104 |
+
feedback_obj = None # Initialize outside try block
|
| 1105 |
try:
|
| 1106 |
+
feedback_obj = create_feedback_from_dict(feedback_dict)
|
| 1107 |
+
print(f"β
FEEDBACK SUBMISSION: Feedback object created - ID={feedback_obj.feedback_id}")
|
| 1108 |
+
st.write(f"β
**Feedback Object Created**")
|
| 1109 |
+
st.write(f"- Feedback ID: {feedback_obj.feedback_id}")
|
| 1110 |
+
st.write(f"- Score: {feedback_obj.score}/5")
|
| 1111 |
+
st.write(f"- Has Retrievals: {feedback_obj.has_retrievals}")
|
| 1112 |
|
| 1113 |
+
# Convert back to dict for JSON serialization
|
| 1114 |
+
feedback_data = feedback_obj.to_dict()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1115 |
except Exception as e:
|
| 1116 |
+
print(f"β FEEDBACK SUBMISSION: Failed to create feedback object: {e}")
|
| 1117 |
+
st.error(f"Failed to create feedback object: {e}")
|
| 1118 |
+
feedback_data = feedback_dict
|
| 1119 |
|
| 1120 |
+
# Display the data being submitted
|
| 1121 |
+
st.json(feedback_data)
|
| 1122 |
|
| 1123 |
+
# Save feedback to file - use PROJECT_DIR to ensure writability
|
| 1124 |
+
feedback_dir = FEEDBACK_DIR
|
| 1125 |
+
try:
|
| 1126 |
+
# Ensure directory exists with write permissions (777 for compatibility)
|
| 1127 |
+
feedback_dir.mkdir(parents=True, mode=0o777, exist_ok=True)
|
| 1128 |
+
except (PermissionError, OSError) as e:
|
| 1129 |
+
logger.warning(f"Could not create feedback directory at {feedback_dir}: {e}")
|
| 1130 |
+
# Fallback to relative path
|
| 1131 |
+
feedback_dir = Path("feedback")
|
| 1132 |
+
feedback_dir.mkdir(parents=True, mode=0o777, exist_ok=True)
|
| 1133 |
|
| 1134 |
+
feedback_file = feedback_dir / f"feedback_{st.session_state.conversation_id}_{int(time.time())}.json"
|
|
|
|
| 1135 |
|
| 1136 |
+
try:
|
| 1137 |
+
# Ensure parent directory exists before writing
|
| 1138 |
+
feedback_file.parent.mkdir(parents=True, mode=0o777, exist_ok=True)
|
| 1139 |
+
|
| 1140 |
+
# Save to local file
|
| 1141 |
+
print(f"πΎ FEEDBACK SAVE: Saving to local file: {feedback_file}")
|
| 1142 |
+
with open(feedback_file, 'w') as f:
|
| 1143 |
+
json.dump(feedback_data, f, indent=2, default=str)
|
| 1144 |
+
|
| 1145 |
+
print(f"β
FEEDBACK SAVE: Local file saved successfully")
|
| 1146 |
+
st.success("β
Thank you for your feedback! It has been saved locally.")
|
| 1147 |
+
st.balloons()
|
| 1148 |
+
|
| 1149 |
+
# Save to Snowflake if enabled and credentials available
|
| 1150 |
+
logger.info("π FEEDBACK SAVE: Starting Snowflake save process...")
|
| 1151 |
+
logger.info(f"π FEEDBACK SAVE: feedback_obj={'exists' if feedback_obj else 'None'}")
|
| 1152 |
+
|
| 1153 |
+
try:
|
| 1154 |
+
snowflake_enabled = os.getenv("SNOWFLAKE_ENABLED", "false").lower() == "true"
|
| 1155 |
+
logger.info(f"π SNOWFLAKE CHECK: enabled={snowflake_enabled}")
|
| 1156 |
+
|
| 1157 |
+
if snowflake_enabled:
|
| 1158 |
+
if feedback_obj:
|
| 1159 |
+
try:
|
| 1160 |
+
logger.info("π€ SNOWFLAKE UI: Attempting to save feedback to Snowflake...")
|
| 1161 |
+
print("π€ SNOWFLAKE UI: Attempting to save feedback to Snowflake...")
|
| 1162 |
+
|
| 1163 |
+
if save_to_snowflake(feedback_obj):
|
| 1164 |
+
logger.info("β
SNOWFLAKE UI: Successfully saved to Snowflake")
|
| 1165 |
+
print("β
SNOWFLAKE UI: Successfully saved to Snowflake")
|
| 1166 |
+
st.success("β
Feedback also saved to Snowflake!")
|
| 1167 |
+
else:
|
| 1168 |
+
logger.warning("β οΈ SNOWFLAKE UI: Save failed")
|
| 1169 |
+
print("β οΈ SNOWFLAKE UI: Save failed")
|
| 1170 |
+
st.warning("β οΈ Snowflake save failed, but local save succeeded")
|
| 1171 |
+
except Exception as e:
|
| 1172 |
+
logger.error(f"β SNOWFLAKE UI ERROR: {e}")
|
| 1173 |
+
print(f"β SNOWFLAKE UI ERROR: {e}")
|
| 1174 |
+
traceback.print_exc()
|
| 1175 |
+
st.warning(f"β οΈ Could not save to Snowflake: {e}")
|
| 1176 |
+
else:
|
| 1177 |
+
logger.warning("β οΈ SNOWFLAKE UI: Skipping (feedback object not created)")
|
| 1178 |
+
print("β οΈ SNOWFLAKE UI: Skipping (feedback object not created)")
|
| 1179 |
+
st.warning("β οΈ Skipping Snowflake save (feedback object not created)")
|
| 1180 |
+
else:
|
| 1181 |
+
logger.info("π‘ SNOWFLAKE UI: Integration disabled")
|
| 1182 |
+
print("π‘ SNOWFLAKE UI: Integration disabled")
|
| 1183 |
+
st.info("π‘ Snowflake integration disabled (set SNOWFLAKE_ENABLED=true to enable)")
|
| 1184 |
+
except NameError as e:
|
| 1185 |
+
traceback.print_exc()
|
| 1186 |
+
logger.error(f"β NameError in Snowflake save: {e}")
|
| 1187 |
+
print(f"β NameError in Snowflake save: {e}")
|
| 1188 |
+
st.warning(f"β οΈ Snowflake save error: {e}")
|
| 1189 |
+
except Exception as e:
|
| 1190 |
+
logger.error(f"β Exception in Snowflake save: {type(e).__name__}: {e}")
|
| 1191 |
+
print(f"β Exception in Snowflake save: {type(e).__name__}: {e}")
|
| 1192 |
+
st.warning(f"β οΈ Snowflake save error: {e}")
|
| 1193 |
+
|
| 1194 |
+
# Mark feedback as submitted to prevent resubmission
|
| 1195 |
+
st.session_state.feedback_submitted = True
|
| 1196 |
+
|
| 1197 |
+
print("=" * 80)
|
| 1198 |
+
print(f"β
FEEDBACK SUBMISSION: Completed successfully")
|
| 1199 |
+
print("=" * 80)
|
| 1200 |
+
|
| 1201 |
+
# Log file location
|
| 1202 |
+
st.info(f"π Feedback saved to: {feedback_file}")
|
| 1203 |
+
|
| 1204 |
+
except Exception as e:
|
| 1205 |
+
print(f"β FEEDBACK SUBMISSION: Error saving feedback: {e}")
|
| 1206 |
+
print(f"β FEEDBACK SUBMISSION: Error type: {type(e).__name__}")
|
| 1207 |
+
traceback.print_exc()
|
| 1208 |
+
st.error(f"β Error saving feedback: {e}")
|
| 1209 |
+
st.write(f"Debug error: {str(e)}")
|
| 1210 |
+
else:
|
| 1211 |
+
# Feedback already submitted - show success message and reset option
|
| 1212 |
+
st.success("β
Feedback already submitted for this conversation!")
|
| 1213 |
+
col1, col2 = st.columns([1, 1])
|
| 1214 |
+
with col1:
|
| 1215 |
+
if st.button("π Submit New Feedback", key="new_feedback_button", use_container_width=True):
|
| 1216 |
+
try:
|
| 1217 |
+
st.session_state.feedback_submitted = False
|
| 1218 |
+
st.rerun()
|
| 1219 |
+
except Exception as e:
|
| 1220 |
+
# Handle any Streamlit API exceptions gracefully
|
| 1221 |
+
logger.error(f"Error resetting feedback state: {e}")
|
| 1222 |
+
st.error(f"Error resetting feedback. Please refresh the page.")
|
| 1223 |
+
with col2:
|
| 1224 |
+
if st.button("π View Conversation", key="view_conversation_button", use_container_width=True):
|
| 1225 |
+
# Scroll to conversation - this is handled by the auto-scroll at bottom
|
| 1226 |
+
pass
|
| 1227 |
|
| 1228 |
+
# Example Questions Section
|
| 1229 |
+
st.markdown("---")
|
| 1230 |
+
st.markdown(
|
| 1231 |
+
"<h3 class='example-questions-header'>π‘ Example Questions</h3>",
|
| 1232 |
+
unsafe_allow_html=True
|
| 1233 |
+
)
|
| 1234 |
+
st.markdown(
|
| 1235 |
+
"<p class='example-questions-description'>Click on any question below to use it, or modify the editable examples:</p>",
|
| 1236 |
+
unsafe_allow_html=True
|
| 1237 |
+
)
|
| 1238 |
+
|
| 1239 |
+
# Initialize example question state
|
| 1240 |
+
if 'custom_question_1' not in st.session_state:
|
| 1241 |
+
st.session_state.custom_question_1 = "How were administrative costs managed in the PDM implementation, and what issues arose with budget execution regarding staff salaries?"
|
| 1242 |
+
if 'custom_question_2' not in st.session_state:
|
| 1243 |
+
st.session_state.custom_question_2 = "What did the National Coordinator say about the release of funds for PDM administrative costs in the letter dated 29th September 2022 and how did the funding received affect the activities of the PDCs and PDM SACCOs in the FY 2022/23?"
|
| 1244 |
+
|
| 1245 |
+
# Question 1: Filename insights (fixed, clickable)
|
| 1246 |
+
st.markdown("#### π Question 1: List insights from a specific file")
|
| 1247 |
+
col1, col2 = st.columns([3, 1])
|
| 1248 |
+
with col1:
|
| 1249 |
+
example_q1 = "List couple of insights from the filename."
|
| 1250 |
+
st.markdown(f"**Example:** `{example_q1}`")
|
| 1251 |
+
st.info("π‘ **Filter to apply:** Select a Filename from the sidebar panel before asking this question.")
|
| 1252 |
+
with col2:
|
| 1253 |
+
if st.button("π Use This Question", key="use_example_1", use_container_width=True):
|
| 1254 |
+
st.session_state.pending_question = example_q1
|
| 1255 |
+
st.session_state.input_counter = (st.session_state.get('input_counter', 0) + 1) % 1000
|
| 1256 |
+
st.rerun()
|
| 1257 |
+
|
| 1258 |
+
st.markdown("---")
|
| 1259 |
+
|
| 1260 |
+
# Questions 2 & 3: Editable examples (collapsible, side by side)
|
| 1261 |
+
with st.expander("#### βοΈ Customizable Questions (Edit and use)", expanded=False):
|
| 1262 |
+
# Place questions side by side
|
| 1263 |
+
col1, col2 = st.columns(2)
|
| 1264 |
|
| 1265 |
+
# Question 2
|
| 1266 |
+
with col1:
|
| 1267 |
+
st.markdown("**Question 2:**")
|
| 1268 |
+
custom_q1 = st.text_area(
|
| 1269 |
+
"Edit question 2:",
|
| 1270 |
+
value=st.session_state.custom_question_1,
|
| 1271 |
+
height=100,
|
| 1272 |
+
key="edit_question_2",
|
| 1273 |
+
help="Modify this question to fit your needs, then click 'Use This Question'",
|
| 1274 |
+
label_visibility="collapsed"
|
| 1275 |
+
)
|
| 1276 |
+
if st.button("π Use Question 2", key="use_custom_1", use_container_width=True):
|
| 1277 |
+
if custom_q1.strip():
|
| 1278 |
+
st.session_state.pending_question = custom_q1.strip()
|
| 1279 |
+
st.session_state.custom_question_1 = custom_q1.strip()
|
| 1280 |
+
st.session_state.input_counter = (st.session_state.get('input_counter', 0) + 1) % 1000
|
| 1281 |
+
st.rerun()
|
| 1282 |
+
else:
|
| 1283 |
+
st.warning("Please enter a question first!")
|
| 1284 |
+
st.caption("π‘ Tip: Add specific details like dates, names, or amounts to get more precise answers")
|
| 1285 |
+
st.info("π‘ **Filter to apply:** Select District(s) and Year(s) from sidebar panel")
|
| 1286 |
+
|
| 1287 |
+
# Question 3
|
| 1288 |
+
with col2:
|
| 1289 |
+
st.markdown("**Question 3:**")
|
| 1290 |
+
custom_q2 = st.text_area(
|
| 1291 |
+
"Edit question 3:",
|
| 1292 |
+
value=st.session_state.custom_question_2,
|
| 1293 |
+
height=100,
|
| 1294 |
+
key="edit_question_3",
|
| 1295 |
+
help="Modify this question to fit your needs, then click 'Use This Question'",
|
| 1296 |
+
label_visibility="collapsed"
|
| 1297 |
+
)
|
| 1298 |
+
if st.button("π Use Question 3", key="use_custom_2", use_container_width=True):
|
| 1299 |
+
if custom_q2.strip():
|
| 1300 |
+
st.session_state.pending_question = custom_q2.strip()
|
| 1301 |
+
st.session_state.custom_question_2 = custom_q2.strip()
|
| 1302 |
+
st.session_state.input_counter = (st.session_state.get('input_counter', 0) + 1) % 1000
|
| 1303 |
+
st.rerun()
|
| 1304 |
+
else:
|
| 1305 |
+
st.warning("Please enter a question first!")
|
| 1306 |
+
st.caption("π‘ Tip: Use specific terms from the documents (e.g., 'PDM', 'SACCOs', 'FY 2022/23')")
|
| 1307 |
+
|
| 1308 |
+
|
| 1309 |
+
# Store selected question for next render (handled in input section above)
|
| 1310 |
+
# This ensures the question populates the input field correctly
|
| 1311 |
|
| 1312 |
# Auto-scroll to bottom
|
| 1313 |
st.markdown("""
|
multi_agent_chatbot.py
CHANGED
|
@@ -8,24 +8,26 @@ This system implements a 3-agent architecture:
|
|
| 8 |
|
| 9 |
Each agent has specialized prompts and responsibilities.
|
| 10 |
"""
|
|
|
|
| 11 |
import json
|
| 12 |
import time
|
| 13 |
import logging
|
|
|
|
| 14 |
from pathlib import Path
|
| 15 |
from datetime import datetime
|
| 16 |
from dataclasses import dataclass
|
| 17 |
from typing import Dict, List, Any, Optional, TypedDict
|
| 18 |
|
| 19 |
-
|
| 20 |
from langchain_core.tools import tool
|
| 21 |
from langgraph.graph import StateGraph, END
|
| 22 |
-
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
|
| 23 |
from langchain_core.prompts import ChatPromptTemplate
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
from src.pipeline import PipelineManager
|
| 27 |
-
from src.config.loader import load_config
|
| 28 |
from src.llm.adapters import get_llm_client
|
|
|
|
|
|
|
| 29 |
|
| 30 |
|
| 31 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
@@ -46,6 +48,7 @@ class QueryContext:
|
|
| 46 |
needs_follow_up: bool = False
|
| 47 |
follow_up_question: Optional[str] = None
|
| 48 |
|
|
|
|
| 49 |
class MultiAgentState(TypedDict):
|
| 50 |
"""State for the multi-agent conversation flow"""
|
| 51 |
conversation_id: str
|
|
@@ -61,6 +64,7 @@ class MultiAgentState(TypedDict):
|
|
| 61 |
session_start_time: float
|
| 62 |
last_ai_message_time: float
|
| 63 |
|
|
|
|
| 64 |
class MultiAgentRAGChatbot:
|
| 65 |
"""Multi-agent RAG chatbot with specialized agents"""
|
| 66 |
|
|
@@ -112,7 +116,6 @@ class MultiAgentRAGChatbot:
|
|
| 112 |
logger.info("β
Pipeline manager initialized and models loaded")
|
| 113 |
except Exception as e:
|
| 114 |
logger.error(f"β Failed to initialize pipeline manager: {e}")
|
| 115 |
-
import traceback
|
| 116 |
traceback.print_exc()
|
| 117 |
raise RuntimeError(f"Pipeline manager initialization failed: {e}")
|
| 118 |
|
|
@@ -129,7 +132,6 @@ class MultiAgentRAGChatbot:
|
|
| 129 |
raise # Re-raise RuntimeError as-is
|
| 130 |
except Exception as e:
|
| 131 |
logger.error(f"β Error during vector store connection: {e}")
|
| 132 |
-
import traceback
|
| 133 |
traceback.print_exc()
|
| 134 |
raise RuntimeError(f"Vector store connection failed: {e}")
|
| 135 |
|
|
@@ -139,8 +141,8 @@ class MultiAgentRAGChatbot:
|
|
| 139 |
# Build the multi-agent graph
|
| 140 |
self.graph = self._build_graph()
|
| 141 |
|
| 142 |
-
# Conversations directory - use
|
| 143 |
-
self.conversations_dir =
|
| 144 |
try:
|
| 145 |
# Use 777 permissions for maximum compatibility (HF Spaces runs as different user)
|
| 146 |
self.conversations_dir.mkdir(parents=True, mode=0o777, exist_ok=True)
|
|
@@ -158,9 +160,9 @@ class MultiAgentRAGChatbot:
|
|
| 158 |
|
| 159 |
def _load_dynamic_data(self):
|
| 160 |
"""Load dynamic data from filter_options.json and add_district_metadata.py"""
|
| 161 |
-
# Load filter options
|
| 162 |
try:
|
| 163 |
-
fo =
|
| 164 |
if fo.exists():
|
| 165 |
with open(fo) as f:
|
| 166 |
data = json.load(f)
|
|
@@ -178,7 +180,7 @@ class MultiAgentRAGChatbot:
|
|
| 178 |
self.source_whitelist = ['Consolidated', 'Local Government', 'Ministry, Department and Agency']
|
| 179 |
self.district_whitelist = ['Kampala', 'Gulu', 'Kalangala']
|
| 180 |
|
| 181 |
-
# Enrich district list from add_district_metadata.py
|
| 182 |
try:
|
| 183 |
from add_district_metadata import DistrictMetadataProcessor
|
| 184 |
proc = DistrictMetadataProcessor()
|
|
@@ -590,7 +592,6 @@ Analyze this query using ONLY the exact values provided above:""")
|
|
| 590 |
# Clean and parse JSON with better error handling
|
| 591 |
try:
|
| 592 |
# Remove comments (// and /* */) from JSON
|
| 593 |
-
import re
|
| 594 |
# Remove single-line comments
|
| 595 |
content = re.sub(r'//.*?$', '', content, flags=re.MULTILINE)
|
| 596 |
# Remove multi-line comments
|
|
@@ -603,7 +604,6 @@ Analyze this query using ONLY the exact values provided above:""")
|
|
| 603 |
logger.error(f"β Raw content: {content[:200]}...")
|
| 604 |
|
| 605 |
# Try to extract JSON from text if embedded
|
| 606 |
-
import re
|
| 607 |
json_match = re.search(r'\{.*\}', content, re.DOTALL)
|
| 608 |
if json_match:
|
| 609 |
try:
|
|
@@ -1178,7 +1178,6 @@ Generate a conversational response based on your knowledge:""")
|
|
| 1178 |
|
| 1179 |
except Exception as e:
|
| 1180 |
logger.error(f"Could not save conversation: {e}")
|
| 1181 |
-
import traceback
|
| 1182 |
logger.error(f"Traceback: {traceback.format_exc()}")
|
| 1183 |
|
| 1184 |
|
|
|
|
| 8 |
|
| 9 |
Each agent has specialized prompts and responsibilities.
|
| 10 |
"""
|
| 11 |
+
import re
|
| 12 |
import json
|
| 13 |
import time
|
| 14 |
import logging
|
| 15 |
+
import traceback
|
| 16 |
from pathlib import Path
|
| 17 |
from datetime import datetime
|
| 18 |
from dataclasses import dataclass
|
| 19 |
from typing import Dict, List, Any, Optional, TypedDict
|
| 20 |
|
|
|
|
| 21 |
from langchain_core.tools import tool
|
| 22 |
from langgraph.graph import StateGraph, END
|
|
|
|
| 23 |
from langchain_core.prompts import ChatPromptTemplate
|
| 24 |
+
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
|
| 25 |
|
| 26 |
|
| 27 |
from src.pipeline import PipelineManager
|
|
|
|
| 28 |
from src.llm.adapters import get_llm_client
|
| 29 |
+
from src.config.paths import PROJECT_DIR, CONVERSATIONS_DIR
|
| 30 |
+
from src.config.loader import load_config, get_embedding_model_for_collection
|
| 31 |
|
| 32 |
|
| 33 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
|
|
| 48 |
needs_follow_up: bool = False
|
| 49 |
follow_up_question: Optional[str] = None
|
| 50 |
|
| 51 |
+
|
| 52 |
class MultiAgentState(TypedDict):
|
| 53 |
"""State for the multi-agent conversation flow"""
|
| 54 |
conversation_id: str
|
|
|
|
| 64 |
session_start_time: float
|
| 65 |
last_ai_message_time: float
|
| 66 |
|
| 67 |
+
|
| 68 |
class MultiAgentRAGChatbot:
|
| 69 |
"""Multi-agent RAG chatbot with specialized agents"""
|
| 70 |
|
|
|
|
| 116 |
logger.info("β
Pipeline manager initialized and models loaded")
|
| 117 |
except Exception as e:
|
| 118 |
logger.error(f"β Failed to initialize pipeline manager: {e}")
|
|
|
|
| 119 |
traceback.print_exc()
|
| 120 |
raise RuntimeError(f"Pipeline manager initialization failed: {e}")
|
| 121 |
|
|
|
|
| 132 |
raise # Re-raise RuntimeError as-is
|
| 133 |
except Exception as e:
|
| 134 |
logger.error(f"β Error during vector store connection: {e}")
|
|
|
|
| 135 |
traceback.print_exc()
|
| 136 |
raise RuntimeError(f"Vector store connection failed: {e}")
|
| 137 |
|
|
|
|
| 141 |
# Build the multi-agent graph
|
| 142 |
self.graph = self._build_graph()
|
| 143 |
|
| 144 |
+
# Conversations directory - use PROJECT_DIR for local vs deployed compatibility
|
| 145 |
+
self.conversations_dir = CONVERSATIONS_DIR
|
| 146 |
try:
|
| 147 |
# Use 777 permissions for maximum compatibility (HF Spaces runs as different user)
|
| 148 |
self.conversations_dir.mkdir(parents=True, mode=0o777, exist_ok=True)
|
|
|
|
| 160 |
|
| 161 |
def _load_dynamic_data(self):
|
| 162 |
"""Load dynamic data from filter_options.json and add_district_metadata.py"""
|
| 163 |
+
# Load filter options - use PROJECT_DIR relative path
|
| 164 |
try:
|
| 165 |
+
fo = PROJECT_DIR / "src" / "config" / "filter_options.json"
|
| 166 |
if fo.exists():
|
| 167 |
with open(fo) as f:
|
| 168 |
data = json.load(f)
|
|
|
|
| 180 |
self.source_whitelist = ['Consolidated', 'Local Government', 'Ministry, Department and Agency']
|
| 181 |
self.district_whitelist = ['Kampala', 'Gulu', 'Kalangala']
|
| 182 |
|
| 183 |
+
# Enrich district list from add_district_metadata.py (if available)
|
| 184 |
try:
|
| 185 |
from add_district_metadata import DistrictMetadataProcessor
|
| 186 |
proc = DistrictMetadataProcessor()
|
|
|
|
| 592 |
# Clean and parse JSON with better error handling
|
| 593 |
try:
|
| 594 |
# Remove comments (// and /* */) from JSON
|
|
|
|
| 595 |
# Remove single-line comments
|
| 596 |
content = re.sub(r'//.*?$', '', content, flags=re.MULTILINE)
|
| 597 |
# Remove multi-line comments
|
|
|
|
| 604 |
logger.error(f"β Raw content: {content[:200]}...")
|
| 605 |
|
| 606 |
# Try to extract JSON from text if embedded
|
|
|
|
| 607 |
json_match = re.search(r'\{.*\}', content, re.DOTALL)
|
| 608 |
if json_match:
|
| 609 |
try:
|
|
|
|
| 1178 |
|
| 1179 |
except Exception as e:
|
| 1180 |
logger.error(f"Could not save conversation: {e}")
|
|
|
|
| 1181 |
logger.error(f"Traceback: {traceback.format_exc()}")
|
| 1182 |
|
| 1183 |
|
smart_chatbot.py
CHANGED
|
@@ -26,6 +26,7 @@ from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
|
|
| 26 |
|
| 27 |
from src.pipeline import PipelineManager
|
| 28 |
from src.config.loader import load_config
|
|
|
|
| 29 |
|
| 30 |
|
| 31 |
@dataclass
|
|
@@ -161,7 +162,7 @@ class IntelligentRAGChatbot:
|
|
| 161 |
|
| 162 |
# Try to load district whitelist from filter_options.json
|
| 163 |
try:
|
| 164 |
-
fo =
|
| 165 |
if fo.exists():
|
| 166 |
with open(fo) as f:
|
| 167 |
data = json.load(f)
|
|
@@ -174,7 +175,7 @@ class IntelligentRAGChatbot:
|
|
| 174 |
except Exception:
|
| 175 |
self.district_whitelist = self.available_metadata['districts']
|
| 176 |
|
| 177 |
-
# Enrich whitelist from add_district_metadata.py if available
|
| 178 |
try:
|
| 179 |
from add_district_metadata import DistrictMetadataProcessor
|
| 180 |
proc = DistrictMetadataProcessor()
|
|
@@ -195,7 +196,7 @@ class IntelligentRAGChatbot:
|
|
| 195 |
|
| 196 |
# Get dynamic year list from filter_options.json
|
| 197 |
try:
|
| 198 |
-
fo =
|
| 199 |
if fo.exists():
|
| 200 |
with open(fo) as f:
|
| 201 |
data = json.load(f)
|
|
|
|
| 26 |
|
| 27 |
from src.pipeline import PipelineManager
|
| 28 |
from src.config.loader import load_config
|
| 29 |
+
from src.config.paths import PROJECT_DIR
|
| 30 |
|
| 31 |
|
| 32 |
@dataclass
|
|
|
|
| 162 |
|
| 163 |
# Try to load district whitelist from filter_options.json
|
| 164 |
try:
|
| 165 |
+
fo = PROJECT_DIR / "src" / "config" / "filter_options.json"
|
| 166 |
if fo.exists():
|
| 167 |
with open(fo) as f:
|
| 168 |
data = json.load(f)
|
|
|
|
| 175 |
except Exception:
|
| 176 |
self.district_whitelist = self.available_metadata['districts']
|
| 177 |
|
| 178 |
+
# Enrich whitelist from add_district_metadata.py if available (optional module)
|
| 179 |
try:
|
| 180 |
from add_district_metadata import DistrictMetadataProcessor
|
| 181 |
proc = DistrictMetadataProcessor()
|
|
|
|
| 196 |
|
| 197 |
# Get dynamic year list from filter_options.json
|
| 198 |
try:
|
| 199 |
+
fo = PROJECT_DIR / "src" / "config" / "filter_options.json"
|
| 200 |
if fo.exists():
|
| 201 |
with open(fo) as f:
|
| 202 |
data = json.load(f)
|
src/config/paths.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
Path configuration for local vs deployed environments.

This module handles different paths for local development vs deployed (HF Spaces) environments.

Exports:
    IS_DEPLOYED: True when a deployment marker is detected (see below).
    PROJECT_DIR: Base directory for application files.
    CACHE_DIR, HF_CACHE_DIR, STREAMLIT_CACHE_DIR: Cache locations when
        deployed; ``None`` locally so libraries keep their own defaults.
    FEEDBACK_DIR, CONVERSATIONS_DIR, STREAMLIT_CONFIG_DIR: Application
        directories, always located under PROJECT_DIR.
"""
import os
from pathlib import Path


def find_project_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* containing ``src/config``.

    Args:
        start: Directory from which the upward search begins.

    Returns:
        The first directory (``start`` itself, then each ancestor in turn)
        that has a ``src/config`` sub-directory. When none is found,
        ``start`` is returned unchanged so that derived directories stay
        next to the working directory instead of landing at the filesystem
        root.
    """
    start = Path(start)
    for candidate in (start, *start.parents):
        if (candidate / "src" / "config").exists():
            return candidate
    return start


# Determine if we're in a deployed environment (HF Spaces/Docker) or local
# Check for environment variable or Docker-like paths
IS_DEPLOYED = (
    os.getenv("DEPLOYED", "false").lower() == "true" or
    os.path.exists("/app") or
    os.getenv("SPACES_ID") is not None or
    os.path.exists("/.dockerenv")
)

# PROJECT_DIR: Base directory for application files
# In deployed: /app, in local: project root found from the current working directory
if IS_DEPLOYED:
    PROJECT_DIR = Path("/app")
else:
    PROJECT_DIR = find_project_root(Path.cwd())

# Cache directories - different for local vs deployed
# Local: Use default user cache locations (don't override)
# Deployed: Use PROJECT_DIR/.cache
if IS_DEPLOYED:
    CACHE_DIR = PROJECT_DIR / ".cache"
    HF_CACHE_DIR = CACHE_DIR / "huggingface"
    STREAMLIT_CACHE_DIR = CACHE_DIR / "streamlit"
else:
    # For local, use default user cache (let libraries use their defaults).
    # CACHE_DIR is defined in both branches so importing it never fails.
    CACHE_DIR = None
    HF_CACHE_DIR = None  # Will use HF defaults (~/.cache/huggingface)
    STREAMLIT_CACHE_DIR = None  # Will use Streamlit defaults

# Application directories
FEEDBACK_DIR = PROJECT_DIR / "feedback"
CONVERSATIONS_DIR = PROJECT_DIR / "conversations"
STREAMLIT_CONFIG_DIR = PROJECT_DIR / ".streamlit"

# Log the configuration
if __name__ == "__main__":
    print(f"IS_DEPLOYED: {IS_DEPLOYED}")
    print(f"PROJECT_DIR: {PROJECT_DIR}")
    print(f"HF_CACHE_DIR: {HF_CACHE_DIR}")
    print(f"FEEDBACK_DIR: {FEEDBACK_DIR}")
    print(f"CONVERSATIONS_DIR: {CONVERSATIONS_DIR}")
| 57 |
+
|
| 58 |
+
|
| 59 |
+
|
src/pipeline.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
| 1 |
"""Main pipeline orchestrator for the Audit QA system."""
|
|
|
|
| 2 |
import time
|
|
|
|
| 3 |
from pathlib import Path
|
| 4 |
from dataclasses import dataclass
|
| 5 |
from typing import Dict, Any, List, Optional
|
|
@@ -11,11 +13,21 @@ except ModuleNotFoundError as me:
|
|
| 11 |
from langchain.schema import Document
|
| 12 |
|
| 13 |
from .logging import log_error
|
| 14 |
-
|
| 15 |
from .loader import chunks_to_documents
|
| 16 |
from .vectorstore import VectorStoreManager
|
|
|
|
| 17 |
from .retrieval.context import ContextRetriever
|
| 18 |
-
from .
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
|
|
@@ -41,12 +53,13 @@ class PipelineManager:
|
|
| 41 |
"""
|
| 42 |
Initialize the pipeline manager.
|
| 43 |
"""
|
|
|
|
|
|
|
| 44 |
self.config = config or {}
|
|
|
|
| 45 |
self.vectorstore_manager = None
|
| 46 |
self.context_retriever = None # Initialize as None
|
| 47 |
-
|
| 48 |
-
self.report_service = None
|
| 49 |
-
self.chunks = None
|
| 50 |
|
| 51 |
# Initialize components
|
| 52 |
self._initialize_components()
|
|
@@ -118,13 +131,7 @@ class PipelineManager:
|
|
| 118 |
try:
|
| 119 |
# Load config if not provided
|
| 120 |
if not self.config:
|
| 121 |
-
|
| 122 |
-
from src.config.loader import load_config
|
| 123 |
-
self.config = load_config()
|
| 124 |
-
except ImportError:
|
| 125 |
-
# Try alternate import path
|
| 126 |
-
from src.config.loader import load_config
|
| 127 |
-
self.config = load_config()
|
| 128 |
|
| 129 |
# Validate config structure
|
| 130 |
if not isinstance(self.config, dict):
|
|
@@ -159,7 +166,6 @@ class PipelineManager:
|
|
| 159 |
print("✅ VectorStoreManager initialized successfully")
|
| 160 |
except Exception as vs_error:
|
| 161 |
print(f"❌ Error initializing VectorStoreManager: {vs_error}")
|
| 162 |
-
import traceback
|
| 163 |
traceback.print_exc()
|
| 164 |
self.vectorstore_manager = None
|
| 165 |
raise # Re-raise to be caught by outer try-except
|
|
@@ -175,40 +181,35 @@ class PipelineManager:
|
|
| 175 |
except Exception as e:
|
| 176 |
try:
|
| 177 |
# Try direct instantiation with config
|
| 178 |
-
from src.llm.adapters import get_llm_client
|
| 179 |
self.llm_client = get_llm_client("openai", self.config)
|
| 180 |
print("✅ LLM CLIENT: Initialized using direct get_llm_client function with config")
|
| 181 |
except Exception as e2:
|
| 182 |
print(f"❌ LLM CLIENT: Registry methods failed - {e2}")
|
| 183 |
# Try to create a simple LLM client directly
|
| 184 |
try:
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
|
|
|
| 196 |
else:
|
| 197 |
-
print("❌ LLM CLIENT:
|
| 198 |
except Exception as e3:
|
| 199 |
print(f"❌ LLM CLIENT: Direct instantiation also failed - {e3}")
|
| 200 |
self.llm_client = None
|
| 201 |
|
| 202 |
# Load system prompt
|
| 203 |
-
from src.llm.templates import DEFAULT_AUDIT_SYSTEM_PROMPT
|
| 204 |
self.system_prompt = DEFAULT_AUDIT_SYSTEM_PROMPT
|
| 205 |
|
| 206 |
# Initialize report service
|
| 207 |
try:
|
| 208 |
-
try:
|
| 209 |
-
from src.reporting.service import ReportService
|
| 210 |
-
except ImportError:
|
| 211 |
-
from src.reporting.service import ReportService
|
| 212 |
self.report_service = ReportService()
|
| 213 |
except Exception as e:
|
| 214 |
print(f"Warning: Could not initialize report service: {e}")
|
|
@@ -216,7 +217,6 @@ class PipelineManager:
|
|
| 216 |
|
| 217 |
except Exception as e:
|
| 218 |
print(f"❌ Error initializing components: {e}")
|
| 219 |
-
import traceback
|
| 220 |
traceback.print_exc()
|
| 221 |
# Don't set vectorstore_manager to None if it was already set
|
| 222 |
if not hasattr(self, 'vectorstore_manager') or self.vectorstore_manager is None:
|
|
@@ -337,7 +337,6 @@ class PipelineManager:
|
|
| 337 |
return False
|
| 338 |
except Exception as init_error:
|
| 339 |
print(f"❌ Error initializing vector store manager: {init_error}")
|
| 340 |
-
import traceback
|
| 341 |
traceback.print_exc()
|
| 342 |
return False
|
| 343 |
|
|
@@ -352,7 +351,6 @@ class PipelineManager:
|
|
| 352 |
except Exception as e:
|
| 353 |
print(f"❌ Error connecting to vector store: {e}")
|
| 354 |
log_error(e, {"component": "vectorstore_connection"})
|
| 355 |
-
import traceback
|
| 356 |
traceback.print_exc()
|
| 357 |
|
| 358 |
# If it's a dimension mismatch error, try with force_recreate
|
|
@@ -541,9 +539,6 @@ Answer:"""
|
|
| 541 |
if auto_infer_filters and not any([reports, sources, subtype]):
|
| 542 |
print(f"🤖 AUTO-INFERRING FILTERS: No explicit filters provided, analyzing query...")
|
| 543 |
try:
|
| 544 |
-
# Import get_available_metadata here to avoid circular imports
|
| 545 |
-
from src.retrieval.filter import get_available_metadata, infer_filters_from_query
|
| 546 |
-
|
| 547 |
# Get available metadata
|
| 548 |
available_metadata = get_available_metadata(self.vectorstore_manager.get_vectorstore())
|
| 549 |
|
|
|
|
| 1 |
"""Main pipeline orchestrator for the Audit QA system."""
|
| 2 |
+
import os
|
| 3 |
import time
|
| 4 |
+
import traceback
|
| 5 |
from pathlib import Path
|
| 6 |
from dataclasses import dataclass
|
| 7 |
from typing import Dict, Any, List, Optional
|
|
|
|
| 13 |
from langchain.schema import Document
|
| 14 |
|
| 15 |
from .logging import log_error
|
| 16 |
+
|
| 17 |
from .loader import chunks_to_documents
|
| 18 |
from .vectorstore import VectorStoreManager
|
| 19 |
+
from .reporting.service import ReportService
|
| 20 |
from .retrieval.context import ContextRetriever
|
| 21 |
+
from .llm.adapters import LLMRegistry, get_llm_client
|
| 22 |
+
from .llm.templates import DEFAULT_AUDIT_SYSTEM_PROMPT
|
| 23 |
+
from .config.loader import load_config, get_embedding_model_for_collection
|
| 24 |
+
from .retrieval.filter import get_available_metadata, infer_filters_from_query
|
| 25 |
+
|
| 26 |
+
try:
|
| 27 |
+
from langchain_openai import ChatOpenAI
|
| 28 |
+
LANGCHAIN_OPENAI_AVAILABLE = True
|
| 29 |
+
except ImportError:
|
| 30 |
+
LANGCHAIN_OPENAI_AVAILABLE = False
|
| 31 |
|
| 32 |
|
| 33 |
|
|
|
|
| 53 |
"""
|
| 54 |
Initialize the pipeline manager.
|
| 55 |
"""
|
| 56 |
+
self.chunks = None
|
| 57 |
+
self.llm_client = None
|
| 58 |
self.config = config or {}
|
| 59 |
+
self.report_service = None
|
| 60 |
self.vectorstore_manager = None
|
| 61 |
self.context_retriever = None # Initialize as None
|
| 62 |
+
|
|
|
|
|
|
|
| 63 |
|
| 64 |
# Initialize components
|
| 65 |
self._initialize_components()
|
|
|
|
| 131 |
try:
|
| 132 |
# Load config if not provided
|
| 133 |
if not self.config:
|
| 134 |
+
self.config = load_config()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
# Validate config structure
|
| 137 |
if not isinstance(self.config, dict):
|
|
|
|
| 166 |
print("✅ VectorStoreManager initialized successfully")
|
| 167 |
except Exception as vs_error:
|
| 168 |
print(f"❌ Error initializing VectorStoreManager: {vs_error}")
|
|
|
|
| 169 |
traceback.print_exc()
|
| 170 |
self.vectorstore_manager = None
|
| 171 |
raise # Re-raise to be caught by outer try-except
|
|
|
|
| 181 |
except Exception as e:
|
| 182 |
try:
|
| 183 |
# Try direct instantiation with config
|
|
|
|
| 184 |
self.llm_client = get_llm_client("openai", self.config)
|
| 185 |
print("✅ LLM CLIENT: Initialized using direct get_llm_client function with config")
|
| 186 |
except Exception as e2:
|
| 187 |
print(f"❌ LLM CLIENT: Registry methods failed - {e2}")
|
| 188 |
# Try to create a simple LLM client directly
|
| 189 |
try:
|
| 190 |
+
if LANGCHAIN_OPENAI_AVAILABLE:
|
| 191 |
+
api_key = os.getenv("OPENAI_API_KEY") or os.getenv("OPENROUTER_API_KEY")
|
| 192 |
+
if api_key:
|
| 193 |
+
self.llm_client = ChatOpenAI(
|
| 194 |
+
model="gpt-3.5-turbo",
|
| 195 |
+
api_key=api_key,
|
| 196 |
+
temperature=0.1,
|
| 197 |
+
max_tokens=1000
|
| 198 |
+
)
|
| 199 |
+
print("✅ LLM CLIENT: Initialized using direct ChatOpenAI")
|
| 200 |
+
else:
|
| 201 |
+
print("❌ LLM CLIENT: No API key available")
|
| 202 |
else:
|
| 203 |
+
print("❌ LLM CLIENT: langchain-openai not available")
|
| 204 |
except Exception as e3:
|
| 205 |
print(f"❌ LLM CLIENT: Direct instantiation also failed - {e3}")
|
| 206 |
self.llm_client = None
|
| 207 |
|
| 208 |
# Load system prompt
|
|
|
|
| 209 |
self.system_prompt = DEFAULT_AUDIT_SYSTEM_PROMPT
|
| 210 |
|
| 211 |
# Initialize report service
|
| 212 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
self.report_service = ReportService()
|
| 214 |
except Exception as e:
|
| 215 |
print(f"Warning: Could not initialize report service: {e}")
|
|
|
|
| 217 |
|
| 218 |
except Exception as e:
|
| 219 |
print(f"❌ Error initializing components: {e}")
|
|
|
|
| 220 |
traceback.print_exc()
|
| 221 |
# Don't set vectorstore_manager to None if it was already set
|
| 222 |
if not hasattr(self, 'vectorstore_manager') or self.vectorstore_manager is None:
|
|
|
|
| 337 |
return False
|
| 338 |
except Exception as init_error:
|
| 339 |
print(f"❌ Error initializing vector store manager: {init_error}")
|
|
|
|
| 340 |
traceback.print_exc()
|
| 341 |
return False
|
| 342 |
|
|
|
|
| 351 |
except Exception as e:
|
| 352 |
print(f"❌ Error connecting to vector store: {e}")
|
| 353 |
log_error(e, {"component": "vectorstore_connection"})
|
|
|
|
| 354 |
traceback.print_exc()
|
| 355 |
|
| 356 |
# If it's a dimension mismatch error, try with force_recreate
|
|
|
|
| 539 |
if auto_infer_filters and not any([reports, sources, subtype]):
|
| 540 |
print(f"🤖 AUTO-INFERRING FILTERS: No explicit filters provided, analyzing query...")
|
| 541 |
try:
|
|
|
|
|
|
|
|
|
|
| 542 |
# Get available metadata
|
| 543 |
available_metadata = get_available_metadata(self.vectorstore_manager.get_vectorstore())
|
| 544 |
|