Files changed (7) hide show
  1. .gitignore +112 -0
  2. add_district_metadata.py +379 -0
  3. app.py +804 -233
  4. multi_agent_chatbot.py +12 -13
  5. smart_chatbot.py +4 -3
  6. src/config/paths.py +59 -0
  7. src/pipeline.py +32 -37
.gitignore ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==========================================
2
+ # PYTHON
3
+ # ==========================================
4
+ __pycache__/
5
+ *.py[cod]
6
+ *.pyo
7
+ *.pyd
8
+ *$py.class
9
+
10
+ # Virtual environments
11
+ .venv/
12
+ venv/
13
+ env/
14
+ ENV/
15
+ .conda/
16
+ .venv*/
17
+
18
+ # Compiled extension modules / shared libraries
19
+ *.so
20
+ *.dll
21
+ *.dylib
22
+
23
+ # Logs and debug
24
+ *.log
25
+ *.out
26
+ *.err
27
+ logs/
28
+ debug/
29
+ *.sqlite3
30
+
31
+ # ==========================================
32
+ # BUILD / PACKAGING
33
+ # ==========================================
34
+ build/
35
+ dist/
36
+ *.egg-info/
37
+ .eggs/
38
+ pip-wheel-metadata/
39
+ .wheels/
40
+
41
+ # ==========================================
42
+ # JUPYTER / NOTEBOOKS
43
+ # ==========================================
44
+ .ipynb_checkpoints/
45
+ *.ipynb_convert/
46
+
47
+ # ==========================================
48
+ # DATA / MODELS / CACHE
49
+ # ==========================================
50
+ data/
51
+ datasets/
52
+ .cache/
53
+ *.ckpt
54
+ *.h5
55
+ *.hdf5
56
+ *.tflite
57
+ *.onnx
58
+ *.pth
59
+ *.pt
60
+ *.joblib
61
+ *.pkl
62
+ *.pickle
63
+ *.npz
64
+ *.npy
65
+ outputs/
66
+ artifacts/
67
+ checkpoints/
68
+ runs/
69
+ wandb/
70
+ mlruns/
71
+ lightning_logs/
72
+
73
+ # Hugging Face
74
+ huggingface/
75
+ ~/.cache/huggingface/
76
+ ~/.cache/torch/
77
+ ~/.cache/datasets/
78
+ ~/.cache/transformers/
79
+
80
+ # ==========================================
81
+ # EDITORS / TOOLS
82
+ # ==========================================
83
+ .vscode/
84
+ .idea/
85
+ *.swp
86
+ *.swo
87
+ *.bak
88
+ .DS_Store
89
+ Thumbs.db
90
+
91
+ # ==========================================
92
+ # ENV FILES / CREDENTIALS
93
+ # ==========================================
94
+ .env
95
+ .env.*
96
+ *.env.local
97
+ secrets.*
98
+ config.json
99
+ token.json
100
+
101
+ # ==========================================
102
+ # TESTS / TEMP FILES
103
+ # ==========================================
104
+ __tests__/
105
+ .tox/
106
+ .coverage
107
+ .cache/
108
+ .pytest_cache/
109
+ tmp/
110
+ temp/
111
+ *.tmp
112
+ *.temp
add_district_metadata.py ADDED
@@ -0,0 +1,379 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to add District metadata to Qdrant chunks based on filename analysis.
4
+ Handles Uganda districts, ministry mappings, and LLM inference for ambiguous cases.
5
+ """
6
+ import re
7
+ import yaml
8
+ import logging
9
+ from dataclasses import dataclass
10
+ from typing import Dict, List, Optional
11
+
12
+
13
+ from qdrant_client import QdrantClient
14
+
15
+ # Configure logging
16
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
@dataclass
class DistrictMapping:
    """One district entry: its canonical name plus the spellings that identify it.

    Instances are plain value objects; equality and ``repr`` come from
    ``@dataclass``.
    """

    # Canonical district name (the value reported when an alias matches).
    name: str
    # Alternative spellings / phrases that should resolve to ``name``.
    aliases: List[str]
    # Defaults to True; not read anywhere in the visible code
    # (NOTE(review): presumably reserved for non-district entities — confirm).
    is_district: bool = True
26
+
27
+
28
class DistrictMetadataProcessor:
    """Annotate Qdrant chunks with a Uganda district inferred from their filename.

    Settings are read from a YAML file that must provide
    ``qdrant.collection_name`` and, once Qdrant is contacted, ``qdrant.url``
    and ``qdrant.api_key``.  The Qdrant client is created lazily on first use
    and cached on ``self.qdrant_client``.
    """

    def __init__(self, config_path: str = "src/config/settings.yaml"):
        """Load the settings file and build the lookup tables.

        Args:
            config_path: Path to the YAML settings file.

        Raises:
            OSError: If the settings file cannot be opened.
            KeyError: If ``qdrant.collection_name`` is missing from the file.
        """
        with open(config_path, 'r', encoding="utf-8") as f:
            self.config = yaml.safe_load(f)

        # Clients are created on demand; see _get_client().
        self.llm_client = None
        self.qdrant_client = None
        self.collection_name = self.config["qdrant"]["collection_name"]

        self.district_mappings = self._initialize_district_mappings()
        self.ministry_mappings = self._initialize_ministry_mappings()

    # The return annotation is quoted so that it is not evaluated at
    # definition time.
    def _initialize_district_mappings(self) -> "Dict[str, DistrictMapping]":
        """Build the lookup table from lower-cased name/alias to its mapping.

        Returns:
            A dict whose keys are the lower-cased canonical names and aliases
            and whose values are the corresponding ``DistrictMapping``.
        """
        # Districts whose alias list is not simply ``[name]``.
        special_aliases = {
            "Kampala": ["KCCA", "Kampala Capital City Authority"],
            "Gulu": ["Gulu", "Gulu DLG"],
            "Apac": ["Apac", "Apac District"],
            "Kaabong": ["Kaabong", "Kaabong District"],
            "Kalangala": ["Kalangala", "Kalangala DLG"],
        }

        # Declaration order is preserved because the first matching key wins
        # during lookup.  Each district is listed exactly once.
        district_names = [
            # Central Region
            "Kampala", "Wakiso", "Mukono", "Luweero", "Nakaseke",
            "Nakasongola", "Kayunga", "Buikwe", "Buvuma",

            # Northern Region
            "Gulu", "Kitgum", "Pader", "Agago", "Lamwo", "Nwoya", "Amuru",
            "Omoro", "Oyam", "Kole", "Apac", "Lira", "Alebtong", "Amolatar",
            "Dokolo", "Otuke", "Kwania",

            # Eastern Region
            "Jinja", "Kamuli", "Iganga", "Bugiri", "Mayuge", "Namayingo",
            "Busia", "Tororo", "Pallisa", "Kumi", "Bukedea", "Soroti",
            "Serere", "Ngora", "Kaberamaido", "Kalaki", "Kapelebyong",
            "Amuria", "Katakwi", "Kotido", "Abim", "Kaabong", "Karenga",
            "Moroto", "Napak", "Nabilatuk", "Amudat", "Nakapiripirit",
            "Bukwo", "Kween", "Kapchorwa", "Sironko", "Manafwa", "Bududa",
            "Mbale", "Butaleja", "Namisindwa", "Bulambuli",

            # Western Region
            "Masaka", "Kalungu", "Bukomansimbi", "Lwengo", "Sembabule",
            "Rakai", "Kyotera", "Mpigi", "Butambala", "Gomba", "Mityana",
            "Mubende", "Kassanda", "Kiboga", "Kyankwanzi", "Hoima",
            "Kikuube", "Kakumiro", "Kibaale", "Kagadi", "Buliisa", "Masindi",
            "Kiryandongo", "Pakwach", "Nebbi", "Zombo", "Arua", "Terego",
            "Madi-Okollo", "Obongi", "Moyo", "Yumbe", "Koboko", "Maracha",
            "Adjumani",

            # South Western Region
            "Mbarara", "Ibanda", "Isingiro", "Kiruhura", "Kazo", "Ntungamo",
            "Rwampara", "Rubanda", "Rukiga", "Kanungu", "Rukungiri",
            "Kisoro", "Bundibugyo", "Ntoroko", "Kasese", "Bunyangabu",
            "Fort Portal", "Kabarole", "Kyenjojo", "Kamwenge", "Kitagwenda",
            "Kyegegwa", "Mitooma", "Rubirizi", "Sheema", "Bushenyi",

            # Special cases
            "Kalangala",
        ]

        mapping_dict = {}
        for district_name in district_names:
            aliases = special_aliases.get(district_name, [district_name])
            district = DistrictMapping(district_name, aliases)
            mapping_dict[district.name.lower()] = district
            for alias in district.aliases:
                mapping_dict[alias.lower()] = district
        return mapping_dict

    def _initialize_ministry_mappings(self) -> Dict[str, str]:
        """Return the table of lower-cased acronyms to organisation names."""
        return {
            "maaif": "Ministry of Agriculture, Animal Industry and Fisheries",
            "mwts": "Ministry of Works and Transport",
            "kcca": "Kampala Capital City Authority",
            "oag": "Office of the Auditor General",
            "arsdp": "Albertine Regional Sustainable Development Project",
            "avcdp": "Agriculture Value Chain Development Project",
            "ida": "International Development Association",
            "dlg": "District Local Government",
            "lg": "Local Government",
        }

    @staticmethod
    def _normalize_for_matching(text: str) -> str:
        """Lower-case ``text``, collapse every non a-z run to one space, pad with spaces.

        The padding lets a caller test whole-word containment with a plain
        ``in`` check (``" apac "`` is in ``" apac district "`` but not in
        ``" capacity building "``).
        """
        return " " + re.sub(r"[^a-z]+", " ", text.lower()).strip() + " "

    def _extract_district_from_filename(self, filename: str) -> Optional[str]:
        """Return the canonical district named in ``filename``, if any.

        Matching is done on whole words/phrases after normalisation, so
        separators such as ``_``, ``-``, ``.`` and digits act as word
        boundaries ("Madi_Okollo2022.pdf" matches "Madi-Okollo") while
        substrings of longer words do not ("capacity" does not match "Apac").
        A name fused with other letters (e.g. "GuluDLG") is not matched.

        Filenames that only mention a ministry/organisation acronym need no
        special handling: they match no district key and yield ``None``.

        Args:
            filename: File name (or path) to inspect.

        Returns:
            The canonical district name of the first matching key in
            ``self.district_mappings`` order, or ``None`` when nothing matches.
        """
        haystack = self._normalize_for_matching(filename)

        for key, district_mapping in self.district_mappings.items():
            needle = self._normalize_for_matching(key)
            # A key with no letters would normalise to blanks and match anything.
            if needle.strip() and needle in haystack:
                return district_mapping.name

        return None

    def _infer_district_with_llm(self, filename: str) -> Optional[str]:
        """Placeholder for LLM-based inference; currently logs and returns ``None``."""
        logger.info(f"LLM inference needed for filename: {filename}")
        return None

    def infer_district(self, filename: str) -> Optional[str]:
        """Infer the district for ``filename``.

        Tries whole-word matching first and falls back to the (currently
        stubbed) LLM inference when that finds nothing.
        """
        district = self._extract_district_from_filename(filename)
        if district:
            return district
        return self._infer_district_with_llm(filename)

    def _get_client(self):
        """Return the cached Qdrant client, creating it on first use."""
        if self.qdrant_client is None:
            self.qdrant_client = QdrantClient(
                url=self.config["qdrant"]["url"],
                api_key=self.config["qdrant"]["api_key"]
            )
        return self.qdrant_client

    def _scroll_page(self, batch_size, offset):
        """Fetch one page of points (payload only, no vectors).

        Args:
            batch_size: Maximum number of points to return.
            offset: Point ID to start from, or ``None`` for the first page.

        Returns:
            ``(points, next_page_offset)`` exactly as returned by
            ``QdrantClient.scroll``; ``next_page_offset`` is ``None`` on the
            last page.
        """
        points, next_offset = self._get_client().scroll(
            collection_name=self.collection_name,
            limit=batch_size,
            offset=offset,
            with_payload=True,
            with_vectors=False
        )
        return points, next_offset

    def fetch_chunks_batch(self, batch_size: int = 100, offset: int = 0) -> List[Dict]:
        """Fetch a single batch of chunks from Qdrant (metadata only).

        Note that ``offset`` is forwarded unchanged to ``scroll``, where it is
        a *point ID* to start from, not a number of rows to skip.  Use
        ``process_all_chunks`` to walk the whole collection.

        Args:
            batch_size: Maximum number of points to return.
            offset: Point ID to start reading from.

        Returns:
            The fetched points, or an empty list if the request failed (the
            error is logged).
        """
        try:
            points, _next_offset = self._scroll_page(batch_size, offset)
            return points
        except Exception as e:
            logger.error(f"Failed to fetch batch: {e}")
            return []

    def update_chunks_with_district(self, points: List[Dict]) -> int:
        """Write the inferred district into each point's ``metadata`` payload.

        Despite the annotation, each item is accessed through its ``.id`` and
        ``.payload`` attributes.  Points without a filename are skipped with a
        warning; a failure on one point is logged and does not stop the rest.
        The district is written even when it is ``None``.

        Args:
            points: Points as returned by ``fetch_chunks_batch``.

        Returns:
            Number of points successfully updated.
        """
        updated_count = 0
        client = self._get_client()

        for point in points:
            # Resolved outside the try so the error message below can always use it.
            point_id = getattr(point, "id", None)
            try:
                metadata = (point.payload or {}).get("metadata", {})
                filename = metadata.get("filename", "")

                if not filename:
                    logger.warning(f"Point {point_id} has no filename")
                    continue

                district = self.infer_district(filename)

                # Copy so the fetched point is not mutated.
                updated_metadata = metadata.copy()
                updated_metadata["district"] = district

                client.set_payload(
                    collection_name=self.collection_name,
                    payload={"metadata": updated_metadata},
                    points=[point_id]
                )

                updated_count += 1
                logger.info(f"Updated point {point_id}: {filename} -> {district}")

            except Exception as e:
                logger.error(f"Failed to update point {point_id}: {e}")

        return updated_count

    def process_all_chunks(self, batch_size: int = 100):
        """Walk the whole collection page by page and tag every chunk.

        Pagination follows the ``next_page_offset`` returned by ``scroll``
        instead of adding ``batch_size`` to a counter: the scroll offset is a
        point ID, so a numeric counter can skip or repeat points, or never
        terminate.

        Args:
            batch_size: Number of points requested per page.

        Returns:
            Total number of points updated.  If a page cannot be fetched the
            error is logged and the count so far is returned.
        """
        total_updated = 0
        page_offset = None  # None asks Qdrant to start from the first point

        logger.info(f"Starting to process chunks in batches of {batch_size}")

        while True:
            try:
                points, next_offset = self._scroll_page(batch_size, page_offset)
            except Exception as e:
                logger.error(
                    f"Failed to fetch batch (offset: {page_offset}): {e}; stopping early"
                )
                break

            if not points:
                break

            logger.info(f"Processing batch: {len(points)} points (offset: {page_offset})")

            updated_count = self.update_chunks_with_district(points)
            total_updated += updated_count

            logger.info(f"Updated {updated_count} points in this batch")

            if next_offset is None:
                # Last page reached.
                break
            page_offset = next_offset

        logger.info(f"Total updated: {total_updated} points")
        return total_updated
350
+
351
def main():
    """Dry-run district inference on a small sample, then optionally process everything.

    Fetches up to 10 chunks, logs the district inferred for each filename and
    asks for confirmation on stdin before running the full update.  If the
    sample cannot be fetched (or the collection is empty) a warning is logged
    and nothing is processed.

    Raises:
        Exception: Any error is logged and re-raised.
    """
    try:
        processor = DistrictMetadataProcessor()

        logger.info("Testing with first 10 chunks...")
        test_points = processor.fetch_chunks_batch(10, 0)

        if not test_points:
            # Make the "nothing fetched" case visible instead of exiting silently.
            logger.warning("No chunks could be fetched for the test batch; nothing to process")
            return

        logger.info("Test batch fetched successfully. Processing...")
        for point in test_points:
            payload = point.payload or {}
            filename = payload.get("metadata", {}).get("filename", "")
            district = processor.infer_district(filename)
            logger.info(f"Test: {filename} -> {district}")

        # Ask for confirmation; tolerate surrounding whitespace and "yes".
        response = input("\nProceed with full processing? (y/n): ")
        if response.strip().lower() in ("y", "yes"):
            processor.process_all_chunks(batch_size=100)
        else:
            logger.info("Processing cancelled by user")

    except Exception as e:
        logger.error(f"Error in main: {e}")
        raise
377
+
378
+ if __name__ == "__main__":
379
+ main()
app.py CHANGED
@@ -3,7 +3,32 @@ Intelligent Audit Report Chatbot UI
3
  """
4
 
5
  import os
6
- import sys
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  # ===== CRITICAL: Fix OMP_NUM_THREADS FIRST, before ANY other imports =====
9
  # Some libraries load at import time and will fail if OMP_NUM_THREADS is invalid
@@ -29,42 +54,30 @@ except (ValueError, TypeError):
29
 
30
  # ===== Setup HuggingFace cache directories BEFORE any model imports =====
31
  # CRITICAL: Set these before any imports that might use HuggingFace (like sentence-transformers)
32
- # This ensures models downloaded during Docker build are found at runtime
33
- cache_dir = "/app/.cache/huggingface"
34
- os.environ["HF_HOME"] = cache_dir
35
- os.environ["TRANSFORMERS_CACHE"] = cache_dir
36
- os.environ["HF_DATASETS_CACHE"] = cache_dir
37
- os.environ["HF_HUB_CACHE"] = cache_dir
38
- os.environ["SENTENCE_TRANSFORMERS_HOME"] = cache_dir
39
-
40
- # Ensure cache directory exists (created in Dockerfile, but ensure it's there)
41
- try:
42
- os.makedirs(cache_dir, mode=0o755, exist_ok=True)
43
- except (PermissionError, OSError) as e:
44
- # If we can't create it, log but continue (might already exist from Dockerfile)
45
- # HuggingFace will try to create subdirectories, but we need parent to exist
46
- pass
47
-
48
- import time
49
- import json
50
- import uuid
51
- import logging
52
- from pathlib import Path
53
-
54
- import argparse
55
- import streamlit as st
56
- from langchain_core.messages import HumanMessage, AIMessage
57
-
58
- from multi_agent_chatbot import get_multi_agent_chatbot
59
- from smart_chatbot import get_chatbot as get_smart_chatbot
60
- from src.reporting.feedback_schema import create_feedback_from_dict
61
 
62
  # Configure logging
63
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
64
  logger = logging.getLogger(__name__)
65
 
66
  # Log environment setup for debugging
67
- logger.info(f"πŸ“ HuggingFace cache directory: {os.environ.get('HF_HOME', 'NOT SET')}")
 
 
68
  logger.info(f"πŸ”§ OMP_NUM_THREADS: {os.environ.get('OMP_NUM_THREADS', 'NOT SET')}")
69
 
70
 
@@ -94,6 +107,54 @@ st.markdown("""
94
  margin-bottom: 2rem;
95
  }
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  .session-info {
98
  background-color: #f0f2f6;
99
  padding: 10px;
@@ -152,6 +213,34 @@ st.markdown("""
152
  margin: 10px 0;
153
  border-left: 4px solid #007bff;
154
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  </style>
156
  """, unsafe_allow_html=True)
157
 
@@ -215,13 +304,270 @@ def serialize_documents(sources):
215
 
216
  return serialized
217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  @st.cache_data
219
  def load_filter_options():
220
  try:
221
- with open("src/config/filter_options.json", "r") as f:
 
222
  return json.load(f)
223
  except FileNotFoundError:
224
- st.info([x for x in os.listdir() if x.endswith('.json')])
225
  st.error("filter_options.json not found. Please run the metadata analysis script.")
226
  return {"sources": [], "years": [], "districts": [], 'filenames': []}
227
 
@@ -254,16 +600,8 @@ def main():
254
  st.session_state.reset_conversation = False
255
  st.rerun()
256
 
257
- # Header with system indicator
258
- col1, col2 = st.columns([3, 1])
259
- with col1:
260
- st.markdown('<h1 class="main-header">πŸ€– Intelligent Audit Report Chatbot</h1>', unsafe_allow_html=True)
261
- with col2:
262
- system_type = get_system_type()
263
- if "Multi-Agent" in system_type:
264
- st.success(f"πŸ”§ {system_type}")
265
- else:
266
- st.info(f"πŸ”§ {system_type}")
267
  st.markdown('<p class="subtitle">Ask questions about audit reports. Use the sidebar filters to narrow down your search!</p>', unsafe_allow_html=True)
268
 
269
  # Session info
@@ -280,6 +618,40 @@ def main():
280
 
281
  # Sidebar for filters
282
  with st.sidebar:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  st.markdown("### πŸ” Search Filters")
284
  st.markdown("Select filters to narrow down your search. Leave empty to search all data.")
285
 
@@ -294,11 +666,13 @@ def main():
294
  help="Choose specific reports to search. When enabled, all other filters are ignored."
295
  )
296
  st.markdown('</div>', unsafe_allow_html=True)
 
 
297
 
298
  # Determine if filename filter is active
299
  filename_mode = len(selected_filenames) > 0
300
  # Sources filter
301
- st.markdown('<div class="filter-section">', unsafe_allow_html=True)
302
  st.markdown('<div class="filter-title">πŸ“Š Sources</div>', unsafe_allow_html=True)
303
  selected_sources = st.multiselect(
304
  "Select sources:",
@@ -311,7 +685,7 @@ def main():
311
  st.markdown('</div>', unsafe_allow_html=True)
312
 
313
  # Years filter
314
- st.markdown('<div class="filter-section">', unsafe_allow_html=True)
315
  st.markdown('<div class="filter-title">πŸ“… Years</div>', unsafe_allow_html=True)
316
  selected_years = st.multiselect(
317
  "Select years:",
@@ -324,7 +698,7 @@ def main():
324
  st.markdown('</div>', unsafe_allow_html=True)
325
 
326
  # Districts filter
327
- st.markdown('<div class="filter-section">', unsafe_allow_html=True)
328
  st.markdown('<div class="filter-title">🏘️ Districts</div>', unsafe_allow_html=True)
329
  selected_districts = st.multiselect(
330
  "Select districts:",
@@ -375,12 +749,85 @@ def main():
375
  if 'input_counter' not in st.session_state:
376
  st.session_state.input_counter = 0
377
 
 
 
 
 
 
 
 
 
 
 
 
378
  user_input = st.text_input(
379
  "Type your message here...",
380
  placeholder="Ask about budget allocations, expenditures, or audit findings...",
381
- key=f"user_input_{st.session_state.input_counter}",
382
- label_visibility="collapsed"
 
383
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
 
385
  with col2:
386
  send_button = st.button("Send", key="send_button", use_container_width=True)
@@ -389,12 +836,11 @@ def main():
389
  if st.button("πŸ—‘οΈ Clear Chat", key="clear_chat_button"):
390
  st.session_state.reset_conversation = True
391
  # Clear all conversation files
392
- import os
393
- conversations_dir = "conversations"
394
- if os.path.exists(conversations_dir):
395
- for file in os.listdir(conversations_dir):
396
- if file.endswith('.json'):
397
- os.remove(os.path.join(conversations_dir, file))
398
  st.rerun()
399
 
400
  # Handle user input
@@ -484,14 +930,30 @@ def main():
484
  # Count unique filenames
485
  unique_filenames = set()
486
  for doc in sources:
487
- filename = getattr(doc, 'metadata', {}).get('filename', 'Unknown')
 
488
  unique_filenames.add(filename)
489
 
490
- st.markdown(f"**Found {len(sources)} document chunks from {len(unique_filenames)} unique documents (showing top 10):**")
491
  if len(unique_filenames) < len(sources):
492
  st.info(f"πŸ’‘ **Note**: Each document is split into multiple chunks. You're seeing {len(sources)} chunks from {len(unique_filenames)} documents.")
493
 
494
- for i, doc in enumerate(sources[:10]): # Show top 10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
  # Get relevance score and ID if available
496
  metadata = getattr(doc, 'metadata', {})
497
  score = metadata.get('reranked_score', metadata.get('original_score', None))
@@ -524,6 +986,44 @@ def main():
524
  st.info("No documents were retrieved for the last query.")
525
  else:
526
  st.info("No documents have been retrieved yet. Start a conversation to see retrieved documents here.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
527
 
528
  # Feedback Dashboard Section
529
  st.markdown("---")
@@ -543,200 +1043,271 @@ def main():
543
  if 'feedback_submitted' not in st.session_state:
544
  st.session_state.feedback_submitted = False
545
 
546
- # Feedback form
547
- with st.form("feedback_form", clear_on_submit=False):
548
- col1, col2 = st.columns([1, 1])
549
-
550
- with col1:
551
- feedback_score = st.slider(
552
- "Rate this conversation (1-5)",
553
- min_value=1,
554
- max_value=5,
555
- help="How satisfied are you with the conversation?"
556
- )
557
-
558
- with col2:
559
- is_feedback_about_last_retrieval = st.checkbox(
560
- "Feedback about last retrieval only",
561
- value=True,
562
- help="If checked, feedback applies to the most recent document retrieval"
563
- )
564
-
565
- open_ended_feedback = st.text_area(
566
- "Your feedback (optional)",
567
- placeholder="Tell us what went well or what could be improved...",
568
- height=100
569
- )
570
-
571
- # Disable submit if no score selected
572
- submit_disabled = feedback_score is None
573
-
574
- submitted = st.form_submit_button(
575
- "πŸ“€ Submit Feedback",
576
- use_container_width=True,
577
- disabled=submit_disabled
578
- )
579
-
580
- if submitted and not st.session_state.feedback_submitted:
581
- # Log the feedback data being submitted
582
- print("=" * 80)
583
- print("πŸ”„ FEEDBACK SUBMISSION: Starting...")
584
- print("=" * 80)
585
- st.write("πŸ” **Debug: Feedback Data Being Submitted:**")
586
 
587
- # Create feedback data dictionary
588
- feedback_dict = {
589
- "open_ended_feedback": open_ended_feedback,
590
- "score": feedback_score,
591
- "is_feedback_about_last_retrieval": is_feedback_about_last_retrieval,
592
- "retrieved_data": st.session_state.rag_retrieval_history.copy() if st.session_state.rag_retrieval_history else [],
593
- "conversation_id": st.session_state.conversation_id,
594
- "timestamp": time.time(),
595
- "message_count": len(st.session_state.messages),
596
- "has_retrievals": has_retrievals,
597
- "retrieval_count": len(st.session_state.rag_retrieval_history)
598
- }
599
 
600
- print(f"πŸ“ FEEDBACK SUBMISSION: Score={feedback_score}, Retrievals={len(st.session_state.rag_retrieval_history) if st.session_state.rag_retrieval_history else 0}")
 
 
 
 
 
601
 
602
- # Create UserFeedback dataclass instance
603
- feedback_obj = None # Initialize outside try block
604
- try:
605
- feedback_obj = create_feedback_from_dict(feedback_dict)
606
- print(f"βœ… FEEDBACK SUBMISSION: Feedback object created - ID={feedback_obj.feedback_id}")
607
- st.write(f"βœ… **Feedback Object Created**")
608
- st.write(f"- Feedback ID: {feedback_obj.feedback_id}")
609
- st.write(f"- Score: {feedback_obj.score}/5")
610
- st.write(f"- Has Retrievals: {feedback_obj.has_retrievals}")
611
-
612
- # Convert back to dict for JSON serialization
613
- feedback_data = feedback_obj.to_dict()
614
- except Exception as e:
615
- print(f"❌ FEEDBACK SUBMISSION: Failed to create feedback object: {e}")
616
- st.error(f"Failed to create feedback object: {e}")
617
- feedback_data = feedback_dict
618
-
619
- # Display the data being submitted
620
- st.json(feedback_data)
621
 
622
- # Save feedback to file - use absolute path in /app to ensure writability
623
- feedback_dir = Path("/app/feedback")
624
- try:
625
- # Ensure directory exists with write permissions (777 for compatibility)
626
- feedback_dir.mkdir(parents=True, mode=0o777, exist_ok=True)
627
- except (PermissionError, OSError) as e:
628
- logger.warning(f"Could not create feedback directory at {feedback_dir}: {e}")
629
- # Fallback to relative path
630
- feedback_dir = Path("feedback")
631
- feedback_dir.mkdir(parents=True, mode=0o777, exist_ok=True)
632
 
633
- feedback_file = feedback_dir / f"feedback_{st.session_state.conversation_id}_{int(time.time())}.json"
 
 
 
 
634
 
635
- try:
636
- # Ensure parent directory exists before writing
637
- feedback_file.parent.mkdir(parents=True, mode=0o777, exist_ok=True)
638
-
639
- # Save to local file
640
- print(f"πŸ’Ύ FEEDBACK SAVE: Saving to local file: {feedback_file}")
641
- with open(feedback_file, 'w') as f:
642
- json.dump(feedback_data, f, indent=2, default=str)
643
 
644
- print(f"βœ… FEEDBACK SAVE: Local file saved successfully")
645
- st.success("βœ… Thank you for your feedback! It has been saved locally.")
646
- st.balloons()
 
 
 
 
 
 
 
 
 
647
 
648
- # Save to Snowflake if enabled and credentials available
649
- logger.info("πŸ”„ FEEDBACK SAVE: Starting Snowflake save process...")
650
- logger.info(f"πŸ“Š FEEDBACK SAVE: feedback_obj={'exists' if feedback_obj else 'None'}")
651
 
 
 
652
  try:
653
- import os
654
- snowflake_enabled = os.getenv("SNOWFLAKE_ENABLED", "false").lower() == "true"
655
- logger.info(f"πŸ” SNOWFLAKE CHECK: enabled={snowflake_enabled}")
 
 
 
656
 
657
- if snowflake_enabled:
658
- if feedback_obj:
659
- try:
660
- from src.reporting.snowflake_connector import save_to_snowflake
661
- logger.info("πŸ“€ SNOWFLAKE UI: Attempting to save feedback to Snowflake...")
662
- print("πŸ“€ SNOWFLAKE UI: Attempting to save feedback to Snowflake...") # Also print to terminal
663
-
664
- if save_to_snowflake(feedback_obj):
665
- logger.info("βœ… SNOWFLAKE UI: Successfully saved to Snowflake")
666
- print("βœ… SNOWFLAKE UI: Successfully saved to Snowflake") # Also print to terminal
667
- st.success("βœ… Feedback also saved to Snowflake!")
668
- else:
669
- logger.warning("⚠️ SNOWFLAKE UI: Save failed")
670
- print("⚠️ SNOWFLAKE UI: Save failed") # Also print to terminal
671
- st.warning("⚠️ Snowflake save failed, but local save succeeded")
672
- except Exception as e:
673
- logger.error(f"❌ SNOWFLAKE UI ERROR: {e}")
674
- print(f"❌ SNOWFLAKE UI ERROR: {e}") # Also print to terminal
675
- import traceback
676
- traceback.print_exc() # Print full traceback to terminal
677
- st.warning(f"⚠️ Could not save to Snowflake: {e}")
678
- else:
679
- logger.warning("⚠️ SNOWFLAKE UI: Skipping (feedback object not created)")
680
- print("⚠️ SNOWFLAKE UI: Skipping (feedback object not created)") # Also print to terminal
681
- st.warning("⚠️ Skipping Snowflake save (feedback object not created)")
682
- else:
683
- logger.info("πŸ’‘ SNOWFLAKE UI: Integration disabled")
684
- print("πŸ’‘ SNOWFLAKE UI: Integration disabled") # Also print to terminal
685
- st.info("πŸ’‘ Snowflake integration disabled (set SNOWFLAKE_ENABLED=true to enable)")
686
- except NameError as e:
687
- import traceback
688
- traceback.print_exc()
689
- logger.error(f"❌ NameError in Snowflake save: {e}")
690
- print(f"❌ NameError in Snowflake save: {e}") # Also print to terminal
691
- st.warning(f"⚠️ Snowflake save error: {e}")
692
  except Exception as e:
693
- logger.error(f"❌ Exception in Snowflake save: {type(e).__name__}: {e}")
694
- print(f"❌ Exception in Snowflake save: {type(e).__name__}: {e}") # Also print to terminal
695
- st.warning(f"⚠️ Snowflake save error: {e}")
696
 
697
- # Mark feedback as submitted to prevent resubmission
698
- st.session_state.feedback_submitted = True
699
 
700
- print("=" * 80)
701
- print(f"βœ… FEEDBACK SUBMISSION: Completed successfully")
702
- print("=" * 80)
 
 
 
 
 
 
 
703
 
704
- # Log file location
705
- st.info(f"πŸ“ Feedback saved to: {feedback_file}")
706
 
707
- except Exception as e:
708
- print(f"❌ FEEDBACK SUBMISSION: Error saving feedback: {e}")
709
- print(f"❌ FEEDBACK SUBMISSION: Error type: {type(e).__name__}")
710
- import traceback
711
- traceback.print_exc()
712
- st.error(f"❌ Error saving feedback: {e}")
713
- st.write(f"Debug error: {str(e)}")
714
-
715
- elif st.session_state.feedback_submitted:
716
- st.success("βœ… Feedback already submitted for this conversation!")
717
- if st.button("πŸ”„ Submit New Feedback", key="new_feedback_button"):
718
- st.session_state.feedback_submitted = False
719
- st.rerun()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
720
 
721
- # Display retrieval history stats
722
- if st.session_state.rag_retrieval_history:
723
- st.markdown("---")
724
- st.markdown("#### πŸ“Š Retrieval History")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
725
 
726
- with st.expander(f"View {len(st.session_state.rag_retrieval_history)} retrieval entries", expanded=False):
727
- for idx, entry in enumerate(st.session_state.rag_retrieval_history, 1):
728
- st.markdown(f"**Retrieval #{idx}**")
729
-
730
- # Display the actual RAG query
731
- rag_query_expansion = entry.get("rag_query_expansion", "No query available")
732
- st.code(rag_query_expansion, language="text")
733
-
734
- # Display summary stats
735
- st.json({
736
- "conversation_length": len(entry.get("conversation_up_to", [])),
737
- "documents_retrieved": len(entry.get("docs_retrieved", []))
738
- })
739
- st.markdown("---")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
740
 
741
  # Auto-scroll to bottom
742
  st.markdown("""
 
3
  """
4
 
5
  import os
6
+
7
+ import time
8
+ import json
9
+ import uuid
10
+ import logging
11
+ import traceback
12
+ from pathlib import Path
13
+ from typing import List, Dict, Any
14
+ from collections import Counter
15
+
16
+ import streamlit as st
17
+ from langchain_core.messages import HumanMessage, AIMessage
18
+ import pandas as pd
19
+ import plotly.express as px
20
+
21
+ from multi_agent_chatbot import get_multi_agent_chatbot
22
+ from smart_chatbot import get_chatbot as get_smart_chatbot
23
+ from src.reporting.feedback_schema import create_feedback_from_dict
24
+ from src.reporting.snowflake_connector import save_to_snowflake
25
+ from src.config.paths import (
26
+ IS_DEPLOYED,
27
+ PROJECT_DIR,
28
+ HF_CACHE_DIR,
29
+ FEEDBACK_DIR,
30
+ CONVERSATIONS_DIR,
31
+ )
32
 
33
  # ===== CRITICAL: Fix OMP_NUM_THREADS FIRST, before ANY other imports =====
34
  # Some libraries load at import time and will fail if OMP_NUM_THREADS is invalid
 
54
 
55
  # ===== Setup HuggingFace cache directories BEFORE any model imports =====
56
  # CRITICAL: Set these before any imports that might use HuggingFace (like sentence-transformers)
57
+ # Only override cache directories in deployed environment (local uses defaults)
58
+ if IS_DEPLOYED and HF_CACHE_DIR:
59
+ cache_dir = str(HF_CACHE_DIR)
60
+ os.environ["HF_HOME"] = cache_dir
61
+ os.environ["TRANSFORMERS_CACHE"] = cache_dir
62
+ os.environ["HF_DATASETS_CACHE"] = cache_dir
63
+ os.environ["HF_HUB_CACHE"] = cache_dir
64
+ os.environ["SENTENCE_TRANSFORMERS_HOME"] = cache_dir
65
+
66
+ # Ensure cache directory exists (created in Dockerfile, but ensure it's there)
67
+ try:
68
+ os.makedirs(cache_dir, mode=0o755, exist_ok=True)
69
+ except (PermissionError, OSError):
70
+ # If we can't create it, log but continue (might already exist from Dockerfile)
71
+ pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  # Configure logging
74
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
75
  logger = logging.getLogger(__name__)
76
 
77
  # Log environment setup for debugging
78
+ logger.info(f"🌍 Environment: {'DEPLOYED' if IS_DEPLOYED else 'LOCAL'}")
79
+ logger.info(f"πŸ“ PROJECT_DIR: {PROJECT_DIR}")
80
+ logger.info(f"πŸ“ HuggingFace cache: {os.environ.get('HF_HOME', 'DEFAULT (not overridden)')}")
81
  logger.info(f"πŸ”§ OMP_NUM_THREADS: {os.environ.get('OMP_NUM_THREADS', 'NOT SET')}")
82
 
83
 
 
107
  margin-bottom: 2rem;
108
  }
109
 
110
+ .example-questions-header {
111
+ text-align: center;
112
+ margin-bottom: 1rem;
113
+ }
114
+
115
+ .example-questions-description {
116
+ text-align: center;
117
+ color: #666;
118
+ margin-bottom: 2rem;
119
+ }
120
+
121
+ /* Hide ALL default Streamlit text input help messages about Enter key */
122
+ /* This is the key one - hides "Press Enter to apply" message inside input field */
123
+ div[data-testid="InputInstructions"],
124
+ span[data-testid="InputInstructions"],
125
+ *[data-testid="InputInstructions"] {
126
+ display: none !important;
127
+ visibility: hidden !important;
128
+ opacity: 0 !important;
129
+ height: 0 !important;
130
+ width: 0 !important;
131
+ overflow: hidden !important;
132
+ position: absolute !important;
133
+ left: -9999px !important;
134
+ }
135
+
136
+ /* Also hide other potential locations */
137
+ div[data-testid="stTextInput"] + div > small,
138
+ div[data-testid="stTextInput"] ~ div > small,
139
+ div[data-testid="stTextInputContainer"] + div > small,
140
+ div[data-testid="stTextInputContainer"] ~ div > small,
141
+ div[data-baseweb="input"] + div > small,
142
+ div[data-baseweb="input"] ~ div > small {
143
+ display: none !important;
144
+ visibility: hidden !important;
145
+ opacity: 0 !important;
146
+ height: 0 !important;
147
+ overflow: hidden !important;
148
+ }
149
+
150
+ /* Custom help text for input */
151
+ .input-help-text {
152
+ font-size: 0.85rem;
153
+ color: #666;
154
+ margin-top: 0.25rem;
155
+ text-align: left;
156
+ }
157
+
158
  .session-info {
159
  background-color: #f0f2f6;
160
  padding: 10px;
 
213
  margin: 10px 0;
214
  border-left: 4px solid #007bff;
215
  }
216
+
217
+ .retrieval-distribution-container {
218
+ background-color: #ffffff;
219
+ padding: 25px;
220
+ border-radius: 10px;
221
+ margin: 20px 0;
222
+ border: 2px solid #e0e0e0;
223
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1), 0 2px 4px rgba(0, 0, 0, 0.06);
224
+ }
225
+
226
+ .metric-label {
227
+ font-size: 0.9rem;
228
+ color: #555;
229
+ margin-bottom: 5px;
230
+ text-align: center;
231
+ }
232
+
233
+ .metric-value {
234
+ font-size: 1.8rem;
235
+ font-weight: bold;
236
+ color: #000000;
237
+ text-align: center;
238
+ }
239
+
240
+ .metric-container {
241
+ text-align: center;
242
+ padding: 10px;
243
+ }
244
  </style>
245
  """, unsafe_allow_html=True)
246
 
 
304
 
305
  return serialized
306
 
307
def _label_or_unknown(value: Any) -> Any:
    """Return *value*, or ``'Unknown'`` when it is missing or empty."""
    return value if value else 'Unknown'


def _normalize_year(value: Any, min_year: int, max_year: int) -> str:
    """Return *value* as a year string, or ``'Unknown'`` if it is not a plausible year."""
    if not value or value == 'Unknown':
        return 'Unknown'
    try:
        # int(float(...)) accepts ints, floats and numeric strings such as "2021.0".
        year_int = int(float(value))
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int(float('inf')) -- non-finite values are not years.
        return 'Unknown'
    return str(year_int) if min_year <= year_int <= max_year else 'Unknown'


def extract_chunk_statistics(
    sources: List[Any],
    min_year: int = 1900,
    max_year: int = 2030,
) -> Dict[str, Any]:
    """Extract statistics from retrieved chunks.

    Each item of *sources* is expected to expose a ``metadata`` mapping; the
    keys ``source``, ``year``, ``filename`` and ``district`` are read from it.
    Missing, empty or implausible values are recorded as ``'Unknown'``.

    Args:
        sources: Retrieved chunk objects.
        min_year: Smallest year (inclusive) accepted as valid.
        max_year: Largest year (inclusive) accepted as valid.

    Returns:
        An empty dict when *sources* is empty; otherwise a dict holding the
        per-chunk value lists (``sources``, ``years``, ``filenames``,
        ``districts``), their ``*_distribution`` count dicts, ``total_chunks``
        and the ``unique_*`` counts. ``unique_years`` and ``unique_districts``
        exclude ``'Unknown'``; ``unique_sources`` and ``unique_filenames``
        count it like any other value.
    """
    if not sources:
        return {}

    sources_list = []
    years = []
    filenames = []
    districts = []

    for doc in sources:
        # Tolerate both a missing attribute and an explicit ``metadata = None``.
        metadata = getattr(doc, 'metadata', None) or {}

        sources_list.append(_label_or_unknown(metadata.get('source')))
        years.append(_normalize_year(metadata.get('year'), min_year, max_year))
        filenames.append(_label_or_unknown(metadata.get('filename')))
        districts.append(_label_or_unknown(metadata.get('district')))

    source_counts = Counter(sources_list)
    year_counts = Counter(years)
    filename_counts = Counter(filenames)
    district_counts = Counter(districts)

    return {
        'total_chunks': len(sources),
        'unique_sources': len(source_counts),
        'unique_years': sum(1 for y in year_counts if y != 'Unknown'),
        'unique_filenames': len(filename_counts),
        'unique_districts': sum(1 for d in district_counts if d != 'Unknown'),
        'source_distribution': dict(source_counts),
        'year_distribution': dict(year_counts),
        'filename_distribution': dict(filename_counts),
        'district_distribution': dict(district_counts),
        'sources': sources_list,
        'years': years,
        'filenames': filenames,
        'districts': districts
    }
371
+
372
def _render_count_bar(rows, label, title, color_scale, horizontal=True):
    """Draw one ``label`` vs. ``Count`` bar chart from ``(label, count)`` rows."""
    frame = pd.DataFrame(rows, columns=[label, 'Count'])
    if horizontal:
        fig = px.bar(
            frame,
            x='Count',
            y=label,
            orientation='h',
            title=title,
            color='Count',
            color_continuous_scale=color_scale,
        )
    else:
        fig = px.bar(
            frame,
            x=label,
            y='Count',
            title=title,
            color='Count',
            color_continuous_scale=color_scale,
        )
        # Keep the labels discrete (e.g. years) instead of a continuous axis.
        fig.update_xaxes(type='category')
    fig.update_layout(height=400, showlegend=False)
    st.plotly_chart(fig, use_container_width=True)


def display_chunk_statistics_charts(stats: Dict[str, Any], title: str = "Retrieval Statistics"):
    """Display retrieval statistics as a metric header plus three bar charts.

    Args:
        stats: Statistics dict as produced by ``extract_chunk_statistics``.
            Nothing is rendered when it is empty or reports zero chunks.
        title: Heading shown above the metrics.
    """
    if not stats or stats.get('total_chunks', 0) == 0:
        return

    import html  # local import: only needed to escape values placed in raw HTML

    metrics = (
        ("Total Chunks", 'total_chunks'),
        ("Unique Sources", 'unique_sources'),
        ("Unique Years", 'unique_years'),
        ("Unique Files", 'unique_filenames'),
    )
    metric_cells = "".join(
        '<div class="metric-container">'
        f'<div class="metric-label">{label}</div>'
        f'<div class="metric-value">{html.escape(str(stats.get(key, 0)))}</div>'
        '</div>'
        for label, key in metrics
    )

    # The styled container is opened AND closed in this single call: every
    # st.markdown call is rendered as its own element, so a <div> left open
    # here cannot wrap the charts below and a later '</div>' would be stray.
    st.markdown(
        '<div class="retrieval-distribution-container">'
        f'<h3 style="margin-top: 0;">📊 {html.escape(title)}</h3>'
        '<div style="display: flex; justify-content: space-around; align-items: center; '
        'padding: 15px 0; border-bottom: 1px solid #e0e0e0; margin-bottom: 20px;">'
        f'{metric_cells}'
        '</div>'
        '</div>',
        unsafe_allow_html=True,
    )

    # Charts - three columns to include Districts
    col1, col2, col3 = st.columns(3)

    with col1:
        source_dist = stats.get('source_distribution')
        if source_dist:
            _render_count_bar(
                list(source_dist.items()), 'Source', 'Distribution by Source', 'viridis'
            )

    with col2:
        year_dist = stats.get('year_distribution')
        if year_dist:
            # 'Unknown' is not plottable on a year axis; sort the rest numerically.
            year_rows = sorted(
                ((k, v) for k, v in year_dist.items() if k != 'Unknown'),
                key=lambda item: int(item[0]),
            )
            if year_rows:
                _render_count_bar(
                    year_rows, 'Year', 'Distribution by Year', 'plasma', horizontal=False
                )
            else:
                st.info("No valid years found in the results")

    with col3:
        district_dist = stats.get('district_distribution')
        if district_dist:
            district_rows = sorted(
                ((k, v) for k, v in district_dist.items() if k != 'Unknown'),
                key=lambda item: item[1],
                reverse=True,
            )
            if district_rows:
                _render_count_bar(
                    district_rows, 'District', 'Distribution by District', 'blues'
                )
            else:
                st.info("No valid districts found in the results")
479
+
480
def _show_count_table(rows, label, empty_message):
    """Render ``(label, count)`` rows as a two-column table, or *empty_message* if there are none."""
    if not rows:
        st.write(empty_message)
        return
    frame = pd.DataFrame(rows, columns=[label, 'Count'])
    st.dataframe(frame, hide_index=True, use_container_width=True)


def display_chunk_statistics_table(stats: Dict[str, Any], title: str = "Retrieval Distribution"):
    """Display retrieval statistics as four side-by-side tables.

    The columns show districts, sources, years and the most frequent files.
    ``'Unknown'`` entries are omitted from the district and year tables.

    Args:
        stats: Statistics dict as produced by ``extract_chunk_statistics``.
            Nothing is rendered when it is empty or reports zero chunks.
        title: Sub-header shown above the tables.
    """
    if not stats or stats.get('total_chunks', 0) == 0:
        return

    max_files = 5        # only the most frequent files are listed
    max_name_length = 30  # longer file names are truncated with "..."

    def by_count(item):
        return item[1]

    st.subheader(f"📊 {title}")

    # No raw-HTML wrapper here: each st.markdown call is an isolated element,
    # so a '<div>' / '</div>' pair split across calls cannot enclose widgets.
    with st.container():
        # Create 4 equal columns for consistent alignment
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.markdown("**🏘️ Districts**")
            district_dist = stats.get('district_distribution') or {}
            district_rows = sorted(
                ((k, v) for k, v in district_dist.items() if k != 'Unknown'),
                key=by_count,
                reverse=True,
            )
            _show_count_table(district_rows, "District", "No district data")

        with col2:
            st.markdown("**📂 Sources**")
            source_dist = stats.get('source_distribution') or {}
            source_rows = sorted(source_dist.items(), key=by_count, reverse=True)
            _show_count_table(source_rows, "Source", "No source data")

        with col3:
            st.markdown("**📅 Years**")
            year_dist = stats.get('year_distribution') or {}
            # Sort numerically, but keep the year as a string for display.
            year_rows = sorted(
                ((k, v) for k, v in year_dist.items() if k != 'Unknown'),
                key=lambda item: int(item[0]),
            )
            _show_count_table(year_rows, "Year", "No year data")

        with col4:
            st.markdown("**📄 Files**")
            filename_dist = stats.get('filename_distribution') or {}
            top_files = sorted(filename_dist.items(), key=by_count, reverse=True)[:max_files]
            file_rows = []
            for name, count in top_files:
                name = str(name)  # metadata values are not guaranteed to be strings
                if len(name) > max_name_length:
                    name = name[:max_name_length] + "..."
                file_rows.append((name, count))
            _show_count_table(file_rows, "File", "No file data")
562
+
563
@st.cache_data
def load_filter_options():
    """Load the sidebar filter options from ``src/config/filter_options.json``.

    Returns:
        The parsed JSON content. When the file is missing or is not valid
        JSON, an error is shown in the UI and a dict with empty ``sources``,
        ``years``, ``districts`` and ``filenames`` lists is returned, so the
        app can still start with no filters available.
    """
    filter_options_path = PROJECT_DIR / "src" / "config" / "filter_options.json"
    empty_options = {"sources": [], "years": [], "districts": [], 'filenames': []}
    try:
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(filter_options_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        st.info(f"Looking for filter_options.json in: {PROJECT_DIR / 'src' / 'config'}")
        st.error("filter_options.json not found. Please run the metadata analysis script.")
        return empty_options
    except json.JSONDecodeError as exc:
        # A truncated or hand-edited file gets the same fallback as a missing one.
        st.error(f"filter_options.json is not valid JSON ({exc}). Please re-run the metadata analysis script.")
        return empty_options
573
 
 
600
  st.session_state.reset_conversation = False
601
  st.rerun()
602
 
603
+ # Header - centered
604
+ st.markdown('<h1 class="main-header">πŸ€– Intelligent Audit Report Chatbot</h1>', unsafe_allow_html=True)
 
 
 
 
 
 
 
 
605
  st.markdown('<p class="subtitle">Ask questions about audit reports. Use the sidebar filters to narrow down your search!</p>', unsafe_allow_html=True)
606
 
607
  # Session info
 
618
 
619
  # Sidebar for filters
620
  with st.sidebar:
621
+ # Instructions section (collapsible)
622
+ with st.expander("πŸ“– How to Use", expanded=False):
623
+ st.markdown("""
624
+ #### 🎯 Using Filters
625
+
626
+ 1. **Select filters** from the sidebar to narrow your search:
627
+
628
+ 2. **Leave filters empty** to search across all data
629
+
630
+ 3. **Type your question** in the chat and click "Send"
631
+
632
+ 4. **Choose sample questions from the bottom of the page**
633
+
634
+ #### πŸ’‘ Tips
635
+
636
+ - Use specific questions for better results
637
+ - Combine multiple filters for precise searches
638
+ - Check the "Retrieved Documents" tab to get various insights
639
+
640
+ #### πŸ’¬ Feedback Section
641
+
642
+ - Rate your experience (1-5 stars)
643
+ - Provide optional text feedback
644
+ - Located at the bottom of the page
645
+
646
+ #### ⚠️ Important
647
+
648
+ **When finished, please close the browser window** to free up computational resources.
649
+
650
+ ---
651
+
652
+ For more detailed help, see the example questions at the bottom of the page.
653
+ """)
654
+
655
  st.markdown("### πŸ” Search Filters")
656
  st.markdown("Select filters to narrow down your search. Leave empty to search all data.")
657
 
 
666
  help="Choose specific reports to search. When enabled, all other filters are ignored."
667
  )
668
  st.markdown('</div>', unsafe_allow_html=True)
669
+
670
+ st.markdown('---')
671
 
672
  # Determine if filename filter is active
673
  filename_mode = len(selected_filenames) > 0
674
  # Sources filter
675
+ # st.markdown('<div class="filter-section">', unsafe_allow_html=True)
676
  st.markdown('<div class="filter-title">πŸ“Š Sources</div>', unsafe_allow_html=True)
677
  selected_sources = st.multiselect(
678
  "Select sources:",
 
685
  st.markdown('</div>', unsafe_allow_html=True)
686
 
687
  # Years filter
688
+ # st.markdown('<div class="filter-section">', unsafe_allow_html=True)
689
  st.markdown('<div class="filter-title">πŸ“… Years</div>', unsafe_allow_html=True)
690
  selected_years = st.multiselect(
691
  "Select years:",
 
698
  st.markdown('</div>', unsafe_allow_html=True)
699
 
700
  # Districts filter
701
+ # st.markdown('<div class="filter-section">', unsafe_allow_html=True)
702
  st.markdown('<div class="filter-title">🏘️ Districts</div>', unsafe_allow_html=True)
703
  selected_districts = st.multiselect(
704
  "Select districts:",
 
749
  if 'input_counter' not in st.session_state:
750
  st.session_state.input_counter = 0
751
 
752
+ # Handle pending question from example questions section
753
+ if 'pending_question' in st.session_state and st.session_state.pending_question:
754
+ default_value = st.session_state.pending_question
755
+ # Increment counter to force new input widget
756
+ st.session_state.input_counter = (st.session_state.get('input_counter', 0) + 1) % 1000
757
+ del st.session_state.pending_question
758
+ key_suffix = st.session_state.input_counter
759
+ else:
760
+ default_value = ""
761
+ key_suffix = st.session_state.input_counter
762
+
763
  user_input = st.text_input(
764
  "Type your message here...",
765
  placeholder="Ask about budget allocations, expenditures, or audit findings...",
766
+ key=f"user_input_{key_suffix}",
767
+ label_visibility="collapsed",
768
+ value=default_value if default_value else None
769
  )
770
+
771
+ # Use JavaScript to specifically target and hide "Press Enter to apply" message
772
+ st.markdown("""
773
+ <script>
774
+ (function() {
775
+ // Hide InputInstructions element (contains "Press Enter to apply")
776
+ function hideInputInstructions() {
777
+ // Target the specific Streamlit element
778
+ const instructions = document.querySelector('[data-testid="InputInstructions"]');
779
+ if (instructions) {
780
+ instructions.style.display = 'none';
781
+ instructions.style.visibility = 'hidden';
782
+ instructions.style.opacity = '0';
783
+ instructions.style.height = '0';
784
+ instructions.style.width = '0';
785
+ instructions.style.overflow = 'hidden';
786
+ instructions.style.position = 'absolute';
787
+ instructions.style.left = '-9999px';
788
+ }
789
+
790
+ // Also search for any text containing "Press Enter" or "apply" inside input containers
791
+ const allElements = document.querySelectorAll('*');
792
+ allElements.forEach(el => {
793
+ const text = el.textContent || el.innerText || '';
794
+ if ((text.toLowerCase().includes('press enter') ||
795
+ text.toLowerCase().includes('enter to') ||
796
+ text.toLowerCase().includes('to apply')) &&
797
+ (el.tagName === 'SPAN' || el.tagName === 'DIV' || el.tagName === 'SMALL')) {
798
+ const style = window.getComputedStyle(el);
799
+ const fontSize = parseFloat(style.fontSize);
800
+ // Hide if it's small text (likely help text)
801
+ if (fontSize < 14 || el.hasAttribute('data-testid')) {
802
+ el.style.display = 'none';
803
+ el.style.visibility = 'hidden';
804
+ el.style.height = '0';
805
+ el.style.overflow = 'hidden';
806
+ }
807
+ }
808
+ });
809
+ }
810
+
811
+ // Run immediately and after delays to catch dynamic elements
812
+ hideInputInstructions();
813
+ setTimeout(hideInputInstructions, 50);
814
+ setTimeout(hideInputInstructions, 100);
815
+ setTimeout(hideInputInstructions, 500);
816
+
817
+ // Observe for new elements added by Streamlit
818
+ const observer = new MutationObserver(function(mutations) {
819
+ hideInputInstructions();
820
+ });
821
+ observer.observe(document.body, { childList: true, subtree: true, attributes: true });
822
+ })();
823
+ </script>
824
+ """, unsafe_allow_html=True)
825
+
826
+ # # Show custom help text below input - this replaces the default "Press Enter" message
827
+ # st.markdown(
828
+ # "<div class='input-help-text'>πŸ’‘ Press the <strong>Send</strong> button to submit your question</div>",
829
+ # unsafe_allow_html=True
830
+ # )
831
 
832
  with col2:
833
  send_button = st.button("Send", key="send_button", use_container_width=True)
 
836
  if st.button("πŸ—‘οΈ Clear Chat", key="clear_chat_button"):
837
  st.session_state.reset_conversation = True
838
  # Clear all conversation files
839
+ conversations_path = CONVERSATIONS_DIR
840
+ if conversations_path.exists():
841
+ for file in conversations_path.iterdir():
842
+ if file.suffix == '.json':
843
+ file.unlink()
 
844
  st.rerun()
845
 
846
  # Handle user input
 
930
  # Count unique filenames
931
  unique_filenames = set()
932
  for doc in sources:
933
+ metadata = getattr(doc, 'metadata', {})
934
+ filename = metadata.get('filename', 'Unknown')
935
  unique_filenames.add(filename)
936
 
937
+ st.markdown(f"**Found {len(sources)} document chunks from {len(unique_filenames)} unique documents (showing top 20):**")
938
  if len(unique_filenames) < len(sources):
939
  st.info(f"πŸ’‘ **Note**: Each document is split into multiple chunks. You're seeing {len(sources)} chunks from {len(unique_filenames)} documents.")
940
 
941
+ # Extract and display statistics
942
+ stats = extract_chunk_statistics(sources)
943
+
944
+ # Show charts for 10+ results, tables for fewer
945
+ if len(sources) >= 10:
946
+ display_chunk_statistics_charts(stats, "Retrieval Statistics")
947
+ # Also show tables below charts for detailed view
948
+ st.markdown("---")
949
+ display_chunk_statistics_table(stats, "Retrieval Distribution")
950
+ else:
951
+ display_chunk_statistics_table(stats, "Retrieval Distribution")
952
+
953
+ st.markdown("---")
954
+ st.markdown("### πŸ“„ Document Details")
955
+
956
+ for i, doc in enumerate(sources): # Show all documents
957
  # Get relevance score and ID if available
958
  metadata = getattr(doc, 'metadata', {})
959
  score = metadata.get('reranked_score', metadata.get('original_score', None))
 
986
  st.info("No documents were retrieved for the last query.")
987
  else:
988
  st.info("No documents have been retrieved yet. Start a conversation to see retrieved documents here.")
989
+
990
+ # Display retrieval history stats
991
+ st.markdown("---")
992
+ if st.session_state.rag_retrieval_history:
993
+ st.markdown("#### πŸ“Š Retrieval History")
994
+ st.markdown(f"This conversation has **{len(st.session_state.rag_retrieval_history)}** retrieval entries.")
995
+
996
+ with st.expander(f"View {len(st.session_state.rag_retrieval_history)} retrieval entries", expanded=False):
997
+ for idx, entry in enumerate(st.session_state.rag_retrieval_history, 1):
998
+ with st.expander(f"Entry {idx}: {entry.get('rag_query_expansion', 'N/A')[:50]}...", expanded=False):
999
+ st.markdown(f"**Query:** {entry.get('rag_query_expansion', 'N/A')}")
1000
+ st.markdown(f"**Documents Retrieved:** {len(entry.get('docs_retrieved', []))}")
1001
+
1002
+ # Show conversation up to this point
1003
+ conversation = entry.get('conversation_up_to', [])
1004
+ if conversation:
1005
+ st.markdown("**Conversation Context:**")
1006
+ for msg in conversation[-3:]: # Show last 3 messages
1007
+ role = msg.get('type', 'unknown')
1008
+ content = msg.get('content', '')[:200] + "..." if len(msg.get('content', '')) > 200 else msg.get('content', '')
1009
+ if role == 'human':
1010
+ st.markdown(f"- **You:** {content}")
1011
+ elif role == 'ai':
1012
+ st.markdown(f"- **Bot:** {content}")
1013
+
1014
+ # Show retrieved documents summary
1015
+ docs = entry.get('docs_retrieved', [])
1016
+ if docs:
1017
+ st.markdown("**Retrieved Documents:**")
1018
+ for doc_idx, doc in enumerate(docs[:5], 1): # Show first 5
1019
+ doc_meta = doc.get('metadata', {})
1020
+ filename = doc_meta.get('filename', 'Unknown')[:50]
1021
+ st.markdown(f"{doc_idx}. {filename}")
1022
+ if len(docs) > 5:
1023
+ st.markdown(f"... and {len(docs) - 5} more documents")
1024
+ else:
1025
+ st.markdown("---")
1026
+ st.info("📊 Retrieval history will appear here after you start asking questions.")
1027
 
1028
  # Feedback Dashboard Section
1029
  st.markdown("---")
 
1043
  if 'feedback_submitted' not in st.session_state:
1044
  st.session_state.feedback_submitted = False
1045
 
1046
+ # Feedback form - only show if feedback not already submitted
1047
+ if not st.session_state.feedback_submitted:
1048
+ with st.form("feedback_form", clear_on_submit=False):
1049
+ col1, col2 = st.columns([1, 1])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1050
 
1051
+ with col1:
1052
+ feedback_score = st.slider(
1053
+ "Rate this conversation (1-5)",
1054
+ min_value=1,
1055
+ max_value=5,
1056
+ help="How satisfied are you with the conversation?"
1057
+ )
 
 
 
 
 
1058
 
1059
+ with col2:
1060
+ is_feedback_about_last_retrieval = st.checkbox(
1061
+ "Feedback about last retrieval only",
1062
+ value=True,
1063
+ help="If checked, feedback applies to the most recent document retrieval"
1064
+ )
1065
 
1066
+ open_ended_feedback = st.text_area(
1067
+ "Your feedback (optional)",
1068
+ placeholder="Tell us what went well or what could be improved...",
1069
+ height=100
1070
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1071
 
1072
+ # Disable submit if no score selected
1073
+ submit_disabled = feedback_score is None
 
 
 
 
 
 
 
 
1074
 
1075
+ submitted = st.form_submit_button(
1076
+ "πŸ“€ Submit Feedback",
1077
+ use_container_width=True,
1078
+ disabled=submit_disabled
1079
+ )
1080
 
1081
+ if submitted:
1082
+ # Log the feedback data being submitted
1083
+ print("=" * 80)
1084
+ print("πŸ”„ FEEDBACK SUBMISSION: Starting...")
1085
+ print("=" * 80)
1086
+ st.write("πŸ” **Debug: Feedback Data Being Submitted:**")
 
 
1087
 
1088
+ # Create feedback data dictionary
1089
+ feedback_dict = {
1090
+ "open_ended_feedback": open_ended_feedback,
1091
+ "score": feedback_score,
1092
+ "is_feedback_about_last_retrieval": is_feedback_about_last_retrieval,
1093
+ "retrieved_data": st.session_state.rag_retrieval_history.copy() if st.session_state.rag_retrieval_history else [],
1094
+ "conversation_id": st.session_state.conversation_id,
1095
+ "timestamp": time.time(),
1096
+ "message_count": len(st.session_state.messages),
1097
+ "has_retrievals": has_retrievals,
1098
+ "retrieval_count": len(st.session_state.rag_retrieval_history)
1099
+ }
1100
 
1101
+ print(f"πŸ“ FEEDBACK SUBMISSION: Score={feedback_score}, Retrievals={len(st.session_state.rag_retrieval_history) if st.session_state.rag_retrieval_history else 0}")
 
 
1102
 
1103
+ # Create UserFeedback dataclass instance
1104
+ feedback_obj = None # Initialize outside try block
1105
  try:
1106
+ feedback_obj = create_feedback_from_dict(feedback_dict)
1107
+ print(f"βœ… FEEDBACK SUBMISSION: Feedback object created - ID={feedback_obj.feedback_id}")
1108
+ st.write(f"✅ **Feedback Object Created**")
1109
+ st.write(f"- Feedback ID: {feedback_obj.feedback_id}")
1110
+ st.write(f"- Score: {feedback_obj.score}/5")
1111
+ st.write(f"- Has Retrievals: {feedback_obj.has_retrievals}")
1112
 
1113
+ # Convert back to dict for JSON serialization
1114
+ feedback_data = feedback_obj.to_dict()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1115
  except Exception as e:
1116
+ print(f"❌ FEEDBACK SUBMISSION: Failed to create feedback object: {e}")
1117
+ st.error(f"Failed to create feedback object: {e}")
1118
+ feedback_data = feedback_dict
1119
 
1120
+ # Display the data being submitted
1121
+ st.json(feedback_data)
1122
 
1123
+ # Save feedback to file - use PROJECT_DIR to ensure writability
1124
+ feedback_dir = FEEDBACK_DIR
1125
+ try:
1126
+ # Ensure directory exists with write permissions (777 for compatibility)
1127
+ feedback_dir.mkdir(parents=True, mode=0o777, exist_ok=True)
1128
+ except (PermissionError, OSError) as e:
1129
+ logger.warning(f"Could not create feedback directory at {feedback_dir}: {e}")
1130
+ # Fallback to relative path
1131
+ feedback_dir = Path("feedback")
1132
+ feedback_dir.mkdir(parents=True, mode=0o777, exist_ok=True)
1133
 
1134
+ feedback_file = feedback_dir / f"feedback_{st.session_state.conversation_id}_{int(time.time())}.json"
 
1135
 
1136
+ try:
1137
+ # Ensure parent directory exists before writing
1138
+ feedback_file.parent.mkdir(parents=True, mode=0o777, exist_ok=True)
1139
+
1140
+ # Save to local file
1141
+ print(f"💾 FEEDBACK SAVE: Saving to local file: {feedback_file}")
1142
+ with open(feedback_file, 'w') as f:
1143
+ json.dump(feedback_data, f, indent=2, default=str)
1144
+
1145
+ print(f"βœ… FEEDBACK SAVE: Local file saved successfully")
1146
+ st.success("✅ Thank you for your feedback! It has been saved locally.")
1147
+ st.balloons()
1148
+
1149
+ # Save to Snowflake if enabled and credentials available
1150
+ logger.info("πŸ”„ FEEDBACK SAVE: Starting Snowflake save process...")
1151
+ logger.info(f"πŸ“Š FEEDBACK SAVE: feedback_obj={'exists' if feedback_obj else 'None'}")
1152
+
1153
+ try:
1154
+ snowflake_enabled = os.getenv("SNOWFLAKE_ENABLED", "false").lower() == "true"
1155
+ logger.info(f"πŸ” SNOWFLAKE CHECK: enabled={snowflake_enabled}")
1156
+
1157
+ if snowflake_enabled:
1158
+ if feedback_obj:
1159
+ try:
1160
+ logger.info("πŸ“€ SNOWFLAKE UI: Attempting to save feedback to Snowflake...")
1161
+ print("πŸ“€ SNOWFLAKE UI: Attempting to save feedback to Snowflake...")
1162
+
1163
+ if save_to_snowflake(feedback_obj):
1164
+ logger.info("βœ… SNOWFLAKE UI: Successfully saved to Snowflake")
1165
+ print("βœ… SNOWFLAKE UI: Successfully saved to Snowflake")
1166
+ st.success("βœ… Feedback also saved to Snowflake!")
1167
+ else:
1168
+ logger.warning("⚠️ SNOWFLAKE UI: Save failed")
1169
+ print("⚠️ SNOWFLAKE UI: Save failed")
1170
+ st.warning("⚠️ Snowflake save failed, but local save succeeded")
1171
+ except Exception as e:
1172
+ logger.error(f"❌ SNOWFLAKE UI ERROR: {e}")
1173
+ print(f"❌ SNOWFLAKE UI ERROR: {e}")
1174
+ traceback.print_exc()
1175
+ st.warning(f"⚠️ Could not save to Snowflake: {e}")
1176
+ else:
1177
+ logger.warning("⚠️ SNOWFLAKE UI: Skipping (feedback object not created)")
1178
+ print("⚠️ SNOWFLAKE UI: Skipping (feedback object not created)")
1179
+ st.warning("⚠️ Skipping Snowflake save (feedback object not created)")
1180
+ else:
1181
+ logger.info("πŸ’‘ SNOWFLAKE UI: Integration disabled")
1182
+ print("πŸ’‘ SNOWFLAKE UI: Integration disabled")
1183
+ st.info("💡 Snowflake integration disabled (set SNOWFLAKE_ENABLED=true to enable)")
1184
+ except NameError as e:
1185
+ traceback.print_exc()
1186
+ logger.error(f"❌ NameError in Snowflake save: {e}")
1187
+ print(f"❌ NameError in Snowflake save: {e}")
1188
+ st.warning(f"⚠️ Snowflake save error: {e}")
1189
+ except Exception as e:
1190
+ logger.error(f"❌ Exception in Snowflake save: {type(e).__name__}: {e}")
1191
+ print(f"❌ Exception in Snowflake save: {type(e).__name__}: {e}")
1192
+ st.warning(f"⚠️ Snowflake save error: {e}")
1193
+
1194
+ # Mark feedback as submitted to prevent resubmission
1195
+ st.session_state.feedback_submitted = True
1196
+
1197
+ print("=" * 80)
1198
+ print(f"βœ… FEEDBACK SUBMISSION: Completed successfully")
1199
+ print("=" * 80)
1200
+
1201
+ # Log file location
1202
+ st.info(f"πŸ“ Feedback saved to: {feedback_file}")
1203
+
1204
+ except Exception as e:
1205
+ print(f"❌ FEEDBACK SUBMISSION: Error saving feedback: {e}")
1206
+ print(f"❌ FEEDBACK SUBMISSION: Error type: {type(e).__name__}")
1207
+ traceback.print_exc()
1208
+ st.error(f"❌ Error saving feedback: {e}")
1209
+ st.write(f"Debug error: {str(e)}")
1210
+ else:
1211
+ # Feedback already submitted - show success message and reset option
1212
+ st.success("✅ Feedback already submitted for this conversation!")
1213
+ col1, col2 = st.columns([1, 1])
1214
+ with col1:
1215
+ if st.button("🔄 Submit New Feedback", key="new_feedback_button", use_container_width=True):
1216
+ try:
1217
+ st.session_state.feedback_submitted = False
1218
+ st.rerun()
1219
+ except Exception as e:
1220
+ # Handle any Streamlit API exceptions gracefully
1221
+ logger.error(f"Error resetting feedback state: {e}")
1222
+ st.error(f"Error resetting feedback. Please refresh the page.")
1223
+ with col2:
1224
+ if st.button("πŸ“‹ View Conversation", key="view_conversation_button", use_container_width=True):
1225
+ # Scroll to conversation - this is handled by the auto-scroll at bottom
1226
+ pass
1227
 
1228
+ # Example Questions Section
1229
+ st.markdown("---")
1230
+ st.markdown(
1231
+ "<h3 class='example-questions-header'>💡 Example Questions</h3>",
1232
+ unsafe_allow_html=True
1233
+ )
1234
+ st.markdown(
1235
+ "<p class='example-questions-description'>Click on any question below to use it, or modify the editable examples:</p>",
1236
+ unsafe_allow_html=True
1237
+ )
1238
+
1239
+ # Initialize example question state
1240
+ if 'custom_question_1' not in st.session_state:
1241
+ st.session_state.custom_question_1 = "How were administrative costs managed in the PDM implementation, and what issues arose with budget execution regarding staff salaries?"
1242
+ if 'custom_question_2' not in st.session_state:
1243
+ st.session_state.custom_question_2 = "What did the National Coordinator say about the release of funds for PDM administrative costs in the letter dated 29th September 2022 and how did the funding received affect the activities of the PDCs and PDM SACCOs in the FY 2022/23?"
1244
+
1245
+ # Question 1: Filename insights (fixed, clickable)
1246
+ st.markdown("#### πŸ“„ Question 1: List insights from a specific file")
1247
+ col1, col2 = st.columns([3, 1])
1248
+ with col1:
1249
+ example_q1 = "List couple of insights from the filename."
1250
+ st.markdown(f"**Example:** `{example_q1}`")
1251
+ st.info("πŸ’‘ **Filter to apply:** Select a Filename from the sidebar panel before asking this question.")
1252
+ with col2:
1253
+ if st.button("📋 Use This Question", key="use_example_1", use_container_width=True):
1254
+ st.session_state.pending_question = example_q1
1255
+ st.session_state.input_counter = (st.session_state.get('input_counter', 0) + 1) % 1000
1256
+ st.rerun()
1257
+
1258
+ st.markdown("---")
1259
+
1260
+ # Questions 2 & 3: Editable examples (collapsible, side by side)
1261
+ with st.expander("#### ✏️ Customizable Questions (Edit and use)", expanded=False):
1262
+ # Place questions side by side
1263
+ col1, col2 = st.columns(2)
1264
 
1265
+ # Question 2
1266
+ with col1:
1267
+ st.markdown("**Question 2:**")
1268
+ custom_q1 = st.text_area(
1269
+ "Edit question 2:",
1270
+ value=st.session_state.custom_question_1,
1271
+ height=100,
1272
+ key="edit_question_2",
1273
+ help="Modify this question to fit your needs, then click 'Use This Question'",
1274
+ label_visibility="collapsed"
1275
+ )
1276
+ if st.button("πŸ“‹ Use Question 2", key="use_custom_1", use_container_width=True):
1277
+ if custom_q1.strip():
1278
+ st.session_state.pending_question = custom_q1.strip()
1279
+ st.session_state.custom_question_1 = custom_q1.strip()
1280
+ st.session_state.input_counter = (st.session_state.get('input_counter', 0) + 1) % 1000
1281
+ st.rerun()
1282
+ else:
1283
+ st.warning("Please enter a question first!")
1284
+ st.caption("πŸ’‘ Tip: Add specific details like dates, names, or amounts to get more precise answers")
1285
+ st.info("πŸ’‘ **Filter to apply:** Select District(s) and Year(s) from sidebar panel")
1286
+
1287
+ # Question 3
1288
+ with col2:
1289
+ st.markdown("**Question 3:**")
1290
+ custom_q2 = st.text_area(
1291
+ "Edit question 3:",
1292
+ value=st.session_state.custom_question_2,
1293
+ height=100,
1294
+ key="edit_question_3",
1295
+ help="Modify this question to fit your needs, then click 'Use This Question'",
1296
+ label_visibility="collapsed"
1297
+ )
1298
+ if st.button("πŸ“‹ Use Question 3", key="use_custom_2", use_container_width=True):
1299
+ if custom_q2.strip():
1300
+ st.session_state.pending_question = custom_q2.strip()
1301
+ st.session_state.custom_question_2 = custom_q2.strip()
1302
+ st.session_state.input_counter = (st.session_state.get('input_counter', 0) + 1) % 1000
1303
+ st.rerun()
1304
+ else:
1305
+ st.warning("Please enter a question first!")
1306
+ st.caption("πŸ’‘ Tip: Use specific terms from the documents (e.g., 'PDM', 'SACCOs', 'FY 2022/23')")
1307
+
1308
+
1309
+ # Store selected question for next render (handled in input section above)
1310
+ # This ensures the question populates the input field correctly
1311
 
1312
  # Auto-scroll to bottom
1313
  st.markdown("""
multi_agent_chatbot.py CHANGED
@@ -8,24 +8,26 @@ This system implements a 3-agent architecture:
8
 
9
  Each agent has specialized prompts and responsibilities.
10
  """
 
11
  import json
12
  import time
13
  import logging
 
14
  from pathlib import Path
15
  from datetime import datetime
16
  from dataclasses import dataclass
17
  from typing import Dict, List, Any, Optional, TypedDict
18
 
19
-
20
  from langchain_core.tools import tool
21
  from langgraph.graph import StateGraph, END
22
- from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
23
  from langchain_core.prompts import ChatPromptTemplate
 
24
 
25
 
26
  from src.pipeline import PipelineManager
27
- from src.config.loader import load_config
28
  from src.llm.adapters import get_llm_client
 
 
29
 
30
 
31
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -46,6 +48,7 @@ class QueryContext:
46
  needs_follow_up: bool = False
47
  follow_up_question: Optional[str] = None
48
 
 
49
  class MultiAgentState(TypedDict):
50
  """State for the multi-agent conversation flow"""
51
  conversation_id: str
@@ -61,6 +64,7 @@ class MultiAgentState(TypedDict):
61
  session_start_time: float
62
  last_ai_message_time: float
63
 
 
64
  class MultiAgentRAGChatbot:
65
  """Multi-agent RAG chatbot with specialized agents"""
66
 
@@ -112,7 +116,6 @@ class MultiAgentRAGChatbot:
112
  logger.info("βœ… Pipeline manager initialized and models loaded")
113
  except Exception as e:
114
  logger.error(f"❌ Failed to initialize pipeline manager: {e}")
115
- import traceback
116
  traceback.print_exc()
117
  raise RuntimeError(f"Pipeline manager initialization failed: {e}")
118
 
@@ -129,7 +132,6 @@ class MultiAgentRAGChatbot:
129
  raise # Re-raise RuntimeError as-is
130
  except Exception as e:
131
  logger.error(f"❌ Error during vector store connection: {e}")
132
- import traceback
133
  traceback.print_exc()
134
  raise RuntimeError(f"Vector store connection failed: {e}")
135
 
@@ -139,8 +141,8 @@ class MultiAgentRAGChatbot:
139
  # Build the multi-agent graph
140
  self.graph = self._build_graph()
141
 
142
- # Conversations directory - use absolute path in /app to ensure writability
143
- self.conversations_dir = Path("/app/conversations")
144
  try:
145
  # Use 777 permissions for maximum compatibility (HF Spaces runs as different user)
146
  self.conversations_dir.mkdir(parents=True, mode=0o777, exist_ok=True)
@@ -158,9 +160,9 @@ class MultiAgentRAGChatbot:
158
 
159
  def _load_dynamic_data(self):
160
  """Load dynamic data from filter_options.json and add_district_metadata.py"""
161
- # Load filter options
162
  try:
163
- fo = Path("src/config/filter_options.json")
164
  if fo.exists():
165
  with open(fo) as f:
166
  data = json.load(f)
@@ -178,7 +180,7 @@ class MultiAgentRAGChatbot:
178
  self.source_whitelist = ['Consolidated', 'Local Government', 'Ministry, Department and Agency']
179
  self.district_whitelist = ['Kampala', 'Gulu', 'Kalangala']
180
 
181
- # Enrich district list from add_district_metadata.py
182
  try:
183
  from add_district_metadata import DistrictMetadataProcessor
184
  proc = DistrictMetadataProcessor()
@@ -590,7 +592,6 @@ Analyze this query using ONLY the exact values provided above:""")
590
  # Clean and parse JSON with better error handling
591
  try:
592
  # Remove comments (// and /* */) from JSON
593
- import re
594
  # Remove single-line comments
595
  content = re.sub(r'//.*?$', '', content, flags=re.MULTILINE)
596
  # Remove multi-line comments
@@ -603,7 +604,6 @@ Analyze this query using ONLY the exact values provided above:""")
603
  logger.error(f"❌ Raw content: {content[:200]}...")
604
 
605
  # Try to extract JSON from text if embedded
606
- import re
607
  json_match = re.search(r'\{.*\}', content, re.DOTALL)
608
  if json_match:
609
  try:
@@ -1178,7 +1178,6 @@ Generate a conversational response based on your knowledge:""")
1178
 
1179
  except Exception as e:
1180
  logger.error(f"Could not save conversation: {e}")
1181
- import traceback
1182
  logger.error(f"Traceback: {traceback.format_exc()}")
1183
 
1184
 
 
8
 
9
  Each agent has specialized prompts and responsibilities.
10
  """
11
+ import re
12
  import json
13
  import time
14
  import logging
15
+ import traceback
16
  from pathlib import Path
17
  from datetime import datetime
18
  from dataclasses import dataclass
19
  from typing import Dict, List, Any, Optional, TypedDict
20
 
 
21
  from langchain_core.tools import tool
22
  from langgraph.graph import StateGraph, END
 
23
  from langchain_core.prompts import ChatPromptTemplate
24
+ from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
25
 
26
 
27
  from src.pipeline import PipelineManager
 
28
  from src.llm.adapters import get_llm_client
29
+ from src.config.paths import PROJECT_DIR, CONVERSATIONS_DIR
30
+ from src.config.loader import load_config, get_embedding_model_for_collection
31
 
32
 
33
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
48
  needs_follow_up: bool = False
49
  follow_up_question: Optional[str] = None
50
 
51
+
52
  class MultiAgentState(TypedDict):
53
  """State for the multi-agent conversation flow"""
54
  conversation_id: str
 
64
  session_start_time: float
65
  last_ai_message_time: float
66
 
67
+
68
  class MultiAgentRAGChatbot:
69
  """Multi-agent RAG chatbot with specialized agents"""
70
 
 
116
  logger.info("βœ… Pipeline manager initialized and models loaded")
117
  except Exception as e:
118
  logger.error(f"❌ Failed to initialize pipeline manager: {e}")
 
119
  traceback.print_exc()
120
  raise RuntimeError(f"Pipeline manager initialization failed: {e}")
121
 
 
132
  raise # Re-raise RuntimeError as-is
133
  except Exception as e:
134
  logger.error(f"❌ Error during vector store connection: {e}")
 
135
  traceback.print_exc()
136
  raise RuntimeError(f"Vector store connection failed: {e}")
137
 
 
141
  # Build the multi-agent graph
142
  self.graph = self._build_graph()
143
 
144
+ # Conversations directory - use PROJECT_DIR for local vs deployed compatibility
145
+ self.conversations_dir = CONVERSATIONS_DIR
146
  try:
147
  # Use 777 permissions for maximum compatibility (HF Spaces runs as different user)
148
  self.conversations_dir.mkdir(parents=True, mode=0o777, exist_ok=True)
 
160
 
161
  def _load_dynamic_data(self):
162
  """Load dynamic data from filter_options.json and add_district_metadata.py"""
163
+ # Load filter options - use PROJECT_DIR relative path
164
  try:
165
+ fo = PROJECT_DIR / "src" / "config" / "filter_options.json"
166
  if fo.exists():
167
  with open(fo) as f:
168
  data = json.load(f)
 
180
  self.source_whitelist = ['Consolidated', 'Local Government', 'Ministry, Department and Agency']
181
  self.district_whitelist = ['Kampala', 'Gulu', 'Kalangala']
182
 
183
+ # Enrich district list from add_district_metadata.py (if available)
184
  try:
185
  from add_district_metadata import DistrictMetadataProcessor
186
  proc = DistrictMetadataProcessor()
 
592
  # Clean and parse JSON with better error handling
593
  try:
594
  # Remove comments (// and /* */) from JSON
 
595
  # Remove single-line comments
596
  content = re.sub(r'//.*?$', '', content, flags=re.MULTILINE)
597
  # Remove multi-line comments
 
604
  logger.error(f"❌ Raw content: {content[:200]}...")
605
 
606
  # Try to extract JSON from text if embedded
 
607
  json_match = re.search(r'\{.*\}', content, re.DOTALL)
608
  if json_match:
609
  try:
 
1178
 
1179
  except Exception as e:
1180
  logger.error(f"Could not save conversation: {e}")
 
1181
  logger.error(f"Traceback: {traceback.format_exc()}")
1182
 
1183
 
smart_chatbot.py CHANGED
@@ -26,6 +26,7 @@ from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
26
 
27
  from src.pipeline import PipelineManager
28
  from src.config.loader import load_config
 
29
 
30
 
31
  @dataclass
@@ -161,7 +162,7 @@ class IntelligentRAGChatbot:
161
 
162
  # Try to load district whitelist from filter_options.json
163
  try:
164
- fo = Path("filter_options.json")
165
  if fo.exists():
166
  with open(fo) as f:
167
  data = json.load(f)
@@ -174,7 +175,7 @@ class IntelligentRAGChatbot:
174
  except Exception:
175
  self.district_whitelist = self.available_metadata['districts']
176
 
177
- # Enrich whitelist from add_district_metadata.py if available
178
  try:
179
  from add_district_metadata import DistrictMetadataProcessor
180
  proc = DistrictMetadataProcessor()
@@ -195,7 +196,7 @@ class IntelligentRAGChatbot:
195
 
196
  # Get dynamic year list from filter_options.json
197
  try:
198
- fo = Path("filter_options.json")
199
  if fo.exists():
200
  with open(fo) as f:
201
  data = json.load(f)
 
26
 
27
  from src.pipeline import PipelineManager
28
  from src.config.loader import load_config
29
+ from src.config.paths import PROJECT_DIR
30
 
31
 
32
  @dataclass
 
162
 
163
  # Try to load district whitelist from filter_options.json
164
  try:
165
+ fo = PROJECT_DIR / "src" / "config" / "filter_options.json"
166
  if fo.exists():
167
  with open(fo) as f:
168
  data = json.load(f)
 
175
  except Exception:
176
  self.district_whitelist = self.available_metadata['districts']
177
 
178
+ # Enrich whitelist from add_district_metadata.py if available (optional module)
179
  try:
180
  from add_district_metadata import DistrictMetadataProcessor
181
  proc = DistrictMetadataProcessor()
 
196
 
197
  # Get dynamic year list from filter_options.json
198
  try:
199
+ fo = PROJECT_DIR / "src" / "config" / "filter_options.json"
200
  if fo.exists():
201
  with open(fo) as f:
202
  data = json.load(f)
src/config/paths.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Path configuration for local vs deployed environments.

This module handles different paths for local development vs deployed
(HF Spaces) environments. Every name below is always defined; values that
only make sense in a deployed environment are ``None`` when running locally.
"""
import os
from pathlib import Path


def find_project_root(start):
    """Return the nearest directory at or above *start* that contains ``src/config``.

    Args:
        start: Directory (``str`` or ``Path``) where the upward search begins.

    Returns:
        The first matching ``Path`` (checking *start* itself first), or
        ``None`` when neither *start* nor any of its ancestors has a
        ``src/config`` sub-directory.
    """
    start = Path(start)
    for candidate in (start, *start.parents):
        if (candidate / "src" / "config").exists():
            return candidate
    return None


# Determine if we're in a deployed environment (HF Spaces/Docker) or local.
# Any one of these signals is enough: an explicit DEPLOYED=true variable,
# an existing /app directory, the SPACES_ID variable, or Docker's marker file.
IS_DEPLOYED = (
    os.getenv("DEPLOYED", "false").lower() == "true"
    or os.path.exists("/app")
    or os.getenv("SPACES_ID") is not None
    or os.path.exists("/.dockerenv")
)

# PROJECT_DIR: Base directory for application files.
# Deployed: /app. Local: the project root (directory containing src/config).
if IS_DEPLOYED:
    PROJECT_DIR = Path("/app")
else:
    # Search upward from the working directory first. If the process was
    # started outside the repository, fall back to this file's own location,
    # and finally to the working directory itself, so that PROJECT_DIR never
    # silently degrades to the filesystem root.
    PROJECT_DIR = (
        find_project_root(Path.cwd())
        or find_project_root(Path(__file__).resolve().parent)
        or Path.cwd()
    )

# Cache directories - different for local vs deployed.
# Deployed: PROJECT_DIR/.cache. Local: None, so libraries use their defaults.
if IS_DEPLOYED:
    CACHE_DIR = PROJECT_DIR / ".cache"
    HF_CACHE_DIR = CACHE_DIR / "huggingface"
    STREAMLIT_CACHE_DIR = CACHE_DIR / "streamlit"
else:
    CACHE_DIR = None  # Defined in both branches so importing it never fails
    HF_CACHE_DIR = None  # Will use HF defaults (~/.cache/huggingface)
    STREAMLIT_CACHE_DIR = None  # Will use Streamlit defaults

# Application directories
FEEDBACK_DIR = PROJECT_DIR / "feedback"
CONVERSATIONS_DIR = PROJECT_DIR / "conversations"
STREAMLIT_CONFIG_DIR = PROJECT_DIR / ".streamlit"

# Print the resolved configuration when run as a script
if __name__ == "__main__":
    print(f"IS_DEPLOYED: {IS_DEPLOYED}")
    print(f"PROJECT_DIR: {PROJECT_DIR}")
    print(f"HF_CACHE_DIR: {HF_CACHE_DIR}")
    print(f"FEEDBACK_DIR: {FEEDBACK_DIR}")
    print(f"CONVERSATIONS_DIR: {CONVERSATIONS_DIR}")
src/pipeline.py CHANGED
@@ -1,5 +1,7 @@
1
  """Main pipeline orchestrator for the Audit QA system."""
 
2
  import time
 
3
  from pathlib import Path
4
  from dataclasses import dataclass
5
  from typing import Dict, Any, List, Optional
@@ -11,11 +13,21 @@ except ModuleNotFoundError as me:
11
  from langchain.schema import Document
12
 
13
  from .logging import log_error
14
- from .llm.adapters import LLMRegistry
15
  from .loader import chunks_to_documents
16
  from .vectorstore import VectorStoreManager
 
17
  from .retrieval.context import ContextRetriever
18
- from .config.loader import get_embedding_model_for_collection
 
 
 
 
 
 
 
 
 
19
 
20
 
21
 
@@ -41,12 +53,13 @@ class PipelineManager:
41
  """
42
  Initialize the pipeline manager.
43
  """
 
 
44
  self.config = config or {}
 
45
  self.vectorstore_manager = None
46
  self.context_retriever = None # Initialize as None
47
- self.llm_client = None
48
- self.report_service = None
49
- self.chunks = None
50
 
51
  # Initialize components
52
  self._initialize_components()
@@ -118,13 +131,7 @@ class PipelineManager:
118
  try:
119
  # Load config if not provided
120
  if not self.config:
121
- try:
122
- from src.config.loader import load_config
123
- self.config = load_config()
124
- except ImportError:
125
- # Try alternate import path
126
- from src.config.loader import load_config
127
- self.config = load_config()
128
 
129
  # Validate config structure
130
  if not isinstance(self.config, dict):
@@ -159,7 +166,6 @@ class PipelineManager:
159
  print("βœ… VectorStoreManager initialized successfully")
160
  except Exception as vs_error:
161
  print(f"❌ Error initializing VectorStoreManager: {vs_error}")
162
- import traceback
163
  traceback.print_exc()
164
  self.vectorstore_manager = None
165
  raise # Re-raise to be caught by outer try-except
@@ -175,40 +181,35 @@ class PipelineManager:
175
  except Exception as e:
176
  try:
177
  # Try direct instantiation with config
178
- from src.llm.adapters import get_llm_client
179
  self.llm_client = get_llm_client("openai", self.config)
180
  print("βœ… LLM CLIENT: Initialized using direct get_llm_client function with config")
181
  except Exception as e2:
182
  print(f"❌ LLM CLIENT: Registry methods failed - {e2}")
183
  # Try to create a simple LLM client directly
184
  try:
185
- from langchain_openai import ChatOpenAI
186
- import os
187
- api_key = os.getenv("OPENAI_API_KEY") or os.getenv("OPENROUTER_API_KEY")
188
- if api_key:
189
- self.llm_client = ChatOpenAI(
190
- model="gpt-3.5-turbo",
191
- api_key=api_key,
192
- temperature=0.1,
193
- max_tokens=1000
194
- )
195
- print("βœ… LLM CLIENT: Initialized using direct ChatOpenAI")
 
196
  else:
197
- print("❌ LLM CLIENT: No API key available")
198
  except Exception as e3:
199
  print(f"❌ LLM CLIENT: Direct instantiation also failed - {e3}")
200
  self.llm_client = None
201
 
202
  # Load system prompt
203
- from src.llm.templates import DEFAULT_AUDIT_SYSTEM_PROMPT
204
  self.system_prompt = DEFAULT_AUDIT_SYSTEM_PROMPT
205
 
206
  # Initialize report service
207
  try:
208
- try:
209
- from src.reporting.service import ReportService
210
- except ImportError:
211
- from src.reporting.service import ReportService
212
  self.report_service = ReportService()
213
  except Exception as e:
214
  print(f"Warning: Could not initialize report service: {e}")
@@ -216,7 +217,6 @@ class PipelineManager:
216
 
217
  except Exception as e:
218
  print(f"❌ Error initializing components: {e}")
219
- import traceback
220
  traceback.print_exc()
221
  # Don't set vectorstore_manager to None if it was already set
222
  if not hasattr(self, 'vectorstore_manager') or self.vectorstore_manager is None:
@@ -337,7 +337,6 @@ class PipelineManager:
337
  return False
338
  except Exception as init_error:
339
  print(f"❌ Error initializing vector store manager: {init_error}")
340
- import traceback
341
  traceback.print_exc()
342
  return False
343
 
@@ -352,7 +351,6 @@ class PipelineManager:
352
  except Exception as e:
353
  print(f"❌ Error connecting to vector store: {e}")
354
  log_error(e, {"component": "vectorstore_connection"})
355
- import traceback
356
  traceback.print_exc()
357
 
358
  # If it's a dimension mismatch error, try with force_recreate
@@ -541,9 +539,6 @@ Answer:"""
541
  if auto_infer_filters and not any([reports, sources, subtype]):
542
  print(f"🤖 AUTO-INFERRING FILTERS: No explicit filters provided, analyzing query...")
543
  try:
544
- # Import get_available_metadata here to avoid circular imports
545
- from src.retrieval.filter import get_available_metadata, infer_filters_from_query
546
-
547
  # Get available metadata
548
  available_metadata = get_available_metadata(self.vectorstore_manager.get_vectorstore())
549
 
 
1
  """Main pipeline orchestrator for the Audit QA system."""
2
+ import os
3
  import time
4
+ import traceback
5
  from pathlib import Path
6
  from dataclasses import dataclass
7
  from typing import Dict, Any, List, Optional
 
13
  from langchain.schema import Document
14
 
15
  from .logging import log_error
16
+
17
  from .loader import chunks_to_documents
18
  from .vectorstore import VectorStoreManager
19
+ from .reporting.service import ReportService
20
  from .retrieval.context import ContextRetriever
21
+ from .llm.adapters import LLMRegistry, get_llm_client
22
+ from .llm.templates import DEFAULT_AUDIT_SYSTEM_PROMPT
23
+ from .config.loader import load_config, get_embedding_model_for_collection
24
+ from .retrieval.filter import get_available_metadata, infer_filters_from_query
25
+
26
+ try:
27
+ from langchain_openai import ChatOpenAI
28
+ LANGCHAIN_OPENAI_AVAILABLE = True
29
+ except ImportError:
30
+ LANGCHAIN_OPENAI_AVAILABLE = False
31
 
32
 
33
 
 
53
  """
54
  Initialize the pipeline manager.
55
  """
56
+ self.chunks = None
57
+ self.llm_client = None
58
  self.config = config or {}
59
+ self.report_service = None
60
  self.vectorstore_manager = None
61
  self.context_retriever = None # Initialize as None
62
+
 
 
63
 
64
  # Initialize components
65
  self._initialize_components()
 
131
  try:
132
  # Load config if not provided
133
  if not self.config:
134
+ self.config = load_config()
 
 
 
 
 
 
135
 
136
  # Validate config structure
137
  if not isinstance(self.config, dict):
 
166
  print("✅ VectorStoreManager initialized successfully")
167
  except Exception as vs_error:
168
  print(f"❌ Error initializing VectorStoreManager: {vs_error}")
 
169
  traceback.print_exc()
170
  self.vectorstore_manager = None
171
  raise # Re-raise to be caught by outer try-except
 
181
  except Exception as e:
182
  try:
183
  # Try direct instantiation with config
 
184
  self.llm_client = get_llm_client("openai", self.config)
185
  print("✅ LLM CLIENT: Initialized using direct get_llm_client function with config")
186
  except Exception as e2:
187
  print(f"❌ LLM CLIENT: Registry methods failed - {e2}")
188
  # Try to create a simple LLM client directly
189
  try:
190
+ if LANGCHAIN_OPENAI_AVAILABLE:
191
+ api_key = os.getenv("OPENAI_API_KEY") or os.getenv("OPENROUTER_API_KEY")
192
+ if api_key:
193
+ self.llm_client = ChatOpenAI(
194
+ model="gpt-3.5-turbo",
195
+ api_key=api_key,
196
+ temperature=0.1,
197
+ max_tokens=1000
198
+ )
199
+ print("✅ LLM CLIENT: Initialized using direct ChatOpenAI")
200
+ else:
201
+ print("❌ LLM CLIENT: No API key available")
202
  else:
203
+ print("❌ LLM CLIENT: langchain-openai not available")
204
  except Exception as e3:
205
  print(f"❌ LLM CLIENT: Direct instantiation also failed - {e3}")
206
  self.llm_client = None
207
 
208
  # Load system prompt
 
209
  self.system_prompt = DEFAULT_AUDIT_SYSTEM_PROMPT
210
 
211
  # Initialize report service
212
  try:
 
 
 
 
213
  self.report_service = ReportService()
214
  except Exception as e:
215
  print(f"Warning: Could not initialize report service: {e}")
 
217
 
218
  except Exception as e:
219
  print(f"❌ Error initializing components: {e}")
 
220
  traceback.print_exc()
221
  # Don't set vectorstore_manager to None if it was already set
222
  if not hasattr(self, 'vectorstore_manager') or self.vectorstore_manager is None:
 
337
  return False
338
  except Exception as init_error:
339
  print(f"❌ Error initializing vector store manager: {init_error}")
 
340
  traceback.print_exc()
341
  return False
342
 
 
351
  except Exception as e:
352
  print(f"❌ Error connecting to vector store: {e}")
353
  log_error(e, {"component": "vectorstore_connection"})
 
354
  traceback.print_exc()
355
 
356
  # If it's a dimension mismatch error, try with force_recreate
 
539
  if auto_infer_filters and not any([reports, sources, subtype]):
540
  print(f"🤖 AUTO-INFERRING FILTERS: No explicit filters provided, analyzing query...")
541
  try:
 
 
 
542
  # Get available metadata
543
  available_metadata = get_available_metadata(self.vectorstore_manager.get_vectorstore())
544