NavyDevilDoc committed on
Commit
79adaa2
·
verified ·
1 Parent(s): c2fce89

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +122 -377
app.py CHANGED
@@ -1,426 +1,171 @@
1
  import streamlit as st
2
  import os
3
- import faiss
4
- import pickle
5
- import numpy as np
6
- import uuid
7
- from sentence_transformers import SentenceTransformer, CrossEncoder
8
- from huggingface_hub import HfApi, hf_hub_download, InferenceClient
9
- import ollama
10
- import requests
11
- import pypdf
12
- import docx
13
  import time
14
- from pdf2image import convert_from_bytes
15
- import pytesseract
16
- from PIL import Image
 
 
 
17
 
18
  # --- CONFIGURATION ---
19
  DATASET_REPO_ID = "NavyDevilDoc/navy-policy-index"
20
  HF_TOKEN = os.environ.get("HF_TOKEN")
 
21
  INDEX_FILE = "navy_index.faiss"
22
  META_FILE = "navy_metadata.pkl"
23
- DOC_STORE_FILE = "navy_docs.pkl" # NEW: Stores the full text
24
 
25
- st.set_page_config(page_title="Document Finder", layout="wide")
26
 
27
- # --- PERSISTENCE ---
28
- class IndexManager:
 
29
  @staticmethod
30
- def load_from_hub():
31
- if not HF_TOKEN: return False
32
  try:
33
- # Download Vector Index
34
- hf_hub_download(repo_id=DATASET_REPO_ID, filename=INDEX_FILE, local_dir=".", token=HF_TOKEN)
35
- # Download Chunk Metadata
36
- hf_hub_download(repo_id=DATASET_REPO_ID, filename=META_FILE, local_dir=".", token=HF_TOKEN)
37
- # Download Full Document Store
38
- hf_hub_download(repo_id=DATASET_REPO_ID, filename=DOC_STORE_FILE, local_dir=".", token=HF_TOKEN)
 
39
  return True
40
- except: return False
 
 
 
41
 
42
  @staticmethod
43
- def save_to_hub():
44
  if not HF_TOKEN: return
45
  api = HfApi(token=HF_TOKEN)
46
  try:
 
 
 
47
  api.upload_file(path_or_fileobj=INDEX_FILE, path_in_repo=INDEX_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
48
  api.upload_file(path_or_fileobj=META_FILE, path_in_repo=META_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
49
- api.upload_file(path_or_fileobj=DOC_STORE_FILE, path_in_repo=DOC_STORE_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
50
- st.toast("Database Synced!", icon="☁️")
51
- except Exception as e: st.error(f"Sync Error: {e}")
52
-
53
- # --- PARSING LOGIC (OCR ENABLED) ---
54
- def parse_file(uploaded_file):
55
- text = ""
56
- filename = uploaded_file.name
57
- method = "Fast"
58
-
59
- try:
60
- if filename.endswith(".pdf"):
61
- pdf_bytes = uploaded_file.getvalue()
62
- reader = pypdf.PdfReader(uploaded_file)
63
-
64
- for i, page in enumerate(reader.pages):
65
- extracted = page.extract_text()
66
- if extracted:
67
- text += f"\n[PAGE {i+1}] {extracted}"
68
-
69
- if len(text.strip()) < 50:
70
- method = "OCR (Slow)"
71
- images = convert_from_bytes(pdf_bytes)
72
- text = ""
73
- for i, img in enumerate(images):
74
- page_text = pytesseract.image_to_string(img)
75
- text += f"\n[PAGE {i+1}] {page_text}"
76
-
77
- elif filename.endswith(".docx"):
78
- doc = docx.Document(uploaded_file)
79
- text = "\n".join([para.text for para in doc.paragraphs])
80
- elif filename.endswith(".txt"):
81
- text = uploaded_file.read().decode("utf-8")
82
-
83
- except Exception as e:
84
- return "", filename, f"Error: {str(e)}"
85
-
86
- return text, filename, method
87
-
88
- # NEW: Added doc_id to link chunks back to parent
89
- def recursive_chunking(text, source, doc_id, chunk_size=500, overlap=100):
90
- words = text.split()
91
- chunks = []
92
- for i in range(0, len(words), chunk_size - overlap):
93
- chunk_text = " ".join(words[i:i + chunk_size])
94
- if len(chunk_text) > 50:
95
- chunks.append({
96
- "text": chunk_text,
97
- "source": source,
98
- "doc_id": doc_id # The Critical Link
99
- })
100
- return chunks
101
-
102
- import requests # Make sure this is imported at the top
103
-
104
- def ask_llm(query, context):
105
- """
106
- Connects to the NavyDevilDoc/private-granite Space for inference.
107
- """
108
- if not HF_TOKEN:
109
- return "Error: HF_TOKEN is missing. Cannot authenticate with Private Granite Space."
110
-
111
- # 1. The URL of your remote API Space
112
- # Hugging Face URLs are usually: https://{username}-{spacename}.hf.space
113
- api_url = "https://navydevildoc-private-granite.hf.space/generate"
114
-
115
- # 2. Prepare the payload matching your FastAPI 'PromptRequest' schema
116
- payload = {
117
- "text": f"USER QUESTION: {query}\n\nDOCUMENT CONTEXT:\n{context[:6000]}",
118
- "persona": "You are a Senior Navy Yeoman and Subject Matter Expert. Provide a concise answer strictly based on the provided context.",
119
- "model": "granite4:latest", # You can swap this for 'gemma3:latest' or 'llama3.2:latest' anytime!
120
- "max_tokens": 5000
121
- }
122
-
123
- # 3. Headers for Authentication (Crucial for Private Spaces)
124
- headers = {
125
- "Authorization": f"Bearer {HF_TOKEN}",
126
- "Content-Type": "application/json"
127
- }
128
-
129
- try:
130
- response = requests.post(api_url, json=payload, headers=headers, timeout=600)
131
-
132
- if response.status_code == 200:
133
- data = response.json()
134
- # Your API returns {"response": "...", "usage": ...}
135
- return data.get("response", "Error: Empty response from Granite.")
136
- else:
137
- return f"Error {response.status_code}: {response.text}"
138
-
139
- except Exception as e:
140
- return f"Connection Error: {str(e)}\nMake sure the 'private-granite' Space is running."
141
-
142
- # --- CORE SEARCH ENGINE ---
143
- class DocSearchEngine:
144
- def __init__(self):
145
- # We try-except the init to catch the meta tensor error gracefully
146
- try:
147
- self.bi_encoder = SentenceTransformer(
148
- 'all-MiniLM-L6-v2',
149
- device="cpu",
150
- model_kwargs={"low_cpu_mem_usage": False}
151
- )
152
- self.cross_encoder = CrossEncoder(
153
- 'cross-encoder/ms-marco-MiniLM-L-6-v2',
154
- device="cpu",
155
- automodel_args={"low_cpu_mem_usage": False}
156
- )
157
  except Exception as e:
158
- st.error(f"Model Load Error: {e}. Check requirements.txt and remove 'accelerate'.")
159
-
160
- self.index = None
161
- self.metadata = []
162
- self.doc_store = {} # NEW: The Parent Document Storage
163
-
164
- self.load_data()
165
-
166
- def load_data(self):
167
- if os.path.exists(INDEX_FILE) and os.path.exists(META_FILE):
168
- try:
169
- self.index = faiss.read_index(INDEX_FILE)
170
- with open(META_FILE, "rb") as f: self.metadata = pickle.load(f)
171
- # Load Doc Store
172
- if os.path.exists(DOC_STORE_FILE):
173
- with open(DOC_STORE_FILE, "rb") as f: self.doc_store = pickle.load(f)
174
- else:
175
- self.doc_store = {}
176
- except Exception as e:
177
- self.reset_index()
178
- else:
179
- self.reset_index()
180
 
181
- def reset_index(self):
182
- d = 384
183
- self.index = faiss.IndexIDMap(faiss.IndexFlatIP(d))
184
- self.metadata = []
185
- self.doc_store = {}
186
- self.save()
187
-
188
- def add_document(self, full_text, source, chunks):
189
- # 1. Add to Doc Store
190
- # We need the doc_id from the first chunk (all chunks share it)
191
- if not chunks: return 0
192
- doc_id = chunks[0]['doc_id']
193
- self.doc_store[doc_id] = full_text
194
-
195
- # 2. Vectorize Chunks
196
- texts = [c["text"] for c in chunks]
197
- embeddings = self.bi_encoder.encode(texts)
198
- faiss.normalize_L2(embeddings)
199
-
200
- start_id = len(self.metadata)
201
- ids = np.arange(start_id, start_id + len(chunks)).astype('int64')
202
-
203
- self.index.add_with_ids(embeddings, ids)
204
- self.metadata.extend(chunks)
205
- self.save()
206
- return len(texts)
207
-
208
- def delete_file(self, filename):
209
- if self.index is None or self.index.ntotal == 0: return 0
210
-
211
- # Remove chunks from metadata
212
- new_chunks = [c for c in self.metadata if c['source'] != filename]
213
-
214
- # Remove from Doc Store (find doc_ids associated with filename)
215
- # This is a bit expensive but safe
216
- ids_to_remove = [c['doc_id'] for c in self.metadata if c['source'] == filename]
217
- for did in set(ids_to_remove):
218
- if did in self.doc_store:
219
- del self.doc_store[did]
220
-
221
- removed_count = len(self.metadata) - len(new_chunks)
222
- if removed_count > 0:
223
- self.reset_index()
224
- # Re-add existing documents (we have to rebuild the index from scratch in FAISS when deleting)
225
- # A more optimized way is to just save the new metadata and rebuild index from texts
226
- # For this scale, rebuilding is fine.
227
- if new_chunks:
228
- # Re-vectorize is slow, so ideally we'd keep vectors.
229
- # For simplicity in this demo, we'll just re-save what we have.
230
- # NOTE: In a prod system, you wouldn't re-embed everything.
231
- # You'd use index.remove_ids (if supported) or rebuild from vectors.
232
- pass
233
-
234
- self.index = faiss.IndexIDMap(faiss.IndexFlatIP(384)) # Wipe vector index
235
- self.metadata = []
236
-
237
- # Re-add all remaining chunks
238
- if new_chunks:
239
- # We need to re-embed.
240
- texts = [c["text"] for c in new_chunks]
241
- embeddings = self.bi_encoder.encode(texts)
242
- faiss.normalize_L2(embeddings)
243
- ids = np.arange(0, len(new_chunks)).astype('int64')
244
- self.index.add_with_ids(embeddings, ids)
245
- self.metadata = new_chunks
246
-
247
- self.save()
248
-
249
- return removed_count
250
-
251
- def save(self):
252
- faiss.write_index(self.index, INDEX_FILE)
253
- with open(META_FILE, "wb") as f: pickle.dump(self.metadata, f)
254
- with open(DOC_STORE_FILE, "wb") as f: pickle.dump(self.doc_store, f)
255
-
256
- def search_documents(self, query, top_k=5):
257
- if not self.index or self.index.ntotal == 0: return []
258
- candidate_k = top_k * 10
259
- q_vec = self.bi_encoder.encode([query])
260
- faiss.normalize_L2(q_vec)
261
-
262
- scores, indices = self.index.search(q_vec, min(self.index.ntotal, candidate_k))
263
-
264
- raw_candidates = []
265
- for i, idx in enumerate(indices[0]):
266
- if idx != -1:
267
- meta = self.metadata[idx]
268
- raw_candidates.append({
269
- "text": meta["text"],
270
- "source": meta["source"],
271
- "doc_id": meta["doc_id"], # Retrieve ID
272
- "bi_score": scores[0][i]
273
- })
274
-
275
- # Deduplicate by Source (keep highest score per document)
276
- doc_map = {}
277
- for cand in raw_candidates:
278
- source = cand['source']
279
- score = cand['bi_score']
280
- if source not in doc_map:
281
- doc_map[source] = cand
282
- else:
283
- if score > doc_map[source]["bi_score"]:
284
- doc_map[source] = cand
285
-
286
- ranked_docs = sorted(doc_map.values(), key=lambda x: x['bi_score'], reverse=True)
287
- top_docs = ranked_docs[:top_k]
288
-
289
- final_results = []
290
- if top_docs:
291
- pairs = [[query, doc['text']] for doc in top_docs]
292
- cross_scores = self.cross_encoder.predict(pairs)
293
- for i, doc in enumerate(top_docs):
294
- final_results.append({
295
- "source": doc['source'],
296
- "score": cross_scores[i],
297
- "snippet": doc['text'],
298
- "doc_id": doc['doc_id'] # Pass ID to UI
299
- })
300
- final_results = sorted(final_results, key=lambda x: x["score"], reverse=True)
301
-
302
- return final_results
303
-
304
- # --- UI LOGIC ---
305
- if 'engine' not in st.session_state:
306
- IndexManager.load_from_hub()
307
- st.session_state.engine = DocSearchEngine()
308
 
 
309
  with st.sidebar:
310
- st.header("🗄️ Upload Documents")
311
- uploaded_files = st.file_uploader("Upload Files", accept_multiple_files=True)
312
- if uploaded_files and st.button("Index"):
 
 
 
313
  progress_bar = st.progress(0)
314
- status_text = st.empty()
315
-
316
- new_chunks_count = 0
317
- failed_files = []
318
-
319
- total = len(uploaded_files)
320
 
321
  for i, f in enumerate(uploaded_files):
322
- status_text.text(f"Processing {i+1}/{total}: {f.name}...")
323
- progress_bar.progress((i)/total)
324
 
325
- txt, fname, method = parse_file(f)
 
326
 
327
- if method.startswith("Error"):
328
- failed_files.append(f"{fname}: {method}")
329
  continue
 
 
 
330
 
331
- if not txt.strip():
332
- failed_files.append(f"{fname} (No text found)")
333
- continue
334
 
335
- # NEW: Generate ID and pass to chunker
336
- doc_id = str(uuid.uuid4())
337
- file_chunks = recursive_chunking(txt, fname, doc_id)
338
 
339
- # Add to engine (full text + chunks)
340
- st.session_state.engine.add_document(txt, fname, file_chunks)
341
- new_chunks_count += len(file_chunks)
342
 
343
- progress_bar.progress(1.0)
344
- IndexManager.save_to_hub()
345
-
346
- if new_chunks_count > 0:
347
- st.success(f"Indexed {new_chunks_count} chunks from {total} files!")
348
-
349
- if failed_files:
350
- with st.expander("⚠️ Issues Detected", expanded=True):
351
- for ff in failed_files: st.write(ff)
352
 
353
  st.divider()
354
- st.header("⚙️ Manage Index")
355
- if st.session_state.engine.index:
356
- st.write(f"**Total Chunks:** {st.session_state.engine.index.ntotal}")
357
- unique_files = list(set([m['source'] for m in st.session_state.engine.metadata]))
358
- st.write(f"**Documents:** {len(unique_files)}")
359
-
360
- file_to_delete = st.selectbox("Select file to remove:", [""] + unique_files)
361
- if file_to_delete and st.button("🗑️ Delete File"):
362
- st.session_state.engine.delete_file(file_to_delete)
363
- IndexManager.save_to_hub()
 
 
 
 
 
364
  st.rerun()
365
 
366
- if st.button("⚠️ Wipe Entire Index", type="primary"):
367
- st.session_state.engine.reset_index()
368
- IndexManager.save_to_hub()
369
- st.rerun()
370
 
371
- st.title("⚓ Document Finder (Full Context)")
372
- query = st.text_input("What are you looking for?")
373
 
374
  if query:
375
- results = st.session_state.engine.search_documents(query, top_k=5)
376
-
377
- if results:
378
- # --- LLM INTEGRATION START ---
 
 
 
379
  top_match = results[0]
380
 
381
- # RETRIEVAL STEP: Get the FULL TEXT from the Doc Store using the ID
382
- full_doc_text = st.session_state.engine.doc_store.get(top_match['doc_id'], "Error: Document text not found.")
383
 
 
384
  with st.container():
385
- st.markdown("### 🤖 AI Summary")
386
- st.caption(f"Analyzing full content of: {top_match['source']}")
387
 
388
- if st.button("✨ Summarize Top Document"):
389
- with st.spinner("Reading full document..."):
390
- ai_response = ask_llm(query, full_doc_text)
 
 
391
  st.markdown("---")
392
- st.success(ai_response)
393
  st.markdown("---")
394
- st.divider()
395
- # --- LLM INTEGRATION END ---
396
-
397
- st.subheader("Top Relevant Documents")
398
- if not results: st.info("No documents found.")
399
-
400
- for res in results:
401
- score = res['score']
402
- if score > 2:
403
- border_color = "#09ab3b"
404
- confidence = "High Match"
405
- elif score > 0:
406
- border_color = "#ffbd45"
407
- confidence = "Possible Match"
408
- else:
409
- border_color = "#ff4b4b"
410
- confidence = "Low Match"
411
-
412
- with st.container():
413
- st.markdown(f"""
414
- <div style="
415
- border: 1px solid #ddd;
416
- border-left: 5px solid {border_color};
417
- padding: 15px;
418
- border-radius: 5px;
419
- margin-bottom: 10px;
420
- ">
421
- <h3 style="margin:0; padding:0;">📄 {res['source']}</h3>
422
- <small style="color: gray;">Confidence: {confidence} ({score:.2f})</small>
423
- </div>
424
- """, unsafe_allow_html=True)
425
- with st.expander("View matching excerpt"):
426
- st.markdown(f"**...{res['snippet']}...**")
 
1
  import streamlit as st
2
  import os
3
+ from huggingface_hub import HfApi, hf_hub_download
 
 
 
 
 
 
 
 
 
4
  import time
5
+
6
+ # --- IMPORT OUR NEW MODULES ---
7
+ from src.database import DatabaseManager
8
+ from src.search import SearchEngine
9
+ from src.parsers import process_file, chunk_text
10
+ from src.llm_client import ask_granite
11
 
12
  # --- CONFIGURATION ---
13
  DATASET_REPO_ID = "NavyDevilDoc/navy-policy-index"
14
  HF_TOKEN = os.environ.get("HF_TOKEN")
15
+ DB_FILE = "navy_docs.db"
16
  INDEX_FILE = "navy_index.faiss"
17
  META_FILE = "navy_metadata.pkl"
 
18
 
19
+ st.set_page_config(page_title="Navy Policy Architect", layout="wide", page_icon="⚓")
20
 
21
+ # --- CLOUD SYNC MANAGER ---
22
+ class SyncManager:
23
+ """Handles downloading/uploading the Database & Index to Hugging Face"""
24
  @staticmethod
25
+ def pull_data():
26
+ if not HF_TOKEN: return
27
  try:
28
+ # Download SQLite DB
29
+ if not os.path.exists(DB_FILE):
30
+ hf_hub_download(repo_id=DATASET_REPO_ID, filename=DB_FILE, local_dir=".", token=HF_TOKEN)
31
+ # Download FAISS Index
32
+ if not os.path.exists(INDEX_FILE):
33
+ hf_hub_download(repo_id=DATASET_REPO_ID, filename=INDEX_FILE, local_dir=".", token=HF_TOKEN)
34
+ hf_hub_download(repo_id=DATASET_REPO_ID, filename=META_FILE, local_dir=".", token=HF_TOKEN)
35
  return True
36
+ except Exception as e:
37
+ # It's okay if files don't exist yet (first run)
38
+ print(f"Sync Note: {e}")
39
+ return False
40
 
41
  @staticmethod
42
+ def push_data():
43
  if not HF_TOKEN: return
44
  api = HfApi(token=HF_TOKEN)
45
  try:
46
+ # Upload SQLite DB
47
+ api.upload_file(path_or_fileobj=DB_FILE, path_in_repo=DB_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
48
+ # Upload FAISS Index
49
  api.upload_file(path_or_fileobj=INDEX_FILE, path_in_repo=INDEX_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
50
  api.upload_file(path_or_fileobj=META_FILE, path_in_repo=META_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
51
+ st.toast("Cloud Sync Complete!", icon="☁️")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  except Exception as e:
53
+ st.error(f"Sync Error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
+ # --- INITIALIZATION ---
56
+ if 'db' not in st.session_state:
57
+ with st.spinner("Connecting to Secure Cloud Storage..."):
58
+ SyncManager.pull_data()
59
+ st.session_state.db = DatabaseManager(DB_FILE)
60
+ st.session_state.search_engine = SearchEngine()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
+ # --- SIDEBAR: UPLOAD & MANAGE ---
63
  with st.sidebar:
64
+ st.header("🗄️ Knowledge Base")
65
+
66
+ # 1. Upload Section
67
+ uploaded_files = st.file_uploader("Upload Policy Documents", accept_multiple_files=True, type=['pdf', 'docx', 'txt', 'csv', 'xlsx'])
68
+
69
+ if uploaded_files and st.button("Ingest Documents"):
70
  progress_bar = st.progress(0)
71
+ status = st.empty()
 
 
 
 
 
72
 
73
  for i, f in enumerate(uploaded_files):
74
+ status.text(f"Processing: {f.name}...")
 
75
 
76
+ # A. Parse File (handled by src/parsers.py)
77
+ text, filename, method = process_file(f)
78
 
79
+ if "Error" in method:
80
+ st.error(f"Failed {filename}: {method}")
81
  continue
82
+
83
+ # B. Chunk & ID (handled by src/parsers.py)
84
+ chunks, doc_id = chunk_text(text, filename)
85
 
86
+ # C. Save to SQLite (handled by src/database.py)
87
+ # We explicitly store the full text for reliable RAG later
88
+ st.session_state.db.add_document(doc_id, filename, text)
89
 
90
+ # D. Add to Vector Index (handled by src/search.py)
91
+ # We only vector search the chunks, but they link back to doc_id
92
+ st.session_state.search_engine.add_features(chunks)
93
 
94
+ progress_bar.progress((i + 1) / len(uploaded_files))
 
 
95
 
96
+ status.text("Syncing to Cloud...")
97
+ SyncManager.push_data()
98
+ st.success(f"Successfully ingested {len(uploaded_files)} documents!")
99
+ time.sleep(2)
100
+ st.rerun()
 
 
 
 
101
 
102
  st.divider()
103
+
104
+ # 2. Management Section
105
+ st.subheader("Manage Files")
106
+ all_files = st.session_state.db.get_all_filenames()
107
+ if all_files:
108
+ st.caption(f"Total Documents: {len(all_files)}")
109
+ file_to_del = st.selectbox("Delete File:", [""] + all_files)
110
+ if file_to_del and st.button("🗑️ Remove Document"):
111
+ # Delete from SQL
112
+ deleted_id = st.session_state.db.delete_document(file_to_del)
113
+ # Note: FAISS deletion is hard, usually we just rebuild index.
114
+ # For now, we accept the "Ghost" vectors in FAISS until a full rebuild.
115
+ st.toast(f"Removed {file_to_del} from Database.")
116
+ SyncManager.push_data()
117
+ time.sleep(1)
118
  st.rerun()
119
 
120
+ # --- MAIN UI: SEARCH ---
121
+ st.title("⚓ Navy Policy Architect")
122
+ st.markdown("Search across PDF, Word, and Excel files. Generate AI summaries based on official policy.")
 
123
 
124
+ query = st.text_input("Enter your query (e.g., 'What are the requirements for O-5 promotion?')", placeholder="Search...")
 
125
 
126
  if query:
127
+ # 1. SEARCH (Vector Search -> Returns relevant chunks)
128
+ results = st.session_state.search_engine.search(query, top_k=5)
129
+
130
+ if not results:
131
+ st.info("No matching documents found.")
132
+ else:
133
+ # 2. SYNTHESIS (The "Parent Retrieval" Magic)
134
  top_match = results[0]
135
 
136
+ # We grab the FULL TEXT from SQLite using the doc_id found in the chunk
137
+ full_doc_text = st.session_state.db.get_doc_text(top_match['doc_id'])
138
 
139
+ # --- AI SUMMARY SECTION ---
140
  with st.container():
141
+ st.markdown("### 🤖 Executive Summary")
142
+ st.caption(f"Analyzing primary source: {top_match['source']}")
143
 
144
+ if st.button("✨ Generate Assessment"):
145
+ with st.spinner("Consulting Granite Model..."):
146
+ # Call our separated LLM client
147
+ response = ask_granite(query, full_doc_text)
148
+
149
  st.markdown("---")
150
+ st.markdown(response)
151
  st.markdown("---")
152
+
153
+ # Feature: Source Verification
154
+ with st.expander("🔍 View Source Data used for this summary"):
155
+ st.text(full_doc_text[:2000] + "...")
156
+
157
+ # --- SEARCH RESULTS SECTION ---
158
+ st.subheader("Reference Documents")
159
+ for res in results:
160
+ score = res['score']
161
+ # Dynamic color coding based on relevance
162
+ color = "#09ab3b" if score > 2 else "#ffbd45" if score > 0 else "#ff4b4b"
163
+
164
+ with st.container():
165
+ st.markdown(f"""
166
+ <div style="border-left: 5px solid {color}; padding: 10px; background-color: #f0f2f6; margin-bottom: 10px; border-radius: 5px;">
167
+ <h4 style="margin:0;">📄 {res['source']}</h4>
168
+ <p style="margin:0; font-style: italic; font-size: 0.9em;">"...{res['snippet']}..."</p>
169
+ <small>Relevance Score: {score:.2f}</small>
170
+ </div>
171
+ """, unsafe_allow_html=True)