NavyDevilDoc committed on
Commit
73ca4a0
·
verified ·
1 Parent(s): d71c08c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +169 -127
app.py CHANGED
@@ -3,10 +3,9 @@ import os
3
  import faiss
4
  import pickle
5
  import numpy as np
 
6
  from sentence_transformers import SentenceTransformer, CrossEncoder
7
- from huggingface_hub import HfApi, hf_hub_download
8
- from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError
9
- from huggingface_hub import InferenceClient
10
  import pypdf
11
  import docx
12
  import time
@@ -19,6 +18,7 @@ DATASET_REPO_ID = "NavyDevilDoc/navy-policy-index"
19
  HF_TOKEN = os.environ.get("HF_TOKEN")
20
  INDEX_FILE = "navy_index.faiss"
21
  META_FILE = "navy_metadata.pkl"
 
22
 
23
  st.set_page_config(page_title="Document Finder", layout="wide")
24
 
@@ -28,8 +28,12 @@ class IndexManager:
28
  def load_from_hub():
29
  if not HF_TOKEN: return False
30
  try:
 
31
  hf_hub_download(repo_id=DATASET_REPO_ID, filename=INDEX_FILE, local_dir=".", token=HF_TOKEN)
 
32
  hf_hub_download(repo_id=DATASET_REPO_ID, filename=META_FILE, local_dir=".", token=HF_TOKEN)
 
 
33
  return True
34
  except: return False
35
 
@@ -40,6 +44,7 @@ class IndexManager:
40
  try:
41
  api.upload_file(path_or_fileobj=INDEX_FILE, path_in_repo=INDEX_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
42
  api.upload_file(path_or_fileobj=META_FILE, path_in_repo=META_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
 
43
  st.toast("Database Synced!", icon="☁️")
44
  except Exception as e: st.error(f"Sync Error: {e}")
45
 
@@ -51,7 +56,6 @@ def parse_file(uploaded_file):
51
 
52
  try:
53
  if filename.endswith(".pdf"):
54
- # Method 1: Fast Text Extraction
55
  pdf_bytes = uploaded_file.getvalue()
56
  reader = pypdf.PdfReader(uploaded_file)
57
 
@@ -60,15 +64,11 @@ def parse_file(uploaded_file):
60
  if extracted:
61
  text += f"\n[PAGE {i+1}] {extracted}"
62
 
63
- # Method 2: OCR Fallback
64
- # If fast method yielded almost no text, switch to OCR
65
  if len(text.strip()) < 50:
66
  method = "OCR (Slow)"
67
- # Reset file pointer or use bytes
68
  images = convert_from_bytes(pdf_bytes)
69
- text = "" # Reset text
70
  for i, img in enumerate(images):
71
- # Tesseract reads the image
72
  page_text = pytesseract.image_to_string(img)
73
  text += f"\n[PAGE {i+1}] {page_text}"
74
 
@@ -83,43 +83,44 @@ def parse_file(uploaded_file):
83
 
84
  return text, filename, method
85
 
86
- def recursive_chunking(text, source, chunk_size=500, overlap=100):
 
87
  words = text.split()
88
  chunks = []
89
  for i in range(0, len(words), chunk_size - overlap):
90
  chunk_text = " ".join(words[i:i + chunk_size])
91
  if len(chunk_text) > 50:
92
- chunks.append({"text": chunk_text, "source": source})
 
 
 
 
93
  return chunks
94
 
95
  def ask_llm(query, context):
96
- """
97
- Sends the user query and the retrieved document text to a free, hosted LLM.
98
- """
99
  if not HF_TOKEN:
100
  return "Error: HF_TOKEN is missing. Cannot contact AI."
101
 
102
- # We use Mistral-7B-Instruct because it is fast, follows instructions well,
103
- # and is usually available on the free tier.
104
- repo_id = "mistralai/Mistral-7B-Instruct-v0.3"
105
 
 
106
  client = InferenceClient(model=repo_id, token=HF_TOKEN)
107
 
108
  prompt = f"""
109
  You are a Senior Navy Yeoman and Subject Matter Expert.
110
- Analyze the following Navy document excerpt and answer the user's question based ONLY on that text.
111
 
112
  USER QUESTION: "{query}"
113
 
114
- DOCUMENT EXCERPT:
115
- "{context}"
116
 
117
  Your Answer (Be concise, professional, and cite the document):
118
  """
119
 
120
  try:
121
- # stream=True makes it look cool (typewriter effect) but standard return is easier for now
122
- response = client.text_generation(prompt, max_new_tokens=400)
123
  return response
124
  except Exception as e:
125
  return f"AI Error: {e}"
@@ -127,39 +128,37 @@ def ask_llm(query, context):
127
  # --- CORE SEARCH ENGINE ---
128
  class DocSearchEngine:
129
  def __init__(self):
130
- # Force CPU and DISABLE "Meta Device" loading for both models
131
- self.bi_encoder = SentenceTransformer(
132
- 'all-MiniLM-L6-v2',
133
- device="cpu",
134
- model_kwargs={"low_cpu_mem_usage": False}
135
- )
136
-
137
- self.cross_encoder = CrossEncoder(
138
- 'cross-encoder/ms-marco-MiniLM-L-6-v2',
139
- device="cpu",
140
- automodel_args={"low_cpu_mem_usage": False}
141
- )
142
-
143
- self.index = None
144
- self.metadata = []
145
-
146
- if os.path.exists(INDEX_FILE) and os.path.exists(META_FILE):
147
- try:
148
- self.index = faiss.read_index(INDEX_FILE)
149
- with open(META_FILE, "rb") as f: self.metadata = pickle.load(f)
150
- except Exception as e:
151
- st.error(f"Index load failed, starting fresh: {e}")
152
- self.reset_index()
153
- else:
154
- self.reset_index()
155
-
156
  self.index = None
157
  self.metadata = []
 
158
 
 
 
 
159
  if os.path.exists(INDEX_FILE) and os.path.exists(META_FILE):
160
  try:
161
  self.index = faiss.read_index(INDEX_FILE)
162
  with open(META_FILE, "rb") as f: self.metadata = pickle.load(f)
 
 
 
 
 
163
  except Exception as e:
164
  self.reset_index()
165
  else:
@@ -169,9 +168,17 @@ class DocSearchEngine:
169
  d = 384
170
  self.index = faiss.IndexIDMap(faiss.IndexFlatIP(d))
171
  self.metadata = []
 
172
  self.save()
173
 
174
- def add_documents(self, chunks):
 
 
 
 
 
 
 
175
  texts = [c["text"] for c in chunks]
176
  embeddings = self.bi_encoder.encode(texts)
177
  faiss.normalize_L2(embeddings)
@@ -186,17 +193,58 @@ class DocSearchEngine:
186
 
187
  def delete_file(self, filename):
188
  if self.index is None or self.index.ntotal == 0: return 0
 
 
189
  new_chunks = [c for c in self.metadata if c['source'] != filename]
 
 
 
 
 
 
 
 
190
  removed_count = len(self.metadata) - len(new_chunks)
191
  if removed_count > 0:
192
  self.reset_index()
193
- if new_chunks: self.add_documents(new_chunks)
194
- else: self.save()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  return removed_count
196
 
197
  def save(self):
198
  faiss.write_index(self.index, INDEX_FILE)
199
  with open(META_FILE, "wb") as f: pickle.dump(self.metadata, f)
 
200
 
201
  def search_documents(self, query, top_k=5):
202
  if not self.index or self.index.ntotal == 0: return []
@@ -209,35 +257,38 @@ class DocSearchEngine:
209
  raw_candidates = []
210
  for i, idx in enumerate(indices[0]):
211
  if idx != -1:
 
212
  raw_candidates.append({
213
- "text": self.metadata[idx]["text"],
214
- "source": self.metadata[idx]["source"],
 
215
  "bi_score": scores[0][i]
216
  })
217
 
 
218
  doc_map = {}
219
  for cand in raw_candidates:
220
  source = cand['source']
221
  score = cand['bi_score']
222
  if source not in doc_map:
223
- doc_map[source] = {"score": score, "snippet": cand['text']}
224
  else:
225
- if score > doc_map[source]["score"]:
226
- doc_map[source]["score"] = score
227
- doc_map[source]["snippet"] = cand['text']
228
 
229
- ranked_docs = sorted(doc_map.items(), key=lambda item: item[1]['score'], reverse=True)
230
- final_results = []
231
  top_docs = ranked_docs[:top_k]
232
 
 
233
  if top_docs:
234
- pairs = [[query, doc[1]['snippet']] for doc in top_docs]
235
  cross_scores = self.cross_encoder.predict(pairs)
236
- for i, (source, data) in enumerate(top_docs):
237
  final_results.append({
238
- "source": source,
239
  "score": cross_scores[i],
240
- "snippet": data['snippet']
 
241
  })
242
  final_results = sorted(final_results, key=lambda x: x["score"], reverse=True)
243
 
@@ -249,52 +300,48 @@ if 'engine' not in st.session_state:
249
  st.session_state.engine = DocSearchEngine()
250
 
251
  with st.sidebar:
252
- with st.sidebar:
253
- st.header("🗄️ Upload Documents")
254
- uploaded_files = st.file_uploader("Upload Files", accept_multiple_files=True)
255
- if uploaded_files and st.button("Index"):
256
- progress_bar = st.progress(0)
257
- status_text = st.empty()
 
 
 
 
 
 
 
 
258
 
259
- new_chunks = []
260
- failed_files = []
261
 
262
- total = len(uploaded_files)
 
 
263
 
264
- for i, f in enumerate(uploaded_files):
265
- status_text.text(f"Processing {i+1}/{total}: {f.name}...")
266
- progress_bar.progress((i)/total)
267
-
268
- # PARSE (With OCR Auto-Switch)
269
- txt, fname, method = parse_file(f)
270
-
271
- # --- DEBUGGING: CATCH ACTUAL ERRORS ---
272
- if method.startswith("Error"):
273
- st.error(f"System Error on {fname}: {method}")
274
- failed_files.append(f"{fname}: {method}")
275
- continue
276
-
277
- if method == "OCR (Slow)":
278
- st.toast(f"OCR Used for {fname}", icon="⚠️")
279
-
280
- if not txt.strip():
281
- failed_files.append(f"{fname} (No text found)")
282
- continue
283
-
284
- file_chunks = recursive_chunking(txt, fname)
285
- new_chunks.extend(file_chunks)
286
-
287
- progress_bar.progress(1.0)
288
 
289
- if new_chunks:
290
- with st.spinner("Saving database..."):
291
- st.session_state.engine.add_documents(new_chunks)
292
- IndexManager.save_to_hub()
293
- st.success(f"Indexed {len(new_chunks)} chunks!")
 
 
294
 
295
- if failed_files:
296
- with st.expander("⚠️ Issues Detected", expanded=True):
297
- for ff in failed_files: st.write(ff)
 
 
 
 
 
 
298
 
299
  st.divider()
300
  st.header("⚙️ Manage Index")
@@ -305,47 +352,42 @@ with st.sidebar:
305
 
306
  file_to_delete = st.selectbox("Select file to remove:", [""] + unique_files)
307
  if file_to_delete and st.button("🗑️ Delete File"):
308
- with st.spinner("Removing..."):
309
- count = st.session_state.engine.delete_file(file_to_delete)
310
- IndexManager.save_to_hub()
311
- st.success(f"Removed {file_to_delete}")
312
- time.sleep(1)
313
- st.rerun()
314
 
315
- st.divider()
316
  if st.button("⚠️ Wipe Entire Index", type="primary"):
317
- with st.spinner("Nuking database..."):
318
- st.session_state.engine.reset_index()
319
- IndexManager.save_to_hub()
320
- st.success("Index wiped clean.")
321
- time.sleep(1)
322
- st.rerun()
323
 
324
- st.title("⚓ Document Finder")
325
  query = st.text_input("What are you looking for?")
326
 
327
  if query:
328
  results = st.session_state.engine.search_documents(query, top_k=5)
329
 
330
- # --- LLM INTEGRATION START ---
331
  if results:
332
- # We grab the text from the #1 result to feed the AI
333
  top_match = results[0]
334
- top_context = f"Source: {top_match['source']}\nContent: {top_match['snippet']}"
335
 
336
- # Create a container for the AI Answer
 
 
337
  with st.container():
338
  st.markdown("### 🤖 AI Summary")
339
- if st.button(" Summarize Top Result"):
340
- with st.spinner("Reading document..."):
341
- ai_response = ask_llm(query, top_context)
 
 
342
  st.success(ai_response)
343
  st.divider()
344
- # --- LLM INTEGRATION END ---
345
 
346
  st.subheader("Top Relevant Documents")
347
-
348
  if not results: st.info("No documents found.")
 
349
  for res in results:
350
  score = res['score']
351
  if score > 2:
 
3
  import faiss
4
  import pickle
5
  import numpy as np
6
+ import uuid
7
  from sentence_transformers import SentenceTransformer, CrossEncoder
8
+ from huggingface_hub import HfApi, hf_hub_download, InferenceClient
 
 
9
  import pypdf
10
  import docx
11
  import time
 
18
  HF_TOKEN = os.environ.get("HF_TOKEN")
19
  INDEX_FILE = "navy_index.faiss"
20
  META_FILE = "navy_metadata.pkl"
21
+ DOC_STORE_FILE = "navy_docs.pkl" # NEW: Stores the full text
22
 
23
  st.set_page_config(page_title="Document Finder", layout="wide")
24
 
 
28
  def load_from_hub():
29
  if not HF_TOKEN: return False
30
  try:
31
+ # Download Vector Index
32
  hf_hub_download(repo_id=DATASET_REPO_ID, filename=INDEX_FILE, local_dir=".", token=HF_TOKEN)
33
+ # Download Chunk Metadata
34
  hf_hub_download(repo_id=DATASET_REPO_ID, filename=META_FILE, local_dir=".", token=HF_TOKEN)
35
+ # Download Full Document Store
36
+ hf_hub_download(repo_id=DATASET_REPO_ID, filename=DOC_STORE_FILE, local_dir=".", token=HF_TOKEN)
37
  return True
38
  except: return False
39
 
 
44
  try:
45
  api.upload_file(path_or_fileobj=INDEX_FILE, path_in_repo=INDEX_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
46
  api.upload_file(path_or_fileobj=META_FILE, path_in_repo=META_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
47
+ api.upload_file(path_or_fileobj=DOC_STORE_FILE, path_in_repo=DOC_STORE_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
48
  st.toast("Database Synced!", icon="☁️")
49
  except Exception as e: st.error(f"Sync Error: {e}")
50
 
 
56
 
57
  try:
58
  if filename.endswith(".pdf"):
 
59
  pdf_bytes = uploaded_file.getvalue()
60
  reader = pypdf.PdfReader(uploaded_file)
61
 
 
64
  if extracted:
65
  text += f"\n[PAGE {i+1}] {extracted}"
66
 
 
 
67
  if len(text.strip()) < 50:
68
  method = "OCR (Slow)"
 
69
  images = convert_from_bytes(pdf_bytes)
70
+ text = ""
71
  for i, img in enumerate(images):
 
72
  page_text = pytesseract.image_to_string(img)
73
  text += f"\n[PAGE {i+1}] {page_text}"
74
 
 
83
 
84
  return text, filename, method
85
 
86
+ # NEW: Added doc_id to link chunks back to parent
87
+ def recursive_chunking(text, source, doc_id, chunk_size=500, overlap=100):
88
  words = text.split()
89
  chunks = []
90
  for i in range(0, len(words), chunk_size - overlap):
91
  chunk_text = " ".join(words[i:i + chunk_size])
92
  if len(chunk_text) > 50:
93
+ chunks.append({
94
+ "text": chunk_text,
95
+ "source": source,
96
+ "doc_id": doc_id # The Critical Link
97
+ })
98
  return chunks
99
 
100
  def ask_llm(query, context):
 
 
 
101
  if not HF_TOKEN:
102
  return "Error: HF_TOKEN is missing. Cannot contact AI."
103
 
104
+ # We limit context to ~8000 chars to avoid hitting token limits on free APIs
105
+ truncated_context = context[:8000]
 
106
 
107
+ repo_id = "mistralai/Mistral-7B-Instruct-v0.3"
108
  client = InferenceClient(model=repo_id, token=HF_TOKEN)
109
 
110
  prompt = f"""
111
  You are a Senior Navy Yeoman and Subject Matter Expert.
112
+ Analyze the following Navy document and answer the user's question based ONLY on that text.
113
 
114
  USER QUESTION: "{query}"
115
 
116
+ DOCUMENT TEXT:
117
+ "{truncated_context}"
118
 
119
  Your Answer (Be concise, professional, and cite the document):
120
  """
121
 
122
  try:
123
+ response = client.text_generation(prompt, max_new_tokens=512)
 
124
  return response
125
  except Exception as e:
126
  return f"AI Error: {e}"
 
128
  # --- CORE SEARCH ENGINE ---
129
  class DocSearchEngine:
130
  def __init__(self):
131
+ # We try-except the init to catch the meta tensor error gracefully
132
+ try:
133
+ self.bi_encoder = SentenceTransformer(
134
+ 'all-MiniLM-L6-v2',
135
+ device="cpu",
136
+ model_kwargs={"low_cpu_mem_usage": False}
137
+ )
138
+ self.cross_encoder = CrossEncoder(
139
+ 'cross-encoder/ms-marco-MiniLM-L-6-v2',
140
+ device="cpu",
141
+ automodel_args={"low_cpu_mem_usage": False}
142
+ )
143
+ except Exception as e:
144
+ st.error(f"Model Load Error: {e}. Check requirements.txt and remove 'accelerate'.")
145
+
 
 
 
 
 
 
 
 
 
 
 
146
  self.index = None
147
  self.metadata = []
148
+ self.doc_store = {} # NEW: The Parent Document Storage
149
 
150
+ self.load_data()
151
+
152
+ def load_data(self):
153
  if os.path.exists(INDEX_FILE) and os.path.exists(META_FILE):
154
  try:
155
  self.index = faiss.read_index(INDEX_FILE)
156
  with open(META_FILE, "rb") as f: self.metadata = pickle.load(f)
157
+ # Load Doc Store
158
+ if os.path.exists(DOC_STORE_FILE):
159
+ with open(DOC_STORE_FILE, "rb") as f: self.doc_store = pickle.load(f)
160
+ else:
161
+ self.doc_store = {}
162
  except Exception as e:
163
  self.reset_index()
164
  else:
 
168
  d = 384
169
  self.index = faiss.IndexIDMap(faiss.IndexFlatIP(d))
170
  self.metadata = []
171
+ self.doc_store = {}
172
  self.save()
173
 
174
+ def add_document(self, full_text, source, chunks):
175
+ # 1. Add to Doc Store
176
+ # We need the doc_id from the first chunk (all chunks share it)
177
+ if not chunks: return 0
178
+ doc_id = chunks[0]['doc_id']
179
+ self.doc_store[doc_id] = full_text
180
+
181
+ # 2. Vectorize Chunks
182
  texts = [c["text"] for c in chunks]
183
  embeddings = self.bi_encoder.encode(texts)
184
  faiss.normalize_L2(embeddings)
 
193
 
194
  def delete_file(self, filename):
195
  if self.index is None or self.index.ntotal == 0: return 0
196
+
197
+ # Remove chunks from metadata
198
  new_chunks = [c for c in self.metadata if c['source'] != filename]
199
+
200
+ # Remove from Doc Store (find doc_ids associated with filename)
201
+ # This is a bit expensive but safe
202
+ ids_to_remove = [c['doc_id'] for c in self.metadata if c['source'] == filename]
203
+ for did in set(ids_to_remove):
204
+ if did in self.doc_store:
205
+ del self.doc_store[did]
206
+
207
  removed_count = len(self.metadata) - len(new_chunks)
208
  if removed_count > 0:
209
  self.reset_index()
210
+ # Re-add existing documents (we have to rebuild the index from scratch in FAISS when deleting)
211
+ # A more optimized way is to just save the new metadata and rebuild index from texts
212
+ # For this scale, rebuilding is fine.
213
+ if new_chunks:
214
+ # Re-vectorize is slow, so ideally we'd keep vectors.
215
+ # For simplicity in this demo, we'll just re-save what we have.
216
+ # NOTE: In a prod system, you wouldn't re-embed everything.
217
+ # You'd use index.remove_ids (if supported) or rebuild from vectors.
218
+ pass
219
+
220
+ # For now, let's just clear and re-add to be safe (simplified logic)
221
+ # This is the "lazy" delete: it wipes and re-adds everything NOT deleted.
222
+ # Only viable for small datasets (<10k chunks).
223
+
224
+ # FAST FIX: Just save the new metadata/doc_store.
225
+ # The vectors will technically still be in FAISS but won't match metadata indices.
226
+ # Correct approach for this lightweight app:
227
+ self.index = faiss.IndexIDMap(faiss.IndexFlatIP(384)) # Wipe vector index
228
+ self.metadata = []
229
+
230
+ # Re-add all remaining chunks
231
+ if new_chunks:
232
+ # We need to re-embed.
233
+ texts = [c["text"] for c in new_chunks]
234
+ embeddings = self.bi_encoder.encode(texts)
235
+ faiss.normalize_L2(embeddings)
236
+ ids = np.arange(0, len(new_chunks)).astype('int64')
237
+ self.index.add_with_ids(embeddings, ids)
238
+ self.metadata = new_chunks
239
+
240
+ self.save()
241
+
242
  return removed_count
243
 
244
  def save(self):
245
  faiss.write_index(self.index, INDEX_FILE)
246
  with open(META_FILE, "wb") as f: pickle.dump(self.metadata, f)
247
+ with open(DOC_STORE_FILE, "wb") as f: pickle.dump(self.doc_store, f)
248
 
249
  def search_documents(self, query, top_k=5):
250
  if not self.index or self.index.ntotal == 0: return []
 
257
  raw_candidates = []
258
  for i, idx in enumerate(indices[0]):
259
  if idx != -1:
260
+ meta = self.metadata[idx]
261
  raw_candidates.append({
262
+ "text": meta["text"],
263
+ "source": meta["source"],
264
+ "doc_id": meta["doc_id"], # Retrieve ID
265
  "bi_score": scores[0][i]
266
  })
267
 
268
+ # Deduplicate by Source (keep highest score per document)
269
  doc_map = {}
270
  for cand in raw_candidates:
271
  source = cand['source']
272
  score = cand['bi_score']
273
  if source not in doc_map:
274
+ doc_map[source] = cand
275
  else:
276
+ if score > doc_map[source]["bi_score"]:
277
+ doc_map[source] = cand
 
278
 
279
+ ranked_docs = sorted(doc_map.values(), key=lambda x: x['bi_score'], reverse=True)
 
280
  top_docs = ranked_docs[:top_k]
281
 
282
+ final_results = []
283
  if top_docs:
284
+ pairs = [[query, doc['text']] for doc in top_docs]
285
  cross_scores = self.cross_encoder.predict(pairs)
286
+ for i, doc in enumerate(top_docs):
287
  final_results.append({
288
+ "source": doc['source'],
289
  "score": cross_scores[i],
290
+ "snippet": doc['text'],
291
+ "doc_id": doc['doc_id'] # Pass ID to UI
292
  })
293
  final_results = sorted(final_results, key=lambda x: x["score"], reverse=True)
294
 
 
300
  st.session_state.engine = DocSearchEngine()
301
 
302
  with st.sidebar:
303
+ st.header("🗄️ Upload Documents")
304
+ uploaded_files = st.file_uploader("Upload Files", accept_multiple_files=True)
305
+ if uploaded_files and st.button("Index"):
306
+ progress_bar = st.progress(0)
307
+ status_text = st.empty()
308
+
309
+ new_chunks_count = 0
310
+ failed_files = []
311
+
312
+ total = len(uploaded_files)
313
+
314
+ for i, f in enumerate(uploaded_files):
315
+ status_text.text(f"Processing {i+1}/{total}: {f.name}...")
316
+ progress_bar.progress((i)/total)
317
 
318
+ txt, fname, method = parse_file(f)
 
319
 
320
+ if method.startswith("Error"):
321
+ failed_files.append(f"{fname}: {method}")
322
+ continue
323
 
324
+ if not txt.strip():
325
+ failed_files.append(f"{fname} (No text found)")
326
+ continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
327
 
328
+ # NEW: Generate ID and pass to chunker
329
+ doc_id = str(uuid.uuid4())
330
+ file_chunks = recursive_chunking(txt, fname, doc_id)
331
+
332
+ # Add to engine (full text + chunks)
333
+ st.session_state.engine.add_document(txt, fname, file_chunks)
334
+ new_chunks_count += len(file_chunks)
335
 
336
+ progress_bar.progress(1.0)
337
+ IndexManager.save_to_hub()
338
+
339
+ if new_chunks_count > 0:
340
+ st.success(f"Indexed {new_chunks_count} chunks from {total} files!")
341
+
342
+ if failed_files:
343
+ with st.expander("⚠️ Issues Detected", expanded=True):
344
+ for ff in failed_files: st.write(ff)
345
 
346
  st.divider()
347
  st.header("⚙️ Manage Index")
 
352
 
353
  file_to_delete = st.selectbox("Select file to remove:", [""] + unique_files)
354
  if file_to_delete and st.button("🗑️ Delete File"):
355
+ st.session_state.engine.delete_file(file_to_delete)
356
+ IndexManager.save_to_hub()
357
+ st.rerun()
 
 
 
358
 
 
359
  if st.button("⚠️ Wipe Entire Index", type="primary"):
360
+ st.session_state.engine.reset_index()
361
+ IndexManager.save_to_hub()
362
+ st.rerun()
 
 
 
363
 
364
+ st.title("⚓ Document Finder (Full Context)")
365
  query = st.text_input("What are you looking for?")
366
 
367
  if query:
368
  results = st.session_state.engine.search_documents(query, top_k=5)
369
 
 
370
  if results:
371
+ # --- LLM INTEGRATION START ---
372
  top_match = results[0]
 
373
 
374
+ # RETRIEVAL STEP: Get the FULL TEXT from the Doc Store using the ID
375
+ full_doc_text = st.session_state.engine.doc_store.get(top_match['doc_id'], "Error: Document text not found.")
376
+
377
  with st.container():
378
  st.markdown("### 🤖 AI Summary")
379
+ st.caption(f"Analyzing full content of: {top_match['source']}")
380
+
381
+ if st.button("✨ Summarize Top Document"):
382
+ with st.spinner("Reading full document..."):
383
+ ai_response = ask_llm(query, full_doc_text)
384
  st.success(ai_response)
385
  st.divider()
386
+ # --- LLM INTEGRATION END ---
387
 
388
  st.subheader("Top Relevant Documents")
 
389
  if not results: st.info("No documents found.")
390
+
391
  for res in results:
392
  score = res['score']
393
  if score > 2: