NavyDevilDoc committed on
Commit
0b474cc
·
verified ·
1 Parent(s): 8868ebf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -246
app.py CHANGED
@@ -1,70 +1,88 @@
1
  import streamlit as st
2
- import pandas as pd
 
 
3
  import numpy as np
4
- import chromadb
5
- from chromadb.config import Settings
6
  from sentence_transformers import SentenceTransformer, CrossEncoder
7
- from rank_bm25 import BM25Okapi
8
- from huggingface_hub import HfApi, snapshot_download
9
- from huggingface_hub.utils import RepositoryNotFoundError
10
  import pypdf
11
  import docx
12
- import os
13
- import shutil
14
- import pickle
15
  import time
16
 
17
  # --- CONFIGURATION ---
18
- # REPLACE THIS WITH YOUR NEW DATASET NAME!
19
- DATASET_REPO_ID = "NavyDevilDoc/navy-policy-index"
20
- LOCAL_DB_PATH = "./data_store"
21
  HF_TOKEN = os.environ.get("HF_TOKEN")
22
 
23
- st.set_page_config(page_title="Navy Search & Intel", layout="wide")
 
 
 
 
24
 
25
  # --- PERSISTENCE MANAGER ---
26
- class DataManager:
27
- """Handles syncing the ChromaDB and BM25 index with the Hugging Face Hub"""
28
 
29
  @staticmethod
30
- def sync_from_hub():
31
- """Downloads the latest DB from the HF Dataset"""
32
  if not HF_TOKEN:
33
- st.warning("HF_TOKEN not found in Secrets. Persistence will not work.")
34
  return False
35
 
36
  try:
37
- st.toast("Syncing database from Cloud...", icon="☁️")
38
- snapshot_download(
39
- repo_id=DATASET_REPO_ID,
40
- repo_type="dataset",
41
- local_dir=LOCAL_DB_PATH,
42
- token=HF_TOKEN
43
- )
 
 
 
 
 
 
 
 
 
 
44
  return True
45
- except (RepositoryNotFoundError, Exception) as e:
46
- # If dataset is empty or doesn't exist yet, that's fine for a fresh start
47
- print(f"Cloud sync note: {e}")
 
 
48
  return False
49
 
50
  @staticmethod
51
- def sync_to_hub():
52
- """Uploads the local DB to the HF Dataset"""
53
  if not HF_TOKEN:
54
  return
55
 
56
  api = HfApi(token=HF_TOKEN)
57
  try:
58
- st.toast("Uploading new index to Cloud...", icon="🚀")
59
- api.upload_folder(
60
- folder_path=LOCAL_DB_PATH,
 
 
 
 
 
 
 
 
61
  repo_id=DATASET_REPO_ID,
62
  repo_type="dataset",
63
- commit_message="Auto-save: Update Index"
64
  )
65
- st.success("Database saved to Cloud!")
66
  except Exception as e:
67
- st.error(f"Failed to sync to cloud: {e}")
68
 
69
  # --- HELPER FUNCTIONS ---
70
  def parse_file(uploaded_file):
@@ -76,7 +94,6 @@ def parse_file(uploaded_file):
76
  for i, page in enumerate(reader.pages):
77
  page_text = page.extract_text()
78
  if page_text:
79
- # We inject Page markers into the text for the LLM to see later
80
  text += f"\n[PAGE {i+1}] {page_text}"
81
  elif filename.endswith(".docx"):
82
  doc = docx.Document(uploaded_file)
@@ -88,199 +105,116 @@ def parse_file(uploaded_file):
88
  return text, filename
89
 
90
  def recursive_chunking(text, source, chunk_size=500, overlap=100):
91
- """
92
- Splits text into chunks, trying to respect page boundaries if possible.
93
- """
94
  words = text.split()
95
  chunks = []
96
-
97
  for i in range(0, len(words), chunk_size - overlap):
98
  chunk_words = words[i:i + chunk_size]
99
  chunk_text = " ".join(chunk_words)
100
 
101
- # Metadata extraction (simple heuristic for page numbers we injected)
102
  page_num = "Unknown"
103
  if "[PAGE" in chunk_text:
104
  try:
105
- # Find the last page marker in this chunk
106
  start = chunk_text.rfind("[PAGE") + 6
107
  end = chunk_text.find("]", start)
108
  page_num = chunk_text[start:end]
109
- except:
110
- pass
111
 
112
  if len(chunk_text) > 50:
113
  chunks.append({
114
  "text": chunk_text,
115
- "metadata": {"source": source, "page": page_num}
 
116
  })
117
  return chunks
118
 
119
- # --- CORE SEARCH ENGINE ---
120
- class PersistentSearchEngine:
121
- def __init__(self, collection_name="navy_docs"):
122
- # 1. Initialize ChromaDB (Persistent)
123
- self.client = chromadb.PersistentClient(path=os.path.join(LOCAL_DB_PATH, "chroma"))
124
- self.collection = self.client.get_or_create_collection(name=collection_name)
125
-
126
- # 2. Load Models
127
- # We force the device to CPU to avoid "meta tensor" errors in Docker
128
- device = "cpu"
129
- self.bi_encoder = SentenceTransformer('all-MiniLM-L6-v2', device=device)
130
-
131
- # We disable "low_cpu_mem_usage" to prevent the model from loading as a ghost (meta device)
132
- self.cross_encoder = CrossEncoder(
133
- 'cross-encoder/ms-marco-MiniLM-L-6-v2',
134
- device=device,
135
- automodel_args={"low_cpu_mem_usage": False}
136
- )
137
-
138
- # 3. Initialize/Load BM25 (Sparse)
139
- self.bm25 = None
140
- self.doc_store = [] # We need a shadow copy for BM25
141
- self.load_bm25()
142
-
143
- def load_bm25(self):
144
- """Loads BM25 index from disk if it exists"""
145
- bm25_path = os.path.join(LOCAL_DB_PATH, "bm25.pkl")
146
- if os.path.exists(bm25_path):
147
- with open(bm25_path, "rb") as f:
148
- data = pickle.load(f)
149
- self.bm25 = data['model']
150
- self.doc_store = data['docs']
151
-
152
- def save_bm25(self):
153
- """Saves BM25 index to disk"""
154
- bm25_path = os.path.join(LOCAL_DB_PATH, "bm25.pkl")
155
- with open(bm25_path, "wb") as f:
156
- pickle.dump({'model': self.bm25, 'docs': self.doc_store}, f)
157
-
158
- def add_documents(self, parsed_chunks):
159
- # 1. Add to Chroma (Dense)
160
- ids = [f"{c['metadata']['source']}_{i}_{time.time()}" for i, c in enumerate(parsed_chunks)]
161
- texts = [c['text'] for c in parsed_chunks]
162
- metadatas = [c['metadata'] for c in parsed_chunks]
163
-
164
- embeddings = self.bi_encoder.encode(texts).tolist()
165
-
166
- self.collection.add(
167
- documents=texts,
168
- embeddings=embeddings,
169
- metadatas=metadatas,
170
- ids=ids
171
- )
172
-
173
- # 2. Update BM25 (Sparse)
174
- # Note: BM25 is not incremental by default, we rebuild it.
175
- # For huge datasets, we would implement incremental updates, but for <10k docs, rebuilding is fast.
176
- current_docs = self.doc_store + texts
177
- tokenized_corpus = [doc.lower().split() for doc in current_docs]
178
- self.bm25 = BM25Okapi(tokenized_corpus)
179
- self.doc_store = current_docs
180
-
181
- # 3. Save Aux Data
182
- self.save_bm25()
183
-
184
  return len(texts)
185
 
186
- def search(self, query, top_k=5, alpha=0.5):
187
- # --- DENSE SEARCH (Chroma) ---
188
- # Get more candidates for re-ranking
189
- candidate_k = top_k * 3
190
-
191
- query_embedding = self.bi_encoder.encode([query]).tolist()
192
-
193
- chroma_results = self.collection.query(
194
- query_embeddings=query_embedding,
195
- n_results=candidate_k
196
- )
197
-
198
- # If DB is empty
199
- if not chroma_results['documents']:
200
  return []
201
-
202
- # Process Chroma Results
203
- # Chroma structure: {'ids': [[]], 'documents': [[]], 'metadatas': [[]], 'distances': [[]]}
204
- dense_hits = {}
205
- retrieved_docs_map = {} # ID -> Text/Meta mapping
206
-
207
- for i, doc_id in enumerate(chroma_results['ids'][0]):
208
- score = 1 - chroma_results['distances'][0][i] # Convert distance to similarity
209
- dense_hits[doc_id] = score
210
- retrieved_docs_map[doc_id] = {
211
- 'text': chroma_results['documents'][0][i],
212
- 'metadata': chroma_results['metadatas'][0][i]
213
- }
214
-
215
- # --- SPARSE SEARCH (BM25) ---
216
- # Note: Mapping BM25 indices back to Chroma IDs is complex if lists aren't perfectly synced.
217
- # For this Hybrid implementation, we will rely heavily on Chroma for the *candidates* # and use BM25 to score the *Query vs The Candidates* specifically.
218
-
219
- hybrid_candidates = []
220
-
221
- q_tokens = query.lower().split()
222
-
223
- for doc_id, dense_score in dense_hits.items():
224
- doc_text = retrieved_docs_map[doc_id]['text']
225
-
226
- # Score this specific candidate with BM25 logic (on the fly)
227
- # This is "Re-scoring" rather than "Retrieving" with BM25, which is safer for sync
228
- doc_tokens = doc_text.lower().split()
229
- # Simple term frequency for the candidate
230
- bm25_score = 0
231
- for token in q_tokens:
232
- bm25_score += doc_tokens.count(token)
233
-
234
- # Normalize BM25 score roughly (0-10 range usually, squeeze to 0-1)
235
- bm25_score = min(bm25_score / 5.0, 1.0)
236
 
237
- final_hybrid_score = (alpha * dense_score) + ((1-alpha) * bm25_score)
238
-
239
- hybrid_candidates.append({
240
- "id": doc_id,
241
- "text": doc_text,
242
- "metadata": retrieved_docs_map[doc_id]['metadata'],
243
- "hybrid_score": final_hybrid_score
244
- })
245
-
246
- # Sort by Hybrid Score
247
- hybrid_candidates.sort(key=lambda x: x['hybrid_score'], reverse=True)
248
-
249
- # --- RE-RANKING (Cross-Encoder) ---
250
- top_candidates = hybrid_candidates[:candidate_k]
251
-
252
- pairs = [[query, c['text']] for c in top_candidates]
 
 
 
253
  cross_scores = self.cross_encoder.predict(pairs)
254
 
255
- final_results = []
256
- for i, cand in enumerate(top_candidates):
257
- final_results.append({
258
- "chunk": cand['text'],
259
- "metadata": cand['metadata'],
260
- "score": cross_scores[i]
261
- })
262
 
263
- final_results.sort(key=lambda x: x['score'], reverse=True)
 
264
  return final_results[:top_k]
265
 
266
  # --- UI LOGIC ---
267
-
268
- # 1. Sync on Startup
269
- if 'synced' not in st.session_state:
270
- DataManager.sync_from_hub()
271
- st.session_state.synced = True
272
-
273
- # 2. Init Engine
274
  if 'engine' not in st.session_state:
275
- with st.spinner("Initializing Vector Database..."):
276
- st.session_state.engine = PersistentSearchEngine()
 
 
277
 
278
  with st.sidebar:
279
  st.header("🗄️ Knowledge Base")
280
  uploaded_files = st.file_uploader("Ingest Documents", accept_multiple_files=True)
281
 
282
- if uploaded_files and st.button("Add to Database"):
283
- with st.spinner("Parsing & Indexing..."):
284
  new_chunks = []
285
  for f in uploaded_files:
286
  txt, fname = parse_file(f)
@@ -289,64 +223,28 @@ with st.sidebar:
289
 
290
  if new_chunks:
291
  count = st.session_state.engine.add_documents(new_chunks)
292
- DataManager.sync_to_hub() # Auto-save to cloud
293
- st.success(f"Added {count} chunks and synced to Cloud!")
294
-
295
- st.divider()
296
- st.info(f"Connected to: {DATASET_REPO_ID}")
297
 
298
- # --- MAIN SEARCH UI ---
299
- st.title(" Navy Intelligent Search (RAG)")
300
-
301
- query = st.text_input("Enter Query (e.g. 'Leave policy for O-3 and below'):")
302
- col1, col2 = st.columns([1, 1])
303
- with col1:
304
- top_k = st.number_input("Documents", 1, 10, 3)
305
- with col2:
306
- alpha = st.slider("Hybrid Weight", 0.0, 1.0, 0.6, help="Higher = More Semantic")
307
 
308
  if query:
309
- results = st.session_state.engine.search(query, top_k=top_k, alpha=alpha)
310
 
311
- # Store results for RAG
312
  context_text = ""
313
-
314
- st.markdown("### 🔍 Search Results")
315
  for res in results:
316
- meta = res['metadata']
317
- score = res['score']
318
- text = res['chunk']
319
- context_text += f"Source: {meta['source']} (Page {meta['page']})\nContent: {text}\n\n"
320
-
321
- with st.expander(f"{meta['source']} | Pg {meta['page']} (Score: {score:.2f})", expanded=True):
322
- st.markdown(text)
323
-
324
- # --- RAG: SUMMARIZATION ---
325
- st.divider()
326
- st.markdown("### 🤖 AI Intelligence")
327
- if st.button("Generate Summary / Answer"):
328
  from huggingface_hub import InferenceClient
329
-
330
- # Use a free, powerful model via HF Inference API
331
- repo_id = "mistralai/Mistral-7B-Instruct-v0.3"
332
- llm_client = InferenceClient(model=repo_id, token=HF_TOKEN)
333
-
334
- prompt = f"""
335
- You are a Navy Administrative Aide. Answer the user's question based ONLY on the context provided below.
336
- If the answer is not in the context, say "I cannot find the answer in the provided documents."
337
-
338
- CONTEXT:
339
- {context_text}
340
-
341
- USER QUESTION:
342
- {query}
343
-
344
- ANSWER:
345
- """
346
-
347
- with st.spinner("Consulting LLM..."):
348
  try:
349
- response = llm_client.text_generation(prompt, max_new_tokens=500)
350
- st.success(response)
351
  except Exception as e:
352
  st.error(f"LLM Error: {e}")
 
1
  import streamlit as st
2
+ import os
3
+ import faiss
4
+ import pickle
5
  import numpy as np
 
 
6
  from sentence_transformers import SentenceTransformer, CrossEncoder
7
+ from huggingface_hub import HfApi, hf_hub_download
8
+ from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError
 
9
  import pypdf
10
  import docx
 
 
 
11
  import time
12
 
13
# --- CONFIGURATION ---
DATASET_REPO_ID = "NavyDevilDoc/navy-policy-index"  # HF dataset that persists the index between restarts
HF_TOKEN = os.environ.get("HF_TOKEN")  # needed for any Hub sync; absent => local-only mode

# File paths for local storage (synced with the dataset above)
INDEX_FILE = "navy_index.faiss"
META_FILE = "navy_metadata.pkl"

st.set_page_config(page_title="Navy Search (FAISS)", layout="wide")
22
 
23
# --- PERSISTENCE MANAGER ---
class IndexManager:
    """Manages loading/saving the FAISS index and Metadata from Hugging Face."""

    # Files that make up the knowledge base, with their commit messages
    # (kept identical to the original per-file uploads).
    _FILES = {INDEX_FILE: "Update FAISS Index", META_FILE: "Update Metadata"}

    @staticmethod
    def load_from_hub():
        """Download the index files from the HF Dataset into the working dir.

        Returns:
            True when both files were fetched; False when the token is
            missing, the repo/files don't exist yet, or the sync failed.
        """
        if not HF_TOKEN:
            st.warning("HF_TOKEN missing. Running in local-only mode.")
            return False

        try:
            with st.spinner("Downloading Knowledge Base..."):
                # Fetch the vector index and its metadata side-car into ".",
                # where RobustSearchEngine.__init__ looks for them.
                for filename in (INDEX_FILE, META_FILE):
                    hf_hub_download(
                        repo_id=DATASET_REPO_ID,
                        filename=filename,
                        repo_type="dataset",
                        local_dir=".",
                        token=HF_TOKEN
                    )
            return True
        except (EntryNotFoundError, RepositoryNotFoundError):
            # A fresh deployment has no index yet -- not an error.
            st.toast("No existing index found in Cloud. Starting fresh.", icon="🆕")
            return False
        except Exception as e:
            st.error(f"Sync Error: {e}")
            return False

    @staticmethod
    def save_to_hub():
        """Upload the local index files to the HF Dataset (no-op without a token)."""
        if not HF_TOKEN:
            return

        api = HfApi(token=HF_TOKEN)
        try:
            st.toast("Syncing to Cloud...", icon="☁️")
            for filename, message in IndexManager._FILES.items():
                # Guard: skip files that don't exist yet so a premature save
                # doesn't abort with an upload error.
                if not os.path.exists(filename):
                    continue
                api.upload_file(
                    path_or_fileobj=filename,
                    path_in_repo=filename,
                    repo_id=DATASET_REPO_ID,
                    repo_type="dataset",
                    commit_message=message
                )
            st.success("Knowledge Base Saved!")
        except Exception as e:
            st.error(f"Upload failed: {e}")
86
 
87
  # --- HELPER FUNCTIONS ---
88
  def parse_file(uploaded_file):
 
94
  for i, page in enumerate(reader.pages):
95
  page_text = page.extract_text()
96
  if page_text:
 
97
  text += f"\n[PAGE {i+1}] {page_text}"
98
  elif filename.endswith(".docx"):
99
  doc = docx.Document(uploaded_file)
 
105
  return text, filename
106
 
107
def recursive_chunking(text, source, chunk_size=500, overlap=100):
    """Split *text* into overlapping word-window chunks.

    Args:
        text: Full document text; may contain "[PAGE n]" markers injected
            by parse_file.
        source: Originating filename, stored on every chunk.
        chunk_size: Window size in words.
        overlap: Number of words shared between consecutive windows.

    Returns:
        List of dicts with "text", "source" and "page" keys. Chunks shorter
        than 50 characters are dropped as noise.
    """
    words = text.split()
    chunks = []
    # Guard against a non-positive step (overlap >= chunk_size), which would
    # make range() raise or stall.
    step = max(1, chunk_size - overlap)

    for i in range(0, len(words), step):
        chunk_text = " ".join(words[i:i + chunk_size])

        # Recover the page number from the last marker in this window.
        page_num = "Unknown"
        start = chunk_text.rfind("[PAGE")
        if start != -1:
            end = chunk_text.find("]", start)
            if end != -1:  # only parse complete markers; a truncated one stays "Unknown"
                page_num = chunk_text[start + 6:end]  # len("[PAGE ") == 6

        if len(chunk_text) > 50:
            chunks.append({
                "text": chunk_text,
                "source": source,
                "page": page_num
            })
    return chunks
130
 
131
# --- CORE SEARCH ENGINE (FAISS VERSION) ---
class RobustSearchEngine:
    """Bi-encoder retrieval over a flat FAISS index, re-ranked by a cross-encoder."""

    def __init__(self):
        # Load models (force CPU to avoid meta-tensor errors in containers).
        self.bi_encoder = SentenceTransformer('all-MiniLM-L6-v2', device="cpu")
        self.cross_encoder = CrossEncoder(
            'cross-encoder/ms-marco-MiniLM-L-6-v2',
            device="cpu",
            automodel_args={"low_cpu_mem_usage": False}
        )

        # Index plus a parallel metadata list (dicts, aligned with vector
        # order). Both stay empty until the first add; the previous
        # redundant else-branch re-assignment is removed.
        self.index = None
        self.metadata = []
        if os.path.exists(INDEX_FILE) and os.path.exists(META_FILE):
            self.index = faiss.read_index(INDEX_FILE)
            with open(META_FILE, "rb") as f:
                self.metadata = pickle.load(f)

    def add_documents(self, chunks):
        """Embed *chunks* (dicts with "text"/"source"/"page"), add them to the
        index, persist index + metadata to disk, and return the chunk count."""
        texts = [c["text"] for c in chunks]
        embeddings = self.bi_encoder.encode(texts)
        faiss.normalize_L2(embeddings)  # normalized vectors => inner product == cosine

        # Lazily create the index once the embedding dimension is known.
        if self.index is None:
            self.index = faiss.IndexFlatIP(embeddings.shape[1])

        self.index.add(embeddings)
        self.metadata.extend(chunks)

        # Persist both artifacts so a restart (or hub sync) can pick them up.
        faiss.write_index(self.index, INDEX_FILE)
        with open(META_FILE, "wb") as f:
            pickle.dump(self.metadata, f)

        return len(texts)

    def search(self, query, top_k=5):
        """Return up to *top_k* result dicts sorted by cross-encoder score."""
        if not self.index or self.index.ntotal == 0:
            return []

        # 1. Dense retrieval: over-fetch candidates for the re-ranker.
        candidate_k = top_k * 3
        q_vec = self.bi_encoder.encode([query])
        faiss.normalize_L2(q_vec)

        scores, indices = self.index.search(q_vec, min(self.index.ntotal, candidate_k))

        candidates = []
        for i, idx in enumerate(indices[0]):
            if idx != -1:  # FAISS pads short result lists with -1
                meta = self.metadata[idx]
                candidates.append({
                    "text": meta["text"],
                    "source": meta["source"],
                    "page": meta["page"],
                    "base_score": scores[0][i]
                })

        # Guard: nothing survived the -1 filter; predict() on an empty pair
        # list can fail and there is nothing to rank anyway.
        if not candidates:
            return []

        # 2. Re-ranking with the cross-encoder.
        pairs = [[query, c["text"]] for c in candidates]
        cross_scores = self.cross_encoder.predict(pairs)
        for i, c in enumerate(candidates):
            c["score"] = cross_scores[i]

        return sorted(candidates, key=lambda x: x["score"], reverse=True)[:top_k]
204
 
205
  # --- UI LOGIC ---
 
 
 
 
 
 
 
206
if 'engine' not in st.session_state:
    # 1. Try cloud sync first: pull any previously saved index/metadata
    #    files from the HF dataset into the working directory.
    IndexManager.load_from_hub()
    # 2. Start engine -- its __init__ picks up the downloaded files (if
    #    any) from disk, so the order of these two calls matters.
    st.session_state.engine = RobustSearchEngine()
211
 
212
  with st.sidebar:
213
  st.header("🗄️ Knowledge Base")
214
  uploaded_files = st.file_uploader("Ingest Documents", accept_multiple_files=True)
215
 
216
+ if uploaded_files and st.button("Index Documents"):
217
+ with st.spinner("Processing..."):
218
  new_chunks = []
219
  for f in uploaded_files:
220
  txt, fname = parse_file(f)
 
223
 
224
  if new_chunks:
225
  count = st.session_state.engine.add_documents(new_chunks)
226
+ IndexManager.save_to_hub()
227
+ st.success(f"Added {count} chunks!")
 
 
 
228
 
229
st.title("⚓ Navy Search (FAISS Architecture)")
query = st.text_input("Enter Query:")

if query:
    hits = st.session_state.engine.search(query)

    st.markdown("### 🔍 Results")
    # Assemble the RAG context up front; the loop below only renders.
    context_text = "".join(
        f"Source: {hit['source']}\n{hit['text']}\n\n" for hit in hits
    )
    for hit in hits:
        with st.expander(f"{hit['source']} (Pg {hit['page']}) - Score {hit['score']:.2f}", expanded=True):
            st.markdown(hit['text'])

    if st.button("Generate Summary"):
        # Lazy import: the inference client is only needed on demand.
        from huggingface_hub import InferenceClient
        client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.3", token=HF_TOKEN)
        prompt = f"Context:\n{context_text}\n\nUser: {query}\nAnswer:"
        with st.spinner("Thinking..."):
            try:
                st.write(client.text_generation(prompt, max_new_tokens=400))
            except Exception as e:
                st.error(f"LLM Error: {e}")