NavyDevilDoc committed on
Commit
39f39ce
·
verified ·
1 Parent(s): f09334e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -33
app.py CHANGED
@@ -8,6 +8,7 @@ from huggingface_hub import HfApi, hf_hub_download
8
  from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError
9
  import pypdf
10
  import docx
 
11
 
12
  # --- CONFIGURATION ---
13
  DATASET_REPO_ID = "NavyDevilDoc/navy-policy-index"
@@ -17,7 +18,7 @@ META_FILE = "navy_metadata.pkl"
17
 
18
  st.set_page_config(page_title="Document Finder", layout="wide")
19
 
20
- # --- PERSISTENCE (SAME AS BEFORE) ---
21
  class IndexManager:
22
  @staticmethod
23
  def load_from_hub():
@@ -38,7 +39,7 @@ class IndexManager:
38
  st.toast("Database Synced!", icon="☁️")
39
  except Exception as e: st.error(f"Sync Error: {e}")
40
 
41
- # --- PARSING & CHUNKING (SAME AS BEFORE) ---
42
  def parse_file(uploaded_file):
43
  text = ""
44
  filename = uploaded_file.name
@@ -64,38 +65,69 @@ def recursive_chunking(text, source, chunk_size=500, overlap=100):
64
  chunks.append({"text": chunk_text, "source": source})
65
  return chunks
66
 
67
- # --- CORE SEARCH ENGINE (UPDATED FOR DOC LEVEL) ---
68
  class DocSearchEngine:
69
  def __init__(self):
70
- self.bi_encoder = SentenceTransformer('all-mpnet-base-v2', device="cpu")
 
71
  self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device="cpu", automodel_args={"low_cpu_mem_usage": False})
 
72
  self.index = None
73
  self.metadata = []
74
 
75
  if os.path.exists(INDEX_FILE) and os.path.exists(META_FILE):
76
- self.index = faiss.read_index(INDEX_FILE)
77
- with open(META_FILE, "rb") as f: self.metadata = pickle.load(f)
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  def add_documents(self, chunks):
80
  texts = [c["text"] for c in chunks]
81
  embeddings = self.bi_encoder.encode(texts)
82
  faiss.normalize_L2(embeddings)
83
 
84
- if self.index is None:
85
- self.index = faiss.IndexFlatIP(embeddings.shape[1])
86
-
87
- self.index.add(embeddings)
88
  self.metadata.extend(chunks)
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  faiss.write_index(self.index, INDEX_FILE)
91
  with open(META_FILE, "wb") as f: pickle.dump(self.metadata, f)
92
- return len(texts)
93
 
94
  def search_documents(self, query, top_k=5):
95
  if not self.index or self.index.ntotal == 0: return []
96
 
97
- # 1. Retrieve MANY chunks (to ensure we find diverse documents)
98
- # If we only get top 5 chunks, they might all be from the same document.
99
  candidate_k = top_k * 10
100
 
101
  q_vec = self.bi_encoder.encode([query])
@@ -103,7 +135,6 @@ class DocSearchEngine:
103
 
104
  scores, indices = self.index.search(q_vec, min(self.index.ntotal, candidate_k))
105
 
106
- # 2. Extract Raw Candidates
107
  raw_candidates = []
108
  for i, idx in enumerate(indices[0]):
109
  if idx != -1:
@@ -113,30 +144,21 @@ class DocSearchEngine:
113
  "bi_score": scores[0][i]
114
  })
115
 
116
- # 3. Aggregation: Find the BEST chunk for each document
117
- # We group by 'source' and keep the max score
118
- doc_map = {} # {filename: {best_score, best_snippet}}
119
-
120
  for cand in raw_candidates:
121
  source = cand['source']
122
  score = cand['bi_score']
123
-
124
- # Initialization
125
  if source not in doc_map:
126
  doc_map[source] = {"score": score, "snippet": cand['text']}
127
  else:
128
- # Update if we found a better chunk in the same doc
129
  if score > doc_map[source]["score"]:
130
  doc_map[source]["score"] = score
131
  doc_map[source]["snippet"] = cand['text']
132
 
133
- # 4. Sort Documents by their Best Chunk Score
134
  ranked_docs = sorted(doc_map.items(), key=lambda item: item[1]['score'], reverse=True)
135
 
136
- # 5. Cross-Encoder Verification (Optional but recommended)
137
- # We verify the "Best Snippet" to ensure it's not a hallucination
138
  final_results = []
139
- top_docs = ranked_docs[:top_k] # Only re-rank the top contenders
140
 
141
  if top_docs:
142
  pairs = [[query, doc[1]['snippet']] for doc in top_docs]
@@ -145,11 +167,9 @@ class DocSearchEngine:
145
  for i, (source, data) in enumerate(top_docs):
146
  final_results.append({
147
  "source": source,
148
- "score": cross_scores[i], # High accuracy score
149
  "snippet": data['snippet']
150
  })
151
-
152
- # Final Sort after Cross-Encoder
153
  final_results = sorted(final_results, key=lambda x: x["score"], reverse=True)
154
 
155
  return final_results
@@ -173,6 +193,33 @@ with st.sidebar:
173
  IndexManager.save_to_hub()
174
  st.success("Indexed!")
175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  st.title("⚓ Document Finder")
177
  st.caption("Locates the specific Instruction or NAVADMIN relevant to your query.")
178
 
@@ -189,18 +236,16 @@ if query:
189
  for res in results:
190
  score = res['score']
191
 
192
- # Color coding the confidence
193
  if score > 2:
194
- border_color = "#09ab3b" # Green
195
  confidence = "High Match"
196
  elif score > 0:
197
- border_color = "#ffbd45" # Orange
198
  confidence = "Possible Match"
199
  else:
200
- border_color = "#ff4b4b" # Red
201
  confidence = "Low Match"
202
 
203
- # --- DOCUMENT CARD UI ---
204
  with st.container():
205
  st.markdown(f"""
206
  <div style="
 
8
  from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError
9
  import pypdf
10
  import docx
11
+ import time
12
 
13
  # --- CONFIGURATION ---
14
  DATASET_REPO_ID = "NavyDevilDoc/navy-policy-index"
 
18
 
19
  st.set_page_config(page_title="Document Finder", layout="wide")
20
 
21
+ # --- PERSISTENCE ---
22
  class IndexManager:
23
  @staticmethod
24
  def load_from_hub():
 
39
  st.toast("Database Synced!", icon="☁️")
40
  except Exception as e: st.error(f"Sync Error: {e}")
41
 
42
+ # --- PARSING & CHUNKING ---
43
  def parse_file(uploaded_file):
44
  text = ""
45
  filename = uploaded_file.name
 
65
  chunks.append({"text": chunk_text, "source": source})
66
  return chunks
67
 
68
+ # --- CORE SEARCH ENGINE ---
69
  class DocSearchEngine:
70
  def __init__(self):
71
+ # Force CPU to avoid Docker memory issues
72
+ self.bi_encoder = SentenceTransformer('all-MiniLM-L6-v2', device="cpu")
73
  self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device="cpu", automodel_args={"low_cpu_mem_usage": False})
74
+
75
  self.index = None
76
  self.metadata = []
77
 
78
  if os.path.exists(INDEX_FILE) and os.path.exists(META_FILE):
79
+ try:
80
+ self.index = faiss.read_index(INDEX_FILE)
81
+ with open(META_FILE, "rb") as f: self.metadata = pickle.load(f)
82
+ except Exception as e:
83
+ st.error(f"Index load failed, starting fresh: {e}")
84
+ self.reset_index()
85
+ else:
86
+ self.reset_index()
87
+
88
+ def reset_index(self):
89
+ """Wipes the index clean"""
90
+ d = 384
91
+ self.index = faiss.IndexIDMap(faiss.IndexFlatIP(d))
92
+ self.metadata = []
93
+ self.save()
94
 
95
  def add_documents(self, chunks):
96
  texts = [c["text"] for c in chunks]
97
  embeddings = self.bi_encoder.encode(texts)
98
  faiss.normalize_L2(embeddings)
99
 
100
+ start_id = len(self.metadata)
101
+ ids = np.arange(start_id, start_id + len(chunks)).astype('int64')
102
+
103
+ self.index.add_with_ids(embeddings, ids)
104
  self.metadata.extend(chunks)
105
 
106
+ self.save()
107
+ return len(texts)
108
+
109
+ def delete_file(self, filename):
110
+ if self.index is None or self.index.ntotal == 0: return 0
111
+
112
+ new_chunks = [c for c in self.metadata if c['source'] != filename]
113
+ removed_count = len(self.metadata) - len(new_chunks)
114
+
115
+ if removed_count > 0:
116
+ self.reset_index()
117
+ if new_chunks:
118
+ self.add_documents(new_chunks)
119
+ else:
120
+ self.save()
121
+
122
+ return removed_count
123
+
124
+ def save(self):
125
  faiss.write_index(self.index, INDEX_FILE)
126
  with open(META_FILE, "wb") as f: pickle.dump(self.metadata, f)
 
127
 
128
  def search_documents(self, query, top_k=5):
129
  if not self.index or self.index.ntotal == 0: return []
130
 
 
 
131
  candidate_k = top_k * 10
132
 
133
  q_vec = self.bi_encoder.encode([query])
 
135
 
136
  scores, indices = self.index.search(q_vec, min(self.index.ntotal, candidate_k))
137
 
 
138
  raw_candidates = []
139
  for i, idx in enumerate(indices[0]):
140
  if idx != -1:
 
144
  "bi_score": scores[0][i]
145
  })
146
 
147
+ doc_map = {}
 
 
 
148
  for cand in raw_candidates:
149
  source = cand['source']
150
  score = cand['bi_score']
 
 
151
  if source not in doc_map:
152
  doc_map[source] = {"score": score, "snippet": cand['text']}
153
  else:
 
154
  if score > doc_map[source]["score"]:
155
  doc_map[source]["score"] = score
156
  doc_map[source]["snippet"] = cand['text']
157
 
 
158
  ranked_docs = sorted(doc_map.items(), key=lambda item: item[1]['score'], reverse=True)
159
 
 
 
160
  final_results = []
161
+ top_docs = ranked_docs[:top_k]
162
 
163
  if top_docs:
164
  pairs = [[query, doc[1]['snippet']] for doc in top_docs]
 
167
  for i, (source, data) in enumerate(top_docs):
168
  final_results.append({
169
  "source": source,
170
+ "score": cross_scores[i],
171
  "snippet": data['snippet']
172
  })
 
 
173
  final_results = sorted(final_results, key=lambda x: x["score"], reverse=True)
174
 
175
  return final_results
 
193
  IndexManager.save_to_hub()
194
  st.success("Indexed!")
195
 
196
+ st.divider()
197
+ st.header("⚙️ Manage Index")
198
+
199
+ if st.session_state.engine.index:
200
+ st.write(f"**Total Chunks:** {st.session_state.engine.index.ntotal}")
201
+ unique_files = list(set([m['source'] for m in st.session_state.engine.metadata]))
202
+ st.write(f"**Documents:** {len(unique_files)}")
203
+
204
+ file_to_delete = st.selectbox("Select file to remove:", [""] + unique_files)
205
+ if file_to_delete and st.button("🗑️ Delete File"):
206
+ with st.spinner("Removing..."):
207
+ count = st.session_state.engine.delete_file(file_to_delete)
208
+ IndexManager.save_to_hub()
209
+ st.success(f"Removed {file_to_delete}")
210
+ time.sleep(1)
211
+ st.rerun()
212
+
213
+ st.divider()
214
+ # THE NUCLEAR OPTION
215
+ if st.button("⚠️ Wipe Entire Index", type="primary"):
216
+ with st.spinner("Nuking database..."):
217
+ st.session_state.engine.reset_index()
218
+ IndexManager.save_to_hub()
219
+ st.success("Index wiped clean.")
220
+ time.sleep(1)
221
+ st.rerun()
222
+
223
  st.title("⚓ Document Finder")
224
  st.caption("Locates the specific Instruction or NAVADMIN relevant to your query.")
225
 
 
236
  for res in results:
237
  score = res['score']
238
 
 
239
  if score > 2:
240
+ border_color = "#09ab3b"
241
  confidence = "High Match"
242
  elif score > 0:
243
+ border_color = "#ffbd45"
244
  confidence = "Possible Match"
245
  else:
246
+ border_color = "#ff4b4b"
247
  confidence = "Low Match"
248
 
 
249
  with st.container():
250
  st.markdown(f"""
251
  <div style="