Update app.py
app.py
CHANGED
@@ -8,6 +8,7 @@ from huggingface_hub import HfApi, hf_hub_download
 from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError
 import pypdf
 import docx
+import time

 # --- CONFIGURATION ---
 DATASET_REPO_ID = "NavyDevilDoc/navy-policy-index"
@@ -17,7 +18,7 @@ META_FILE = "navy_metadata.pkl"

 st.set_page_config(page_title="Document Finder", layout="wide")

-# --- PERSISTENCE
+# --- PERSISTENCE ---
 class IndexManager:
     @staticmethod
     def load_from_hub():
@@ -38,7 +39,7 @@ class IndexManager:
             st.toast("Database Synced!", icon="☁️")
         except Exception as e: st.error(f"Sync Error: {e}")

-# --- PARSING & CHUNKING
+# --- PARSING & CHUNKING ---
 def parse_file(uploaded_file):
     text = ""
     filename = uploaded_file.name
@@ -64,38 +65,69 @@ def recursive_chunking(text, source, chunk_size=500, overlap=100):
         chunks.append({"text": chunk_text, "source": source})
     return chunks

-# --- CORE SEARCH ENGINE
+# --- CORE SEARCH ENGINE ---
 class DocSearchEngine:
     def __init__(self):
-
+        # Force CPU to avoid Docker memory issues
+        self.bi_encoder = SentenceTransformer('all-MiniLM-L6-v2', device="cpu")
         self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device="cpu", automodel_args={"low_cpu_mem_usage": False})
+
         self.index = None
         self.metadata = []

         if os.path.exists(INDEX_FILE) and os.path.exists(META_FILE):
-
-
+            try:
+                self.index = faiss.read_index(INDEX_FILE)
+                with open(META_FILE, "rb") as f: self.metadata = pickle.load(f)
+            except Exception as e:
+                st.error(f"Index load failed, starting fresh: {e}")
+                self.reset_index()
+        else:
+            self.reset_index()
+
+    def reset_index(self):
+        """Wipes the index clean"""
+        d = 384
+        self.index = faiss.IndexIDMap(faiss.IndexFlatIP(d))
+        self.metadata = []
+        self.save()

     def add_documents(self, chunks):
         texts = [c["text"] for c in chunks]
         embeddings = self.bi_encoder.encode(texts)
         faiss.normalize_L2(embeddings)

-
-
-
-        self.index.
+        start_id = len(self.metadata)
+        ids = np.arange(start_id, start_id + len(chunks)).astype('int64')
+
+        self.index.add_with_ids(embeddings, ids)
         self.metadata.extend(chunks)

+        self.save()
+        return len(texts)
+
+    def delete_file(self, filename):
+        if self.index is None or self.index.ntotal == 0: return 0
+
+        new_chunks = [c for c in self.metadata if c['source'] != filename]
+        removed_count = len(self.metadata) - len(new_chunks)
+
+        if removed_count > 0:
+            self.reset_index()
+            if new_chunks:
+                self.add_documents(new_chunks)
+            else:
+                self.save()
+
+        return removed_count
+
+    def save(self):
         faiss.write_index(self.index, INDEX_FILE)
         with open(META_FILE, "wb") as f: pickle.dump(self.metadata, f)
-        return len(texts)

     def search_documents(self, query, top_k=5):
         if not self.index or self.index.ntotal == 0: return []

-        # 1. Retrieve MANY chunks (to ensure we find diverse documents)
-        # If we only get top 5 chunks, they might all be from the same document.
         candidate_k = top_k * 10

         q_vec = self.bi_encoder.encode([query])
@@ -103,7 +135,6 @@ class DocSearchEngine:

         scores, indices = self.index.search(q_vec, min(self.index.ntotal, candidate_k))

-        # 2. Extract Raw Candidates
         raw_candidates = []
         for i, idx in enumerate(indices[0]):
             if idx != -1:
@@ -113,30 +144,21 @@ class DocSearchEngine:
                     "bi_score": scores[0][i]
                 })

-
-        # We group by 'source' and keep the max score
-        doc_map = {} # {filename: {best_score, best_snippet}}
-
+        doc_map = {}
         for cand in raw_candidates:
             source = cand['source']
             score = cand['bi_score']
-
-            # Initialization
             if source not in doc_map:
                 doc_map[source] = {"score": score, "snippet": cand['text']}
             else:
-                # Update if we found a better chunk in the same doc
                 if score > doc_map[source]["score"]:
                     doc_map[source]["score"] = score
                     doc_map[source]["snippet"] = cand['text']

-        # 4. Sort Documents by their Best Chunk Score
         ranked_docs = sorted(doc_map.items(), key=lambda item: item[1]['score'], reverse=True)

-        # 5. Cross-Encoder Verification (Optional but recommended)
-        # We verify the "Best Snippet" to ensure it's not a hallucination
         final_results = []
-        top_docs = ranked_docs[:top_k]
+        top_docs = ranked_docs[:top_k]

         if top_docs:
             pairs = [[query, doc[1]['snippet']] for doc in top_docs]
@@ -145,11 +167,9 @@ class DocSearchEngine:
             for i, (source, data) in enumerate(top_docs):
                 final_results.append({
                     "source": source,
-                    "score": cross_scores[i],
+                    "score": cross_scores[i],
                     "snippet": data['snippet']
                 })
-
-        # Final Sort after Cross-Encoder
         final_results = sorted(final_results, key=lambda x: x["score"], reverse=True)

         return final_results
@@ -173,6 +193,33 @@ with st.sidebar:
                 IndexManager.save_to_hub()
             st.success("Indexed!")

+    st.divider()
+    st.header("⚙️ Manage Index")
+
+    if st.session_state.engine.index:
+        st.write(f"**Total Chunks:** {st.session_state.engine.index.ntotal}")
+        unique_files = list(set([m['source'] for m in st.session_state.engine.metadata]))
+        st.write(f"**Documents:** {len(unique_files)}")
+
+        file_to_delete = st.selectbox("Select file to remove:", [""] + unique_files)
+        if file_to_delete and st.button("🗑️ Delete File"):
+            with st.spinner("Removing..."):
+                count = st.session_state.engine.delete_file(file_to_delete)
+                IndexManager.save_to_hub()
+            st.success(f"Removed {file_to_delete}")
+            time.sleep(1)
+            st.rerun()
+
+        st.divider()
+        # THE NUCLEAR OPTION
+        if st.button("⚠️ Wipe Entire Index", type="primary"):
+            with st.spinner("Nuking database..."):
+                st.session_state.engine.reset_index()
+                IndexManager.save_to_hub()
+            st.success("Index wiped clean.")
+            time.sleep(1)
+            st.rerun()
+
 st.title("⚓ Document Finder")
 st.caption("Locates the specific Instruction or NAVADMIN relevant to your query.")
@@ -189,18 +236,16 @@ if query:
     for res in results:
         score = res['score']

-        # Color coding the confidence
         if score > 2:
-            border_color = "#09ab3b"
+            border_color = "#09ab3b"
             confidence = "High Match"
         elif score > 0:
-            border_color = "#ffbd45"
+            border_color = "#ffbd45"
             confidence = "Possible Match"
         else:
-            border_color = "#ff4b4b"
+            border_color = "#ff4b4b"
             confidence = "Low Match"

-        # --- DOCUMENT CARD UI ---
         with st.container():
             st.markdown(f"""
             <div style="
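
For reference, a minimal self-contained sketch of the rebuild-on-delete pattern that the new reset_index(), add_documents(), and delete_file() methods follow. Random vectors stand in for real sentence-transformer embeddings, and the helper names here (build_index, add_chunks, rebuild_without, embed) are illustrative only; they are not part of app.py.

import numpy as np
import faiss

DIM = 384  # embedding width of all-MiniLM-L6-v2

def build_index():
    # Inner-product index wrapped in an ID map, as in reset_index()
    return faiss.IndexIDMap(faiss.IndexFlatIP(DIM))

def add_chunks(index, metadata, chunks, vectors):
    # Mirrors add_documents(): ids continue from the current metadata length
    faiss.normalize_L2(vectors)
    ids = np.arange(len(metadata), len(metadata) + len(chunks)).astype('int64')
    index.add_with_ids(vectors, ids)
    metadata.extend(chunks)
    return index, metadata

def rebuild_without(metadata, filename, embed):
    # Mirrors delete_file(): drop one source, then re-add the rest to a fresh index
    kept = [c for c in metadata if c['source'] != filename]
    index, meta = build_index(), []
    if kept:
        index, meta = add_chunks(index, meta, kept, embed(kept))
    return index, meta

# Demo: random vectors stand in for bi-encoder embeddings
rng = np.random.default_rng(0)
embed = lambda chunks: rng.random((len(chunks), DIM), dtype=np.float32)

chunks = [{"text": "leave policy", "source": "doc1.pdf"},
          {"text": "uniform policy", "source": "doc2.pdf"}]
index, meta = add_chunks(build_index(), [], chunks, embed(chunks))
print(index.ntotal)              # 2

index, meta = rebuild_without(meta, "doc1.pdf", embed)
print(index.ntotal, len(meta))   # 1 1

Rebuilding after a deletion is linear in the number of remaining chunks, which is acceptable at this index size; FAISS also exposes remove_ids on flat indexes, but a full rebuild keeps the metadata list and the FAISS ids trivially in sync.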