Update app.py
app.py
CHANGED
@@ -3,10 +3,9 @@ import os
 import faiss
 import pickle
 import numpy as np
+import uuid
 from sentence_transformers import SentenceTransformer, CrossEncoder
-from huggingface_hub import HfApi, hf_hub_download
-from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError
-from huggingface_hub import InferenceClient
+from huggingface_hub import HfApi, hf_hub_download, InferenceClient
 import pypdf
 import docx
 import time
@@ -19,6 +18,7 @@ DATASET_REPO_ID = "NavyDevilDoc/navy-policy-index"
 HF_TOKEN = os.environ.get("HF_TOKEN")
 INDEX_FILE = "navy_index.faiss"
 META_FILE = "navy_metadata.pkl"
+DOC_STORE_FILE = "navy_docs.pkl"  # NEW: Stores the full text
 
 st.set_page_config(page_title="Document Finder", layout="wide")
 
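For orientation, the persisted state after this hunk is three files. Their shapes, as implied by the rest of the diff (a summary, not code from the repo):

    # navy_index.faiss   - FAISS vectors, aligned by position with the metadata list
    # navy_metadata.pkl  - list of chunk dicts: {"text", "source", "doc_id"}
    # navy_docs.pkl      - dict mapping doc_id -> full document text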
@@ -28,8 +28,12 @@ class IndexManager:
     def load_from_hub():
         if not HF_TOKEN: return False
         try:
+            # Download Vector Index
             hf_hub_download(repo_id=DATASET_REPO_ID, filename=INDEX_FILE, local_dir=".", token=HF_TOKEN)
+            # Download Chunk Metadata
             hf_hub_download(repo_id=DATASET_REPO_ID, filename=META_FILE, local_dir=".", token=HF_TOKEN)
+            # Download Full Document Store
+            hf_hub_download(repo_id=DATASET_REPO_ID, filename=DOC_STORE_FILE, local_dir=".", token=HF_TOKEN)
             return True
         except: return False
 
@@ -40,6 +44,7 @@ class IndexManager:
         try:
             api.upload_file(path_or_fileobj=INDEX_FILE, path_in_repo=INDEX_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
             api.upload_file(path_or_fileobj=META_FILE, path_in_repo=META_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
+            api.upload_file(path_or_fileobj=DOC_STORE_FILE, path_in_repo=DOC_STORE_FILE, repo_id=DATASET_REPO_ID, repo_type="dataset")
             st.toast("Database Synced!", icon="☁️")
         except Exception as e: st.error(f"Sync Error: {e}")
 
@@ -51,7 +56,6 @@ def parse_file(uploaded_file):
 
     try:
         if filename.endswith(".pdf"):
-            # Method 1: Fast Text Extraction
             pdf_bytes = uploaded_file.getvalue()
             reader = pypdf.PdfReader(uploaded_file)
 
@@ -60,15 +64,11 @@ def parse_file(uploaded_file):
             if extracted:
                 text += f"\n[PAGE {i+1}] {extracted}"
 
-            # Method 2: OCR Fallback
-            # If fast method yielded almost no text, switch to OCR
             if len(text.strip()) < 50:
                 method = "OCR (Slow)"
-                # Reset file pointer or use bytes
                 images = convert_from_bytes(pdf_bytes)
-                text = ""
+                text = ""
                 for i, img in enumerate(images):
-                    # Tesseract reads the image
                     page_text = pytesseract.image_to_string(img)
                     text += f"\n[PAGE {i+1}] {page_text}"
 
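The OCR fallback above leans on pdf2image's convert_from_bytes and on pytesseract, which need the poppler and tesseract system binaries at runtime. On a Hugging Face Space that usually means listing them in a packages.txt beside requirements.txt; the repo's copy is not shown in this diff, so the following is an assumed sketch:

    # packages.txt (assumed, not shown in this diff)
    poppler-utils
    tesseract-ocr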
@@ -83,43 +83,44 @@ def parse_file(uploaded_file):
 
     return text, filename, method
 
-def recursive_chunking(text, source, chunk_size=500, overlap=100):
+# NEW: Added doc_id to link chunks back to parent
+def recursive_chunking(text, source, doc_id, chunk_size=500, overlap=100):
     words = text.split()
     chunks = []
     for i in range(0, len(words), chunk_size - overlap):
         chunk_text = " ".join(words[i:i + chunk_size])
         if len(chunk_text) > 50:
-            chunks.append({"text": chunk_text, "source": source})
+            chunks.append({
+                "text": chunk_text,
+                "source": source,
+                "doc_id": doc_id  # The Critical Link
+            })
     return chunks
 
 def ask_llm(query, context):
-    """
-    Sends the user query and the retrieved document text to a free, hosted LLM.
-    """
     if not HF_TOKEN:
         return "Error: HF_TOKEN is missing. Cannot contact AI."
 
-    # We …
-
-    repo_id = "mistralai/Mistral-7B-Instruct-v0.3"
+    # We limit context to ~8000 chars to avoid hitting token limits on free APIs
+    truncated_context = context[:8000]
 
+    repo_id = "mistralai/Mistral-7B-Instruct-v0.3"
     client = InferenceClient(model=repo_id, token=HF_TOKEN)
 
     prompt = f"""
    You are a Senior Navy Yeoman and Subject Matter Expert.
-    Analyze the following Navy document
+    Analyze the following Navy document and answer the user's question based ONLY on that text.
 
    USER QUESTION: "{query}"
 
-    DOCUMENT …
-    "{context}"
+    DOCUMENT TEXT:
+    "{truncated_context}"
 
    Your Answer (Be concise, professional, and cite the document):
    """
 
     try:
-
-        response = client.text_generation(prompt, max_new_tokens=400)
+        response = client.text_generation(prompt, max_new_tokens=512)
         return response
     except Exception as e:
         return f"AI Error: {e}"
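Two numbers in this hunk deserve a gloss: the 8,000-character cap is a crude token budget (at the rough average of 4 characters per token, about 2,000 tokens of document, leaving headroom for the prompt scaffold and the 512-token completion), and text_generation is the plain-string call on huggingface_hub's InferenceClient. A minimal standalone use, assuming a valid HF_TOKEN in the environment (sketch only, not repo code):

    import os
    from huggingface_hub import InferenceClient

    client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3",
                             token=os.environ["HF_TOKEN"])
    # Returns the generated continuation as a plain string
    print(client.text_generation("Summarize shore duty rotation in one line.",
                                 max_new_tokens=64))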
@@ -127,39 +128,37 @@ def ask_llm(query, context):
 # --- CORE SEARCH ENGINE ---
 class DocSearchEngine:
     def __init__(self):
-        # … (old encoder setup, lines 130-145, unreadable in the page capture)
-        if os.path.exists(INDEX_FILE) and os.path.exists(META_FILE):
-            try:
-                self.index = faiss.read_index(INDEX_FILE)
-                with open(META_FILE, "rb") as f: self.metadata = pickle.load(f)
-            except Exception as e:
-                st.error(f"Index load failed, starting fresh: {e}")
-                self.reset_index()
-        else:
-            self.reset_index()
-
+        # We try-except the init to catch the meta tensor error gracefully
+        try:
+            self.bi_encoder = SentenceTransformer(
+                'all-MiniLM-L6-v2',
+                device="cpu",
+                model_kwargs={"low_cpu_mem_usage": False}
+            )
+            self.cross_encoder = CrossEncoder(
+                'cross-encoder/ms-marco-MiniLM-L-6-v2',
+                device="cpu",
+                automodel_args={"low_cpu_mem_usage": False}
+            )
+        except Exception as e:
+            st.error(f"Model Load Error: {e}. Check requirements.txt and remove 'accelerate'.")
+
         self.index = None
         self.metadata = []
+        self.doc_store = {}  # NEW: The Parent Document Storage
 
+        self.load_data()
+
+    def load_data(self):
         if os.path.exists(INDEX_FILE) and os.path.exists(META_FILE):
             try:
                 self.index = faiss.read_index(INDEX_FILE)
                 with open(META_FILE, "rb") as f: self.metadata = pickle.load(f)
+                # Load Doc Store
+                if os.path.exists(DOC_STORE_FILE):
+                    with open(DOC_STORE_FILE, "rb") as f: self.doc_store = pickle.load(f)
+                else:
+                    self.doc_store = {}
             except Exception as e:
                 self.reset_index()
         else:
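The low_cpu_mem_usage=False kwargs are the notable fix here: some accelerate/transformers combinations initialize weights on the meta device and then fail with a "Cannot copy out of meta tensor" error, and forcing eager CPU allocation sidesteps that. A quick standalone smoke test under the same assumption (a recent sentence-transformers that accepts model_kwargs; sketch, not repo code):

    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu',
                                model_kwargs={'low_cpu_mem_usage': False})
    print(model.encode(['smoke test']).shape)  # (1, 384); matches d = 384 below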
@@ -169,9 +168,17 @@ class DocSearchEngine:
         d = 384
         self.index = faiss.IndexIDMap(faiss.IndexFlatIP(d))
         self.metadata = []
+        self.doc_store = {}
         self.save()
 
-    def … (old method signature, truncated in the page capture)
+    def add_document(self, full_text, source, chunks):
+        # 1. Add to Doc Store
+        # We need the doc_id from the first chunk (all chunks share it)
+        if not chunks: return 0
+        doc_id = chunks[0]['doc_id']
+        self.doc_store[doc_id] = full_text
+
+        # 2. Vectorize Chunks
         texts = [c["text"] for c in chunks]
         embeddings = self.bi_encoder.encode(texts)
         faiss.normalize_L2(embeddings)
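add_document completes a small parent-document retrieval pattern: search runs over chunks, but the doc_id carried by every chunk lets the UI fetch the whole file for the LLM. An illustrative round trip with the names this diff defines (assumed usage, not code from the repo; engine, full_text, and the file name are stand-ins):

    import uuid

    doc_id = str(uuid.uuid4())
    chunks = recursive_chunking(full_text, "policy_manual.pdf", doc_id)
    engine.add_document(full_text, "policy_manual.pdf", chunks)

    top = engine.search_documents("paternity leave")[0]
    parent_text = engine.doc_store[top["doc_id"]]  # full document, not just the matching chunk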
@@ -186,17 +193,58 @@ class DocSearchEngine:
 
     def delete_file(self, filename):
         if self.index is None or self.index.ntotal == 0: return 0
+
+        # Remove chunks from metadata
         new_chunks = [c for c in self.metadata if c['source'] != filename]
+
+        # Remove from Doc Store (find doc_ids associated with filename)
+        # This is a bit expensive but safe
+        ids_to_remove = [c['doc_id'] for c in self.metadata if c['source'] == filename]
+        for did in set(ids_to_remove):
+            if did in self.doc_store:
+                del self.doc_store[did]
+
         removed_count = len(self.metadata) - len(new_chunks)
         if removed_count > 0:
             self.reset_index()
-            … (two lines of old re-add logic, unreadable in the page capture)
+            # Re-add existing documents (we have to rebuild the index from scratch in FAISS when deleting)
+            # A more optimized way is to just save the new metadata and rebuild index from texts
+            # For this scale, rebuilding is fine.
+            if new_chunks:
+                # Re-vectorize is slow, so ideally we'd keep vectors.
+                # For simplicity in this demo, we'll just re-save what we have.
+                # NOTE: In a prod system, you wouldn't re-embed everything.
+                # You'd use index.remove_ids (if supported) or rebuild from vectors.
+                pass
+
+            # For now, let's just clear and re-add to be safe (simplified logic)
+            # This is the "lazy" delete: it wipes and re-adds everything NOT deleted.
+            # Only viable for small datasets (<10k chunks).
+
+            # FAST FIX: Just save the new metadata/doc_store.
+            # The vectors will technically still be in FAISS but won't match metadata indices.
+            # Correct approach for this lightweight app:
+            self.index = faiss.IndexIDMap(faiss.IndexFlatIP(384))  # Wipe vector index
+            self.metadata = []
+
+            # Re-add all remaining chunks
+            if new_chunks:
+                # We need to re-embed.
+                texts = [c["text"] for c in new_chunks]
+                embeddings = self.bi_encoder.encode(texts)
+                faiss.normalize_L2(embeddings)
+                ids = np.arange(0, len(new_chunks)).astype('int64')
+                self.index.add_with_ids(embeddings, ids)
+                self.metadata = new_chunks
+
+            self.save()
+
         return removed_count
 
     def save(self):
         faiss.write_index(self.index, INDEX_FILE)
         with open(META_FILE, "wb") as f: pickle.dump(self.metadata, f)
+        with open(DOC_STORE_FILE, "wb") as f: pickle.dump(self.doc_store, f)
 
     def search_documents(self, query, top_k=5):
         if not self.index or self.index.ntotal == 0: return []
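The comments above are right that FAISS can delete in place; this app rebuilds instead because its metadata list is matched to vectors by position, and remove_ids would leave those positions out of sync. A sketch of the in-place alternative, assuming IDs are kept stable and metadata is keyed by ID (illustrative, not repo code):

    import numpy as np
    import faiss

    d = 384
    index = faiss.IndexIDMap(faiss.IndexFlatIP(d))
    vecs = np.random.rand(10, d).astype('float32')
    faiss.normalize_L2(vecs)
    index.add_with_ids(vecs, np.arange(10, dtype='int64'))

    # Drop two vectors by ID; surviving IDs stay valid, so metadata kept in a
    # dict keyed by ID (rather than a position-aligned list) stays consistent.
    index.remove_ids(np.array([2, 5], dtype='int64'))
    print(index.ntotal)  # 8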
@@ -209,35 +257,38 @@ class DocSearchEngine:
         raw_candidates = []
         for i, idx in enumerate(indices[0]):
             if idx != -1:
+                meta = self.metadata[idx]
                 raw_candidates.append({
-                    "text": …
-                    "source": …
+                    "text": meta["text"],
+                    "source": meta["source"],
+                    "doc_id": meta["doc_id"],  # Retrieve ID
                     "bi_score": scores[0][i]
                 })
 
+        # Deduplicate by Source (keep highest score per document)
         doc_map = {}
         for cand in raw_candidates:
             source = cand['source']
             score = cand['bi_score']
             if source not in doc_map:
-                doc_map[source] = …
+                doc_map[source] = cand
             else:
-                if score > doc_map[source]["…
-                    doc_map[source] …
-                    doc_map[source]["snippet"] = cand['text']
+                if score > doc_map[source]["bi_score"]:
+                    doc_map[source] = cand
 
-        ranked_docs = sorted(doc_map.…
-        final_results = []
+        ranked_docs = sorted(doc_map.values(), key=lambda x: x['bi_score'], reverse=True)
         top_docs = ranked_docs[:top_k]
 
+        final_results = []
         if top_docs:
-            pairs = [[query, doc[…
+            pairs = [[query, doc['text']] for doc in top_docs]
             cross_scores = self.cross_encoder.predict(pairs)
-            for i, …
+            for i, doc in enumerate(top_docs):
                 final_results.append({
-                    "source": source,
+                    "source": doc['source'],
                     "score": cross_scores[i],
-                    "snippet": …
+                    "snippet": doc['text'],
+                    "doc_id": doc['doc_id']  # Pass ID to UI
                 })
         final_results = sorted(final_results, key=lambda x: x["score"], reverse=True)
 
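A reading note for the scores this hunk hands to the UI: ms-marco cross-encoders return raw relevance logits rather than probabilities, so values routinely span roughly -10 to +10, which is why the rendering step at the end of the file filters on score > 2 as a confidence cut. A standalone illustration (sketch, not repo code; the printed values are typical, not exact):

    from sentence_transformers import CrossEncoder

    ce = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device='cpu')
    scores = ce.predict([
        ["paternity leave", "Sailors are entitled to paternity leave when..."],
        ["paternity leave", "This chapter describes uniform components."],
    ])
    print(scores)  # e.g. [ 8.1 -9.4]; higher logit = more relevant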
@@ -249,52 +300,48 @@ if 'engine' not in st.session_state:
     st.session_state.engine = DocSearchEngine()
 
 with st.sidebar:
-    … (six lines of the old upload UI, unreadable in the page capture)
+    st.header("🗄️ Upload Documents")
+    uploaded_files = st.file_uploader("Upload Files", accept_multiple_files=True)
+    if uploaded_files and st.button("Index"):
+        progress_bar = st.progress(0)
+        status_text = st.empty()
+
+        new_chunks_count = 0
+        failed_files = []
+
+        total = len(uploaded_files)
+
+        for i, f in enumerate(uploaded_files):
+            status_text.text(f"Processing {i+1}/{total}: {f.name}...")
+            progress_bar.progress((i)/total)
 
-        … (unreadable line)
-        failed_files = []
+            txt, fname, method = parse_file(f)
 
-        … (unreadable line)
+            if method.startswith("Error"):
+                failed_files.append(f"{fname}: {method}")
+                continue
 
-        … (four unreadable lines)
-            # PARSE (With OCR Auto-Switch)
-            txt, fname, method = parse_file(f)
-
-            # --- DEBUGGING: CATCH ACTUAL ERRORS ---
-            if method.startswith("Error"):
-                st.error(f"System Error on {fname}: {method}")
-                failed_files.append(f"{fname}: {method}")
-                continue
-
-            if method == "OCR (Slow)":
-                st.toast(f"OCR Used for {fname}", icon="⚠️")
-
-            if not txt.strip():
-                failed_files.append(f"{fname} (No text found)")
-                continue
-
-            file_chunks = recursive_chunking(txt, fname)
-            new_chunks.extend(file_chunks)
-
-            progress_bar.progress(1.0)
+            if not txt.strip():
+                failed_files.append(f"{fname} (No text found)")
+                continue
 
-        … (five unreadable lines)
+            # NEW: Generate ID and pass to chunker
+            doc_id = str(uuid.uuid4())
+            file_chunks = recursive_chunking(txt, fname, doc_id)
+
+            # Add to engine (full text + chunks)
+            st.session_state.engine.add_document(txt, fname, file_chunks)
+            new_chunks_count += len(file_chunks)
 
-        … (three unreadable lines)
+        progress_bar.progress(1.0)
+        IndexManager.save_to_hub()
+
+        if new_chunks_count > 0:
+            st.success(f"Indexed {new_chunks_count} chunks from {total} files!")
+
+        if failed_files:
+            with st.expander("⚠️ Issues Detected", expanded=True):
+                for ff in failed_files: st.write(ff)
 
     st.divider()
     st.header("⚙️ Manage Index")
@@ -305,47 +352,42 @@ with st.sidebar:
 
     file_to_delete = st.selectbox("Select file to remove:", [""] + unique_files)
     if file_to_delete and st.button("🗑️ Delete File"):
-        … (three unreadable lines)
-        st.success(f"Removed {file_to_delete}")
-        time.sleep(1)
-        st.rerun()
+        st.session_state.engine.delete_file(file_to_delete)
+        IndexManager.save_to_hub()
+        st.rerun()
 
-    st.divider()
     if st.button("⚠️ Wipe Entire Index", type="primary"):
-        … (three unreadable lines)
-        st.success("Index wiped clean.")
-        time.sleep(1)
-        st.rerun()
+        st.session_state.engine.reset_index()
+        IndexManager.save_to_hub()
+        st.rerun()
 
-st.title("⚓ Document Finder")
+st.title("⚓ Document Finder (Full Context)")
 query = st.text_input("What are you looking for?")
 
 if query:
     results = st.session_state.engine.search_documents(query, top_k=5)
 
-    # --- LLM INTEGRATION START ---
     if results:
-        #
+        # --- LLM INTEGRATION START ---
         top_match = results[0]
-        top_context = f"Source: {top_match['source']}\nContent: {top_match['snippet']}"
 
-        #
+        # RETRIEVAL STEP: Get the FULL TEXT from the Doc Store using the ID
+        full_doc_text = st.session_state.engine.doc_store.get(top_match['doc_id'], "Error: Document text not found.")
+
         with st.container():
             st.markdown("### 🤖 AI Summary")
-            … (three unreadable lines)
+            st.caption(f"Analyzing full content of: {top_match['source']}")
+
+            if st.button("✨ Summarize Top Document"):
+                with st.spinner("Reading full document..."):
+                    ai_response = ask_llm(query, full_doc_text)
             st.success(ai_response)
             st.divider()
-        … (unreadable line)
+            # --- LLM INTEGRATION END ---
 
     st.subheader("Top Relevant Documents")
-    … (unreadable line)
     if not results: st.info("No documents found.")
+
     for res in results:
         score = res['score']
         if score > 2: