NavyDevilDoc committed on
Commit
0b474cc
·
verified ·
1 Parent(s): 8868ebf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -246
app.py CHANGED
@@ -1,70 +1,88 @@
1
  import streamlit as st
2
- import pandas as pd
 
 
3
  import numpy as np
4
- import chromadb
5
- from chromadb.config import Settings
6
  from sentence_transformers import SentenceTransformer, CrossEncoder
7
- from rank_bm25 import BM25Okapi
8
- from huggingface_hub import HfApi, snapshot_download
9
- from huggingface_hub.utils import RepositoryNotFoundError
10
  import pypdf
11
  import docx
12
- import os
13
- import shutil
14
- import pickle
15
  import time
16
 
17
  # --- CONFIGURATION ---
18
- # REPLACE THIS WITH YOUR NEW DATASET NAME!
19
- DATASET_REPO_ID = "NavyDevilDoc/navy-policy-index"
20
- LOCAL_DB_PATH = "./data_store"
21
  HF_TOKEN = os.environ.get("HF_TOKEN")
22
 
23
- st.set_page_config(page_title="Navy Search & Intel", layout="wide")
 
 
 
 
24
 
25
  # --- PERSISTENCE MANAGER ---
26
- class DataManager:
27
- """Handles syncing the ChromaDB and BM25 index with the Hugging Face Hub"""
28
 
29
  @staticmethod
30
- def sync_from_hub():
31
- """Downloads the latest DB from the HF Dataset"""
32
  if not HF_TOKEN:
33
- st.warning("HF_TOKEN not found in Secrets. Persistence will not work.")
34
  return False
35
 
36
  try:
37
- st.toast("Syncing database from Cloud...", icon="☁️")
38
- snapshot_download(
39
- repo_id=DATASET_REPO_ID,
40
- repo_type="dataset",
41
- local_dir=LOCAL_DB_PATH,
42
- token=HF_TOKEN
43
- )
 
 
 
 
 
 
 
 
 
 
44
  return True
45
- except (RepositoryNotFoundError, Exception) as e:
46
- # If dataset is empty or doesn't exist yet, that's fine for a fresh start
47
- print(f"Cloud sync note: {e}")
 
 
48
  return False
49
 
50
  @staticmethod
51
- def sync_to_hub():
52
- """Uploads the local DB to the HF Dataset"""
53
  if not HF_TOKEN:
54
  return
55
 
56
  api = HfApi(token=HF_TOKEN)
57
  try:
58
- st.toast("Uploading new index to Cloud...", icon="🚀")
59
- api.upload_folder(
60
- folder_path=LOCAL_DB_PATH,
 
 
 
 
 
 
 
 
61
  repo_id=DATASET_REPO_ID,
62
  repo_type="dataset",
63
- commit_message="Auto-save: Update Index"
64
  )
65
- st.success("Database saved to Cloud!")
66
  except Exception as e:
67
- st.error(f"Failed to sync to cloud: {e}")
68
 
69
  # --- HELPER FUNCTIONS ---
70
  def parse_file(uploaded_file):
@@ -76,7 +94,6 @@ def parse_file(uploaded_file):
76
  for i, page in enumerate(reader.pages):
77
  page_text = page.extract_text()
78
  if page_text:
79
- # We inject Page markers into the text for the LLM to see later
80
  text += f"\n[PAGE {i+1}] {page_text}"
81
  elif filename.endswith(".docx"):
82
  doc = docx.Document(uploaded_file)
@@ -88,199 +105,116 @@ def parse_file(uploaded_file):
88
  return text, filename
89
 
90
  def recursive_chunking(text, source, chunk_size=500, overlap=100):
91
- """
92
- Splits text into chunks, trying to respect page boundaries if possible.
93
- """
94
  words = text.split()
95
  chunks = []
96
-
97
  for i in range(0, len(words), chunk_size - overlap):
98
  chunk_words = words[i:i + chunk_size]
99
  chunk_text = " ".join(chunk_words)
100
 
101
- # Metadata extraction (simple heuristic for page numbers we injected)
102
  page_num = "Unknown"
103
  if "[PAGE" in chunk_text:
104
  try:
105
- # Find the last page marker in this chunk
106
  start = chunk_text.rfind("[PAGE") + 6
107
  end = chunk_text.find("]", start)
108
  page_num = chunk_text[start:end]
109
- except:
110
- pass
111
 
112
  if len(chunk_text) > 50:
113
  chunks.append({
114
  "text": chunk_text,
115
- "metadata": {"source": source, "page": page_num}
 
116
  })
117
  return chunks
118
 
119
- # --- CORE SEARCH ENGINE ---
120
- class PersistentSearchEngine:
121
- def __init__(self, collection_name="navy_docs"):
122
- # 1. Initialize ChromaDB (Persistent)
123
- self.client = chromadb.PersistentClient(path=os.path.join(LOCAL_DB_PATH, "chroma"))
124
- self.collection = self.client.get_or_create_collection(name=collection_name)
125
-
126
- # 2. Load Models
127
- # We force the device to CPU to avoid "meta tensor" errors in Docker
128
- device = "cpu"
129
- self.bi_encoder = SentenceTransformer('all-MiniLM-L6-v2', device=device)
130
-
131
- # We disable "low_cpu_mem_usage" to prevent the model from loading as a ghost (meta device)
132
- self.cross_encoder = CrossEncoder(
133
- 'cross-encoder/ms-marco-MiniLM-L-6-v2',
134
- device=device,
135
- automodel_args={"low_cpu_mem_usage": False}
136
- )
137
-
138
- # 3. Initialize/Load BM25 (Sparse)
139
- self.bm25 = None
140
- self.doc_store = [] # We need a shadow copy for BM25
141
- self.load_bm25()
142
-
143
- def load_bm25(self):
144
- """Loads BM25 index from disk if it exists"""
145
- bm25_path = os.path.join(LOCAL_DB_PATH, "bm25.pkl")
146
- if os.path.exists(bm25_path):
147
- with open(bm25_path, "rb") as f:
148
- data = pickle.load(f)
149
- self.bm25 = data['model']
150
- self.doc_store = data['docs']
151
-
152
- def save_bm25(self):
153
- """Saves BM25 index to disk"""
154
- bm25_path = os.path.join(LOCAL_DB_PATH, "bm25.pkl")
155
- with open(bm25_path, "wb") as f:
156
- pickle.dump({'model': self.bm25, 'docs': self.doc_store}, f)
157
-
158
- def add_documents(self, parsed_chunks):
159
- # 1. Add to Chroma (Dense)
160
- ids = [f"{c['metadata']['source']}_{i}_{time.time()}" for i, c in enumerate(parsed_chunks)]
161
- texts = [c['text'] for c in parsed_chunks]
162
- metadatas = [c['metadata'] for c in parsed_chunks]
163
-
164
- embeddings = self.bi_encoder.encode(texts).tolist()
165
-
166
- self.collection.add(
167
- documents=texts,
168
- embeddings=embeddings,
169
- metadatas=metadatas,
170
- ids=ids
171
- )
172
-
173
- # 2. Update BM25 (Sparse)
174
- # Note: BM25 is not incremental by default, we rebuild it.
175
- # For huge datasets, we would implement incremental updates, but for <10k docs, rebuilding is fast.
176
- current_docs = self.doc_store + texts
177
- tokenized_corpus = [doc.lower().split() for doc in current_docs]
178
- self.bm25 = BM25Okapi(tokenized_corpus)
179
- self.doc_store = current_docs
180
-
181
- # 3. Save Aux Data
182
- self.save_bm25()
183
-
184
  return len(texts)
185
 
186
- def search(self, query, top_k=5, alpha=0.5):
187
- # --- DENSE SEARCH (Chroma) ---
188
- # Get more candidates for re-ranking
189
- candidate_k = top_k * 3
190
-
191
- query_embedding = self.bi_encoder.encode([query]).tolist()
192
-
193
- chroma_results = self.collection.query(
194
- query_embeddings=query_embedding,
195
- n_results=candidate_k
196
- )
197
-
198
- # If DB is empty
199
- if not chroma_results['documents']:
200
  return []
201
-
202
- # Process Chroma Results
203
- # Chroma structure: {'ids': [[]], 'documents': [[]], 'metadatas': [[]], 'distances': [[]]}
204
- dense_hits = {}
205
- retrieved_docs_map = {} # ID -> Text/Meta mapping
206
-
207
- for i, doc_id in enumerate(chroma_results['ids'][0]):
208
- score = 1 - chroma_results['distances'][0][i] # Convert distance to similarity
209
- dense_hits[doc_id] = score
210
- retrieved_docs_map[doc_id] = {
211
- 'text': chroma_results['documents'][0][i],
212
- 'metadata': chroma_results['metadatas'][0][i]
213
- }
214
-
215
- # --- SPARSE SEARCH (BM25) ---
216
- # Note: Mapping BM25 indices back to Chroma IDs is complex if lists aren't perfectly synced.
217
- # For this Hybrid implementation, we will rely heavily on Chroma for the *candidates* # and use BM25 to score the *Query vs The Candidates* specifically.
218
-
219
- hybrid_candidates = []
220
-
221
- q_tokens = query.lower().split()
222
-
223
- for doc_id, dense_score in dense_hits.items():
224
- doc_text = retrieved_docs_map[doc_id]['text']
225
-
226
- # Score this specific candidate with BM25 logic (on the fly)
227
- # This is "Re-scoring" rather than "Retrieving" with BM25, which is safer for sync
228
- doc_tokens = doc_text.lower().split()
229
- # Simple term frequency for the candidate
230
- bm25_score = 0
231
- for token in q_tokens:
232
- bm25_score += doc_tokens.count(token)
233
-
234
- # Normalize BM25 score roughly (0-10 range usually, squeeze to 0-1)
235
- bm25_score = min(bm25_score / 5.0, 1.0)
236
 
237
- final_hybrid_score = (alpha * dense_score) + ((1-alpha) * bm25_score)
238
-
239
- hybrid_candidates.append({
240
- "id": doc_id,
241
- "text": doc_text,
242
- "metadata": retrieved_docs_map[doc_id]['metadata'],
243
- "hybrid_score": final_hybrid_score
244
- })
245
-
246
- # Sort by Hybrid Score
247
- hybrid_candidates.sort(key=lambda x: x['hybrid_score'], reverse=True)
248
-
249
- # --- RE-RANKING (Cross-Encoder) ---
250
- top_candidates = hybrid_candidates[:candidate_k]
251
-
252
- pairs = [[query, c['text']] for c in top_candidates]
 
 
 
253
  cross_scores = self.cross_encoder.predict(pairs)
254
 
255
- final_results = []
256
- for i, cand in enumerate(top_candidates):
257
- final_results.append({
258
- "chunk": cand['text'],
259
- "metadata": cand['metadata'],
260
- "score": cross_scores[i]
261
- })
262
 
263
- final_results.sort(key=lambda x: x['score'], reverse=True)
 
264
  return final_results[:top_k]
265
 
266
  # --- UI LOGIC ---
267
-
268
- # 1. Sync on Startup
269
- if 'synced' not in st.session_state:
270
- DataManager.sync_from_hub()
271
- st.session_state.synced = True
272
-
273
- # 2. Init Engine
274
  if 'engine' not in st.session_state:
275
- with st.spinner("Initializing Vector Database..."):
276
- st.session_state.engine = PersistentSearchEngine()
 
 
277
 
278
  with st.sidebar:
279
  st.header("🗄️ Knowledge Base")
280
  uploaded_files = st.file_uploader("Ingest Documents", accept_multiple_files=True)
281
 
282
- if uploaded_files and st.button("Add to Database"):
283
- with st.spinner("Parsing & Indexing..."):
284
  new_chunks = []
285
  for f in uploaded_files:
286
  txt, fname = parse_file(f)
@@ -289,64 +223,28 @@ with st.sidebar:
289
 
290
  if new_chunks:
291
  count = st.session_state.engine.add_documents(new_chunks)
292
- DataManager.sync_to_hub() # Auto-save to cloud
293
- st.success(f"Added {count} chunks and synced to Cloud!")
294
-
295
- st.divider()
296
- st.info(f"Connected to: {DATASET_REPO_ID}")
297
 
298
- # --- MAIN SEARCH UI ---
299
- st.title(" Navy Intelligent Search (RAG)")
300
-
301
- query = st.text_input("Enter Query (e.g. 'Leave policy for O-3 and below'):")
302
- col1, col2 = st.columns([1, 1])
303
- with col1:
304
- top_k = st.number_input("Documents", 1, 10, 3)
305
- with col2:
306
- alpha = st.slider("Hybrid Weight", 0.0, 1.0, 0.6, help="Higher = More Semantic")
307
 
308
  if query:
309
- results = st.session_state.engine.search(query, top_k=top_k, alpha=alpha)
310
 
311
- # Store results for RAG
312
  context_text = ""
313
-
314
- st.markdown("### 🔍 Search Results")
315
  for res in results:
316
- meta = res['metadata']
317
- score = res['score']
318
- text = res['chunk']
319
- context_text += f"Source: {meta['source']} (Page {meta['page']})\nContent: {text}\n\n"
320
-
321
- with st.expander(f"{meta['source']} | Pg {meta['page']} (Score: {score:.2f})", expanded=True):
322
- st.markdown(text)
323
-
324
- # --- RAG: SUMMARIZATION ---
325
- st.divider()
326
- st.markdown("### 🤖 AI Intelligence")
327
- if st.button("Generate Summary / Answer"):
328
  from huggingface_hub import InferenceClient
329
-
330
- # Use a free, powerful model via HF Inference API
331
- repo_id = "mistralai/Mistral-7B-Instruct-v0.3"
332
- llm_client = InferenceClient(model=repo_id, token=HF_TOKEN)
333
-
334
- prompt = f"""
335
- You are a Navy Administrative Aide. Answer the user's question based ONLY on the context provided below.
336
- If the answer is not in the context, say "I cannot find the answer in the provided documents."
337
-
338
- CONTEXT:
339
- {context_text}
340
-
341
- USER QUESTION:
342
- {query}
343
-
344
- ANSWER:
345
- """
346
-
347
- with st.spinner("Consulting LLM..."):
348
  try:
349
- response = llm_client.text_generation(prompt, max_new_tokens=500)
350
- st.success(response)
351
  except Exception as e:
352
  st.error(f"LLM Error: {e}")
 
1
  import streamlit as st
2
+ import os
3
+ import faiss
4
+ import pickle
5
  import numpy as np
 
 
6
  from sentence_transformers import SentenceTransformer, CrossEncoder
7
+ from huggingface_hub import HfApi, hf_hub_download
8
+ from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError
 
9
  import pypdf
10
  import docx
 
 
 
11
  import time
12
 
13
# --- CONFIGURATION ---
DATASET_REPO_ID = "NavyDevilDoc/navy-policy-index"  # HF dataset that persists the index between restarts
HF_TOKEN = os.environ.get("HF_TOKEN")  # needed for any Hub sync; absent => local-only mode

# File paths for local storage (synced with the dataset above)
INDEX_FILE = "navy_index.faiss"
META_FILE = "navy_metadata.pkl"

st.set_page_config(page_title="Navy Search (FAISS)", layout="wide")
22
 
23
# --- PERSISTENCE MANAGER ---
class IndexManager:
    """Manages loading/saving the FAISS index and Metadata from Hugging Face."""

    # Files that make up the knowledge base, with their commit messages
    # (kept identical to the original per-file uploads).
    _FILES = {INDEX_FILE: "Update FAISS Index", META_FILE: "Update Metadata"}

    @staticmethod
    def load_from_hub():
        """Download the index files from the HF Dataset into the working dir.

        Returns:
            True when both files were fetched; False when the token is
            missing, the repo/files don't exist yet, or the sync failed.
        """
        if not HF_TOKEN:
            st.warning("HF_TOKEN missing. Running in local-only mode.")
            return False

        try:
            with st.spinner("Downloading Knowledge Base..."):
                # Fetch the vector index and its metadata side-car into ".",
                # where RobustSearchEngine.__init__ looks for them.
                for filename in (INDEX_FILE, META_FILE):
                    hf_hub_download(
                        repo_id=DATASET_REPO_ID,
                        filename=filename,
                        repo_type="dataset",
                        local_dir=".",
                        token=HF_TOKEN
                    )
            return True
        except (EntryNotFoundError, RepositoryNotFoundError):
            # A fresh deployment has no index yet -- not an error.
            st.toast("No existing index found in Cloud. Starting fresh.", icon="🆕")
            return False
        except Exception as e:
            st.error(f"Sync Error: {e}")
            return False

    @staticmethod
    def save_to_hub():
        """Upload the local index files to the HF Dataset (no-op without a token)."""
        if not HF_TOKEN:
            return

        api = HfApi(token=HF_TOKEN)
        try:
            st.toast("Syncing to Cloud...", icon="☁️")
            for filename, message in IndexManager._FILES.items():
                # Guard: skip files that don't exist yet so a premature save
                # doesn't abort with an upload error.
                if not os.path.exists(filename):
                    continue
                api.upload_file(
                    path_or_fileobj=filename,
                    path_in_repo=filename,
                    repo_id=DATASET_REPO_ID,
                    repo_type="dataset",
                    commit_message=message
                )
            st.success("Knowledge Base Saved!")
        except Exception as e:
            st.error(f"Upload failed: {e}")
86
 
87
  # --- HELPER FUNCTIONS ---
88
  def parse_file(uploaded_file):
 
94
  for i, page in enumerate(reader.pages):
95
  page_text = page.extract_text()
96
  if page_text:
 
97
  text += f"\n[PAGE {i+1}] {page_text}"
98
  elif filename.endswith(".docx"):
99
  doc = docx.Document(uploaded_file)
 
105
  return text, filename
106
 
107
def recursive_chunking(text, source, chunk_size=500, overlap=100):
    """Split *text* into overlapping word-window chunks.

    Args:
        text: Full document text; may contain "[PAGE n]" markers injected
            by parse_file.
        source: Originating filename, stored on every chunk.
        chunk_size: Window size in words.
        overlap: Number of words shared between consecutive windows.

    Returns:
        List of dicts with "text", "source" and "page" keys. Chunks shorter
        than 50 characters are dropped as noise.
    """
    words = text.split()
    chunks = []
    # Guard against a non-positive step (overlap >= chunk_size), which would
    # make range() raise or stall.
    step = max(1, chunk_size - overlap)

    for i in range(0, len(words), step):
        chunk_text = " ".join(words[i:i + chunk_size])

        # Recover the page number from the last marker in this window.
        page_num = "Unknown"
        start = chunk_text.rfind("[PAGE")
        if start != -1:
            end = chunk_text.find("]", start)
            if end != -1:  # only parse complete markers; a truncated one stays "Unknown"
                page_num = chunk_text[start + 6:end]  # len("[PAGE ") == 6

        if len(chunk_text) > 50:
            chunks.append({
                "text": chunk_text,
                "source": source,
                "page": page_num
            })
    return chunks
130
 
131
# --- CORE SEARCH ENGINE (FAISS VERSION) ---
class RobustSearchEngine:
    """Bi-encoder retrieval over a flat FAISS index, re-ranked by a cross-encoder."""

    def __init__(self):
        # Load models (force CPU to avoid meta-tensor errors in containers).
        self.bi_encoder = SentenceTransformer('all-MiniLM-L6-v2', device="cpu")
        self.cross_encoder = CrossEncoder(
            'cross-encoder/ms-marco-MiniLM-L-6-v2',
            device="cpu",
            automodel_args={"low_cpu_mem_usage": False}
        )

        # Index plus a parallel metadata list (dicts, aligned with vector
        # order). Both stay empty until the first add; the previous
        # redundant else-branch re-assignment is removed.
        self.index = None
        self.metadata = []
        if os.path.exists(INDEX_FILE) and os.path.exists(META_FILE):
            self.index = faiss.read_index(INDEX_FILE)
            with open(META_FILE, "rb") as f:
                self.metadata = pickle.load(f)

    def add_documents(self, chunks):
        """Embed *chunks* (dicts with "text"/"source"/"page"), add them to the
        index, persist index + metadata to disk, and return the chunk count."""
        texts = [c["text"] for c in chunks]
        embeddings = self.bi_encoder.encode(texts)
        faiss.normalize_L2(embeddings)  # normalized vectors => inner product == cosine

        # Lazily create the index once the embedding dimension is known.
        if self.index is None:
            self.index = faiss.IndexFlatIP(embeddings.shape[1])

        self.index.add(embeddings)
        self.metadata.extend(chunks)

        # Persist both artifacts so a restart (or hub sync) can pick them up.
        faiss.write_index(self.index, INDEX_FILE)
        with open(META_FILE, "wb") as f:
            pickle.dump(self.metadata, f)

        return len(texts)

    def search(self, query, top_k=5):
        """Return up to *top_k* result dicts sorted by cross-encoder score."""
        if not self.index or self.index.ntotal == 0:
            return []

        # 1. Dense retrieval: over-fetch candidates for the re-ranker.
        candidate_k = top_k * 3
        q_vec = self.bi_encoder.encode([query])
        faiss.normalize_L2(q_vec)

        scores, indices = self.index.search(q_vec, min(self.index.ntotal, candidate_k))

        candidates = []
        for i, idx in enumerate(indices[0]):
            if idx != -1:  # FAISS pads short result lists with -1
                meta = self.metadata[idx]
                candidates.append({
                    "text": meta["text"],
                    "source": meta["source"],
                    "page": meta["page"],
                    "base_score": scores[0][i]
                })

        # Guard: nothing survived the -1 filter; predict() on an empty pair
        # list can fail and there is nothing to rank anyway.
        if not candidates:
            return []

        # 2. Re-ranking with the cross-encoder.
        pairs = [[query, c["text"]] for c in candidates]
        cross_scores = self.cross_encoder.predict(pairs)
        for i, c in enumerate(candidates):
            c["score"] = cross_scores[i]

        return sorted(candidates, key=lambda x: x["score"], reverse=True)[:top_k]
204
 
205
  # --- UI LOGIC ---
 
 
 
 
 
 
 
206
if 'engine' not in st.session_state:
    # 1. Try cloud sync first: pull any previously saved index/metadata
    #    files from the HF dataset into the working directory.
    IndexManager.load_from_hub()
    # 2. Start engine -- its __init__ picks up the downloaded files (if
    #    any) from disk, so the order of these two calls matters.
    st.session_state.engine = RobustSearchEngine()
211
 
212
  with st.sidebar:
213
  st.header("🗄️ Knowledge Base")
214
  uploaded_files = st.file_uploader("Ingest Documents", accept_multiple_files=True)
215
 
216
+ if uploaded_files and st.button("Index Documents"):
217
+ with st.spinner("Processing..."):
218
  new_chunks = []
219
  for f in uploaded_files:
220
  txt, fname = parse_file(f)
 
223
 
224
  if new_chunks:
225
  count = st.session_state.engine.add_documents(new_chunks)
226
+ IndexManager.save_to_hub()
227
+ st.success(f"Added {count} chunks!")
 
 
 
228
 
229
st.title("⚓ Navy Search (FAISS Architecture)")
query = st.text_input("Enter Query:")

if query:
    hits = st.session_state.engine.search(query)

    st.markdown("### 🔍 Results")
    # Assemble the RAG context up front; the loop below only renders.
    context_text = "".join(
        f"Source: {hit['source']}\n{hit['text']}\n\n" for hit in hits
    )
    for hit in hits:
        with st.expander(f"{hit['source']} (Pg {hit['page']}) - Score {hit['score']:.2f}", expanded=True):
            st.markdown(hit['text'])

    if st.button("Generate Summary"):
        # Lazy import: the inference client is only needed on demand.
        from huggingface_hub import InferenceClient
        client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.3", token=HF_TOKEN)
        prompt = f"Context:\n{context_text}\n\nUser: {query}\nAnswer:"
        with st.spinner("Thinking..."):
            try:
                st.write(client.text_generation(prompt, max_new_tokens=400))
            except Exception as e:
                st.error(f"LLM Error: {e}")