NavyDevilDoc committed on
Commit
39f313e
·
verified ·
1 Parent(s): 5a9d0e9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +293 -160
app.py CHANGED
@@ -1,211 +1,344 @@
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
- from sentence_transformers import SentenceTransformer, CrossEncoder, util
5
- import faiss
 
6
  from rank_bm25 import BM25Okapi
 
 
7
  import pypdf
8
  import docx
9
- import torch
 
 
 
10
 
11
# --- CONFIGURATION ---
# Streamlit page setup; must be the first st.* call in the script.
st.set_page_config(page_title="Advanced Semantic Search", layout="wide")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  # --- HELPER FUNCTIONS ---
15
def parse_file(uploaded_file):
    """Extract plain text from an uploaded file.

    Supported types (by extension): .pdf, .docx, .txt, .csv.

    Args:
        uploaded_file: Streamlit UploadedFile (file-like, has a ``.name``).

    Returns:
        str: Extracted text; "" for unsupported types or on parse failure
        (the error is surfaced via ``st.error``).
    """
    text = ""
    try:
        if uploaded_file.name.endswith(".pdf"):
            reader = pypdf.PdfReader(uploaded_file)
            for page in reader.pages:
                # Bug fix: extract_text() can return None for image-only
                # pages; the original crashed on None + "\n" and lost all
                # previously extracted pages via the broad except.
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        elif uploaded_file.name.endswith(".docx"):
            doc = docx.Document(uploaded_file)
            text = "\n".join([para.text for para in doc.paragraphs])
        elif uploaded_file.name.endswith(".txt"):
            text = uploaded_file.read().decode("utf-8")
        elif uploaded_file.name.endswith(".csv"):
            df = pd.read_csv(uploaded_file)
            text = df.to_string()
    except Exception as e:
        st.error(f"Error reading file: {e}")
    return text
33
 
34
- def chunk_text(text, chunk_size=300, overlap=50):
 
 
 
35
  words = text.split()
36
  chunks = []
 
37
  for i in range(0, len(words), chunk_size - overlap):
38
- chunk = " ".join(words[i:i + chunk_size])
39
- if len(chunk) > 50:
40
- chunks.append(chunk)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  return chunks
42
 
43
- # --- CORE LOGIC: RETRIEVER + RE-RANKER ---
44
# --- CORE LOGIC: RETRIEVER + RE-RANKER ---
class SearchEngine:
    """Two-stage search: hybrid (dense + BM25) retrieval, then
    cross-encoder re-ranking of the candidate pool."""

    def __init__(self, bi_encoder_name):
        # 1. Bi-Encoder (fast retrieval)
        self.bi_encoder = SentenceTransformer(bi_encoder_name)

        # 2. Cross-Encoder (accurate re-ranking) — standard MS MARCO
        # model trained for query/passage relevance scoring.
        self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

        self.documents = []
        self.faiss_index = None
        self.bm25 = None

    def fit(self, documents):
        """Index *documents* (list[str]) for dense and sparse retrieval."""
        self.documents = documents

        # Dense index: L2-normalized embeddings + inner product == cosine.
        embeddings = self.bi_encoder.encode(documents, convert_to_tensor=True)
        embeddings_np = embeddings.cpu().numpy()
        faiss.normalize_L2(embeddings_np)

        dimension = embeddings_np.shape[1]
        self.faiss_index = faiss.IndexFlatIP(dimension)
        self.faiss_index.add(embeddings_np)

        # Sparse index (whitespace tokenization, lowercased).
        tokenized_corpus = [doc.lower().split() for doc in documents]
        self.bm25 = BM25Okapi(tokenized_corpus)

    def search(self, query, top_k=5, alpha=0.5):
        """Return up to *top_k* results, each a dict with ``chunk``,
        ``score`` (cross-encoder) and ``original_hybrid_score``.

        alpha: 1.0 = pure vector score, 0.0 = pure BM25 score.
        """
        # STAGE 1: RETRIEVAL — fetch 3x candidates for the re-ranker.
        candidate_k = top_k * 3

        # Vector search.
        query_vector = self.bi_encoder.encode([query])
        faiss.normalize_L2(query_vector)
        v_scores, v_indices = self.faiss_index.search(
            query_vector, min(len(self.documents), candidate_k))

        # BM25 scores over the whole corpus.
        tokenized_query = query.lower().split()
        bm25_scores = self.bm25.get_scores(tokenized_query)

        # Min-max normalize BM25.
        # Bug fix: when every score is equal and positive, the original
        # divided by (max - min) == 0; skip normalization in that case.
        if len(bm25_scores) > 0 and max(bm25_scores) > 0:
            score_range = max(bm25_scores) - min(bm25_scores)
            if score_range > 0:
                bm25_scores = (bm25_scores - min(bm25_scores)) / score_range

        # Combine scores into {doc_idx: hybrid_score}.
        candidates = {}
        for i, idx in enumerate(v_indices[0]):
            if idx != -1:  # faiss pads missing hits with -1
                candidates[idx] = alpha * v_scores[0][i]

        # Merge top BM25 hits (full argsort is fine for small corpora;
        # production code would restrict to top BM25 results only).
        top_bm25_indices = np.argsort(bm25_scores)[-candidate_k:]
        for idx in top_bm25_indices:
            score = (1 - alpha) * bm25_scores[idx]
            candidates[idx] = candidates.get(idx, 0.0) + score

        # Keep the best candidate_k by hybrid score.
        sorted_candidates = sorted(
            candidates.items(), key=lambda x: x[1], reverse=True)[:candidate_k]

        # STAGE 2: RE-RANKING (cross-encoder on [query, doc] pairs).
        candidate_indices = [idx for idx, _ in sorted_candidates]
        pairs = [[query, self.documents[idx]] for idx in candidate_indices]
        if not pairs:
            return []

        cross_scores = self.cross_encoder.predict(pairs)

        final_results = []
        for i, idx in enumerate(candidate_indices):
            final_results.append({
                "chunk": self.documents[idx],
                "score": cross_scores[i],  # high-accuracy re-rank score
                "original_hybrid_score": sorted_candidates[i][1]
            })

        # Final order is by cross-encoder score.
        final_results.sort(key=lambda x: x["score"], reverse=True)
        return final_results[:top_k]
140
 
141
- # --- UI LAYOUT ---
142
- st.title("🧠 Semantic Search: Hybrid + Cross-Encoder")
143
- st.markdown("""
144
- This system uses a **Two-Stage Retrieval Process**:
145
- 1. **Retrieval:** Finds top candidates using Vector (semantic) and BM25 (keyword) search.
146
- 2. **Re-Ranking:** A Cross-Encoder model reads the query and candidates to score true relevance.
147
- """)
 
 
 
 
148
 
149
  with st.sidebar:
150
- st.header("1. Setup Knowledge Base")
151
- uploaded_files = st.file_uploader(
152
- "Upload Documents",
153
- type=['txt', 'pdf', 'docx', 'csv'],
154
- accept_multiple_files=True
155
- )
156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  st.divider()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
- st.header("2. Tuning")
160
- model_choice = st.selectbox(
161
- "Base Embedding Model",
162
- ["all-MiniLM-L6-v2", "all-mpnet-base-v2"],
163
- help="Used for the initial fast retrieval."
164
- )
165
-
166
- alpha = st.slider("Hybrid Alpha", 0.0, 1.0, 0.4,
167
- help="0.0 = Keywords, 1.0 = Vectors. 0.4 is often best for Hybrid.")
168
-
169
- top_k = st.number_input("Final Results", 1, 20, 5)
170
 
171
- build_btn = st.button("Build Database")
172
-
173
- # --- APP STATE ---
174
- if 'engine' not in st.session_state:
175
- st.session_state.engine = None
176
-
177
- if build_btn and uploaded_files:
178
- with st.spinner("Processing files..."):
179
- all_chunks = []
180
- for file in uploaded_files:
181
- raw = parse_file(file)
182
- chunks = chunk_text(raw)
183
- all_chunks.extend(chunks)
184
-
185
- if all_chunks:
186
- # Initialize Engine
187
- st.session_state.engine = SearchEngine(model_choice)
188
- st.session_state.engine.fit(all_chunks)
189
- st.success(f"Indexed {len(all_chunks)} chunks!")
190
- else:
191
- st.warning("No text extracted.")
192
-
193
- # --- SEARCH ---
194
- if st.session_state.engine:
195
- query = st.text_input("Ask a question:")
196
- if query:
197
- with st.spinner("Retrieving & Re-Ranking..."):
198
- results = st.session_state.engine.search(query, top_k=top_k, alpha=alpha)
199
-
200
- for i, res in enumerate(results):
201
- score = res['score']
202
- # Color code high relevance
203
- color = "green" if score > 0 else "blue"
204
 
205
- with st.container():
206
- st.markdown(f"### Rank {i+1}")
207
- st.caption(f"Relevance Score: :{color}[{score:.3f}]")
208
- st.info(res['chunk'])
209
- st.divider()
210
- else:
211
- st.info("Upload documents to start.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
+ import chromadb
5
+ from chromadb.config import Settings
6
+ from sentence_transformers import SentenceTransformer, CrossEncoder
7
  from rank_bm25 import BM25Okapi
8
+ from huggingface_hub import HfApi, snapshot_download
9
+ from huggingface_hub.utils import RepositoryNotFoundError
10
  import pypdf
11
  import docx
12
+ import os
13
+ import shutil
14
+ import pickle
15
+ import time
16
 
17
# --- CONFIGURATION ---
# REPLACE THIS WITH YOUR NEW DATASET NAME!
# HF dataset repo used as the cloud backup for the local index.
DATASET_REPO_ID = "NavyDevilDoc/navy-policy-index"
# Local working directory holding the ChromaDB files and bm25.pkl.
LOCAL_DB_PATH = "./data_store"
# Read from the host's secrets; persistence is disabled when absent.
HF_TOKEN = os.environ.get("HF_TOKEN")

st.set_page_config(page_title="Navy Search & Intel", layout="wide")
24
+
25
+ # --- PERSISTENCE MANAGER ---
26
class DataManager:
    """Handles syncing the ChromaDB and BM25 index with the Hugging Face Hub."""

    @staticmethod
    def sync_from_hub():
        """Download the latest DB snapshot from the HF dataset repo.

        Returns:
            bool: True on success; False when no token is configured or
            the download failed (e.g. the dataset does not exist yet).
        """
        if not HF_TOKEN:
            st.warning("HF_TOKEN not found in Secrets. Persistence will not work.")
            return False

        try:
            st.toast("Syncing database from Cloud...", icon="☁️")
            snapshot_download(
                repo_id=DATASET_REPO_ID,
                repo_type="dataset",
                local_dir=LOCAL_DB_PATH,
                token=HF_TOKEN
            )
            return True
        except RepositoryNotFoundError:
            # Expected on a fresh start: the dataset hasn't been created yet.
            print(f"Cloud sync note: dataset '{DATASET_REPO_ID}' not found yet.")
            return False
        except Exception as e:
            # Bug fix: the original `except (RepositoryNotFoundError, Exception)`
            # was redundant — Exception subsumes the specific class. Keep a
            # best-effort catch-all for network/auth errors, but log it.
            print(f"Cloud sync note: {e}")
            return False

    @staticmethod
    def sync_to_hub():
        """Upload the local DB folder to the HF dataset repo (best effort).

        Silently does nothing when no token is configured; errors are
        surfaced to the UI via ``st.error``.
        """
        if not HF_TOKEN:
            return

        api = HfApi(token=HF_TOKEN)
        try:
            st.toast("Uploading new index to Cloud...", icon="🚀")
            api.upload_folder(
                folder_path=LOCAL_DB_PATH,
                repo_id=DATASET_REPO_ID,
                repo_type="dataset",
                commit_message="Auto-save: Update Index"
            )
            st.success("Database saved to Cloud!")
        except Exception as e:
            st.error(f"Failed to sync to cloud: {e}")
68
 
69
  # --- HELPER FUNCTIONS ---
70
def parse_file(uploaded_file):
    """Extract plain text from an uploaded file.

    Supported types (by extension): .pdf, .docx, .txt, .csv. PDF pages are
    prefixed with "[PAGE n]" markers so downstream chunking (and the LLM)
    can recover page numbers.

    Args:
        uploaded_file: Streamlit UploadedFile (file-like, has a ``.name``).

    Returns:
        tuple[str, str]: (extracted_text, filename). Text is "" when the
        type is unsupported or parsing fails.
    """
    text = ""
    filename = uploaded_file.name
    try:
        if filename.endswith(".pdf"):
            reader = pypdf.PdfReader(uploaded_file)
            for i, page in enumerate(reader.pages):
                page_text = page.extract_text()
                if page_text:
                    # Inject page markers for later citation.
                    text += f"\n[PAGE {i+1}] {page_text}"
        elif filename.endswith(".docx"):
            doc = docx.Document(uploaded_file)
            text = "\n".join([para.text for para in doc.paragraphs])
        elif filename.endswith(".txt"):
            text = uploaded_file.read().decode("utf-8")
        elif filename.endswith(".csv"):
            # Restored CSV support: the uploader accepts any file type,
            # so .csv files previously fell through and yielded "".
            df = pd.read_csv(uploaded_file)
            text = df.to_string()
    except Exception as e:
        # Bug fix: the message hardcoded "(unknown)" although the
        # filename is available — report which file failed.
        st.error(f"Error parsing {filename}: {e}")
    return text, filename
89
 
90
def recursive_chunking(text, source, chunk_size=500, overlap=100):
    """Split *text* into overlapping word chunks with source/page metadata.

    Page numbers are recovered from "[PAGE n]" markers injected by
    parse_file(); chunks without a marker get page "Unknown".

    Args:
        text: Document text (possibly containing "[PAGE n]" markers).
        source: Filename recorded in each chunk's metadata.
        chunk_size: Words per chunk.
        overlap: Words shared between consecutive chunks.

    Returns:
        list[dict]: {"text": str, "metadata": {"source": str, "page": str}}
        for every chunk longer than 50 characters.
    """
    # Guard: overlap >= chunk_size would give range() a step of 0
    # (ValueError) or a negative step (no chunks at all).
    step = max(1, chunk_size - overlap)
    words = text.split()
    chunks = []

    for i in range(0, len(words), step):
        chunk_body = " ".join(words[i:i + chunk_size])

        # Recover the LAST page marker present in this chunk, if any.
        # Bug fix: the original used a bare `except: pass`, and an
        # unmatched "]" (find() == -1) silently produced a garbage page
        # string; check both indices explicitly instead.
        page_num = "Unknown"
        marker = chunk_body.rfind("[PAGE")
        if marker != -1:
            end = chunk_body.find("]", marker)
            if end != -1:
                # len("[PAGE ") == 6, so the number starts at marker + 6.
                page_num = chunk_body[marker + 6:end]

        if len(chunk_body) > 50:
            chunks.append({
                "text": chunk_body,
                "metadata": {"source": source, "page": page_num}
            })
    return chunks
118
 
119
+ # --- CORE SEARCH ENGINE ---
120
class PersistentSearchEngine:
    """Hybrid (dense + lexical) retriever with cross-encoder re-ranking.

    Dense vectors are persisted in ChromaDB; a BM25-style shadow copy of
    the documents is pickled alongside so lexical scoring survives restarts.
    """

    def __init__(self, collection_name="navy_docs"):
        # 1. Persistent vector store.
        self.client = chromadb.PersistentClient(path=os.path.join(LOCAL_DB_PATH, "chroma"))
        self.collection = self.client.get_or_create_collection(name=collection_name)

        # 2. Models.
        self.bi_encoder = SentenceTransformer('all-MiniLM-L6-v2')
        self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

        # 3. Sparse index (BM25) — kept separately because Chroma only
        # stores dense vectors.
        self.bm25 = None
        self.doc_store = []  # shadow copy of all indexed texts for BM25
        self.load_bm25()

    def load_bm25(self):
        """Load the pickled BM25 index from disk if it exists.

        NOTE(review): pickle.load is only safe because this file comes
        from our own HF dataset repo; never load untrusted pickles.
        """
        bm25_path = os.path.join(LOCAL_DB_PATH, "bm25.pkl")
        if os.path.exists(bm25_path):
            with open(bm25_path, "rb") as f:
                data = pickle.load(f)
            self.bm25 = data['model']
            self.doc_store = data['docs']

    def save_bm25(self):
        """Persist the BM25 model and document shadow copy to disk."""
        # Ensure the data dir exists even if Chroma hasn't created it yet.
        os.makedirs(LOCAL_DB_PATH, exist_ok=True)
        bm25_path = os.path.join(LOCAL_DB_PATH, "bm25.pkl")
        with open(bm25_path, "wb") as f:
            pickle.dump({'model': self.bm25, 'docs': self.doc_store}, f)

    def add_documents(self, parsed_chunks):
        """Index chunk dicts ({"text", "metadata"}) and return the count added."""
        # Robustness: nothing to do for an empty batch.
        if not parsed_chunks:
            return 0

        # 1. Add to Chroma (dense). IDs combine source, position and a
        # timestamp so re-ingesting the same file doesn't collide.
        ids = [f"{c['metadata']['source']}_{i}_{time.time()}" for i, c in enumerate(parsed_chunks)]
        texts = [c['text'] for c in parsed_chunks]
        metadatas = [c['metadata'] for c in parsed_chunks]

        embeddings = self.bi_encoder.encode(texts).tolist()

        self.collection.add(
            documents=texts,
            embeddings=embeddings,
            metadatas=metadatas,
            ids=ids
        )

        # 2. Rebuild BM25 (not incremental by default; rebuilding is fast
        # for corpora under ~10k chunks).
        current_docs = self.doc_store + texts
        tokenized_corpus = [doc.lower().split() for doc in current_docs]
        self.bm25 = BM25Okapi(tokenized_corpus)
        self.doc_store = current_docs

        # 3. Persist the sparse side-index.
        self.save_bm25()

        return len(texts)

    def search(self, query, top_k=5, alpha=0.5):
        """Hybrid search + cross-encoder re-rank.

        alpha: 1.0 = pure dense score, 0.0 = pure lexical score.
        Returns up to top_k dicts with ``chunk``, ``metadata``, ``score``.
        """
        # --- DENSE SEARCH (Chroma): over-fetch for the re-ranker. ---
        candidate_k = top_k * 3

        query_embedding = self.bi_encoder.encode([query]).tolist()

        chroma_results = self.collection.query(
            query_embeddings=query_embedding,
            n_results=candidate_k
        )

        # Bug fix: for an empty collection Chroma returns {'documents': [[]]},
        # and a non-empty outer list is truthy — the original check never
        # fired. Check the inner result list instead.
        doc_lists = chroma_results.get('documents') or []
        if not doc_lists or not doc_lists[0]:
            return []

        # Result structure: {'ids': [[...]], 'documents': [[...]],
        # 'metadatas': [[...]], 'distances': [[...]]}.
        dense_hits = {}
        retrieved_docs_map = {}  # id -> {'text', 'metadata'}

        for i, doc_id in enumerate(chroma_results['ids'][0]):
            # Convert distance to a similarity-like score.
            # NOTE(review): assumes a cosine-style metric where smaller
            # distance == more similar; with the default L2 metric this is
            # only a monotonic proxy, not a bounded similarity — confirm
            # the collection's metric setting.
            score = 1 - chroma_results['distances'][0][i]
            dense_hits[doc_id] = score
            retrieved_docs_map[doc_id] = {
                'text': chroma_results['documents'][0][i],
                'metadata': chroma_results['metadatas'][0][i]
            }

        # --- SPARSE RE-SCORING ---
        # Mapping BM25 corpus indices back to Chroma IDs is fragile if the
        # two stores drift, so we re-score only the dense candidates with a
        # simple term-frequency heuristic instead of retrieving with BM25.
        hybrid_candidates = []
        q_tokens = query.lower().split()

        for doc_id, dense_score in dense_hits.items():
            doc_text = retrieved_docs_map[doc_id]['text']
            doc_tokens = doc_text.lower().split()

            # Raw term-frequency score, squeezed roughly into [0, 1].
            tf_score = sum(doc_tokens.count(token) for token in q_tokens)
            lexical_score = min(tf_score / 5.0, 1.0)

            final_hybrid_score = (alpha * dense_score) + ((1 - alpha) * lexical_score)

            hybrid_candidates.append({
                "id": doc_id,
                "text": doc_text,
                "metadata": retrieved_docs_map[doc_id]['metadata'],
                "hybrid_score": final_hybrid_score
            })

        # Sort by hybrid score and keep the candidate pool.
        hybrid_candidates.sort(key=lambda x: x['hybrid_score'], reverse=True)
        top_candidates = hybrid_candidates[:candidate_k]

        # --- RE-RANKING (cross-encoder) ---
        pairs = [[query, c['text']] for c in top_candidates]
        # Robustness: predict() on an empty list can fail; bail out early.
        if not pairs:
            return []

        cross_scores = self.cross_encoder.predict(pairs)

        final_results = []
        for i, cand in enumerate(top_candidates):
            final_results.append({
                "chunk": cand['text'],
                "metadata": cand['metadata'],
                "score": cross_scores[i]
            })

        final_results.sort(key=lambda x: x['score'], reverse=True)
        return final_results[:top_k]
257
 
258
# --- UI LOGIC ---

# 1. Sync from the Hub exactly once per session.
if 'synced' not in st.session_state:
    DataManager.sync_from_hub()
    st.session_state.synced = True

# 2. Initialize the engine once (models + Chroma client are expensive).
if 'engine' not in st.session_state:
    with st.spinner("Initializing Vector Database..."):
        st.session_state.engine = PersistentSearchEngine()

with st.sidebar:
    st.header("🗄️ Knowledge Base")
    uploaded_files = st.file_uploader("Ingest Documents", accept_multiple_files=True)

    if uploaded_files and st.button("Add to Database"):
        with st.spinner("Parsing & Indexing..."):
            new_chunks = []
            for f in uploaded_files:
                txt, fname = parse_file(f)
                chunks = recursive_chunking(txt, fname)
                new_chunks.extend(chunks)

            if new_chunks:
                count = st.session_state.engine.add_documents(new_chunks)
                DataManager.sync_to_hub()  # auto-save to cloud
                st.success(f"Added {count} chunks and synced to Cloud!")
            else:
                # Robustness fix: the original gave no feedback when
                # nothing could be extracted from the uploads.
                st.warning("No text could be extracted from the uploaded files.")

    st.divider()
    st.info(f"Connected to: {DATASET_REPO_ID}")

# --- MAIN SEARCH UI ---
st.title("⚓ Navy Intelligent Search (RAG)")

query = st.text_input("Enter Query (e.g. 'Leave policy for O-3 and below'):")
col1, col2 = st.columns([1, 1])
with col1:
    top_k = st.number_input("Documents", 1, 10, 3)
with col2:
    alpha = st.slider("Hybrid Weight", 0.0, 1.0, 0.6, help="Higher = More Semantic")

if query:
    results = st.session_state.engine.search(query, top_k=top_k, alpha=alpha)

    # Robustness fix: the original rendered an empty results section and
    # still offered a summary over empty context when nothing matched.
    if not results:
        st.info("No matching documents found. Try ingesting documents first.")
    else:
        # Accumulate retrieved context for the RAG prompt below.
        context_text = ""

        st.markdown("### 🔍 Search Results")
        for res in results:
            meta = res['metadata']
            score = res['score']
            text = res['chunk']
            context_text += f"Source: {meta['source']} (Page {meta['page']})\nContent: {text}\n\n"

            with st.expander(f"{meta['source']} | Pg {meta['page']} (Score: {score:.2f})", expanded=True):
                st.markdown(text)

        # --- RAG: SUMMARIZATION ---
        st.divider()
        st.markdown("### 🤖 AI Intelligence")
        if st.button("Generate Summary / Answer"):
            from huggingface_hub import InferenceClient

            # Hosted instruct model via the HF Inference API.
            repo_id = "mistralai/Mistral-7B-Instruct-v0.3"
            llm_client = InferenceClient(model=repo_id, token=HF_TOKEN)

            prompt = f"""
You are a Navy Administrative Aide. Answer the user's question based ONLY on the context provided below.
If the answer is not in the context, say "I cannot find the answer in the provided documents."

CONTEXT:
{context_text}

USER QUESTION:
{query}

ANSWER:
"""

            with st.spinner("Consulting LLM..."):
                try:
                    response = llm_client.text_generation(prompt, max_new_tokens=500)
                    st.success(response)
                except Exception as e:
                    st.error(f"LLM Error: {e}")