Spaces:

NavyDevilDoc
/

Semantic_Search

Sleeping

App Files Files Community

NavyDevilDoc committed on Dec 15, 2025

Commit

b045f9c

verified ·

1 Parent(s): a239196

Update app.py

Browse files

Files changed (1) hide show

app.py +129 -107

app.py CHANGED Viewed

@@ -1,19 +1,18 @@
 import streamlit as st
 import pandas as pd
 import numpy as np
-from sentence_transformers import SentenceTransformer
 import faiss
 from rank_bm25 import BM25Okapi
 import pypdf
 import docx
-from io import BytesIO
 # --- CONFIGURATION ---
-st.set_page_config(page_title="Hybrid Semantic Search", layout="wide")
-# --- HELPER FUNCTIONS: FILE PARSING ---
 def parse_file(uploaded_file):
-    """Extracts text from various file formats."""
     text = ""
     try:
         if uploaded_file.name.endswith(".pdf"):
@@ -27,26 +26,30 @@ def parse_file(uploaded_file):
             text = uploaded_file.read().decode("utf-8")
         elif uploaded_file.name.endswith(".csv"):
             df = pd.read_csv(uploaded_file)
-            # Assuming a generic CSV, we just flatten it to text for now
             text = df.to_string()
     except Exception as e:
         st.error(f"Error reading file: {e}")
     return text
 def chunk_text(text, chunk_size=300, overlap=50):
-    """Splits text into overlapping chunks for better context."""
     words = text.split()
     chunks = []
     for i in range(0, len(words), chunk_size - overlap):
         chunk = " ".join(words[i:i + chunk_size])
-        if len(chunk) > 50:  # Filter out tiny chunks
             chunks.append(chunk)
     return chunks
-# --- CORE LOGIC: HYBRID SEARCH ENGINE ---
-class HybridSearchEngine:
-    def __init__(self, model_name):
-        self.model = SentenceTransformer(model_name)
         self.documents = []
         self.faiss_index = None
         self.bm25 = None
@@ -54,136 +57,155 @@ class HybridSearchEngine:
     def fit(self, documents):
         self.documents = documents
-        # 1. Build Dense Index (FAISS)
-        embeddings = self.model.encode(documents)
-        # Normalize for Cosine Similarity (Inner Product)
-        faiss.normalize_L2(embeddings)
-        dimension = embeddings.shape[1]
-        self.faiss_index = faiss.IndexFlatIP(dimension) # Inner Product = Cosine Sim
-        self.faiss_index.add(embeddings)
-        # 2. Build Sparse Index (BM25)
         tokenized_corpus = [doc.lower().split() for doc in documents]
         self.bm25 = BM25Okapi(tokenized_corpus)
     def search(self, query, top_k=5, alpha=0.5):
-        """
-        Alpha: Weighting factor.
-        1.0 = Pure Vector Search
-        0.0 = Pure Keyword Search
-        0.5 = Equal Hybrid
-        """
-        # --- Vector Search ---
-        query_vector = self.model.encode([query])
         faiss.normalize_L2(query_vector)
-        # Search more than we need to allow for re-ranking
-        v_scores, v_indices = self.faiss_index.search(query_vector, len(self.documents))
-        # Create a map of {doc_index: vector_score}
-        # Normalize vector scores to 0-1 range (approx)
-        v_results = {}
-        for i, idx in enumerate(v_indices[0]):
-            if idx != -1:
-                v_results[idx] = v_scores[0][i]
-        # --- Keyword Search (BM25) ---
         tokenized_query = query.lower().split()
         bm25_scores = self.bm25.get_scores(tokenized_query)
-        # Normalize BM25 scores (Min-Max Scaling) to match Vector scale
-        if max(bm25_scores) > 0:
             bm25_scores = (bm25_scores - min(bm25_scores)) / (max(bm25_scores) - min(bm25_scores))
-        # --- Hybrid Combination ---
-        final_results = []
-        for idx, doc in enumerate(self.documents):
-            v_score = v_results.get(idx, 0.0)
-            k_score = bm25_scores[idx]
-            # Weighted Score
-            final_score = (alpha * v_score) + ((1 - alpha) * k_score)
             final_results.append({
-                "chunk": doc,
-                "score": final_score,
-                "vector_score": v_score,
-                "keyword_score": k_score
             })
-        # Sort by final score
         final_results = sorted(final_results, key=lambda x: x["score"], reverse=True)
         return final_results[:top_k]
-# --- STREAMLIT UI ---
-st.title("⚡ Hybrid Search: Vector + Keywords")
-st.caption("Robust semantic search powered by FAISS (Dense) and BM25 (Sparse).")
 with st.sidebar:
-    st.header("⚙️ Configuration")
-    # 3. Select Embedding Model
     model_choice = st.selectbox(
-        "Embedding Model",
-        options=["all-MiniLM-L6-v2", "all-mpnet-base-v2", "multi-qa-mpnet-base-dot-v1"],
-        index=0,
-        help="MiniLM is fast; MPNet is more accurate but slower."
     )
-    # 2. Results Count
-    top_k = st.number_input("Results to Retrieve", min_value=1, max_value=50, value=5, step=1)
-    # Hybrid Weight Slider
-    alpha = st.slider("Hybrid Balance (Alpha)", 0.0, 1.0, 0.5,
-                      help="0.0 = Keywords Only, 1.0 = Vectors Only")
-    st.divider()
-    # 1. File Upload
-    uploaded_files = st.file_uploader(
-        "Upload Knowledge Base",
-        type=['txt', 'pdf', 'docx', 'csv'],
-        accept_multiple_files=True
-    )
-    process_btn = st.button("Build Database")
-# --- APP STATE MANAGEMENT ---
-if 'search_engine' not in st.session_state:
-    st.session_state.search_engine = None
-if process_btn and uploaded_files:
-    with st.spinner(f"Parsing files and initializing {model_choice}..."):
         all_chunks = []
         for file in uploaded_files:
-            raw_text = parse_file(file)
-            file_chunks = chunk_text(raw_text)
-            all_chunks.extend(file_chunks)
         if all_chunks:
-            engine = HybridSearchEngine(model_choice)
-            engine.fit(all_chunks)
-            st.session_state.search_engine = engine
-            st.success(f"Indexed {len(all_chunks)} chunks from {len(uploaded_files)} files!")
         else:
-            st.warning("No text found in uploaded files.")
-# --- SEARCH INTERFACE ---
-if st.session_state.search_engine:
-    query = st.text_input("Enter your query:", placeholder="e.g., 'What are the safety protocols for the engine room?'")
     if query:
-        results = st.session_state.search_engine.search(query, top_k=top_k, alpha=alpha)
-        st.subheader(f"Top {top_k} Matches")
         for i, res in enumerate(results):
-            with st.expander(f"Rank {i+1} (Score: {res['score']:.4f})", expanded=(i==0)):
-                st.markdown(f"**{res['chunk']}**")
-                # Metadata columns
-                c1, c2, c3 = st.columns(3)
-                c1.metric("Hybrid Score", f"{res['score']:.4f}")
-                c2.metric("Vector Match", f"{res['vector_score']:.4f}")
-                c3.metric("Keyword Match", f"{res['keyword_score']:.4f}")
 else:
-    st.info("👈 Please upload documents in the sidebar to begin.")

 import streamlit as st
 import pandas as pd
 import numpy as np
+from sentence_transformers import SentenceTransformer, CrossEncoder, util
 import faiss
 from rank_bm25 import BM25Okapi
 import pypdf
 import docx
+import torch
 # --- CONFIGURATION ---
+st.set_page_config(page_title="Advanced Semantic Search", layout="wide")
+# --- HELPER FUNCTIONS ---
 def parse_file(uploaded_file):
     text = ""
     try:
         if uploaded_file.name.endswith(".pdf"):
             text = uploaded_file.read().decode("utf-8")
         elif uploaded_file.name.endswith(".csv"):
             df = pd.read_csv(uploaded_file)
             text = df.to_string()
     except Exception as e:
         st.error(f"Error reading file: {e}")
     return text
 def chunk_text(text, chunk_size=300, overlap=50):
     words = text.split()
     chunks = []
     for i in range(0, len(words), chunk_size - overlap):
         chunk = " ".join(words[i:i + chunk_size])
+        if len(chunk) > 50:
             chunks.append(chunk)
     return chunks
+# --- CORE LOGIC: RETRIEVER + RE-RANKER ---
class SearchEngine:
    """Two-stage search over text chunks.

    Stage 1 builds a candidate pool from a dense FAISS index (bi-encoder
    embeddings) and a sparse BM25 index, blended with the weight ``alpha``.
    Stage 2 re-scores that pool with a cross-encoder and returns the best
    ``top_k`` chunks.
    """

    def __init__(self, bi_encoder_name):
        """Load the retrieval and re-ranking models.

        Args:
            bi_encoder_name: Model name handed to ``SentenceTransformer``.
        """
        # 1. Bi-Encoder (Fast Retrieval)
        self.bi_encoder = SentenceTransformer(bi_encoder_name)
        # 2. Cross-Encoder (Accurate Re-Ranking)
        # A standard MS MARCO model designed for this exact task.
        self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
        self.documents = []
        self.faiss_index = None
        self.bm25 = None

    def fit(self, documents):
        """Index ``documents`` for dense and sparse retrieval.

        Args:
            documents: List of chunk strings. An empty list clears both
                indexes, after which ``search`` returns no results.
        """
        self.documents = documents
        if not documents:
            # Nothing to embed or tokenize; leave the engine empty instead of
            # failing inside the embedding / BM25 set-up.
            self.faiss_index = None
            self.bm25 = None
            return

        # Build Dense Index
        embeddings = self.bi_encoder.encode(documents, convert_to_tensor=True)
        # Convert to a contiguous float32 numpy array for FAISS.
        embeddings_np = np.ascontiguousarray(embeddings.cpu().numpy(), dtype=np.float32)
        # Unit-length vectors make the inner product equal cosine similarity.
        faiss.normalize_L2(embeddings_np)
        dimension = embeddings_np.shape[1]
        self.faiss_index = faiss.IndexFlatIP(dimension)
        self.faiss_index.add(embeddings_np)

        # Build Sparse Index
        tokenized_corpus = [doc.lower().split() for doc in documents]
        self.bm25 = BM25Okapi(tokenized_corpus)

    @staticmethod
    def _min_max_normalize(scores):
        """Scale scores to 0-1; leave them untouched when no score is positive."""
        scores = np.asarray(scores, dtype=float)
        if scores.size == 0:
            return scores
        highest = scores.max()
        if highest <= 0:
            return scores
        span = highest - scores.min()
        if span == 0:
            # All scores equal (e.g. a single-chunk corpus): dividing by the
            # zero span would turn every score into NaN.
            return np.ones_like(scores)
        return (scores - scores.min()) / span

    def search(self, query, top_k=5, alpha=0.5):
        """Return the ``top_k`` chunks most relevant to ``query``.

        Args:
            query: Search string.
            top_k: Number of results to return.
            alpha: Hybrid weight for candidate selection; 1.0 uses only the
                vector score, 0.0 only the BM25 score.

        Returns:
            A list of dicts sorted by descending cross-encoder score, each with
            ``"chunk"`` (the text), ``"score"`` (cross-encoder score) and
            ``"original_hybrid_score"`` (stage-1 score). Empty when nothing
            has been indexed or ``top_k`` is not positive.
        """
        if self.faiss_index is None or self.bm25 is None or not self.documents:
            return []
        if top_k <= 0:
            return []

        # STAGE 1: RETRIEVAL (Get a candidate pool)
        # Retrieve 3x the requested amount to give the re-ranker options.
        candidate_k = min(len(self.documents), top_k * 3)

        # Vector Search
        query_vector = np.ascontiguousarray(
            self.bi_encoder.encode([query]), dtype=np.float32
        )
        faiss.normalize_L2(query_vector)
        v_scores, v_indices = self.faiss_index.search(query_vector, candidate_k)

        # BM25 Search, normalized to the 0-1 range
        tokenized_query = query.lower().split()
        bm25_scores = self._min_max_normalize(self.bm25.get_scores(tokenized_query))

        # Dense part of the hybrid score; FAISS pads missing hits with -1.
        vector_part = {
            int(idx): alpha * float(score)
            for score, idx in zip(v_scores[0], v_indices[0])
            if idx != -1
        }

        # Pool = dense hits plus the best BM25 hits.
        pool = set(vector_part)
        pool.update(int(idx) for idx in np.argsort(bm25_scores)[-candidate_k:])

        # BM25 scores exist for every document, so each pooled candidate gets
        # its keyword component. BM25-only candidates have no dense score
        # available and contribute 0.0 for that part.
        hybrid = {
            idx: vector_part.get(idx, 0.0) + (1 - alpha) * float(bm25_scores[idx])
            for idx in pool
        }

        # Sort candidates by Hybrid Score (ties broken by document index).
        sorted_candidates = sorted(
            hybrid.items(), key=lambda item: (-item[1], item[0])
        )[:candidate_k]

        # STAGE 2: RE-RANKING (Cross-Encoder)
        # Prepare pairs for the Cross-Encoder: [[query, doc1], [query, doc2]...]
        pairs = [[query, self.documents[idx]] for idx, _ in sorted_candidates]
        if not pairs:
            return []

        # Predict scores (logits)
        cross_scores = self.cross_encoder.predict(pairs)

        final_results = [
            {
                "chunk": self.documents[idx],
                "score": float(cross_score),  # This is the high-accuracy score
                "original_hybrid_score": hybrid_score,
            }
            for (idx, hybrid_score), cross_score in zip(sorted_candidates, cross_scores)
        ]

        # Sort by Cross-Encoder score
        final_results.sort(key=lambda result: result["score"], reverse=True)
        return final_results[:top_k]
# --- UI LAYOUT ---
st.title("🧠 Semantic Search: Hybrid + Cross-Encoder")
st.markdown("""
This system uses a **Two-Stage Retrieval Process**:
1. **Retrieval:** Finds top candidates using Vector (semantic) and BM25 (keyword) search.
2. **Re-Ranking:** A Cross-Encoder model reads the query and candidates to score true relevance.
""")

with st.sidebar:
    st.header("1. Setup Knowledge Base")
    uploaded_files = st.file_uploader(
        "Upload Documents",
        type=['txt', 'pdf', 'docx', 'csv'],
        accept_multiple_files=True
    )
    st.divider()
    st.header("2. Tuning")
    model_choice = st.selectbox(
        "Base Embedding Model",
        ["all-MiniLM-L6-v2", "all-mpnet-base-v2"],
        help="Used for the initial fast retrieval."
    )
    alpha = st.slider("Hybrid Alpha", 0.0, 1.0, 0.4,
                      help="0.0 = Keywords, 1.0 = Vectors. 0.4 is often best for Hybrid.")
    top_k = st.number_input("Final Results", 1, 20, 5)
    build_btn = st.button("Build Database")

# --- APP STATE ---
# The engine lives in session state so it survives Streamlit's script reruns.
if 'engine' not in st.session_state:
    st.session_state.engine = None

if build_btn and not uploaded_files:
    # Tell the user why nothing happened instead of silently ignoring the click.
    st.warning("Upload at least one document before building the database.")
elif build_btn:
    with st.spinner("Processing files..."):
        all_chunks = []
        for file in uploaded_files:
            raw = parse_file(file)
            chunks = chunk_text(raw)
            all_chunks.extend(chunks)
        if all_chunks:
            # Initialize Engine
            st.session_state.engine = SearchEngine(model_choice)
            st.session_state.engine.fit(all_chunks)
            st.success(f"Indexed {len(all_chunks)} chunks!")
        else:
            st.warning("No text extracted.")

# --- SEARCH ---
if st.session_state.engine is not None:
    query = st.text_input("Ask a question:")
    if query:
        with st.spinner("Retrieving & Re-Ranking..."):
            results = st.session_state.engine.search(query, top_k=top_k, alpha=alpha)
        if not results:
            st.warning("No matching chunks found.")
        for i, res in enumerate(results):
            score = res['score']
            # Color code high relevance: positive cross-encoder scores in green.
            color = "green" if score > 0 else "blue"
            with st.container():
                st.markdown(f"### Rank {i+1}")
                st.caption(f"Relevance Score: :{color}[{score:.3f}]")
                st.info(res['chunk'])
                st.divider()
else:
    st.info("Upload documents to start.")