NavyDevilDoc commited on
Commit
ef513a5
·
verified ·
1 Parent(s): fd2d4ca

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +189 -0
app.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sentence_transformers import SentenceTransformer
5
+ import faiss
6
+ from rank_bm25 import BM25Okapi
7
+ import pypdf
8
+ import docx
9
+ from io import BytesIO
10
+
11
# --- CONFIGURATION ---
# Streamlit requires set_page_config to be the first st.* call in the script.
st.set_page_config(page_title="Hybrid Semantic Search", layout="wide")
13
+
14
# --- HELPER FUNCTIONS: FILE PARSING ---
def parse_file(uploaded_file):
    """Extract plain text from an uploaded file (.pdf, .docx, .txt, or .csv).

    Args:
        uploaded_file: File-like object exposing a ``name`` attribute and
            ``read()`` (e.g. a Streamlit UploadedFile).

    Returns:
        str: Extracted text; "" for unrecognized extensions or on failure.
    """
    text = ""
    try:
        if uploaded_file.name.endswith(".pdf"):
            reader = pypdf.PdfReader(uploaded_file)
            for page in reader.pages:
                # extract_text() returns None for image-only pages; guard
                # so concatenation doesn't raise TypeError mid-document.
                text += (page.extract_text() or "") + "\n"
        elif uploaded_file.name.endswith(".docx"):
            doc = docx.Document(uploaded_file)
            text = "\n".join(para.text for para in doc.paragraphs)
        elif uploaded_file.name.endswith(".txt"):
            text = uploaded_file.read().decode("utf-8")
        elif uploaded_file.name.endswith(".csv"):
            df = pd.read_csv(uploaded_file)
            # Assuming a generic CSV, we just flatten it to text for now
            text = df.to_string()
    except Exception as e:
        # Surface the failure in the UI but still return any text gathered
        # so one bad file doesn't abort the whole batch.
        st.error(f"Error reading file: {e}")
    return text
35
+
36
def chunk_text(text, chunk_size=300, overlap=50):
    """Split *text* into overlapping word-based chunks for better context.

    Args:
        text: Raw document text.
        chunk_size: Maximum number of words per chunk.
        overlap: Words shared between consecutive chunks.

    Returns:
        list[str]: Chunks longer than 50 characters; tiny fragments are
        dropped because they carry too little context to be retrievable.
    """
    words = text.split()
    # Clamp the stride to >= 1: overlap >= chunk_size would otherwise make
    # range() raise ValueError (step 0) or walk backwards.
    step = max(1, chunk_size - overlap)
    chunks = []
    for i in range(0, len(words), step):
        chunk = " ".join(words[i:i + chunk_size])
        if len(chunk) > 50:  # Filter out tiny chunks
            chunks.append(chunk)
    return chunks
45
+
46
# --- CORE LOGIC: HYBRID SEARCH ENGINE ---
class HybridSearchEngine:
    """Hybrid retriever combining dense (FAISS cosine) and sparse (BM25) scores.

    Call fit() once over the chunk corpus, then search() per query.
    """

    def __init__(self, model_name):
        # Model loads eagerly; the indexes are built lazily in fit().
        self.model = SentenceTransformer(model_name)
        self.documents = []
        self.faiss_index = None
        self.bm25 = None

    def fit(self, documents):
        """Build both the dense and sparse indexes over *documents*."""
        self.documents = documents

        # 1. Build Dense Index (FAISS)
        embeddings = self.model.encode(documents)
        # Normalize for Cosine Similarity (Inner Product)
        faiss.normalize_L2(embeddings)
        dimension = embeddings.shape[1]
        self.faiss_index = faiss.IndexFlatIP(dimension)  # Inner Product = Cosine Sim
        self.faiss_index.add(embeddings)

        # 2. Build Sparse Index (BM25) over lower-cased whitespace tokens.
        tokenized_corpus = [doc.lower().split() for doc in documents]
        self.bm25 = BM25Okapi(tokenized_corpus)

    def search(self, query, top_k=5, alpha=0.5):
        """Return the top_k documents ranked by a weighted hybrid score.

        Args:
            query: Free-text query string.
            top_k: Number of results to return.
            alpha: Weighting factor.
                1.0 = Pure Vector Search
                0.0 = Pure Keyword Search
                0.5 = Equal Hybrid
        """
        # --- Vector Search ---
        query_vector = self.model.encode([query])
        faiss.normalize_L2(query_vector)
        # Score the whole corpus so every document has a dense score to blend.
        v_scores, v_indices = self.faiss_index.search(query_vector, len(self.documents))

        # Create a map of {doc_index: vector_score}. Cosine scores lie in
        # [-1, 1]; BM25 is min-max scaled below so the ranges are comparable.
        v_results = {}
        for i, idx in enumerate(v_indices[0]):
            if idx != -1:
                v_results[idx] = v_scores[0][i]

        # --- Keyword Search (BM25) ---
        tokenized_query = query.lower().split()
        bm25_scores = self.bm25.get_scores(tokenized_query)

        # Min-max scale BM25 scores to [0, 1]. Guard the denominator: when
        # every score is identical (and positive) the unguarded expression
        # divides by zero and floods the ranking with NaN; treat that case
        # as "no keyword signal".
        if max(bm25_scores) > 0:
            score_range = max(bm25_scores) - min(bm25_scores)
            if score_range > 0:
                bm25_scores = (bm25_scores - min(bm25_scores)) / score_range
            else:
                bm25_scores = np.zeros_like(bm25_scores)

        # --- Hybrid Combination ---
        final_results = []
        for idx, doc in enumerate(self.documents):
            v_score = v_results.get(idx, 0.0)
            k_score = bm25_scores[idx]

            # Weighted Score
            final_score = (alpha * v_score) + ((1 - alpha) * k_score)
            final_results.append({
                "chunk": doc,
                "score": final_score,
                "vector_score": v_score,
                "keyword_score": k_score,
            })

        # Sort by final score, best first.
        final_results = sorted(final_results, key=lambda x: x["score"], reverse=True)
        return final_results[:top_k]
115
+
116
# --- STREAMLIT UI ---

# Page header, shown above the sidebar-driven setup and the results area.
st.title("⚡ Hybrid Search: Vector + Keywords")
st.caption("Robust semantic search powered by FAISS (Dense) and BM25 (Sparse).")
120
+
121
with st.sidebar:
    st.header("⚙️ Configuration")

    # Embedding model: MiniLM is the fast default; the MPNet variants
    # trade speed for accuracy.
    _model_options = [
        "all-MiniLM-L6-v2",
        "all-mpnet-base-v2",
        "multi-qa-mpnet-base-dot-v1",
    ]
    model_choice = st.selectbox(
        "Embedding Model",
        options=_model_options,
        index=0,
        help="MiniLM is fast; MPNet is more accurate but slower.",
    )

    # How many ranked chunks to show per query.
    top_k = st.number_input(
        "Results to Retrieve", min_value=1, max_value=50, value=5, step=1
    )

    # Dense/sparse blending weight, passed straight into search().
    alpha = st.slider(
        "Hybrid Balance (Alpha)",
        min_value=0.0,
        max_value=1.0,
        value=0.5,
        help="0.0 = Keywords Only, 1.0 = Vectors Only",
    )

    st.divider()

    # Source documents for the knowledge base.
    uploaded_files = st.file_uploader(
        "Upload Knowledge Base",
        type=["txt", "pdf", "docx", "csv"],
        accept_multiple_files=True,
    )

    process_btn = st.button("Build Database")
149
+
150
# --- APP STATE MANAGEMENT ---
# The fitted engine survives Streamlit reruns via session state;
# None means "no index built yet".
if "search_engine" not in st.session_state:
    st.session_state.search_engine = None

if process_btn and uploaded_files:
    with st.spinner(f"Parsing files and initializing {model_choice}..."):
        # Flatten every uploaded file into a single list of text chunks.
        all_chunks = []
        for uploaded in uploaded_files:
            all_chunks.extend(chunk_text(parse_file(uploaded)))

        if not all_chunks:
            st.warning("No text found in uploaded files.")
        else:
            engine = HybridSearchEngine(model_choice)
            engine.fit(all_chunks)
            st.session_state.search_engine = engine
            st.success(f"Indexed {len(all_chunks)} chunks from {len(uploaded_files)} files!")
169
+
170
# --- SEARCH INTERFACE ---
engine = st.session_state.search_engine
if not engine:
    st.info("👈 Please upload documents in the sidebar to begin.")
else:
    query = st.text_input(
        "Enter your query:",
        placeholder="e.g., 'What are the safety protocols for the engine room?'",
    )

    if query:
        results = engine.search(query, top_k=top_k, alpha=alpha)

        st.subheader(f"Top {top_k} Matches")

        # One expander per hit; the best match starts opened.
        for rank, hit in enumerate(results, start=1):
            header = f"Rank {rank} (Score: {hit['score']:.4f})"
            with st.expander(header, expanded=(rank == 1)):
                st.markdown(f"**{hit['chunk']}**")

                # Score breakdown, side by side.
                hybrid_col, vector_col, keyword_col = st.columns(3)
                hybrid_col.metric("Hybrid Score", f"{hit['score']:.4f}")
                vector_col.metric("Vector Match", f"{hit['vector_score']:.4f}")
                keyword_col.metric("Keyword Match", f"{hit['keyword_score']:.4f}")