pradeep4321 committed on
Commit
e079419
·
verified ·
1 Parent(s): bef373d

Upload app.py

Browse files
Files changed (1) hide show
  1. src/app.py +282 -0
src/app.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import re
5
+
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from sentence_transformers import SentenceTransformer
8
+ from rank_bm25 import BM25Okapi
9
+ from rapidfuzz import fuzz
10
+ import faiss
11
+ import nltk
12
+
13
+ # ==============================
14
+ # FIX NLTK (HUGGINGFACE SAFE)
15
+ # ==============================
16
+ nltk.download('wordnet', quiet=True)
17
+ from nltk.corpus import wordnet
18
+
19
+ # ==============================
20
+ # PAGE CONFIG
21
+ # ==============================
22
# Page-level configuration. Streamlit's documentation asks for set_page_config
# to be called before other Streamlit commands, so this stays first.
st.set_page_config(page_title="Multi Search Engine", layout="wide")

# The magnifying-glass emoji had been stored mis-encoded (its UTF-8 bytes read
# through a legacy code page); the intended character is restored here.
st.title("🔍 Advanced Multi-Search Product Engine")
24
+
25
+ # ==============================
26
+ # LOAD MODEL (NO CACHE BUG)
27
+ # ==============================
28
# Keep the sentence-embedding model in st.session_state so that script reruns
# within the same session reuse the already-loaded instance.
if "model" in st.session_state:
    model = st.session_state.model
else:
    with st.spinner("Loading AI model..."):
        model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
    st.session_state.model = model
36
+
37
+ # ==============================
38
+ # SEARCH INFO
39
+ # ==============================
40
# One row per search mode: (display name, short explanation, example).
# The order of the rows is the order shown in the search-type select box.
_SEARCH_MODES = (
    ("Keyword", "Find exact word match", "iphone β†’ iPhone"),
    ("Regex", "Pattern-based search", "^S β†’ Samsung"),
    ("Boolean", "Use AND / OR", "nike AND shoes"),
    ("Fuzzy", "Handles spelling mistakes", "iphon β†’ iPhone"),
    ("N-Gram", "Partial word match", "iph β†’ iPhone"),
    ("Prefix", "Starts with query", "app β†’ Apple"),
    ("Suffix", "Ends with query", "laptop β†’ Dell Laptop"),
    ("TF-IDF", "Ranks important words", "wireless headphones"),
    ("BM25", "Advanced keyword ranking", "gaming laptop"),
    ("Semantic", "Understands meaning", "sports footwear"),
    ("FAISS", "Fast semantic search", "music device"),
    ("Hybrid", "Keyword + meaning", "sports shoes"),
    ("Query Expansion", "Adds similar words", "speaker β†’ audio"),
    ("Weighted Hybrid", "Weighted ranking", "better accuracy"),
    ("Ensemble", "Combine all methods", "best results"),
)

# Maps search-mode name -> (explanation, example).
search_info = {
    mode: (explanation, example)
    for mode, explanation, example in _SEARCH_MODES
}
57
+
58
+ # ==============================
59
+ # CACHE PREPROCESSING (STABLE)
60
+ # ==============================
61
@st.cache(allow_output_mutation=True)
def preprocess_data(products):
    """Build every search structure that is derived from the product texts.

    Args:
        products: list of product text strings to index.

    Returns:
        Tuple ``(tfidf, tfidf_matrix, embeddings, index, bm25)``: the fitted
        ``TfidfVectorizer``, its document-term matrix, the L2-normalised
        sentence embeddings, a FAISS inner-product index holding those
        embeddings, and a ``BM25Okapi`` model built from whitespace-split
        tokens.
    """
    # Lexical models.
    vectorizer = TfidfVectorizer()
    doc_term_matrix = vectorizer.fit_transform(products)
    bm25_model = BM25Okapi([text.split() for text in products])

    # Dense vectors; the progress bar is switched off.
    vectors = model.encode(products, batch_size=64, show_progress_bar=False)

    # normalize_L2 works in place. With unit-length vectors the inner product
    # computed by IndexFlatIP is the cosine similarity.
    faiss.normalize_L2(vectors)
    faiss_index = faiss.IndexFlatIP(vectors.shape[1])
    faiss_index.add(np.array(vectors))

    return vectorizer, doc_term_matrix, vectors, faiss_index, bm25_model
84
+
85
+
86
@st.cache(allow_output_mutation=True)
def get_synonyms(word):
    """Return the set of WordNet lemma names found in any synset of *word*."""
    return {
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }
93
+
94
+ # ==============================
95
+ # FILE LOAD
96
+ # ==============================
97
# Columns that are concatenated into the searchable text of each product.
REQUIRED_COLUMNS = ["product_name", "category", "brand", "description"]

uploaded_file = st.file_uploader("Upload CSV", type=["csv"])

if uploaded_file:
    try:
        df = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        st.error(f"Could not read the uploaded CSV: {exc}")
        st.stop()

    # Fail with a readable message instead of a KeyError traceback further
    # down when the upload does not have the expected schema.
    missing_columns = [c for c in REQUIRED_COLUMNS if c not in df.columns]
    if missing_columns:
        st.error(
            "Uploaded CSV is missing required column(s): "
            + ", ".join(missing_columns)
        )
        st.stop()

    # The vectorizers cannot be fitted on an empty corpus.
    if df.empty:
        st.error("Uploaded CSV contains no rows")
        st.stop()
else:
    st.info("Using sample dataset")
    df = pd.DataFrame({
        "product_name": [
            "iPhone 14 Pro",
            "Samsung Galaxy S23",
            "Nike Running Shoes",
            "Dell Gaming Laptop",
            "Bluetooth Speaker",
        ],
        "category": ["Mobile", "Mobile", "Footwear", "Laptop", "Electronics"],
        "brand": ["Apple", "Samsung", "Nike", "Dell", "JBL"],
        "description": [
            "Latest smartphone",
            "Android flagship phone",
            "Comfort sports shoes",
            "High performance laptop",
            "Portable music device",
        ],
    })

# The page emoji had been stored mis-encoded; the intended character is restored.
st.subheader("📄 Data Preview")
st.dataframe(df.head())

# ==============================
# COMBINE TEXT
# ==============================
# Missing cells are replaced by empty strings first; otherwise astype(str)
# would turn them into the literal word "nan" inside the searchable text.
text_parts = df[REQUIRED_COLUMNS].fillna("").astype(str)
df["combined"] = (
    text_parts["product_name"] + " " +
    text_parts["category"] + " " +
    text_parts["brand"] + " " +
    text_parts["description"]
)

products = df["combined"].tolist()

# ==============================
# PREPROCESS
# ==============================
with st.spinner("Processing data..."):
    tfidf, tfidf_matrix, embeddings, index, bm25 = preprocess_data(products)
142
+
143
+ # ==============================
144
+ # SEARCH FUNCTIONS
145
+ # ==============================
146
def keyword_search(q):
    """Return ``(index, 1)`` for each product whose text contains *q*, ignoring case."""
    needle = q.lower()
    hits = []
    for position, text in enumerate(products):
        if needle in text.lower():
            hits.append((position, 1))
    return hits
148
+
149
def regex_search(q):
    """Return ``(index, 1)`` for each product whose text matches regex *q*.

    Matching is case-insensitive and uses ``search`` semantics (the pattern
    may match anywhere in the text).

    Args:
        q: regular expression typed by the user.

    Returns:
        List of ``(index, 1)`` tuples. If *q* is not a valid regular
        expression, an error is shown in the page and an empty list is
        returned instead of letting ``re.error`` crash the app.
    """
    try:
        # Compile once: validates the user's pattern and avoids re-parsing it
        # for every product.
        pattern = re.compile(q, re.IGNORECASE)
    except re.error as exc:
        st.error(f"Invalid regular expression: {exc}")
        return []
    return [(i, 1) for i, p in enumerate(products) if pattern.search(p)]
151
+
152
def boolean_search(q):
    """Return ``(index, 1)`` for products matching a simple AND / OR query.

    The operator must be the standalone upper-case word ``AND`` or ``OR``.
    If the query contains ``AND`` it is treated as a pure AND query (every
    term must occur); otherwise, if it contains ``OR``, any term may occur.
    Mixing both operators is not supported. Terms are matched as
    case-insensitive substrings of the product text.

    Args:
        q: query such as ``"nike AND shoes"``.

    Returns:
        List of ``(index, 1)`` tuples; empty when the query has no operator
        or no non-empty term.
    """
    # Word boundaries keep the operator from being found inside ordinary
    # words such as "BRAND" or "MOTOROLA".
    if re.search(r"\bAND\b", q):
        operator_pattern, combine = r"\bAND\b", all
    elif re.search(r"\bOR\b", q):
        operator_pattern, combine = r"\bOR\b", any
    else:
        return []

    # Drop empty operands ("nike AND"): an empty string is a substring of
    # every text and would make the term match everything.
    terms = [t.strip().lower() for t in re.split(operator_pattern, q)]
    terms = [t for t in terms if t]
    if not terms:
        return []

    results = []
    for i, p in enumerate(products):
        text = p.lower()
        if combine(term in text for term in terms):
            results.append((i, 1))
    return results
162
+
163
def fuzzy_search(q):
    """Rank products by fuzzy string similarity to *q*.

    Both sides are lower-cased before scoring so that letter case does not
    count as a spelling difference, consistent with the other text matchers
    in this file.

    Args:
        q: possibly misspelled query.

    Returns:
        Up to 10 ``(index, score)`` tuples sorted by descending
        ``fuzz.ratio`` score.
    """
    needle = q.lower()
    scores = [(i, fuzz.ratio(needle, p.lower())) for i, p in enumerate(products)]
    return sorted(scores, key=lambda pair: pair[1], reverse=True)[:10]
166
+
167
def ngram_search(q):
    """Return ``(index, 1)`` for products containing the first three characters of *q*.

    Only the leading (up to) three characters of the query are used, and the
    comparison ignores case.
    """
    gram = q[:3].lower()
    matches = []
    for position, text in enumerate(products):
        if gram in text.lower():
            matches.append((position, 1))
    return matches
169
+
170
def prefix_search(q):
    """Return ``(index, 1)`` for products where *q* is a prefix, ignoring case.

    A product matches when its whole text starts with *q* or when any
    whitespace-separated word in it starts with *q*. The word-level check is
    needed because the text is a concatenation of several fields, so a
    string-level check alone could only ever match the first field (the
    in-app example ``app`` for ``Apple`` would never match).

    Args:
        q: prefix to look for.

    Returns:
        List of ``(index, 1)`` tuples in product order.
    """
    needle = q.lower()
    results = []
    for i, p in enumerate(products):
        text = p.lower()
        if text.startswith(needle) or any(
            word.startswith(needle) for word in text.split()
        ):
            results.append((i, 1))
    return results
172
+
173
def suffix_search(q):
    """Return ``(index, 1)`` for products where *q* is a suffix, ignoring case.

    A product matches when its whole text ends with *q* or when any
    whitespace-separated word in it ends with *q*. The word-level check is
    needed because the text is a concatenation of several fields, so a
    string-level check alone could only ever match the end of the last field.

    Args:
        q: suffix to look for.

    Returns:
        List of ``(index, 1)`` tuples in product order.
    """
    needle = q.lower()
    results = []
    for i, p in enumerate(products):
        text = p.lower()
        if text.endswith(needle) or any(
            word.endswith(needle) for word in text.split()
        ):
            results.append((i, 1))
    return results
175
+
176
def tfidf_search(q):
    """Rank products by the dot product of their TF-IDF vector with the query's.

    Returns:
        Up to 10 ``(index, score)`` tuples in descending score order. Products
        scoring 0 are still returned when fewer than 10 products match.
    """
    query_vector = tfidf.transform([q])
    similarity = (tfidf_matrix @ query_vector.T).toarray().flatten()
    ranked = np.argsort(similarity)[::-1]
    return [(i, float(similarity[i])) for i in ranked[:10]]
181
+
182
def bm25_search(q):
    """Rank products with BM25 using the whitespace-split tokens of *q*.

    Note: matching is case-sensitive, because neither the corpus tokens
    (built with ``str.split`` in ``preprocess_data``) nor the query tokens
    are lower-cased.

    Returns:
        Up to 10 ``(index, score)`` tuples in descending score order.
    """
    query_tokens = q.split()
    relevance = bm25.get_scores(query_tokens)
    ranked = np.argsort(relevance)[::-1]
    return [(i, float(relevance[i])) for i in ranked[:10]]
186
+
187
def semantic_search(q):
    """Rank products by embedding similarity to *q* (brute-force dot product).

    The query embedding is L2-normalised in place; the product embeddings
    were normalised in ``preprocess_data``, so the dot product is the cosine
    similarity.

    Returns:
        Up to 10 ``(index, score)`` tuples in descending score order.
    """
    query_vector = model.encode([q], show_progress_bar=False)
    faiss.normalize_L2(query_vector)
    similarity = np.dot(embeddings, query_vector.T).flatten()
    ranked = np.argsort(similarity)[::-1]
    return [(i, float(similarity[i])) for i in ranked[:10]]
193
+
194
def faiss_search(q):
    """Rank products by embedding similarity to *q* using the FAISS index.

    Args:
        q: free-text query.

    Returns:
        Up to 10 ``(index, score)`` tuples in the order FAISS returns them
        (best first). Never contains more entries than there are indexed
        products.
    """
    q_emb = model.encode([q], show_progress_bar=False)
    faiss.normalize_L2(q_emb)

    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with label -1, and -1 would later be read by df.iloc as
    # "last row", producing bogus duplicate results.
    k = min(10, index.ntotal)
    if k == 0:
        return []

    distances, labels = index.search(np.array(q_emb), k)
    return [
        (int(label), float(score))
        for score, label in zip(distances[0], labels[0])
        if label >= 0  # defensive: drop any padding that still appears
    ]
199
+
200
def hybrid_search(q):
    """Rank products by the sum of their TF-IDF and semantic scores.

    Only the top-10 entries of each underlying search contribute; a product
    absent from one of them gets 0 for that component.

    Returns:
        Up to 10 ``(index, combined_score)`` tuples in descending score order.
    """
    lexical = dict(tfidf_search(q))
    semantic = dict(semantic_search(q))
    totals = [
        (i, lexical.get(i, 0) + semantic.get(i, 0))
        for i in range(len(products))
    ]
    totals.sort(key=lambda pair: pair[1], reverse=True)
    return totals[:10]
205
+
206
def query_expansion_search(q):
    """Run a TF-IDF search on *q* expanded with WordNet synonyms.

    Synonyms are looked up for each whitespace-separated word of the query
    rather than for the query string as a whole, so that multi-word queries
    are expanded too. WordNet writes multi-word lemma names with underscores
    (for example ``loud_speaker``); these are converted to spaces so the
    individual words can match TF-IDF vocabulary terms.

    Args:
        q: free-text query.

    Returns:
        The result of ``tfidf_search`` on the expanded query.
    """
    extra_terms = set()
    for word in q.split():
        for lemma_name in get_synonyms(word):
            extra_terms.add(lemma_name.replace("_", " "))

    # Sorted only to make the expanded query deterministic.
    expanded_query = " ".join([q, *sorted(extra_terms)])
    return tfidf_search(expanded_query)
210
+
211
def weighted_hybrid(q):
    """Rank products by a weighted sum of TF-IDF, semantic and BM25 scores.

    Weights are 0.4 (TF-IDF), 0.4 (semantic) and 0.2 (BM25). Only the top-10
    entries of each underlying search contribute; a product absent from one
    of them gets 0 for that component.

    Returns:
        Up to 10 ``(index, combined_score)`` tuples in descending score order.
    """
    w_tfidf, w_semantic, w_bm25 = 0.4, 0.4, 0.2

    lexical = dict(tfidf_search(q))
    semantic = dict(semantic_search(q))
    okapi = dict(bm25_search(q))

    totals = [
        (
            i,
            w_tfidf * lexical.get(i, 0)
            + w_semantic * semantic.get(i, 0)
            + w_bm25 * okapi.get(i, 0),
        )
        for i in range(len(products))
    ]
    totals.sort(key=lambda pair: pair[1], reverse=True)
    return totals[:10]
224
+
225
def ensemble_search(q):
    """Rank products by the unweighted sum of TF-IDF, semantic and BM25 scores.

    Only products that appear in at least one of the three top-10 lists are
    considered.

    Returns:
        Up to 10 ``(index, summed_score)`` tuples in descending score order.
    """
    totals = {}
    for ranker in (tfidf_search, semantic_search, bm25_search):
        for idx, score in ranker(q):
            totals.setdefault(idx, 0)
            totals[idx] += score

    ranked = list(totals.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:10]
231
+
232
+ # ==============================
233
+ # UI
234
+ # ==============================
235
search_type = st.selectbox("Select Search Type", list(search_info.keys()))

explanation, example = search_info[search_type]

# The heading emoji had been stored mis-encoded; the intended character is restored.
st.markdown(f"""
### 🔍 {search_type} Search
- **Explanation:** {explanation}
- **Example:** `{example}`
""")


def _load_example(example_text):
    """Put the query part of an example (the text before the arrow) into the query box."""
    # The arrow appears in search_info in a mis-encoded form; accept both that
    # form and the real arrow so this keeps working if the table is repaired.
    query_part = re.split("β†’|→", example_text, maxsplit=1)[0].strip()
    st.session_state["query"] = query_part
    st.session_state["example_loaded"] = True


# The text input is bound to session state so that the "Try Example" callback
# can fill it. Assigning to a local variable after a button press is lost on
# the next rerun (the one triggered by pressing "Search"), which made the
# example button ineffective.
query = st.text_input("Enter your search query", key="query")

st.button("Try Example", on_click=_load_example, args=(example,))
if st.session_state.pop("example_loaded", False):
    st.success(f"Example loaded: {query}")

# NOTE: the ranking functions in this file return at most 10 results, so
# values above 10 only have an effect for the unranked match-based searches.
top_k = st.slider("Top Results", 5, 20, 10)

if st.button("Search"):
    if not query:
        st.warning("Enter query")
    else:
        func_map = {
            "Keyword": keyword_search,
            "Regex": regex_search,
            "Boolean": boolean_search,
            "Fuzzy": fuzzy_search,
            "N-Gram": ngram_search,
            "Prefix": prefix_search,
            "Suffix": suffix_search,
            "TF-IDF": tfidf_search,
            "BM25": bm25_search,
            "Semantic": semantic_search,
            "FAISS": faiss_search,
            "Hybrid": hybrid_search,
            "Query Expansion": query_expansion_search,
            "Weighted Hybrid": weighted_hybrid,
            "Ensemble": ensemble_search,
        }

        results = func_map[search_type](query)[:top_k]

        # Keep only valid row positions. A negative position would be read by
        # df.iloc as counting from the end and show an unrelated row.
        results = [(i, score) for i, score in results if 0 <= i < len(df)]

        st.subheader("🔎 Results")
        if not results:
            st.info("No results found")
        else:
            indices = [i for i, _ in results]
            result_df = df.iloc[indices].copy()
            result_df["Score"] = [score for _, score in results]
            st.dataframe(result_df)