pradeep4321 committed on
Commit
3c85915
·
verified ·
1 Parent(s): bc91d34

Update src/app.py

Browse files
Files changed (1) hide show
  1. src/app.py +296 -281
src/app.py CHANGED
@@ -1,282 +1,297 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import numpy as np
4
- import re
5
-
6
- from sklearn.feature_extraction.text import TfidfVectorizer
7
- from sentence_transformers import SentenceTransformer
8
- from rank_bm25 import BM25Okapi
9
- from rapidfuzz import fuzz
10
- import faiss
11
- import nltk
12
-
13
- # ==============================
14
- # FIX NLTK (HUGGINGFACE SAFE)
15
- # ==============================
16
- nltk.download('wordnet', quiet=True)
17
- from nltk.corpus import wordnet
18
-
19
- # ==============================
20
- # PAGE CONFIG
21
- # ==============================
22
- st.set_page_config(page_title="Multi Search Engine", layout="wide")
23
- st.title("πŸ” Advanced Multi-Search Product Engine")
24
-
25
- # ==============================
26
- # LOAD MODEL (NO CACHE BUG)
27
- # ==============================
28
- if "model" not in st.session_state:
29
- with st.spinner("Loading AI model..."):
30
- st.session_state.model = SentenceTransformer(
31
- 'all-MiniLM-L6-v2',
32
- device='cpu'
33
- )
34
-
35
- model = st.session_state.model
36
-
37
- # ==============================
38
- # SEARCH INFO
39
- # ==============================
40
- search_info = {
41
- "Keyword": ("Find exact word match", "iphone β†’ iPhone"),
42
- "Regex": ("Pattern-based search", "^S β†’ Samsung"),
43
- "Boolean": ("Use AND / OR", "nike AND shoes"),
44
- "Fuzzy": ("Handles spelling mistakes", "iphon β†’ iPhone"),
45
- "N-Gram": ("Partial word match", "iph β†’ iPhone"),
46
- "Prefix": ("Starts with query", "app β†’ Apple"),
47
- "Suffix": ("Ends with query", "laptop β†’ Dell Laptop"),
48
- "TF-IDF": ("Ranks important words", "wireless headphones"),
49
- "BM25": ("Advanced keyword ranking", "gaming laptop"),
50
- "Semantic": ("Understands meaning", "sports footwear"),
51
- "FAISS": ("Fast semantic search", "music device"),
52
- "Hybrid": ("Keyword + meaning", "sports shoes"),
53
- "Query Expansion": ("Adds similar words", "speaker β†’ audio"),
54
- "Weighted Hybrid": ("Weighted ranking", "better accuracy"),
55
- "Ensemble": ("Combine all methods", "best results")
56
- }
57
-
58
- # ==============================
59
- # CACHE PREPROCESSING (STABLE)
60
- # ==============================
61
- @st.cache(allow_output_mutation=True)
62
- def preprocess_data(products):
63
-
64
- # TF-IDF
65
- tfidf = TfidfVectorizer()
66
- tfidf_matrix = tfidf.fit_transform(products)
67
-
68
- # Embeddings (NO progress bar β†’ HF fix)
69
- embeddings = model.encode(products, batch_size=64, show_progress_bar=False)
70
-
71
- # Normalize for FAISS
72
- faiss.normalize_L2(embeddings)
73
-
74
- # FAISS index
75
- dim = embeddings.shape[1]
76
- index = faiss.IndexFlatIP(dim)
77
- index.add(np.array(embeddings))
78
-
79
- # BM25
80
- tokenized = [p.split() for p in products]
81
- bm25 = BM25Okapi(tokenized)
82
-
83
- return tfidf, tfidf_matrix, embeddings, index, bm25
84
-
85
-
86
- @st.cache(allow_output_mutation=True)
87
- def get_synonyms(word):
88
- synonyms = set()
89
- for syn in wordnet.synsets(word):
90
- for lemma in syn.lemmas():
91
- synonyms.add(lemma.name())
92
- return synonyms
93
-
94
- # ==============================
95
- # FILE LOAD
96
- # ==============================
97
- uploaded_file = st.file_uploader("Upload CSV", type=["csv"])
98
-
99
- if uploaded_file:
100
- df = pd.read_csv(uploaded_file)
101
- else:
102
- st.info("Using sample dataset")
103
- df = pd.DataFrame({
104
- "product_name": [
105
- "iPhone 14 Pro",
106
- "Samsung Galaxy S23",
107
- "Nike Running Shoes",
108
- "Dell Gaming Laptop",
109
- "Bluetooth Speaker"
110
- ],
111
- "category": ["Mobile", "Mobile", "Footwear", "Laptop", "Electronics"],
112
- "brand": ["Apple", "Samsung", "Nike", "Dell", "JBL"],
113
- "description": [
114
- "Latest smartphone",
115
- "Android flagship phone",
116
- "Comfort sports shoes",
117
- "High performance laptop",
118
- "Portable music device"
119
- ]
120
- })
121
-
122
- st.subheader("πŸ“„ Data Preview")
123
- st.dataframe(df.head())
124
-
125
- # ==============================
126
- # COMBINE TEXT
127
- # ==============================
128
- df["combined"] = (
129
- df["product_name"].astype(str) + " " +
130
- df["category"].astype(str) + " " +
131
- df["brand"].astype(str) + " " +
132
- df["description"].astype(str)
133
- )
134
-
135
- products = df["combined"].tolist()
136
-
137
- # ==============================
138
- # PREPROCESS (ONLY ONCE)
139
- # ==============================
140
- with st.spinner("Processing data..."):
141
- tfidf, tfidf_matrix, embeddings, index, bm25 = preprocess_data(products)
142
-
143
- # ==============================
144
- # SEARCH FUNCTIONS
145
- # ==============================
146
- def keyword_search(q):
147
- return [(i, 1) for i, p in enumerate(products) if q.lower() in p.lower()]
148
-
149
- def regex_search(q):
150
- return [(i, 1) for i, p in enumerate(products) if re.search(q, p, re.IGNORECASE)]
151
-
152
- def boolean_search(q):
153
- if "AND" in q:
154
- terms = q.split("AND")
155
- return [(i, 1) for i, p in enumerate(products)
156
- if all(t.strip().lower() in p.lower() for t in terms)]
157
- elif "OR" in q:
158
- terms = q.split("OR")
159
- return [(i, 1) for i, p in enumerate(products)
160
- if any(t.strip().lower() in p.lower() for t in terms)]
161
- return []
162
-
163
- def fuzzy_search(q):
164
- scores = [(i, fuzz.ratio(q, p)) for i, p in enumerate(products)]
165
- return sorted(scores, key=lambda x: x[1], reverse=True)[:10]
166
-
167
- def ngram_search(q):
168
- return [(i, 1) for i, p in enumerate(products) if q[:3].lower() in p.lower()]
169
-
170
- def prefix_search(q):
171
- return [(i, 1) for i, p in enumerate(products) if p.lower().startswith(q.lower())]
172
-
173
- def suffix_search(q):
174
- return [(i, 1) for i, p in enumerate(products) if p.lower().endswith(q.lower())]
175
-
176
- def tfidf_search(q):
177
- q_vec = tfidf.transform([q])
178
- scores = (tfidf_matrix @ q_vec.T).toarray().flatten()
179
- idx = np.argsort(scores)[::-1][:10]
180
- return [(i, float(scores[i])) for i in idx]
181
-
182
- def bm25_search(q):
183
- scores = bm25.get_scores(q.split())
184
- idx = np.argsort(scores)[::-1][:10]
185
- return [(i, float(scores[i])) for i in idx]
186
-
187
- def semantic_search(q):
188
- q_emb = model.encode([q], show_progress_bar=False)
189
- faiss.normalize_L2(q_emb)
190
- scores = np.dot(embeddings, q_emb.T).flatten()
191
- idx = np.argsort(scores)[::-1][:10]
192
- return [(i, float(scores[i])) for i in idx]
193
-
194
- def faiss_search(q):
195
- q_emb = model.encode([q], show_progress_bar=False)
196
- faiss.normalize_L2(q_emb)
197
- D, I = index.search(np.array(q_emb), 10)
198
- return [(i, float(D[0][idx])) for idx, i in enumerate(I[0])]
199
-
200
- def hybrid_search(q):
201
- tfidf_res = dict(tfidf_search(q))
202
- sem_res = dict(semantic_search(q))
203
- combined = {i: tfidf_res.get(i, 0) + sem_res.get(i, 0) for i in range(len(products))}
204
- return sorted(combined.items(), key=lambda x: x[1], reverse=True)[:10]
205
-
206
- def query_expansion_search(q):
207
- synonyms = get_synonyms(q)
208
- expanded_query = q + " " + " ".join(synonyms)
209
- return tfidf_search(expanded_query)
210
-
211
- def weighted_hybrid(q):
212
- tfidf_res = dict(tfidf_search(q))
213
- sem_res = dict(semantic_search(q))
214
- bm25_res = dict(bm25_search(q))
215
-
216
- combined = {}
217
- for i in range(len(products)):
218
- combined[i] = (
219
- 0.4 * tfidf_res.get(i, 0) +
220
- 0.4 * sem_res.get(i, 0) +
221
- 0.2 * bm25_res.get(i, 0)
222
- )
223
- return sorted(combined.items(), key=lambda x: x[1], reverse=True)[:10]
224
-
225
- def ensemble_search(q):
226
- results = {}
227
- for func in [tfidf_search, semantic_search, bm25_search]:
228
- for i, score in func(q):
229
- results[i] = results.get(i, 0) + score
230
- return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
231
-
232
- # ==============================
233
- # UI
234
- # ==============================
235
- search_type = st.selectbox("Select Search Type", list(search_info.keys()))
236
-
237
- explanation, example = search_info[search_type]
238
-
239
- st.markdown(f"""
240
- ### πŸ” {search_type} Search
241
- - **Explanation:** {explanation}
242
- - **Example:** `{example}`
243
- """)
244
-
245
- query = st.text_input("Enter your search query")
246
-
247
- if st.button("Try Example"):
248
- query = example.split("β†’")[0].strip()
249
- st.success(f"Example loaded: {query}")
250
-
251
- top_k = st.slider("Top Results", 5, 20, 10)
252
-
253
- if st.button("Search"):
254
- if not query:
255
- st.warning("Enter query")
256
- else:
257
- func_map = {
258
- "Keyword": keyword_search,
259
- "Regex": regex_search,
260
- "Boolean": boolean_search,
261
- "Fuzzy": fuzzy_search,
262
- "N-Gram": ngram_search,
263
- "Prefix": prefix_search,
264
- "Suffix": suffix_search,
265
- "TF-IDF": tfidf_search,
266
- "BM25": bm25_search,
267
- "Semantic": semantic_search,
268
- "FAISS": faiss_search,
269
- "Hybrid": hybrid_search,
270
- "Query Expansion": query_expansion_search,
271
- "Weighted Hybrid": weighted_hybrid,
272
- "Ensemble": ensemble_search
273
- }
274
-
275
- results = func_map[search_type](query)[:top_k]
276
-
277
- indices = [i for i, _ in results]
278
- result_df = df.iloc[indices].copy()
279
- result_df["Score"] = [score for _, score in results]
280
-
281
- st.subheader("πŸ”Ž Results")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  st.dataframe(result_df)
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import re
5
+ import os
6
+
7
+ from sklearn.feature_extraction.text import TfidfVectorizer
8
+ from sentence_transformers import SentenceTransformer
9
+ from rank_bm25 import BM25Okapi
10
+ from rapidfuzz import fuzz
11
+ import faiss
12
+ import nltk
13
+
14
# ==============================
# FIX NLTK (HUGGINGFACE SAFE)
# ==============================
# Download WordNet into a writable path (/tmp) so lookups work on
# read-only deployments such as Hugging Face Spaces.
nltk_data_path = "/tmp/nltk_data"
os.makedirs(nltk_data_path, exist_ok=True)
nltk.data.path.append(nltk_data_path)

try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    # Catch only "resource not found"; the previous bare `except:` also
    # swallowed KeyboardInterrupt/SystemExit and real bugs.
    nltk.download('wordnet', download_dir=nltk_data_path, quiet=True)

from nltk.corpus import wordnet
27
+
28
# ==============================
# PAGE CONFIG
# ==============================
# Must run before any other Streamlit call in the script.
st.set_page_config(page_title="Multi Search Engine", layout="wide")
st.title("πŸ” Advanced Multi-Search Product Engine")
33
+
34
# ==============================
# LOAD MODEL (CACHED)
# ==============================
@st.cache_resource
def load_model():
    """Load the sentence-embedding model once per server process."""
    # CPU keeps the app deployable on machines without a GPU.
    return SentenceTransformer('all-MiniLM-L6-v2', device='cpu')


model = load_model()
42
+
43
# ==============================
# SEARCH INFO
# ==============================
# Maps each search mode to (one-line explanation, example query) shown in
# the UI; the keys double as the options of the mode selectbox.
search_info: dict[str, tuple[str, str]] = {
    "Keyword": ("Find exact word match", "iphone β†’ iPhone"),
    "Regex": ("Pattern-based search", "^S β†’ Samsung"),
    "Boolean": ("Use AND / OR", "nike AND shoes"),
    "Fuzzy": ("Handles spelling mistakes", "iphon β†’ iPhone"),
    "N-Gram": ("Partial word match", "iph β†’ iPhone"),
    "Prefix": ("Starts with query", "app β†’ Apple"),
    "Suffix": ("Ends with query", "laptop β†’ Dell Laptop"),
    "TF-IDF": ("Ranks important words", "wireless headphones"),
    "BM25": ("Advanced keyword ranking", "gaming laptop"),
    "Semantic": ("Understands meaning", "sports footwear"),
    "FAISS": ("Fast semantic search", "music device"),
    "Hybrid": ("Keyword + meaning", "sports shoes"),
    "Query Expansion": ("Adds similar words", "speaker β†’ audio"),
    "Weighted Hybrid": ("Weighted ranking", "better accuracy"),
    "Ensemble": ("Combine all methods", "best results"),
}
63
+
64
# ==============================
# DATA SOURCE OPTION
# ==============================
# Columns the rest of the app indexes; an uploaded CSV must provide them.
REQUIRED_COLUMNS = ["product_name", "category", "brand", "description"]

data_option = st.radio("πŸ“‚ Choose Data Source", ["Sample Data", "Upload CSV"])

if data_option == "Upload CSV":
    uploaded_file = st.file_uploader("Upload CSV", type=["csv"])

    if uploaded_file is not None:
        try:
            df = pd.read_csv(uploaded_file)
            st.success("βœ… File uploaded successfully")
        except Exception as e:
            st.error(f"Error reading file: {e}")
            st.stop()

        # Fail fast with a clear message instead of a KeyError later when
        # the columns are combined into the search text.
        missing = [c for c in REQUIRED_COLUMNS if c not in df.columns]
        if missing:
            st.error(f"CSV is missing required columns: {', '.join(missing)}")
            st.stop()
    else:
        st.warning("⚠️ Upload a file or switch to Sample Data")
        st.stop()

else:
    # Small built-in demo catalogue so the app works with no upload.
    df = pd.DataFrame({
        "product_name": [
            "iPhone 14 Pro",
            "Samsung Galaxy S23",
            "Nike Running Shoes",
            "Dell Gaming Laptop",
            "Bluetooth Speaker"
        ],
        "category": ["Mobile", "Mobile", "Footwear", "Laptop", "Electronics"],
        "brand": ["Apple", "Samsung", "Nike", "Dell", "JBL"],
        "description": [
            "Latest smartphone",
            "Android flagship phone",
            "Comfort sports shoes",
            "High performance laptop",
            "Portable music device"
        ]
    })
    st.info("Using sample dataset")
103
+
104
# ==============================
# DATA PREVIEW
# ==============================
st.subheader("πŸ“„ Data Preview")
st.dataframe(df.head())

# ==============================
# COMBINE TEXT
# ==============================
# One searchable string per product: name, category, brand, description
# joined by single spaces (same result as the chained `+ " " +` form).
text_columns = ["product_name", "category", "brand", "description"]
df["combined"] = df[text_columns].astype(str).agg(" ".join, axis=1)

products = df["combined"].tolist()
121
+
122
# ==============================
# CACHE PREPROCESSING
# ==============================
@st.cache_resource
def preprocess_data(products):
    """Build every retrieval structure for the given product texts.

    Returns (tfidf vectorizer, tfidf matrix, normalized embeddings,
    FAISS inner-product index, BM25 index).  Cached so the expensive
    encoding runs once per dataset.
    """
    # Lexical index.
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(products)

    # Dense embeddings.  faiss.normalize_L2 mutates in place and requires
    # a contiguous float32 array, so cast defensively instead of relying
    # on the encoder's return dtype.
    embeddings = model.encode(products, batch_size=32, show_progress_bar=False)
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    faiss.normalize_L2(embeddings)

    # With L2-normalized vectors, inner product == cosine similarity.
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)  # already float32 ndarray; no extra np.array copy

    # BM25 over whitespace tokens.
    bm25 = BM25Okapi([p.split() for p in products])

    return tfidf, tfidf_matrix, embeddings, index, bm25
143
+
144
@st.cache_data
def get_synonyms(word):
    """Return WordNet synonyms of *word* as plain space-separated phrases.

    WordNet lemma names use underscores (e.g. "ice_cream"), which can
    never match product text, so normalise them to spaces.  st.cache_data
    suits this picklable return value better than cache_resource, which
    is meant for shared unpicklable resources.
    """
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().replace("_", " "))
    return synonyms
151
+
152
# ==============================
# PREPROCESS
# ==============================
# First run per dataset is slow (embedding); later runs hit the cache.
with st.spinner("βš™οΈ Processing data..."):
    tfidf, tfidf_matrix, embeddings, index, bm25 = preprocess_data(products)
157
+
158
# ==============================
# SEARCH FUNCTIONS
# ==============================
def keyword_search(q):
    """Case-insensitive substring match; every hit scores a constant 1."""
    needle = q.lower()
    return [(i, 1) for i, p in enumerate(products) if needle in p.lower()]
163
+
164
def regex_search(q):
    """Regex match over products (case-insensitive).

    An invalid user pattern previously raised re.error and crashed the
    app; now it simply yields no results.  Compiling once also avoids
    re-parsing the pattern for every product.
    """
    try:
        pattern = re.compile(q, re.IGNORECASE)
    except re.error:
        return []
    return [(i, 1) for i, p in enumerate(products) if pattern.search(p)]
166
+
167
def boolean_search(q):
    """AND / OR keyword search.

    "a AND b" requires every term, "a OR b" any term.  A query with
    neither operator now falls back to requiring all whitespace-separated
    terms (previously it silently returned nothing, which looked like a
    broken search to the user).
    """
    if "AND" in q:
        terms = q.split("AND")
        return [(i, 1) for i, p in enumerate(products)
                if all(t.strip().lower() in p.lower() for t in terms)]
    if "OR" in q:
        terms = q.split("OR")
        return [(i, 1) for i, p in enumerate(products)
                if any(t.strip().lower() in p.lower() for t in terms)]
    terms = q.split()
    if not terms:
        return []
    return [(i, 1) for i, p in enumerate(products)
            if all(t.lower() in p.lower() for t in terms)]
177
+
178
def fuzzy_search(q):
    """Typo-tolerant search, top 10 by similarity.

    partial_ratio scores the query against the best-matching substring,
    so a short query like "iphon" is not drowned out by the length of the
    full combined product text (plain fuzz.ratio was).  Lower-casing both
    sides keeps the comparison case-insensitive.
    """
    scores = [(i, fuzz.partial_ratio(q.lower(), p.lower()))
              for i, p in enumerate(products)]
    return sorted(scores, key=lambda x: x[1], reverse=True)[:10]
181
+
182
def ngram_search(q):
    """Cheap n-gram match: products containing the query's first 3 chars."""
    gram = q[:3].lower()
    return [(i, 1) for i, p in enumerate(products) if gram in p.lower()]
184
+
185
def prefix_search(q):
    """Match products where ANY word starts with the query.

    The original tested only the start of the whole combined string
    (name+category+brand+description), so the documented example
    "app β†’ Apple" could never match; token-level matching fixes that.
    """
    prefix = q.lower()
    return [(i, 1) for i, p in enumerate(products)
            if any(w.startswith(prefix) for w in p.lower().split())]
187
+
188
def suffix_search(q):
    """Match products where ANY word ends with the query.

    Like prefix_search, the original tested only the end of the whole
    combined string (always the description's last word), making the
    documented example "laptop β†’ Dell Laptop" unreliable; token-level
    matching fixes that.
    """
    suffix = q.lower()
    return [(i, 1) for i, p in enumerate(products)
            if any(w.endswith(suffix) for w in p.lower().split())]
190
+
191
def tfidf_search(q):
    """Rank products by TF-IDF dot-product with the query; top 10."""
    query_vec = tfidf.transform([q])
    scores = (tfidf_matrix @ query_vec.T).toarray().ravel()
    top = np.argsort(scores)[::-1][:10]
    return [(i, float(scores[i])) for i in top]
196
+
197
def bm25_search(q):
    """Rank products by BM25 over whitespace tokens; top 10."""
    scores = bm25.get_scores(q.split())
    ranked = np.argsort(scores)[::-1][:10]
    return [(i, float(scores[i])) for i in ranked]
201
+
202
def semantic_search(q):
    """Rank by similarity between the query embedding and product embeddings.

    Embeddings are L2-normalized at preprocessing time, so the dot product
    is cosine similarity.
    """
    q_emb = model.encode([q], show_progress_bar=False)
    faiss.normalize_L2(q_emb)
    scores = (embeddings @ q_emb.T).ravel()
    top = np.argsort(scores)[::-1][:10]
    return [(i, float(scores[i])) for i in top]
208
+
209
def faiss_search(q):
    """Fast semantic search via the FAISS inner-product index; top 10.

    FAISS pads its result with index -1 when the corpus holds fewer than
    k vectors (e.g. the 5-row sample dataset with k=10).  Those sentinels
    previously leaked into the results and made `df.iloc` select the LAST
    row with a garbage score; they are filtered out here.
    """
    q_emb = model.encode([q], show_progress_bar=False)
    faiss.normalize_L2(q_emb)
    D, I = index.search(np.array(q_emb), 10)
    return [(int(i), float(d)) for i, d in zip(I[0], D[0]) if i != -1]
214
+
215
def hybrid_search(q):
    """Sum of TF-IDF and semantic scores; missing entries count as 0."""
    lexical = dict(tfidf_search(q))
    dense = dict(semantic_search(q))
    combined = {doc: lexical.get(doc, 0) + dense.get(doc, 0)
                for doc in range(len(products))}
    return sorted(combined.items(), key=lambda item: item[1], reverse=True)[:10]
220
+
221
def query_expansion_search(q):
    """TF-IDF search over the query expanded with WordNet synonyms.

    Expands every whitespace token individually — the original looked up
    only the whole query string, so multi-word queries never gained any
    synonyms.  The expansion is sorted for deterministic output (TF-IDF
    is bag-of-words, so word order does not affect scores).
    """
    expansion = set()
    for token in q.split():
        expansion |= get_synonyms(token)
    expanded_query = q + " " + " ".join(sorted(expansion))
    return tfidf_search(expanded_query)
225
+
226
+ def weighted_hybrid(q):
227
+ tfidf_res = dict(tfidf_search(q))
228
+ sem_res = dict(semantic_search(q))
229
+ bm25_res = dict(bm25_search(q))
230
+
231
+ combined = {}
232
+ for i in range(len(products)):
233
+ combined[i] = (
234
+ 0.4 * tfidf_res.get(i, 0) +
235
+ 0.4 * sem_res.get(i, 0) +
236
+ 0.2 * bm25_res.get(i, 0)
237
+ )
238
+ return sorted(combined.items(), key=lambda x: x[1], reverse=True)[:10]
239
+
240
def ensemble_search(q):
    """Combine TF-IDF, semantic and BM25 via Reciprocal Rank Fusion; top 10.

    RRF scores each document as the sum of 1/(60 + rank) over the three
    rankings.  Fusing by rank sidesteps the incomparable raw score scales
    (TF-IDF ~[0,1], cosine ~[-1,1], BM25 unbounded) that let BM25 dominate
    the original raw-score sum.  60 is the conventional RRF damping
    constant from Cormack et al.
    """
    fused = {}
    for search_fn in (tfidf_search, semantic_search, bm25_search):
        for rank, (doc, _score) in enumerate(search_fn(q)):
            fused[doc] = fused.get(doc, 0.0) + 1.0 / (60 + rank)
    return sorted(fused.items(), key=lambda item: item[1], reverse=True)[:10]
246
+
247
# ==============================
# UI
# ==============================
search_type = st.selectbox("πŸ”Ž Select Search Type", list(search_info.keys()))

explanation, example = search_info[search_type]

st.markdown(f"""
### πŸ” {search_type} Search
- **Explanation:** {explanation}
- **Example:** `{example}`
""")

# Dispatch table: search-mode label -> implementation.
func_map = {
    "Keyword": keyword_search,
    "Regex": regex_search,
    "Boolean": boolean_search,
    "Fuzzy": fuzzy_search,
    "N-Gram": ngram_search,
    "Prefix": prefix_search,
    "Suffix": suffix_search,
    "TF-IDF": tfidf_search,
    "BM25": bm25_search,
    "Semantic": semantic_search,
    "FAISS": faiss_search,
    "Hybrid": hybrid_search,
    "Query Expansion": query_expansion_search,
    "Weighted Hybrid": weighted_hybrid,
    "Ensemble": ensemble_search
}

# Keep the query in session_state so "Try Example" survives the script
# rerun that every button click triggers.  Previously the example was
# stored in a local variable and was lost before the user could press
# "Search".  The state key must be written BEFORE the text_input widget
# is instantiated, hence the button sits above the input.
if "query_input" not in st.session_state:
    st.session_state["query_input"] = ""

if st.button("Try Example"):
    st.session_state["query_input"] = example.split("β†’")[0].strip()
    st.success(f"Example loaded: {st.session_state['query_input']}")

query = st.text_input("Enter your search query", key="query_input")

top_k = st.slider("Top Results", 5, 20, 10)

if st.button("Search"):
    if not query:
        st.warning("Enter query")
    else:
        # NOTE(review): every ranked search caps its own output at 10, so
        # slider values above 10 cannot yield more rows — confirm whether
        # the per-function cap should be lifted to honour top_k fully.
        results = func_map[search_type](query)[:top_k]

        indices = [i for i, _ in results]
        result_df = df.iloc[indices].copy()
        result_df["Score"] = [score for _, score in results]

        st.subheader("πŸ”Ž Results")
        st.dataframe(result_df)