pradeep4321 committed on
Commit
0a76730
·
verified ·
1 Parent(s): 9020b3a

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +282 -0
  2. products_sample.csv +16 -0
  3. requirements.txt +9 -3
app.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import re
5
+
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from sentence_transformers import SentenceTransformer
8
+ from rank_bm25 import BM25Okapi
9
+ from rapidfuzz import fuzz
10
+ import faiss
11
+ import nltk
12
+
13
+ # ==============================
14
+ # FIX NLTK (HUGGINGFACE SAFE)
15
+ # ==============================
16
# The synonym lookup used by query expansion reads the WordNet corpus, so it
# is fetched (quietly) before the corpus reader is imported.
nltk.download("wordnet", quiet=True)
from nltk.corpus import wordnet

# ==============================
# PAGE CONFIG
# ==============================
st.set_page_config(page_title="Multi Search Engine", layout="wide")
st.title("🔍 Advanced Multi-Search Product Engine")

# ==============================
# LOAD MODEL (NO CACHE BUG)
# ==============================
# The encoder is stored in the session state under "model" and only
# constructed when that key is absent, so reruns of the script within the
# same session reuse it.
if "model" not in st.session_state:
    with st.spinner("Loading AI model..."):
        st.session_state["model"] = SentenceTransformer(
            "all-MiniLM-L6-v2",
            device="cpu",
        )

model = st.session_state["model"]
36
+
37
+ # ==============================
38
+ # SEARCH INFO
39
+ # ==============================
40
# Maps each search mode offered in the selector to a pair of
# (short explanation, example). The "Try Example" button loads the text in
# front of "→" as the query, so every example has to begin with a real,
# searchable query rather than a slogan.
search_info = {
    "Keyword": ("Find exact word match", "iphone → iPhone"),
    "Regex": ("Pattern-based search", "^S → Samsung"),
    "Boolean": ("Use AND / OR", "nike AND shoes"),
    "Fuzzy": ("Handles spelling mistakes", "iphon → iPhone"),
    "N-Gram": ("Partial word match", "iph → iPhone"),
    "Prefix": ("Starts with query", "app → Apple"),
    "Suffix": ("Ends with query", "laptop → Dell Laptop"),
    "TF-IDF": ("Ranks important words", "wireless headphones"),
    "BM25": ("Advanced keyword ranking", "gaming laptop"),
    "Semantic": ("Understands meaning", "sports footwear"),
    "FAISS": ("Fast semantic search", "music device"),
    "Hybrid": ("Keyword + meaning", "sports shoes"),
    "Query Expansion": ("Adds similar words", "speaker → audio"),
    # These two previously used "better accuracy" / "best results" as the
    # example, which "Try Example" would load as a meaningless query.
    "Weighted Hybrid": ("Weighted ranking", "wireless speaker"),
    "Ensemble": ("Combine all methods", "running shoes"),
}
57
+
58
+ # ==============================
59
+ # CACHE PREPROCESSING (STABLE)
60
+ # ==============================
61
@st.cache(allow_output_mutation=True)
def preprocess_data(products):
    """Build every index the search functions rely on.

    Args:
        products: List of product text strings (one combined string per row).

    Returns:
        A 5-tuple ``(tfidf, tfidf_matrix, embeddings, index, bm25)``: the
        fitted TF-IDF vectorizer, its document-term matrix, the sentence
        embeddings, a FAISS inner-product index over those embeddings and a
        BM25 model built from whitespace-split tokens.
    """
    # Lexical features.
    vectorizer = TfidfVectorizer()
    doc_term_matrix = vectorizer.fit_transform(products)

    # Dense features. The progress bar is disabled on purpose (hosting fix
    # noted by the original author).
    vectors = model.encode(products, batch_size=64, show_progress_bar=False)

    # normalize_L2 works on the array it is given (its return value is not
    # used), so the vectors returned to the caller are the normalised ones.
    faiss.normalize_L2(vectors)

    flat_index = faiss.IndexFlatIP(vectors.shape[1])
    flat_index.add(np.array(vectors))

    # BM25 tokenisation is a plain whitespace split with the original casing.
    okapi = BM25Okapi([text.split() for text in products])

    return vectorizer, doc_term_matrix, vectors, flat_index, okapi
84
+
85
+
86
@st.cache(allow_output_mutation=True)
def get_synonyms(word):
    """Return the set of WordNet lemma names found in any synset of *word*.

    The set is empty when WordNet has no synset for the word.
    """
    return {
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }
93
+
94
+ # ==============================
95
+ # FILE LOAD
96
+ # ==============================
97
# Columns every dataset must provide; they are concatenated into the text
# that all search methods run against.
REQUIRED_COLUMNS = ("product_name", "category", "brand", "description")

uploaded_file = st.file_uploader("Upload CSV", type=["csv"])

if uploaded_file:
    try:
        df = pd.read_csv(uploaded_file)
    except (pd.errors.ParserError, pd.errors.EmptyDataError, UnicodeDecodeError) as exc:
        # A broken upload should produce a message, not a stack trace.
        st.error(f"Could not read the uploaded CSV: {exc}")
        st.stop()
else:
    st.info("Using sample dataset")
    df = pd.DataFrame({
        "product_name": [
            "iPhone 14 Pro",
            "Samsung Galaxy S23",
            "Nike Running Shoes",
            "Dell Gaming Laptop",
            "Bluetooth Speaker",
        ],
        "category": ["Mobile", "Mobile", "Footwear", "Laptop", "Electronics"],
        "brand": ["Apple", "Samsung", "Nike", "Dell", "JBL"],
        "description": [
            "Latest smartphone",
            "Android flagship phone",
            "Comfort sports shoes",
            "High performance laptop",
            "Portable music device",
        ],
    })

st.subheader("📄 Data Preview")
st.dataframe(df.head())

# Validate after the preview so the user can see what was actually loaded.
missing_columns = [name for name in REQUIRED_COLUMNS if name not in df.columns]
if missing_columns:
    st.error("CSV is missing required column(s): " + ", ".join(missing_columns))
    st.stop()
if df.empty:
    st.error("The dataset contains no rows")
    st.stop()

# ==============================
# COMBINE TEXT
# ==============================
# Missing cells are replaced with "" first; otherwise astype(str) would turn
# NaN into the literal word "nan" and pollute every index.
df["combined"] = (
    df["product_name"].fillna("").astype(str) + " " +
    df["category"].fillna("").astype(str) + " " +
    df["brand"].fillna("").astype(str) + " " +
    df["description"].fillna("").astype(str)
)

products = df["combined"].tolist()

# ==============================
# PREPROCESS (ONLY ONCE)
# ==============================
with st.spinner("Processing data..."):
    tfidf, tfidf_matrix, embeddings, index, bm25 = preprocess_data(products)
142
+
143
+ # ==============================
144
+ # SEARCH FUNCTIONS
145
+ # ==============================
146
def keyword_search(q):
    """Return ``(index, 1)`` for each product whose text contains *q*.

    The comparison is a case-insensitive substring test against the
    module-level ``products`` list.
    """
    needle = q.lower()
    hits = []
    for position, text in enumerate(products):
        if needle in text.lower():
            hits.append((position, 1))
    return hits
148
+
149
def regex_search(q, corpus=None):
    """Return ``(index, 1)`` for each product matching the regex *q*.

    Args:
        q: Regular expression typed by the user; matched case-insensitively
            with ``search`` semantics (anywhere in the text).
        corpus: Optional list of strings to search. Defaults to the
            module-level ``products`` list.

    Returns:
        List of ``(index, 1)`` tuples in corpus order. An invalid pattern
        yields an empty list instead of raising ``re.error``.
    """
    items = products if corpus is None else corpus
    try:
        # Compile once instead of re-parsing the pattern for every product.
        pattern = re.compile(q, re.IGNORECASE)
    except re.error:
        # Half-typed patterns such as "(" or "*abc" are common in a search
        # box; treat them as "no match" rather than crashing the page.
        return []
    return [(i, 1) for i, text in enumerate(items) if pattern.search(text)]
151
+
152
def boolean_search(q, corpus=None):
    """Evaluate a query made of terms joined by upper-case ``AND`` / ``OR``.

    ``AND`` binds tighter than ``OR``, so ``a AND b OR c`` means
    ``(a AND b) OR c``. Terms are matched as case-insensitive substrings.
    A query without operators is treated as a single term.

    Args:
        q: Query string, e.g. ``"nike AND shoes"``.
        corpus: Optional list of strings to search. Defaults to the
            module-level ``products`` list.

    Returns:
        List of ``(index, 1)`` tuples in corpus order; empty when the query
        contains no usable term.
    """
    items = products if corpus is None else corpus

    # Operators are only recognised as whole words, so "ANDROID" or "ORANGE"
    # are ordinary terms rather than being split in the middle.
    groups = []
    for clause in re.split(r"\bOR\b", q):
        terms = [part.strip().lower() for part in re.split(r"\bAND\b", clause)]
        # A dangling operator ("nike AND") leaves an empty term behind; an
        # empty string is a substring of everything, so it must be dropped.
        terms = [term for term in terms if term]
        if terms:
            groups.append(terms)

    if not groups:
        return []

    hits = []
    for i, text in enumerate(items):
        haystack = text.lower()
        if any(all(term in haystack for term in terms) for terms in groups):
            hits.append((i, 1))
    return hits
162
+
163
def fuzzy_search(q):
    """Rank products by fuzzy similarity to *q* and return the best ten.

    Both sides are lower-cased, and ``partial_ratio`` is used so that a short
    (possibly misspelled) query is compared with the best-aligned part of the
    product text instead of the whole, much longer string.

    Returns:
        Up to ten ``(index, score)`` tuples, highest score first; ties keep
        corpus order.
    """
    needle = q.lower()
    scored = [
        (i, fuzz.partial_ratio(needle, text.lower()))
        for i, text in enumerate(products)
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:10]
166
+
167
def ngram_search(q, corpus=None, n=3):
    """Score products by how many character n-grams of *q* they contain.

    Previously only the first three characters of the query were looked at;
    now every n-gram of the query contributes to the score.

    Args:
        q: Query string (compared case-insensitively).
        corpus: Optional list of strings to search. Defaults to the
            module-level ``products`` list.
        n: N-gram length (default 3). Queries of at most ``n`` characters are
            used as a single gram.

    Returns:
        ``(index, score)`` tuples for products sharing at least one gram,
        where score is the fraction of query grams found, best first (ties
        keep corpus order). Empty for an empty query.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    items = products if corpus is None else corpus
    needle = q.lower()
    if not needle:
        return []

    if len(needle) <= n:
        grams = {needle}
    else:
        grams = {needle[i:i + n] for i in range(len(needle) - n + 1)}

    scored = []
    for i, text in enumerate(items):
        haystack = text.lower()
        shared = sum(1 for gram in grams if gram in haystack)
        if shared:
            scored.append((i, shared / len(grams)))

    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored
169
+
170
def prefix_search(q, corpus=None):
    """Return ``(index, 1)`` for products in which some word starts with *q*.

    The searched text is several fields joined together, so testing only the
    start of the whole string would miss, for example, a brand in the middle
    of it. The query is therefore matched at the start of the text or right
    after any whitespace. Matching is case-insensitive and the query is
    treated literally (no regex syntax).

    Args:
        q: Prefix to look for; may contain spaces.
        corpus: Optional list of strings to search. Defaults to the
            module-level ``products`` list.

    Returns:
        List of ``(index, 1)`` tuples in corpus order; empty for a blank query.
    """
    items = products if corpus is None else corpus
    needle = q.strip().lower()
    if not needle:
        return []

    starts_word = re.compile(r"(?:^|\s)" + re.escape(needle))
    return [
        (i, 1)
        for i, text in enumerate(items)
        if starts_word.search(text.lower())
    ]
172
+
173
def suffix_search(q, corpus=None):
    """Return ``(index, 1)`` for products in which some word ends with *q*.

    The searched text is several fields joined together, so testing only the
    end of the whole string would depend on whatever the last field happens
    to be. The query is therefore matched at the end of the text or right
    before any whitespace. Matching is case-insensitive and the query is
    treated literally (no regex syntax).

    Args:
        q: Suffix to look for; may contain spaces.
        corpus: Optional list of strings to search. Defaults to the
            module-level ``products`` list.

    Returns:
        List of ``(index, 1)`` tuples in corpus order; empty for a blank query.
    """
    items = products if corpus is None else corpus
    needle = q.strip().lower()
    if not needle:
        return []

    ends_word = re.compile(re.escape(needle) + r"(?:\s|$)")
    return [
        (i, 1)
        for i, text in enumerate(items)
        if ends_word.search(text.lower())
    ]
175
+
176
def tfidf_search(q):
    """Rank products by the dot product of TF-IDF vectors with the query.

    Uses the module-level ``tfidf`` vectorizer and ``tfidf_matrix``.

    Returns:
        Up to ten ``(index, score)`` tuples, best first. Products with a
        score of zero (no weighted term in common with the query) are left
        out instead of being returned as filler results.
    """
    query_vec = tfidf.transform([q])
    scores = (tfidf_matrix @ query_vec.T).toarray().flatten()
    best_first = np.argsort(scores)[::-1][:10]
    # int()/float() turn NumPy scalars into plain Python values.
    return [(int(i), float(scores[i])) for i in best_first if scores[i] > 0]
181
+
182
def bm25_search(q):
    """Rank products with the module-level BM25 model.

    The query is split on whitespace, mirroring how the corpus was tokenised.

    Returns:
        Up to ten ``(index, score)`` tuples, best first. Products scoring
        exactly zero are left out instead of being returned as filler.
    """
    # NOTE(review): both the corpus and the query are tokenised with their
    # original casing, so "iphone" does not match "iPhone" here. Fixing that
    # requires lower-casing in the corpus tokenisation as well.
    scores = bm25.get_scores(q.split())
    best_first = np.argsort(scores)[::-1][:10]
    return [(int(i), float(scores[i])) for i in best_first if scores[i] != 0]
186
+
187
def semantic_search(q):
    """Rank products by the dot product of their embedding with the query's.

    The query is encoded with the module-level ``model``, L2-normalised and
    compared against the module-level ``embeddings`` matrix.

    Returns:
        Up to ten ``(index, score)`` tuples, highest score first.
    """
    query_vec = model.encode([q], show_progress_bar=False)
    faiss.normalize_L2(query_vec)

    similarities = np.dot(embeddings, query_vec.T).flatten()
    best_first = np.argsort(similarities)[::-1]

    ranked = []
    for i in best_first[:10]:
        ranked.append((i, float(similarities[i])))
    return ranked
193
+
194
def faiss_search(q):
    """Return the nearest products to *q* according to the FAISS index.

    The query is encoded with the module-level ``model``, L2-normalised and
    searched against the module-level ``index``.

    Returns:
        Up to ten ``(index, score)`` tuples in the order FAISS returns them.
        Never contains padding entries, even when the index holds fewer than
        ten vectors.
    """
    if index.ntotal == 0:
        return []

    q_emb = model.encode([q], show_progress_bar=False)
    faiss.normalize_L2(q_emb)

    # Asking for more neighbours than the index holds makes FAISS pad the
    # result with id -1, which downstream would select the last row via
    # negative indexing. Cap k and drop any padding that still appears.
    k = min(10, index.ntotal)
    scores, ids = index.search(np.array(q_emb), k)
    return [
        (int(doc_id), float(score))
        for doc_id, score in zip(ids[0], scores[0])
        if doc_id >= 0
    ]
199
+
200
def hybrid_search(q):
    """Rank products by the sum of their TF-IDF and semantic scores.

    A product missing from either result list contributes 0 for that part.

    Returns:
        Up to ten ``(index, score)`` tuples, highest combined score first.
    """
    lexical = dict(tfidf_search(q))
    semantic = dict(semantic_search(q))

    totals = []
    for i in range(len(products)):
        totals.append((i, lexical.get(i, 0) + semantic.get(i, 0)))

    totals.sort(key=lambda pair: pair[1], reverse=True)
    return totals[:10]
205
+
206
def query_expansion_search(q):
    """Run a TF-IDF search on *q* extended with WordNet synonyms.

    Synonyms are looked up for each whitespace-separated word of the query
    (looking up a multi-word query as one lemma finds nothing). Lemma names
    join multi-word expressions with underscores, which would never match a
    TF-IDF token, so they are broken into their individual words.

    Returns:
        Whatever ``tfidf_search`` returns for the expanded query.
    """
    query_words = q.split()
    extra_words = set()
    for word in query_words:
        for lemma_name in get_synonyms(word):
            extra_words.update(lemma_name.replace("_", " ").lower().split())

    # Do not repeat words already present; that would only change their
    # weight within the query.
    extra_words.difference_update(word.lower() for word in query_words)

    # Sorted for a deterministic expanded query string.
    expanded_query = " ".join([q, *sorted(extra_words)])
    return tfidf_search(expanded_query)
210
+
211
def weighted_hybrid(q):
    """Rank products by a weighted sum of TF-IDF, semantic and BM25 scores.

    Weights are 0.4 (TF-IDF), 0.4 (semantic) and 0.2 (BM25); a product
    missing from a result list contributes 0 for that part.

    Returns:
        Up to ten ``(index, score)`` tuples, highest combined score first.
    """
    weighted_sources = (
        (0.4, dict(tfidf_search(q))),
        (0.4, dict(semantic_search(q))),
        (0.2, dict(bm25_search(q))),
    )

    totals = [
        (i, sum(weight * scores.get(i, 0) for weight, scores in weighted_sources))
        for i in range(len(products))
    ]
    totals.sort(key=lambda pair: pair[1], reverse=True)
    return totals[:10]
224
+
225
def ensemble_search(q):
    """Rank products by the unweighted sum of TF-IDF, semantic and BM25 scores.

    Only products returned by at least one of the three searches appear.

    Returns:
        Up to ten ``(index, score)`` tuples, highest total first.
    """
    totals = {}
    for ranker in (tfidf_search, semantic_search, bm25_search):
        for doc_id, score in ranker(q):
            totals[doc_id] = totals.get(doc_id, 0) + score

    ranked = list(totals.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:10]
231
+
232
+ # ==============================
233
+ # UI
234
+ # ==============================
235
search_type = st.selectbox("Select Search Type", list(search_info.keys()))

explanation, example = search_info[search_type]

st.markdown(f"""
### 🔍 {search_type} Search
- **Explanation:** {explanation}
- **Example:** `{example}`
""")


def _load_example(example_text):
    """Put the query part of an example (the text before "→") into the query box."""
    st.session_state["query"] = example_text.split("→")[0].strip()


# The query box is bound to st.session_state["query"]. Assigning the example
# to a local variable did not work: pressing "Search" reruns the script, the
# "Try Example" button is no longer pressed on that run, and the example was
# gone. A callback writes the example into the widget state before the rerun,
# so it shows up in the box and is used by the next search.
query = st.text_input("Enter your search query", key="query")

if st.button("Try Example", on_click=_load_example, args=(example,)):
    st.success(f"Example loaded: {query}")

top_k = st.slider("Top Results", 5, 20, 10)

if st.button("Search"):
    if not query.strip():
        st.warning("Enter query")
    else:
        func_map = {
            "Keyword": keyword_search,
            "Regex": regex_search,
            "Boolean": boolean_search,
            "Fuzzy": fuzzy_search,
            "N-Gram": ngram_search,
            "Prefix": prefix_search,
            "Suffix": suffix_search,
            "TF-IDF": tfidf_search,
            "BM25": bm25_search,
            "Semantic": semantic_search,
            "FAISS": faiss_search,
            "Hybrid": hybrid_search,
            "Query Expansion": query_expansion_search,
            "Weighted Hybrid": weighted_hybrid,
            "Ensemble": ensemble_search,
        }

        try:
            results = func_map[search_type](query)[:top_k]
        except re.error as exc:
            # A malformed pattern typed into the Regex mode must not take the
            # whole page down.
            st.error(f"Invalid regular expression: {exc}")
            results = []

        # Ignore ids that do not refer to a row (e.g. a -1 placeholder);
        # negative positions would otherwise silently select rows from the end.
        results = [(i, score) for i, score in results if 0 <= i < len(df)]

        st.subheader("🔎 Results")
        if not results:
            st.info("No matching products found")
        else:
            indices = [i for i, _ in results]
            result_df = df.iloc[indices].copy()
            result_df["Score"] = [score for _, score in results]
            st.dataframe(result_df)
products_sample.csv ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ product_name,category,brand,description
2
+ iPhone 14 Pro Max,Mobile,Apple,Premium smartphone with advanced camera and performance
3
+ Samsung Galaxy S23,Mobile,Samsung,Android smartphone with high performance and display
4
+ Nike Running Shoes,Shoes,Nike,Lightweight sports footwear for running and training
5
+ Adidas Sports Shoes,Shoes,Adidas,Comfortable athletic shoes for sports and fitness
6
+ Sony Wireless Headphones,Electronics,Sony,Bluetooth headphones with noise cancellation
7
+ Boat Bluetooth Speaker,Electronics,Boat,Portable wireless speaker with powerful sound
8
+ Dell Gaming Laptop,Laptop,Dell,High performance laptop for gaming and heavy tasks
9
+ HP Office Laptop,Laptop,HP,Efficient laptop for office work and productivity
10
+ Apple MacBook Air,Laptop,Apple,Lightweight premium laptop with long battery life
11
+ Wooden Dining Table,Furniture,Ikea,Modern wooden table for dining room
12
+ Leather Sofa Set,Furniture,Urban Ladder,Comfortable seating sofa for living room
13
+ Canon DSLR Camera,Electronics,Canon,Professional camera for photography and video
14
+ Smart LED TV 55 inch,Electronics,Samsung,Ultra HD smart television with streaming apps
15
+ Puma Casual Shoes,Shoes,Puma,Stylish casual footwear for daily wear
16
+ Lenovo ThinkPad Laptop,Laptop,Lenovo,Business laptop with durability and performance
requirements.txt CHANGED
@@ -1,3 +1,9 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
 
 
1
+ streamlit==1.32.0
2
+ pandas
3
+ numpy
4
+ scikit-learn
5
+ sentence-transformers==2.2.2
6
+ faiss-cpu
7
+ rapidfuzz
8
+ rank-bm25
9
+ nltk