pradeep4321 committed on
Commit
24b5168
Β·
verified Β·
1 Parent(s): 828c082

Update src/app.py

Browse files
Files changed (1) hide show
  1. src/app.py +143 -66
src/app.py CHANGED
@@ -2,7 +2,6 @@ import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
  import re
5
- import os
6
 
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sentence_transformers import SentenceTransformer
@@ -12,17 +11,9 @@ import faiss
12
  import nltk
13
 
14
  # ==============================
15
- # FIX NLTK (HF SAFE)
16
  # ==============================
17
- nltk_data_path = "/tmp/nltk_data"
18
- os.makedirs(nltk_data_path, exist_ok=True)
19
- nltk.data.path.append(nltk_data_path)
20
-
21
- try:
22
- nltk.data.find('corpora/wordnet')
23
- except:
24
- nltk.download('wordnet', download_dir=nltk_data_path)
25
-
26
  from nltk.corpus import wordnet
27
 
28
  # ==============================
@@ -34,44 +25,71 @@ st.title("πŸ” Advanced Multi-Search Product Engine")
34
  # ==============================
35
  # LOAD MODEL
36
  # ==============================
37
- @st.cache_resource
38
- def load_model():
39
- return SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
 
 
 
40
 
41
- model = load_model()
42
 
43
  # ==============================
44
- # LOAD CSV FROM REPO
45
  # ==============================
46
- @st.cache_data
47
- def load_data():
48
- try:
49
- df = pd.read_csv("src/products_10k.csv")
50
- return df
51
- except:
52
- st.warning("⚠️ products_10k.csv not found. Using fallback data.")
53
- return pd.DataFrame({
54
- "product_name": ["iPhone 14 Pro", "Samsung Galaxy S23"],
55
- "category": ["Mobile", "Mobile"],
56
- "brand": ["Apple", "Samsung"],
57
- "description": ["Latest smartphone", "Android flagship phone"]
58
- })
 
 
 
 
59
 
60
- df = load_data()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  # ==============================
63
- # DATA PREVIEW
64
  # ==============================
65
  st.subheader("πŸ“„ Data Preview")
66
 
67
- row_limit = st.selectbox(
68
- "Select number of rows to view",
69
- [10, 20, 30, 50, 100],
70
- index=0
71
- )
72
-
73
- st.caption(f"Showing top {row_limit} rows")
74
- st.dataframe(df.head(row_limit), use_container_width=True)
75
 
76
  # ==============================
77
  # COMBINE TEXT
@@ -88,18 +106,15 @@ products = df["combined"].tolist()
88
  # ==============================
89
  # PREPROCESS
90
  # ==============================
91
- @st.cache_resource
92
  def preprocess_data(products):
93
-
94
  tfidf = TfidfVectorizer()
95
  tfidf_matrix = tfidf.fit_transform(products)
96
 
97
- embeddings = model.encode(products, batch_size=32, show_progress_bar=False)
98
-
99
  faiss.normalize_L2(embeddings)
100
 
101
- dim = embeddings.shape[1]
102
- index = faiss.IndexFlatIP(dim)
103
  index.add(np.array(embeddings))
104
 
105
  tokenized = [p.split() for p in products]
@@ -107,7 +122,11 @@ def preprocess_data(products):
107
 
108
  return tfidf, tfidf_matrix, embeddings, index, bm25
109
 
110
- @st.cache_resource
 
 
 
 
111
  def get_synonyms(word):
112
  synonyms = set()
113
  for syn in wordnet.synsets(word):
@@ -115,9 +134,6 @@ def get_synonyms(word):
115
  synonyms.add(lemma.name())
116
  return synonyms
117
 
118
- with st.spinner("βš™οΈ Processing data..."):
119
- tfidf, tfidf_matrix, embeddings, index, bm25 = preprocess_data(products)
120
-
121
  # ==============================
122
  # SEARCH FUNCTIONS
123
  # ==============================
@@ -140,25 +156,35 @@ def boolean_search(q):
140
 
141
  def fuzzy_search(q):
142
  scores = [(i, fuzz.ratio(q, p)) for i, p in enumerate(products)]
143
- return sorted(scores, key=lambda x: x[1], reverse=True)[:10]
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
  def tfidf_search(q):
146
  q_vec = tfidf.transform([q])
147
  scores = (tfidf_matrix @ q_vec.T).toarray().flatten()
148
- idx = np.argsort(scores)[::-1][:10]
149
- return [(i, float(scores[i])) for i in idx]
150
 
151
  def bm25_search(q):
152
  scores = bm25.get_scores(q.split())
153
- idx = np.argsort(scores)[::-1][:10]
154
- return [(i, float(scores[i])) for i in idx]
155
 
156
  def semantic_search(q):
157
  q_emb = model.encode([q], show_progress_bar=False)
158
  faiss.normalize_L2(q_emb)
159
  scores = np.dot(embeddings, q_emb.T).flatten()
160
- idx = np.argsort(scores)[::-1][:10]
161
- return [(i, float(scores[i])) for i in idx]
162
 
163
  def faiss_search(q):
164
  q_emb = model.encode([q], show_progress_bar=False)
@@ -169,20 +195,62 @@ def faiss_search(q):
169
  def hybrid_search(q):
170
  tfidf_res = dict(tfidf_search(q))
171
  sem_res = dict(semantic_search(q))
172
- combined = {i: tfidf_res.get(i, 0) + sem_res.get(i, 0) for i in range(len(products))}
173
- return sorted(combined.items(), key=lambda x: x[1], reverse=True)[:10]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
 
175
  # ==============================
176
  # UI
177
  # ==============================
178
- search_type = st.selectbox(
179
- "πŸ”Ž Select Search Type",
180
- ["Keyword", "Regex", "Boolean", "Fuzzy", "TF-IDF", "BM25", "Semantic", "FAISS", "Hybrid"]
181
- )
 
 
 
 
182
 
183
  query = st.text_input("Enter your search query")
 
 
 
 
 
184
  top_k = st.slider("Top Results", 5, 20, 10)
185
 
 
 
 
186
  if st.button("Search"):
187
  if not query:
188
  st.warning("Enter query")
@@ -192,18 +260,27 @@ if st.button("Search"):
192
  "Regex": regex_search,
193
  "Boolean": boolean_search,
194
  "Fuzzy": fuzzy_search,
 
 
 
195
  "TF-IDF": tfidf_search,
196
  "BM25": bm25_search,
197
  "Semantic": semantic_search,
198
  "FAISS": faiss_search,
199
- "Hybrid": hybrid_search
 
 
 
200
  }
201
 
202
- results = func_map[search_type](query)[:top_k]
 
 
 
203
 
204
  indices = [i for i, _ in results]
205
  result_df = df.iloc[indices].copy()
206
- result_df["Score"] = [score for _, score in results]
207
 
208
  st.subheader("πŸ”Ž Results")
209
- st.dataframe(result_df, use_container_width=True)
 
2
  import pandas as pd
3
  import numpy as np
4
  import re
 
5
 
6
  from sklearn.feature_extraction.text import TfidfVectorizer
7
  from sentence_transformers import SentenceTransformer
 
11
  import nltk
12
 
13
  # ==============================
14
+ # NLTK FIX
15
  # ==============================
16
+ nltk.download('wordnet', quiet=True)
 
 
 
 
 
 
 
 
17
  from nltk.corpus import wordnet
18
 
19
  # ==============================
 
25
# ==============================
# LOAD MODEL
# ==============================
# Streamlit reruns this whole script on every interaction, so keep the
# (expensive-to-load) sentence-transformer cached in session_state and
# reuse it across reruns.
if "model" not in st.session_state:
    with st.spinner("Loading AI model..."):
        st.session_state.model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')

model = st.session_state.model
36
 
37
# ==============================
# SEARCH INFO (UPDATED)
# ==============================
# Each row: (mode name, one-line explanation, example query) shown in the UI
# when the corresponding search mode is selected.
_SEARCH_MODE_DOCS = [
    ("Keyword", "Exact match", "iphone"),
    ("Regex", "Pattern match", "^Samsung"),
    ("Boolean", "AND / OR logic", "nike AND shoes"),
    ("Fuzzy", "Spelling mistakes", "iphon"),
    ("N-Gram", "Partial word", "iph"),
    ("Prefix", "Word starts with", "Sam"),
    ("Suffix", "Word ends with", "phone"),
    ("TF-IDF", "Keyword ranking", "wireless headphones"),
    ("BM25", "Advanced ranking", "gaming laptop"),
    ("Semantic", "Meaning search", "sports footwear"),
    ("FAISS", "Fast semantic", "music device"),
    ("Hybrid", "TF-IDF + Semantic", "running shoes"),
    ("Query Expansion", "Auto synonyms", "speaker"),
    ("Weighted Hybrid", "TF-IDF + Semantic + BM25", "best laptop"),
    ("Ensemble", "Combine all scores", "smartphone"),
]

search_info = {name: (info, example) for name, info, example in _SEARCH_MODE_DOCS}
57
 
58
# ==============================
# FILE LOAD (KEEP YOUR LOGIC)
# ==============================
uploaded_file = st.file_uploader("Upload CSV", type=["csv"])

if uploaded_file:
    # A user-provided catalogue takes priority over the built-in sample.
    df = pd.read_csv(uploaded_file)
else:
    st.info("Using sample dataset")
    # Small built-in catalogue so the app works without any upload.
    _sample_rows = [
        ("iPhone 14 Pro", "Mobile", "Apple", "Latest smartphone"),
        ("Samsung Galaxy S23", "Mobile", "Samsung", "Android flagship phone"),
        ("Nike Running Shoes", "Footwear", "Nike", "Comfort sports shoes"),
        ("Dell Gaming Laptop", "Laptop", "Dell", "High performance laptop"),
        ("Bluetooth Speaker", "Electronics", "JBL", "Portable music device"),
    ]
    df = pd.DataFrame(
        _sample_rows,
        columns=["product_name", "category", "brand", "description"],
    )
85
 
86
# ==============================
# DATA PREVIEW CONTROL
# ==============================
st.subheader("📄 Data Preview")

# Let the user choose how many rows of the catalogue to inspect.
rows_to_show = st.selectbox("Select rows to view", [10, 20, 50, 100])
st.dataframe(df.head(rows_to_show))
 
 
 
 
 
 
93
 
94
  # ==============================
95
  # COMBINE TEXT
 
106
# ==============================
# PREPROCESS
# ==============================
# NOTE: @st.cache(allow_output_mutation=True) is deprecated and removed in
# modern Streamlit; @st.cache_resource is the supported replacement for
# caching unpicklable resources such as a FAISS index.
@st.cache_resource
def preprocess_data(products):
    """Build every retrieval structure for the corpus.

    Args:
        products: list of combined product-text strings.

    Returns:
        (tfidf, tfidf_matrix, embeddings, index, bm25) — the fitted
        TF-IDF vectorizer and matrix, L2-normalized sentence embeddings,
        a FAISS inner-product index over them, and a BM25 ranker.
    """
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(products)

    embeddings = model.encode(products, batch_size=64, show_progress_bar=False)
    # Normalize so inner product == cosine similarity.
    faiss.normalize_L2(embeddings)

    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(np.array(embeddings))

    tokenized = [p.split() for p in products]
    bm25 = BM25Okapi(tokenized)

    return tfidf, tfidf_matrix, embeddings, index, bm25

tfidf, tfidf_matrix, embeddings, index, bm25 = preprocess_data(products)
126
+
127
# ==============================
# SYNONYMS
# ==============================
def get_synonyms(word):
    """Return the set of WordNet lemma names that are synonyms of *word*."""
    return {
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }
136
 
 
 
 
137
  # ==============================
138
  # SEARCH FUNCTIONS
139
  # ==============================
 
156
 
157
def fuzzy_search(q):
    """Rank all products by fuzzy string similarity (0-100) to the query."""
    ranked = [(idx, fuzz.ratio(q, text)) for idx, text in enumerate(products)]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
160
+
161
def ngram_search(q):
    """Score 1 for every product containing the query as a substring (case-insensitive)."""
    needle = q.lower()
    return [(idx, 1) for idx, text in enumerate(products) if needle in text.lower()]
163
+
164
# Word-level prefix match.
def prefix_search(q):
    """Score 1 for products where some word starts with the query (case-insensitive)."""
    needle = q.lower()
    hits = []
    for idx, text in enumerate(products):
        if any(word.startswith(needle) for word in text.lower().split()):
            hits.append((idx, 1))
    return hits
168
+
169
# Word-level suffix match.
def suffix_search(q):
    """Score 1 for products where some word ends with the query (case-insensitive)."""
    needle = q.lower()
    hits = []
    for idx, text in enumerate(products):
        if any(word.endswith(needle) for word in text.lower().split()):
            hits.append((idx, 1))
    return hits
173
 
174
def tfidf_search(q):
    """Score every product by TF-IDF similarity (sparse dot product) to the query."""
    query_vec = tfidf.transform([q])
    similarity = (tfidf_matrix @ query_vec.T).toarray().flatten()
    return [(idx, score) for idx, score in enumerate(similarity)]
 
178
 
179
def bm25_search(q):
    """Score every product with BM25 over whitespace-tokenized query terms."""
    terms = q.split()
    return list(enumerate(bm25.get_scores(terms)))
 
182
 
183
def semantic_search(q):
    """Score every product by similarity of L2-normalized sentence embeddings."""
    query_emb = model.encode([q], show_progress_bar=False)
    faiss.normalize_L2(query_emb)
    similarity = (embeddings @ query_emb.T).flatten()
    return [(idx, score) for idx, score in enumerate(similarity)]
 
188
 
189
  def faiss_search(q):
190
  q_emb = model.encode([q], show_progress_bar=False)
 
195
def hybrid_search(q):
    """Sum TF-IDF and semantic scores for every product index."""
    lexical = dict(tfidf_search(q))
    neural = dict(semantic_search(q))
    combined = []
    for idx in range(len(products)):
        combined.append((idx, lexical.get(idx, 0) + neural.get(idx, 0)))
    return combined
199
+
200
# Query expansion via WordNet synonyms.
def query_expansion_search(q):
    """TF-IDF search over the query expanded with WordNet synonyms.

    WordNet lemma names join multi-word terms with underscores (e.g.
    'loud_speaker'); convert them to spaces so the TF-IDF tokenizer
    sees real words instead of one opaque token.
    """
    terms = list(q.split())
    for word in q.split():
        terms.extend(syn.replace("_", " ") for syn in get_synonyms(word))
    return tfidf_search(" ".join(terms))
206
+
207
# Weighted blend of the three rankers.
def weighted_hybrid(q):
    """Blend scores per product: 0.4*TF-IDF + 0.4*semantic + 0.2*BM25."""
    lexical = dict(tfidf_search(q))
    neural = dict(semantic_search(q))
    ranking = dict(bm25_search(q))

    blended = []
    for idx in range(len(products)):
        score = (0.4 * lexical.get(idx, 0)
                 + 0.4 * neural.get(idx, 0)
                 + 0.2 * ranking.get(idx, 0))
        blended.append((idx, score))
    return blended
218
+
219
# Ensemble of all three rankers with per-ranker max-normalization.
def ensemble_search(q):
    """Sum max-normalized TF-IDF, semantic, and BM25 scores per product.

    Each ranker's scores are divided by their maximum so no single ranker
    dominates. Dividing by `np.max(scores + 1e-6)` (the previous code)
    inverts the contribution whenever the maximum is negative — BM25 can
    return all-negative scores — so non-positive maxima are zeroed out
    instead.
    """
    def _normalize(pairs):
        arr = np.asarray([score for _, score in pairs], dtype=float)
        peak = arr.max() if arr.size else 0.0
        if peak <= 0:
            # Nothing matched (or scores are all non-positive): contribute 0.
            return np.zeros_like(arr)
        return arr / peak

    combined = (_normalize(tfidf_search(q))
                + _normalize(semantic_search(q))
                + _normalize(bm25_search(q)))

    return list(enumerate(combined))
230
 
231
# ==============================
# UI
# ==============================
search_type = st.selectbox("🔎 Select Search Type", list(search_info.keys()))
explanation, example = search_info[search_type]

st.markdown(f"""
### 🔍 {search_type}
- **Explanation:** {explanation}
- **Example:** `{example}`
""")

# Handle the example button BEFORE creating the text_input: a widget's
# session_state value cannot be changed after the widget is instantiated,
# and the previous plain local assignment (`query = example`) was lost on
# the rerun the button click triggers, so the example never reached the box.
if st.button("Try Example"):
    st.session_state["query_input"] = example
    st.success(f"Loaded: {example}")

query = st.text_input("Enter your search query", key="query_input")

top_k = st.slider("Top Results", 5, 20, 10)
250
 
251
+ # ==============================
252
+ # SEARCH EXECUTION
253
+ # ==============================
254
  if st.button("Search"):
255
  if not query:
256
  st.warning("Enter query")
 
260
  "Regex": regex_search,
261
  "Boolean": boolean_search,
262
  "Fuzzy": fuzzy_search,
263
+ "N-Gram": ngram_search,
264
+ "Prefix": prefix_search,
265
+ "Suffix": suffix_search,
266
  "TF-IDF": tfidf_search,
267
  "BM25": bm25_search,
268
  "Semantic": semantic_search,
269
  "FAISS": faiss_search,
270
+ "Hybrid": hybrid_search,
271
+ "Query Expansion": query_expansion_search,
272
+ "Weighted Hybrid": weighted_hybrid,
273
+ "Ensemble": ensemble_search
274
  }
275
 
276
+ results = func_map[search_type](query)
277
+
278
+ # Sort results
279
+ results = sorted(results, key=lambda x: x[1], reverse=True)[:top_k]
280
 
281
  indices = [i for i, _ in results]
282
  result_df = df.iloc[indices].copy()
283
+ result_df["Score"] = [round(score, 4) for _, score in results]
284
 
285
  st.subheader("πŸ”Ž Results")
286
+ st.dataframe(result_df)