pradeep4321 committed on
Commit
f86ae3e
·
verified ·
1 Parent(s): 3d14b52

Update src/app.py

Browse files
Files changed (1) hide show
  1. src/app.py +25 -112
src/app.py CHANGED
@@ -12,7 +12,7 @@ import faiss
12
  import nltk
13
 
14
  # ==============================
15
- # FIX NLTK (HUGGINGFACE SAFE)
16
  # ==============================
17
  nltk_data_path = "/tmp/nltk_data"
18
  os.makedirs(nltk_data_path, exist_ok=True)
@@ -41,69 +41,33 @@ def load_model():
41
  model = load_model()
42
 
43
  # ==============================
44
- # SEARCH INFO
45
  # ==============================
46
- search_info = {
47
- "Keyword": ("Find exact word match", "iphone → iPhone"),
48
- "Regex": ("Pattern-based search", "^S → Samsung"),
49
- "Boolean": ("Use AND / OR", "nike AND shoes"),
50
- "Fuzzy": ("Handles spelling mistakes", "iphon → iPhone"),
51
- "N-Gram": ("Partial word match", "iph → iPhone"),
52
- "Prefix": ("Starts with query", "app → Apple"),
53
- "Suffix": ("Ends with query", "laptop → Dell Laptop"),
54
- "TF-IDF": ("Ranks important words", "wireless headphones"),
55
- "BM25": ("Advanced keyword ranking", "gaming laptop"),
56
- "Semantic": ("Understands meaning", "sports footwear"),
57
- "FAISS": ("Fast semantic search", "music device"),
58
- "Hybrid": ("Keyword + meaning", "sports shoes"),
59
- "Query Expansion": ("Adds similar words", "speaker → audio"),
60
- "Weighted Hybrid": ("Weighted ranking", "better accuracy"),
61
- "Ensemble": ("Combine all methods", "best results")
62
- }
63
-
64
- # ==============================
65
- # DATA SOURCE (NO UPLOAD)
66
- # ==============================
67
- data_option = st.radio("📂 Choose Data Source", ["Sample Data", "Default CSV (from repo)"])
68
-
69
- if data_option == "Default CSV (from repo)":
70
  try:
71
- df = pd.read_csv("products_sample.csv")
72
- st.success("✅ Loaded dataset from repository")
73
  except:
74
- st.error("❌ products_sample.csv not found. Using sample data instead.")
75
- df = None
76
-
77
- if data_option == "Sample Data" or df is None:
78
- df = pd.DataFrame({
79
- "product_name": [
80
- "iPhone 14 Pro",
81
- "Samsung Galaxy S23",
82
- "Nike Running Shoes",
83
- "Dell Gaming Laptop",
84
- "Bluetooth Speaker"
85
- ],
86
- "category": ["Mobile", "Mobile", "Footwear", "Laptop", "Electronics"],
87
- "brand": ["Apple", "Samsung", "Nike", "Dell", "JBL"],
88
- "description": [
89
- "Latest smartphone",
90
- "Android flagship phone",
91
- "Comfort sports shoes",
92
- "High performance laptop",
93
- "Portable music device"
94
- ]
95
- })
96
- st.info("Using sample dataset")
97
 
98
  # ==============================
99
- # DATA PREVIEW (ROW CONTROL)
100
  # ==============================
101
  st.subheader("📄 Data Preview")
102
 
103
  row_limit = st.selectbox(
104
  "Select number of rows to view",
105
- [5, 10, 20, 30, 50, 100],
106
- index=1
107
  )
108
 
109
  st.caption(f"Showing top {row_limit} rows")
@@ -122,7 +86,7 @@ df["combined"] = (
122
  products = df["combined"].tolist()
123
 
124
  # ==============================
125
- # PREPROCESSING
126
  # ==============================
127
  @st.cache_resource
128
  def preprocess_data(products):
@@ -178,15 +142,6 @@ def fuzzy_search(q):
178
  scores = [(i, fuzz.ratio(q, p)) for i, p in enumerate(products)]
179
  return sorted(scores, key=lambda x: x[1], reverse=True)[:10]
180
 
181
- def ngram_search(q):
182
- return [(i, 1) for i, p in enumerate(products) if q[:3].lower() in p.lower()]
183
-
184
- def prefix_search(q):
185
- return [(i, 1) for i, p in enumerate(products) if p.lower().startswith(q.lower())]
186
-
187
- def suffix_search(q):
188
- return [(i, 1) for i, p in enumerate(products) if p.lower().endswith(q.lower())]
189
-
190
  def tfidf_search(q):
191
  q_vec = tfidf.transform([q])
192
  scores = (tfidf_matrix @ q_vec.T).toarray().flatten()
@@ -217,51 +172,15 @@ def hybrid_search(q):
217
  combined = {i: tfidf_res.get(i, 0) + sem_res.get(i, 0) for i in range(len(products))}
218
  return sorted(combined.items(), key=lambda x: x[1], reverse=True)[:10]
219
 
220
- def query_expansion_search(q):
221
- synonyms = get_synonyms(q)
222
- expanded_query = q + " " + " ".join(synonyms)
223
- return tfidf_search(expanded_query)
224
-
225
- def weighted_hybrid(q):
226
- tfidf_res = dict(tfidf_search(q))
227
- sem_res = dict(semantic_search(q))
228
- bm25_res = dict(bm25_search(q))
229
-
230
- combined = {}
231
- for i in range(len(products)):
232
- combined[i] = (
233
- 0.4 * tfidf_res.get(i, 0) +
234
- 0.4 * sem_res.get(i, 0) +
235
- 0.2 * bm25_res.get(i, 0)
236
- )
237
- return sorted(combined.items(), key=lambda x: x[1], reverse=True)[:10]
238
-
239
- def ensemble_search(q):
240
- results = {}
241
- for func in [tfidf_search, semantic_search, bm25_search]:
242
- for i, score in func(q):
243
- results[i] = results.get(i, 0) + score
244
- return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
245
-
246
  # ==============================
247
- # SEARCH UI
248
  # ==============================
249
- search_type = st.selectbox("🔎 Select Search Type", list(search_info.keys()))
250
-
251
- explanation, example = search_info[search_type]
252
-
253
- st.markdown(f"""
254
- ### 🔍 {search_type} Search
255
- - **Explanation:** {explanation}
256
- - **Example:** `{example}`
257
- """)
258
 
259
  query = st.text_input("Enter your search query")
260
-
261
- if st.button("Try Example"):
262
- query = example.split("→")[0].strip()
263
- st.success(f"Example loaded: {query}")
264
-
265
  top_k = st.slider("Top Results", 5, 20, 10)
266
 
267
  if st.button("Search"):
@@ -273,17 +192,11 @@ if st.button("Search"):
273
  "Regex": regex_search,
274
  "Boolean": boolean_search,
275
  "Fuzzy": fuzzy_search,
276
- "N-Gram": ngram_search,
277
- "Prefix": prefix_search,
278
- "Suffix": suffix_search,
279
  "TF-IDF": tfidf_search,
280
  "BM25": bm25_search,
281
  "Semantic": semantic_search,
282
  "FAISS": faiss_search,
283
- "Hybrid": hybrid_search,
284
- "Query Expansion": query_expansion_search,
285
- "Weighted Hybrid": weighted_hybrid,
286
- "Ensemble": ensemble_search
287
  }
288
 
289
  results = func_map[search_type](query)[:top_k]
 
12
  import nltk
13
 
14
  # ==============================
15
+ # FIX NLTK (HF SAFE)
16
  # ==============================
17
  nltk_data_path = "/tmp/nltk_data"
18
  os.makedirs(nltk_data_path, exist_ok=True)
 
41
  model = load_model()
42
 
43
  # ==============================
44
+ # LOAD CSV FROM REPO
45
  # ==============================
46
+ @st.cache_data
47
+ def load_data():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  try:
49
+ df = pd.read_csv("products_10k.csv")
50
+ return df
51
  except:
52
+ st.warning("⚠️ products_10k.csv not found. Using fallback data.")
53
+ return pd.DataFrame({
54
+ "product_name": ["iPhone 14 Pro", "Samsung Galaxy S23"],
55
+ "category": ["Mobile", "Mobile"],
56
+ "brand": ["Apple", "Samsung"],
57
+ "description": ["Latest smartphone", "Android flagship phone"]
58
+ })
59
+
60
+ df = load_data()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  # ==============================
63
+ # DATA PREVIEW
64
  # ==============================
65
  st.subheader("📄 Data Preview")
66
 
67
  row_limit = st.selectbox(
68
  "Select number of rows to view",
69
+ [10, 20, 30, 50, 100],
70
+ index=0
71
  )
72
 
73
  st.caption(f"Showing top {row_limit} rows")
 
86
  products = df["combined"].tolist()
87
 
88
  # ==============================
89
+ # PREPROCESS
90
  # ==============================
91
  @st.cache_resource
92
  def preprocess_data(products):
 
142
  scores = [(i, fuzz.ratio(q, p)) for i, p in enumerate(products)]
143
  return sorted(scores, key=lambda x: x[1], reverse=True)[:10]
144
 
 
 
 
 
 
 
 
 
 
145
  def tfidf_search(q):
146
  q_vec = tfidf.transform([q])
147
  scores = (tfidf_matrix @ q_vec.T).toarray().flatten()
 
172
  combined = {i: tfidf_res.get(i, 0) + sem_res.get(i, 0) for i in range(len(products))}
173
  return sorted(combined.items(), key=lambda x: x[1], reverse=True)[:10]
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  # ==============================
176
+ # UI
177
  # ==============================
178
+ search_type = st.selectbox(
179
+ "🔎 Select Search Type",
180
+ ["Keyword", "Regex", "Boolean", "Fuzzy", "TF-IDF", "BM25", "Semantic", "FAISS", "Hybrid"]
181
+ )
 
 
 
 
 
182
 
183
  query = st.text_input("Enter your search query")
 
 
 
 
 
184
  top_k = st.slider("Top Results", 5, 20, 10)
185
 
186
  if st.button("Search"):
 
192
  "Regex": regex_search,
193
  "Boolean": boolean_search,
194
  "Fuzzy": fuzzy_search,
 
 
 
195
  "TF-IDF": tfidf_search,
196
  "BM25": bm25_search,
197
  "Semantic": semantic_search,
198
  "FAISS": faiss_search,
199
+ "Hybrid": hybrid_search
 
 
 
200
  }
201
 
202
  results = func_map[search_type](query)[:top_k]