pradeep4321 committed on
Commit
ca8f7bb
·
verified ·
1 Parent(s): 131fb40

Update src/app.py

Browse files
Files changed (1) hide show
  1. src/app.py +172 -56
src/app.py CHANGED
@@ -2,20 +2,24 @@ import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
  import os
 
5
  import faiss
6
  import nltk
 
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sentence_transformers import SentenceTransformer
9
  from rank_bm25 import BM25Okapi
 
 
10
 
11
  # ==============================
12
- # INITIALIZATION & NLTK
13
  # ==============================
14
  nltk.download('wordnet', quiet=True)
15
  LOG_FILE = "user_logs.csv"
16
 
17
  # ==============================
18
- # LOGGING UTILITY
19
  # ==============================
20
  def log_activity(user, action, query, search_type):
21
  log_entry = {
@@ -32,17 +36,15 @@ def log_activity(user, action, query, search_type):
32
  else:
33
  df_log = pd.DataFrame([log_entry])
34
  df_log.to_csv(LOG_FILE, index=False)
35
- except Exception:
36
- pass # Prevent app crash on logging errors
37
 
38
  # ==============================
39
- # SECRETS & AUTHENTICATION
40
  # ==============================
41
  def login():
42
  st.title("πŸ” Login Required")
43
-
44
- # Hugging Face exposes secrets as environment variables
45
- # We check both os.environ (Cloud) and st.secrets (Local)
46
  HF_USER = os.environ.get("USERNAME") or st.secrets.get("USERNAME")
47
  HF_PASS = os.environ.get("PASSWORD") or st.secrets.get("PASSWORD")
48
 
@@ -51,11 +53,12 @@ def login():
51
 
52
  if st.button("Login"):
53
  if not HF_USER or not HF_PASS:
54
- st.error("⚠️ Secrets not configured! Add USERNAME and PASSWORD in Hugging Face Settings.")
55
  elif username == HF_USER and password == HF_PASS:
56
  st.session_state["authenticated"] = True
57
  st.session_state["user"] = username
58
  st.session_state["login_time"] = pd.Timestamp.now()
 
59
  log_activity(username, "Login Success", "-", "-")
60
  st.rerun()
61
  else:
@@ -70,108 +73,221 @@ if not st.session_state["authenticated"]:
70
  st.stop()
71
 
72
  # ==============================
73
- # PAGE CONFIG & UI
74
  # ==============================
75
  st.set_page_config(page_title="Multi Search Engine", layout="wide")
76
  st.title("πŸ” Advanced Multi-Search Product Engine")
77
 
78
- st.sidebar.success(f"πŸ‘€ User: {st.session_state['user']}")
 
79
  if st.sidebar.button("πŸšͺ Logout"):
80
  log_activity(st.session_state["user"], "Logout", "-", "-")
81
  st.session_state.clear()
82
  st.rerun()
83
 
84
  # ==============================
85
- # DATA LOADING & CACHING
86
  # ==============================
87
  @st.cache_resource
88
  def load_model():
89
  return SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
90
 
 
 
 
 
 
91
  @st.cache_data
92
  def load_data():
93
  path = "src/products_10k.csv"
94
  if not os.path.exists(path):
95
- st.error(f"Missing data file at {path}")
96
  return None
 
97
  df = pd.read_csv(path)
98
- # Fill NaN values to prevent search errors
99
  df["combined"] = (
100
  df["product_name"].fillna("") + " " +
101
  df["category"].fillna("") + " " +
102
  df["brand"].fillna("") + " " +
103
  df["description"].fillna("")
104
  )
 
105
  return df
106
 
107
- model = load_model()
108
  df = load_data()
109
- if df is None: st.stop()
 
 
 
 
 
 
 
 
 
 
110
 
111
  # ==============================
112
- # SEARCH PRE-PROCESSING
113
  # ==============================
114
  @st.cache_resource
115
- def get_search_assets(products):
116
- # TF-IDF
117
  tfidf = TfidfVectorizer()
118
  tfidf_matrix = tfidf.fit_transform(products)
119
-
120
- # Semantic/FAISS
121
  embeddings = model.encode(products, show_progress_bar=False)
122
  faiss.normalize_L2(embeddings)
 
123
  index = faiss.IndexFlatIP(embeddings.shape[1])
124
  index.add(np.array(embeddings))
125
-
126
- # BM25
127
- tokenized = [p.lower().split() for p in products]
128
- bm25 = BM25Okapi(tokenized)
129
-
130
  return tfidf, tfidf_matrix, embeddings, index, bm25
131
 
132
- products_list = df["combined"].tolist()
133
- tfidf, tf_matrix, embs, faiss_index, bm25 = get_search_assets(products_list)
 
 
 
 
 
 
 
 
 
134
 
135
  # ==============================
136
- # SEARCH FUNCTIONS
137
  # ==============================
138
- def run_search(q, mode, k):
 
139
  if mode == "Keyword":
140
- # Simple boolean check for exact matches
141
- matches = [(i, 1.0) for i, p in enumerate(products_list) if q.lower() in p.lower()]
142
- return matches[:k]
143
- else:
144
- # Semantic search using FAISS
145
- q_emb = model.encode([q])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  faiss.normalize_L2(q_emb)
147
- scores, indices = faiss_index.search(np.array(q_emb), k)
148
- return list(zip(indices[0], scores[0]))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
  # ==============================
151
- # MAIN APP EXECUTION
152
  # ==============================
153
- search_type = st.selectbox("πŸ”Ž Search Type", ["Keyword", "Semantic"])
154
- query = st.text_input("Search for products...")
155
- top_k = st.slider("Results to show", 5, 50, 10)
 
 
156
 
157
- if st.button("Search") and query:
158
- results = run_search(query, search_type, top_k)
159
- log_activity(st.session_state["user"], "Search", query, search_type)
160
 
161
- if results:
162
- idx = [r[0] for r in results if r[0] != -1]
163
- scores = [r[1] for r in results if r[0] != -1]
164
-
165
- final_df = df.iloc[idx].copy()
166
- final_df["Match Score"] = scores
167
- st.dataframe(final_df.drop(columns=["combined"]), use_container_width=True)
168
  else:
169
- st.info("No matching products found.")
 
 
 
 
 
 
 
 
 
 
 
 
 
170
 
171
  # ==============================
172
  # SIDEBAR LOGS
173
  # ==============================
174
- st.sidebar.markdown("---")
175
- st.sidebar.subheader("πŸ“Š Recent Activity")
176
  if os.path.exists(LOG_FILE):
177
- st.sidebar.dataframe(pd.read_csv(LOG_FILE).tail(5))
 
 
 
2
  import pandas as pd
3
  import numpy as np
4
  import os
5
+ import re
6
  import faiss
7
  import nltk
8
+
9
  from sklearn.feature_extraction.text import TfidfVectorizer
10
  from sentence_transformers import SentenceTransformer
11
  from rank_bm25 import BM25Okapi
12
+ from rapidfuzz import fuzz
13
+ from nltk.corpus import wordnet
14
 
15
  # ==============================
16
+ # INITIAL SETUP
17
  # ==============================
18
  nltk.download('wordnet', quiet=True)
19
  LOG_FILE = "user_logs.csv"
20
 
21
  # ==============================
22
+ # LOGGING FUNCTION
23
  # ==============================
24
  def log_activity(user, action, query, search_type):
25
  log_entry = {
 
36
  else:
37
  df_log = pd.DataFrame([log_entry])
38
  df_log.to_csv(LOG_FILE, index=False)
39
+ except:
40
+ pass
41
 
42
  # ==============================
43
+ # AUTHENTICATION
44
  # ==============================
45
  def login():
46
  st.title("πŸ” Login Required")
47
+
 
 
48
  HF_USER = os.environ.get("USERNAME") or st.secrets.get("USERNAME")
49
  HF_PASS = os.environ.get("PASSWORD") or st.secrets.get("PASSWORD")
50
 
 
53
 
54
  if st.button("Login"):
55
  if not HF_USER or not HF_PASS:
56
+ st.error("⚠️ Secrets not configured!")
57
  elif username == HF_USER and password == HF_PASS:
58
  st.session_state["authenticated"] = True
59
  st.session_state["user"] = username
60
  st.session_state["login_time"] = pd.Timestamp.now()
61
+
62
  log_activity(username, "Login Success", "-", "-")
63
  st.rerun()
64
  else:
 
73
  st.stop()
74
 
75
  # ==============================
76
+ # PAGE CONFIG
77
  # ==============================
78
  st.set_page_config(page_title="Multi Search Engine", layout="wide")
79
  st.title("πŸ” Advanced Multi-Search Product Engine")
80
 
81
+ st.sidebar.success(f"πŸ‘€ {st.session_state['user']}")
82
+
83
  if st.sidebar.button("πŸšͺ Logout"):
84
  log_activity(st.session_state["user"], "Logout", "-", "-")
85
  st.session_state.clear()
86
  st.rerun()
87
 
88
  # ==============================
89
+ # LOAD MODEL
90
  # ==============================
@st.cache_resource
def load_model():
    """Create the sentence-embedding model used by the semantic search modes.

    The function is decorated with ``st.cache_resource`` and takes no
    arguments; it always requests the ``all-MiniLM-L6-v2`` checkpoint on CPU.

    Returns:
        The constructed ``SentenceTransformer`` instance.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
    return encoder
94
 
95
+ model = load_model()
96
+
97
+ # ==============================
98
+ # LOAD DATA
99
+ # ==============================
@st.cache_data
def load_data():
    """Load the product catalogue and build the text column used for search.

    Reads ``src/products_10k.csv`` and adds a ``combined`` column that joins
    the product name, category, brand and description with single spaces.
    Missing values contribute an empty string.

    Returns:
        The catalogue as a DataFrame with the extra ``combined`` column, or
        ``None`` (after showing an error in the UI) when the CSV is missing.
    """
    path = "src/products_10k.csv"
    if not os.path.exists(path):
        st.error("Dataset not found!")
        return None

    df = pd.read_csv(path)

    text_columns = ["product_name", "category", "brand", "description"]
    # Cast to str after filling the gaps: a column that pandas parsed as
    # numeric (for example all-digit brand codes) would otherwise raise a
    # TypeError on the string concatenation below.
    parts = [df[col].fillna("").astype(str) for col in text_columns]

    combined = parts[0]
    for part in parts[1:]:
        combined = combined + " " + part
    df["combined"] = combined

    return df
117
 
 
df = load_data()
if df is None:
    # load_data() has already shown the error; halt this script run.
    st.stop()

# ==============================
# DATA PREVIEW
# ==============================
st.subheader("📄 Data Preview")
rows = st.selectbox("Rows to view", [10, 20, 50, 100])
st.dataframe(df.head(rows))

# Plain list of the combined text, one entry per catalogue row; the list
# position doubles as the row position used with df.iloc later on.
products = df["combined"].tolist()
130
 
131
  # ==============================
132
+ # PREPROCESS
133
  # ==============================
@st.cache_resource
def preprocess(products):
    """Build every search structure the app needs from the product texts.

    Args:
        products: List of product text strings, one per catalogue row.

    Returns:
        A 5-tuple ``(tfidf, tfidf_matrix, embeddings, index, bm25)``: the
        fitted TF-IDF vectorizer, its document-term matrix, the L2-normalised
        sentence embeddings, a FAISS inner-product index over those
        embeddings, and a BM25 model over lower-cased whitespace tokens.
    """
    # Lexical side: TF-IDF vocabulary and document-term matrix.
    vectorizer = TfidfVectorizer()
    doc_term = vectorizer.fit_transform(products)

    # Lexical side: BM25 over lower-cased, whitespace-split tokens.
    token_lists = []
    for text in products:
        token_lists.append(text.lower().split())
    okapi = BM25Okapi(token_lists)

    # Dense side: embed with the module-level model, then normalise in place
    # (the return value of normalize_L2 is not used) so that the inner
    # product computed by the index equals cosine similarity.
    vectors = model.encode(products, show_progress_bar=False)
    faiss.normalize_L2(vectors)

    dimension = vectors.shape[1]
    ip_index = faiss.IndexFlatIP(dimension)
    ip_index.add(np.array(vectors))

    return vectorizer, doc_term, vectors, ip_index, okapi
148
 
149
+ tfidf, tf_matrix, embs, faiss_index, bm25 = preprocess(products)
150
+
151
+ # ==============================
152
+ # SYNONYMS
153
+ # ==============================
def get_synonyms(word):
    """Return WordNet synonyms of ``word`` as plain, lower-case search terms.

    Args:
        word: A single query word to look up in WordNet.

    Returns:
        An alphabetically sorted list of distinct synonym strings. The list
        is empty when WordNet has no synsets for ``word``. ``word`` itself
        is not included.
    """
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            # WordNet writes multi-word lemmas with underscores
            # ("ice_cream"); turn them back into spaces so the individual
            # words can match tokenised product text.
            synonyms.add(lemma.name().replace("_", " ").lower())

    # The caller already has the original word; returning it again would
    # only duplicate the term in an expanded query.
    synonyms.discard(word.lower())

    # Sorted so that the expanded query is the same on every run.
    return sorted(synonyms)
160
 
161
  # ==============================
162
+ # SEARCH ENGINE
163
  # ==============================
def _tfidf_scores(text):
    """Return the TF-IDF dot-product score of ``text`` against every product."""
    return (tf_matrix @ tfidf.transform([text]).T).toarray().flatten()


def _bm25_scores(text):
    """Return the BM25 score of ``text`` against every product."""
    return np.asarray(bm25.get_scores(text.lower().split()))


def _semantic_scores(text):
    """Return the inner product of the query embedding with every product embedding."""
    q_emb = model.encode([text])
    faiss.normalize_L2(q_emb)
    return np.dot(embs, q_emb.T).flatten()


def _peak_normalise(scores):
    """Divide ``scores`` by their maximum (plus a small epsilon against 0/0)."""
    # NOTE(review): assumes the maximum is not negative; a negative peak
    # would flip the ordering. Confirm this cannot happen for the
    # semantic scores.
    return scores / (np.max(scores) + 1e-6)


def search_engine(query, mode, top_k):
    """Score the catalogue against ``query`` with the strategy named by ``mode``.

    Args:
        query: Query text as typed by the user.
        mode: Name of the search strategy ("Keyword", "Regex", "Boolean",
            "Fuzzy", "N-Gram", "Prefix", "Suffix", "TF-IDF", "BM25",
            "Semantic", "FAISS", "Hybrid", "Query Expansion",
            "Weighted Hybrid" or "Ensemble").
        top_k: Number of neighbours requested from the FAISS index. The
            other modes ignore it and leave truncation to the caller.

    Returns:
        A list of ``(row_position, score)`` pairs. Matching modes return
        only the matching rows with score 1; scoring modes return one pair
        per product. Only "Fuzzy" returns the list already sorted. An
        unknown mode or an invalid regular expression yields ``[]``.
    """
    needle = query.lower()

    if mode == "Keyword":
        return [(i, 1) for i, p in enumerate(products) if needle in p.lower()]

    if mode == "Regex":
        # The pattern comes straight from the user: compile it once and
        # report a bad pattern instead of crashing the page.
        try:
            pattern = re.compile(query, re.IGNORECASE)
        except re.error as exc:
            st.error(f"Invalid regular expression: {exc}")
            return []
        return [(i, 1) for i, p in enumerate(products) if pattern.search(p)]

    if mode == "Boolean":
        # Only stand-alone upper-case AND / OR are operators, so words such
        # as "BRAND" or "ORANGE" stay intact. AND wins when both appear.
        if re.search(r"\bAND\b", query):
            combine, splitter = all, r"\bAND\b"
        else:
            # No operator at all leaves a single term, i.e. a plain
            # substring match.
            combine, splitter = any, r"\bOR\b"

        terms = [t.strip().lower() for t in re.split(splitter, query)]
        # An empty operand ("a AND ") would be a substring of everything.
        terms = [t for t in terms if t]
        if not terms:
            return []

        hits = []
        for i, p in enumerate(products):
            text = p.lower()
            if combine(t in text for t in terms):
                hits.append((i, 1))
        return hits

    if mode == "Fuzzy":
        # Compare case-insensitively, like every other string mode.
        scored = [(i, fuzz.ratio(needle, p.lower())) for i, p in enumerate(products)]
        return sorted(scored, key=lambda pair: pair[1], reverse=True)

    if mode == "N-Gram":
        return [(i, 1) for i, p in enumerate(products)
                if any(needle in w for w in p.lower().split())]

    if mode == "Prefix":
        return [(i, 1) for i, p in enumerate(products)
                if any(w.startswith(needle) for w in p.lower().split())]

    if mode == "Suffix":
        return [(i, 1) for i, p in enumerate(products)
                if any(w.endswith(needle) for w in p.lower().split())]

    if mode == "TF-IDF":
        return list(enumerate(_tfidf_scores(query)))

    if mode == "BM25":
        return list(enumerate(_bm25_scores(query)))

    if mode == "Semantic":
        return list(enumerate(_semantic_scores(query)))

    if mode == "FAISS":
        q_emb = model.encode([query])
        faiss.normalize_L2(q_emb)
        distances, ids = faiss_index.search(np.array(q_emb), top_k)
        # FAISS pads the id row with -1 when it finds fewer than top_k
        # neighbours; those are not real rows.
        return [(int(i), float(d)) for i, d in zip(ids[0], distances[0]) if i != -1]

    if mode == "Hybrid":
        return list(enumerate(_tfidf_scores(query) + _semantic_scores(query)))

    if mode == "Query Expansion":
        words = query.split()
        expanded = list(words)
        for w in words:
            expanded.extend(get_synonyms(w))
        return list(enumerate(_tfidf_scores(" ".join(expanded))))

    if mode == "Weighted Hybrid":
        combined = (
            0.4 * _tfidf_scores(query) +
            0.4 * _semantic_scores(query) +
            0.2 * _bm25_scores(query)
        )
        return list(enumerate(combined))

    if mode == "Ensemble":
        # Each signal is scaled by its own peak before summing, so no
        # single scorer dominates through its raw value range.
        combined = (
            _peak_normalise(_tfidf_scores(query)) +
            _peak_normalise(_semantic_scores(query)) +
            _peak_normalise(_bm25_scores(query))
        )
        return list(enumerate(combined))

    return []
253
 
254
  # ==============================
255
+ # UI SEARCH
256
  # ==============================
# Search strategies offered in the UI; each name is a mode of search_engine().
search_types = [
    "Keyword", "Regex", "Boolean", "Fuzzy", "N-Gram", "Prefix", "Suffix",
    "TF-IDF", "BM25", "Semantic", "FAISS", "Hybrid",
    "Query Expansion", "Weighted Hybrid", "Ensemble",
]

search_type = st.selectbox("🔎 Search Type", search_types)
query = st.text_input("Enter query")
top_k = st.slider("Top Results", 5, 50, 10)

if st.button("Search"):
    # Whitespace-only input counts as empty: a lone space is a substring of
    # nearly every product text and would "match" the whole catalogue.
    query = query.strip()
    if not query:
        st.warning("Enter query")
    else:
        results = search_engine(query, search_type, top_k)
        # Best score first, then keep only as many rows as the slider asks for.
        results = sorted(results, key=lambda x: x[1], reverse=True)[:top_k]

        log_activity(st.session_state["user"], "Search", query, search_type)

        # Drop the -1 placeholder ids that the FAISS mode can return.
        kept = [(i, round(s, 4)) for i, s in results if i != -1]

        if kept:
            idx = [i for i, _ in kept]
            scores = [s for _, s in kept]
            out = df.iloc[idx].copy()
            out["Score"] = scores
            # "combined" is an internal search column, not for display.
            st.dataframe(out.drop(columns=["combined"]), use_container_width=True)
        else:
            st.info("No results found")
285
 
286
  # ==============================
287
  # SIDEBAR LOGS
288
  # ==============================
st.sidebar.subheader("📊 Activity Logs")

if os.path.exists(LOG_FILE):
    # The log file is written by this app at runtime; an empty or damaged
    # file must not take the whole page down.
    try:
        recent_logs = pd.read_csv(LOG_FILE).tail(10)
    except pd.errors.EmptyDataError:
        st.sidebar.write("No logs yet")
    except pd.errors.ParserError:
        st.sidebar.warning("Activity log could not be read")
    else:
        st.sidebar.dataframe(recent_logs)
else:
    st.sidebar.write("No logs yet")