pradeep4321 committed on
Commit
131fb40
·
verified ·
1 Parent(s): 0a8b71e

Update src/app.py

Browse files
Files changed (1) hide show
  1. src/app.py +93 -119
src/app.py CHANGED
@@ -1,27 +1,22 @@
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
- import re
5
  import os
6
-
 
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sentence_transformers import SentenceTransformer
9
  from rank_bm25 import BM25Okapi
10
- from rapidfuzz import fuzz
11
- import faiss
12
- import nltk
13
 
14
  # ==============================
15
- # NLTK FIX
16
  # ==============================
17
  nltk.download('wordnet', quiet=True)
18
- from nltk.corpus import wordnet
19
 
20
  # ==============================
21
- # LOG FUNCTION (FIXED POSITION)
22
  # ==============================
23
- LOG_FILE = "user_logs.csv"
24
-
25
  def log_activity(user, action, query, search_type):
26
  log_entry = {
27
  "User": user,
@@ -30,48 +25,43 @@ def log_activity(user, action, query, search_type):
30
  "Search Type": search_type,
31
  "Time": str(pd.Timestamp.now())
32
  }
33
-
34
- if os.path.exists(LOG_FILE):
35
- df_log = pd.read_csv(LOG_FILE)
36
- df_log = pd.concat([df_log, pd.DataFrame([log_entry])])
37
- else:
38
- df_log = pd.DataFrame([log_entry])
39
-
40
- df_log.to_csv(LOG_FILE, index=False)
 
41
 
42
  # ==============================
43
- # AUTHENTICATION
44
  # ==============================
45
  def login():
46
  st.title("πŸ” Login Required")
 
 
 
 
 
47
 
48
  username = st.text_input("Username")
49
  password = st.text_input("Password", type="password")
50
 
51
  if st.button("Login"):
52
- try:
53
- if (
54
- username == st.secrets["USERNAME"] and
55
- password == st.secrets["PASSWORD"]
56
- ):
57
- st.session_state["authenticated"] = True
58
- st.session_state["user"] = username
59
- st.session_state["login_time"] = pd.Timestamp.now()
60
-
61
- log_activity(username, "Login Success", "-", "-")
62
-
63
- st.success("βœ… Login successful")
64
- st.rerun()
65
- else:
66
- log_activity(username, "Login Failed", "-", "-")
67
- st.error("❌ Invalid credentials")
68
-
69
- except Exception:
70
- st.error("⚠️ Secrets not configured properly")
71
 
72
- # ==============================
73
- # SESSION CONTROL
74
- # ==============================
75
  if "authenticated" not in st.session_state:
76
  st.session_state["authenticated"] = False
77
 
@@ -80,124 +70,108 @@ if not st.session_state["authenticated"]:
80
  st.stop()
81
 
82
  # ==============================
83
- # PAGE CONFIG
84
  # ==============================
85
  st.set_page_config(page_title="Multi Search Engine", layout="wide")
86
  st.title("πŸ” Advanced Multi-Search Product Engine")
87
 
88
- # Sidebar info
89
  st.sidebar.success(f"πŸ‘€ User: {st.session_state['user']}")
90
- st.sidebar.info(f"πŸ•’ Login: {st.session_state['login_time']}")
91
-
92
- # Logout button
93
  if st.sidebar.button("πŸšͺ Logout"):
94
  log_activity(st.session_state["user"], "Logout", "-", "-")
95
  st.session_state.clear()
96
  st.rerun()
97
 
98
  # ==============================
99
- # LOAD MODEL
100
  # ==============================
101
  @st.cache_resource
102
  def load_model():
103
  return SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
104
 
105
- model = load_model()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
- # ==============================
108
- # LOAD DATA
109
- # ==============================
110
- try:
111
- df = pd.read_csv("src/products_10k.csv")
112
- st.success("βœ… Data loaded successfully (10K Products)")
113
- except Exception as e:
114
- st.error(f"❌ Error loading file: {e}")
115
- st.stop()
116
 
117
  # ==============================
118
- # PREPROCESS
119
  # ==============================
120
- df["combined"] = (
121
- df["product_name"].astype(str) + " " +
122
- df["category"].astype(str) + " " +
123
- df["brand"].astype(str) + " " +
124
- df["description"].astype(str)
125
- )
126
-
127
- products = df["combined"].tolist()
128
-
129
  @st.cache_resource
130
- def preprocess_data(products):
 
131
  tfidf = TfidfVectorizer()
132
  tfidf_matrix = tfidf.fit_transform(products)
133
-
 
134
  embeddings = model.encode(products, show_progress_bar=False)
135
  faiss.normalize_L2(embeddings)
136
-
137
  index = faiss.IndexFlatIP(embeddings.shape[1])
138
  index.add(np.array(embeddings))
139
-
140
- tokenized = [p.split() for p in products]
 
141
  bm25 = BM25Okapi(tokenized)
142
-
143
  return tfidf, tfidf_matrix, embeddings, index, bm25
144
 
145
- tfidf, tfidf_matrix, embeddings, index, bm25 = preprocess_data(products)
 
146
 
147
  # ==============================
148
  # SEARCH FUNCTIONS
149
  # ==============================
150
- def keyword_search(q):
151
- return [(i, 1) for i, p in enumerate(products) if q.lower() in p.lower()]
152
-
153
- def semantic_search(q):
154
- q_emb = model.encode([q])
155
- faiss.normalize_L2(q_emb)
156
- scores = np.dot(embeddings, q_emb.T).flatten()
157
- return list(enumerate(scores))
 
 
 
158
 
159
  # ==============================
160
- # UI
161
  # ==============================
162
  search_type = st.selectbox("πŸ”Ž Search Type", ["Keyword", "Semantic"])
163
- query = st.text_input("Enter your search query")
164
- top_k = st.slider("Top Results", 5, 20, 10)
165
-
166
- # ==============================
167
- # SEARCH
168
- # ==============================
169
- if st.button("Search"):
170
- if not query:
171
- st.warning("Enter query")
 
 
 
 
 
172
  else:
173
- if search_type == "Keyword":
174
- results = keyword_search(query)
175
- else:
176
- results = semantic_search(query)
177
-
178
- results = sorted(results, key=lambda x: x[1], reverse=True)[:top_k]
179
-
180
- # βœ… LOG SEARCH
181
- log_activity(
182
- st.session_state["user"],
183
- "Search",
184
- query,
185
- search_type
186
- )
187
-
188
- indices = [i for i, _ in results]
189
- result_df = df.iloc[indices].copy()
190
- result_df["Score"] = [round(score, 4) for _, score in results]
191
-
192
- st.dataframe(result_df)
193
 
194
  # ==============================
195
- # VIEW LOGS
196
  # ==============================
197
- st.sidebar.subheader("πŸ“Š User Logs")
198
-
199
  if os.path.exists(LOG_FILE):
200
- log_df = pd.read_csv(LOG_FILE)
201
- st.sidebar.dataframe(log_df.tail(10))
202
- else:
203
- st.sidebar.write("No logs yet")
 
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
 
4
  import os
5
+ import faiss
6
+ import nltk
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sentence_transformers import SentenceTransformer
9
  from rank_bm25 import BM25Okapi
 
 
 
10
 
11
  # ==============================
12
# INITIALIZATION & NLTK
# ==============================
# Streamlit re-executes this script on every user interaction, so only
# contact the NLTK download server when the WordNet corpus cannot already be
# located on disk. If the lookup fails for any reason we fall back to the
# original unconditional (quiet) download.
try:
    nltk.data.find("corpora/wordnet")
except LookupError:
    nltk.download('wordnet', quiet=True)

# CSV file that the activity logger writes to and the sidebar reads back.
LOG_FILE = "user_logs.csv"
16
 
17
  # ==============================
18
+ # LOGGING UTILITY
19
  # ==============================
 
 
20
  def log_activity(user, action, query, search_type):
21
  log_entry = {
22
  "User": user,
 
25
  "Search Type": search_type,
26
  "Time": str(pd.Timestamp.now())
27
  }
28
+ try:
29
+ if os.path.exists(LOG_FILE):
30
+ df_log = pd.read_csv(LOG_FILE)
31
+ df_log = pd.concat([df_log, pd.DataFrame([log_entry])], ignore_index=True)
32
+ else:
33
+ df_log = pd.DataFrame([log_entry])
34
+ df_log.to_csv(LOG_FILE, index=False)
35
+ except Exception:
36
+ pass # Prevent app crash on logging errors
37
 
38
  # ==============================
39
# SECRETS & AUTHENTICATION
# ==============================
def _read_secret(name):
    """Return the secret *name* from the environment, else Streamlit secrets.

    Returns None when the secret is not configured in either place.
    """
    value = os.environ.get(name)
    if value:
        return value
    try:
        return st.secrets.get(name)
    except Exception:
        # Accessing st.secrets can raise when no secrets file exists, and the
        # exception type differs between Streamlit releases. Treat any
        # failure as "not configured" so the caller can show a clear message
        # instead of a traceback.
        return None


def _constant_time_equal(supplied, expected):
    """Compare two values as UTF-8 text without leaking where they differ."""
    import hmac  # local import: only needed for credential checks

    return hmac.compare_digest(
        str(supplied).encode("utf-8"),
        str(expected).encode("utf-8"),
    )


def login():
    """Render the login form and authenticate the submitted credentials.

    Expected credentials are read from the USERNAME / PASSWORD environment
    variables first and from ``st.secrets`` second. On success the session is
    marked authenticated, the user name and login time are stored in
    ``st.session_state``, the event is logged and the script is rerun. On
    failure the attempt is logged and an error is shown.

    NOTE(review): on Windows, ``USERNAME`` is normally a built-in environment
    variable holding the OS account name, so a local run there would never
    fall through to ``st.secrets`` - confirm whether a less generic variable
    name is wanted.
    """
    st.title("πŸ” Login Required")

    # Hugging Face exposes secrets as environment variables; local runs may
    # use Streamlit's secrets file instead.
    expected_user = _read_secret("USERNAME")
    expected_pass = _read_secret("PASSWORD")

    username = st.text_input("Username")
    password = st.text_input("Password", type="password")

    if not st.button("Login"):
        return

    if not expected_user or not expected_pass:
        st.error("⚠️ Secrets not configured! Add USERNAME and PASSWORD in Hugging Face Settings.")
        return

    # Evaluate both comparisons unconditionally so the response time does not
    # reveal whether the user name alone was correct.
    user_ok = _constant_time_equal(username, expected_user)
    pass_ok = _constant_time_equal(password, expected_pass)

    if user_ok and pass_ok:
        st.session_state["authenticated"] = True
        st.session_state["user"] = username
        st.session_state["login_time"] = pd.Timestamp.now()
        log_activity(username, "Login Success", "-", "-")
        st.rerun()
    else:
        log_activity(username, "Login Failed", "-", "-")
        st.error("❌ Invalid credentials")
 
 
 
 
 
 
 
 
64
 
 
 
 
65
  if "authenticated" not in st.session_state:
66
  st.session_state["authenticated"] = False
67
 
 
70
  st.stop()
71
 
72
  # ==============================
73
# PAGE CONFIG & UI
# ==============================
# Page-level settings and the heading of the authenticated view.
st.set_page_config(layout="wide", page_title="Multi Search Engine")
st.title("πŸ” Advanced Multi-Search Product Engine")

# Sidebar: show who is signed in and offer a logout control.
_signed_in_user = st.session_state["user"]
st.sidebar.success(f"πŸ‘€ User: {_signed_in_user}")

_logout_clicked = st.sidebar.button("πŸšͺ Logout")
if _logout_clicked:
    # Record the logout before the session (and the user name in it) is wiped.
    log_activity(_signed_in_user, "Logout", "-", "-")
    st.session_state.clear()
    st.rerun()
83
 
84
  # ==============================
85
# DATA LOADING & CACHING
# ==============================
@st.cache_resource
def load_model():
    """Create the sentence-embedding model on CPU.

    Wrapped in ``st.cache_resource`` so the model object is built once and
    reused rather than re-created on every script rerun.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
    return encoder
90
 
91
@st.cache_data
def _read_products(path):
    """Read the product CSV at *path* and add the ``combined`` search column.

    ``combined`` is the space-separated text of the product_name, category,
    brand and description columns.

    Raises:
        KeyError: if any of the four text columns is missing from the file.
    """
    df = pd.read_csv(path)

    text_columns = ["product_name", "category", "brand", "description"]
    missing = [col for col in text_columns if col not in df.columns]
    if missing:
        raise KeyError(f"Product file {path} is missing columns: {missing}")

    def as_text(value):
        # Missing cells become "" (not the literal "nan"); everything else is
        # stringified so numeric columns cannot break the concatenation.
        return "" if pd.isna(value) else str(value)

    parts = [df[col].map(as_text) for col in text_columns]
    df["combined"] = parts[0].str.cat(parts[1:], sep=" ")
    return df


def load_data():
    """Return the product DataFrame, or None when the data file is absent.

    The existence check deliberately lives outside the cached reader: a
    missing-file result must not be memoised, otherwise the app would keep
    reporting the file as missing after it has been added.
    """
    path = "src/products_10k.csv"
    if not os.path.exists(path):
        st.error(f"Missing data file at {path}")
        return None
    return _read_products(path)
106
 
107
# Embedding model and product table used by everything below.
model = load_model()
df = load_data()

# Without product data there is nothing to search; end this script run here.
if df is None:
    st.stop()
 
 
 
 
 
 
110
 
111
  # ==============================
112
# SEARCH PRE-PROCESSING
# ==============================
@st.cache_resource
def get_search_assets(products):
    """Build the retrieval structures for the given product texts.

    Args:
        products: list of product text strings (one per product).

    Returns:
        A 5-tuple ``(tfidf, tfidf_matrix, embeddings, index, bm25)``: the
        fitted TF-IDF vectorizer, its document-term matrix, the L2-normalised
        sentence embeddings, a FAISS inner-product index over those
        embeddings, and a BM25 index over lower-cased whitespace tokens.
    """
    # Lexical representation: TF-IDF.
    vectorizer = TfidfVectorizer()
    doc_term_matrix = vectorizer.fit_transform(products)

    # Dense representation: embeddings are normalised in place, so the
    # inner-product index scores by cosine similarity.
    vectors = model.encode(products, show_progress_bar=False)
    faiss.normalize_L2(vectors)
    dimension = vectors.shape[1]
    ip_index = faiss.IndexFlatIP(dimension)
    ip_index.add(np.array(vectors))

    # BM25 over lower-cased, whitespace-split tokens.
    token_lists = [text.lower().split() for text in products]
    bm25_index = BM25Okapi(token_lists)

    return vectorizer, doc_term_matrix, vectors, ip_index, bm25_index
131
 
132
# Plain-Python list of the combined product texts, in DataFrame row order.
products_list = df["combined"].tolist()

# Build (or fetch from cache) every search structure for those texts.
_search_assets = get_search_assets(products_list)
tfidf, tf_matrix, embs, faiss_index, bm25 = _search_assets
134
 
135
  # ==============================
136
  # SEARCH FUNCTIONS
137
  # ==============================
138
def run_search(q, mode, k, products=None):
    """Return up to *k* ``(row_index, score)`` pairs matching the query *q*.

    Args:
        q: the user's query string. Surrounding whitespace is ignored; a
            blank query yields no results.
        mode: ``"Keyword"`` for case-insensitive substring matching; any
            other value runs the semantic (embedding) search.
        k: maximum number of results. Values below 1 yield no results.
        products: optional list of texts to scan in keyword mode. Defaults to
            the module-level ``products_list``.

    Returns:
        A list of ``(int, float)`` tuples. Keyword hits all score ``1.0`` and
        come in corpus order; semantic hits carry the index's similarity
        score in the order the index returns them.
    """
    from itertools import islice  # local import keeps the block self-contained

    limit = max(int(k), 0)
    needle = q.strip()
    if not needle or limit == 0:
        return []

    if mode == "Keyword":
        corpus = products_list if products is None else products
        needle = needle.lower()  # hoisted: invariant across the whole scan
        hits = (
            (position, 1.0)
            for position, text in enumerate(corpus)
            if needle in text.lower()
        )
        # islice stops scanning as soon as `limit` matches have been found.
        return list(islice(hits, limit))

    # Semantic search using the FAISS index.
    q_emb = model.encode([needle])
    faiss.normalize_L2(q_emb)
    scores, indices = faiss_index.search(np.array(q_emb), limit)
    # The index reports -1 for slots it could not fill; drop those and return
    # plain Python numbers so both modes produce the same result types.
    return [
        (int(position), float(score))
        for position, score in zip(indices[0], scores[0])
        if position != -1
    ]
149
 
150
  # ==============================
151
# MAIN APP EXECUTION
# ==============================
# Search controls: mode, free-text query and how many rows to display.
search_type = st.selectbox("πŸ”Ž Search Type", ["Keyword", "Semantic"])
query = st.text_input("Search for products...")
top_k = st.slider("Results to show", 5, 50, 10)

if st.button("Search"):
    if not query.strip():
        # Tell the user why nothing happened instead of silently ignoring
        # the click.
        st.warning("Enter query")
    else:
        results = run_search(query, search_type, top_k)
        log_activity(st.session_state["user"], "Search", query, search_type)

        # A row index of -1 marks an unfilled result slot. Filter those out
        # *before* deciding whether anything was found, so an all-placeholder
        # result shows the "no matches" message rather than an empty table.
        hits = [(row, score) for row, score in results if row != -1]

        if hits:
            idx = [row for row, _ in hits]
            scores = [score for _, score in hits]

            final_df = df.iloc[idx].copy()
            final_df["Match Score"] = scores
            # "combined" is an internal search column, not something to display.
            st.dataframe(final_df.drop(columns=["combined"]), use_container_width=True)
        else:
            st.info("No matching products found.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
 
171
  # ==============================
172
+ # SIDEBAR LOGS
173
  # ==============================
174
st.sidebar.markdown("---")
st.sidebar.subheader("πŸ“Š Recent Activity")
# NOTE(review): this shows the last log rows to whoever is signed in,
# regardless of which user produced them - confirm that is intended.
if os.path.exists(LOG_FILE):
    try:
        recent_activity = pd.read_csv(LOG_FILE).tail(5)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError, OSError):
        # An empty, truncated or otherwise unreadable log file must not take
        # the whole page down; the log view is a convenience only.
        st.sidebar.caption("Activity log is currently unreadable.")
    else:
        st.sidebar.dataframe(recent_activity)