Spaces:

pradeep4321
/

sample_multi_search

Sleeping

App Files Files Community

pradeep4321 committed on Apr 2

Commit

ca8f7bb

verified ·

1 Parent(s): 131fb40

Update src/app.py

Browse files

Files changed (1) hide show

src/app.py +172 -56

src/app.py CHANGED Viewed

@@ -2,20 +2,24 @@ import streamlit as st
 import pandas as pd
 import numpy as np
 import os
 import faiss
 import nltk
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sentence_transformers import SentenceTransformer
 from rank_bm25 import BM25Okapi
 # ==============================
-# INITIALIZATION & NLTK
 # ==============================
 nltk.download('wordnet', quiet=True)
 LOG_FILE = "user_logs.csv"
 # ==============================
-# LOGGING UTILITY
 # ==============================
 def log_activity(user, action, query, search_type):
     log_entry = {
@@ -32,17 +36,15 @@ def log_activity(user, action, query, search_type):
         else:
             df_log = pd.DataFrame([log_entry])
         df_log.to_csv(LOG_FILE, index=False)
-    except Exception:
-        pass # Prevent app crash on logging errors
 # ==============================
-# SECRETS & AUTHENTICATION
 # ==============================
 def login():
     st.title("🔐 Login Required")
-    # Hugging Face exposes secrets as environment variables
-    # We check both os.environ (Cloud) and st.secrets (Local)
     HF_USER = os.environ.get("USERNAME") or st.secrets.get("USERNAME")
     HF_PASS = os.environ.get("PASSWORD") or st.secrets.get("PASSWORD")
@@ -51,11 +53,12 @@ def login():
     if st.button("Login"):
         if not HF_USER or not HF_PASS:
-            st.error("⚠️ Secrets not configured! Add USERNAME and PASSWORD in Hugging Face Settings.")
         elif username == HF_USER and password == HF_PASS:
             st.session_state["authenticated"] = True
             st.session_state["user"] = username
             st.session_state["login_time"] = pd.Timestamp.now()
             log_activity(username, "Login Success", "-", "-")
             st.rerun()
         else:
@@ -70,108 +73,221 @@ if not st.session_state["authenticated"]:
     st.stop()
 # ==============================
-# PAGE CONFIG & UI
 # ==============================
 st.set_page_config(page_title="Multi Search Engine", layout="wide")
 st.title("🔍 Advanced Multi-Search Product Engine")
-st.sidebar.success(f"👤 User: {st.session_state['user']}")
 if st.sidebar.button("🚪 Logout"):
     log_activity(st.session_state["user"], "Logout", "-", "-")
     st.session_state.clear()
     st.rerun()
 # ==============================
-# DATA LOADING & CACHING
 # ==============================
 @st.cache_resource
 def load_model():
     return SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
 @st.cache_data
 def load_data():
     path = "src/products_10k.csv"
     if not os.path.exists(path):
-        st.error(f"Missing data file at {path}")
         return None
     df = pd.read_csv(path)
-    # Fill NaN values to prevent search errors
     df["combined"] = (
         df["product_name"].fillna("") + " " +
         df["category"].fillna("") + " " +
         df["brand"].fillna("") + " " +
         df["description"].fillna("")
     )
     return df
-model = load_model()
 df = load_data()
-if df is None: st.stop()
 # ==============================
-# SEARCH PRE-PROCESSING
 # ==============================
 @st.cache_resource
-def get_search_assets(products):
-    # TF-IDF
     tfidf = TfidfVectorizer()
     tfidf_matrix = tfidf.fit_transform(products)
-    # Semantic/FAISS
     embeddings = model.encode(products, show_progress_bar=False)
     faiss.normalize_L2(embeddings)
     index = faiss.IndexFlatIP(embeddings.shape[1])
     index.add(np.array(embeddings))
-    # BM25
-    tokenized = [p.lower().split() for p in products]
-    bm25 = BM25Okapi(tokenized)
     return tfidf, tfidf_matrix, embeddings, index, bm25
-products_list = df["combined"].tolist()
-tfidf, tf_matrix, embs, faiss_index, bm25 = get_search_assets(products_list)
 # ==============================
-# SEARCH FUNCTIONS
 # ==============================
-def run_search(q, mode, k):
     if mode == "Keyword":
-        # Simple boolean check for exact matches
-        matches = [(i, 1.0) for i, p in enumerate(products_list) if q.lower() in p.lower()]
-        return matches[:k]
-    else:
-        # Semantic search using FAISS
-        q_emb = model.encode([q])
         faiss.normalize_L2(q_emb)
-        scores, indices = faiss_index.search(np.array(q_emb), k)
-        return list(zip(indices[0], scores[0]))
 # ==============================
-# MAIN APP EXECUTION
 # ==============================
-search_type = st.selectbox("🔎 Search Type", ["Keyword", "Semantic"])
-query = st.text_input("Search for products...")
-top_k = st.slider("Results to show", 5, 50, 10)
-if st.button("Search") and query:
-    results = run_search(query, search_type, top_k)
-    log_activity(st.session_state["user"], "Search", query, search_type)
-    if results:
-        idx = [r[0] for r in results if r[0] != -1]
-        scores = [r[1] for r in results if r[0] != -1]
-        final_df = df.iloc[idx].copy()
-        final_df["Match Score"] = scores
-        st.dataframe(final_df.drop(columns=["combined"]), use_container_width=True)
     else:
-        st.info("No matching products found.")
 # ==============================
 # SIDEBAR LOGS
 # ==============================
-st.sidebar.markdown("---")
-st.sidebar.subheader("📊 Recent Activity")
 if os.path.exists(LOG_FILE):
-    st.sidebar.dataframe(pd.read_csv(LOG_FILE).tail(5))

 import pandas as pd
 import numpy as np
 import os
+import re
 import faiss
 import nltk
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sentence_transformers import SentenceTransformer
 from rank_bm25 import BM25Okapi
+from rapidfuzz import fuzz
+from nltk.corpus import wordnet
 # ==============================
+# INITIAL SETUP
 # ==============================
 nltk.download('wordnet', quiet=True)
 LOG_FILE = "user_logs.csv"
 # ==============================
+# LOGGING FUNCTION
 # ==============================
 def log_activity(user, action, query, search_type):
     log_entry = {
         else:
             df_log = pd.DataFrame([log_entry])
         df_log.to_csv(LOG_FILE, index=False)
+    except:
+        pass
 # ==============================
+# AUTHENTICATION
 # ==============================
 def login():
     st.title("🔐 Login Required")
     HF_USER = os.environ.get("USERNAME") or st.secrets.get("USERNAME")
     HF_PASS = os.environ.get("PASSWORD") or st.secrets.get("PASSWORD")
     if st.button("Login"):
         if not HF_USER or not HF_PASS:
+            st.error("⚠️ Secrets not configured!")
         elif username == HF_USER and password == HF_PASS:
             st.session_state["authenticated"] = True
             st.session_state["user"] = username
             st.session_state["login_time"] = pd.Timestamp.now()
             log_activity(username, "Login Success", "-", "-")
             st.rerun()
         else:
     st.stop()
 # ==============================
+# PAGE CONFIG
 # ==============================
 st.set_page_config(page_title="Multi Search Engine", layout="wide")
 st.title("🔍 Advanced Multi-Search Product Engine")
+st.sidebar.success(f"👤 {st.session_state['user']}")
 if st.sidebar.button("🚪 Logout"):
     log_activity(st.session_state["user"], "Logout", "-", "-")
     st.session_state.clear()
     st.rerun()
 # ==============================
+# LOAD MODEL
 # ==============================
 @st.cache_resource
 def load_model():
     """Create the sentence-embedding model used by the vector search modes.

     The function is wrapped in ``st.cache_resource`` and always builds the
     ``all-MiniLM-L6-v2`` SentenceTransformer pinned to the CPU.
     """
     encoder = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
     return encoder
+model = load_model()
+# ==============================
+# LOAD DATA
+# ==============================
 @st.cache_data
 def load_data():
     """Load the product catalogue and build the searchable text column.

     Returns:
         The DataFrame read from ``src/products_10k.csv`` with an extra
         ``combined`` column holding product name, category, brand and
         description joined by single spaces, or ``None`` (after showing an
         error in the app) when the file does not exist.
     """
     path = "src/products_10k.csv"
     if not os.path.exists(path):
         st.error("Dataset not found!")
         return None
     df = pd.read_csv(path)

     text_columns = ["product_name", "category", "brand", "description"]
     # Missing cells become empty strings. The astype(str) cast keeps the
     # concatenation from raising TypeError when pandas parsed one of the
     # columns as numbers instead of text.
     parts = [df[col].fillna("").astype(str) for col in text_columns]
     combined = parts[0]
     for part in parts[1:]:
         combined = combined + " " + part
     df["combined"] = combined
     return df
 df = load_data()
+if df is None:
+    st.stop()
+# ==============================
+# DATA PREVIEW
+# ==============================
+st.subheader("📄 Data Preview")
+rows = st.selectbox("Rows to view", [10, 20, 50, 100])
+st.dataframe(df.head(rows))
+products = df["combined"].tolist()
 # ==============================
+# PREPROCESS
 # ==============================
 @st.cache_resource
 def preprocess(products):
     """Build the search structures for a list of product texts.

     Args:
         products: list of strings, one combined text per product.

     Returns:
         A 5-tuple ``(vectorizer, tfidf_matrix, embeddings, faiss_index, bm25)``:
         the fitted TF-IDF vectorizer, its document-term matrix, the
         L2-normalised sentence embeddings, an inner-product FAISS index over
         those embeddings, and a BM25 index over lowercase whitespace tokens.
     """
     # Sparse lexical representation.
     vectorizer = TfidfVectorizer()
     doc_term_matrix = vectorizer.fit_transform(products)

     # Dense representation; normalize_L2 works in place, so the inner
     # product used by the index below is a cosine similarity.
     vectors = model.encode(products, show_progress_bar=False)
     faiss.normalize_L2(vectors)
     dimension = vectors.shape[1]
     ip_index = faiss.IndexFlatIP(dimension)
     ip_index.add(np.array(vectors))

     # BM25 over lowercase, whitespace-split tokens.
     token_lists = [text.lower().split() for text in products]
     bm25_index = BM25Okapi(token_lists)

     return vectorizer, doc_term_matrix, vectors, ip_index, bm25_index
+tfidf, tf_matrix, embs, faiss_index, bm25 = preprocess(products)
+# ==============================
+# SYNONYMS
+# ==============================
def get_synonyms(word):
    """Return the WordNet synonyms of *word* as plain search terms.

    Every lemma name of every synset found for *word* is collected.
    WordNet writes multi-word lemmas with underscores (``ice_cream``); they
    are converted to spaces so the terms can be matched against ordinary
    text. Duplicates are removed and the result is sorted so that repeated
    calls return the same list.

    Args:
        word: the term to look up.

    Returns:
        A sorted list of unique synonym strings (empty when WordNet has no
        entry for *word*).
    """
    names = {
        lemma.name().replace("_", " ")
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }
    return sorted(names)
 # ==============================
+# SEARCH ENGINE
 # ==============================
def _boolean_groups(query):
    """Parse a Boolean query into OR-groups of lowercase AND-terms."""
    groups = []
    # Operators only count as standalone upper-case words, so product words
    # such as "BRAND" or "ORANGE" are not split in the middle.
    for clause in re.split(r"(?:^|\s+)OR(?:\s+|$)", query.strip()):
        pieces = re.split(r"(?:^|\s+)AND(?:\s+|$)", clause.strip())
        terms = [piece.strip().lower() for piece in pieces if piece.strip()]
        if terms:
            groups.append(terms)
    return groups


def _tfidf_scores(query):
    """Return one TF-IDF similarity score per product as a flat array."""
    return (tf_matrix @ tfidf.transform([query]).T).toarray().flatten()


def _bm25_scores(query):
    """Return one BM25 score per product for the lowercase query tokens."""
    return np.asarray(bm25.get_scores(query.lower().split()))


def _semantic_scores(query):
    """Return one embedding dot-product score per product as a flat array."""
    q_emb = model.encode([query])
    faiss.normalize_L2(q_emb)
    return np.dot(embs, q_emb.T).flatten()


def search_engine(query, mode, top_k):
    """Run *query* against the product texts using the given search *mode*.

    Args:
        query: the user's search string.
        mode: one of "Keyword", "Regex", "Boolean", "Fuzzy", "N-Gram",
            "Prefix", "Suffix", "TF-IDF", "BM25", "Semantic", "FAISS",
            "Hybrid", "Query Expansion", "Weighted Hybrid" or "Ensemble".
        top_k: number of neighbours requested from the FAISS index; the
            other modes do not use it and return unsorted, untruncated
            results (only "Fuzzy" sorts), so the caller ranks and cuts.

    Returns:
        A list of ``(product_index, score)`` tuples. Matching modes return
        only the matching products with score 1; scoring modes return one
        entry per product. An unknown mode returns an empty list.
    """
    needle = query.lower()  # hoisted: the original lowered it per product

    if mode == "Keyword":
        return [(i, 1) for i, text in enumerate(products)
                if needle in text.lower()]

    if mode == "Regex":
        # An invalid pattern typed by the user must not crash the app; fall
        # back to searching for the text literally.
        # NOTE(review): user-supplied patterns can still be expensive to run.
        try:
            pattern = re.compile(query, re.IGNORECASE)
        except re.error:
            pattern = re.compile(re.escape(query), re.IGNORECASE)
        return [(i, 1) for i, text in enumerate(products)
                if pattern.search(text)]

    if mode == "Boolean":
        # OR binds weaker than AND: "a AND b OR c" means (a and b) or c.
        # A query without operators is a single required term.
        groups = _boolean_groups(query)
        if not groups:
            return []
        matches = []
        for i, text in enumerate(products):
            lowered = text.lower()
            if any(all(term in lowered for term in group) for group in groups):
                matches.append((i, 1))
        return matches

    if mode == "Fuzzy":
        # Compare case-insensitively, like the other string modes.
        scored = [(i, fuzz.ratio(needle, text.lower()))
                  for i, text in enumerate(products)]
        return sorted(scored, key=lambda pair: pair[1], reverse=True)

    if mode == "N-Gram":
        return [(i, 1) for i, text in enumerate(products)
                if any(needle in word for word in text.lower().split())]

    if mode == "Prefix":
        return [(i, 1) for i, text in enumerate(products)
                if any(word.startswith(needle) for word in text.lower().split())]

    if mode == "Suffix":
        return [(i, 1) for i, text in enumerate(products)
                if any(word.endswith(needle) for word in text.lower().split())]

    if mode == "TF-IDF":
        return list(enumerate(_tfidf_scores(query)))

    if mode == "BM25":
        return list(enumerate(_bm25_scores(query)))

    if mode == "Semantic":
        return list(enumerate(_semantic_scores(query)))

    if mode == "FAISS":
        q_emb = model.encode([query])
        faiss.normalize_L2(q_emb)
        distances, ids = faiss_index.search(np.array(q_emb), top_k)
        # FAISS pads with -1 when it has fewer than top_k vectors to return.
        return [(int(i), float(d))
                for i, d in zip(ids[0], distances[0]) if i != -1]

    if mode == "Hybrid":
        return list(enumerate(_tfidf_scores(query) + _semantic_scores(query)))

    if mode == "Query Expansion":
        expanded = query.split()
        for word in query.split():
            # WordNet joins multi-word lemmas with "_"; use spaces so the
            # TF-IDF tokenizer sees the individual words.
            expanded += [syn.replace("_", " ") for syn in get_synonyms(word)]
        return list(enumerate(_tfidf_scores(" ".join(expanded))))

    if mode == "Weighted Hybrid":
        # NOTE(review): BM25 scores are not on the same scale as the two
        # similarity scores; the weights are kept as in the original.
        combined = (
            0.4 * _tfidf_scores(query) +
            0.4 * _semantic_scores(query) +
            0.2 * _bm25_scores(query)
        )
        return list(enumerate(combined))

    if mode == "Ensemble":
        tfidf_s = _tfidf_scores(query)
        sem_s = _semantic_scores(query)
        bm25_s = _bm25_scores(query)
        # Each signal is scaled by its own maximum before summing; the small
        # constant avoids division by zero when a signal is all zeros.
        combined = (
            tfidf_s / (np.max(tfidf_s) + 1e-6) +
            sem_s / (np.max(sem_s) + 1e-6) +
            bm25_s / (np.max(bm25_s) + 1e-6)
        )
        return list(enumerate(combined))

    return []
 # ==============================
+# UI SEARCH
 # ==============================
# Mode names offered in the drop-down; search_engine() dispatches on these
# exact strings.
search_types = [
    "Keyword","Regex","Boolean","Fuzzy","N-Gram","Prefix","Suffix",
    "TF-IDF","BM25","Semantic","FAISS","Hybrid",
    "Query Expansion","Weighted Hybrid","Ensemble"
]
search_type = st.selectbox("🔎 Search Type", search_types)
query = st.text_input("Enter query")
top_k = st.slider("Top Results", 5, 50, 10)
if st.button("Search"):
    # A whitespace-only query is treated like an empty one.
    if not query.strip():
        st.warning("Enter query")
    else:
        results = search_engine(query, search_type, top_k)
        log_activity(st.session_state["user"], "Search", query, search_type)
        # Scoring modes return an entry for every product, so rows with a
        # non-positive score are not matches; dropping them (and the -1
        # padding FAISS can return) lets "No results found" actually appear.
        hits = [(i, s) for i, s in results if i != -1 and s > 0]
        hits = sorted(hits, key=lambda x: x[1], reverse=True)[:top_k]
        if hits:
            idx = [i for i, _ in hits]
            scores = [round(s, 4) for _, s in hits]
            out = df.iloc[idx].copy()
            out["Score"] = scores
            st.dataframe(out.drop(columns=["combined"]), use_container_width=True)
        else:
            st.info("No results found")
 # ==============================
 # SIDEBAR LOGS
 # ==============================
# Sidebar panel: show the last ten rows of the activity log, or a
# placeholder when the log file has not been created yet.
st.sidebar.subheader("📊 Activity Logs")
if not os.path.exists(LOG_FILE):
    st.sidebar.write("No logs yet")
else:
    recent_activity = pd.read_csv(LOG_FILE).tail(10)
    st.sidebar.dataframe(recent_activity)