Spaces:

pradeep4321
/

sample_multi_search

Sleeping

App Files Files Community

pradeep4321 committed on Mar 31

Commit

24b5168

verified ·

1 Parent(s): 828c082

Update src/app.py

Browse files

Files changed (1) hide show

src/app.py +143 -66

src/app.py CHANGED Viewed

@@ -2,7 +2,6 @@ import streamlit as st
 import pandas as pd
 import numpy as np
 import re
-import os
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sentence_transformers import SentenceTransformer
@@ -12,17 +11,9 @@ import faiss
 import nltk
 # ==============================
-# FIX NLTK (HF SAFE)
 # ==============================
-nltk_data_path = "/tmp/nltk_data"
-os.makedirs(nltk_data_path, exist_ok=True)
-nltk.data.path.append(nltk_data_path)
-try:
-    nltk.data.find('corpora/wordnet')
-except:
-    nltk.download('wordnet', download_dir=nltk_data_path)
 from nltk.corpus import wordnet
 # ==============================
@@ -34,44 +25,71 @@ st.title("🔍 Advanced Multi-Search Product Engine")
 # ==============================
 # LOAD MODEL
 # ==============================
-@st.cache_resource
-def load_model():
-    return SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
-model = load_model()
 # ==============================
-# LOAD CSV FROM REPO
 # ==============================
-@st.cache_data
-def load_data():
-    try:
-        df = pd.read_csv("src/products_10k.csv")
-        return df
-    except:
-        st.warning("⚠️ products_10k.csv not found. Using fallback data.")
-        return pd.DataFrame({
-            "product_name": ["iPhone 14 Pro", "Samsung Galaxy S23"],
-            "category": ["Mobile", "Mobile"],
-            "brand": ["Apple", "Samsung"],
-            "description": ["Latest smartphone", "Android flagship phone"]
-        })
-df = load_data()
 # ==============================
-# DATA PREVIEW
 # ==============================
 st.subheader("📄 Data Preview")
-row_limit = st.selectbox(
-    "Select number of rows to view",
-    [10, 20, 30, 50, 100],
-    index=0
-)
-st.caption(f"Showing top {row_limit} rows")
-st.dataframe(df.head(row_limit), use_container_width=True)
 # ==============================
 # COMBINE TEXT
@@ -88,18 +106,15 @@ products = df["combined"].tolist()
 # ==============================
 # PREPROCESS
 # ==============================
-@st.cache_resource
 def preprocess_data(products):
     tfidf = TfidfVectorizer()
     tfidf_matrix = tfidf.fit_transform(products)
-    embeddings = model.encode(products, batch_size=32, show_progress_bar=False)
     faiss.normalize_L2(embeddings)
-    dim = embeddings.shape[1]
-    index = faiss.IndexFlatIP(dim)
     index.add(np.array(embeddings))
     tokenized = [p.split() for p in products]
@@ -107,7 +122,11 @@ def preprocess_data(products):
     return tfidf, tfidf_matrix, embeddings, index, bm25
-@st.cache_resource
 def get_synonyms(word):
     synonyms = set()
     for syn in wordnet.synsets(word):
@@ -115,9 +134,6 @@ def get_synonyms(word):
             synonyms.add(lemma.name())
     return synonyms
-with st.spinner("⚙️ Processing data..."):
-    tfidf, tfidf_matrix, embeddings, index, bm25 = preprocess_data(products)
 # ==============================
 # SEARCH FUNCTIONS
 # ==============================
@@ -140,25 +156,35 @@ def boolean_search(q):
 def fuzzy_search(q):
     scores = [(i, fuzz.ratio(q, p)) for i, p in enumerate(products)]
-    return sorted(scores, key=lambda x: x[1], reverse=True)[:10]
 def tfidf_search(q):
     q_vec = tfidf.transform([q])
     scores = (tfidf_matrix @ q_vec.T).toarray().flatten()
-    idx = np.argsort(scores)[::-1][:10]
-    return [(i, float(scores[i])) for i in idx]
 def bm25_search(q):
     scores = bm25.get_scores(q.split())
-    idx = np.argsort(scores)[::-1][:10]
-    return [(i, float(scores[i])) for i in idx]
 def semantic_search(q):
     q_emb = model.encode([q], show_progress_bar=False)
     faiss.normalize_L2(q_emb)
     scores = np.dot(embeddings, q_emb.T).flatten()
-    idx = np.argsort(scores)[::-1][:10]
-    return [(i, float(scores[i])) for i in idx]
 def faiss_search(q):
     q_emb = model.encode([q], show_progress_bar=False)
@@ -169,20 +195,62 @@ def faiss_search(q):
 def hybrid_search(q):
     tfidf_res = dict(tfidf_search(q))
     sem_res = dict(semantic_search(q))
-    combined = {i: tfidf_res.get(i, 0) + sem_res.get(i, 0) for i in range(len(products))}
-    return sorted(combined.items(), key=lambda x: x[1], reverse=True)[:10]
 # ==============================
 # UI
 # ==============================
-search_type = st.selectbox(
-    "🔎 Select Search Type",
-    ["Keyword", "Regex", "Boolean", "Fuzzy", "TF-IDF", "BM25", "Semantic", "FAISS", "Hybrid"]
-)
 query = st.text_input("Enter your search query")
 top_k = st.slider("Top Results", 5, 20, 10)
 if st.button("Search"):
     if not query:
         st.warning("Enter query")
@@ -192,18 +260,27 @@ if st.button("Search"):
             "Regex": regex_search,
             "Boolean": boolean_search,
             "Fuzzy": fuzzy_search,
             "TF-IDF": tfidf_search,
             "BM25": bm25_search,
             "Semantic": semantic_search,
             "FAISS": faiss_search,
-            "Hybrid": hybrid_search
         }
-        results = func_map[search_type](query)[:top_k]
         indices = [i for i, _ in results]
         result_df = df.iloc[indices].copy()
-        result_df["Score"] = [score for _, score in results]
         st.subheader("🔎 Results")
-        st.dataframe(result_df, use_container_width=True)

 import pandas as pd
 import numpy as np
 import re
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sentence_transformers import SentenceTransformer
 import nltk
 # ==============================
+# NLTK FIX
 # ==============================
+nltk.download('wordnet', quiet=True)
 from nltk.corpus import wordnet
 # ==============================
 # ==============================
 # LOAD MODEL
 # ==============================
+if "model" not in st.session_state:
+    with st.spinner("Loading AI model..."):
+        st.session_state.model = SentenceTransformer(
+            'all-MiniLM-L6-v2',
+            device='cpu'
+        )
+model = st.session_state.model
 # ==============================
+# SEARCH INFO (UPDATED)
 # ==============================
+search_info = {
+    "Keyword": ("Exact match", "iphone"),
+    "Regex": ("Pattern match", "^Samsung"),
+    "Boolean": ("AND / OR logic", "nike AND shoes"),
+    "Fuzzy": ("Spelling mistakes", "iphon"),
+    "N-Gram": ("Partial word", "iph"),
+    "Prefix": ("Word starts with", "Sam"),
+    "Suffix": ("Word ends with", "phone"),
+    "TF-IDF": ("Keyword ranking", "wireless headphones"),
+    "BM25": ("Advanced ranking", "gaming laptop"),
+    "Semantic": ("Meaning search", "sports footwear"),
+    "FAISS": ("Fast semantic", "music device"),
+    "Hybrid": ("TF-IDF + Semantic", "running shoes"),
+    "Query Expansion": ("Auto synonyms", "speaker"),
+    "Weighted Hybrid": ("TF-IDF + Semantic + BM25", "best laptop"),
+    "Ensemble": ("Combine all scores", "smartphone")
+}
+# ==============================
+# FILE LOAD (KEEP YOUR LOGIC)
+# ==============================
+uploaded_file = st.file_uploader("Upload CSV", type=["csv"])
+if uploaded_file:
+    df = pd.read_csv(uploaded_file)
+else:
+    st.info("Using sample dataset")
+    df = pd.DataFrame({
+        "product_name": [
+            "iPhone 14 Pro",
+            "Samsung Galaxy S23",
+            "Nike Running Shoes",
+            "Dell Gaming Laptop",
+            "Bluetooth Speaker"
+        ],
+        "category": ["Mobile", "Mobile", "Footwear", "Laptop", "Electronics"],
+        "brand": ["Apple", "Samsung", "Nike", "Dell", "JBL"],
+        "description": [
+            "Latest smartphone",
+            "Android flagship phone",
+            "Comfort sports shoes",
+            "High performance laptop",
+            "Portable music device"
+        ]
+    })
 # ==============================
+# DATA PREVIEW CONTROL
 # ==============================
 st.subheader("📄 Data Preview")
+rows_to_show = st.selectbox("Select rows to view", [10, 20, 50, 100])
+st.dataframe(df.head(rows_to_show))
 # ==============================
 # COMBINE TEXT
 # ==============================
 # PREPROCESS
 # ==============================
+@st.cache(allow_output_mutation=True)
 def preprocess_data(products):
     tfidf = TfidfVectorizer()
     tfidf_matrix = tfidf.fit_transform(products)
+    embeddings = model.encode(products, batch_size=64, show_progress_bar=False)
     faiss.normalize_L2(embeddings)
+    index = faiss.IndexFlatIP(embeddings.shape[1])
     index.add(np.array(embeddings))
     tokenized = [p.split() for p in products]
     return tfidf, tfidf_matrix, embeddings, index, bm25
+tfidf, tfidf_matrix, embeddings, index, bm25 = preprocess_data(products)
+# ==============================
+# SYNONYMS
+# ==============================
 def get_synonyms(word):
     synonyms = set()
     for syn in wordnet.synsets(word):
             synonyms.add(lemma.name())
     return synonyms
 # ==============================
 # SEARCH FUNCTIONS
 # ==============================
 def fuzzy_search(q):
     scores = [(i, fuzz.ratio(q, p)) for i, p in enumerate(products)]
+    return sorted(scores, key=lambda x: x[1], reverse=True)
+def ngram_search(q):
+    """Return ``(index, 1)`` for every product whose text contains ``q``.
+
+    The match is a case-insensitive substring test against each entry of
+    the module-level ``products`` list; every hit gets the same score of 1.
+    """
+    # Lower-case the query once instead of once per product.
+    needle = q.lower()
+    return [(i, 1) for i, p in enumerate(products) if needle in p.lower()]
+# ✅ FIXED PREFIX (word-level)
+def prefix_search(q):
+    """Return ``(index, 1)`` for products having a word that starts with ``q``.
+
+    Each entry of the module-level ``products`` list is lower-cased and
+    split on whitespace; a product matches when any resulting word starts
+    with the lower-cased query. Every hit gets the same score of 1.
+    """
+    # Lower-case the query once; it is invariant across products and words.
+    prefix = q.lower()
+    return [(i, 1) for i, p in enumerate(products)
+            if any(word.startswith(prefix) for word in p.lower().split())]
+# ✅ FIXED SUFFIX (word-level)
+def suffix_search(q):
+    """Return ``(index, 1)`` for products having a word that ends with ``q``.
+
+    Each entry of the module-level ``products`` list is lower-cased and
+    split on whitespace; a product matches when any resulting word ends
+    with the lower-cased query. Every hit gets the same score of 1.
+    """
+    # Lower-case the query once; it is invariant across products and words.
+    suffix = q.lower()
+    return [(i, 1) for i, p in enumerate(products)
+            if any(word.endswith(suffix) for word in p.lower().split())]
 def tfidf_search(q):
     q_vec = tfidf.transform([q])
     scores = (tfidf_matrix @ q_vec.T).toarray().flatten()
+    return list(enumerate(scores))
 def bm25_search(q):
     scores = bm25.get_scores(q.split())
+    return list(enumerate(scores))
 def semantic_search(q):
     q_emb = model.encode([q], show_progress_bar=False)
     faiss.normalize_L2(q_emb)
     scores = np.dot(embeddings, q_emb.T).flatten()
+    return list(enumerate(scores))
 def faiss_search(q):
     q_emb = model.encode([q], show_progress_bar=False)
 def hybrid_search(q):
     tfidf_res = dict(tfidf_search(q))
     sem_res = dict(semantic_search(q))
+    return [(i, tfidf_res.get(i, 0) + sem_res.get(i, 0)) for i in range(len(products))]
+# ✅ IMPROVED QUERY EXPANSION
+def query_expansion_search(q):
+    """Run ``tfidf_search`` on ``q`` expanded with synonyms of each word.
+
+    The query is split on whitespace, ``get_synonyms`` is called for every
+    word, and the original words plus the new synonyms are joined with
+    spaces and passed to ``tfidf_search``, whose result is returned as-is.
+    """
+    terms = q.split()
+    expanded = list(terms)
+    # Track what is already in the query so a synonym that merely repeats a
+    # query word (or another synonym) does not inflate that term's weight.
+    seen = {term.lower() for term in terms}
+    for word in terms:
+        # sorted() only makes the expansion order reproducible.
+        for synonym in sorted(get_synonyms(word)):
+            # WordNet lemma names join multi-word entries with underscores
+            # (e.g. "loud_speaker"); the default TfidfVectorizer tokenizer
+            # keeps that as one token, which cannot match product text
+            # written with spaces, so split it back into words.
+            phrase = synonym.replace("_", " ")
+            key = phrase.lower()
+            if key not in seen:
+                seen.add(key)
+                expanded.append(phrase)
+    return tfidf_search(" ".join(expanded))
+# ✅ IMPROVED WEIGHTED HYBRID
+def weighted_hybrid(q):
+    """Blend TF-IDF, semantic and BM25 scores with weights 0.4 / 0.4 / 0.2.
+
+    Returns one ``(index, score)`` pair per entry of the module-level
+    ``products`` list. A ranker that has no score for an index contributes 0.
+    NOTE(review): the three score sets are combined exactly as the rankers
+    return them, with no rescaling between them.
+    """
+    lexical = dict(tfidf_search(q))
+    semantic = dict(semantic_search(q))
+    okapi = dict(bm25_search(q))
+    blended = []
+    for i, _ in enumerate(products):
+        score = (0.4 * lexical.get(i, 0)
+                 + 0.4 * semantic.get(i, 0)
+                 + 0.2 * okapi.get(i, 0))
+        blended.append((i, score))
+    return blended
+# ✅ FIXED ENSEMBLE (NORMALIZED)
+def ensemble_search(q):
+    """Sum the min-max-normalised TF-IDF, semantic and BM25 scores.
+
+    Each ranker's scores are rescaled to the range [0, 1] before being
+    added, so every ranker contributes on the same scale. Returns one
+    ``(index, combined_score)`` pair per scored product, or an empty list
+    when the rankers return no scores.
+    """
+    def _minmax(pairs):
+        # Rescale one ranker's scores to [0, 1].
+        values = np.array([s for _, s in pairs], dtype=float)
+        if values.size == 0:
+            # np.min / np.max raise on empty input; nothing to rank.
+            return values
+        low = values.min()
+        span = values.max() - low
+        if span <= 1e-12:
+            # All scores equal: this ranker cannot discriminate, so it
+            # contributes nothing instead of dividing by ~0.
+            return np.zeros_like(values)
+        # Unlike dividing by the maximum, this keeps the ordering intact
+        # when every score is negative (possible for cosine similarity).
+        return (values - low) / span
+
+    combined = (_minmax(tfidf_search(q))
+                + _minmax(semantic_search(q))
+                + _minmax(bm25_search(q)))
+    return list(enumerate(combined))
 # ==============================
 # UI
 # ==============================
+search_type = st.selectbox("🔎 Select Search Type", list(search_info.keys()))
+explanation, example = search_info[search_type]
+st.markdown(f"""
+### 🔍 {search_type}
+- **Explanation:** {explanation}
+- **Example:** `{example}`
+""")
 query = st.text_input("Enter your search query")
+if st.button("Try Example"):
+    query = example
+    st.success(f"Loaded: {query}")
 top_k = st.slider("Top Results", 5, 20, 10)
+# ==============================
+# SEARCH EXECUTION
+# ==============================
 if st.button("Search"):
     if not query:
         st.warning("Enter query")
             "Regex": regex_search,
             "Boolean": boolean_search,
             "Fuzzy": fuzzy_search,
+            "N-Gram": ngram_search,
+            "Prefix": prefix_search,
+            "Suffix": suffix_search,
             "TF-IDF": tfidf_search,
             "BM25": bm25_search,
             "Semantic": semantic_search,
             "FAISS": faiss_search,
+            "Hybrid": hybrid_search,
+            "Query Expansion": query_expansion_search,
+            "Weighted Hybrid": weighted_hybrid,
+            "Ensemble": ensemble_search
         }
+        results = func_map[search_type](query)
+        # Sort results
+        results = sorted(results, key=lambda x: x[1], reverse=True)[:top_k]
         indices = [i for i, _ in results]
         result_df = df.iloc[indices].copy()
+        result_df["Score"] = [round(score, 4) for _, score in results]
         st.subheader("🔎 Results")
+        st.dataframe(result_df)