Spaces:

pradeep4321
/

sample_multi_search

Sleeping

App Files Files Community

pradeep4321 committed on Mar 31

Commit

f86ae3e

verified ·

1 Parent(s): 3d14b52

Update src/app.py

Browse files

Files changed (1) hide show

src/app.py +25 -112

src/app.py CHANGED Viewed

@@ -12,7 +12,7 @@ import faiss
 import nltk
 # ==============================
-# FIX NLTK (HUGGINGFACE SAFE)
 # ==============================
 nltk_data_path = "/tmp/nltk_data"
 os.makedirs(nltk_data_path, exist_ok=True)
@@ -41,69 +41,33 @@ def load_model():
 model = load_model()
 # ==============================
-# SEARCH INFO
 # ==============================
-search_info = {
-    "Keyword": ("Find exact word match", "iphone → iPhone"),
-    "Regex": ("Pattern-based search", "^S → Samsung"),
-    "Boolean": ("Use AND / OR", "nike AND shoes"),
-    "Fuzzy": ("Handles spelling mistakes", "iphon → iPhone"),
-    "N-Gram": ("Partial word match", "iph → iPhone"),
-    "Prefix": ("Starts with query", "app → Apple"),
-    "Suffix": ("Ends with query", "laptop → Dell Laptop"),
-    "TF-IDF": ("Ranks important words", "wireless headphones"),
-    "BM25": ("Advanced keyword ranking", "gaming laptop"),
-    "Semantic": ("Understands meaning", "sports footwear"),
-    "FAISS": ("Fast semantic search", "music device"),
-    "Hybrid": ("Keyword + meaning", "sports shoes"),
-    "Query Expansion": ("Adds similar words", "speaker → audio"),
-    "Weighted Hybrid": ("Weighted ranking", "better accuracy"),
-    "Ensemble": ("Combine all methods", "best results")
-}
-# ==============================
-# DATA SOURCE (NO UPLOAD)
-# ==============================
-data_option = st.radio("📂 Choose Data Source", ["Sample Data", "Default CSV (from repo)"])
-if data_option == "Default CSV (from repo)":
     try:
-        df = pd.read_csv("products_sample.csv")
-        st.success("✅ Loaded dataset from repository")
     except:
-        st.error("❌ products_sample.csv not found. Using sample data instead.")
-        df = None
-if data_option == "Sample Data" or df is None:
-    df = pd.DataFrame({
-        "product_name": [
-            "iPhone 14 Pro",
-            "Samsung Galaxy S23",
-            "Nike Running Shoes",
-            "Dell Gaming Laptop",
-            "Bluetooth Speaker"
-        ],
-        "category": ["Mobile", "Mobile", "Footwear", "Laptop", "Electronics"],
-        "brand": ["Apple", "Samsung", "Nike", "Dell", "JBL"],
-        "description": [
-            "Latest smartphone",
-            "Android flagship phone",
-            "Comfort sports shoes",
-            "High performance laptop",
-            "Portable music device"
-        ]
-    })
-    st.info("Using sample dataset")
 # ==============================
-# DATA PREVIEW (ROW CONTROL)
 # ==============================
 st.subheader("📄 Data Preview")
 row_limit = st.selectbox(
     "Select number of rows to view",
-    [5, 10, 20, 30, 50, 100],
-    index=1
 )
 st.caption(f"Showing top {row_limit} rows")
@@ -122,7 +86,7 @@ df["combined"] = (
 products = df["combined"].tolist()
 # ==============================
-# PREPROCESSING
 # ==============================
 @st.cache_resource
 def preprocess_data(products):
@@ -178,15 +142,6 @@ def fuzzy_search(q):
     scores = [(i, fuzz.ratio(q, p)) for i, p in enumerate(products)]
     return sorted(scores, key=lambda x: x[1], reverse=True)[:10]
-def ngram_search(q):
-    return [(i, 1) for i, p in enumerate(products) if q[:3].lower() in p.lower()]
-def prefix_search(q):
-    return [(i, 1) for i, p in enumerate(products) if p.lower().startswith(q.lower())]
-def suffix_search(q):
-    return [(i, 1) for i, p in enumerate(products) if p.lower().endswith(q.lower())]
 def tfidf_search(q):
     q_vec = tfidf.transform([q])
     scores = (tfidf_matrix @ q_vec.T).toarray().flatten()
@@ -217,51 +172,15 @@ def hybrid_search(q):
     combined = {i: tfidf_res.get(i, 0) + sem_res.get(i, 0) for i in range(len(products))}
     return sorted(combined.items(), key=lambda x: x[1], reverse=True)[:10]
-def query_expansion_search(q):
-    synonyms = get_synonyms(q)
-    expanded_query = q + " " + " ".join(synonyms)
-    return tfidf_search(expanded_query)
-def weighted_hybrid(q):
-    tfidf_res = dict(tfidf_search(q))
-    sem_res = dict(semantic_search(q))
-    bm25_res = dict(bm25_search(q))
-    combined = {}
-    for i in range(len(products)):
-        combined[i] = (
-            0.4 * tfidf_res.get(i, 0) +
-            0.4 * sem_res.get(i, 0) +
-            0.2 * bm25_res.get(i, 0)
-        )
-    return sorted(combined.items(), key=lambda x: x[1], reverse=True)[:10]
-def ensemble_search(q):
-    results = {}
-    for func in [tfidf_search, semantic_search, bm25_search]:
-        for i, score in func(q):
-            results[i] = results.get(i, 0) + score
-    return sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
 # ==============================
-# SEARCH UI
 # ==============================
-search_type = st.selectbox("🔎 Select Search Type", list(search_info.keys()))
-explanation, example = search_info[search_type]
-st.markdown(f"""
-### 🔍 {search_type} Search
-- **Explanation:** {explanation}
-- **Example:** `{example}`
-""")
 query = st.text_input("Enter your search query")
-if st.button("Try Example"):
-    query = example.split("→")[0].strip()
-    st.success(f"Example loaded: {query}")
 top_k = st.slider("Top Results", 5, 20, 10)
 if st.button("Search"):
@@ -273,17 +192,11 @@ if st.button("Search"):
             "Regex": regex_search,
             "Boolean": boolean_search,
             "Fuzzy": fuzzy_search,
-            "N-Gram": ngram_search,
-            "Prefix": prefix_search,
-            "Suffix": suffix_search,
             "TF-IDF": tfidf_search,
             "BM25": bm25_search,
             "Semantic": semantic_search,
             "FAISS": faiss_search,
-            "Hybrid": hybrid_search,
-            "Query Expansion": query_expansion_search,
-            "Weighted Hybrid": weighted_hybrid,
-            "Ensemble": ensemble_search
         }
         results = func_map[search_type](query)[:top_k]

 import nltk
 # ==============================
+# FIX NLTK (HF SAFE)
 # ==============================
 nltk_data_path = "/tmp/nltk_data"
 os.makedirs(nltk_data_path, exist_ok=True)
 model = load_model()
 # ==============================
+# LOAD CSV FROM REPO
 # ==============================
+@st.cache_data
+def load_data():
     try:
+        df = pd.read_csv("products_10k.csv")
+        return df
     except:
+        st.warning("⚠️ products_10k.csv not found. Using fallback data.")
+        return pd.DataFrame({
+            "product_name": ["iPhone 14 Pro", "Samsung Galaxy S23"],
+            "category": ["Mobile", "Mobile"],
+            "brand": ["Apple", "Samsung"],
+            "description": ["Latest smartphone", "Android flagship phone"]
+        })
+df = load_data()
 # ==============================
+# DATA PREVIEW
 # ==============================
 st.subheader("📄 Data Preview")
 row_limit = st.selectbox(
     "Select number of rows to view",
+    [10, 20, 30, 50, 100],
+    index=0
 )
 st.caption(f"Showing top {row_limit} rows")
 products = df["combined"].tolist()
 # ==============================
+# PREPROCESS
 # ==============================
 @st.cache_resource
 def preprocess_data(products):
     scores = [(i, fuzz.ratio(q, p)) for i, p in enumerate(products)]
     return sorted(scores, key=lambda x: x[1], reverse=True)[:10]
 def tfidf_search(q):
     q_vec = tfidf.transform([q])
     scores = (tfidf_matrix @ q_vec.T).toarray().flatten()
     combined = {i: tfidf_res.get(i, 0) + sem_res.get(i, 0) for i in range(len(products))}
     return sorted(combined.items(), key=lambda x: x[1], reverse=True)[:10]
 # ==============================
+# UI
 # ==============================
+search_type = st.selectbox(
+    "🔎 Select Search Type",
+    ["Keyword", "Regex", "Boolean", "Fuzzy", "TF-IDF", "BM25", "Semantic", "FAISS", "Hybrid"]
+)
 query = st.text_input("Enter your search query")
 top_k = st.slider("Top Results", 5, 20, 10)
 if st.button("Search"):
             "Regex": regex_search,
             "Boolean": boolean_search,
             "Fuzzy": fuzzy_search,
             "TF-IDF": tfidf_search,
             "BM25": bm25_search,
             "Semantic": semantic_search,
             "FAISS": faiss_search,
+            "Hybrid": hybrid_search
         }
         results = func_map[search_type](query)[:top_k]