"""Streamlit demo: compare 15 search strategies over a product catalogue.

The app loads (or samples) a product table, builds TF-IDF / BM25 / FAISS
indexes over a combined text column, and lets the user run any of the
search strategies listed in ``search_info`` against it.
"""

import re

import faiss
import nltk
import numpy as np
import pandas as pd
import streamlit as st
from rank_bm25 import BM25Okapi
from rapidfuzz import fuzz
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# ==============================
# NLTK FIX
# ==============================
# WordNet must be downloaded before the corpus import below can be used.
nltk.download('wordnet', quiet=True)
from nltk.corpus import wordnet  # noqa: E402

# ==============================
# PAGE CONFIG
# ==============================
st.set_page_config(page_title="Multi Search Engine", layout="wide")
st.title("🔍 Advanced Multi-Search Product Engine")

# ==============================
# LOAD MODEL (once per session; re-runs reuse the cached instance)
# ==============================
if "model" not in st.session_state:
    with st.spinner("Loading AI model..."):
        st.session_state.model = SentenceTransformer(
            'all-MiniLM-L6-v2', device='cpu'
        )
model = st.session_state.model

# ==============================
# SEARCH INFO: search type -> (explanation, example query)
# ==============================
search_info = {
    "Keyword": ("Exact match", "iphone"),
    "Regex": ("Pattern match", "^Samsung"),
    "Boolean": ("AND / OR logic", "nike AND shoes"),
    "Fuzzy": ("Spelling mistakes", "iphon"),
    "N-Gram": ("Partial word", "iph"),
    "Prefix": ("Word starts with", "Sam"),
    "Suffix": ("Word ends with", "phone"),
    "TF-IDF": ("Keyword ranking", "wireless headphones"),
    "BM25": ("Advanced ranking", "gaming laptop"),
    "Semantic": ("Meaning search", "sports footwear"),
    "FAISS": ("Fast semantic", "music device"),
    "Hybrid": ("TF-IDF + Semantic", "running shoes"),
    "Query Expansion": ("Auto synonyms", "speaker"),
    "Weighted Hybrid": ("TF-IDF + Semantic + BM25", "best laptop"),
    "Ensemble": ("Combine all scores", "smartphone"),
}

# ==============================
# FILE LOAD
# ==============================
uploaded_file = st.file_uploader("Upload CSV", type=["csv"])
if uploaded_file:
    df = pd.read_csv(uploaded_file)
else:
    st.info("Using sample dataset")
    df = pd.DataFrame({
        "product_name": [
            "iPhone 14 Pro", "Samsung Galaxy S23", "Nike Running Shoes",
            "Dell Gaming Laptop", "Bluetooth Speaker",
        ],
        "category": ["Mobile", "Mobile", "Footwear", "Laptop", "Electronics"],
        "brand": ["Apple", "Samsung", "Nike", "Dell", "JBL"],
        "description": [
            "Latest smartphone", "Android flagship phone",
            "Comfort sports shoes", "High performance laptop",
            "Portable music device",
        ],
    })

# ==============================
# DATA PREVIEW CONTROL
# ==============================
st.subheader("📄 Data Preview")
rows_to_show = st.selectbox("Select rows to view", [10, 20, 50, 100])
st.dataframe(df.head(rows_to_show))

# ==============================
# COMBINE TEXT
# ==============================
# Prefer the canonical columns; an uploaded CSV may not have them, in which
# case fall back to concatenating every column instead of raising KeyError.
_PREFERRED_COLS = ["product_name", "category", "brand", "description"]
_text_cols = [c for c in _PREFERRED_COLS if c in df.columns] or list(df.columns)
df["combined"] = df[_text_cols].astype(str).agg(" ".join, axis=1)

products = df["combined"].tolist()


# ==============================
# PREPROCESS (heavy index builds, cached across re-runs)
# ==============================
@st.cache_resource(show_spinner="Indexing products...")
def preprocess_data(products):
    """Build TF-IDF, dense-embedding (FAISS) and BM25 indexes.

    Returns (tfidf_vectorizer, tfidf_matrix, embeddings, faiss_index, bm25).
    NOTE: ``st.cache(allow_output_mutation=True)`` was removed from Streamlit;
    ``st.cache_resource`` is the modern equivalent for unhashable resources.
    """
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(products)

    embeddings = model.encode(products, batch_size=64, show_progress_bar=False)
    # Normalize so inner product == cosine similarity.
    faiss.normalize_L2(embeddings)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(np.array(embeddings))

    tokenized = [p.split() for p in products]
    bm25 = BM25Okapi(tokenized)

    return tfidf, tfidf_matrix, embeddings, index, bm25


tfidf, tfidf_matrix, embeddings, index, bm25 = preprocess_data(products)


# ==============================
# SYNONYMS
# ==============================
def get_synonyms(word):
    """Return the set of WordNet lemma names for *word* (may be empty)."""
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return synonyms


# ==============================
# SEARCH FUNCTIONS
# Each returns a list of (row_index, score) pairs.
# ==============================
def keyword_search(q):
    """Case-insensitive substring match."""
    return [(i, 1) for i, p in enumerate(products) if q.lower() in p.lower()]


def regex_search(q):
    """Regex match; invalid user patterns are reported, not raised."""
    try:
        pattern = re.compile(q, re.IGNORECASE)
    except re.error:
        st.warning("Invalid regular expression")
        return []
    return [(i, 1) for i, p in enumerate(products) if pattern.search(p)]


def boolean_search(q):
    """AND / OR combination of terms (AND takes precedence if both appear)."""
    if "AND" in q:
        terms = q.split("AND")
        return [(i, 1) for i, p in enumerate(products)
                if all(t.strip().lower() in p.lower() for t in terms)]
    elif "OR" in q:
        terms = q.split("OR")
        return [(i, 1) for i, p in enumerate(products)
                if any(t.strip().lower() in p.lower() for t in terms)]
    return []


def fuzzy_search(q):
    """Levenshtein-ratio similarity against the whole combined text."""
    scores = [(i, fuzz.ratio(q, p)) for i, p in enumerate(products)]
    return sorted(scores, key=lambda x: x[1], reverse=True)


def ngram_search(q):
    """Partial (substring) match — same mechanism as keyword search."""
    return [(i, 1) for i, p in enumerate(products) if q.lower() in p.lower()]


def prefix_search(q):
    """Match products containing a word that starts with the query."""
    ql = q.lower()
    return [(i, 1) for i, p in enumerate(products)
            if any(word.startswith(ql) for word in p.lower().split())]


def suffix_search(q):
    """Match products containing a word that ends with the query."""
    ql = q.lower()
    return [(i, 1) for i, p in enumerate(products)
            if any(word.endswith(ql) for word in p.lower().split())]


def tfidf_search(q):
    """Cosine-style TF-IDF relevance scores for every product."""
    q_vec = tfidf.transform([q])
    scores = (tfidf_matrix @ q_vec.T).toarray().flatten()
    return list(enumerate(scores))


def bm25_search(q):
    """BM25 relevance scores for every product."""
    scores = bm25.get_scores(q.split())
    return list(enumerate(scores))


def semantic_search(q):
    """Dense-embedding cosine similarity over all products."""
    q_emb = model.encode([q], show_progress_bar=False)
    faiss.normalize_L2(q_emb)
    scores = np.dot(embeddings, q_emb.T).flatten()
    return list(enumerate(scores))


def faiss_search(q):
    """Approximate top-k via FAISS.

    k is clamped to the index size: asking for more neighbours than stored
    vectors makes FAISS pad results with index -1, which would otherwise be
    mis-interpreted as "last row" by df.iloc.
    """
    q_emb = model.encode([q], show_progress_bar=False)
    faiss.normalize_L2(q_emb)
    k = min(10, index.ntotal)
    D, I = index.search(np.array(q_emb), k)
    return [(int(i), float(d)) for i, d in zip(I[0], D[0]) if i != -1]


def hybrid_search(q):
    """Sum of TF-IDF and semantic scores."""
    tfidf_res = dict(tfidf_search(q))
    sem_res = dict(semantic_search(q))
    return [(i, tfidf_res.get(i, 0) + sem_res.get(i, 0))
            for i in range(len(products))]


def query_expansion_search(q):
    """TF-IDF search over the query plus WordNet synonyms of each word."""
    expanded = q.split()
    for word in q.split():
        expanded += list(get_synonyms(word))
    return tfidf_search(" ".join(expanded))


def weighted_hybrid(q):
    """Weighted blend: 0.4*TF-IDF + 0.4*semantic + 0.2*BM25."""
    tfidf_res = dict(tfidf_search(q))
    sem_res = dict(semantic_search(q))
    bm25_res = dict(bm25_search(q))
    return [(i,
             0.4 * tfidf_res.get(i, 0)
             + 0.4 * sem_res.get(i, 0)
             + 0.2 * bm25_res.get(i, 0))
            for i in range(len(products))]


def ensemble_search(q):
    """Sum of max-normalized TF-IDF, semantic and BM25 scores."""
    def _max_norm(scores):
        # Explicit guard: an all-zero score vector stays all-zero instead of
        # being divided by a near-zero max.
        arr = np.array([s for _, s in scores], dtype=float)
        peak = arr.max() if arr.size else 0.0
        return arr / peak if peak > 0 else arr

    combined = (_max_norm(tfidf_search(q))
                + _max_norm(semantic_search(q))
                + _max_norm(bm25_search(q)))
    return list(enumerate(combined))


# ==============================
# UI
# ==============================
search_type = st.selectbox("🔎 Select Search Type", list(search_info.keys()))
explanation, example = search_info[search_type]

st.markdown(f"""
### 🔍 {search_type}
- **Explanation:** {explanation}
- **Example:** `{example}`
""")

# The button is rendered BEFORE the text_input so that writing to the
# widget's session_state key takes effect on this very run; reassigning a
# local variable after the widget exists (the old approach) never updates
# the widget and the example was silently ignored.
if st.button("Try Example"):
    st.session_state["query_input"] = example
    st.success(f"Loaded: {example}")

query = st.text_input("Enter your search query", key="query_input")

top_k = st.slider("Top Results", 5, 20, 10)

# ==============================
# SEARCH EXECUTION
# ==============================
if st.button("Search"):
    if not query:
        st.warning("Enter query")
    else:
        func_map = {
            "Keyword": keyword_search,
            "Regex": regex_search,
            "Boolean": boolean_search,
            "Fuzzy": fuzzy_search,
            "N-Gram": ngram_search,
            "Prefix": prefix_search,
            "Suffix": suffix_search,
            "TF-IDF": tfidf_search,
            "BM25": bm25_search,
            "Semantic": semantic_search,
            "FAISS": faiss_search,
            "Hybrid": hybrid_search,
            "Query Expansion": query_expansion_search,
            "Weighted Hybrid": weighted_hybrid,
            "Ensemble": ensemble_search,
        }

        results = func_map[search_type](query)
        # Rank by score and keep the requested number of hits.
        results = sorted(results, key=lambda x: x[1], reverse=True)[:top_k]

        if not results:
            st.info("No results found")
        else:
            indices = [i for i, _ in results]
            result_df = df.iloc[indices].copy()
            result_df["Score"] = [round(score, 4) for _, score in results]

            st.subheader("🔎 Results")
            st.dataframe(result_df)