# Source: Hugging Face Space "pradeep4321/sample_multi_search" (status: Running).
# File size: 8,818 bytes.

import streamlit as st
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from rapidfuzz import fuzz
import faiss
import nltk

# ==============================
# NLTK FIX
# ==============================
# Fetch the WordNet corpus (without console output) so the synonym lookup in
# get_synonyms() has data to read.
nltk.download('wordnet', quiet=True)
# Imported after the download call; `wordnet` is the corpus reader used by
# get_synonyms() below.
from nltk.corpus import wordnet

# ==============================
# PAGE CONFIG
# ==============================
# Browser-tab title and wide page layout.
# NOTE(review): Streamlit's documentation requires set_page_config to run
# before other Streamlit commands - confirm before moving this line.
st.set_page_config(page_title="Multi Search Engine", layout="wide")
# Main heading rendered at the top of the page.
st.title("🔍 Advanced Multi-Search Product Engine")

# ==============================
# LOAD MODEL
# ==============================
# Build the sentence-embedding model once per session and keep it in
# st.session_state so later script reruns reuse the same instance.
_model_is_cached = "model" in st.session_state
if not _model_is_cached:
    with st.spinner("Loading AI model..."):
        _loaded_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
        st.session_state["model"] = _loaded_model

# Module-level handle used by the embedding-based search functions.
model = st.session_state["model"]

# ==============================
# SEARCH INFO (UPDATED)
# ==============================
# One row per search mode: (name shown in the selector, short explanation,
# example query). Row order is the order shown in the UI selector.
_SEARCH_MODE_ROWS = (
    ("Keyword", "Exact match", "iphone"),
    ("Regex", "Pattern match", "^Samsung"),
    ("Boolean", "AND / OR logic", "nike AND shoes"),
    ("Fuzzy", "Spelling mistakes", "iphon"),
    ("N-Gram", "Partial word", "iph"),
    ("Prefix", "Word starts with", "Sam"),
    ("Suffix", "Word ends with", "phone"),
    ("TF-IDF", "Keyword ranking", "wireless headphones"),
    ("BM25", "Advanced ranking", "gaming laptop"),
    ("Semantic", "Meaning search", "sports footwear"),
    ("FAISS", "Fast semantic", "music device"),
    ("Hybrid", "TF-IDF + Semantic", "running shoes"),
    ("Query Expansion", "Auto synonyms", "speaker"),
    ("Weighted Hybrid", "TF-IDF + Semantic + BM25", "best laptop"),
    ("Ensemble", "Combine all scores", "smartphone"),
)

# Maps search-mode name -> (explanation, example query).
search_info = {
    mode_name: (mode_help, mode_example)
    for mode_name, mode_help, mode_example in _SEARCH_MODE_ROWS
}

# ==============================
# FILE LOAD (KEEP YOUR LOGIC)
# ==============================
# Columns the rest of the script concatenates into the searchable text; an
# uploaded CSV must provide all of them.
REQUIRED_COLUMNS = ("product_name", "category", "brand", "description")

uploaded_file = st.file_uploader("Upload CSV", type=["csv"])

if uploaded_file is not None:
    # Report unreadable uploads in the page instead of surfacing a traceback.
    try:
        df = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the uploaded CSV: {exc}")
        st.stop()

    # Fail early with a clear message; otherwise the text-combination step
    # further down raises a KeyError on the first missing column.
    missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
    if missing_columns:
        st.error(
            "Uploaded CSV is missing required column(s): "
            + ", ".join(missing_columns)
        )
        st.stop()

    # Nothing can be indexed from a file that has headers but no rows.
    if df.empty:
        st.error("Uploaded CSV contains no rows.")
        st.stop()
else:
    # No upload: fall back to a small built-in catalogue.
    st.info("Using sample dataset")
    df = pd.DataFrame({
        "product_name": [
            "iPhone 14 Pro",
            "Samsung Galaxy S23",
            "Nike Running Shoes",
            "Dell Gaming Laptop",
            "Bluetooth Speaker"
        ],
        "category": ["Mobile", "Mobile", "Footwear", "Laptop", "Electronics"],
        "brand": ["Apple", "Samsung", "Nike", "Dell", "JBL"],
        "description": [
            "Latest smartphone",
            "Android flagship phone",
            "Comfort sports shoes",
            "High performance laptop",
            "Portable music device"
        ]
    })

# ==============================
# DATA PREVIEW CONTROL
# ==============================
st.subheader("📄 Data Preview")

# Let the user choose how many leading rows of the dataset to display.
_preview_sizes = [10, 20, 50, 100]
rows_to_show = st.selectbox("Select rows to view", _preview_sizes)
_preview_frame = df.head(rows_to_show)
st.dataframe(_preview_frame)

# ==============================
# COMBINE TEXT
# ==============================
def _as_text(column):
    """Return df[column] as strings, with missing cells as "" instead of "nan".

    A plain astype(str) turns NaN into the literal text "nan", which would
    then be indexed and matched by every search mode.
    """
    values = df[column]
    return values.astype(str).where(values.notna(), "")


# One searchable string per row: name, category, brand and description
# joined by single spaces.
df["combined"] = (
    _as_text("product_name") + " " +
    _as_text("category") + " " +
    _as_text("brand") + " " +
    _as_text("description")
)

# Plain list of the combined strings; list position i corresponds to row
# position i of df.
products = df["combined"].tolist()

# ==============================
# PREPROCESS
# ==============================
# NOTE(review): Streamlit's documentation marks st.cache as deprecated in
# favour of st.cache_data / st.cache_resource. Prefer st.cache_resource when
# the installed Streamlit provides it and fall back to the legacy decorator
# otherwise, so the script runs on both old and new versions.
_cache_resource = getattr(st, "cache_resource", None)
if _cache_resource is None:
    _cache_resource = st.cache(allow_output_mutation=True)


@_cache_resource
def preprocess_data(products):
    """Build every index the search functions need for the given texts.

    Args:
        products: list of strings, one searchable text per product.

    Returns:
        A 5-tuple ``(tfidf, tfidf_matrix, embeddings, index, bm25)``:
        the fitted TfidfVectorizer, its document-term matrix, the
        L2-normalised sentence embeddings, a FAISS inner-product index over
        those embeddings, and a BM25Okapi model over whitespace tokens.
    """
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(products)

    # FAISS works on contiguous float32 arrays; make that explicit instead of
    # relying on the encoder's default output.
    embeddings = np.ascontiguousarray(
        model.encode(products, batch_size=64, show_progress_bar=False),
        dtype=np.float32,
    )
    # In-place normalisation: inner product on unit vectors is cosine similarity.
    faiss.normalize_L2(embeddings)

    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)

    # BM25 uses plain whitespace tokens with the original casing preserved.
    tokenized = [p.split() for p in products]
    bm25 = BM25Okapi(tokenized)

    return tfidf, tfidf_matrix, embeddings, index, bm25


tfidf, tfidf_matrix, embeddings, index, bm25 = preprocess_data(products)

# ==============================
# SYNONYMS
# ==============================
def get_synonyms(word):
    """Return the set of WordNet lemma names found for ``word``.

    Multi-word lemma names come back from WordNet joined with underscores
    (for example ``speaker_unit``). They are returned with spaces instead so
    that each component word can act as a separate query term.

    Args:
        word: a single query word.

    Returns:
        A set of synonym strings; empty when WordNet has no synset for
        ``word``.
    """
    return {
        lemma.name().replace("_", " ")
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }

# ==============================
# SEARCH FUNCTIONS
# ==============================
def keyword_search(q):
    """Case-insensitive substring match of ``q`` against every product text.

    Returns a list of ``(position, 1)`` tuples for the matching products.
    """
    needle = q.lower()
    matches = []
    for position, text in enumerate(products):
        if needle in text.lower():
            matches.append((position, 1))
    return matches

def regex_search(q, corpus=None):
    """Return ``(position, 1)`` for every text the pattern ``q`` matches.

    Matching is case-insensitive and uses ``search`` semantics (the pattern
    may match anywhere unless it is anchored).

    The pattern is typed by the user, so it may not be a valid regular
    expression (for example an unbalanced parenthesis). Instead of letting
    ``re.error`` abort the request, such input is matched as literal text.

    Args:
        q: regular expression (or literal text if it does not compile).
        corpus: optional list of strings to search; defaults to the
            module-level ``products`` list.

    Returns:
        List of ``(position, 1)`` tuples in corpus order.
    """
    docs = products if corpus is None else corpus
    try:
        # Compile once rather than re-parsing the pattern for each document.
        pattern = re.compile(q, re.IGNORECASE)
    except re.error:
        pattern = re.compile(re.escape(q), re.IGNORECASE)
    return [(i, 1) for i, doc in enumerate(docs) if pattern.search(doc)]

def boolean_search(q, corpus=None):
    """Evaluate an ``AND`` / ``OR`` query with case-insensitive substring terms.

    Operators are the upper-case whole words ``AND`` and ``OR``; ``AND`` binds
    tighter, so ``a AND b OR c`` means ``(a AND b) OR c``. Compared with a
    plain substring test for the operators this:

    * does not split terms that merely contain the letters (``ORACLE``,
      ``SANDISK``),
    * ignores empty operands (``nike AND``) instead of letting the empty
      string match every product,
    * treats a query without any operator as a single-term search rather
      than returning nothing.

    Args:
        q: the boolean query string.
        corpus: optional list of strings to search; defaults to the
            module-level ``products`` list.

    Returns:
        List of ``(position, 1)`` tuples in corpus order.
    """
    docs = products if corpus is None else corpus

    # Each clause is the list of terms that must all be present; a document
    # matches when any clause is satisfied.
    clauses = []
    for clause_text in re.split(r"\bOR\b", q):
        terms = [term.strip().lower() for term in re.split(r"\bAND\b", clause_text)]
        terms = [term for term in terms if term]
        if terms:
            clauses.append(terms)

    if not clauses:
        return []

    hits = []
    for i, doc in enumerate(docs):
        text = doc.lower()
        if any(all(term in text for term in terms) for terms in clauses):
            hits.append((i, 1))
    return hits

def fuzzy_search(q):
    """Rank every product by fuzzy similarity to ``q`` (typo-tolerant).

    Each product text is the concatenation of several fields, so it is much
    longer than a typical query. A whole-string ratio therefore scores even
    a near-exact misspelling low, and a case-sensitive comparison penalises
    ``iphon`` against ``iPhone``. Both sides are lower-cased and
    ``partial_ratio`` is used, which scores the best-matching window of the
    longer text.

    Returns:
        List of ``(position, score)`` tuples sorted by descending score;
        scores are on rapidfuzz's 0-100 scale.
    """
    needle = q.lower()
    scores = [
        (i, fuzz.partial_ratio(needle, text.lower()))
        for i, text in enumerate(products)
    ]
    return sorted(scores, key=lambda pair: pair[1], reverse=True)

def ngram_search(q):
    """Partial-word search: case-insensitive test that ``q`` occurs in a text.

    Returns a list of ``(position, 1)`` tuples for the matching products.
    """
    fragment = q.lower()
    return [
        (idx, 1)
        for idx, text in enumerate(products)
        if text.lower().find(fragment) != -1
    ]

# ✅ FIXED PREFIX (word-level)
def prefix_search(q):
    """Match products containing a whitespace-separated word starting with ``q``.

    The comparison is case-insensitive. Returns ``(position, 1)`` tuples.
    """
    stem = q.lower()
    hits = []
    for idx, text in enumerate(products):
        for token in text.lower().split():
            if token.startswith(stem):
                hits.append((idx, 1))
                break  # one matching word is enough for this product
    return hits

# ✅ FIXED SUFFIX (word-level)
def suffix_search(q):
    """Match products containing a whitespace-separated word ending with ``q``.

    The comparison is case-insensitive. Returns ``(position, 1)`` tuples.
    """
    ending = q.lower()

    def _has_word_with_ending(text):
        return any(token.endswith(ending) for token in text.lower().split())

    return [
        (pos, 1)
        for pos, text in enumerate(products)
        if _has_word_with_ending(text)
    ]

def tfidf_search(q):
    """Score every product against ``q`` in the fitted TF-IDF space.

    Returns a list of ``(position, score)`` tuples covering all products, in
    corpus order (unsorted).
    """
    query_vector = tfidf.transform([q])
    # Sparse (n_docs x vocab) @ (vocab x 1) -> one score per document.
    similarity_column = tfidf_matrix @ query_vector.T
    per_document = similarity_column.toarray().flatten()
    return [(doc_id, value) for doc_id, value in enumerate(per_document)]

def bm25_search(q):
    """Score every product against ``q`` with BM25, ignoring letter case.

    The BM25 index is built from whitespace tokens that keep their original
    casing, so a lower-case query such as ``gaming laptop`` would never match
    the indexed tokens ``Gaming`` / ``Laptop``. Each query word is therefore
    expanded to every indexed token equal to it when case is ignored.

    Returns:
        List of ``(position, score)`` tuples covering all products, in
        corpus order (unsorted).
    """
    wanted = {token.lower() for token in q.split()}
    # NOTE(review): relies on rank_bm25 exposing its vocabulary as the keys
    # of the ``idf`` dict on the fitted model - confirm when upgrading.
    query_tokens = [token for token in bm25.idf if token.lower() in wanted]
    scores = bm25.get_scores(query_tokens)
    return list(enumerate(scores))

def semantic_search(q):
    """Score every product by embedding similarity to ``q``.

    The query is encoded with the sentence-embedding model, L2-normalised in
    place, and compared with the stored product embeddings via a dot product.

    Returns a list of ``(position, score)`` tuples covering all products, in
    corpus order (unsorted).
    """
    query_vector = model.encode([q], show_progress_bar=False)
    faiss.normalize_L2(query_vector)  # modifies query_vector in place
    similarity = np.dot(embeddings, query_vector.T)
    per_document = similarity.flatten()
    return [(row, value) for row, value in enumerate(per_document)]

def faiss_search(q, k=10):
    """Return the ``k`` nearest products to ``q`` from the FAISS index.

    FAISS pads its result with id ``-1`` when fewer than ``k`` vectors are
    available (the built-in sample corpus has only five rows). Those padded
    entries must not reach the caller, where ``df.iloc[-1]`` would silently
    resolve to the last row. ``k`` is therefore capped at the index size and
    any remaining ``-1`` ids are dropped.

    Args:
        q: the query text.
        k: maximum number of neighbours to return (default 10, the value
            that was previously hard-coded).

    Returns:
        List of ``(position, score)`` tuples, best match first; scores are
        inner products of L2-normalised vectors.
    """
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    q_emb = model.encode([q], show_progress_bar=False)
    faiss.normalize_L2(q_emb)
    distances, ids = index.search(q_emb, k)
    return [
        (int(doc_id), float(score))
        for score, doc_id in zip(distances[0], ids[0])
        if doc_id != -1
    ]

def hybrid_search(q):
    """Add the TF-IDF score and the semantic score of each product.

    Returns a list of ``(position, score)`` tuples covering all products, in
    corpus order (unsorted). A product missing from either result counts as 0
    for that component.
    """
    lexical_by_id = dict(tfidf_search(q))
    semantic_by_id = dict(semantic_search(q))

    combined = []
    for doc_id, _ in enumerate(products):
        total = lexical_by_id.get(doc_id, 0) + semantic_by_id.get(doc_id, 0)
        combined.append((doc_id, total))
    return combined

# ✅ IMPROVED QUERY EXPANSION
def query_expansion_search(q):
    """Run a TF-IDF search on ``q`` extended with synonyms of each word.

    The original words come first, followed by whatever get_synonyms()
    returns for every word of the query; the joined string is passed to
    tfidf_search() and its result is returned unchanged.
    """
    words = q.split()
    expanded_terms = list(words)
    for word in words:
        expanded_terms.extend(get_synonyms(word))
    expanded_query = " ".join(expanded_terms)
    return tfidf_search(expanded_query)

# ✅ IMPROVED WEIGHTED HYBRID
def weighted_hybrid(q, weights=(0.4, 0.4, 0.2)):
    """Blend TF-IDF, semantic and BM25 scores with fixed weights.

    BM25 scores are not confined to [0, 1], so mixing them unscaled with the
    other two components lets BM25 dominate regardless of its nominal weight.
    The BM25 component is divided by its maximum (when positive) before the
    weights are applied, so the weights express the intended proportions.

    Args:
        q: the query text.
        weights: ``(tfidf_weight, semantic_weight, bm25_weight)``; defaults
            to the previously hard-coded 0.4 / 0.4 / 0.2.

    Returns:
        List of ``(position, score)`` tuples covering all products, in
        corpus order (unsorted).
    """
    tfidf_weight, semantic_weight, bm25_weight = weights

    tfidf_scores = np.array([s for _, s in tfidf_search(q)], dtype=float)
    semantic_scores = np.array([s for _, s in semantic_search(q)], dtype=float)
    bm25_scores = np.array([s for _, s in bm25_search(q)], dtype=float)

    # Bring BM25 onto a 0..1 scale; skip when nothing matched (all zeros).
    bm25_peak = bm25_scores.max() if bm25_scores.size else 0.0
    if bm25_peak > 0:
        bm25_scores = bm25_scores / bm25_peak

    combined = (
        tfidf_weight * tfidf_scores
        + semantic_weight * semantic_scores
        + bm25_weight * bm25_scores
    )
    return list(enumerate(combined))

# ✅ FIXED ENSEMBLE (NORMALIZED)
def _min_max_scale(values):
    """Rescale ``values`` linearly to [0, 1]; constant or empty input maps to zeros."""
    scores = np.asarray(values, dtype=float)
    if scores.size == 0:
        return scores
    lowest = scores.min()
    spread = scores.max() - lowest
    if spread <= 0:
        # All documents scored the same: this component carries no ranking signal.
        return np.zeros_like(scores)
    return (scores - lowest) / spread


def ensemble_search(q):
    """Sum the TF-IDF, semantic and BM25 scores after normalising each.

    Dividing each component by ``max(scores + 1e-6)`` breaks when the maximum
    is zero or negative (cosine similarities can be negative): the division
    then inflates the values or flips the ranking. Each component is instead
    min-max scaled to [0, 1], which is well defined for any score range.

    Returns:
        List of ``(position, score)`` tuples covering all products, in
        corpus order (unsorted); each score lies in [0, 3].
    """
    tfidf_part = _min_max_scale([s for _, s in tfidf_search(q)])
    semantic_part = _min_max_scale([s for _, s in semantic_search(q)])
    bm25_part = _min_max_scale([s for _, s in bm25_search(q)])

    combined = tfidf_part + semantic_part + bm25_part
    return list(enumerate(combined))

# ==============================
# UI
# ==============================
search_type = st.selectbox("🔎 Select Search Type", list(search_info.keys()))
explanation, example = search_info[search_type]

st.markdown(f"""
### 🔍 {search_type}
- **Explanation:** {explanation}
- **Example:** `{example}`
""")


def _load_example(text):
    """Button callback: put ``text`` into the query box's session-state slot."""
    st.session_state["query_input"] = text


# The query box is bound to a session-state key so that its content survives
# script reruns. Assigning the example to a local variable only lasts for the
# rerun triggered by "Try Example"; the next click on "Search" starts a new
# rerun in which the text box would be empty again.
query = st.text_input("Enter your search query", key="query_input")

# The callback runs before the rerun, so the text box above already shows the
# example when the script reaches this point.
if st.button("Try Example", on_click=_load_example, args=(example,)):
    st.success(f"Loaded: {example}")

top_k = st.slider("Top Results", 5, 20, 10)

# ==============================
# SEARCH EXECUTION
# ==============================
if st.button("Search"):
    # A query made only of whitespace is treated like an empty one.
    if not query or not query.strip():
        st.warning("Enter query")
    else:
        # Dispatch table: selector label -> search function.
        func_map = {
            "Keyword": keyword_search,
            "Regex": regex_search,
            "Boolean": boolean_search,
            "Fuzzy": fuzzy_search,
            "N-Gram": ngram_search,
            "Prefix": prefix_search,
            "Suffix": suffix_search,
            "TF-IDF": tfidf_search,
            "BM25": bm25_search,
            "Semantic": semantic_search,
            "FAISS": faiss_search,
            "Hybrid": hybrid_search,
            "Query Expansion": query_expansion_search,
            "Weighted Hybrid": weighted_hybrid,
            "Ensemble": ensemble_search
        }

        results = func_map[search_type](query)

        # Keep only positions that exist in df. A nearest-neighbour index can
        # pad its output with -1, which df.iloc would silently resolve to the
        # last row.
        row_count = len(df)
        results = [
            (int(i), float(score))
            for i, score in results
            if 0 <= i < row_count
        ]

        # Best score first, limited to the number chosen on the slider.
        results = sorted(results, key=lambda x: x[1], reverse=True)[:top_k]

        if not results:
            st.info("No matching products found")
        else:
            indices = [i for i, _ in results]
            result_df = df.iloc[indices].copy()
            result_df["Score"] = [round(score, 4) for _, score in results]

            st.subheader("🔎 Results")
            st.dataframe(result_df)