Spaces:

rizwankhan2123
/

Semantic-Search-Engine

Running

File size: 5,376 Bytes

import streamlit as st
import chromadb
from sentence_transformers import SentenceTransformer
import uuid

# ==========================================
# PAGE CONFIG
# ==========================================

# Browser-tab title, tab icon and wide layout for the whole app.
page_settings = {
    "page_title": "Semantic Search Engine",
    "page_icon": "🔍",
    "layout": "wide",
}
st.set_page_config(**page_settings)

# ==========================================
# CUSTOM CSS
# ==========================================

# Stylesheet injected into the page as raw HTML. It adjusts the top padding of
# the main containers and declares a `.result-box` card style; note that no
# element rendered in the visible part of this file uses `.result-box`.
CUSTOM_CSS = """

<style>



.main {

    padding-top: 1rem;

}



.block-container {

    padding-top: 2rem;

}



.result-box {

    padding: 1rem;

    border-radius: 12px;

    border: 1px solid #333;

    margin-bottom: 10px;

}



</style>

"""

# unsafe_allow_html lets the <style> tag through instead of escaping it.
st.markdown(CUSTOM_CSS, unsafe_allow_html=True)

# ==========================================
# TITLE
# ==========================================

# Page heading followed by a one-line description of the app.
st.title("🔍 Semantic Search Engine")
st.caption("Search documents using semantic similarity powered by Hugging Face embeddings.")

# ==========================================
# LOAD MODEL
# ==========================================

@st.cache_resource
def load_model():
    """Return the sentence-embedding model used for documents and queries.

    Decorated with ``st.cache_resource`` so the model object is built once
    and reused on later script reruns instead of being reloaded each time.
    """
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    return SentenceTransformer(model_id)


# Shared model instance used by both the storing and the search sections.
model = load_model()

# ==========================================
# CHROMADB
# ==========================================

# Chroma client persisting to the ./chroma_db directory (relative path).
client = chromadb.PersistentClient(path="./chroma_db")

# Open the "documents" collection, creating it if it does not exist yet.
collection = client.get_or_create_collection(name="documents")

# ==========================================
# SIDEBAR
# ==========================================

# Sidebar controls. `top_k` caps how many results the search section requests.
st.sidebar.header("⚙️ Settings")

top_k = st.sidebar.slider("Number of Results", min_value=1, max_value=10, value=5)

st.sidebar.markdown("---")

st.sidebar.info("Semantic Search compares meanings instead of matching exact keywords.")

# ==========================================
# DATABASE STATS
# ==========================================

st.markdown("## 📊 Database Statistics")

# Two side-by-side metrics: the live document count of the collection and the
# (hard-coded) short name of the embedding model.
col1, col2 = st.columns(2)
col1.metric("Documents Stored", collection.count())
col2.metric("Embedding Model", "MiniLM-L6-v2")

# ==========================================
# DOCUMENT INPUT
# ==========================================

st.markdown("---")
st.markdown("## 📥 Add Documents")

# Show the confirmation queued by the previous run, exactly once. It travels
# through session state because st.rerun() (below) restarts the script
# immediately, so a message rendered in the storing run would never be seen.
stored_notice = st.session_state.pop("stored_notice", None)
if stored_notice:
    st.success(stored_notice)

documents = st.text_area(
    "Enter documents (one document per line)",
    height=220,
    placeholder="""

Python is a programming language.

FastAPI is used to build APIs.

Machine learning learns patterns from data.

ChromaDB stores embeddings.

"""
)

if st.button("💾 Store Documents"):

    # One document per non-blank line, with surrounding whitespace removed.
    docs = [
        line.strip()
        for line in documents.split("\n")
        if line.strip()
    ]

    if not docs:
        st.warning("Please enter at least one document.")

    else:

        with st.spinner("Generating embeddings..."):

            # Encode the whole batch with a single model call.
            embeddings = model.encode(docs).tolist()

            # Each line gets a fresh random id, so storing the same text
            # twice creates two separate entries.
            collection.add(
                ids=[str(uuid.uuid4()) for _ in docs],
                documents=docs,
                embeddings=embeddings,
            )

        # Queue the confirmation for the next run, then rerun so the
        # "Documents Stored" metric rendered earlier on the page is refreshed.
        st.session_state["stored_notice"] = (
            f"{len(docs)} document(s) stored successfully."
        )
        st.rerun()

# ==========================================
# SEARCH SECTION
# ==========================================

st.markdown("---")
st.markdown("## 🔎 Search")

query = st.text_input("Enter your search query", placeholder="How can I build an API?")


def _relevance_label(distance):
    """Translate a distance returned by the collection into a display label.

    Distances below 0.7 are labelled highly relevant, below 1.2 relevant,
    and everything else a weak match.
    """
    if distance < 0.7:
        return "🟢 Highly Relevant"
    if distance < 1.2:
        return "🟡 Relevant"
    return "🔴 Weak Match"


search_clicked = st.button("🚀 Search", use_container_width=True)

if search_clicked:

    if collection.count() == 0:
        # Nothing has been stored yet, so there is nothing to query.
        st.error("No documents available. Add documents first.")

    elif not query.strip():
        st.warning("Please enter a search query.")

    else:

        with st.spinner("Searching similar documents..."):
            query_vector = model.encode(query).tolist()

            # Never request more results than the collection holds.
            result_limit = min(top_k, collection.count())

            hits = collection.query(
                query_embeddings=[query_vector],
                n_results=result_limit,
            )

        # A single query was sent, so only the first result list is relevant.
        matched_docs = hits["documents"][0]
        match_distances = hits["distances"][0]

        st.markdown("---")
        st.markdown("## 📄 Search Results")

        ranked_hits = enumerate(zip(matched_docs, match_distances), start=1)
        for position, (text, dist) in ranked_hits:
            label = _relevance_label(dist)

            with st.expander(f"#{position} | {label}"):
                st.write(text)
                st.caption(f"Distance Score: {dist:.4f}")

# ==========================================
# FOOTER
# ==========================================

# Horizontal rule rendered after the search section.
st.markdown("---")