"""Streamlit semantic search app.

Documents entered by the user are embedded with a SentenceTransformer model
and stored in a persistent ChromaDB collection; queries are embedded the same
way and matched against the stored vectors.
"""

import uuid

import chromadb
import streamlit as st
from sentence_transformers import SentenceTransformer

# Session-state key used to carry the "stored successfully" message across
# the st.rerun() that follows a successful insert.
_FLASH_KEY = "store_documents_flash"

# Distance cut-offs for the relevance badge shown next to each result.
# NOTE(review): the collection is created without an explicit distance
# metric, so these thresholds presumably target Chroma's default space --
# confirm before tuning.
_HIGHLY_RELEVANT_BELOW = 0.7
_RELEVANT_BELOW = 1.2


def _split_documents(raw_text: str) -> list:
    """Return the non-empty, whitespace-stripped lines of *raw_text*."""
    return [line.strip() for line in raw_text.split("\n") if line.strip()]


def _relevance_label(distance: float) -> str:
    """Map a query distance (smaller is closer) to a relevance badge."""
    if distance < _HIGHLY_RELEVANT_BELOW:
        return "🟢 Highly Relevant"
    if distance < _RELEVANT_BELOW:
        return "🟡 Relevant"
    return "🔴 Weak Match"


# ==========================================
# PAGE CONFIG
# ==========================================
st.set_page_config(
    page_title="Semantic Search Engine",
    page_icon="🔍",
    layout="wide",
)

# ==========================================
# CUSTOM CSS
# ==========================================
# The stylesheet is currently empty; the call is kept as a hook for styling.
st.markdown(""" """, unsafe_allow_html=True)

# ==========================================
# TITLE
# ==========================================
st.title("🔍 Semantic Search Engine")
st.caption(
    "Search documents using semantic similarity powered by Hugging Face embeddings."
)


# ==========================================
# LOAD MODEL
# ==========================================
@st.cache_resource
def load_model():
    """Load the sentence-embedding model (cached by Streamlit across reruns)."""
    return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


model = load_model()

# ==========================================
# CHROMADB
# ==========================================
client = chromadb.PersistentClient(path="./chroma_db")
collection = client.get_or_create_collection(name="documents")

# ==========================================
# SIDEBAR
# ==========================================
with st.sidebar:
    st.header("⚙️ Settings")
    top_k = st.slider(
        "Number of Results",
        min_value=1,
        max_value=10,
        value=5,
    )
    st.markdown("---")
    st.info(
        "Semantic Search compares meanings instead of matching exact keywords."
    )

# ==========================================
# DATABASE STATS
# ==========================================
st.markdown("## 📊 Database Statistics")
col1, col2 = st.columns(2)
with col1:
    st.metric("Documents Stored", collection.count())
with col2:
    st.metric("Embedding Model", "MiniLM-L6-v2")

# ==========================================
# DOCUMENT INPUT
# ==========================================
st.markdown("---")
st.markdown("## 📥 Add Documents")
documents = st.text_area(
    "Enter documents (one document per line)",
    height=220,
    placeholder="""
Python is a programming language.
FastAPI is used to build APIs.
Machine learning learns patterns from data.
ChromaDB stores embeddings.
""",
)

if st.button("💾 Store Documents"):
    docs = _split_documents(documents)
    if not docs:
        st.warning("Please enter at least one document.")
    else:
        with st.spinner("Generating embeddings..."):
            embeddings = model.encode(docs).tolist()
            collection.add(
                # Random IDs: submitting the same text twice stores it twice.
                ids=[str(uuid.uuid4()) for _ in docs],
                documents=docs,
                embeddings=embeddings,
            )
        # st.rerun() aborts this run immediately, so a message rendered here
        # would never be seen. Stash it and show it on the next run instead.
        st.session_state[_FLASH_KEY] = (
            f"{len(docs)} document(s) stored successfully."
        )
        # Rerun so the "Documents Stored" metric above reflects the insert.
        st.rerun()

# Show (once) the confirmation left behind by the previous run, if any.
flash_message = st.session_state.pop(_FLASH_KEY, None)
if flash_message:
    st.success(flash_message)

# ==========================================
# SEARCH SECTION
# ==========================================
st.markdown("---")
st.markdown("## 🔎 Search")
query = st.text_input(
    "Enter your search query",
    placeholder="How can I build an API?",
)

if st.button("🚀 Search", use_container_width=True):
    # Read the count once and reuse it for both the guard and n_results.
    stored_count = collection.count()
    if stored_count == 0:
        st.error("No documents available. Add documents first.")
    elif not query.strip():
        st.warning("Please enter a search query.")
    else:
        with st.spinner("Searching similar documents..."):
            query_embedding = model.encode(query).tolist()
            results = collection.query(
                query_embeddings=[query_embedding],
                # Never request more neighbours than there are documents.
                n_results=min(top_k, stored_count),
            )

        # One query was sent, so only the first result list is relevant.
        docs = results["documents"][0]
        distances = results["distances"][0]

        st.markdown("---")
        st.markdown("## 📄 Search Results")
        for rank, (doc, distance) in enumerate(zip(docs, distances), start=1):
            relevance = _relevance_label(distance)
            with st.expander(f"#{rank} | {relevance}"):
                st.write(doc)
                st.caption(f"Distance Score: {distance:.4f}")

# ==========================================
# FOOTER
# ==========================================
st.markdown("---")