"""Semantic-search demo over a ChromaDB collection, served with Gradio.

On first run the script downloads a pre-built ChromaDB from the Hugging
Face Hub and extracts it locally. If the expected collection is missing,
it seeds a small demo collection instead. A Gradio UI then exposes
top-3 semantic search over the stored documents.
"""

import os
import zipfile

import gradio as gr
from chromadb import PersistentClient
from huggingface_hub import hf_hub_download
from sentence_transformers import SentenceTransformer

# ==========================
# Step 1 — Download and unzip ChromaDB
# ==========================
persist_dir = "chromadb"
os.makedirs(persist_dir, exist_ok=True)

# Only download when the SQLite store is absent, so restarts reuse the
# already-extracted database instead of re-fetching the zip.
if not os.path.exists(os.path.join(persist_dir, "chroma.sqlite3")):
    print("📥 Downloading ChromaDB zip from Hugging Face...")
    db_zip_path = hf_hub_download(
        repo_id="tiffany101/my-chromadb",  # your dataset repo
        filename="chromadb.zip",
        repo_type="dataset",
    )
    print("✅ Download complete, extracting...")
    with zipfile.ZipFile(db_zip_path, "r") as zip_ref:
        zip_ref.extractall(persist_dir)
    print("✅ Extracted ChromaDB to:", persist_dir)

# ==========================
# Step 2 — Load Chroma client and embedding model
# ==========================
print("🚀 Initializing Chroma client...")
client = PersistentClient(path=persist_dir)

# List collections for debugging
collections = client.list_collections()
print("📊 Collections found:", [c.name for c in collections])

# Load the embedding model exactly once. The original script instantiated
# SentenceTransformer twice (fallback branch + query path), doubling
# startup cost for no benefit.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Load the expected collection, or create a seeded fallback so the demo
# UI always has something to match against.
try:
    collection = client.get_collection("my_collection")
    print("✅ Loaded existing collection: my_collection")
except Exception:
    print("⚠️ my_collection not found, creating demo fallback...")
    collection = client.create_collection("my_collection")

    # Add sample fallback data
    sample_texts = [
        "The Eiffel Tower is one of the most famous landmarks in Paris.",
        "Machine learning enables computers to learn from data.",
        "The stock market rose today amid strong earnings reports.",
        "The football team won the championship game.",
        "Scientists discovered a new planet outside our solar system.",
    ]
    embeddings = model.encode(sample_texts)
    collection.add(
        documents=sample_texts,
        embeddings=embeddings.tolist(),
        ids=[str(i) for i in range(len(sample_texts))],
    )

# ==========================
# Step 3 — Verify collection size
# ==========================
print("🧩 Checking how many documents are stored...")
try:
    # collection.count() is O(1); the original fetched every stored
    # document via collection.get() just to take len() of the ids.
    count = collection.count()
    print(f"✅ Collection contains {count} documents.")
except Exception as e:
    print("⚠️ Could not fetch count:", e)


# ==========================
# Step 4 — Define semantic search
# ==========================
def semantic_search(query: str) -> str:
    """Return the top-3 stored documents most similar to *query*.

    Parameters:
        query: Free-text search string entered in the UI.

    Returns:
        The matching documents joined by blank lines, or a fixed notice
        when the collection yields no matches.
    """
    query_emb = model.encode([query])
    results = collection.query(
        query_embeddings=query_emb.tolist(), n_results=3
    )
    # Guard against a missing/None "documents" key as well as an empty
    # result list before indexing into it.
    docs = results.get("documents") or []
    if not docs or not docs[0]:
        return "No matching documents found in the ChromaDB."
    return "\n\n".join(docs[0])


# ==========================
# Step 5 — Launch Gradio app
# ==========================
demo = gr.Interface(
    fn=semantic_search,
    inputs=gr.Textbox(label="Enter your search query"),
    outputs=gr.Textbox(label="Top Matches"),
    title="Semantic Search Engine",
    description="Search across your Chroma database using semantic similarity.",
)

if __name__ == "__main__":
    demo.launch()