# Hugging Face Space application (scraped page header read: "Spaces: Sleeping")
import os
import shutil  # NOTE(review): currently unused here; kept in case later steps rely on it
import zipfile

import gradio as gr
from chromadb import PersistentClient
from huggingface_hub import hf_hub_download
from sentence_transformers import SentenceTransformer

# ==========================
# Step 1 - Download and unzip ChromaDB
# ==========================
persist_dir = "chromadb"
os.makedirs(persist_dir, exist_ok=True)

# Only fetch the packed database when it has not already been extracted;
# the presence of chroma.sqlite3 marks a previously completed extraction.
if not os.path.exists(os.path.join(persist_dir, "chroma.sqlite3")):
    print("Downloading ChromaDB zip from Hugging Face...")
    db_zip_path = hf_hub_download(
        repo_id="tiffany101/my-chromadb",  # dataset repo holding the packed DB
        filename="chromadb.zip",
        repo_type="dataset",
    )
    print("Download complete, extracting...")
    with zipfile.ZipFile(db_zip_path, "r") as zip_ref:
        zip_ref.extractall(persist_dir)
    print("Extracted ChromaDB to:", persist_dir)
# ==========================
# Step 2 - Load Chroma client
# ==========================
print("Initializing Chroma client...")
client = PersistentClient(path=persist_dir)

# List collections for debugging.
collections = client.list_collections()
print("Collections found:", [c.name for c in collections])

# Load the expected collection; if the downloaded DB did not contain it,
# create a tiny in-place demo collection so the app still functions.
try:
    collection = client.get_collection("my_collection")
    print("Loaded existing collection: my_collection")
except Exception:
    print("WARNING: my_collection not found, creating demo fallback...")
    collection = client.create_collection("my_collection")

    # Embed a handful of sample sentences as fallback data.
    model = SentenceTransformer("all-MiniLM-L6-v2")
    sample_texts = [
        "The Eiffel Tower is one of the most famous landmarks in Paris.",
        "Machine learning enables computers to learn from data.",
        "The stock market rose today amid strong earnings reports.",
        "The football team won the championship game.",
        "Scientists discovered a new planet outside our solar system.",
    ]
    embeddings = model.encode(sample_texts)
    collection.add(
        documents=sample_texts,
        embeddings=embeddings.tolist(),
        ids=[str(i) for i in range(len(sample_texts))],
    )
# ==========================
# Step 3 - Verify collection size
# ==========================
print("Checking how many documents are stored...")
try:
    # collection.count() asks Chroma for the size directly instead of
    # fetching every stored document just to count its ids.
    count = collection.count()
    print(f"Collection contains {count} documents.")
except Exception as e:
    print("WARNING: Could not fetch count:", e)

# ==========================
# Step 4 - Load embedding model
# ==========================
# Query-time embedder; assumed to be the same model that built the stored
# index (all-MiniLM-L6-v2) -- TODO confirm against the dataset repo.
model = SentenceTransformer("all-MiniLM-L6-v2")
| # ========================== | |
| # Step 5 β Define semantic search | |
| # ========================== | |
def semantic_search(query, n_results=3):
    """Return the top matching documents for *query*, blank-line separated.

    Args:
        query: Free-text search string to embed and match.
        n_results: Maximum number of documents to retrieve (default 3,
            matching the original hard-coded behavior).

    Returns:
        The matched documents joined by blank lines, or a fallback
        message when nothing matches.
    """
    query_emb = model.encode([query])
    results = collection.query(
        query_embeddings=query_emb.tolist(), n_results=n_results
    )
    # .get() guards against a missing "documents" key as well as an
    # empty result list, instead of risking a KeyError.
    docs = results.get("documents")
    if not docs or not docs[0]:
        return "No matching documents found in the ChromaDB."
    return "\n\n".join(docs[0])
# ==========================
# Step 6 - Launch Gradio app
# ==========================
# Single text box in, single text box out, wired to the search function.
query_box = gr.Textbox(label="Enter your search query")
matches_box = gr.Textbox(label="Top Matches")

demo = gr.Interface(
    fn=semantic_search,
    inputs=query_box,
    outputs=matches_box,
    title="Semantic Search Engine",
    description="Search across your Chroma database using semantic similarity.",
)

if __name__ == "__main__":
    demo.launch()