# app.py — Hugging Face Space by tiffany101 (revision 20ef8f2)
from huggingface_hub import hf_hub_download
from chromadb import PersistentClient
from sentence_transformers import SentenceTransformer
import gradio as gr
import os
import zipfile
import shutil
# ==========================
# Step 1 — Download and unzip ChromaDB
# ==========================
# Fetch a pre-built Chroma database (zipped) from a HF dataset repo on
# first boot; skip the download when the sqlite file already exists from
# a previous run of this Space.
persist_dir = "chromadb"
os.makedirs(persist_dir, exist_ok=True)
if not os.path.exists(os.path.join(persist_dir, "chroma.sqlite3")):
    print("📥 Downloading ChromaDB zip from Hugging Face...")
    db_zip_path = hf_hub_download(
        repo_id="tiffany101/my-chromadb",  # dataset repo holding the zipped DB
        filename="chromadb.zip",
        repo_type="dataset",
    )
    print("✅ Download complete, extracting...")
    # NOTE(review): assumes chromadb.zip contains chroma.sqlite3 at its root;
    # if the archive nests a top-level folder, the existence check above will
    # never pass and the zip re-downloads every boot — confirm archive layout.
    with zipfile.ZipFile(db_zip_path, "r") as zip_ref:
        zip_ref.extractall(persist_dir)
    print("✅ Extracted ChromaDB to:", persist_dir)
# ==========================
# Step 2 — Load Chroma client
# ==========================
print("🚀 Initializing Chroma client...")
client = PersistentClient(path=persist_dir)

# List collections for debugging
collections = client.list_collections()
print("📊 Collections found:", [c.name for c in collections])

# Load the expected collection; if it is missing (e.g. the zip did not
# contain it), build a small demo collection so the app still works
# end to end instead of crashing at query time.
try:
    collection = client.get_collection("my_collection")
    print("✅ Loaded existing collection: my_collection")
except Exception:
    print("⚠️ my_collection not found, creating demo fallback...")
    collection = client.create_collection("my_collection")
    # Add sample fallback data, embedded with the same model used for
    # querying later so similarity scores are meaningful.
    model = SentenceTransformer("all-MiniLM-L6-v2")
    sample_texts = [
        "The Eiffel Tower is one of the most famous landmarks in Paris.",
        "Machine learning enables computers to learn from data.",
        "The stock market rose today amid strong earnings reports.",
        "The football team won the championship game.",
        "Scientists discovered a new planet outside our solar system.",
    ]
    embeddings = model.encode(sample_texts)
    collection.add(
        documents=sample_texts,
        embeddings=embeddings.tolist(),
        ids=[str(i) for i in range(len(sample_texts))],
    )
# ==========================
# Step 3 — Verify collection size
# ==========================
print("🧩 Checking how many documents are stored...")
try:
    # collection.count() asks Chroma for the size directly instead of
    # pulling every stored id back with .get() just to measure its length.
    count = collection.count()
    print(f"✅ Collection contains {count} documents.")
except Exception as e:
    # Best-effort diagnostic only — a failed count should not stop the app.
    print("⚠️ Could not fetch count:", e)
# ==========================
# Step 4 — Load embedding model
# ==========================
# Query-time encoder; must match the model that produced the stored
# embeddings for similarity search to be meaningful.
# NOTE(review): this re-instantiates the model even when the fallback
# branch in Step 2 already loaded it — harmless, but slows cold start.
model = SentenceTransformer("all-MiniLM-L6-v2")
# ==========================
# Step 5 — Define semantic search
# ==========================
def semantic_search(query):
    """Return the top-3 semantically closest stored documents for *query*.

    Encodes the query with the module-level SentenceTransformer, runs a
    nearest-neighbour search against the Chroma collection, and joins the
    matching documents into one newline-separated string for display.
    """
    # Guard: don't embed an empty/whitespace-only query from the UI.
    if not query or not query.strip():
        return "No matching documents found in the ChromaDB."
    query_emb = model.encode([query])
    results = collection.query(query_embeddings=query_emb.tolist(), n_results=3)
    # Tolerate a missing or empty "documents" entry in the result dict.
    docs = results.get("documents")
    if not docs or not docs[0]:
        return "No matching documents found in the ChromaDB."
    return "\n\n".join(docs[0])
# ==========================
# Step 6 — Launch Gradio app
# ==========================
# Single-input/single-output UI wired to semantic_search above.
demo = gr.Interface(
    fn=semantic_search,
    inputs=gr.Textbox(label="Enter your search query"),
    outputs=gr.Textbox(label="Top Matches"),
    title="Semantic Search Engine",
    description="Search across your Chroma database using semantic similarity.",
)

if __name__ == "__main__":
    demo.launch()