Spaces:

tiffany101
/

week14_interactive

Sleeping

File size: 3,410 Bytes

f92c2ba
d74e01c
f92c2ba
d74e01c
f92c2ba
20ef8f2
88a99f5
f92c2ba
d8eb6b3
20ef8f2
d8eb6b3
f92c2ba
 
 
20ef8f2
 
 
 
 
d8eb6b3
f92c2ba
20ef8f2
 
 
 
d74e01c
d8eb6b3
 
 
7c1a3e1
f92c2ba
d74e01c
20ef8f2
7c1a3e1
20ef8f2
7c1a3e1
20ef8f2
5f41904
 
7c1a3e1
20ef8f2
 
5f41904
 
20ef8f2
 
d8eb6b3
20ef8f2
 
d8eb6b3
 
 
 
88a99f5
d8eb6b3
 
88a99f5
d8eb6b3
 
d74e01c
d8eb6b3
20ef8f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8eb6b3
d74e01c
88a99f5
 
d8eb6b3
5f41904
d74e01c
 
d8eb6b3
20ef8f2
d8eb6b3
d74e01c
 
 
 
 
d8eb6b3
d74e01c

from huggingface_hub import hf_hub_download
from chromadb import PersistentClient
from sentence_transformers import SentenceTransformer
import gradio as gr
import os
import zipfile
import shutil

# ==========================
# Step 1 — Fetch the prebuilt ChromaDB archive and unpack it
# ==========================
# Directory the Chroma client is pointed at further down in the script.
persist_dir = "chromadb"
os.makedirs(persist_dir, exist_ok=True)

# The SQLite file doubles as an "already extracted" marker, so the download
# is skipped whenever it is present.
sqlite_marker = os.path.join(persist_dir, "chroma.sqlite3")
database_present = os.path.exists(sqlite_marker)

if not database_present:
    print("📥 Downloading ChromaDB zip from Hugging Face...")
    # Dataset repo that hosts the zipped database.
    db_zip_path = hf_hub_download(
        repo_id="tiffany101/my-chromadb",
        filename="chromadb.zip",
        repo_type="dataset",
    )
    print("✅ Download complete, extracting...")
    with zipfile.ZipFile(db_zip_path, "r") as archive:
        archive.extractall(persist_dir)
    print("✅ Extracted ChromaDB to:", persist_dir)

# ==========================
# Step 2 — Load embedding model
# ==========================
# Loaded exactly once, up front: the same model seeds the fallback collection
# below and embeds incoming queries in semantic_search, so constructing it a
# second time would only repeat the (slow) model initialisation.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
COLLECTION_NAME = "my_collection"

model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# ==========================
# Step 3 — Load Chroma client and collection
# ==========================
print("🚀 Initializing Chroma client...")
client = PersistentClient(path=persist_dir)

# List collections for debugging.
# NOTE(review): depending on the installed Chroma release, list_collections()
# appears to yield either Collection objects or plain name strings — accept
# both so a debug print can never take the app down. Confirm against the
# pinned chromadb version.
collections = client.list_collections()
print("📊 Collections found:", [getattr(c, "name", c) for c in collections])

# Load the collection, or create a small demo one if it cannot be loaded.
try:
    collection = client.get_collection(COLLECTION_NAME)
    print(f"✅ Loaded existing collection: {COLLECTION_NAME}")
except Exception:
    # Deliberately broad: presumably the exception class raised for a missing
    # collection differs between Chroma versions — TODO narrow once the
    # chromadb version is pinned.
    print(f"⚠️ {COLLECTION_NAME} not found, creating demo fallback...")
    collection = client.create_collection(COLLECTION_NAME)

    # Seed the fallback collection with a handful of sample sentences,
    # embedded with the same model that will embed the queries.
    sample_texts = [
        "The Eiffel Tower is one of the most famous landmarks in Paris.",
        "Machine learning enables computers to learn from data.",
        "The stock market rose today amid strong earnings reports.",
        "The football team won the championship game.",
        "Scientists discovered a new planet outside our solar system.",
    ]
    embeddings = model.encode(sample_texts)
    collection.add(
        documents=sample_texts,
        embeddings=embeddings.tolist(),
        ids=[str(i) for i in range(len(sample_texts))],
    )

# ==========================
# Step 4 — Verify collection size
# ==========================
print("🧩 Checking how many documents are stored...")
try:
    # count() asks the store for the record total instead of pulling every
    # record into memory just to measure the list length.
    count = collection.count()
    print(f"✅ Collection contains {count} documents.")
except Exception as e:
    # Best-effort diagnostic only; a failed count must not stop the app.
    print("⚠️ Could not fetch count:", e)

# ==========================
# Step 5 — Define semantic search
# ==========================
def semantic_search(query):
    """Return the stored documents most similar to *query*.

    The query is embedded with the module-level ``model`` and matched against
    the module-level Chroma ``collection``; at most three documents are
    requested.

    Args:
        query: Free-text search string (the value of the Gradio textbox).

    Returns:
        The matching documents joined by blank lines, or a short explanatory
        message when the query is blank or nothing matched.
    """
    # A blank query has no meaning to search for; answer immediately instead
    # of embedding an empty string and returning arbitrary "matches".
    if query is None or not query.strip():
        return "Please enter a search query."

    query_emb = model.encode([query])
    results = collection.query(query_embeddings=query_emb.tolist(), n_results=3)

    # "documents" holds one result list per query embedding; exactly one
    # embedding was sent, so only index 0 is relevant.
    documents = results["documents"]
    if not documents or not documents[0]:
        return "No matching documents found in the ChromaDB."

    # NOTE(review): presumably records stored without text come back as None;
    # skip them so str.join cannot raise TypeError — confirm against the data.
    matches = [doc for doc in documents[0] if doc]
    if not matches:
        return "No matching documents found in the ChromaDB."
    return "\n\n".join(matches)

# ==========================
# Step 6 — Build the Gradio interface and launch it
# ==========================
# One textbox in, one textbox out; semantic_search does the work in between.
query_box = gr.Textbox(label="Enter your search query")
matches_box = gr.Textbox(label="Top Matches")

demo = gr.Interface(
    fn=semantic_search,
    inputs=query_box,
    outputs=matches_box,
    title="Semantic Search Engine",
    description="Search across your Chroma database using semantic similarity.",
)

# Only start the server when the file is run as a script.
if __name__ == "__main__":
    demo.launch()