Spaces:
Sleeping
Sleeping
File size: 3,410 Bytes
f92c2ba d74e01c f92c2ba d74e01c f92c2ba 20ef8f2 88a99f5 f92c2ba d8eb6b3 20ef8f2 d8eb6b3 f92c2ba 20ef8f2 d8eb6b3 f92c2ba 20ef8f2 d74e01c d8eb6b3 7c1a3e1 f92c2ba d74e01c 20ef8f2 7c1a3e1 20ef8f2 7c1a3e1 20ef8f2 5f41904 7c1a3e1 20ef8f2 5f41904 20ef8f2 d8eb6b3 20ef8f2 d8eb6b3 88a99f5 d8eb6b3 88a99f5 d8eb6b3 d74e01c d8eb6b3 20ef8f2 d8eb6b3 d74e01c 88a99f5 d8eb6b3 5f41904 d74e01c d8eb6b3 20ef8f2 d8eb6b3 d74e01c d8eb6b3 d74e01c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
from huggingface_hub import hf_hub_download
from chromadb import PersistentClient
from sentence_transformers import SentenceTransformer
import gradio as gr
import os
import zipfile
import shutil
# ==========================
# Step 1 - Download and unzip ChromaDB
# ==========================
# Local directory that will hold the persisted Chroma database files.
persist_dir = "chromadb"
os.makedirs(persist_dir, exist_ok=True)

# Only download when the SQLite store is not already present, so the
# zip is not re-fetched on every restart of the Space.
if not os.path.exists(os.path.join(persist_dir, "chroma.sqlite3")):
    print("Downloading ChromaDB zip from Hugging Face...")
    db_zip_path = hf_hub_download(
        repo_id="tiffany101/my-chromadb",  # dataset repo holding the zipped DB
        filename="chromadb.zip",
        repo_type="dataset",
    )
    print("Download complete, extracting...")
    with zipfile.ZipFile(db_zip_path, "r") as zip_ref:
        zip_ref.extractall(persist_dir)
    print("Extracted ChromaDB to:", persist_dir)
# ==========================
# Step 2 - Load Chroma client
# ==========================
print("Initializing Chroma client...")
client = PersistentClient(path=persist_dir)

# List collections for debugging.
collections = client.list_collections()
print("Collections found:", [c.name for c in collections])

# Load the expected collection, or build a small demo fallback so the
# app still works when the downloaded DB is missing or empty.
try:
    collection = client.get_collection("my_collection")
    print("Loaded existing collection: my_collection")
except Exception:
    print("my_collection not found, creating demo fallback...")
    collection = client.create_collection("my_collection")
    # Add sample fallback data so searches return something meaningful.
    model = SentenceTransformer("all-MiniLM-L6-v2")
    sample_texts = [
        "The Eiffel Tower is one of the most famous landmarks in Paris.",
        "Machine learning enables computers to learn from data.",
        "The stock market rose today amid strong earnings reports.",
        "The football team won the championship game.",
        "Scientists discovered a new planet outside our solar system.",
    ]
    embeddings = model.encode(sample_texts)
    collection.add(
        documents=sample_texts,
        embeddings=embeddings.tolist(),
        ids=[str(i) for i in range(len(sample_texts))],
    )
# ==========================
# Step 3 - Verify collection size
# ==========================
print("Checking how many documents are stored...")
try:
    # collection.get() with no filter returns every stored id; its length
    # is the document count. Wrapped in try/except so a corrupt or empty
    # DB only degrades the log line instead of crashing startup.
    count = len(collection.get()["ids"])
    print(f"Collection contains {count} documents.")
except Exception as e:
    print("Could not fetch count:", e)
# ==========================
# Step 4 - Load embedding model
# ==========================
# Module-level encoder used by semantic_search for query embeddings.
# NOTE(review): if the Step-2 fallback branch ran, this loads the same
# model a second time — harmless but redundant.
model = SentenceTransformer("all-MiniLM-L6-v2")
# ==========================
# Step 5 - Define semantic search
# ==========================
def semantic_search(query, n_results=3):
    """Return the top matching documents for *query*.

    Args:
        query: Free-text search string typed by the user.
        n_results: Maximum number of documents to retrieve (default 3;
            parameterized instead of hard-coded so callers can widen the
            result set without changing this function).

    Returns:
        The matching documents joined by blank lines, or a fallback
        message when the collection returns no documents.
    """
    query_emb = model.encode([query])
    results = collection.query(query_embeddings=query_emb.tolist(), n_results=n_results)
    # Newer chromadb versions may return None instead of an empty list
    # for "documents"; normalize before indexing.
    docs = results.get("documents") or []
    if not docs or len(docs[0]) == 0:
        return "No matching documents found in the ChromaDB."
    return "\n\n".join(docs[0])
# ==========================
# Step 6 - Launch Gradio app
# ==========================
# Simple one-box UI: a query textbox in, the joined top matches out.
demo = gr.Interface(
    fn=semantic_search,
    inputs=gr.Textbox(label="Enter your search query"),
    outputs=gr.Textbox(label="Top Matches"),
    title="Semantic Search Engine",
    description="Search across your Chroma database using semantic similarity.",
)

if __name__ == "__main__":
    demo.launch()
|