# app.py — Hugging Face Space by tiffany101 (revision 20ef8f2)
from huggingface_hub import hf_hub_download
from chromadb import PersistentClient
from sentence_transformers import SentenceTransformer
import gradio as gr
import os
import zipfile
import shutil
# ==========================
# Step 1 — Download and unzip ChromaDB
# ==========================
# Fetch a pre-built Chroma database (zipped) from a HF dataset repo on
# first boot; skip the download when the sqlite file already exists from
# a previous run of this Space.
persist_dir = "chromadb"
os.makedirs(persist_dir, exist_ok=True)
if not os.path.exists(os.path.join(persist_dir, "chroma.sqlite3")):
    print("📥 Downloading ChromaDB zip from Hugging Face...")
    db_zip_path = hf_hub_download(
        repo_id="tiffany101/my-chromadb",  # dataset repo holding the zipped DB
        filename="chromadb.zip",
        repo_type="dataset",
    )
    print("✅ Download complete, extracting...")
    # NOTE(review): assumes chromadb.zip contains chroma.sqlite3 at its root;
    # if the archive nests a top-level folder, the existence check above will
    # never pass and the zip re-downloads every boot — confirm archive layout.
    with zipfile.ZipFile(db_zip_path, "r") as zip_ref:
        zip_ref.extractall(persist_dir)
    print("✅ Extracted ChromaDB to:", persist_dir)
# ==========================
# Step 2 — Load Chroma client
# ==========================
print("🚀 Initializing Chroma client...")
client = PersistentClient(path=persist_dir)

# List collections for debugging
collections = client.list_collections()
print("📊 Collections found:", [c.name for c in collections])

# Load the expected collection; if it is missing (e.g. the zip did not
# contain it), build a small demo collection so the app still works
# end to end instead of crashing at query time.
try:
    collection = client.get_collection("my_collection")
    print("✅ Loaded existing collection: my_collection")
except Exception:
    print("⚠️ my_collection not found, creating demo fallback...")
    collection = client.create_collection("my_collection")
    # Add sample fallback data, embedded with the same model used for
    # querying later so similarity scores are meaningful.
    model = SentenceTransformer("all-MiniLM-L6-v2")
    sample_texts = [
        "The Eiffel Tower is one of the most famous landmarks in Paris.",
        "Machine learning enables computers to learn from data.",
        "The stock market rose today amid strong earnings reports.",
        "The football team won the championship game.",
        "Scientists discovered a new planet outside our solar system.",
    ]
    embeddings = model.encode(sample_texts)
    collection.add(
        documents=sample_texts,
        embeddings=embeddings.tolist(),
        ids=[str(i) for i in range(len(sample_texts))],
    )
# ==========================
# Step 3 — Verify collection size
# ==========================
print("🧩 Checking how many documents are stored...")
try:
    # collection.count() asks Chroma for the size directly instead of
    # pulling every stored id back with .get() just to measure its length.
    count = collection.count()
    print(f"✅ Collection contains {count} documents.")
except Exception as e:
    # Best-effort diagnostic only — a failed count should not stop the app.
    print("⚠️ Could not fetch count:", e)
# ==========================
# Step 4 — Load embedding model
# ==========================
# Query-time encoder; must match the model that produced the stored
# embeddings for similarity search to be meaningful.
# NOTE(review): this re-instantiates the model even when the fallback
# branch in Step 2 already loaded it — harmless, but slows cold start.
model = SentenceTransformer("all-MiniLM-L6-v2")
# ==========================
# Step 5 — Define semantic search
# ==========================
def semantic_search(query):
    """Return the top-3 semantically closest stored documents for *query*.

    Encodes the query with the module-level SentenceTransformer, runs a
    nearest-neighbour search against the Chroma collection, and joins the
    matching documents into one newline-separated string for display.
    """
    # Guard: don't embed an empty/whitespace-only query from the UI.
    if not query or not query.strip():
        return "No matching documents found in the ChromaDB."
    query_emb = model.encode([query])
    results = collection.query(query_embeddings=query_emb.tolist(), n_results=3)
    # Tolerate a missing or empty "documents" entry in the result dict.
    docs = results.get("documents")
    if not docs or not docs[0]:
        return "No matching documents found in the ChromaDB."
    return "\n\n".join(docs[0])
# ==========================
# Step 6 — Launch Gradio app
# ==========================
# Single-input/single-output UI wired to semantic_search above.
demo = gr.Interface(
    fn=semantic_search,
    inputs=gr.Textbox(label="Enter your search query"),
    outputs=gr.Textbox(label="Top Matches"),
    title="Semantic Search Engine",
    description="Search across your Chroma database using semantic similarity.",
)

if __name__ == "__main__":
    demo.launch()