tiffany101 committed on
Commit
20ef8f2
·
verified ·
1 Parent(s): 8ae4dd9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -40
app.py CHANGED
@@ -3,64 +3,50 @@ from chromadb import PersistentClient
3
  from sentence_transformers import SentenceTransformer
4
  import gradio as gr
5
  import os
 
6
  import shutil
7
 
8
  # ==========================
9
- # Step 1 — Download ChromaDB
10
  # ==========================
11
  persist_dir = "chromadb"
12
  os.makedirs(persist_dir, exist_ok=True)
13
- local_db_path = os.path.join(persist_dir, "chroma.sqlite3")
14
 
15
- if not os.path.exists(local_db_path):
16
- print("📥 Downloading ChromaDB from Hugging Face Dataset...")
17
- downloaded_db = hf_hub_download(
18
- repo_id="tiffany101/my-chromadb", # your dataset repo
19
- filename="chroma.sqlite3",
20
  repo_type="dataset"
21
  )
22
- shutil.copy(downloaded_db, local_db_path)
23
- print(f"✅ Copied DB to {local_db_path}")
24
- else:
25
- print("✅ Found local ChromaDB file, skipping download.")
26
 
27
  # ==========================
28
  # Step 2 — Load Chroma client
29
  # ==========================
30
  print("🚀 Initializing Chroma client...")
31
  client = PersistentClient(path=persist_dir)
32
- model = SentenceTransformer("all-MiniLM-L6-v2")
33
 
34
- # ==========================
35
- # Debug — List all collections
36
- # ==========================
37
- print("📊 Checking available collections...")
38
  collections = client.list_collections()
39
- if collections:
40
- print("✅ Found the following collections:")
41
- for c in collections:
42
- print(f" • {c.name}")
43
- else:
44
- print("⚠️ No collections found in this database!")
45
 
46
- # Try to load or create collection
47
  try:
48
  collection = client.get_collection("my_collection")
49
  print("✅ Loaded existing collection: my_collection")
50
-
51
- # ✅ Check how many documents are stored
52
- print("🧩 Checking how many documents are stored...")
53
- count = len(collection.get()["ids"])
54
- print(f"✅ Collection contains {count} documents.")
55
-
56
- except Exception as e:
57
- print(f"⚠️ Collection 'my_collection' not found ({e}), creating fallback...")
58
  collection = client.create_collection("my_collection")
59
 
60
- # Add fallback data for demo
 
61
  sample_texts = [
62
- "The Eiffel Tower is a famous landmark in Paris.",
63
- "Machine learning helps computers learn from data.",
64
  "The stock market rose today amid strong earnings reports.",
65
  "The football team won the championship game.",
66
  "Scientists discovered a new planet outside our solar system."
@@ -71,23 +57,34 @@ except Exception as e:
71
  embeddings=embeddings.tolist(),
72
  ids=[str(i) for i in range(len(sample_texts))]
73
  )
74
- print("✅ Added fallback demo data to new collection.")
75
 
76
  # ==========================
77
- # Step 3 — Define search
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  # ==========================
79
  def semantic_search(query):
80
- print(f"🔍 Received query: {query}")
81
  query_emb = model.encode([query])
82
  results = collection.query(query_embeddings=query_emb.tolist(), n_results=3)
83
  if not results["documents"] or len(results["documents"][0]) == 0:
84
- print("⚠️ No matching documents found.")
85
  return "No matching documents found in the ChromaDB."
86
- print(f"✅ Found {len(results['documents'][0])} results.")
87
  return "\n\n".join(results["documents"][0])
88
 
89
  # ==========================
90
- # Step 4 — Launch Gradio app
91
  # ==========================
92
  demo = gr.Interface(
93
  fn=semantic_search,
 
3
  from sentence_transformers import SentenceTransformer
4
  import gradio as gr
5
  import os
6
+ import zipfile
7
  import shutil
8
 
9
  # ==========================
10
+ # Step 1 — Download and unzip ChromaDB
11
  # ==========================
12
  persist_dir = "chromadb"
13
  os.makedirs(persist_dir, exist_ok=True)
 
14
 
15
+ if not os.path.exists(os.path.join(persist_dir, "chroma.sqlite3")):
16
+ print("📥 Downloading ChromaDB zip from Hugging Face...")
17
+ db_zip_path = hf_hub_download(
18
+ repo_id="tiffany101/my-chromadb", # your dataset repo
19
+ filename="chromadb.zip",
20
  repo_type="dataset"
21
  )
22
+ print("✅ Download complete, extracting...")
23
+ with zipfile.ZipFile(db_zip_path, "r") as zip_ref:
24
+ zip_ref.extractall(persist_dir)
25
+ print("✅ Extracted ChromaDB to:", persist_dir)
26
 
27
  # ==========================
28
  # Step 2 — Load Chroma client
29
  # ==========================
30
  print("🚀 Initializing Chroma client...")
31
  client = PersistentClient(path=persist_dir)
 
32
 
33
+ # List collections for debugging
 
 
 
34
  collections = client.list_collections()
35
+ print("📊 Collections found:", [c.name for c in collections])
 
 
 
 
 
36
 
37
+ # Load or create fallback collection
38
  try:
39
  collection = client.get_collection("my_collection")
40
  print("✅ Loaded existing collection: my_collection")
41
+ except Exception:
42
+ print("⚠️ my_collection not found, creating demo fallback...")
 
 
 
 
 
 
43
  collection = client.create_collection("my_collection")
44
 
45
+ # Add sample fallback data
46
+ model = SentenceTransformer("all-MiniLM-L6-v2")
47
  sample_texts = [
48
+ "The Eiffel Tower is one of the most famous landmarks in Paris.",
49
+ "Machine learning enables computers to learn from data.",
50
  "The stock market rose today amid strong earnings reports.",
51
  "The football team won the championship game.",
52
  "Scientists discovered a new planet outside our solar system."
 
57
  embeddings=embeddings.tolist(),
58
  ids=[str(i) for i in range(len(sample_texts))]
59
  )
 
60
 
61
  # ==========================
62
+ # Step 3 — Verify collection size
63
+ # ==========================
64
+ print("🧩 Checking how many documents are stored...")
65
+ try:
66
+ count = len(collection.get()["ids"])
67
+ print(f"✅ Collection contains {count} documents.")
68
+ except Exception as e:
69
+ print("⚠️ Could not fetch count:", e)
70
+
71
+ # ==========================
72
+ # Step 4 — Load embedding model
73
+ # ==========================
74
+ model = SentenceTransformer("all-MiniLM-L6-v2")
75
+
76
+ # ==========================
77
+ # Step 5 — Define semantic search
78
  # ==========================
79
  def semantic_search(query):
 
80
  query_emb = model.encode([query])
81
  results = collection.query(query_embeddings=query_emb.tolist(), n_results=3)
82
  if not results["documents"] or len(results["documents"][0]) == 0:
 
83
  return "No matching documents found in the ChromaDB."
 
84
  return "\n\n".join(results["documents"][0])
85
 
86
  # ==========================
87
+ # Step 6 — Launch Gradio app
88
  # ==========================
89
  demo = gr.Interface(
90
  fn=semantic_search,