Spaces:

NitinMoturu
/

maiseumsChat

Sleeping

App Files Files Community

NitinMoturu committed on Aug 27, 2025

Commit

f02ba19

verified ·

1 Parent(s): 21f3194

Create vectore_store.py

Browse files

Files changed (1) hide show

vectore_store.py +51 -0

vectore_store.py ADDED Viewed

	@@ -0,0 +1,51 @@

+from chromadb import PersistentClient
+from dataset_loader import load_all_json
+from embedding_utils import get_embedding
+# Shared Chroma client, created at import time with the storage path "chroma_db".
+client = PersistentClient(path="chroma_db")
+# Handle to the "museum_data" collection; stays None until init_vector_store() binds it.
+collection = None
+def init_vector_store():
+    global collection
+    # Check if collection already exists with data
+    collection = client.get_or_create_collection("museum_data")
+    # Only initialize data if collection is empty
+    if collection.count() == 0:
+        print("Initializing vector store with data...")
+        df = load_all_json()
+        # Handle cases where 'title' column might be missing
+        if "title" not in df.columns:
+            df["title"] = df["text"].str[:50]  # use first 50 chars of text
+        # Process in smaller batches to save memory
+        batch_size = 10
+        for i in range(0, len(df), batch_size):
+            batch = df[i:i + batch_size]
+            ids = [str(j) for j in range(i, min(i + batch_size, len(df)))]
+            documents = batch["text"].tolist()
+            embeddings = [get_embedding(text) for text in documents]
+            metadatas = [{"title": title} for title in batch["title"].tolist()]
+            collection.add(
+                ids=ids,
+                documents=documents,
+                embeddings=embeddings,
+                metadatas=metadatas
+            )
+            # Clear memory after each batch
+            del batch, embeddings
+        print(f"Vector store initialized with {collection.count()} documents")
+    else:
+        print(f"Vector store already exists with {collection.count()} documents")
+def query_vector_store(query_text):
+    results = collection.query(
+        query_texts=[query_text],
+        n_results=5
+    )
+    return "\n".join(results["documents"][0])