NitinMoturu committed on
Commit
f02ba19
·
verified ·
1 Parent(s): 21f3194

Create vectore_store.py

Browse files
Files changed (1) hide show
  1. vectore_store.py +51 -0
vectore_store.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from chromadb import PersistentClient
2
+ from dataset_loader import load_all_json
3
+ from embedding_utils import get_embedding
4
+
5
# Chroma client created once at import time with path "chroma_db".
# NOTE(review): presumably "chroma_db" is the on-disk persistence directory,
# resolved relative to the working directory — confirm against chromadb docs.
+ client = PersistentClient(path="chroma_db")
6
# Module-level collection handle. It stays None until init_vector_store()
# assigns it; query_vector_store() reads it.
+ collection = None
7
+
8
def init_vector_store(batch_size: int = 10) -> None:
    """Open the "museum_data" collection and populate it if it is empty.

    The module-level ``collection`` is bound to the collection returned by
    ``client.get_or_create_collection``. When that collection holds no
    documents, every row produced by ``load_all_json()`` is embedded with
    ``get_embedding`` and added in batches, using the row's position as its
    id and its ``title`` as metadata.

    Args:
        batch_size: Number of rows embedded and added per ``collection.add``
            call. Must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    global collection

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    collection = client.get_or_create_collection("museum_data")

    existing = collection.count()
    if existing:
        print(f"Vector store already exists with {existing} documents")
        return

    print("Initializing vector store with data...")
    df = load_all_json()

    # Fall back to the first 50 characters of the text when no title exists.
    if "title" not in df.columns:
        df["title"] = df["text"].str[:50]

    try:
        for start in range(0, len(df), batch_size):
            # iloc makes the slice explicitly positional, matching the
            # positional ids generated below regardless of the frame's index.
            batch = df.iloc[start:start + batch_size]
            documents = batch["text"].tolist()

            collection.add(
                ids=[str(start + offset) for offset in range(len(documents))],
                documents=documents,
                embeddings=[get_embedding(text) for text in documents],
                metadatas=[{"title": title} for title in batch["title"].tolist()],
            )
    except BaseException:
        # The "is it empty?" check above is the only guard, so a partially
        # filled collection would be treated as complete on every later run.
        # Drop it (also on Ctrl-C) so the next call starts from scratch, then
        # let the original error propagate.
        client.delete_collection("museum_data")
        collection = None
        raise

    print(f"Vector store initialized with {collection.count()} documents")
45
+
46
def query_vector_store(query_text, n_results=5):
    """Return the stored documents most similar to ``query_text``.

    The query is embedded with ``get_embedding`` — the same function used
    when the documents were added — so that query and document vectors live
    in the same embedding space. Passing ``query_texts`` instead would route
    the query through the collection's own embedding function, which is not
    the one the stored vectors were produced with.

    Args:
        query_text: Free-text query.
        n_results: Maximum number of documents to return (default 5).

    Returns:
        The matching documents' text joined by newlines.
    """
    # The collection handle is only bound by init_vector_store(); initialise
    # on first use instead of failing with an AttributeError on None.
    if collection is None:
        init_vector_store()

    results = collection.query(
        query_embeddings=[get_embedding(query_text)],
        n_results=n_results,
    )
    # One query was submitted, so the first (only) result list is ours.
    return "\n".join(results["documents"][0])