maiseumsChat / vector_store.py
NitinMoturu's picture
Rename vectore_store.py to vector_store.py
62cc70c verified
from chromadb import PersistentClient
from dataset_loader import load_all_json
from embedding_utils import get_embedding
# Module-wide Chroma client created with the relative path "chroma_db"
# (resolved against the process's working directory).
client = PersistentClient(path="chroma_db")
# Handle to the "museum_data" collection. Stays None until
# init_vector_store() assigns it.
collection = None
def init_vector_store(batch_size: int = 10) -> None:
    """Open the ``museum_data`` collection and populate it if it is empty.

    The module-level ``collection`` global is (re)bound to the collection
    returned by ``client.get_or_create_collection("museum_data")``. When that
    collection holds no records, every row of the frame returned by
    ``load_all_json()`` is embedded with ``get_embedding`` and added in
    batches; otherwise the existing data is left untouched.

    Args:
        batch_size: Number of rows embedded and added per ``collection.add``
            call. Defaults to 10, the value that was previously hard-coded.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    global collection

    # Reject a zero or negative batch size up front: range() would raise an
    # unrelated error for 0 and silently ingest nothing for negatives.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    collection = client.get_or_create_collection("museum_data")

    # Only ingest when the collection is empty.
    # NOTE(review): an ingest that is interrupted part-way leaves a non-empty
    # collection, which later calls will treat as fully initialised.
    existing = collection.count()
    if existing:
        print(f"Vector store already exists with {existing} documents")
        return

    print("Initializing vector store with data...")
    df = load_all_json()

    # Fall back to the first 50 characters of the text when no title column
    # is present.
    if "title" not in df.columns:
        df["title"] = df["text"].str[:50]

    # Work in small batches so only one batch of embeddings is alive at once.
    for start in range(0, len(df), batch_size):
        batch = df.iloc[start:start + batch_size]  # positional row slice
        documents = batch["text"].tolist()
        embeddings = [get_embedding(text) for text in documents]
        collection.add(
            # ids are the stringified row positions within the full frame.
            ids=[str(pos) for pos in range(start, start + len(documents))],
            documents=documents,
            embeddings=embeddings,
            metadatas=[{"title": title} for title in batch["title"].tolist()],
        )
        # Drop references so the batch can be reclaimed before the next one.
        del batch, embeddings

    print(f"Vector store initialized with {collection.count()} documents")
def query_vector_store(query_text, n_results=5):
    """Return the stored documents closest to *query_text*, newline-joined.

    The query is embedded with ``get_embedding`` — the same function used to
    embed the documents when they were added — so query and document vectors
    come from the same model. Passing raw ``query_texts`` instead would make
    Chroma embed the query with the collection's own embedding function,
    which is not what the stored vectors were produced with.

    Args:
        query_text: Text to search for.
        n_results: Maximum number of documents to return. Defaults to 5.

    Returns:
        The matching documents joined with ``"\\n"``.
    """
    # The module-level handle is None until init_vector_store() has run;
    # open (and, if empty, populate) the collection on first use instead of
    # failing with an AttributeError on None.
    if collection is None:
        init_vector_store()

    results = collection.query(
        query_embeddings=[get_embedding(query_text)],
        n_results=n_results,
    )
    # One query was sent, so the first (only) result list holds its matches.
    return "\n".join(results["documents"][0])