from chromadb import PersistentClient

from dataset_loader import load_all_json
from embedding_utils import get_embedding

# Persistent on-disk Chroma client; the index survives process restarts.
client = PersistentClient(path="chroma_db")

# Set by init_vector_store(); None until initialization has run.
collection = None


def init_vector_store():
    """Create (or reuse) the 'museum_data' collection and populate it once.

    The JSON dataset is loaded and embedded only when the collection is
    empty, so repeated startups reuse the persisted index. Documents are
    embedded and inserted in small batches to bound peak memory usage.
    """
    global collection
    collection = client.get_or_create_collection("museum_data")

    if collection.count() > 0:
        print(f"Vector store already exists with {collection.count()} documents")
        return

    print("Initializing vector store with data...")
    df = load_all_json()

    # Fall back to the first 50 chars of the text when the dataset
    # lacks a 'title' column.
    if "title" not in df.columns:
        df["title"] = df["text"].str[:50]

    batch_size = 10
    for start in range(0, len(df), batch_size):
        batch = df[start:start + batch_size]
        documents = batch["text"].tolist()
        collection.add(
            # Stable string ids matching each row's position in df.
            ids=[str(j) for j in range(start, start + len(batch))],
            documents=documents,
            embeddings=[get_embedding(text) for text in documents],
            metadatas=[{"title": title} for title in batch["title"].tolist()],
        )

    print(f"Vector store initialized with {collection.count()} documents")


def query_vector_store(query_text):
    """Return the top-5 documents matching *query_text*, newline-joined.

    Raises:
        RuntimeError: if init_vector_store() has not been called yet.
    """
    if collection is None:
        raise RuntimeError(
            "Vector store not initialized; call init_vector_store() first"
        )
    # BUG FIX: documents were indexed with get_embedding(), so the query
    # must be embedded with the same function. Passing query_texts would
    # make Chroma use its default embedder, producing vectors in a
    # different embedding space than the stored documents.
    results = collection.query(
        query_embeddings=[get_embedding(query_text)],
        n_results=5,
    )
    return "\n".join(results["documents"][0])