Spaces:

NitinMoturu
/

maiseumsChat

Sleeping

maiseumsChat / vector_store.py

Rename vectore_store.py to vector_store.py

62cc70c verified 8 months ago

1.78 kB

	from chromadb import PersistentClient
	from dataset_loader import load_all_json
	from embedding_utils import get_embedding

	# Module-level Chroma client, created as a side effect of importing this
	# module; it is opened on the relative path "chroma_db".
	client = PersistentClient(path="chroma_db")
	# Handle to the "museum_data" collection. Stays None until
	# init_vector_store() rebinds it; query_vector_store() reads it.
	collection = None

	def init_vector_store(batch_size=10):
	    """Bind the "museum_data" collection and populate it when it is empty.

	    Rebinds the module-level ``collection`` to the Chroma collection named
	    "museum_data" (created when absent). When that collection holds no
	    records, every row returned by ``load_all_json()`` is embedded with
	    ``get_embedding`` and added in batches; otherwise the stored data is
	    left untouched.

	    Args:
	        batch_size: Number of rows embedded and written per
	            ``collection.add`` call. Defaults to 10, the value that was
	            previously hard-coded.

	    Raises:
	        ValueError: If ``batch_size`` is smaller than 1.

	    NOTE(review): population is skipped whenever the collection is
	    non-empty, so a run interrupted part-way leaves a partial store that
	    later calls will not complete -- confirm whether that is acceptable.
	    """
	    global collection

	    # A zero step makes range() raise an opaque error and a negative step
	    # silently loads nothing, so reject both up front.
	    if batch_size < 1:
	        raise ValueError(
	            f"batch_size must be a positive integer, got {batch_size!r}"
	        )

	    collection = client.get_or_create_collection("museum_data")

	    # Guard clause: never re-embed when data is already stored.
	    if collection.count() != 0:
	        print(f"Vector store already exists with {collection.count()} documents")
	        return

	    print("Initializing vector store with data...")
	    df = load_all_json()

	    # Without a title column, use the first 50 characters of the text.
	    if "title" not in df.columns:
	        df["title"] = df["text"].str[:50]

	    total = len(df)
	    # Small batches keep only a few embeddings in memory at a time.
	    for start in range(0, total, batch_size):
	        stop = min(start + batch_size, total)
	        batch = df[start:stop]
	        documents = batch["text"].tolist()

	        collection.add(
	            # Ids are the rows' positions in the DataFrame, as strings.
	            ids=[str(pos) for pos in range(start, stop)],
	            documents=documents,
	            embeddings=[get_embedding(text) for text in documents],
	            metadatas=[{"title": title} for title in batch["title"].tolist()],
	        )

	    print(f"Vector store initialized with {collection.count()} documents")

	def query_vector_store(query_text, n_results=5):
	    """Return the stored documents closest to ``query_text``, one per line.

	    The query is embedded with ``get_embedding`` -- the same function
	    ``init_vector_store`` uses for the stored documents -- so query and
	    document vectors come from the same source. Passing ``query_texts``
	    instead would make Chroma embed the query with the collection's own
	    embedding function.

	    Args:
	        query_text: Text to search for.
	        n_results: Maximum number of documents to return. Defaults to 5,
	            the value that was previously hard-coded.

	    Returns:
	        The matching documents joined with newline characters, in the
	        order the collection returned them.
	    """
	    # Populate the module-level collection on first use instead of failing
	    # with AttributeError on None.
	    if collection is None:
	        init_vector_store()

	    results = collection.query(
	        query_embeddings=[get_embedding(query_text)],
	        n_results=n_results,
	    )
	    # One query was sent, so its matches are the first (and only) entry.
	    return "\n".join(results["documents"][0])