# Spaces: Sleeping
| from chromadb import PersistentClient | |
| from dataset_loader import load_all_json | |
| from embedding_utils import get_embedding | |
# On-disk Chroma client shared by the functions in this module; its data is
# kept under the relative "chroma_db" directory.
client = PersistentClient(path="chroma_db")

# Handle to the "museum_data" collection. Stays None until
# init_vector_store() has been called.
collection = None
def _add_batch(target, frame, start):
    """Embed the rows of *frame* and add them to the collection *target*.

    Each row's ID is its position in the full dataset as a string, counted
    from *start*. Keeping the work in a helper means the batch's documents
    and embeddings are released as soon as the call returns.
    """
    documents = frame["text"].tolist()
    target.add(
        ids=[str(start + offset) for offset in range(len(documents))],
        documents=documents,
        embeddings=[get_embedding(text) for text in documents],
        metadatas=[{"title": title} for title in frame["title"].tolist()],
    )


def init_vector_store(batch_size: int = 10) -> None:
    """Open the "museum_data" collection and fill it if it is empty.

    The collection is obtained from the module-level ``client`` and stored in
    the module-level ``collection`` variable. When it holds no documents, the
    dataset returned by ``load_all_json()`` is embedded with
    ``get_embedding`` and added in batches; otherwise nothing is loaded.

    Args:
        batch_size: Number of rows embedded and added per ``collection.add``
            call. Small batches keep memory usage low.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    global collection

    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    collection = client.get_or_create_collection("museum_data")

    # NOTE(review): "non-empty" is treated as "fully initialised". If a
    # previous run was interrupted part-way, the remaining rows are never
    # added by later calls.
    existing = collection.count()
    if existing:
        print(f"Vector store already exists with {existing} documents")
        return

    print("Initializing vector store with data...")
    # Presumably a pandas DataFrame with at least a "text" column --
    # confirm against dataset_loader.
    df = load_all_json()

    # Fall back to the first 50 characters of the text when no title exists.
    if "title" not in df.columns:
        df["title"] = df["text"].str[:50]

    total = len(df)
    for start in range(0, total, batch_size):
        # .iloc makes the positional slicing explicit; IDs are positions too.
        _add_batch(collection, df.iloc[start:start + batch_size], start)

    print(f"Vector store initialized with {collection.count()} documents")
def query_vector_store(query_text: str, n_results: int = 5) -> str:
    """Return the stored documents closest to *query_text*, newline-joined.

    The query is embedded with ``get_embedding`` -- the same function used
    when the documents were added -- so query and stored vectors live in the
    same embedding space. Passing ``query_texts`` instead would make the
    collection embed the query with its own embedding function, which is not
    the one the stored vectors came from.

    Args:
        query_text: Free-text query to search for.
        n_results: Maximum number of documents to return.

    Returns:
        The matching documents' texts joined with newlines, in the order
        returned by the collection.
    """
    # Initialise lazily so a query issued before init_vector_store() does not
    # fail with an AttributeError on None.
    if collection is None:
        init_vector_store()

    results = collection.query(
        query_embeddings=[get_embedding(query_text)],
        n_results=n_results,
    )
    # One query was sent, so the hits are in the first (only) result list.
    return "\n".join(results["documents"][0])