"""Vector store setup and querying backed by a persistent ChromaDB collection."""
from chromadb import PersistentClient
from dataset_loader import load_all_json
from embedding_utils import get_embedding
# Persistent on-disk ChromaDB client: collections survive process restarts.
client = PersistentClient(path="chroma_db")
# Module-level handle set by init_vector_store(); None until then.
collection = None
def init_vector_store(batch_size=10):
    """Create (or reuse) the "museum_data" Chroma collection and populate it.

    Loads every document via load_all_json() only when the collection is
    empty, embedding and inserting rows in batches to bound peak memory.
    Safe to call repeatedly: an already-populated collection is left as-is.

    Args:
        batch_size: number of rows embedded and added per collection.add
            call. Default 10 preserves the original behavior.
    """
    global collection
    collection = client.get_or_create_collection("museum_data")
    # Skip the expensive embedding pass when data is already present.
    if collection.count() == 0:
        print("Initializing vector store with data...")
        df = load_all_json()
        # Fall back to a text prefix when the 'title' column is missing.
        if "title" not in df.columns:
            df["title"] = df["text"].str[:50]  # use first 50 chars of text
        total = len(df)
        for start in range(0, total, batch_size):
            # .iloc makes the positional (not label-based) row slice explicit.
            batch = df.iloc[start:start + batch_size]
            documents = batch["text"].tolist()
            collection.add(
                # Stable string ids derived from the row's overall position.
                ids=[str(j) for j in range(start, min(start + batch_size, total))],
                documents=documents,
                embeddings=[get_embedding(text) for text in documents],
                metadatas=[{"title": title} for title in batch["title"].tolist()],
            )
            # Drop per-batch references so embeddings can be reclaimed promptly.
            del batch, documents
        print(f"Vector store initialized with {collection.count()} documents")
    else:
        print(f"Vector store already exists with {collection.count()} documents")
def query_vector_store(query_text, n_results=5):
    """Return the best-matching stored documents for *query_text*.

    Args:
        query_text: natural-language query to search the collection with.
        n_results: maximum number of documents to retrieve (default 5,
            preserving the original behavior).

    Returns:
        The matched document texts joined by newlines.

    Raises:
        RuntimeError: if init_vector_store() has not been called yet.
    """
    # Fail with a clear message instead of an opaque AttributeError on None.
    if collection is None:
        raise RuntimeError(
            "Vector store not initialized; call init_vector_store() first"
        )
    results = collection.query(
        query_texts=[query_text],
        n_results=n_results,
    )
    # query() returns one result list per input query; we sent exactly one.
    return "\n".join(results["documents"][0])