"""Vector store setup and querying backed by a persistent ChromaDB collection."""
from chromadb import PersistentClient
from dataset_loader import load_all_json
from embedding_utils import get_embedding
# Persistent on-disk ChromaDB client: collections survive process restarts.
client = PersistentClient(path="chroma_db")
# Module-level handle set by init_vector_store(); None until then.
collection = None
def init_vector_store(batch_size=10):
    """Create (or reuse) the "museum_data" Chroma collection and populate it.

    Loads every document via load_all_json() only when the collection is
    empty, embedding and inserting rows in batches to bound peak memory.
    Safe to call repeatedly: an already-populated collection is left as-is.

    Args:
        batch_size: number of rows embedded and added per collection.add
            call. Default 10 preserves the original behavior.
    """
    global collection
    collection = client.get_or_create_collection("museum_data")
    # Skip the expensive embedding pass when data is already present.
    if collection.count() == 0:
        print("Initializing vector store with data...")
        df = load_all_json()
        # Fall back to a text prefix when the 'title' column is missing.
        if "title" not in df.columns:
            df["title"] = df["text"].str[:50]  # use first 50 chars of text
        total = len(df)
        for start in range(0, total, batch_size):
            # .iloc makes the positional (not label-based) row slice explicit.
            batch = df.iloc[start:start + batch_size]
            documents = batch["text"].tolist()
            collection.add(
                # Stable string ids derived from the row's overall position.
                ids=[str(j) for j in range(start, min(start + batch_size, total))],
                documents=documents,
                embeddings=[get_embedding(text) for text in documents],
                metadatas=[{"title": title} for title in batch["title"].tolist()],
            )
            # Drop per-batch references so embeddings can be reclaimed promptly.
            del batch, documents
        print(f"Vector store initialized with {collection.count()} documents")
    else:
        print(f"Vector store already exists with {collection.count()} documents")
def query_vector_store(query_text, n_results=5):
    """Return the best-matching stored documents for *query_text*.

    Args:
        query_text: natural-language query to search the collection with.
        n_results: maximum number of documents to retrieve (default 5,
            preserving the original behavior).

    Returns:
        The matched document texts joined by newlines.

    Raises:
        RuntimeError: if init_vector_store() has not been called yet.
    """
    # Fail with a clear message instead of an opaque AttributeError on None.
    if collection is None:
        raise RuntimeError(
            "Vector store not initialized; call init_vector_store() first"
        )
    results = collection.query(
        query_texts=[query_text],
        n_results=n_results,
    )
    # query() returns one result list per input query; we sent exactly one.
    return "\n".join(results["documents"][0])