# yuvrajsingh6
# fix: Fix module imports and paths for HF Spaces
# 4ba4b25
import chromadb
from chromadb.config import Settings as ChromaSettings
from typing import List, Dict, Optional
from pathlib import Path
from backend.app.config import settings
class VectorStore:
    """Wrapper around a persistent ChromaDB collection of document chunks.

    Each stored record is one text chunk, keyed by
    ``"<document_id>_chunk_<chunk_index>"``, together with its embedding and
    a small metadata dict (document id, chunk index, page number, total
    chunk count).

    NOTE(review): the public methods are coroutines but call the synchronous
    Chroma client directly without awaiting anything -- confirm this is
    acceptable for the event loop they run on.
    """

    def __init__(self):
        """Open the persistent client and get or create the collection."""
        self.client = chromadb.PersistentClient(
            path=str(settings.VECTOR_DB_PATH),
            settings=ChromaSettings(anonymized_telemetry=False),
        )
        # The collection is configured for cosine space; ``search`` relies on
        # that when it turns a distance into ``1 - distance``.
        self.collection = self.client.get_or_create_collection(
            name="documents", metadata={"hnsw:space": "cosine"}
        )

    async def add_chunks(self, chunks: List[dict], embeddings: List[List[float]]):
        """Add text chunks and their embeddings to the collection.

        Args:
            chunks: Dicts with a ``"text"`` entry and a ``"metadata"`` dict
                containing ``document_id``, ``chunk_index``, ``total_chunks``
                and, optionally, ``page_number``.
            embeddings: One vector per chunk, in the same order as ``chunks``.

        Raises:
            ValueError: If ``chunks`` and ``embeddings`` differ in length.
        """
        if len(chunks) != len(embeddings):
            raise ValueError(
                f"Got {len(chunks)} chunks but {len(embeddings)} embeddings"
            )
        # Nothing to store; skip the call rather than hand the collection
        # empty lists.
        if not chunks:
            return

        ids = []
        documents = []
        metadatas = []
        for chunk in chunks:
            meta = chunk["metadata"]
            ids.append(f"{meta['document_id']}_chunk_{meta['chunk_index']}")
            documents.append(chunk["text"])
            metadatas.append(
                {
                    "document_id": meta["document_id"],
                    "chunk_index": meta["chunk_index"],
                    # A missing or None page number is stored as 0.
                    "page_number": meta.get("page_number") or 0,
                    "total_chunks": meta["total_chunks"],
                }
            )

        self.collection.add(
            ids=ids, embeddings=embeddings, documents=documents, metadatas=metadatas
        )

    async def search(
        self,
        query_embedding: List[float],
        top_k: int = 5,
        document_ids: Optional[List[str]] = None,
    ) -> List[Dict]:
        """Return the stored chunks nearest to ``query_embedding``.

        Args:
            query_embedding: The query vector.
            top_k: Maximum number of results requested from the collection.
            document_ids: If non-empty, restrict matches to these document
                ids. ``None`` or an empty list applies no filter.

        Returns:
            A list of dicts with keys ``"text"``, ``"metadata"``,
            ``"similarity"`` (``1 - distance``) and ``"id"``, in the order
            returned by the collection.
        """
        where_filter = None
        if document_ids:
            where_filter = {"document_id": {"$in": document_ids}}

        results = self.collection.query(
            query_embeddings=[query_embedding], n_results=top_k, where=where_filter
        )

        # Results are grouped per query embedding; exactly one was sent, so
        # only the first group is relevant.
        if not results["ids"]:
            return []

        return [
            {
                "text": text,
                "metadata": metadata,
                "similarity": 1 - distance,
                "id": chunk_id,
            }
            for chunk_id, text, metadata, distance in zip(
                results["ids"][0],
                results["documents"][0],
                results["metadatas"][0],
                results["distances"][0],
            )
        ]

    async def delete_document(self, document_id: str):
        """Delete every chunk whose metadata ``document_id`` matches."""
        self.collection.delete(where={"document_id": document_id})

    async def get_stats(self) -> Dict:
        """Return the collection's record count and name."""
        count = self.collection.count()
        return {"total_chunks": count, "collection_name": self.collection.name}
# Shared module-level instance. Constructing it here runs VectorStore.__init__
# at import time, which opens the persistent Chroma client and gets or creates
# the "documents" collection.
vector_store = VectorStore()