|
|
import hashlib
import os
from typing import Any, Dict, List, Optional

import chromadb
|
|
|
|
|
class ChromaVectorDB:
    """Wrapper around a persistent ChromaDB collection of document chunks.

    Each chunk is stored with metadata recording its ``source`` (the
    ``file_path`` given to :meth:`add_document`) and its ``chunk_id`` (the
    chunk's index within that document). :meth:`delete_document` relies on
    the ``source`` key to find a document's chunks again.
    """

    # Name of the single collection this wrapper reads from and writes to.
    _COLLECTION_NAME = "documents"

    def __init__(self, db_path: str = "./data/chroma_db"):
        """Initialize ChromaDB for vector storage.

        Args:
            db_path: Directory of the persistent database. It is created
                (including parents) when it does not exist yet.
        """
        os.makedirs(db_path, exist_ok=True)
        self.client = chromadb.PersistentClient(path=db_path)
        self.collection = self.client.get_or_create_collection(
            self._COLLECTION_NAME
        )

    def add_document(
        self,
        file_path: str,
        text_chunks: List[str],
        metadata: Optional[Dict[str, Any]] = None,
    ) -> List[str]:
        """Add document chunks to the vector database.

        Args:
            file_path: Identifier of the source document. It seeds the chunk
                ids and is stored as the ``source`` metadata value.
            text_chunks: Chunk texts, in document order.
            metadata: Optional extra key/value pairs copied onto every chunk.
                The reserved keys ``source`` and ``chunk_id`` always take
                precedence over entries of the same name supplied here, so
                that :meth:`delete_document` can still locate the chunks.

        Returns:
            The ids assigned to the chunks, in the same order as
            ``text_chunks``; an empty list when there is nothing to add.
        """
        if not text_chunks:
            # Nothing to store: skip the call instead of handing the
            # collection an empty batch.
            return []

        chunk_count = len(text_chunks)

        # Ids are the MD5 hex digest of "<file_path>_<index>", which makes
        # them deterministic for a given path and chunk position.
        ids = [
            hashlib.md5(f"{file_path}_{index}".encode()).hexdigest()
            for index in range(chunk_count)
        ]

        extra = metadata or {}
        # Reserved keys are written last so caller metadata cannot clobber
        # the "source" value that delete_document() filters on.
        metadatas = [
            {**extra, "source": file_path, "chunk_id": index}
            for index in range(chunk_count)
        ]

        self.collection.add(
            documents=text_chunks,
            metadatas=metadatas,
            ids=ids,
        )
        return ids

    def search(self, query: str, n_results: int = 5):
        """Search for relevant document chunks.

        Args:
            query: Query text, passed to the collection as a single-element
                ``query_texts`` list.
            n_results: Maximum number of results requested.

        Returns:
            The result object of ``collection.query``, unchanged.
        """
        return self.collection.query(
            query_texts=[query],
            n_results=n_results,
        )

    def delete_document(self, file_path: str) -> None:
        """Delete all chunks that belong to a specific document.

        Chunks are matched on their ``source`` metadata value. Nothing is
        deleted when no chunk matches.

        Args:
            file_path: The ``file_path`` the chunks were added under.
        """
        results = self.collection.get(where={"source": file_path})
        if results and results["ids"]:
            self.collection.delete(ids=results["ids"])

    def reset_collection(self) -> bool:
        """Reset the collection by clearing all documents.

        First tries to delete every stored id. If that fails for any reason,
        falls back to dropping and recreating the collection.

        Returns:
            ``True`` when the collection ended up empty through either
            strategy, ``False`` when the fallback failed as well.
        """
        try:
            # Only the ids are needed, so ask the collection not to load
            # documents or metadatas.
            all_ids = self.collection.get(include=[])["ids"]
            if all_ids:
                self.collection.delete(ids=all_ids)
                print(f"Deleted {len(all_ids)} documents from collection")
            else:
                print("Collection is already empty")
            return True
        except Exception as e:
            # Best effort: report the failure and fall through to the
            # heavier drop-and-recreate strategy below.
            print(f"Error getting or deleting documents: {str(e)}")

        try:
            self.client.delete_collection(self._COLLECTION_NAME)
            self.collection = self.client.get_or_create_collection(
                self._COLLECTION_NAME
            )
            print("Collection recreated successfully")
            return True
        except Exception as e2:
            print(f"Error recreating collection: {str(e2)}")
            return False