File size: 3,056 Bytes
5fffd14 6950cd1 5fffd14 b37a516 2feba09 264c011 2feba09 6950cd1 264c011 6950cd1 264c011 6950cd1 264c011 2feba09 028022d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
import hashlib
import os
from typing import Any, Dict, List, Optional

import chromadb
class ChromaVectorDB:
    """Thin wrapper around one persistent ChromaDB collection of text chunks.

    Every chunk is stored in a single collection (``COLLECTION_NAME``) and is
    tagged with ``source`` (the file path it came from) and ``chunk_id`` (its
    position within that file). ``delete_document`` relies on the ``source``
    tag to find a document's chunks again.
    """

    # Name of the single collection this wrapper reads from and writes to.
    COLLECTION_NAME = "documents"

    def __init__(self, db_path: str = "./data/chroma_db"):
        """Open (creating if needed) the on-disk database at *db_path*.

        Args:
            db_path: Directory holding the persistent Chroma data. It is
                created, including parents, when it does not exist yet.
        """
        os.makedirs(db_path, exist_ok=True)
        self.client = chromadb.PersistentClient(path=db_path)
        self.collection = self.client.get_or_create_collection(self.COLLECTION_NAME)

    def add_document(
        self,
        file_path: str,
        text_chunks: List[str],
        metadata: Optional[Dict[str, Any]] = None,
    ) -> List[str]:
        """Add the chunks of one document to the collection.

        Args:
            file_path: Path identifying the document; stored as the ``source``
                metadata of every chunk and used to derive the chunk IDs.
            text_chunks: Chunk texts, in document order.
            metadata: Optional extra metadata merged into every chunk's
                metadata. Its keys take precedence over the generated
                ``source`` / ``chunk_id`` entries.

        Returns:
            The chunk IDs, in the same order as *text_chunks*. An empty list
            when there is nothing to add.
        """
        # Chroma rejects empty batches, so there is nothing useful to send.
        if not text_chunks:
            return []

        extra = metadata or {}
        positions = range(len(text_chunks))

        # IDs are a deterministic hash of (path, position): adding the same
        # file again yields the same IDs rather than new ones.
        ids = [
            hashlib.md5(f"{file_path}_{i}".encode()).hexdigest() for i in positions
        ]
        # NOTE: caller-supplied keys win over "source"/"chunk_id"; overriding
        # "source" means delete_document(file_path) will not find the chunk.
        metadatas = [
            {"source": file_path, "chunk_id": i, **extra} for i in positions
        ]

        self.collection.add(
            documents=text_chunks,
            metadatas=metadatas,
            ids=ids,
        )
        return ids

    def search(self, query: str, n_results: int = 5) -> Dict[str, Any]:
        """Return the collection's query result for a single text *query*.

        Args:
            query: Free-text query; sent as a one-element ``query_texts`` list.
            n_results: Maximum number of chunks to ask for.

        Returns:
            Whatever ``collection.query`` returns, unmodified.
        """
        return self.collection.query(
            query_texts=[query],
            n_results=n_results,
        )

    def delete_document(self, file_path: str) -> None:
        """Delete every chunk whose ``source`` metadata equals *file_path*.

        Does nothing when no chunk matches.
        """
        # include=[] asks for IDs only; the texts and metadata are not needed.
        matches = self.collection.get(
            where={"source": file_path},
            include=[],
        )
        if matches and matches["ids"]:
            self.collection.delete(ids=matches["ids"])

    def reset_collection(self) -> bool:
        """Remove all chunks from the collection.

        First tries to delete every stored ID. If that fails for any reason,
        falls back to dropping and recreating the collection.

        Returns:
            ``True`` when the collection was emptied or recreated, ``False``
            when both attempts failed. Progress and errors are printed.
        """
        # Broad catches are deliberate: this is a best-effort maintenance
        # operation that reports failure through its return value.
        try:
            # include=[] asks for IDs only; nothing else is used here.
            all_ids = self.collection.get(include=[])["ids"]
            if all_ids:
                self.collection.delete(ids=all_ids)
                print(f"Deleted {len(all_ids)} documents from collection")
            else:
                print("Collection is already empty")
            return True
        except Exception as e:
            print(f"Error getting or deleting documents: {str(e)}")

        # Fallback: drop the whole collection and start from a fresh one.
        try:
            self.client.delete_collection(self.COLLECTION_NAME)
            self.collection = self.client.get_or_create_collection(
                self.COLLECTION_NAME
            )
            print("Collection recreated successfully")
            return True
        except Exception as e2:
            print(f"Error recreating collection: {str(e2)}")
            return False