File size: 3,465 Bytes
555c75a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
import chromadb
from chromadb.config import Settings
import os
from typing import List, Dict, Optional
class VectorStore:
    """Wrapper around a persistent ChromaDB client and a single collection.

    The instance owns a ``chromadb.PersistentClient`` rooted at
    ``persist_dir`` and, once loaded, one active collection in
    ``self.collection`` that documents are added to and queried from.
    """

    def __init__(self, persist_dir: str = "./chroma_db", embedding_function=None):
        """Create the persistence directory and open the ChromaDB client.

        Args:
            persist_dir: Directory holding the on-disk database; created if
                it does not exist.
            embedding_function: Optional embedding function handed to the
                collection. When ``None`` it is not forwarded, so ChromaDB
                falls back to its own default.
        """
        self.persist_dir = persist_dir
        os.makedirs(persist_dir, exist_ok=True)
        # Initialize ChromaDB persistent client
        self.client = chromadb.PersistentClient(
            path=persist_dir,
            settings=Settings(
                anonymized_telemetry=False,
                allow_reset=True,
            ),
        )
        self.embedding_function = embedding_function
        # No collection is loaded until get_or_create_collection() runs.
        self.collection = None

    def _collection_kwargs(self, collection_name: str) -> Dict:
        """Build the keyword arguments shared by get/create collection calls."""
        kwargs = {"name": collection_name}
        # Passing embedding_function=None explicitly would replace ChromaDB's
        # default embedder with "no embedder", which makes text add/query
        # fail; only forward the argument when one was actually supplied.
        if self.embedding_function is not None:
            kwargs["embedding_function"] = self.embedding_function
        return kwargs

    def get_or_create_collection(self, collection_name: str = "pdf_documents"):
        """Load ``collection_name`` if it exists, otherwise create it.

        The result is stored in ``self.collection`` and returned. Newly
        created collections are given the ``{"hnsw:space": "cosine"}``
        metadata.

        Args:
            collection_name: Name of the collection to load or create.

        Returns:
            The loaded or newly created collection object.
        """
        kwargs = self._collection_kwargs(collection_name)
        try:
            # Try to get existing collection
            self.collection = self.client.get_collection(**kwargs)
        except Exception:
            # NOTE(review): the exception type raised for a missing
            # collection appears to vary between ChromaDB releases, so any
            # ordinary failure is treated as "not found". KeyboardInterrupt
            # and SystemExit are no longer swallowed.
            self.collection = self.client.create_collection(
                metadata={"hnsw:space": "cosine"},
                **kwargs,
            )
            print(f"✓ Created new collection: {collection_name}")
        else:
            print(f"✓ Loaded existing collection: {collection_name}")
        return self.collection

    def add_documents(
        self,
        documents: List[str],
        metadatas: List[Dict],
        ids: Optional[List[str]] = None,
    ) -> None:
        """Add documents whose IDs are not yet present in the collection.

        The default collection is loaded (or created) first if none is
        active. Entries whose ID is already stored, or which repeat an ID
        seen earlier in the same call, are skipped.

        Args:
            documents: Document texts to store.
            metadatas: One metadata dict per document.
            ids: One ID per document. When omitted, positional IDs
                ``doc_0``, ``doc_1``, ... are generated. Those positional
                IDs repeat on every call, so later batches added without
                explicit IDs are treated as already stored.

        Raises:
            ValueError: If ``documents``, ``metadatas`` and ``ids`` do not
                all have the same length.
        """
        if self.collection is None:
            self.get_or_create_collection()

        if ids is None:
            ids = [f"doc_{i}" for i in range(len(documents))]

        # zip() would silently drop the unmatched tail; fail loudly instead.
        if not (len(documents) == len(metadatas) == len(ids)):
            raise ValueError(
                "documents, metadatas and ids must have the same length "
                f"(got {len(documents)}, {len(metadatas)}, {len(ids)})"
            )

        # Get existing IDs to avoid duplicates. include=[] asks for IDs only
        # rather than every stored document and metadata payload.
        try:
            known_ids = set(self.collection.get(include=[])["ids"])
        except Exception:
            # Best effort: if the lookup fails, assume nothing is stored.
            known_ids = set()

        # Filter out documents that already exist
        docs_to_add = []
        meta_to_add = []
        ids_to_add = []
        for doc, meta, doc_id in zip(documents, metadatas, ids):
            if doc_id in known_ids:
                continue
            # Remember the ID so a repeat later in this batch is skipped too.
            known_ids.add(doc_id)
            docs_to_add.append(doc)
            meta_to_add.append(meta)
            ids_to_add.append(doc_id)

        if docs_to_add:
            self.collection.add(
                documents=docs_to_add,
                metadatas=meta_to_add,
                ids=ids_to_add,
            )
            print(f"✓ Added {len(docs_to_add)} new documents to vector store")
        else:
            print("✓ All documents already in vector store")

    def search(self, query: str, n_results: int = 5) -> Dict:
        """Query the active collection with a single text query.

        Args:
            query: Query text.
            n_results: Number of results requested from the collection.

        Returns:
            The collection's query result. When no collection has been
            loaded, a dict with empty ``documents``, ``metadatas`` and
            ``distances`` lists is returned instead.
        """
        if self.collection is None:
            return {"documents": [], "metadatas": [], "distances": []}
        return self.collection.query(
            query_texts=[query],
            n_results=n_results,
        )

    def get_collection_info(self) -> Dict:
        """Return the active collection's name and document count.

        Returns:
            ``{"collection_name": ..., "document_count": ...}``, or an empty
            dict when no collection has been loaded.
        """
        if self.collection is None:
            return {}
        count = self.collection.count()
        return {
            "collection_name": self.collection.name,
            "document_count": count,
        }