File size: 3,465 Bytes
555c75a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import chromadb
from chromadb.config import Settings
import os
from typing import List, Dict, Optional

class VectorStore:
    def __init__(self, persist_dir: str = "./chroma_db", embedding_function=None):
        self.persist_dir = persist_dir
        os.makedirs(persist_dir, exist_ok=True)
        
        # Initialize ChromaDB persistent client
        self.client = chromadb.PersistentClient(
            path=persist_dir,
            settings=Settings(
                anonymized_telemetry=False,
                allow_reset=True
            )
        )
        
        self.embedding_function = embedding_function
        self.collection = None
    
    def get_or_create_collection(self, collection_name: str = "pdf_documents"):
        """Get or create ChromaDB collection"""
        try:
            # Try to get existing collection
            self.collection = self.client.get_collection(
                name=collection_name,
                embedding_function=self.embedding_function
            )
            print(f"✓ Loaded existing collection: {collection_name}")
        except:
            # Create new collection
            self.collection = self.client.create_collection(
                name=collection_name,
                embedding_function=self.embedding_function,
                metadata={"hnsw:space": "cosine"}
            )
            print(f"✓ Created new collection: {collection_name}")
        
        return self.collection
    
    def add_documents(self, documents: List[str], metadatas: List[Dict], ids: Optional[List[str]] = None):
        """Add documents to vector store"""
        if not self.collection:
            self.get_or_create_collection()
        
        if ids is None:
            ids = [f"doc_{i}" for i in range(len(documents))]
        
        # Get existing IDs to avoid duplicates
        try:
            existing_ids = self.collection.get()["ids"]
        except:
            existing_ids = []
        
        # Filter out documents that already exist
        docs_to_add = []
        meta_to_add = []
        ids_to_add = []
        
        for doc, meta, doc_id in zip(documents, metadatas, ids):
            if doc_id not in existing_ids:
                docs_to_add.append(doc)
                meta_to_add.append(meta)
                ids_to_add.append(doc_id)
        
        if docs_to_add:
            self.collection.add(
                documents=docs_to_add,
                metadatas=meta_to_add,
                ids=ids_to_add
            )
            print(f"✓ Added {len(docs_to_add)} new documents to vector store")
        else:
            print("✓ All documents already in vector store")
    
    def search(self, query: str, n_results: int = 5) -> Dict:
        """Search documents in vector store"""
        if not self.collection:
            return {"documents": [], "metadatas": [], "distances": []}
        
        results = self.collection.query(
            query_texts=[query],
            n_results=n_results
        )
        
        return results
    
    def get_collection_info(self) -> Dict:
        """Get collection statistics"""
        if not self.collection:
            return {}
        
        count = self.collection.count()
        return {
            "collection_name": self.collection.name,
            "document_count": count
        }