File size: 3,056 Bytes
5fffd14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6950cd1
 
5fffd14
 
 
 
 
b37a516
2feba09
 
264c011
2feba09
6950cd1
264c011
 
 
6950cd1
264c011
 
 
 
 
 
 
 
6950cd1
264c011
 
 
 
 
 
 
 
2feba09
 
028022d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import hashlib
import os
from typing import Any, Dict, List, Optional

import chromadb

class ChromaVectorDB:
    """Thin wrapper around a persistent ChromaDB collection of document chunks.

    Every chunk is stored with a ``source`` (the originating file path) and a
    ``chunk_id`` (its position within that file) in its metadata, which is
    what ``delete_document`` filters on.
    """

    # Class-level default so the name is available even on instances that
    # were created without running ``__init__``.
    collection_name: str = "documents"

    def __init__(self, db_path: str = "./data/chroma_db", collection_name: str = "documents"):
        """Open (or create) the on-disk database and its collection.

        Args:
            db_path: Directory used for persistence; created if missing.
            collection_name: Name of the collection to open or create.
        """
        os.makedirs(db_path, exist_ok=True)
        self.collection_name = collection_name
        self.client = chromadb.PersistentClient(path=db_path)
        self.collection = self.client.get_or_create_collection(collection_name)

    def add_document(
        self,
        file_path: str,
        text_chunks: List[str],
        metadata: Optional[Dict[str, Any]] = None,
    ) -> List[str]:
        """Add the chunks of one document to the collection.

        Chunk IDs are the MD5 hex digest of ``"{file_path}_{index}"``, so they
        are deterministic for a given path and chunk position.
        NOTE(review): re-adding the same path reuses the same IDs; confirm how
        the collection treats duplicate IDs, and call ``delete_document``
        first when the intent is to replace a document.

        Args:
            file_path: Path recorded as the ``source`` of every chunk.
            text_chunks: Chunk texts, in document order.
            metadata: Optional extra key/values merged into every chunk's
                metadata. Keys given here take precedence over the built-in
                ``source`` / ``chunk_id`` entries.

        Returns:
            The generated chunk IDs, in the same order as ``text_chunks``
            (an empty list when there are no chunks).
        """
        # Nothing to store: skip the call entirely rather than sending an
        # empty batch to the collection.
        if not text_chunks:
            return []

        extra = metadata or {}
        ids = [
            hashlib.md5(f"{file_path}_{index}".encode()).hexdigest()
            for index in range(len(text_chunks))
        ]
        # A fresh dict per chunk so entries never share state.
        metadatas = [
            {"source": file_path, "chunk_id": index, **extra}
            for index in range(len(text_chunks))
        ]

        self.collection.add(
            documents=text_chunks,
            metadatas=metadatas,
            ids=ids,
        )
        return ids

    def search(self, query: str, n_results: int = 5):
        """Query the collection for the chunks most relevant to ``query``.

        Args:
            query: Free-text query string.
            n_results: Maximum number of results requested.

        Returns:
            Whatever ``collection.query`` returns, unmodified.
        """
        return self.collection.query(
            query_texts=[query],
            n_results=n_results,
        )

    def delete_document(self, file_path: str) -> None:
        """Delete every chunk whose ``source`` metadata equals ``file_path``.

        Does nothing when no chunk matches.
        """
        # Only the IDs are needed, so ask for no extra payload fields.
        matches = self.collection.get(
            where={"source": file_path},
            include=[],
        )

        if matches and matches["ids"]:
            self.collection.delete(ids=matches["ids"])

    def reset_collection(self) -> bool:
        """Remove all entries from the collection.

        First tries to delete every stored ID. If that fails, falls back to
        dropping and recreating the collection.

        Returns:
            True if the collection ended up empty by either route, False if
            both the deletion and the recreation failed.
        """
        # Stage 1: delete all entries by ID.
        try:
            all_ids = self.collection.get(include=[])["ids"]
            if all_ids:
                self.collection.delete(ids=all_ids)
                print(f"Deleted {len(all_ids)} documents from collection")
            else:
                print("Collection is already empty")
            return True
        except Exception as e:  # best-effort: any failure triggers the fallback
            print(f"Error getting or deleting documents: {str(e)}")

        # Stage 2 (fallback): drop and recreate the whole collection.
        try:
            self.client.delete_collection(self.collection_name)
            self.collection = self.client.get_or_create_collection(self.collection_name)
            print("Collection recreated successfully")
            return True
        except Exception as e:
            print(f"Error recreating collection: {str(e)}")
            return False