import uuid

import chromadb
from chromadb.utils import embedding_functions
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.docstore.document import Document

class VectorStore:
    def __init__(self):
        # Initialize embedding function
        self.embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
        
        # Initialize ChromaDB client
        self.client = chromadb.PersistentClient(path="./chroma_db")
        
        # Create or get collection
        self.collection = self.client.get_or_create_collection(
            name="research_documents",
            embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
                model_name="all-MiniLM-L6-v2"
            )
        )
        
        # Initialize LangChain vector store, reusing the same persistent client
        # so only one ChromaDB client handles the ./chroma_db directory
        self.vector_store = Chroma(
            client=self.client,
            collection_name="research_documents",
            embedding_function=self.embedding_function,
        )
        
        # Initialize text splitter
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
        )
    
    def add_documents(self, documents):
        """Add documents to the vector store"""
        try:
            # Split documents into chunks
            split_docs = []
            for doc in documents:
                splits = self.text_splitter.split_text(doc.page_content)
                for i, split in enumerate(splits):
                    split_docs.append(Document(
                        page_content=split,
                        metadata={**doc.metadata, "chunk": i}
                    ))
            
            # Add to vector store
            ids = [str(uuid.uuid4()) for _ in split_docs]
            self.vector_store.add_documents(split_docs, ids=ids)
            
            return {"status": "success", "count": len(split_docs)}
        except Exception as e:
            return {"status": "error", "message": str(e)}
    
    def search(self, query, k=5):
        """Search for relevant documents"""
        try:
            # Perform similarity search
            docs = self.vector_store.similarity_search(query, k=k)
            return {"status": "success", "documents": docs}
        except Exception as e:
            return {"status": "error", "message": str(e)}
    
    def delete_collection(self):
        """Delete the entire collection and recreate an empty one"""
        try:
            self.client.delete_collection("research_documents")
            self.collection = self.client.get_or_create_collection(
                name="research_documents",
                embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
                    model_name="all-MiniLM-L6-v2"
                )
            )
            # Re-bind the LangChain wrapper so it points at the freshly
            # created collection rather than the deleted one
            self.vector_store = Chroma(
                client=self.client,
                collection_name="research_documents",
                embedding_function=self.embedding_function,
            )
            return {"status": "success"}
        except Exception as e:
            return {"status": "error", "message": str(e)}
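

# --- Example usage (a minimal sketch, not part of the original module) ---
# The sample text, metadata, and query below are illustrative assumptions;
# they only show how the VectorStore API defined above is meant to be called.
if __name__ == "__main__":
    store = VectorStore()

    # Wrap raw text in LangChain Document objects before indexing
    docs = [
        Document(
            page_content="ChromaDB stores embeddings for similarity search.",
            metadata={"source": "example.txt"},
        )
    ]
    print(store.add_documents(docs))

    # Retrieve the top-3 chunks most similar to the query
    result = store.search("How are embeddings stored?", k=3)
    if result["status"] == "success":
        for doc in result["documents"]:
            print(doc.metadata, doc.page_content[:80])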