File size: 3,255 Bytes
f8bf7df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
from typing import List, Union

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings  # ✅ Fixed import


class PrepareVectorDB:
    """
    Prepare and save a Chroma VectorDB from PDF documents using OpenAI embeddings.

    The pipeline is: load PDFs with ``PyPDFLoader``, split the loaded
    documents with a ``RecursiveCharacterTextSplitter``, then build a Chroma
    store from the chunks, passing ``persist_directory`` through to Chroma.
    """

    def __init__(
            self,
            data_directory: Union[str, List[str]],
            persist_directory: str,
            embedding_model_engine: str,
            chunk_size: int,
            chunk_overlap: int
    ) -> None:
        """
        Initialize the PrepareVectorDB instance.

        Args:
            data_directory: Either a directory path whose PDF files are
                loaded, or a list of individual PDF file paths.
            persist_directory: Directory handed to Chroma as its
                ``persist_directory``.
            embedding_model_engine: Name of the embedding model. It is stored
                on the instance (see the review note below).
            chunk_size: ``chunk_size`` given to the text splitter.
            chunk_overlap: ``chunk_overlap`` given to the text splitter.
        """
        self.embedding_model_engine = embedding_model_engine
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", " ", ""]
        )
        self.data_directory = data_directory
        self.persist_directory = persist_directory
        # NOTE(review): `embedding_model_engine` is stored but not passed to
        # OpenAIEmbeddings, so the library's default model is used. Left
        # unchanged because the query side must embed with the same model;
        # confirm before wiring the parameter through.
        self.embedding = OpenAIEmbeddings()

    @staticmethod
    def _list_pdf_paths(directory: str) -> List[str]:
        """
        Return the sorted paths of the PDF files directly inside ``directory``.

        The directory is created when it does not exist yet. Entries that are
        not regular files, or whose name does not end in ``.pdf``
        (case-insensitive), are skipped so that hidden files, sub-directories
        and other formats are never handed to the PDF loader.

        Args:
            directory: Path of the directory to scan.

        Returns:
            A list of file paths (``directory`` joined with each file name).
        """
        if not os.path.exists(directory):
            # exist_ok guards against another process creating it in between.
            os.makedirs(directory, exist_ok=True)
            print(f"Created missing directory: {directory}")

        pdf_paths = []
        # Sorted so the load order does not depend on the filesystem.
        for entry in sorted(os.listdir(directory)):
            path = os.path.join(directory, entry)
            if os.path.isfile(path) and entry.lower().endswith(".pdf"):
                pdf_paths.append(path)
            else:
                print(f"Skipping non-PDF entry: {path}")
        return pdf_paths

    def __load_all_documents(self) -> List:
        """
        Load all documents from the specified directory or list of files.

        When ``data_directory`` is a list (or tuple) each item is treated as
        the path of one PDF file; otherwise it is treated as a directory and
        the PDF files it contains are loaded.

        Returns:
            The concatenated documents returned by ``PyPDFLoader.load()`` for
            every loaded file.
        """
        docs = []

        if isinstance(self.data_directory, (list, tuple)):
            print("Loading the uploaded documents...")
            pdf_paths = list(self.data_directory)
        else:
            print("Loading documents manually...")
            pdf_paths = self._list_pdf_paths(self.data_directory)

        doc_counter = 0
        for pdf_path in pdf_paths:
            docs.extend(PyPDFLoader(pdf_path).load())
            doc_counter += 1

        print("Number of loaded documents:", doc_counter)
        print("Number of pages:", len(docs), "\n\n")
        return docs

    def __chunk_documents(self, docs: List) -> List:
        """
        Chunk the loaded documents using the configured text splitter.

        Args:
            docs: Documents as returned by the loading step.

        Returns:
            The list produced by ``self.text_splitter.split_documents(docs)``.
        """
        print("Chunking documents...")
        chunked_documents = self.text_splitter.split_documents(docs)
        print("Number of chunks:", len(chunked_documents), "\n\n")
        return chunked_documents

    def prepare_and_save_vectordb(self) -> "Chroma":
        """
        Load, chunk, and create a VectorDB with OpenAI embeddings, and save it.

        Returns:
            The Chroma instance returned by ``Chroma.from_documents``.
        """
        docs = self.__load_all_documents()
        chunked_documents = self.__chunk_documents(docs)
        print("Preparing vectordb...")
        vectordb = Chroma.from_documents(
            documents=chunked_documents,
            embedding=self.embedding,
            persist_directory=self.persist_directory
        )
        print("VectorDB is created and saved.")
        # `_collection` is a private attribute of the Chroma wrapper; it is
        # used here only to report the number of stored vectors.
        print("Number of vectors in vectordb:", vectordb._collection.count(), "\n\n")
        return vectordb