import os
from typing import List, Union

from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings


class PrepareVectorDB:
    """Prepare and persist a Chroma VectorDB from PDF documents using OpenAI embeddings.

    Loads PDFs (either from a directory on disk or from an explicit list of
    file paths), splits them into overlapping chunks, embeds the chunks with
    OpenAI, and saves the resulting Chroma collection to ``persist_directory``.
    """

    def __init__(
        self,
        data_directory: Union[str, List[str]],
        persist_directory: str,
        embedding_model_engine: str,
        chunk_size: int,
        chunk_overlap: int,
    ) -> None:
        """Initialize the PrepareVectorDB instance.

        Args:
            data_directory: Either a directory path containing PDF files, or a
                list of individual PDF file paths (uploaded documents).
            persist_directory: Directory where the Chroma DB is saved.
            embedding_model_engine: Name of the embedding model engine.
                NOTE(review): this value is stored but never passed to
                ``OpenAIEmbeddings`` below, which therefore uses its library
                default model — confirm whether ``OpenAIEmbeddings(model=...)``
                was intended.
            chunk_size: Maximum characters per chunk.
            chunk_overlap: Characters of overlap between adjacent chunks.
        """
        self.embedding_model_engine = embedding_model_engine
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            # Prefer splitting on paragraph, then line, then word boundaries.
            separators=["\n\n", "\n", " ", ""],
        )
        self.data_directory = data_directory
        self.persist_directory = persist_directory
        self.embedding = OpenAIEmbeddings()

    def __load_all_documents(self) -> List:
        """Load all PDF documents from the configured location.

        Returns:
            A list of langchain ``Document`` pages from every loaded PDF.
        """
        doc_counter = 0
        docs = []
        if isinstance(self.data_directory, list):
            # Uploaded-files mode: each entry is already a full file path.
            print("Loading the uploaded documents...")
            for doc_dir in self.data_directory:
                docs.extend(PyPDFLoader(doc_dir).load())
                doc_counter += 1
        else:
            # Directory mode: load every PDF found in the directory.
            print("Loading documents manually...")
            if not os.path.exists(self.data_directory):
                os.makedirs(self.data_directory)  # Ensure the directory exists
                print(f"Created missing directory: {self.data_directory}")
            document_list = os.listdir(self.data_directory)
            for doc_name in document_list:
                # Skip non-PDF entries (e.g. .DS_Store, subdirectories) that
                # would otherwise crash PyPDFLoader.
                if not doc_name.lower().endswith(".pdf"):
                    continue
                docs.extend(
                    PyPDFLoader(os.path.join(self.data_directory, doc_name)).load()
                )
                doc_counter += 1
        print("Number of loaded documents:", doc_counter)
        print("Number of pages:", len(docs), "\n\n")
        return docs

    def __chunk_documents(self, docs: List) -> List:
        """Chunk the loaded documents using the configured text splitter.

        Args:
            docs: Documents returned by ``__load_all_documents``.

        Returns:
            The list of chunked documents.
        """
        print("Chunking documents...")
        chunked_documents = self.text_splitter.split_documents(docs)
        print("Number of chunks:", len(chunked_documents), "\n\n")
        return chunked_documents

    def prepare_and_save_vectordb(self):
        """Load, chunk, embed, and persist the VectorDB.

        Returns:
            The created Chroma vector store (already persisted to
            ``self.persist_directory``).
        """
        docs = self.__load_all_documents()
        chunked_documents = self.__chunk_documents(docs)
        print("Preparing vectordb...")
        vectordb = Chroma.from_documents(
            documents=chunked_documents,
            embedding=self.embedding,
            persist_directory=self.persist_directory,
        )
        print("VectorDB is created and saved.")
        # Uses Chroma's private _collection handle to report the vector count,
        # mirroring the original implementation.
        print("Number of vectors in vectordb:", vectordb._collection.count(), "\n\n")
        return vectordb