import os

# for loading the PDF documents
from langchain.document_loaders import PyPDFLoader

# text splitter 
from langchain.text_splitter import RecursiveCharacterTextSplitter

#embeddings 
from langchain.embeddings import SentenceTransformerEmbeddings

# Vector db imports 
from langchain.vectorstores import FAISS


def create_vectorstore(filepath, savedb=False) -> FAISS:
    """Build a FAISS vector store from the PDF at ``filepath``.

    The file is chunked by ``preprocess`` and the chunks are embedded with
    the embedding returned by ``get_embedding``.

    Args:
        filepath: Path to the PDF to index.
        savedb: When true, the index is also saved under
            ``faiss_index/<name of filepath's parent directory>``.

    Returns:
        The newly built FAISS store.

    Raises:
        Exception: Anything raised while chunking, embedding or indexing is
            printed and then re-raised unchanged.
    """
    print("debug , in create vectorstore, filepath =", filepath)
    try:
        docs = preprocess(filepath)
        embedder = get_embedding()
        print("in create vectorstore")
        store = FAISS.from_documents(documents=docs, embedding=embedder)
    except Exception as e:
        # Report the failure, then let the caller deal with it.
        print("Exception - e:", e)
        raise

    if not savedb:
        return store

    # The saved index is named after the directory that contains the PDF.
    print("saving the new FAISS index for ", filepath)
    containing_dir = os.path.dirname(filepath)
    parent_dir_name = os.path.basename(containing_dir)
    print("pareant_dir_name", parent_dir_name)
    store.save_local("faiss_index/" + parent_dir_name)
    return store

def load_vectorstore(saved_db_name, samples_dir=None,
                     pdf_name="underwriting_agreement.pdf") -> FAISS:
    """Load the FAISS index saved under ``faiss_index/<saved_db_name>``.

    If loading raises ``RuntimeError``, the index is rebuilt from
    ``<samples_dir>/<saved_db_name>/<pdf_name>`` through
    ``create_vectorstore(..., savedb=True)`` and the rebuilt store is
    returned instead.

    Args:
        saved_db_name: Name of the index sub-folder inside ``faiss_index/``.
            Leading and trailing whitespace is stripped.
        samples_dir: Directory that holds one sub-folder of source PDFs per
            index, used only for the rebuild fallback. When ``None`` the
            original developer-machine location is used so existing callers
            keep their behavior; pass an explicit directory on any other
            machine.
        pdf_name: File name of the PDF inside
            ``<samples_dir>/<saved_db_name>`` to rebuild the index from.

    Returns:
        The loaded (or freshly rebuilt) FAISS store.

    Raises:
        Exception: Any error other than ``RuntimeError`` from loading, and
            any error from the rebuild, propagates to the caller.
    """
    embedding = get_embedding()
    saved_db_name = saved_db_name.strip()
    if samples_dir is None:
        # Backward-compatible default: the location the original code used.
        samples_dir = "C:/Users/ninad/develop/llm/huggingface/searchdocs/samples/"

    db = None
    # Load the local database
    try:
        dbpath = "faiss_index/" + saved_db_name
        db = FAISS.load_local(dbpath, embedding)
    except RuntimeError:
        # NOTE(review): presumably the underlying index reader raises
        # RuntimeError when the saved index is missing -- confirm.
        print("unable to load the db, save_db_name=", saved_db_name)
        basepath = os.path.normpath(samples_dir)
        filepath = os.path.normpath(
            os.path.join(basepath, saved_db_name, pdf_name)
        )
        print("in load_vectorstoe, file_path =", filepath)
        db = create_vectorstore(filepath, savedb=True)
    finally:
        print("in finally clause, returning db")
    print("debug - db is", db)
    return db
  

def get_embedding():
    """Return the sentence-transformer embedding used for indexing and loading.

    Constructs ``SentenceTransformerEmbeddings`` with the
    ``all-miniLM-L6-v2`` model name.
    """
    # Alternative model that was tried here: "all-mpnet-base-v2"
    # (sentence-transformers/all-mpnet-base-v2).
    return SentenceTransformerEmbeddings(model_name="all-miniLM-L6-v2")

def get_input() -> str:
    """Return the path of the bundled F5 support-policies sample PDF.

    The path is ``samples/F5-SupportPolicies.pdf`` joined onto the current
    working directory; the file's existence is not checked.
    """
    return os.path.join(os.getcwd(), "samples/F5-SupportPolicies.pdf")

def preprocess(filpath) -> list:
    """Load a PDF and split it into chunks ready for embedding.

    Args:
        filpath: Path to the PDF file, passed to ``PyPDFLoader``.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter`` with
        ``chunk_size=500`` and ``chunk_overlap=5``.
    """
    # Read the input file.
    loaded_docs = PyPDFLoader(filpath).load()

    # Cut the loaded documents into small, slightly overlapping pieces.
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=5)
    return splitter.split_documents(loaded_docs)

    
if __name__ == "__main__":
    # Script entry point: build the FAISS index for the underwriting sample
    # PDF under the current working directory and save it to disk.
    cwd = os.getcwd()

    file_path = os.path.join(cwd, "samples", "underwriting", "underwriting_agreement.pdf")
    print("file_path=", file_path)
    # Explicit check rather than `assert`, which is removed under `python -O`.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"sample PDF not found: {file_path}")
    # Other sample inputs that have been used with this script:
    #   os.path.join(cwd, "samples", "F5-SupportPolicies.pdf")
    #   os.path.join(cwd, "samples\\underwriting\\1_underwriting_agreement.pdf")

    db = create_vectorstore(file_path, savedb=True)