"""Build, persist, and load FAISS vector stores from PDF documents."""

import os

# for loading the PDF documents
from langchain.document_loaders import PyPDFLoader
# text splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
# embeddings
from langchain.embeddings import SentenceTransformerEmbeddings
# Vector db imports
from langchain.vectorstores import FAISS


def create_vectorstore(filepath: str, savedb: bool = False) -> FAISS:
    """Create a FAISS vector store from the PDF at *filepath*.

    The PDF is loaded and split into chunks by ``preprocess`` and indexed
    with the embedding returned by ``get_embedding``.

    Args:
        filepath: Path of the PDF document to index.
        savedb: When true, the index is also saved under
            ``faiss_index/<name of the directory containing filepath>``.

    Returns:
        The newly built FAISS vector store.

    Raises:
        Exception: Any error raised while loading, splitting, or indexing
            the document is printed and then re-raised unchanged.
    """
    print("debug , in create vectorstore, filepath =", filepath)
    try:
        chunks = preprocess(filepath)
        embedding = get_embedding()
        print("in create vectorstore")
        db = FAISS.from_documents(documents=chunks, embedding=embedding)
    except Exception as e:
        print("Exception - e:", e)
        raise

    if savedb:
        # The saved index is keyed by the PDF's parent directory name, which
        # is the same name load_vectorstore() uses to look it up again.
        print("saving the new FAISS index for ", filepath)
        parent_dir_name = os.path.basename(os.path.dirname(filepath))
        print("pareant_dir_name", parent_dir_name)
        db.save_local("faiss_index/" + parent_dir_name)
    return db


def load_vectorstore(saved_db_name: str) -> FAISS:
    """Load a saved FAISS index by name, rebuilding it if loading fails.

    The index is read from ``faiss_index/<saved_db_name>``. If loading
    raises ``RuntimeError``, the index is rebuilt from
    ``<samples base path>/<saved_db_name>/underwriting_agreement.pdf`` and
    saved for next time.

    Args:
        saved_db_name: Name of the saved index; surrounding whitespace is
            stripped before use.

    Returns:
        The loaded (or freshly rebuilt) FAISS vector store.

    Raises:
        Exception: Any error other than ``RuntimeError`` from loading, and
            any error from rebuilding the index, propagates to the caller.
            Earlier revisions returned from a ``finally`` clause, which
            silently discarded those errors and returned ``None``.
    """
    embedding = get_embedding()
    saved_db_name = saved_db_name.strip()

    # Load the local database
    try:
        dbpath = "faiss_index/" + saved_db_name
        db = FAISS.load_local(dbpath, embedding)
    except RuntimeError:
        # NOTE(review): presumably RuntimeError is what the FAISS backend
        # raises when the index file is missing - confirm against the
        # installed faiss/langchain versions.
        print("unable to load the db, save_db_name=", saved_db_name)
        # NOTE(review): machine-specific absolute path; the fallback only
        # works where this directory exists.
        basepath = os.path.normpath(
            "C:/Users/ninad/develop/llm/huggingface/searchdocs/samples/"
        )
        filepath = os.path.join(
            basepath, saved_db_name, "underwriting_agreement.pdf"
        )
        filepath = os.path.normpath(filepath)
        print("in load_vectorstoe, file_path =", filepath)
        db = create_vectorstore(filepath, savedb=True)

    # Return outside of any ``finally`` so that in-flight exceptions are
    # never swallowed.
    print("in finally clause, returning db")
    print("debug - db is", db)
    return db


def get_embedding() -> SentenceTransformerEmbeddings:
    """Return the sentence-transformer embedding used for indexing and lookup."""
    # Alternative model that has been tried: "all-mpnet-base-v2"
    # ('sentence-transformers/all-mpnet-base-v2').
    embedding = SentenceTransformerEmbeddings(model_name="all-miniLM-L6-v2")
    return embedding


def get_input() -> str:
    """Return the path of the default sample PDF under the current directory."""
    cwd = os.getcwd()
    filpath = os.path.join(cwd, "samples/F5-SupportPolicies.pdf")
    return filpath


def preprocess(filpath) -> list:
    """Load the PDF at *filpath* and split it into chunks.

    Args:
        filpath: Path of the PDF document to load.

    Returns:
        The list of document chunks produced by the text splitter
        (``chunk_size=500``, ``chunk_overlap=5``).
    """
    # load the input file
    loader = PyPDFLoader(filpath)
    document = loader.load()

    # split the input document into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500, chunk_overlap=5
    )
    chunks = text_splitter.split_documents(document)
    return chunks


if __name__ == "__main__":
    cwd = os.getcwd()
    file_path = os.path.join(
        cwd, "samples", "underwriting", "underwriting_agreement.pdf"
    )
    print("file_path=", file_path)
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if not os.path.exists(file_path):
        raise FileNotFoundError(file_path)
    # Other sample inputs that have been used:
    #   os.path.join(cwd, "samples", "F5-SupportPolicies.pdf")
    #   os.path.join(cwd, "samples\\underwriting\\1_underwriting_agreement.pdf")
    db = create_vectorstore(file_path, savedb=True)