llm / preprocess.py
sathayen's picture
initial commit from local. need to test paths
c17b22b
Raw
History Blame Contribute Delete
3.09 kB
import os
# for loading the PDF documents
from langchain.document_loaders import PyPDFLoader
# text splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
#embeddings
from langchain.embeddings import SentenceTransformerEmbeddings
# Vector db imports
from langchain.vectorstores import FAISS
def create_vectorstore(filepath, savedb=False) -> FAISS:
    """Build a FAISS vector store from the PDF at ``filepath``.

    The document is loaded and chunked by ``preprocess`` and embedded with
    the model returned by ``get_embedding``.

    Args:
        filepath: Path to the PDF document to index.
        savedb: When true, persist the index under
            ``faiss_index/<name of the directory containing filepath>``.

    Returns:
        The newly built FAISS vector store.

    Raises:
        Exception: Any error raised while chunking, embedding or indexing is
            printed and then re-raised unchanged.
    """
    print("debug , in create vectorstore, filepath =", filepath)
    try:
        chunks = preprocess(filepath)
        embedding = get_embedding()
        print("in create vectorstore")
        db = FAISS.from_documents(documents=chunks, embedding=embedding)
    except Exception as e:
        print("Exception - e:", e)
        raise

    if savedb:
        print("saving the new FAISS index for ", filepath)
        # Resolve to an absolute path before taking the parent directory name.
        # For inputs such as "doc.pdf", "./doc.pdf" or "a/../doc.pdf" the raw
        # dirname yields "", "." or "..", which would drop the index directly
        # into (or above) the shared "faiss_index" folder.
        parent_dir = os.path.dirname(os.path.abspath(filepath))
        parent_dir_name = os.path.basename(parent_dir)
        print("pareant_dir_name", parent_dir_name)
        db.save_local(os.path.join("faiss_index", parent_dir_name))
    return db
def load_vectorstore(
    saved_db_name,
    samples_dir="C:/Users/ninad/develop/llm/huggingface/searchdocs/samples/",
    pdf_name="underwriting_agreement.pdf",
) -> FAISS:
    """Load a saved FAISS index, rebuilding it from the source PDF if needed.

    The index is looked up at ``faiss_index/<saved_db_name>``. If loading
    raises ``RuntimeError``, the index is rebuilt (and saved) from
    ``<samples_dir>/<saved_db_name>/<pdf_name>`` via ``create_vectorstore``.

    Args:
        saved_db_name: Name of the saved index directory; surrounding
            whitespace is stripped.
        samples_dir: Directory holding one sub-directory of sample documents
            per index name. The default is the original hard-coded
            development path; pass a real location on other machines.
        pdf_name: File name of the PDF used to rebuild the index.

    Returns:
        The loaded or freshly rebuilt FAISS vector store.

    Raises:
        Exception: Errors other than ``RuntimeError`` from loading, and any
            error raised while rebuilding, propagate to the caller.
    """
    embedding = get_embedding()
    # Pre-bind so the debug print in ``finally`` works on the failure path too.
    db = None
    saved_db_name = saved_db_name.strip()
    dbpath = "faiss_index/" + saved_db_name

    try:
        db = FAISS.load_local(dbpath, embedding)
    except RuntimeError:
        print("unable to load the db, save_db_name=", saved_db_name)
        basepath = os.path.normpath(samples_dir)
        filepath = os.path.normpath(
            os.path.join(basepath, saved_db_name, pdf_name)
        )
        print("in load_vectorstoe, file_path =", filepath)
        db = create_vectorstore(filepath, savedb=True)
    finally:
        print("in finally clause, returning db")
        print("debug - db is", db)

    # The return sits after the try statement, not inside ``finally``: a
    # return in ``finally`` would discard any in-flight exception and hand
    # the caller ``None`` instead of a vector store.
    return db
def get_embedding():
    """Return the sentence-transformer embedding used for indexing and lookup.

    The model is fixed to ``all-miniLM-L6-v2``.
    """
    return SentenceTransformerEmbeddings(model_name="all-miniLM-L6-v2")
def get_input() -> str:
    """Return the path of the default sample PDF under the working directory."""
    return os.path.join(os.getcwd(), "samples/F5-SupportPolicies.pdf")
def preprocess(filpath) -> list:
    """Load the PDF at ``filpath`` and split it into chunks.

    Args:
        filpath: Path to the PDF document.

    Returns:
        The chunks produced by splitting the loaded document with a
        chunk size of 500 and an overlap of 5.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=5)
    pages = PyPDFLoader(filpath).load()
    return splitter.split_documents(pages)
if __name__ == "__main__":
    # Build and persist the index for the sample underwriting agreement
    # located under the current working directory.
    file_path = os.path.join(
        os.getcwd(), "samples", "underwriting", "underwriting_agreement.pdf"
    )
    print("file_path=", file_path)
    # Explicit check instead of ``assert``: assertions are stripped when
    # Python runs with -O, which would defer the failure to the PDF loader.
    if not os.path.exists(file_path):
        raise FileNotFoundError(file_path)
    db = create_vectorstore(file_path, savedb=True)