Spaces:

sathayen
/

llm

Runtime error

App Files Files Community

llm / preprocess.py

sathayen

initial commit from local. need to test paths

c17b22b almost 3 years ago

Raw

History Blame Contribute Delete

3.09 kB

	import os

	# for loading the PDF documents
	from langchain.document_loaders import PyPDFLoader

	# text splitter
	from langchain.text_splitter import RecursiveCharacterTextSplitter

	#embeddings
	from langchain.embeddings import SentenceTransformerEmbeddings

	# Vector db imports
	from langchain.vectorstores import FAISS


	def create_vectorstore(filepath, savedb=False) -> FAISS:
	    """Build a FAISS vector store from the PDF at ``filepath``.

	    The file is chunked with ``preprocess`` and embedded with the model
	    returned by ``get_embedding``.

	    Args:
	        filepath: Path to the PDF document to index.
	        savedb: When true, also persist the index under
	            ``faiss_index/<name of the directory containing filepath>``.

	    Returns:
	        The newly built FAISS store.

	    Raises:
	        Exception: Any error raised while chunking, embedding or indexing
	            is printed and then re-raised unchanged.
	    """
	    print("debug , in create vectorstore, filepath =", filepath)
	    try:
	        doc_chunks = preprocess(filepath)
	        embedder = get_embedding()
	        print("in create vectorstore")
	        store = FAISS.from_documents(documents=doc_chunks, embedding=embedder)
	    except Exception as err:
	        print("Exception - e:", err)
	        raise
	    if not savedb:
	        return store
	    # Persist the index, named after the directory that holds the source file.
	    print("saving the new FAISS index for ", filepath)
	    index_name = os.path.basename(os.path.dirname(filepath))
	    print("pareant_dir_name", index_name)
	    store.save_local("faiss_index/" + index_name)
	    return store

	def load_vectorstore(saved_db_name) -> FAISS:
	    """Load the FAISS index stored under ``faiss_index/<saved_db_name>``.

	    If loading raises ``RuntimeError``, the index is rebuilt from
	    ``samples/<saved_db_name>/underwriting_agreement.pdf`` (resolved against
	    the current working directory) and saved via ``create_vectorstore``.

	    Args:
	        saved_db_name: Name of the saved index directory; surrounding
	            whitespace is stripped before use.

	    Returns:
	        The loaded, or freshly rebuilt, FAISS store.

	    Raises:
	        Exception: Errors other than ``RuntimeError`` from loading, and any
	            error raised while rebuilding, propagate to the caller.
	    """
	    embedding = get_embedding()
	    db = None
	    saved_db_name = saved_db_name.strip()
	    try:
	        dbpath = "faiss_index/" + saved_db_name
	        db = FAISS.load_local(dbpath, embedding)
	    except RuntimeError:
	        print("unable to load the db, save_db_name=", saved_db_name)
	        # Resolve the sample PDF relative to the working directory (the same
	        # samples/<name>/ layout the script entry point uses) instead of an
	        # absolute path that only exists on one developer machine.
	        filepath = os.path.normpath(
	            os.path.join(
	                os.getcwd(),
	                "samples",
	                saved_db_name,
	                "underwriting_agreement.pdf",
	            )
	        )
	        print("in load_vectorstoe, file_path =", filepath)
	        db = create_vectorstore(filepath, savedb=True)
	    finally:
	        print("in finally clause, returning db")
	    # Return after the try statement (not inside ``finally``) so that an
	    # in-flight exception is never silently replaced by a return value.
	    print("debug - db is", db)
	    return db


	def get_embedding():
	    """Return a ``SentenceTransformerEmbeddings`` for the all-miniLM-L6-v2 model."""
	    # Alternative model noted by the author: "all-mpnet-base-v2"
	    # (sentence-transformers/all-mpnet-base-v2).
	    model_name = "all-miniLM-L6-v2"
	    return SentenceTransformerEmbeddings(model_name=model_name)

	def get_input() -> str:
	    """Return the path to ``samples/F5-SupportPolicies.pdf`` under the current working directory."""
	    return os.path.join(os.getcwd(), "samples/F5-SupportPolicies.pdf")

	def preprocess(filpath) -> list:
	    """Load the PDF at ``filpath`` and split it into text chunks.

	    Args:
	        filpath: Path to the PDF file to load.

	    Returns:
	        The chunks produced by ``RecursiveCharacterTextSplitter`` with
	        ``chunk_size=500`` and ``chunk_overlap=5``.
	    """
	    loaded_docs = PyPDFLoader(filpath).load()
	    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=5)
	    return splitter.split_documents(loaded_docs)


	if __name__ == "__main__":
	    # Script entry point: build and save the FAISS index for the
	    # underwriting sample document found under the working directory.
	    cwd = os.getcwd()
	    file_path = os.path.join(
	        cwd, "samples", "underwriting", "underwriting_agreement.pdf"
	    )
	    print("file_path=", file_path)
	    # Explicit check rather than ``assert``: assertions are stripped when
	    # Python runs with -O, which would let a missing file slip through.
	    if not os.path.exists(file_path):
	        raise FileNotFoundError(file_path)
	    db = create_vectorstore(file_path, savedb=True)