| import os |
|
|
| |
| from langchain.document_loaders import PyPDFLoader |
|
|
| |
| from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
| |
| from langchain.embeddings import SentenceTransformerEmbeddings |
|
|
| |
| from langchain.vectorstores import FAISS |
|
|
|
|
def create_vectorstore(filepath, savedb=False) -> FAISS:
    """Build a FAISS vector store from the PDF at ``filepath``.

    The PDF is split into chunks by ``preprocess`` and embedded with the
    object returned by ``get_embedding``.

    Args:
        filepath: Path of the PDF to index.
        savedb: When true, the index is also written to disk under
            ``faiss_index/<name of the PDF's parent directory>``.

    Returns:
        The newly built FAISS store.

    Raises:
        Exception: Whatever chunking, embedding or index construction
            raises is printed and then re-raised unchanged.
    """
    print("debug , in create vectorstore, filepath =", filepath)
    try:
        docs = preprocess(filepath)
        embedder = get_embedding()
        print("in create vectorstore")
        store = FAISS.from_documents(documents=docs, embedding=embedder)
    except Exception as exc:
        print("Exception - e:", exc)
        raise

    # Nothing to persist: hand the in-memory index straight back.
    if not savedb:
        return store

    print("saving the new FAISS index for ", filepath)
    # The saved index is keyed by the directory that contains the PDF.
    parent_dir_name = os.path.basename(os.path.dirname(filepath))
    print("pareant_dir_name", parent_dir_name)
    store.save_local(f"faiss_index/{parent_dir_name}")
    return store
|
|
def load_vectorstore(
    saved_db_name,
    basepath="C:/Users/ninad/develop/llm/huggingface/searchdocs/samples/",
    pdf_name="underwriting_agreement.pdf",
) -> FAISS:
    """Load a saved FAISS index, rebuilding it from the source PDF if needed.

    The index is looked up at ``faiss_index/<saved_db_name>``. If loading
    raises ``RuntimeError``, the index is rebuilt with ``create_vectorstore``
    from ``<basepath>/<saved_db_name>/<pdf_name>`` and saved to disk.

    Args:
        saved_db_name: Name of the saved index; surrounding whitespace is
            stripped. It is also used as the sub-directory of ``basepath``
            that holds the source PDF.
        basepath: Directory containing one sub-directory per index. The
            default keeps the location that used to be hard-coded here.
        pdf_name: File name of the PDF used for the rebuild.

    Returns:
        The loaded or freshly rebuilt FAISS store.

    Raises:
        Exception: Any error other than a ``RuntimeError`` from loading, and
            any error raised while rebuilding, propagates to the caller.
    """
    embedding = get_embedding()
    saved_db_name = saved_db_name.strip()
    dbpath = "faiss_index/" + saved_db_name

    db = None
    try:
        db = FAISS.load_local(dbpath, embedding)
    except RuntimeError:
        print("unable to load the db, save_db_name=", saved_db_name)
        filepath = os.path.normpath(
            os.path.join(os.path.normpath(basepath), saved_db_name, pdf_name)
        )
        print("in load_vectorstoe, file_path =", filepath)
        db = create_vectorstore(filepath, savedb=True)
    finally:
        # Diagnostics are printed on both the success and the failure path.
        print("in finally clause, returning db")
        print("debug - db is", db)

    # The return stays outside ``finally`` on purpose: returning from a
    # ``finally`` clause discards any in-flight exception, which would hand
    # callers ``None`` instead of reporting a failed rebuild.
    return db
| |
|
|
def get_embedding():
    """Create the embedding object used to build and load the indexes.

    Returns:
        A new ``SentenceTransformerEmbeddings`` instance configured with the
        ``all-miniLM-L6-v2`` model name.
    """
    return SentenceTransformerEmbeddings(model_name="all-miniLM-L6-v2")
|
|
def get_input() -> str:
    """Return the path of the sample support-policies PDF.

    The path is ``samples/F5-SupportPolicies.pdf`` joined onto the current
    working directory; the file is not checked for existence.
    """
    return os.path.join(os.getcwd(), "samples/F5-SupportPolicies.pdf")
|
|
def preprocess(filpath) -> list:
    """Load a PDF and split it into chunks ready for embedding.

    Args:
        filpath: Path of the PDF handed to ``PyPDFLoader``.

    Returns:
        The chunks produced by a ``RecursiveCharacterTextSplitter``
        configured with ``chunk_size=500`` and ``chunk_overlap=5``.
    """
    pages = PyPDFLoader(filpath).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=5)
    return splitter.split_documents(pages)
|
|
| |
if __name__ == "__main__":
    # Script entry point: build and save the index for the sample
    # underwriting agreement found under the current working directory.
    file_path = os.path.join(
        os.getcwd(), "samples", "underwriting", "underwriting_agreement.pdf"
    )
    print("file_path=", file_path)

    # Explicit check rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would let a missing file slip through.
    if not os.path.exists(file_path):
        raise FileNotFoundError(file_path)

    db = create_vectorstore(file_path, savedb=True)
|
|