Spaces:
Sleeping
Sleeping
| from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_huggingface import HuggingFaceEmbeddings | |
| from langchain_community.vectorstores import FAISS | |
| from dotenv import load_dotenv, find_dotenv | |
# Locate a .env file with find_dotenv() and load it before anything else runs.
load_dotenv(dotenv_path=find_dotenv())

# Directory that is scanned for source PDF files (relative to the working
# directory the script is launched from).
DATA_PATH = "../data/"
def load_pdf_files(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory passed to ``DirectoryLoader``; files are
            selected with the ``*.pdf`` glob and read with ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files.
    """
    pdf_loader = DirectoryLoader(data, glob='*.pdf', loader_cls=PyPDFLoader)
    return pdf_loader.load()
# Step 1: read every PDF under DATA_PATH.
documents = load_pdf_files(DATA_PATH)
def create_chunks(extracted_data):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: The documents to split, as accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        The chunks produced by the splitter, configured with
        ``chunk_size=500`` and ``chunk_overlap=50``.
    """
    splitter_settings = {"chunk_size": 500, "chunk_overlap": 50}
    splitter = RecursiveCharacterTextSplitter(**splitter_settings)
    return splitter.split_documents(extracted_data)
# Step 2: break the loaded documents into chunks for embedding.
text_chunks = create_chunks(documents)
def get_embedding_model():
    """Create the embedding model used to vectorize text chunks.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )
# Step 3: build the embedding model used to vectorize the chunks.
embedding_model = get_embedding_model()

# Step 4: index the chunks in FAISS and write the index to disk.
# Folder (relative to the working directory) that receives the saved index.
DB_FAISS_PATH = "../vectorstore/db_faiss"

db = FAISS.from_documents(documents=text_chunks, embedding=embedding_model)
db.save_local(DB_FAISS_PATH)