| import os |
| from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader |
| from langchain.text_splitter import RecursiveCharacterTextSplitter |
| from langchain_huggingface import HuggingFaceEmbeddings |
| from langchain_community.vectorstores import FAISS |
|
|
| |
# Directory that is scanned for the source PDF files.
DATA_PATH = "data/"
# Location where the FAISS index is saved to and reloaded from.
FAISS_PATH = "vectorstore/db_faiss"
|
|
| |
def load_pdf_files(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``; files are
            selected with the glob ``*.pdf`` and read with ``PyPDFLoader``.

    Returns:
        The documents produced by ``DirectoryLoader.load()``.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()
|
|
# Load every PDF under DATA_PATH when the module is executed.
# NOTE(review): this runs on every execution, even when a saved index already
# exists and (in the visible code) the loaded documents end up unused.
documents = load_pdf_files(data=DATA_PATH)
|
|
| |
def create_chunks(extracted_data, chunk_size=500, chunk_overlap=50):
    """Split documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value that used to be hard-coded here.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            50, the value that used to be hard-coded here.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)
|
|
# Split the loaded documents into chunks; these are what gets embedded when a
# new index is built.
text_chunks = create_chunks(extracted_data=documents)
|
|
| |
def get_embedding_model():
    """Return a ``HuggingFaceEmbeddings`` instance for all-MiniLM-L6-v2."""
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_name)
|
|
# Module-level embedding model; handed to FAISS both when building a new index
# and when loading a saved one.
embedding_model = get_embedding_model()
|
|
| |
# Reuse the saved FAISS index when one exists; otherwise build it from the
# chunks and persist it for later runs.
#
# Test for the index file itself rather than only the directory: a directory
# left behind without an index in it (e.g. by an interrupted run) would
# otherwise route to load_local(), which fails when the index files are absent.
if os.path.isfile(os.path.join(FAISS_PATH, "index.faiss")):
    # NOTE(security): allow_dangerous_deserialization=True lets load_local()
    # unpickle data stored next to the index. Only load an index directory
    # that this script wrote itself; never point FAISS_PATH at untrusted data.
    db = FAISS.load_local(
        FAISS_PATH,
        embedding_model,
        allow_dangerous_deserialization=True,
    )
else:
    if not text_chunks:
        # Fail with an actionable message instead of an obscure error from
        # inside FAISS.from_documents when there is nothing to embed.
        raise ValueError(
            f"No text chunks to index; check that {DATA_PATH!r} contains "
            "readable PDF files."
        )
    db = FAISS.from_documents(text_chunks, embedding_model)
    db.save_local(FAISS_PATH)
|
|