mediAI / creation_memory_llm.py
A1ee's picture
Update creation_memory_llm.py
e3b9b02 verified
Raw
History Blame Contribute Delete
1.25 kB
import os
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Filesystem locations used by this script.
# DATA_PATH: directory that is scanned for the source PDF files.
# FAISS_PATH: directory the FAISS index is saved to and loaded from.
DATA_PATH = "data/"
FAISS_PATH = "vectorstore/db_faiss"
# Step 1: Load raw PDFs
def load_pdf_files(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory to scan; only entries matching ``*.pdf`` are loaded.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files, each
        file being read through ``PyPDFLoader``.
    """
    return DirectoryLoader(data, loader_cls=PyPDFLoader, glob="*.pdf").load()
# Runs at module level, so the PDFs under DATA_PATH are loaded as soon as this
# file is executed or imported.
documents = load_pdf_files(DATA_PATH)
# Step 2: Create Chunks
def create_chunks(extracted_data, *, chunk_size: int = 500, chunk_overlap: int = 50):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: The documents to split, as returned by
            ``load_pdf_files``.
        chunk_size: Passed through as the splitter's ``chunk_size``.
            Defaults to 500, the value this script has always used.
        chunk_overlap: Passed through as the splitter's ``chunk_overlap``.
            Defaults to 50, the value this script has always used.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` for
        ``extracted_data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)
# Split the loaded documents at module level; these chunks are what gets
# indexed when a new FAISS store is built further down.
text_chunks = create_chunks(documents)
# Step 3: Embeddings
def get_embedding_model(model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
    """Create the embedding model used for the vector store.

    Args:
        model_name: Model identifier handed to ``HuggingFaceEmbeddings``.
            Defaults to the model this script has always used, so existing
            zero-argument calls behave exactly as before.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)
# Instantiated once at module level; the same object is used both when
# building a new FAISS index and when loading an existing one.
embedding_model = get_embedding_model()
# Step 4: Store or Load FAISS
# Build the index on first run, otherwise reuse the one saved on disk.
# Check for the index file itself rather than just the directory: an empty or
# partially written FAISS_PATH would otherwise send us down the load branch
# and fail. "index.faiss" is the file name save_local() writes with its
# default index name.
if not os.path.isfile(os.path.join(FAISS_PATH, "index.faiss")):
    if not text_chunks:
        # Fail with an actionable message instead of an opaque error from
        # inside the vector-store construction.
        raise ValueError(
            f"No text chunks to index: no PDF content was loaded from {DATA_PATH!r}"
        )
    db = FAISS.from_documents(text_chunks, embedding_model)
    db.save_local(FAISS_PATH)
else:
    # NOTE(security): allow_dangerous_deserialization=True opts in to
    # pickle-based loading of the saved store. Only load index directories
    # that this script wrote itself; never point FAISS_PATH at untrusted data.
    db = FAISS.load_local(
        FAISS_PATH,
        embedding_model,
        allow_dangerous_deserialization=True,
    )