Spaces:

A1ee
/

mediAI

Sleeping

mediAI / creation_memory_llm.py

Update creation_memory_llm.py

e3b9b02 verified 10 months ago

1.25 kB

	import os
	from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_huggingface import HuggingFaceEmbeddings
	from langchain_community.vectorstores import FAISS

	# Filesystem locations used by this script:
	#   DATA_PATH  - directory that is scanned for source PDF files.
	#   FAISS_PATH - directory where the FAISS index is saved to / loaded from.
	DATA_PATH = "data/"
	FAISS_PATH = "vectorstore/db_faiss"

	# Step 1: Load raw PDFs
	def load_pdf_files(data):
	    """Load the PDF files found in the directory *data*.

	    Files are selected with the ``*.pdf`` glob and each one is read with
	    ``PyPDFLoader``.

	    Args:
	        data: Path of the directory to scan.

	    Returns:
	        Whatever ``DirectoryLoader.load()`` produces for the matched files.
	    """
	    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
	    return pdf_loader.load()

	# Runs at module level: the PDFs under DATA_PATH are loaded on every execution.
	documents = load_pdf_files(DATA_PATH)

	# Step 2: Create Chunks
	def create_chunks(extracted_data):
	    """Split the loaded documents into overlapping chunks.

	    A ``RecursiveCharacterTextSplitter`` is configured with
	    ``chunk_size=500`` and ``chunk_overlap=50``.

	    Args:
	        extracted_data: Documents to split (as returned by the PDF loader).

	    Returns:
	        The result of ``split_documents`` on *extracted_data*.
	    """
	    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
	    chunks = splitter.split_documents(extracted_data)
	    return chunks

	# Chunk the loaded documents; these chunks are what gets embedded into the FAISS index below.
	text_chunks = create_chunks(documents)

	# Step 3: Embeddings
	def get_embedding_model():
	    """Create the embedding model used for the vector store.

	    Returns:
	        A ``HuggingFaceEmbeddings`` instance configured with the
	        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
	    """
	    model_name = "sentence-transformers/all-MiniLM-L6-v2"
	    return HuggingFaceEmbeddings(model_name=model_name)

	# Created once at module level; the same instance is used both to build and to load the FAISS index.
	embedding_model = get_embedding_model()

	# Step 4: Store or Load FAISS
	# Reuse a previously saved index when one is present, otherwise build and save it.
	# The check looks for the index files themselves rather than just the
	# directory, so an empty or partially written FAISS_PATH triggers a rebuild
	# instead of a failing load. The file names assume the default
	# index_name ("index") used by save_local/load_local.
	# NOTE(review): the PDFs are still loaded and chunked above even when the
	# saved index is reused; consider moving that work into the build branch.
	_index_files = [
	    os.path.join(FAISS_PATH, file_name)
	    for file_name in ("index.faiss", "index.pkl")
	]
	if all(os.path.isfile(index_file) for index_file in _index_files):
	    # SECURITY: allow_dangerous_deserialization=True permits pickle loading of
	    # the stored data. Keep it only while FAISS_PATH is written exclusively
	    # by this script; never point it at an index from an untrusted source.
	    db = FAISS.load_local(FAISS_PATH, embedding_model, allow_dangerous_deserialization=True)
	else:
	    if not text_chunks:
	        # Fail early with an actionable message rather than handing an
	        # empty list to FAISS.from_documents.
	        raise ValueError(
	            f"No text chunks were produced from the PDFs in {DATA_PATH!r}; "
	            "cannot build the FAISS index."
	        )
	    db = FAISS.from_documents(text_chunks, embedding_model)
	    db.save_local(FAISS_PATH)