Spaces:

codeby-hp
/

medical-rag-chatbot

Sleeping

medical-rag-chatbot / src /helper.py

Uploading the files

15a08d2 verified about 2 months ago

1.49 kB

	from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from langchain_classic.schema import Document
	from langchain_community.embeddings import HuggingFaceEmbeddings


	# Function: Load the pdf files from "data" dir
	def load_pdf_files(data):
	    """Load the PDF files found under the ``data`` directory.

	    A ``DirectoryLoader`` is built over ``data`` with the ``"*.pdf"`` glob
	    pattern and ``PyPDFLoader`` as the per-file loader class; the result of
	    its ``load()`` call is returned unchanged.
	    """
	    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()


	# Function: Filter the Documents
	def filter_to_minimal_docs(docs: list[Document]) -> list[Document]:
	    """Reduce each document to its text plus its source.

	    For every entry in ``docs`` a new ``Document`` is created that keeps the
	    original ``page_content`` and carries a metadata dict with a single
	    ``"source"`` key, whose value is ``metadata.get("source")`` of the
	    original entry. All other metadata is dropped.
	    """
	    return [
	        Document(
	            page_content=entry.page_content,
	            metadata={"source": entry.metadata.get("source")},
	        )
	        for entry in docs
	    ]


	# Function: Perform Text Splitting
	def text_split(minimal_docs, *, chunk_size=500, chunk_overlap=20):
	    """Split documents into text chunks.

	    Args:
	        minimal_docs: Documents to split; passed unchanged to
	            ``RecursiveCharacterTextSplitter.split_documents``.
	        chunk_size: Forwarded as the splitter's ``chunk_size``.
	            Keyword-only; defaults to 500.
	        chunk_overlap: Forwarded as the splitter's ``chunk_overlap``.
	            Keyword-only; defaults to 20.

	    Returns:
	        Whatever ``split_documents`` returns for ``minimal_docs``.
	    """
	    # The sizes are parameters so callers can tune chunking without editing
	    # this module; calling with no keywords keeps the 500/20 configuration.
	    splitter = RecursiveCharacterTextSplitter(
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	    )
	    return splitter.split_documents(minimal_docs)


	# Function: Download embedding model
	def download_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
	    """Create and return the HuggingFace embeddings model.

	    Args:
	        model_name: Identifier passed to ``HuggingFaceEmbeddings`` as
	            ``model_name``. Defaults to
	            ``"sentence-transformers/all-MiniLM-L6-v2"``.

	    Returns:
	        The ``HuggingFaceEmbeddings`` instance built for ``model_name``.
	    """
	    # NOTE(review): a vector index populated with one model's embeddings
	    # presumably has to be queried with the same model -- confirm against
	    # the callers before passing a non-default model_name.
	    return HuggingFaceEmbeddings(model_name=model_name)