agentic-rag

Sleeping

agentic-rag / src /indexing /document_processing.py

fahmiaziz98

init

31a1fee 9 months ago

1.8 kB

	from pathlib import Path
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.document_loaders import UnstructuredMarkdownLoader
	from src.utils import logger, convert_document_to_markdown, save_to_markdown

	class DocumentProcessor:
	    """Convert documents to markdown and split the result into chunks.

	    The instance holds one ``RecursiveCharacterTextSplitter`` (stored as
	    ``self.text_splitter``) that is reused for every document.
	    """

	    def __init__(self, chunk_size=500, chunk_overlap=100):
	        """
	        Build the text splitter used by ``load_and_split_pdf``.
	        Args:
	        chunk_size: Forwarded to the splitter as ``chunk_size``.
	        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
	        """
	        splitter_options = {
	            "chunk_size": chunk_size,
	            "chunk_overlap": chunk_overlap,
	        }
	        self.text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

	    def process_document(self, file_path: str) -> str:
	        """
	        Convert the document at ``file_path`` to markdown and save it.
	        Args:
	        file_path (str): The path to the document file.
	        Returns:
	        str: Whatever ``save_to_markdown`` returns, treated here as the
	        path of the saved markdown file.
	        """
	        logger.info(f"Processing document: {file_path}")
	        source_path = Path(file_path)

	        # The same Path object is handed to both helpers: first to produce
	        # the markdown content, then to decide where it gets written.
	        markdown_text = convert_document_to_markdown(source_path)
	        logger.info("Document converted to markdown.")

	        saved_path = save_to_markdown(markdown_text, source_path)
	        logger.info(f"Markdown file saved at: {saved_path}")

	        return saved_path

	    def load_and_split_pdf(self, file_path: str):
	        """
	        Convert a document to markdown, load it back, and split it.
	        Args:
	        file_path (str): The path to the source document.
	        Returns:
	        The chunks produced by ``self.text_splitter.split_documents``.
	        """
	        logger.info(f"Loading and splitting Document: {file_path}")

	        # Conversion happens first; the loader reads the generated markdown
	        # file, not the original document.
	        markdown_path = self.process_document(file_path)
	        loaded_docs = UnstructuredMarkdownLoader(markdown_path).load()

	        pieces = self.text_splitter.split_documents(loaded_docs)
	        logger.info(f"Loaded and split Document into {len(pieces)} chunks")
	        return pieces