Spaces:
Sleeping
Sleeping
| from pathlib import Path | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_community.document_loaders import UnstructuredMarkdownLoader | |
| from src.utils import logger, convert_document_to_markdown, save_to_markdown | |
class DocumentProcessor:
    """Convert documents to markdown and split the result into chunks.

    The splitter is built once, at construction time, and reused for every
    document handled by this instance.
    """

    def __init__(self, chunk_size=500, chunk_overlap=100):
        """Create the text splitter used by ``load_and_split_pdf``.

        Args:
            chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
                ``chunk_size``.
            chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
                ``chunk_overlap``.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_overlap=chunk_overlap,
            chunk_size=chunk_size,
        )
        self.text_splitter = splitter

    def process_document(self, file_path: str) -> str:
        """Convert the document at ``file_path`` to markdown and save it.

        Args:
            file_path (str): The path to the document file.

        Returns:
            str: The path to the saved markdown file, exactly as returned by
            ``save_to_markdown``.
        """
        logger.info(f"Processing document: {file_path}")

        # Both helpers receive the same Path object built from the input.
        source = Path(file_path)
        markdown_text = convert_document_to_markdown(source)
        logger.info("Document converted to markdown.")

        saved_path = save_to_markdown(markdown_text, source)
        logger.info(f"Markdown file saved at: {saved_path}")
        return saved_path

    def load_and_split_pdf(self, file_path: str):
        """Convert a document to markdown, load it, and split it into chunks.

        The method performs no file-type check of its own; the input is
        handed to ``process_document`` and the resulting markdown file is
        what gets loaded and split.

        Args:
            file_path (str): The path to the document to load.

        Returns:
            list: The document chunks produced by the text splitter.
        """
        logger.info(f"Loading and splitting Document: {file_path}")

        # The loader reads the converted markdown file, not the original input.
        markdown_path = self.process_document(file_path)
        documents = UnstructuredMarkdownLoader(markdown_path).load()

        pieces = self.text_splitter.split_documents(documents)
        logger.info(f"Loaded and split Document into {len(pieces)} chunks")
        return pieces