Spaces:
Sleeping
Sleeping
File size: 1,796 Bytes
31a1fee ba900f0 40ca01e 31a1fee ba900f0 31a1fee ba900f0 31a1fee ba900f0 31a1fee ba900f0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from src.utils import logger, convert_document_to_markdown, save_to_markdown
class DocumentProcessor:
    """Convert documents to markdown and split the result into chunks.

    A single ``RecursiveCharacterTextSplitter`` is created in ``__init__``
    and reused by every call to ``load_and_split_pdf``.
    """

    def __init__(self, chunk_size=500, chunk_overlap=100):
        """Create the text splitter used for chunking.

        Args:
            chunk_size: Forwarded unchanged to the splitter's ``chunk_size``.
            chunk_overlap: Forwarded unchanged to the splitter's
                ``chunk_overlap``.
        """
        splitter_options = {
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
        }
        self.text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

    def process_document(self, file_path: str) -> str:
        """Convert the document at ``file_path`` to markdown and save it.

        The path is wrapped in a ``Path`` and handed to
        ``convert_document_to_markdown``; the converted content is then
        written out through ``save_to_markdown``.

        Args:
            file_path (str): The path to the document file.

        Returns:
            str: The value returned by ``save_to_markdown``, i.e. the
            location of the saved markdown file.
        """
        logger.info(f"Processing document: {file_path}")
        source_path = Path(file_path)

        markdown_text = convert_document_to_markdown(source_path)
        logger.info("Document converted to markdown.")

        saved_path = save_to_markdown(markdown_text, source_path)
        logger.info(f"Markdown file saved at: {saved_path}")
        return saved_path

    def load_and_split_pdf(self, file_path: str):
        """Convert a document to markdown, load it, and split it into chunks.

        The document is first run through ``process_document``; the markdown
        file it produces is loaded with ``UnstructuredMarkdownLoader`` and the
        loaded documents are split with ``self.text_splitter``.

        Args:
            file_path (str): The path to the source document.

        Returns:
            list: The document chunks produced by the splitter.
        """
        logger.info(f"Loading and splitting Document: {file_path}")

        # Conversion happens first: the loader reads the generated markdown
        # file, not the original document.
        markdown_path = self.process_document(file_path)
        loaded_docs = UnstructuredMarkdownLoader(markdown_path).load()
        pieces = self.text_splitter.split_documents(loaded_docs)

        logger.info(f"Loaded and split Document into {len(pieces)} chunks")
        return pieces