File size: 1,796 Bytes
31a1fee
ba900f0
40ca01e
31a1fee
ba900f0
 
 
 
 
 
 
31a1fee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba900f0
 
31a1fee
 
 
 
 
 
 
 
 
 
ba900f0
 
31a1fee
ba900f0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from src.utils import logger, convert_document_to_markdown, save_to_markdown

class DocumentProcessor:
    """Convert documents to markdown and split the result into chunks.

    The splitter is built once in the constructor and reused for every
    document handled by this instance.
    """

    def __init__(self, chunk_size=500, chunk_overlap=100):
        """
        Builds the text splitter used by this processor.
        Args:
            chunk_size: Forwarded to RecursiveCharacterTextSplitter as
                ``chunk_size``.
            chunk_overlap: Forwarded to RecursiveCharacterTextSplitter as
                ``chunk_overlap``.
        """
        splitter_options = {
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
        }
        self.text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

    def process_document(self, file_path: str) -> str:
        """
        Converts a document to markdown and saves the markdown output.
        Args:
            file_path (str): The path to the document file.
        Returns:
            str: The value returned by ``save_to_markdown``, used here as
                the path of the saved markdown file.
        """
        logger.info(f"Processing document: {file_path}")
        source_path = Path(file_path)

        # Conversion and persistence are both delegated to src.utils helpers;
        # the save location is decided by save_to_markdown, not here.
        markdown_text = convert_document_to_markdown(source_path)
        logger.info("Document converted to markdown.")

        saved_path = save_to_markdown(markdown_text, source_path)
        logger.info(f"Markdown file saved at: {saved_path}")

        return saved_path

    def load_and_split_pdf(self, file_path: str):
        """
        Converts a document to markdown, loads it, and splits it into chunks.
        Args:
            file_path (str): The path to the source document.
        Returns:
            The chunks produced by the text splitter's ``split_documents``.
        """
        logger.info(f"Loading and splitting Document: {file_path}")

        # The loader reads the markdown file written by process_document,
        # not the original input file.
        markdown_path = self.process_document(file_path)
        loaded_docs = UnstructuredMarkdownLoader(markdown_path).load()

        pieces = self.text_splitter.split_documents(loaded_docs)
        logger.info(f"Loaded and split Document into {len(pieces)} chunks")
        return pieces