"""Document loader for PDF files."""

import logging
from pathlib import Path
from typing import List

from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

from src.config.config import settings

logger = logging.getLogger(__name__)


class DocumentLoader:
    """Loader for PDF documents."""

    def __init__(

        self,

        chunk_size: int = 1000,

        chunk_overlap: int = 200,

    ) -> None:
        """Initialize document loader.



        Args:

            chunk_size: Size of text chunks.

            chunk_overlap: Overlap between chunks.

        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

    def load_pdf(self, pdf_path: Path) -> List[Document]:
        """Load a PDF file and split it into chunks.



        Args:

            pdf_path: Path to the PDF file.



        Returns:

            List of document chunks.

        """
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        try:
            logger.info(f"Loading PDF: {pdf_path}")
            loader = PyPDFLoader(str(pdf_path))
            documents = loader.load()

            # Split documents into chunks
            chunks = self.text_splitter.split_documents(documents)

            logger.info(f"Loaded {len(chunks)} chunks from {pdf_path}")
            return chunks
        except Exception as e:
            logger.error(f"Error loading PDF {pdf_path}: {str(e)}")
            raise

    def load_pdfs_from_directory(self, directory: Path) -> List[Document]:
        """Load all PDF files from a directory.



        Args:

            directory: Directory containing PDF files.



        Returns:

            List of document chunks from all PDFs.

        """
        if not directory.exists():
            raise FileNotFoundError(f"Directory not found: {directory}")

        pdf_files = list(directory.glob("*.pdf"))
        if not pdf_files:
            logger.warning(f"No PDF files found in {directory}")
            return []

        all_chunks: List[Document] = []
        for pdf_path in pdf_files:
            try:
                chunks = self.load_pdf(pdf_path)
                all_chunks.extend(chunks)
            except Exception as e:
                logger.error(f"Failed to load {pdf_path}: {str(e)}")
                continue

        logger.info(f"Loaded {len(all_chunks)} total chunks from {len(pdf_files)} PDFs")
        return all_chunks

    def load_csv_insights(self, csv_path: Path, sample_size: int = 1050000) -> List[Document]:
        """Load insights from CSV file and convert to documents.



        Args:

            csv_path: Path to CSV file.

            sample_size: Number of rows to sample from CSV.



        Returns:

            List of documents generated from CSV insights.

        """
        try:
            from src.rag.csv_document_generator import CSVDocumentGenerator

            logger.info(f"Loading CSV insights from {csv_path}")
            generator = CSVDocumentGenerator(csv_path, sample_size=sample_size)
            documents = generator.generate_all_documents()

            logger.info(f"Generated {len(documents)} documents from CSV insights")
            return documents
        except Exception as e:
            logger.error(f"Error loading CSV insights: {str(e)}")
            raise