"""Document loader for PDF files.""" import logging from pathlib import Path from typing import List from langchain_community.document_loaders import PyPDFLoader from langchain_core.documents import Document from langchain_text_splitters import RecursiveCharacterTextSplitter from src.config.config import settings logger = logging.getLogger(__name__) class DocumentLoader: """Loader for PDF documents.""" def __init__( self, chunk_size: int = 1000, chunk_overlap: int = 200, ) -> None: """Initialize document loader. Args: chunk_size: Size of text chunks. chunk_overlap: Overlap between chunks. """ self.chunk_size = chunk_size self.chunk_overlap = chunk_overlap self.text_splitter = RecursiveCharacterTextSplitter( chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len, ) def load_pdf(self, pdf_path: Path) -> List[Document]: """Load a PDF file and split it into chunks. Args: pdf_path: Path to the PDF file. Returns: List of document chunks. """ if not pdf_path.exists(): raise FileNotFoundError(f"PDF file not found: {pdf_path}") try: logger.info(f"Loading PDF: {pdf_path}") loader = PyPDFLoader(str(pdf_path)) documents = loader.load() # Split documents into chunks chunks = self.text_splitter.split_documents(documents) logger.info(f"Loaded {len(chunks)} chunks from {pdf_path}") return chunks except Exception as e: logger.error(f"Error loading PDF {pdf_path}: {str(e)}") raise def load_pdfs_from_directory(self, directory: Path) -> List[Document]: """Load all PDF files from a directory. Args: directory: Directory containing PDF files. Returns: List of document chunks from all PDFs. """ if not directory.exists(): raise FileNotFoundError(f"Directory not found: {directory}") pdf_files = list(directory.glob("*.pdf")) if not pdf_files: logger.warning(f"No PDF files found in {directory}") return [] all_chunks: List[Document] = [] for pdf_path in pdf_files: try: chunks = self.load_pdf(pdf_path) all_chunks.extend(chunks) except Exception as e: logger.error(f"Failed to load {pdf_path}: {str(e)}") continue logger.info(f"Loaded {len(all_chunks)} total chunks from {len(pdf_files)} PDFs") return all_chunks def load_csv_insights(self, csv_path: Path, sample_size: int = 1050000) -> List[Document]: """Load insights from CSV file and convert to documents. Args: csv_path: Path to CSV file. sample_size: Number of rows to sample from CSV. Returns: List of documents generated from CSV insights. """ try: from src.rag.csv_document_generator import CSVDocumentGenerator logger.info(f"Loading CSV insights from {csv_path}") generator = CSVDocumentGenerator(csv_path, sample_size=sample_size) documents = generator.generate_all_documents() logger.info(f"Generated {len(documents)} documents from CSV insights") return documents except Exception as e: logger.error(f"Error loading CSV insights: {str(e)}") raise