"""Document loader for PDF files."""

import logging
from pathlib import Path
from typing import List

from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

from src.config.config import settings

logger = logging.getLogger(__name__)


class DocumentLoader:
    """Loader for PDF documents."""

    def __init__(

        self,

        chunk_size: int = 1000,

        chunk_overlap: int = 200,

    ) -> None:
        """Initialize document loader.



        Args:

            chunk_size: Size of text chunks.

            chunk_overlap: Overlap between chunks.

        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

    def load_pdf(self, pdf_path: Path) -> List[Document]:
        """Load a PDF file and split it into chunks.



        Args:

            pdf_path: Path to the PDF file.



        Returns:

            List of document chunks.

        """
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        try:
            logger.info(f"Loading PDF: {pdf_path}")
            loader = PyPDFLoader(str(pdf_path))
            documents = loader.load()

            # Split documents into chunks
            chunks = self.text_splitter.split_documents(documents)

            logger.info(f"Loaded {len(chunks)} chunks from {pdf_path}")
            return chunks
        except Exception as e:
            logger.error(f"Error loading PDF {pdf_path}: {str(e)}")
            raise

    def load_pdfs_from_directory(self, directory: Path) -> List[Document]:
        """Load all PDF files from a directory.



        Args:

            directory: Directory containing PDF files.



        Returns:

            List of document chunks from all PDFs.

        """
        if not directory.exists():
            raise FileNotFoundError(f"Directory not found: {directory}")

        pdf_files = list(directory.glob("*.pdf"))
        if not pdf_files:
            logger.warning(f"No PDF files found in {directory}")
            return []

        all_chunks: List[Document] = []
        for pdf_path in pdf_files:
            try:
                chunks = self.load_pdf(pdf_path)
                all_chunks.extend(chunks)
            except Exception as e:
                logger.error(f"Failed to load {pdf_path}: {str(e)}")
                continue

        logger.info(f"Loaded {len(all_chunks)} total chunks from {len(pdf_files)} PDFs")
        return all_chunks

    def load_csv_insights(self, csv_path: Path, sample_size: int = 1050000) -> List[Document]:
        """Load insights from CSV file and convert to documents.



        Args:

            csv_path: Path to CSV file.

            sample_size: Number of rows to sample from CSV.



        Returns:

            List of documents generated from CSV insights.

        """
        try:
            from src.rag.csv_document_generator import CSVDocumentGenerator

            logger.info(f"Loading CSV insights from {csv_path}")
            generator = CSVDocumentGenerator(csv_path, sample_size=sample_size)
            documents = generator.generate_all_documents()

            logger.info(f"Generated {len(documents)} documents from CSV insights")
            return documents
        except Exception as e:
            logger.error(f"Error loading CSV insights: {str(e)}")
            raise