Fraud-Chatbot / src /rag /document_loader.py
ahmzakif's picture
feat: add new project
fd99b61 verified
"""Document loader for PDF files."""
import logging
from pathlib import Path
from typing import List
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from src.config.config import settings
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
class DocumentLoader:
    """Loader for PDF documents.

    PDFs are read with ``PyPDFLoader`` and split into chunks by a
    ``RecursiveCharacterTextSplitter`` configured at construction time.
    CSV-derived documents are delegated to ``CSVDocumentGenerator``.
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ) -> None:
        """Initialize document loader.

        Args:
            chunk_size: Size of text chunks, measured with ``len``.
            chunk_overlap: Overlap between chunks, measured the same way.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

    def load_pdf(self, pdf_path: Path) -> "List[Document]":
        """Load a PDF file and split it into chunks.

        Args:
            pdf_path: Path to the PDF file. A plain string is accepted too.

        Returns:
            List of document chunks.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
            Exception: Any error raised while reading or splitting the PDF
                is logged and re-raised unchanged.
        """
        # Coerce so callers passing a str do not fail on ``.exists()``.
        pdf_path = Path(pdf_path)
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        try:
            logger.info("Loading PDF: %s", pdf_path)
            documents = PyPDFLoader(str(pdf_path)).load()
            # Split documents into chunks
            chunks = self.text_splitter.split_documents(documents)
        except Exception as e:
            logger.error("Error loading PDF %s: %s", pdf_path, e)
            raise

        logger.info("Loaded %d chunks from %s", len(chunks), pdf_path)
        return chunks

    def load_pdfs_from_directory(self, directory: Path) -> "List[Document]":
        """Load all PDF files from a directory.

        Only the directory's direct children are considered (no recursion).
        Files are processed in sorted order so the resulting chunk order is
        reproducible, and the ``.pdf`` extension is matched
        case-insensitively. A file that fails to load is logged and skipped;
        the remaining files are still processed.

        Args:
            directory: Directory containing PDF files. A plain string is
                accepted too.

        Returns:
            List of document chunks from all PDFs that loaded successfully;
            empty if the directory contains no PDF files.

        Raises:
            FileNotFoundError: If ``directory`` does not exist.
        """
        directory = Path(directory)
        if not directory.exists():
            raise FileNotFoundError(f"Directory not found: {directory}")

        # glob() yields entries in arbitrary order and "*.pdf" would miss
        # "*.PDF" on case-sensitive filesystems, so filter and sort here.
        pdf_files = sorted(
            entry
            for entry in directory.glob("*")
            if entry.is_file() and entry.name.lower().endswith(".pdf")
        )
        if not pdf_files:
            logger.warning("No PDF files found in %s", directory)
            return []

        all_chunks: List[Document] = []
        loaded_count = 0
        for pdf_path in pdf_files:
            try:
                chunks = self.load_pdf(pdf_path)
            except Exception as e:
                # Best effort: one unreadable PDF must not abort the batch.
                logger.error("Failed to load %s: %s", pdf_path, e)
                continue
            all_chunks.extend(chunks)
            loaded_count += 1

        # Report how many files actually loaded, not merely how many were found.
        logger.info(
            "Loaded %d total chunks from %d of %d PDFs",
            len(all_chunks),
            loaded_count,
            len(pdf_files),
        )
        return all_chunks

    def load_csv_insights(
        self, csv_path: Path, sample_size: int = 1050000
    ) -> "List[Document]":
        """Load insights from CSV file and convert to documents.

        Args:
            csv_path: Path to CSV file, passed through to
                ``CSVDocumentGenerator``.
            sample_size: Number of rows to sample from CSV, passed through to
                ``CSVDocumentGenerator``.

        Returns:
            List of documents generated from CSV insights.

        Raises:
            Exception: Any error raised while importing or running the
                generator is logged and re-raised unchanged.
        """
        try:
            # Imported inside the try so an import failure is logged like any
            # other error from this method.
            from src.rag.csv_document_generator import CSVDocumentGenerator

            logger.info("Loading CSV insights from %s", csv_path)
            generator = CSVDocumentGenerator(csv_path, sample_size=sample_size)
            documents = generator.generate_all_documents()
            logger.info("Generated %d documents from CSV insights", len(documents))
            return documents
        except Exception as e:
            logger.error("Error loading CSV insights: %s", e)
            raise