Spaces:
Sleeping
Sleeping
"""Document loader for PDF files."""

import logging
from pathlib import Path
from typing import List

from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

from src.config.config import settings

logger = logging.getLogger(__name__)
| class DocumentLoader: | |
| """Loader for PDF documents.""" | |
| def __init__( | |
| self, | |
| chunk_size: int = 1000, | |
| chunk_overlap: int = 200, | |
| ) -> None: | |
| """Initialize document loader. | |
| Args: | |
| chunk_size: Size of text chunks. | |
| chunk_overlap: Overlap between chunks. | |
| """ | |
| self.chunk_size = chunk_size | |
| self.chunk_overlap = chunk_overlap | |
| self.text_splitter = RecursiveCharacterTextSplitter( | |
| chunk_size=chunk_size, | |
| chunk_overlap=chunk_overlap, | |
| length_function=len, | |
| ) | |
| def load_pdf(self, pdf_path: Path) -> List[Document]: | |
| """Load a PDF file and split it into chunks. | |
| Args: | |
| pdf_path: Path to the PDF file. | |
| Returns: | |
| List of document chunks. | |
| """ | |
| if not pdf_path.exists(): | |
| raise FileNotFoundError(f"PDF file not found: {pdf_path}") | |
| try: | |
| logger.info(f"Loading PDF: {pdf_path}") | |
| loader = PyPDFLoader(str(pdf_path)) | |
| documents = loader.load() | |
| # Split documents into chunks | |
| chunks = self.text_splitter.split_documents(documents) | |
| logger.info(f"Loaded {len(chunks)} chunks from {pdf_path}") | |
| return chunks | |
| except Exception as e: | |
| logger.error(f"Error loading PDF {pdf_path}: {str(e)}") | |
| raise | |
| def load_pdfs_from_directory(self, directory: Path) -> List[Document]: | |
| """Load all PDF files from a directory. | |
| Args: | |
| directory: Directory containing PDF files. | |
| Returns: | |
| List of document chunks from all PDFs. | |
| """ | |
| if not directory.exists(): | |
| raise FileNotFoundError(f"Directory not found: {directory}") | |
| pdf_files = list(directory.glob("*.pdf")) | |
| if not pdf_files: | |
| logger.warning(f"No PDF files found in {directory}") | |
| return [] | |
| all_chunks: List[Document] = [] | |
| for pdf_path in pdf_files: | |
| try: | |
| chunks = self.load_pdf(pdf_path) | |
| all_chunks.extend(chunks) | |
| except Exception as e: | |
| logger.error(f"Failed to load {pdf_path}: {str(e)}") | |
| continue | |
| logger.info(f"Loaded {len(all_chunks)} total chunks from {len(pdf_files)} PDFs") | |
| return all_chunks | |
| def load_csv_insights(self, csv_path: Path, sample_size: int = 1050000) -> List[Document]: | |
| """Load insights from CSV file and convert to documents. | |
| Args: | |
| csv_path: Path to CSV file. | |
| sample_size: Number of rows to sample from CSV. | |
| Returns: | |
| List of documents generated from CSV insights. | |
| """ | |
| try: | |
| from src.rag.csv_document_generator import CSVDocumentGenerator | |
| logger.info(f"Loading CSV insights from {csv_path}") | |
| generator = CSVDocumentGenerator(csv_path, sample_size=sample_size) | |
| documents = generator.generate_all_documents() | |
| logger.info(f"Generated {len(documents)} documents from CSV insights") | |
| return documents | |
| except Exception as e: | |
| logger.error(f"Error loading CSV insights: {str(e)}") | |
| raise | |