from pathlib import Path
from typing import List

from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from langchain_core.documents import Document

from app.utils.errors import DocumentProcessingError
from app.utils.logger import logger


class DocumentLoader:
    """Load a file from disk into a list of ``Document`` objects.

    The loader implementation is chosen from the file's extension
    (compared case-insensitively); see ``SUPPORTED_FORMATS``.
    """

    # File extensions (lower-case, including the leading dot) that load() accepts.
    SUPPORTED_FORMATS = {".pdf", ".docx", ".txt"}

    @staticmethod
    def load(file_path: str) -> List[Document]:
        """Load ``file_path`` with the loader matching its extension.

        Args:
            file_path: Path of the file to load.

        Returns:
            The documents returned by the selected loader's ``load()`` call.

        Raises:
            DocumentProcessingError: If the path does not exist, its extension
                is not in ``SUPPORTED_FORMATS``, or the selected loader raises
                while being constructed or while loading. In the last case the
                underlying exception is attached as ``__cause__``.
        """
        path = Path(file_path)
        if not path.exists():
            raise DocumentProcessingError(f"File not found: {file_path}")

        extension = path.suffix.lower()
        if extension not in DocumentLoader.SUPPORTED_FORMATS:
            # Sort before formatting: iteration order of a set of strings is
            # not stable across interpreter runs, which made the message vary.
            supported = ", ".join(sorted(DocumentLoader.SUPPORTED_FORMATS))
            raise DocumentProcessingError(
                f"Unsupported format: {extension}. Supported: {supported}"
            )

        try:
            # Loader construction stays inside the try block so that failures
            # raised by a loader's constructor are wrapped as well.
            if extension == ".pdf":
                loader = PyPDFLoader(file_path)
            elif extension == ".docx":
                loader = Docx2txtLoader(file_path)
            else:
                # Only ".txt" can reach this branch after the check above.
                loader = TextLoader(file_path, encoding="utf-8")

            documents = loader.load()
            logger.info(f"Loaded {len(documents)} pages from {path.name}")
            return documents
        except Exception as e:
            logger.error(f"Error loading {file_path}: {str(e)}")
            # Chain explicitly so the original failure is kept as the cause.
            raise DocumentProcessingError(f"Failed to load document: {str(e)}") from e


# Module-level instance for importers; load() is a static method, so it can be
# called either on this instance or directly on the class.
document_loader = DocumentLoader()