rag-chatbot / ingestion /loaders.py
Abeshith's picture
RAG Chatbot with LangChain, FastAPI, and service layer architecture
64d7fdf
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from langchain_core.documents import Document
from app.utils.logger import logger
from app.utils.errors import DocumentProcessingError
from pathlib import Path
from typing import List
class DocumentLoader:
    """Load a file from disk into LangChain ``Document`` objects.

    The loader implementation is chosen from the file extension
    (case-insensitive): ``.pdf`` -> ``PyPDFLoader``, ``.docx`` ->
    ``Docx2txtLoader``, ``.txt`` -> ``TextLoader`` (decoded as UTF-8).
    """

    # File extensions (lower-case, with leading dot) that ``load`` accepts.
    SUPPORTED_FORMATS = {".pdf", ".docx", ".txt"}

    @staticmethod
    def load(file_path: str) -> List[Document]:
        """Load ``file_path`` with the loader that matches its extension.

        Args:
            file_path: Path of the file to load.

        Returns:
            The list of documents produced by the selected loader's
            ``load()`` call.

        Raises:
            DocumentProcessingError: If the path does not exist, is not a
                regular file, has an unsupported extension, or the
                underlying loader fails. In the last case the original
                exception is attached as ``__cause__``.
        """
        path = Path(file_path)
        if not path.exists():
            raise DocumentProcessingError(f"File not found: {file_path}")
        # A directory named e.g. "reports.pdf" passes exists(); reject it here
        # with a clear message instead of an opaque loader failure later.
        if not path.is_file():
            raise DocumentProcessingError(f"Not a regular file: {file_path}")

        extension = path.suffix.lower()
        if extension not in DocumentLoader.SUPPORTED_FORMATS:
            # Sort so the message is stable across runs (set order is not).
            supported = ", ".join(sorted(DocumentLoader.SUPPORTED_FORMATS))
            raise DocumentProcessingError(
                f"Unsupported format: {extension}. Supported: {supported}"
            )

        # Loader construction stays inside the try: some loaders validate the
        # file in their constructor and may raise before load() is called.
        try:
            if extension == ".pdf":
                loader = PyPDFLoader(file_path)
            elif extension == ".docx":
                loader = Docx2txtLoader(file_path)
            else:
                # Only ".txt" remains after the SUPPORTED_FORMATS check.
                loader = TextLoader(file_path, encoding="utf-8")
            documents = loader.load()
        except Exception as e:
            logger.error(f"Error loading {file_path}: {str(e)}")
            # Chain the cause so the original traceback is preserved.
            raise DocumentProcessingError(
                f"Failed to load document: {str(e)}"
            ) from e

        logger.info(f"Loaded {len(documents)} pages from {path.name}")
        return documents
# Module-level DocumentLoader instance created at import time. `load` is a
# staticmethod, so this instance carries no state of its own.
document_loader = DocumentLoader()