"""Document processing service for PDF, CSV, Markdown, and plain-text files."""

import asyncio
import os
from typing import Callable, List, Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    PyPDFLoader,
    CSVLoader,
    TextLoader,
)
from langchain.schema import Document

from app.config import settings


class DocumentProcessor:
    """Handles loading and chunking of various document types.

    Files are dispatched to a loader by their (lower-cased) extension, the
    loaded documents are tagged with metadata, and the result is split into
    chunks with a ``RecursiveCharacterTextSplitter`` configured from
    ``settings.CHUNK_SIZE`` and ``settings.CHUNK_OVERLAP``.
    """

    def __init__(self):
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=settings.CHUNK_SIZE,
            chunk_overlap=settings.CHUNK_OVERLAP,
            length_function=len,
            separators=["\n\n", "\n", ". ", " ", ""],
        )
        # Extension (with leading dot, lower case) -> bound loader method.
        self.supported_extensions = {
            ".pdf": self._load_pdf,
            ".csv": self._load_csv,
            ".md": self._load_markdown,
            ".txt": self._load_text,
        }

    def get_supported_extensions(self) -> List[str]:
        """Return the supported file extensions, each with its leading dot."""
        return list(self.supported_extensions)

    async def process_file(
        self, file_path: str, metadata: Optional[dict] = None
    ) -> List[Document]:
        """Process a file and return chunked documents with metadata.

        The extension check happens in the coroutine itself; the blocking
        work (file loading and text splitting) is run in the event loop's
        default executor so that other coroutines are not stalled while a
        file is being processed.

        Args:
            file_path: Path of the file to load. Its extension selects the
                loader and is matched case-insensitively.
            metadata: Optional extra metadata merged into every loaded
                document. It is applied after ``source`` and ``file_type``
                are set, so matching keys in it take precedence.

        Returns:
            The chunks produced by the text splitter. Each chunk's metadata
            carries ``chunk_id``, ``chunk_index`` and ``total_chunks`` in
            addition to the document-level metadata.

        Raises:
            ValueError: If the file extension is not supported.
        """
        ext = os.path.splitext(file_path)[1].lower()
        if ext not in self.supported_extensions:
            raise ValueError(
                f"Unsupported file type: {ext}. "
                f"Supported: {', '.join(self.supported_extensions)}"
            )

        loader_fn = self.supported_extensions[ext]

        # The loaders and the splitter are synchronous; calling them directly
        # from this coroutine would block the event loop for the duration of
        # the file read and the chunking.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, self._load_and_chunk, loader_fn, file_path, ext, metadata
        )

    def _load_and_chunk(
        self,
        loader_fn: Callable[[str], List[Document]],
        file_path: str,
        ext: str,
        metadata: Optional[dict],
    ) -> List[Document]:
        """Load *file_path* with *loader_fn*, tag metadata, and split (blocking)."""
        documents = loader_fn(file_path)

        # Add custom metadata. Caller-supplied metadata is merged last and may
        # therefore override "source" / "file_type".
        filename = os.path.basename(file_path)
        for doc in documents:
            doc.metadata["source"] = filename
            doc.metadata["file_type"] = ext
            if metadata:
                doc.metadata.update(metadata)

        # Split into chunks
        chunks = self.text_splitter.split_documents(documents)

        # Add chunk IDs; the index is global across the whole file.
        total_chunks = len(chunks)
        for i, chunk in enumerate(chunks):
            chunk.metadata["chunk_id"] = f"{filename}_chunk_{i}"
            chunk.metadata["chunk_index"] = i
            chunk.metadata["total_chunks"] = total_chunks

        return chunks

    def _load_pdf(self, file_path: str) -> List[Document]:
        """Load PDF file."""
        loader = PyPDFLoader(file_path)
        return loader.load()

    def _load_csv(self, file_path: str) -> List[Document]:
        """Load CSV file, decoded as UTF-8."""
        loader = CSVLoader(file_path, encoding="utf-8")
        return loader.load()

    def _load_markdown(self, file_path: str) -> List[Document]:
        """Load Markdown file as UTF-8 text."""
        loader = TextLoader(file_path, encoding="utf-8")
        return loader.load()

    def _load_text(self, file_path: str) -> List[Document]:
        """Load plain text file, decoded as UTF-8."""
        loader = TextLoader(file_path, encoding="utf-8")
        return loader.load()


# Shared module-level instance.
document_processor = DocumentProcessor()