# core/document_processor.py
from pathlib import Path
from typing import Dict, List

import PyPDF2
import docx


class DocumentProcessor:
    """Handles document ingestion and preprocessing.

    All functionality is exposed through static methods; the class is used
    as a namespace and holds no instance state.
    """

    # Lower-case file extensions (including the leading dot) that
    # process_document accepts. Matching is case-insensitive.
    supported_formats = ['.txt', '.pdf', '.docx']

    @staticmethod
    def process_document(file_path: str) -> Dict:
        """Read a document from disk and return its text and file metadata.

        Args:
            file_path: Path to a ``.txt``, ``.pdf`` or ``.docx`` file.

        Returns:
            A dict with two keys: ``"content"`` (the extracted text) and
            ``"metadata"`` (the dict built by ``_extract_metadata``).

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the file extension is not in
                ``supported_formats`` (compared case-insensitively).
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
        # Compare on the lower-cased suffix so "REPORT.PDF" is accepted just
        # like "report.pdf"; the error message keeps the suffix as given.
        if path.suffix.lower() not in DocumentProcessor.supported_formats:
            raise ValueError(f"Unsupported file format: {path.suffix}")

        content = DocumentProcessor._extract_content(path)
        metadata = DocumentProcessor._extract_metadata(path)
        return {
            "content": content,
            "metadata": metadata,
        }

    @staticmethod
    def _extract_content(path: Path) -> str:
        """Extract the text of ``path`` according to its extension.

        ``.txt`` files are decoded as UTF-8. For ``.pdf`` the text of every
        page, and for ``.docx`` the text of every paragraph, is joined with
        single spaces.

        Raises:
            ValueError: If the extension is not one of the supported formats
                (instead of silently returning ``None``).
        """
        suffix = path.suffix.lower()
        if suffix == '.txt':
            return path.read_text(encoding='utf-8')
        if suffix == '.pdf':
            with open(path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # The join is evaluated while the file is still open.
                # `or ''` keeps it from failing with TypeError should a page
                # yield no text (None) rather than a string.
                return ' '.join(
                    page.extract_text() or '' for page in reader.pages
                )
        if suffix == '.docx':
            # python-docx documents a str path or a file-like object as the
            # argument, so hand it a plain string.
            doc = docx.Document(str(path))
            return ' '.join(paragraph.text for paragraph in doc.paragraphs)
        raise ValueError(f"Unsupported file format: {path.suffix}")

    @staticmethod
    def _extract_metadata(path: Path) -> Dict:
        """Return basic filesystem metadata for ``path``.

        Returns:
            A dict with ``"filename"`` (final path component),
            ``"file_size"`` (``st_size``), ``"file_type"`` (the suffix as
            written on disk) and ``"last_modified"`` (``st_mtime``).
        """
        # Stat once so size and mtime come from the same snapshot.
        stat_result = path.stat()
        return {
            "filename": path.name,
            "file_size": stat_result.st_size,
            "file_type": path.suffix,
            "last_modified": stat_result.st_mtime,
        }