"""Document parsing utilities for various file formats.""" from pathlib import Path from src.utils.common import normalize_text from src.utils.logging import print_log def load_pdf(file_path: Path) -> str: """Load text from PDF file.""" try: import pypdf except ImportError: raise ImportError("pypdf is required for PDF files. Install with: pip install pypdf") reader = pypdf.PdfReader(str(file_path)) text = "" for page in reader.pages: text += page.extract_text() + "\n" return text.strip() def load_docx(file_path: Path) -> str: """Load text from DOCX file.""" try: import docx except ImportError: raise ImportError("python-docx is required for DOCX files. Install with: pip install python-docx") doc = docx.Document(str(file_path)) return "\n".join([para.text for para in doc.paragraphs]) def load_txt(file_path: Path) -> str: """Load text from TXT file.""" with open(file_path, encoding="utf-8") as f: return f.read() def load_document(file_path: Path) -> tuple[str | None, dict | None]: """Load document (PDF, DOCX, TXT), normalize text, and return (text, metadata). Returns (None, None) for unsupported or failed files. """ ext = file_path.suffix.lower() try: if ext == ".pdf": text = load_pdf(file_path) elif ext == ".docx": text = load_docx(file_path) elif ext == ".txt": text = load_txt(file_path) else: return None, None text = normalize_text(text) if not text: return None, None metadata = { "source_file": str(file_path), "file_name": file_path.name, "file_type": ext[1:], } return text, metadata except Exception as e: print_log(f" [Error] Failed to load {file_path.name}: {e}") return None, None