"""PDF text extractor using PyMuPDF (lightweight alternative to Docling).""" from datetime import datetime from pathlib import Path from typing import Optional try: import fitz # PyMuPDF PYMUPDF_AVAILABLE = True except ImportError: PYMUPDF_AVAILABLE = False class PDFExtractor: """Extracts text from PDF documents using PyMuPDF.""" def __init__(self, output_dir: Optional[Path] = None): """Initialize the extractor. Args: output_dir: Directory to store extracted text files. """ self.output_dir = output_dir or Path("data/extracted") self.output_dir.mkdir(parents=True, exist_ok=True) def extract_text(self, pdf_path: Path) -> dict: """Extract text from a PDF file. Args: pdf_path: Path to the PDF file. Returns: Dict with 'success', 'text', 'page_count', and 'error' keys. """ pdf_path = Path(pdf_path).resolve() if not PYMUPDF_AVAILABLE: return { 'success': False, 'text': '', 'page_count': 0, 'error': 'PyMuPDF not installed' } if not pdf_path.exists(): return { 'success': False, 'text': '', 'page_count': 0, 'error': f'File not found: {pdf_path}' } try: doc = fitz.open(pdf_path) text_parts = [] for page_num, page in enumerate(doc): page_text = page.get_text() if page_text.strip(): text_parts.append(f"--- Page {page_num + 1} ---\n{page_text}") full_text = "\n\n".join(text_parts) page_count = len(doc) doc.close() # Save extracted text txt_path = self.output_dir / f"{pdf_path.stem}.txt" txt_path.write_text(full_text, encoding='utf-8') return { 'success': True, 'text': full_text, 'page_count': page_count, 'error': None } except Exception as e: return { 'success': False, 'text': '', 'page_count': 0, 'error': str(e) } def extract_batch(self, pdf_paths: list) -> list: """Extract text from multiple PDFs. Args: pdf_paths: List of PDF file paths. Returns: List of extraction results. """ return [self.extract_text(pdf_path) for pdf_path in pdf_paths]