Spaces:
Sleeping
Sleeping
| """PDF text extractor using PyMuPDF (lightweight alternative to Docling).""" | |
| from datetime import datetime | |
| from pathlib import Path | |
| from typing import Optional | |
| try: | |
| import fitz # PyMuPDF | |
| PYMUPDF_AVAILABLE = True | |
| except ImportError: | |
| PYMUPDF_AVAILABLE = False | |
class PDFExtractor:
    """Extract plain text from PDF documents using PyMuPDF.

    Every successful extraction is also written to
    ``<output_dir>/<pdf stem>.txt``; PDFs that share a file stem therefore
    overwrite each other's text file.
    """

    def __init__(self, output_dir: Optional[Path] = None):
        """Initialize the extractor and create the output directory.

        Args:
            output_dir: Directory to store extracted text files. A string
                path is accepted as well. Falls back to ``data/extracted``
                when omitted.
        """
        # Coerce through Path so a plain string does not fail on ``.mkdir``.
        self.output_dir = Path(output_dir) if output_dir else Path("data/extracted")
        self.output_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _failure(error: str) -> dict:
        """Build the result dict returned for any failed extraction."""
        return {
            'success': False,
            'text': '',
            'page_count': 0,
            'error': error
        }

    def extract_text(self, pdf_path: Path) -> dict:
        """Extract text from a PDF file and save it as a ``.txt`` file.

        Pages whose text is empty or whitespace-only are left out of the
        combined text, but they still count towards ``page_count``. Each
        included page is preceded by a ``--- Page N ---`` header (1-based).

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            Dict with 'success', 'text', 'page_count', and 'error' keys.
            On failure 'success' is False, 'text' is empty, 'page_count' is
            0 and 'error' holds the message; on success 'error' is None.
        """
        pdf_path = Path(pdf_path).resolve()

        if not PYMUPDF_AVAILABLE:
            return self._failure('PyMuPDF not installed')

        if not pdf_path.exists():
            return self._failure(f'File not found: {pdf_path}')

        try:
            # The context manager closes the document even when reading a
            # page raises, so the file handle is never leaked.
            with fitz.open(pdf_path) as doc:
                text_parts = []
                for page_number, page in enumerate(doc, start=1):
                    page_text = page.get_text()
                    if page_text.strip():
                        text_parts.append(f"--- Page {page_number} ---\n{page_text}")
                page_count = len(doc)

            full_text = "\n\n".join(text_parts)

            # Save extracted text
            txt_path = self.output_dir / f"{pdf_path.stem}.txt"
            txt_path.write_text(full_text, encoding='utf-8')
        except Exception as e:
            # Deliberate catch-all: callers receive the failure through the
            # result dict instead of an exception.
            return self._failure(str(e))

        return {
            'success': True,
            'text': full_text,
            'page_count': page_count,
            'error': None
        }

    def extract_batch(self, pdf_paths: list) -> list:
        """Extract text from multiple PDFs.

        Args:
            pdf_paths: List of PDF file paths.

        Returns:
            List of extraction results, one per input path and in the same
            order. A failing file yields a failure dict and does not stop
            the remaining extractions.
        """
        return [self.extract_text(pdf_path) for pdf_path in pdf_paths]