"""PDF text extractor using PyMuPDF (lightweight alternative to Docling)."""
from datetime import datetime
from pathlib import Path
from typing import Optional
# PyMuPDF is an optional dependency: remember whether the import worked so
# the rest of the module can check the flag before touching ``fitz``.
try:
    import fitz  # PyMuPDF
except ImportError:
    PYMUPDF_AVAILABLE = False
else:
    PYMUPDF_AVAILABLE = True
class PDFExtractor:
    """Extracts text from PDF documents using PyMuPDF.

    Each successful extraction is also written to
    ``<output_dir>/<pdf stem>.txt`` encoded as UTF-8.
    """

    def __init__(self, output_dir: Optional[Path] = None):
        """Initialize the extractor.

        Args:
            output_dir: Directory to store extracted text files. A plain
                string is accepted as well as a ``Path``. Defaults to
                ``data/extracted``. The directory (and any missing parents)
                is created if it does not exist yet.
        """
        # Coerce to Path so string arguments work, mirroring how
        # extract_text() treats ``pdf_path``.
        self.output_dir = Path(output_dir) if output_dir else Path("data/extracted")
        self.output_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _failure(message: str) -> dict:
        """Build the result dict returned for a failed extraction."""
        return {
            'success': False,
            'text': '',
            'page_count': 0,
            'error': message
        }

    def extract_text(self, pdf_path: Path) -> dict:
        """Extract text from a PDF file.

        Pages whose text is empty or whitespace-only are left out of the
        output; every other page is prefixed with a ``--- Page N ---``
        header (1-based) and pages are separated by a blank line. The
        combined text is saved to ``<output_dir>/<pdf stem>.txt``.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            Dict with 'success', 'text', 'page_count', and 'error' keys.
            On failure 'success' is False, 'text' is empty, 'page_count'
            is 0 and 'error' holds a message; on success 'error' is None
            and 'page_count' is the total number of pages in the document
            (including pages that produced no text).
        """
        pdf_path = Path(pdf_path).resolve()

        if not PYMUPDF_AVAILABLE:
            return self._failure('PyMuPDF not installed')

        if not pdf_path.exists():
            return self._failure(f'File not found: {pdf_path}')

        try:
            # The context manager closes the document even if reading a
            # page raises, so the file handle is never leaked.
            with fitz.open(pdf_path) as doc:
                page_count = len(doc)
                text_parts = []
                for page_num, page in enumerate(doc, start=1):
                    page_text = page.get_text()
                    if page_text.strip():
                        text_parts.append(f"--- Page {page_num} ---\n{page_text}")

            full_text = "\n\n".join(text_parts)

            # Save extracted text
            txt_path = self.output_dir / f"{pdf_path.stem}.txt"
            txt_path.write_text(full_text, encoding='utf-8')
        except Exception as e:
            # Deliberate catch-all: any parsing or write error is reported
            # through the result dict instead of propagating to the caller.
            return self._failure(str(e))

        return {
            'success': True,
            'text': full_text,
            'page_count': page_count,
            'error': None
        }

    def extract_batch(self, pdf_paths: list) -> list:
        """Extract text from multiple PDFs.

        Args:
            pdf_paths: List of PDF file paths.

        Returns:
            List of extraction results (see ``extract_text``), one per
            input path and in the same order.
        """
        return [self.extract_text(pdf_path) for pdf_path in pdf_paths]
|