hrbot / src /document_processor /converter.py
Sonu Prasad
integrated dockling
3cf9b4f
"""PDF text extractor using PyMuPDF (lightweight alternative to Docling)."""
from datetime import datetime
from pathlib import Path
from typing import Optional
# PyMuPDF is an optional dependency: record whether it could be imported so
# callers can report a clean error instead of failing at import time.
try:
    import fitz  # PyMuPDF
except ImportError:
    PYMUPDF_AVAILABLE = False
else:
    # Only reached when the import succeeded; keeps the guarded section
    # limited to the single statement that can raise ImportError.
    PYMUPDF_AVAILABLE = True
class PDFExtractor:
    """Extracts text from PDF documents using PyMuPDF.

    Every successful extraction is also saved as ``<pdf stem>.txt`` inside
    ``output_dir``.
    """

    def __init__(self, output_dir: Optional[Path] = None):
        """Initialize the extractor and make sure the output directory exists.

        Args:
            output_dir: Directory to store extracted text files. A plain
                string path is accepted as well. Defaults to
                ``data/extracted`` when omitted.
        """
        # Wrapping in Path() lets callers pass a string; a falsy value
        # (None or "") still selects the default directory.
        self.output_dir = Path(output_dir or "data/extracted")
        self.output_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _failure(message: str) -> dict:
        """Build the result dict returned for a failed extraction."""
        return {
            'success': False,
            'text': '',
            'page_count': 0,
            'error': message,
        }

    def extract_text(self, pdf_path: Path) -> dict:
        """Extract text from a PDF file and save it as a ``.txt`` file.

        Pages whose text is empty or whitespace-only are left out of the
        combined text; every other page is prefixed with a
        ``--- Page N ---`` marker (1-based).

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            Dict with 'success', 'text', 'page_count', and 'error' keys.
            Exceptions raised while opening, reading, or saving are caught
            and reported through 'error' with 'success' set to False.
        """
        pdf_path = Path(pdf_path).resolve()

        if not PYMUPDF_AVAILABLE:
            return self._failure('PyMuPDF not installed')
        if not pdf_path.exists():
            return self._failure(f'File not found: {pdf_path}')

        try:
            # The context manager closes the document even when reading a
            # page raises, so the file handle is never leaked.
            with fitz.open(pdf_path) as doc:
                page_count = len(doc)
                text_parts = []
                for page_num, page in enumerate(doc, start=1):
                    page_text = page.get_text()
                    if page_text.strip():
                        text_parts.append(
                            f"--- Page {page_num} ---\n{page_text}"
                        )

            full_text = "\n\n".join(text_parts)

            # Save extracted text
            txt_path = self.output_dir / f"{pdf_path.stem}.txt"
            txt_path.write_text(full_text, encoding='utf-8')
        except Exception as e:
            # Deliberate catch-all: callers receive an error result rather
            # than an exception for any extraction or write failure.
            return self._failure(str(e))

        return {
            'success': True,
            'text': full_text,
            'page_count': page_count,
            'error': None,
        }

    def extract_batch(self, pdf_paths: list) -> list:
        """Extract text from multiple PDFs.

        Args:
            pdf_paths: List of PDF file paths.

        Returns:
            List of extraction results, one per input path, in input order.
        """
        return [self.extract_text(pdf_path) for pdf_path in pdf_paths]