# BabelSlide_2.0/processors/pdf_processor.py
# Uploaded by Marek4321 ("Upload pdf_processor.py", commit ce00c7a, verified)
from typing import List, Tuple, Generator
from pathlib import Path
import fitz # PyMuPDF
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from reportlab.pdfbase import pdfutils
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfbase import pdfmetrics
from core.base_processor import DocumentProcessor
from core.exceptions import ProcessorError
class PDFProcessor(DocumentProcessor):
    """PDF document processor.

    Extracts text blocks from a PDF with PyMuPDF and writes translated text
    into a newly generated, simplified PDF with ReportLab. The original
    layout, fonts and images are not preserved.
    """

    def __init__(self, translator):
        super().__init__(translator)
        # Use Helvetica as default - it's always available
        self.font_name = 'Helvetica'

    def extract_text_elements(self, file_path: Path) -> Generator[Tuple[str, dict], None, None]:
        """Yield ``(text, metadata)`` for every non-empty text block in the PDF.

        The text of a block is the concatenation of its spans, with a newline
        after each line, stripped of surrounding whitespace. ``metadata``
        holds ``page_number`` (0-based), ``block_index``, ``bbox`` and
        ``original_text`` (identical to the yielded text).

        Raises:
            ProcessorError: if opening or reading the document fails.
        """
        try:
            # The context manager closes the document on errors and when the
            # consumer abandons the generator before exhausting it.
            with fitz.open(file_path) as pdf_document:
                for page_num, page in enumerate(pdf_document):
                    text_blocks = page.get_text("dict")
                    for block_idx, block in enumerate(text_blocks["blocks"]):
                        if "lines" not in block:
                            # Blocks without "lines" carry no text spans.
                            continue
                        block_text = "".join(
                            "".join(span["text"] for span in line["spans"]) + "\n"
                            for line in block["lines"]
                        ).strip()
                        if not block_text:
                            continue
                        metadata = {
                            'page_number': page_num,
                            'block_index': block_idx,
                            'bbox': block["bbox"],  # Bounding box for positioning
                            'original_text': block_text,
                        }
                        yield block_text, metadata
        except Exception as e:
            raise ProcessorError(f"Failed to extract text from PDF: {str(e)}") from e

    def apply_translations(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
        """Write the translations into a new, simplified PDF.

        One output page is produced per page of the original document and is
        given the same dimensions as that page. Each page starts with a
        "Page N" label, followed by the translated blocks top to bottom in
        the order they were supplied. The stored ``bbox`` is carried along
        but not used for positioning.

        Limitations: lines are not wrapped, and text that would fall below
        the bottom margin of its page is not drawn.

        Args:
            file_path: Path of the original PDF.
            translations: ``(translated_text, metadata)`` pairs; ``metadata``
                must contain ``page_number`` and ``bbox``.

        Returns:
            Path of the generated PDF, as produced by
            ``self.generate_output_path(file_path, "translated")``.

        Raises:
            ProcessorError: if anything fails while building the PDF.
        """
        try:
            # Create output path
            output_path = self.generate_output_path(file_path, "translated")

            # Group translations by page
            page_translations = {}
            for translated_text, metadata in translations:
                page_translations.setdefault(metadata['page_number'], []).append({
                    'text': translated_text,
                    'bbox': metadata['bbox'],
                })

            # Create new PDF with translations
            pdf_canvas = canvas.Canvas(str(output_path), pagesize=letter)

            with fitz.open(file_path) as original_pdf:
                for page_num, page in enumerate(original_pdf):
                    # Create a new page for each page in original
                    if page_num > 0:
                        pdf_canvas.showPage()

                    # Match the output page to the original page size; all
                    # y coordinates below are derived from the original
                    # height, so a fixed letter page would push text off the
                    # sheet for taller originals (e.g. A4).
                    page_rect = page.rect
                    pdf_canvas.setPageSize((page_rect.width, page_rect.height))

                    # Font state is reset by showPage(), so set it per page.
                    pdf_canvas.setFont(self.font_name, 12)

                    # Add page number at top
                    pdf_canvas.drawString(50, page_rect.height - 30, f"Page {page_num + 1}")

                    if page_num not in page_translations:
                        # Empty page - just show page number and a note
                        pdf_canvas.drawString(
                            50, page_rect.height - 80,
                            "(No translatable content on this page)"
                        )
                        continue

                    y_position = page_rect.height - 60  # Start below page number
                    for translation_block in page_translations[page_num]:
                        for line in translation_block['text'].split('\n'):
                            stripped = line.strip()
                            if stripped and y_position > 50:
                                try:
                                    pdf_canvas.drawString(50, y_position, stripped)
                                except UnicodeEncodeError:
                                    # Fallback for problematic characters:
                                    # drop everything outside ASCII.
                                    safe_text = stripped.encode('ascii', 'ignore').decode('ascii')
                                    pdf_canvas.drawString(50, y_position, safe_text)
                                y_position -= 15  # Line spacing
                        y_position -= 10  # Block spacing

            pdf_canvas.save()
            return output_path
        except Exception as e:
            raise ProcessorError(f"Failed to apply translations to PDF: {str(e)}") from e

    @staticmethod
    def _wrap_line(line: str, max_chars: int = 80) -> List[str]:
        """Greedily split ``line`` on whitespace into chunks of about ``max_chars``.

        Lines of at most ``max_chars`` characters are returned stripped and
        unsplit. A single word longer than the limit is kept whole.
        """
        if len(line) <= max_chars:
            return [line.strip()]
        wrapped = []
        current_line = ""
        for word in line.split():
            if len(current_line + word) < max_chars:
                current_line += word + " "
            else:
                if current_line.strip():
                    wrapped.append(current_line.strip())
                current_line = word + " "
        if current_line.strip():
            wrapped.append(current_line.strip())
        return wrapped

    def create_text_only_pdf(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
        """Create a simplified text-only PDF with translations.

        This is a fallback method for complex PDFs. Only pages that have at
        least one translation are emitted, in ascending page order, on
        letter-sized pages. Each page has a bold "Page N" title followed by
        the translated blocks; lines longer than 80 characters are wrapped
        on word boundaries. Text that would fall below the bottom margin of
        its page is not drawn.

        Args:
            file_path: Path of the original PDF (used only to derive the
                output path).
            translations: ``(translated_text, metadata)`` pairs; ``metadata``
                must contain ``page_number``.

        Returns:
            Path of the generated PDF, as produced by
            ``self.generate_output_path(file_path, "translated_text_only")``.

        Raises:
            ProcessorError: if anything fails while building the PDF.
        """
        try:
            output_path = self.generate_output_path(file_path, "translated_text_only")

            # Group by pages
            page_translations = {}
            for translated_text, metadata in translations:
                page_translations.setdefault(metadata['page_number'], []).append(translated_text)

            pdf_canvas = canvas.Canvas(str(output_path), pagesize=letter)

            for index, page_num in enumerate(sorted(page_translations)):
                # Break on the iteration index, not the page number: if the
                # first page with text is not page 0, testing page_num would
                # emit a blank leading page.
                if index > 0:
                    pdf_canvas.showPage()

                # Add page title
                pdf_canvas.setFont('Helvetica-Bold', 14)
                pdf_canvas.drawString(50, 750, f"Page {page_num + 1}")

                y_position = 720
                pdf_canvas.setFont('Helvetica', 11)

                for text_block in page_translations[page_num]:
                    for line in text_block.split('\n'):
                        if not line.strip():
                            continue
                        for chunk in self._wrap_line(line):
                            # Check the margin per drawn line so wrapped
                            # continuation lines never run off the page.
                            if y_position <= 50:
                                break
                            pdf_canvas.drawString(50, y_position, chunk)
                            y_position -= 12
                    y_position -= 8  # Block spacing

            pdf_canvas.save()
            return output_path
        except Exception as e:
            raise ProcessorError(f"Failed to create text-only PDF: {str(e)}") from e

    @property
    def supported_extensions(self) -> List[str]:
        """File extensions handled by this processor."""
        return ['.pdf']