from pathlib import Path
from typing import Dict, Generator, List, Tuple

import fitz  # PyMuPDF
from reportlab.lib.pagesizes import letter
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase import pdfutils
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfgen import canvas

from core.base_processor import DocumentProcessor
from core.exceptions import ProcessorError


class PDFProcessor(DocumentProcessor):
    """PDF document processor.

    Extracts text blocks from a PDF with PyMuPDF and writes translated text
    into a newly generated PDF with ReportLab. The original layout is not
    reproduced: translated text is laid out top-to-bottom from the left margin.
    """

    # Layout constants (PDF points) shared by both output methods.
    _LEFT_MARGIN = 50
    _BOTTOM_MARGIN = 50
    # Lines longer than this many characters are word-wrapped in text-only output.
    _WRAP_WIDTH = 80

    def __init__(self, translator):
        """Initialise the processor with the translator passed to the base class."""
        super().__init__(translator)
        # Use Helvetica as default - it's always available
        self.font_name = 'Helvetica'

    def extract_text_elements(
        self, file_path: Path
    ) -> Generator[Tuple[str, dict], None, None]:
        """Extract text from PDF.

        Yields one ``(text, metadata)`` pair per non-empty text block. The
        text is the block's lines joined with newlines and stripped; the
        metadata dict holds ``page_number`` (0-based), ``block_index``,
        ``bbox`` and ``original_text``.

        Raises:
            ProcessorError: if the document cannot be opened or read.
        """
        try:
            pdf_document = fitz.open(file_path)
            try:
                for page_num, page in enumerate(pdf_document):
                    text_blocks = page.get_text("dict")
                    for block_idx, block in enumerate(text_blocks["blocks"]):
                        if "lines" not in block:
                            # Blocks without a "lines" entry carry no text.
                            continue
                        block_text = "\n".join(
                            "".join(span["text"] for span in line["spans"])
                            for line in block["lines"]
                        ).strip()
                        if not block_text:
                            continue
                        metadata = {
                            'page_number': page_num,
                            'block_index': block_idx,
                            'bbox': block["bbox"],  # Bounding box for positioning
                            'original_text': block_text,
                        }
                        yield block_text, metadata
            finally:
                # Runs on errors and when the consumer abandons the generator
                # early, so the document handle is never leaked.
                pdf_document.close()
        except Exception as e:
            raise ProcessorError(f"Failed to extract text from PDF: {str(e)}") from e

    def apply_translations(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
        """Apply translations to PDF by creating a new document.

        Note: PDF translation is complex due to formatting preservation.
        This creates a simplified translated version: one output page per
        source page, sized like the source page, with a ``Page N`` label
        followed by the translated blocks in the order given. Block
        positions (``bbox``) are not used. Lines that would fall at or below
        the bottom margin are omitted so that pages stay one-to-one with the
        source; ``create_text_only_pdf`` overflows onto continuation pages
        instead.

        Args:
            file_path: Path of the source PDF.
            translations: ``(translated_text, metadata)`` pairs; each
                metadata dict must contain ``page_number`` (0-based).

        Returns:
            Path of the generated PDF.

        Raises:
            ProcessorError: if reading the source or writing the output fails.
        """
        try:
            # Create output path
            output_path = self.generate_output_path(file_path, "translated")
            page_translations = self._group_by_page(translations)

            original_pdf = fitz.open(file_path)
            try:
                pdf_canvas = canvas.Canvas(str(output_path), pagesize=letter)
                for page_num, page in enumerate(original_pdf):
                    # Create a new page for each page in original
                    if page_num > 0:
                        pdf_canvas.showPage()

                    page_rect = page.rect
                    # All y-coordinates below are derived from the source page
                    # height, so the output page must have the same size;
                    # otherwise text on pages taller than letter is drawn
                    # outside the visible area.
                    pdf_canvas.setPageSize((page_rect.width, page_rect.height))
                    pdf_canvas.setFont(self.font_name, 12)

                    # Add page number at top
                    pdf_canvas.drawString(
                        self._LEFT_MARGIN, page_rect.height - 30, f"Page {page_num + 1}"
                    )

                    if page_num not in page_translations:
                        # Empty page - just show page number and a note
                        pdf_canvas.drawString(
                            self._LEFT_MARGIN,
                            page_rect.height - 80,
                            "(No translatable content on this page)",
                        )
                        continue

                    y_position = page_rect.height - 60  # Start below page number
                    for text in page_translations[page_num]:
                        for line in text.split('\n'):
                            stripped = line.strip()
                            if stripped and y_position > self._BOTTOM_MARGIN:
                                self._draw_line(pdf_canvas, y_position, stripped)
                                y_position -= 15  # Line spacing
                        y_position -= 10  # Block spacing

                pdf_canvas.save()
            finally:
                original_pdf.close()

            return output_path
        except Exception as e:
            raise ProcessorError(f"Failed to apply translations to PDF: {str(e)}") from e

    def create_text_only_pdf(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
        """Create a simplified text-only PDF with translations.

        This is a fallback method for complex PDFs. Only source pages that
        have translations produce output; each starts a new letter-size page
        headed ``Page N``. Lines longer than 80 characters are word-wrapped,
        and text that reaches the bottom margin continues on a new page
        headed ``Page N (continued)``.

        Args:
            file_path: Path of the source PDF (used to derive the output path).
            translations: ``(translated_text, metadata)`` pairs; each
                metadata dict must contain ``page_number`` (0-based).

        Returns:
            Path of the generated PDF.

        Raises:
            ProcessorError: if writing the output fails.
        """
        try:
            output_path = self.generate_output_path(file_path, "translated_text_only")
            page_translations = self._group_by_page(translations)

            pdf_canvas = canvas.Canvas(str(output_path), pagesize=letter)

            # Break pages on the output index, not the source page number:
            # keying on the page number emitted a blank first page whenever
            # source page 0 had no translations.
            for index, page_num in enumerate(sorted(page_translations)):
                if index > 0:
                    pdf_canvas.showPage()

                title = f"Page {page_num + 1}"
                y_position = self._begin_text_page(pdf_canvas, title)

                for text_block in page_translations[page_num]:
                    for line in text_block.split('\n'):
                        if not line.strip():
                            continue
                        for segment in self._wrap_line(line, self._WRAP_WIDTH):
                            if y_position <= self._BOTTOM_MARGIN:
                                # Out of room: continue on a fresh page
                                # instead of dropping text or drawing past
                                # the page edge.
                                pdf_canvas.showPage()
                                y_position = self._begin_text_page(
                                    pdf_canvas, f"{title} (continued)"
                                )
                            self._draw_line(pdf_canvas, y_position, segment)
                            y_position -= 12
                    y_position -= 8  # Block spacing

            pdf_canvas.save()
            return output_path
        except Exception as e:
            raise ProcessorError(f"Failed to create text-only PDF: {str(e)}") from e

    @property
    def supported_extensions(self) -> List[str]:
        """File extensions handled by this processor."""
        return ['.pdf']

    @staticmethod
    def _group_by_page(translations: List[Tuple[str, dict]]) -> Dict[int, List[str]]:
        """Group translated texts by ``metadata['page_number']``, keeping input order."""
        grouped: Dict[int, List[str]] = {}
        for translated_text, metadata in translations:
            grouped.setdefault(metadata['page_number'], []).append(translated_text)
        return grouped

    @staticmethod
    def _wrap_line(line: str, width: int) -> List[str]:
        """Split ``line`` on whitespace into segments shorter than ``width``.

        A line of at most ``width`` characters is returned stripped as a
        single segment. Words are never broken, so a single word of
        ``width`` or more characters forms its own over-long segment.
        """
        if len(line) <= width:
            return [line.strip()]

        segments: List[str] = []
        current = ""
        for word in line.split():
            # ``current`` always ends with a space, so this compares the
            # length of the candidate segment (without trailing space).
            if len(current) + len(word) < width:
                current += word + " "
            else:
                if current.strip():
                    segments.append(current.strip())
                current = word + " "
        if current.strip():
            segments.append(current.strip())
        return segments

    def _draw_line(self, pdf_canvas, y_position: float, text: str) -> None:
        """Draw ``text`` at the left margin, falling back to ASCII on encode errors."""
        try:
            pdf_canvas.drawString(self._LEFT_MARGIN, y_position, text)
        except UnicodeEncodeError:
            # Fallback for problematic characters
            safe_text = text.encode('ascii', 'ignore').decode('ascii')
            pdf_canvas.drawString(self._LEFT_MARGIN, y_position, safe_text)

    def _begin_text_page(self, pdf_canvas, title: str) -> int:
        """Draw the bold page title, select the body font and return the first body y."""
        pdf_canvas.setFont('Helvetica-Bold', 14)
        pdf_canvas.drawString(self._LEFT_MARGIN, 750, title)
        pdf_canvas.setFont('Helvetica', 11)
        return 720