Spaces:

Marek4321
/

BabelSlide_2.0

Sleeping

File size: 8,334 Bytes

ce00c7a

from typing import List, Tuple, Generator
from pathlib import Path
import fitz  # PyMuPDF
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from reportlab.pdfbase import pdfutils
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfbase import pdfmetrics
from core.base_processor import DocumentProcessor
from core.exceptions import ProcessorError

class PDFProcessor(DocumentProcessor):
    """PDF document processor"""
    
    def __init__(self, translator):
        """Set up the processor and choose the font for generated PDFs.

        Args:
            translator: Handed unchanged to the ``DocumentProcessor``
                constructor; what it must provide is defined there, not here.
        """
        super().__init__(translator)
        # Font used by apply_translations() for all text it draws.
        # Helvetica is used as the default because it is always available.
        # NOTE(review): a built-in Helvetica presumably lacks glyphs for
        # non-Latin scripts, so such translations may not render - confirm.
        self.font_name = 'Helvetica'
    
    def extract_text_elements(self, file_path: Path) -> Generator[Tuple[str, dict], None, None]:
        """Yield every non-empty text block of a PDF with its location.

        Each block reported by ``page.get_text("dict")`` is flattened to one
        string: the spans of a line are concatenated and the lines are joined
        with newlines. Blocks whose text is empty after stripping are skipped.

        Args:
            file_path: Path of the PDF to read.

        Yields:
            ``(text, metadata)`` tuples. ``text`` is the stripped block text;
            ``metadata`` holds ``page_number`` (0-based), ``block_index``
            (position of the block on its page), ``bbox`` (the block's
            bounding box as reported by PyMuPDF) and ``original_text`` (same
            value as ``text``).

        Raises:
            ProcessorError: If the document cannot be opened or parsed. The
                underlying exception is attached as ``__cause__``.
        """
        try:
            pdf_document = fitz.open(file_path)
            try:
                for page_num, page in enumerate(pdf_document):
                    text_blocks = page.get_text("dict")

                    for block_idx, block in enumerate(text_blocks["blocks"]):
                        # Only blocks that carry a "lines" entry contain text
                        if "lines" not in block:
                            continue

                        block_text = "\n".join(
                            "".join(span["text"] for span in line["spans"])
                            for line in block["lines"]
                        ).strip()
                        if not block_text:
                            continue

                        yield block_text, {
                            'page_number': page_num,
                            'block_index': block_idx,
                            'bbox': block["bbox"],  # Bounding box for positioning
                            'original_text': block_text,
                        }
            finally:
                # Also runs when parsing fails or the consumer abandons the
                # generator early, so the file handle is never leaked.
                pdf_document.close()

        except Exception as e:
            raise ProcessorError(f"Failed to extract text from PDF: {str(e)}") from e
    
    def apply_translations(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
        """Write the translated text to a new, simplified PDF.

        The output has one page per page of the original document, each sized
        like its original. Every page starts with a "Page N" heading followed
        by that page's translated blocks, top to bottom in the order given.
        The original layout, fonts and images are not reproduced.

        Known limitation: lines that would fall below the bottom margin are
        dropped, and long lines are not wrapped.

        Args:
            file_path: Path of the original PDF. It supplies the page count
                and page sizes, and the output path is derived from it.
            translations: ``(translated_text, metadata)`` pairs. Only
                ``metadata['page_number']`` (0-based) is read.

        Returns:
            Path of the generated PDF, as produced by
            ``self.generate_output_path(file_path, "translated")``.

        Raises:
            ProcessorError: If the original cannot be read or the output
                cannot be produced. The underlying exception is attached as
                ``__cause__``.
        """
        left_margin = 50
        bottom_margin = 50
        line_height = 15
        block_gap = 10

        try:
            output_path = self.generate_output_path(file_path, "translated")

            # Group the translated blocks by page, preserving their order
            page_translations = {}
            for translated_text, metadata in translations:
                page_translations.setdefault(metadata['page_number'], []).append(translated_text)

            original_pdf = fitz.open(file_path)
            try:
                pdf_canvas = canvas.Canvas(str(output_path), pagesize=letter)

                for page_num, page in enumerate(original_pdf):
                    # Close the previous output page before starting this one
                    if page_num > 0:
                        pdf_canvas.showPage()

                    page_rect = page.rect

                    # All y-coordinates below are measured from the original
                    # page height, so the output page must have the same size.
                    # With a fixed Letter canvas, text on taller pages (e.g.
                    # A4) would be drawn above the visible area.
                    pdf_canvas.setPageSize((page_rect.width, page_rect.height))

                    # The font is set again on every page, after showPage()
                    pdf_canvas.setFont(self.font_name, 12)

                    # Add page number at top
                    pdf_canvas.drawString(left_margin, page_rect.height - 30, f"Page {page_num + 1}")

                    blocks = page_translations.get(page_num)
                    if not blocks:
                        # Empty page - just show page number and a note
                        pdf_canvas.drawString(
                            left_margin,
                            page_rect.height - 80,
                            "(No translatable content on this page)",
                        )
                        continue

                    y_position = page_rect.height - 60  # Start below page number
                    for text in blocks:
                        for raw_line in text.split('\n'):
                            line = raw_line.strip()
                            if not line or y_position <= bottom_margin:
                                continue
                            try:
                                pdf_canvas.drawString(left_margin, y_position, line)
                            except UnicodeEncodeError:
                                # Fallback for problematic characters: drop
                                # everything that is not plain ASCII
                                safe_text = line.encode('ascii', 'ignore').decode('ascii')
                                pdf_canvas.drawString(left_margin, y_position, safe_text)
                            y_position -= line_height

                        y_position -= block_gap

                pdf_canvas.save()
            finally:
                # Release the source document even if drawing or saving fails
                original_pdf.close()

            return output_path

        except Exception as e:
            raise ProcessorError(f"Failed to apply translations to PDF: {str(e)}") from e
    
    def create_text_only_pdf(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
        """Create a plain, text-only PDF from the translations.

        Intended as a fallback for PDFs that are too complex for
        ``apply_translations``. Only pages that have at least one translation
        are written, in ascending page order, on US Letter pages. Each page
        has a bold "Page N" heading followed by its translated blocks; lines
        longer than 80 characters are wrapped on word boundaries.

        Known limitation: text that would fall below the bottom margin is
        dropped rather than continued on another page.

        Args:
            file_path: Path of the original PDF; only used to derive the
                output path.
            translations: ``(translated_text, metadata)`` pairs. Only
                ``metadata['page_number']`` (0-based) is read.

        Returns:
            Path of the generated PDF, as produced by
            ``self.generate_output_path(file_path, "translated_text_only")``.

        Raises:
            ProcessorError: If the PDF cannot be produced. The underlying
                exception is attached as ``__cause__``.
        """
        import textwrap  # local import keeps this block self-contained

        left_margin = 50
        bottom_margin = 50
        line_height = 12
        block_gap = 8
        max_chars = 80

        try:
            output_path = self.generate_output_path(file_path, "translated_text_only")

            # Group by pages, preserving the order of blocks within a page
            page_translations = {}
            for translated_text, metadata in translations:
                page_translations.setdefault(metadata['page_number'], []).append(translated_text)

            pdf_canvas = canvas.Canvas(str(output_path), pagesize=letter)

            for position, page_num in enumerate(sorted(page_translations)):
                # Break before every output page except the first one. The
                # test uses the output position, not page_num: when page 0
                # has no translations, testing page_num would emit a blank
                # leading page.
                if position > 0:
                    pdf_canvas.showPage()

                # Add page title
                pdf_canvas.setFont('Helvetica-Bold', 14)
                pdf_canvas.drawString(left_margin, 750, f"Page {page_num + 1}")

                y_position = 720
                pdf_canvas.setFont('Helvetica', 11)

                for text_block in page_translations[page_num]:
                    for raw_line in text_block.split('\n'):
                        if not raw_line.strip():
                            continue

                        if len(raw_line) > max_chars:
                            # Wrap on word boundaries; a single over-long
                            # word is kept whole rather than split.
                            pieces = textwrap.wrap(
                                raw_line,
                                width=max_chars - 1,
                                break_long_words=False,
                                break_on_hyphens=False,
                            )
                        else:
                            pieces = [raw_line.strip()]

                        for piece in pieces:
                            # Respect the bottom margin for wrapped lines too
                            if y_position <= bottom_margin:
                                break
                            pdf_canvas.drawString(left_margin, y_position, piece)
                            y_position -= line_height

                    y_position -= block_gap  # Block spacing

            pdf_canvas.save()
            return output_path

        except Exception as e:
            raise ProcessorError(f"Failed to create text-only PDF: {str(e)}") from e
    
    @property
    def supported_extensions(self) -> List[str]:
        return ['.pdf']