File size: 8,334 Bytes
ce00c7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
from typing import List, Tuple, Generator
from pathlib import Path
import fitz  # PyMuPDF
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from reportlab.pdfbase import pdfutils
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfbase import pdfmetrics
from core.base_processor import DocumentProcessor
from core.exceptions import ProcessorError

class PDFProcessor(DocumentProcessor):
    """PDF document processor.

    Text is extracted block by block with PyMuPDF (``fitz``) and the
    translated text is written into a new, simplified PDF with ReportLab.
    The original layout (fonts, images, positions) is not reproduced.
    """

    # Left, right and bottom margin (in PDF points) of every generated page.
    _PAGE_MARGIN = 50

    def __init__(self, translator):
        super().__init__(translator)
        # Use Helvetica as default - it's always available
        self.font_name = 'Helvetica'

    def extract_text_elements(self, file_path: Path) -> Generator[Tuple[str, dict], None, None]:
        """Yield the text blocks of a PDF together with their metadata.

        Args:
            file_path: Path of the PDF to read.

        Yields:
            ``(text, metadata)`` tuples, one per non-empty text block.
            ``metadata`` holds ``page_number`` (0-based), ``block_index``,
            ``bbox`` (the block's bounding box) and ``original_text``.

        Raises:
            ProcessorError: If the document cannot be opened or parsed.
        """
        try:
            pdf_document = fitz.open(file_path)
            try:
                for page_num, page in enumerate(pdf_document):
                    text_blocks = page.get_text("dict")

                    for block_idx, block in enumerate(text_blocks["blocks"]):
                        if "lines" not in block:
                            # Blocks without "lines" carry no text
                            continue

                        # One output line per PDF line, spans concatenated
                        block_text = "".join(
                            "".join(span["text"] for span in line["spans"]) + "\n"
                            for line in block["lines"]
                        ).strip()
                        if not block_text:
                            continue

                        metadata = {
                            'page_number': page_num,
                            'block_index': block_idx,
                            'bbox': block["bbox"],  # Bounding box for positioning
                            'original_text': block_text
                        }
                        yield block_text, metadata
            finally:
                # Runs on errors and when the consumer stops iterating early,
                # so the document handle is never leaked.
                pdf_document.close()

        except Exception as e:
            raise ProcessorError(f"Failed to extract text from PDF: {str(e)}") from e

    @staticmethod
    def _group_by_page(translations: List[Tuple[str, dict]]) -> dict:
        """Map each ``page_number`` to its translated texts, in input order."""
        grouped = {}
        for translated_text, metadata in translations:
            grouped.setdefault(metadata['page_number'], []).append(translated_text)
        return grouped

    @staticmethod
    def _wrap_text(pdf_canvas, text: str, font_name: str, font_size, max_width) -> List[str]:
        """Split ``text`` into stripped, non-empty lines that fit ``max_width``.

        Existing newlines are kept as line breaks. A line is only re-flowed
        (at whitespace) when its measured width exceeds ``max_width``; a
        single word wider than ``max_width`` is kept whole on its own line.
        """
        wrapped = []
        for raw_line in text.split('\n'):
            stripped = raw_line.strip()
            if not stripped:
                continue
            if pdf_canvas.stringWidth(stripped, font_name, font_size) <= max_width:
                wrapped.append(stripped)
                continue

            words = stripped.split()
            current = words[0]
            for word in words[1:]:
                candidate = f"{current} {word}"
                if pdf_canvas.stringWidth(candidate, font_name, font_size) <= max_width:
                    current = candidate
                else:
                    wrapped.append(current)
                    current = word
            wrapped.append(current)
        return wrapped

    @staticmethod
    def _draw_line(pdf_canvas, x, y, text: str) -> None:
        """Draw one line of text, dropping non-ASCII characters on encode errors."""
        try:
            pdf_canvas.drawString(x, y, text)
        except UnicodeEncodeError:
            # Fallback for problematic characters
            safe_text = text.encode('ascii', 'ignore').decode('ascii')
            pdf_canvas.drawString(x, y, safe_text)

    def _draw_page(self, pdf_canvas, page_num: int, blocks: List[str],
                   page_width, page_height, header_font: Tuple[str, int],
                   body_font: Tuple[str, int], header_drop, body_drop,
                   line_height, block_gap) -> None:
        """Draw the "Page N" header and the text blocks of one source page.

        Lines are wrapped to the printable width. When the text reaches the
        bottom margin a continuation page is started instead of discarding
        the remaining text.

        Args:
            pdf_canvas: ReportLab canvas positioned on a fresh page.
            page_num: 0-based number of the source page (shown 1-based).
            blocks: Translated text blocks to draw, in order.
            page_width: Width of the output page in points.
            page_height: Height of the output page in points.
            header_font: ``(font name, size)`` used for the page header.
            body_font: ``(font name, size)`` used for the block text.
            header_drop: Distance of the header baseline from the page top.
            body_drop: Distance of the first text baseline from the page top.
            line_height: Vertical advance after each drawn line.
            block_gap: Extra vertical advance after each block.
        """
        margin = self._PAGE_MARGIN
        max_width = page_width - 2 * margin

        def start_page(label):
            # showPage() resets the canvas font, so fonts are set per page
            pdf_canvas.setFont(*header_font)
            pdf_canvas.drawString(margin, page_height - header_drop, label)
            pdf_canvas.setFont(*body_font)
            return page_height - body_drop

        y_position = start_page(f"Page {page_num + 1}")
        for block in blocks:
            for line in self._wrap_text(pdf_canvas, block, body_font[0], body_font[1], max_width):
                if y_position <= margin:
                    # Out of vertical space: continue on a new page
                    pdf_canvas.showPage()
                    y_position = start_page(f"Page {page_num + 1} (continued)")
                self._draw_line(pdf_canvas, margin, y_position, line)
                y_position -= line_height  # Line spacing
            y_position -= block_gap  # Block spacing

    def apply_translations(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
        """
        Apply translations to PDF by creating a new document
        Note: PDF translation is complex due to formatting preservation.
        This creates a simplified translated version.

        Every page of the original produces an output page of the same size
        that lists the translated blocks top to bottom; text that does not
        fit continues on extra "(continued)" pages.

        Args:
            file_path: Path of the original PDF.
            translations: ``(translated_text, metadata)`` tuples; each
                ``metadata`` must contain ``page_number`` (0-based).

        Returns:
            Path of the generated PDF.

        Raises:
            ProcessorError: If reading the original or writing the output fails.
        """
        try:
            # Create output path
            output_path = self.generate_output_path(file_path, "translated")

            # Group translations by page
            page_translations = self._group_by_page(translations)

            original_pdf = fitz.open(file_path)
            try:
                # Create new PDF with translations
                pdf_canvas = canvas.Canvas(str(output_path), pagesize=letter)
                body_font = (self.font_name, 12)

                for page_num, page in enumerate(original_pdf):
                    # Create a new page for each page in original
                    if page_num > 0:
                        pdf_canvas.showPage()

                    # Match the output page to the original page so that
                    # coordinates derived from its height stay on the page
                    page_rect = page.rect
                    pdf_canvas.setPageSize((page_rect.width, page_rect.height))

                    blocks = page_translations.get(page_num)
                    self._draw_page(
                        pdf_canvas, page_num, blocks or [],
                        page_rect.width, page_rect.height,
                        header_font=body_font, body_font=body_font,
                        header_drop=30, body_drop=60,
                        line_height=15, block_gap=10,
                    )

                    if not blocks:
                        # Empty page - just show page number and a note
                        pdf_canvas.drawString(
                            self._PAGE_MARGIN, page_rect.height - 80,
                            "(No translatable content on this page)")

                pdf_canvas.save()
            finally:
                original_pdf.close()

            return output_path

        except Exception as e:
            raise ProcessorError(f"Failed to apply translations to PDF: {str(e)}") from e

    def create_text_only_pdf(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
        """
        Create a simplified text-only PDF with translations
        This is a fallback method for complex PDFs

        Only pages that have translations are written, in ascending page
        order on letter-sized pages; text that does not fit continues on
        extra "(continued)" pages.

        Args:
            file_path: Path of the original PDF (used to derive the output path).
            translations: ``(translated_text, metadata)`` tuples; each
                ``metadata`` must contain ``page_number`` (0-based).

        Returns:
            Path of the generated PDF.

        Raises:
            ProcessorError: If writing the output fails.
        """
        try:
            output_path = self.generate_output_path(file_path, "translated_text_only")

            # Group by pages
            page_translations = self._group_by_page(translations)

            pdf_canvas = canvas.Canvas(str(output_path), pagesize=letter)
            page_width, page_height = letter

            for position, page_num in enumerate(sorted(page_translations)):
                # Break before every page except the first one written; using
                # the source page number here would emit a blank leading page
                # whenever page 0 has no translations.
                if position > 0:
                    pdf_canvas.showPage()

                self._draw_page(
                    pdf_canvas, page_num, page_translations[page_num],
                    page_width, page_height,
                    header_font=('Helvetica-Bold', 14), body_font=('Helvetica', 11),
                    header_drop=42, body_drop=72,  # baselines at y=750 / y=720 on letter
                    line_height=12, block_gap=8,
                )

            pdf_canvas.save()
            return output_path

        except Exception as e:
            raise ProcessorError(f"Failed to create text-only PDF: {str(e)}") from e

    @property
    def supported_extensions(self) -> List[str]:
        """File extensions handled by this processor."""
        return ['.pdf']