Spaces:
Sleeping
Sleeping
File size: 8,334 Bytes
ce00c7a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 |
from typing import List, Tuple, Generator
from pathlib import Path
import fitz # PyMuPDF
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from reportlab.pdfbase import pdfutils
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfbase import pdfmetrics
from core.base_processor import DocumentProcessor
from core.exceptions import ProcessorError
class PDFProcessor(DocumentProcessor):
    """PDF document processor.

    Text is read from the source PDF with PyMuPDF (``fitz``) and translated
    text is written into a newly generated PDF with ReportLab. The generated
    files are simplified renderings: text is placed top-down from a fixed
    left margin, and the original layout is not reproduced.
    """

    # Layout constants, in PDF points.
    _LEFT_MARGIN = 50
    _BOTTOM_MARGIN = 50  # nothing is drawn at or below this y coordinate
    _WRAP_CHARS = 80  # per-line character budget in the text-only output

    def __init__(self, translator):
        """Initialise the processor.

        Args:
            translator: Forwarded unchanged to ``DocumentProcessor.__init__``.
        """
        super().__init__(translator)
        # Use Helvetica as default - it's always available
        self.font_name = 'Helvetica'

    def extract_text_elements(self, file_path: Path) -> Generator[Tuple[str, dict], None, None]:
        """Yield the text of every non-empty text block in the PDF.

        Args:
            file_path: Path of the PDF to read.

        Yields:
            ``(text, metadata)`` tuples. ``text`` is the block's text with one
            output line per PDF line and surrounding whitespace stripped.
            ``metadata`` contains ``page_number`` (0-based), ``block_index``,
            ``bbox`` and ``original_text`` (identical to ``text``).

        Raises:
            ProcessorError: If the document cannot be opened or read.
        """
        try:
            # The context manager closes the document even when reading fails
            # or the caller stops iterating before the generator is exhausted.
            with fitz.open(file_path) as pdf_document:
                for page_num, page in enumerate(pdf_document):
                    text_blocks = page.get_text("dict")
                    for block_idx, block in enumerate(text_blocks["blocks"]):
                        if "lines" not in block:  # Not a text block
                            continue
                        # Spans of one line are concatenated as-is; lines are
                        # separated by newlines.
                        block_text = "\n".join(
                            "".join(span["text"] for span in line["spans"])
                            for line in block["lines"]
                        ).strip()
                        if not block_text:
                            continue
                        yield block_text, {
                            'page_number': page_num,
                            'block_index': block_idx,
                            'bbox': block["bbox"],  # Bounding box for positioning
                            'original_text': block_text,
                        }
        except Exception as e:
            raise ProcessorError(f"Failed to extract text from PDF: {str(e)}") from e

    def apply_translations(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
        """Write the translations into a new PDF, one page per original page.

        Note: PDF translation is complex due to formatting preservation.
        This creates a simplified translated version: each output page has the
        size of the corresponding original page, a "Page N" header, and the
        translated blocks listed top-down in the order they were supplied.
        Lines that would fall at or below the bottom margin are not drawn.

        Args:
            file_path: Path of the original PDF.
            translations: ``(translated_text, metadata)`` tuples; ``metadata``
                must contain ``page_number`` and ``bbox``.

        Returns:
            Path of the generated PDF.

        Raises:
            ProcessorError: If the output cannot be produced.
        """
        try:
            # Create output path
            output_path = self.generate_output_path(file_path, "translated")

            # Group translations by page, preserving their incoming order.
            # The bbox is carried along but not used for placement here.
            page_translations = {}
            for translated_text, metadata in translations:
                page_translations.setdefault(metadata['page_number'], []).append({
                    'text': translated_text,
                    'bbox': metadata['bbox'],
                })

            # Create new PDF with translations
            pdf_canvas = canvas.Canvas(str(output_path), pagesize=letter)

            # The context manager closes the original even if drawing fails.
            with fitz.open(file_path) as original_pdf:
                for page_num, page in enumerate(original_pdf):
                    # Create a new page for each page in original
                    if page_num > 0:
                        pdf_canvas.showPage()

                    # All y coordinates below are derived from the original
                    # page height, so the output page must have that size;
                    # otherwise text on pages taller than the canvas default
                    # would be placed off the sheet.
                    page_rect = page.rect
                    pdf_canvas.setPageSize((page_rect.width, page_rect.height))

                    # Set font (must be repeated after every showPage()).
                    pdf_canvas.setFont(self.font_name, 12)

                    # Add page number at top
                    pdf_canvas.drawString(
                        self._LEFT_MARGIN, page_rect.height - 30, f"Page {page_num + 1}"
                    )

                    if page_num not in page_translations:
                        # Empty page - just show page number and a note
                        pdf_canvas.drawString(
                            self._LEFT_MARGIN,
                            page_rect.height - 80,
                            "(No translatable content on this page)",
                        )
                        continue

                    y_position = page_rect.height - 60  # Start below page number
                    for translation_block in page_translations[page_num]:
                        # Handle multi-line text
                        for line in translation_block['text'].split('\n'):
                            stripped = line.strip()
                            if not stripped or y_position <= self._BOTTOM_MARGIN:
                                continue
                            try:
                                pdf_canvas.drawString(self._LEFT_MARGIN, y_position, stripped)
                            except UnicodeEncodeError:
                                # Fallback for problematic characters
                                safe_text = stripped.encode('ascii', 'ignore').decode('ascii')
                                pdf_canvas.drawString(self._LEFT_MARGIN, y_position, safe_text)
                            y_position -= 15  # Line spacing
                        y_position -= 10  # Block spacing

            pdf_canvas.save()
            return output_path
        except Exception as e:
            raise ProcessorError(f"Failed to apply translations to PDF: {str(e)}") from e

    @staticmethod
    def _wrap_line(line: str, max_chars: int = 80) -> List[str]:
        """Split ``line`` into stripped segments of roughly ``max_chars`` characters.

        Lines no longer than ``max_chars`` are returned as a single stripped
        segment. Longer lines are broken greedily at whitespace; a single word
        longer than the budget is kept whole. Blank lines yield no segments.
        """
        if len(line) <= max_chars:
            stripped = line.strip()
            return [stripped] if stripped else []

        segments = []
        current = ""
        for word in line.split():
            if len(current + word) < max_chars:
                current += word + " "
            else:
                if current.strip():
                    segments.append(current.strip())
                current = word + " "
        if current.strip():
            segments.append(current.strip())
        return segments

    def create_text_only_pdf(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
        """Create a simplified text-only PDF with translations.

        This is a fallback method for complex PDFs. Only pages that have at
        least one translation are emitted, in ascending page order, on
        letter-sized sheets. Each page starts with a "Page N" title followed
        by the translated blocks, with long lines wrapped at word boundaries.
        Text that would fall at or below the bottom margin is not drawn.

        Args:
            file_path: Path of the original PDF (used to derive the output path).
            translations: ``(translated_text, metadata)`` tuples; ``metadata``
                must contain ``page_number``.

        Returns:
            Path of the generated PDF.

        Raises:
            ProcessorError: If the output cannot be produced.
        """
        try:
            output_path = self.generate_output_path(file_path, "translated_text_only")

            # Group by pages
            page_translations = {}
            for translated_text, metadata in translations:
                page_translations.setdefault(metadata['page_number'], []).append(translated_text)

            pdf_canvas = canvas.Canvas(str(output_path), pagesize=letter)

            for position, page_num in enumerate(sorted(page_translations)):
                # Break on the position in the output, not on the original
                # page number: the first translated page may not be page 0,
                # and breaking before it would emit a blank leading page.
                if position > 0:
                    pdf_canvas.showPage()

                # Add page title
                pdf_canvas.setFont('Helvetica-Bold', 14)
                pdf_canvas.drawString(self._LEFT_MARGIN, 750, f"Page {page_num + 1}")

                y_position = 720
                pdf_canvas.setFont('Helvetica', 11)

                for text_block in page_translations[page_num]:
                    for line in text_block.split('\n'):
                        # Handle long lines by wrapping
                        for segment in self._wrap_line(line, self._WRAP_CHARS):
                            # Checked per segment so that a wrapped line
                            # cannot run past the bottom margin.
                            if y_position <= self._BOTTOM_MARGIN:
                                break
                            pdf_canvas.drawString(self._LEFT_MARGIN, y_position, segment)
                            y_position -= 12
                    y_position -= 8  # Block spacing

            pdf_canvas.save()
            return output_path
        except Exception as e:
            raise ProcessorError(f"Failed to create text-only PDF: {str(e)}") from e

    @property
    def supported_extensions(self) -> List[str]:
        """File extensions handled by this processor."""
        return ['.pdf']