Spaces:
Sleeping
Sleeping
| from typing import List, Tuple, Generator | |
| from pathlib import Path | |
| import fitz # PyMuPDF | |
| from reportlab.pdfgen import canvas | |
| from reportlab.lib.pagesizes import letter | |
| from reportlab.pdfbase import pdfutils | |
| from reportlab.pdfbase.ttfonts import TTFont | |
| from reportlab.pdfbase import pdfmetrics | |
| from core.base_processor import DocumentProcessor | |
| from core.exceptions import ProcessorError | |
class PDFProcessor(DocumentProcessor):
    """PDF document processor.

    Text is extracted block-by-block with PyMuPDF (``fitz``); translated text
    is written into a new, simplified PDF with ReportLab. The original layout
    (fonts, images, exact positions) is not reproduced.
    """

    # Layout constants (PDF points) shared by both output methods.
    _LEFT_MARGIN = 50
    _BOTTOM_MARGIN = 50
    # Maximum characters per drawn line in the text-only output.
    _WRAP_WIDTH = 80

    def __init__(self, translator):
        super().__init__(translator)
        # Helvetica is one of ReportLab's built-in standard fonts, so no font
        # file has to be registered before drawing with it.
        self.font_name = 'Helvetica'

    def extract_text_elements(self, file_path: Path) -> Generator[Tuple[str, dict], None, None]:
        """Yield ``(text, metadata)`` for every non-empty text block in the PDF.

        ``metadata`` holds ``page_number`` (0-based), ``block_index``,
        ``bbox`` (the block's bounding box as reported by PyMuPDF) and
        ``original_text`` (same string as ``text``).

        Raises:
            ProcessorError: if the document cannot be opened or parsed.
        """
        try:
            pdf_document = fitz.open(file_path)
            try:
                for page_num in range(len(pdf_document)):
                    page = pdf_document[page_num]
                    text_blocks = page.get_text("dict")
                    for block_idx, block in enumerate(text_blocks["blocks"]):
                        if "lines" not in block:
                            # Blocks without "lines" carry no text.
                            continue
                        # One output line per PDF line; spans within a line
                        # are concatenated as-is.
                        block_text = "\n".join(
                            "".join(span["text"] for span in line["spans"])
                            for line in block["lines"]
                        ).strip()
                        if not block_text:
                            continue
                        metadata = {
                            'page_number': page_num,
                            'block_index': block_idx,
                            'bbox': block["bbox"],  # Bounding box for positioning
                            'original_text': block_text,
                        }
                        yield block_text, metadata
            finally:
                # Runs on normal exhaustion, on errors, and when the consumer
                # abandons the generator early, so the file handle never leaks.
                pdf_document.close()
        except Exception as e:
            raise ProcessorError(f"Failed to extract text from PDF: {str(e)}") from e

    def apply_translations(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
        """Write the translations into a new PDF, one output page per source page.

        Each output page has the same dimensions as the corresponding source
        page, starts with a "Page N" label and then lists that page's
        translated blocks top to bottom. Lines that would fall below the
        bottom margin are not drawn.

        Note: PDF translation is complex due to formatting preservation.
        This creates a simplified translated version.

        Args:
            file_path: the original PDF (used for page count and page sizes).
            translations: ``(translated_text, metadata)`` pairs; ``metadata``
                must contain ``page_number`` and ``bbox``.

        Returns:
            Path of the generated PDF.

        Raises:
            ProcessorError: if the output cannot be produced.
        """
        try:
            output_path = self.generate_output_path(file_path, "translated")

            # Group translations by page, preserving their incoming order.
            page_translations = {}
            for translated_text, metadata in translations:
                page_translations.setdefault(metadata['page_number'], []).append({
                    'text': translated_text,
                    'bbox': metadata['bbox'],  # kept for future positioned layout
                })

            original_pdf = fitz.open(file_path)
            try:
                pdf_canvas = canvas.Canvas(str(output_path), pagesize=letter)
                for page_num in range(len(original_pdf)):
                    if page_num > 0:
                        pdf_canvas.showPage()

                    page_rect = original_pdf[page_num].rect
                    # All y-coordinates below are derived from the source page
                    # height, so the output page must have the same size;
                    # otherwise text on pages taller than letter is drawn
                    # outside the visible area.
                    pdf_canvas.setPageSize((page_rect.width, page_rect.height))
                    # Font state is reset by showPage(), so set it per page.
                    pdf_canvas.setFont(self.font_name, 12)

                    pdf_canvas.drawString(
                        self._LEFT_MARGIN, page_rect.height - 30, f"Page {page_num + 1}"
                    )

                    if page_num not in page_translations:
                        # Empty page - just show page number and a note
                        pdf_canvas.drawString(
                            self._LEFT_MARGIN,
                            page_rect.height - 80,
                            "(No translatable content on this page)",
                        )
                        continue

                    y_position = page_rect.height - 60  # Start below page number
                    for translation_block in page_translations[page_num]:
                        for line in translation_block['text'].split('\n'):
                            stripped = line.strip()
                            if stripped and y_position > self._BOTTOM_MARGIN:
                                try:
                                    pdf_canvas.drawString(self._LEFT_MARGIN, y_position, stripped)
                                except UnicodeEncodeError:
                                    # Fallback for problematic characters
                                    safe_text = stripped.encode('ascii', 'ignore').decode('ascii')
                                    pdf_canvas.drawString(self._LEFT_MARGIN, y_position, safe_text)
                                y_position -= 15  # Line spacing
                        y_position -= 10  # Block spacing

                pdf_canvas.save()
            finally:
                original_pdf.close()
            return output_path
        except Exception as e:
            raise ProcessorError(f"Failed to apply translations to PDF: {str(e)}") from e

    @staticmethod
    def _wrap_words(line: str, width: int) -> List[str]:
        """Greedily split ``line`` on whitespace into segments under ``width`` chars.

        A single word longer than ``width`` is kept whole on its own segment.
        """
        segments = []
        current_line = ""
        for word in line.split():
            if len(current_line + word) < width:
                current_line += word + " "
            else:
                if current_line.strip():
                    segments.append(current_line.strip())
                current_line = word + " "
        if current_line.strip():
            segments.append(current_line.strip())
        return segments

    def create_text_only_pdf(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
        """Create a simplified, letter-sized, text-only PDF with the translations.

        This is a fallback method for complex PDFs. Only source pages that
        have translations get an output page; each starts with a bold
        "Page N" title. Lines longer than 80 characters are word-wrapped.
        Text that would fall below the bottom margin is not drawn.

        Args:
            file_path: the original PDF (used only to derive the output path).
            translations: ``(translated_text, metadata)`` pairs; ``metadata``
                must contain ``page_number``.

        Returns:
            Path of the generated PDF.

        Raises:
            ProcessorError: if the output cannot be produced.
        """
        try:
            output_path = self.generate_output_path(file_path, "translated_text_only")

            # Group by pages
            page_translations = {}
            for translated_text, metadata in translations:
                page_translations.setdefault(metadata['page_number'], []).append(translated_text)

            pdf_canvas = canvas.Canvas(str(output_path), pagesize=letter)
            for index, page_num in enumerate(sorted(page_translations)):
                # Break on output position, not on the source page number:
                # if source page 0 has no translations, testing page_num would
                # emit a blank leading page.
                if index > 0:
                    pdf_canvas.showPage()

                # Add page title
                pdf_canvas.setFont('Helvetica-Bold', 14)
                pdf_canvas.drawString(self._LEFT_MARGIN, 750, f"Page {page_num + 1}")

                y_position = 720
                pdf_canvas.setFont('Helvetica', 11)
                for text_block in page_translations[page_num]:
                    for line in text_block.split('\n'):
                        if not line.strip():
                            continue
                        if len(line) > self._WRAP_WIDTH:
                            segments = self._wrap_words(line, self._WRAP_WIDTH)
                        else:
                            segments = [line.strip()]
                        for segment in segments:
                            # Guard every drawn segment, including wrapped
                            # continuations, against running off the page.
                            if y_position <= self._BOTTOM_MARGIN:
                                break
                            pdf_canvas.drawString(self._LEFT_MARGIN, y_position, segment)
                            y_position -= 12
                    y_position -= 8  # Block spacing

            pdf_canvas.save()
            return output_path
        except Exception as e:
            raise ProcessorError(f"Failed to create text-only PDF: {str(e)}") from e

    def supported_extensions(self) -> List[str]:
        """Return the file extensions this processor handles."""
        return ['.pdf']