Spaces:

Marek4321
/

BabelSlide_2.0

Sleeping

App Files Files Community

BabelSlide_2.0 / processors /pdf_processor.py

Marek4321

Upload pdf_processor.py

ce00c7a verified 5 months ago

raw

history blame contribute delete

8.33 kB

	from typing import List, Tuple, Generator
	from pathlib import Path
	import fitz # PyMuPDF
	from reportlab.pdfgen import canvas
	from reportlab.lib.pagesizes import letter
	from reportlab.pdfbase import pdfutils
	from reportlab.pdfbase.ttfonts import TTFont
	from reportlab.pdfbase import pdfmetrics
	from core.base_processor import DocumentProcessor
	from core.exceptions import ProcessorError

	class PDFProcessor(DocumentProcessor):
	    """Document processor that extracts text from PDFs and writes translated copies.

	    Text is read with PyMuPDF (``fitz``) and the translated output is rendered
	    with ReportLab as a simplified, text-only layout: the original fonts,
	    images and block positions are not reproduced.
	    """

	    # Layout constants, in PDF points.
	    _LEFT_MARGIN = 50
	    _BOTTOM_MARGIN = 50
	    # Maximum number of characters per rendered line in the text-only output.
	    _WRAP_WIDTH = 80

	    def __init__(self, translator):
	        """Initialise the processor with the translator passed to the base class."""
	        super().__init__(translator)
	        # Use Helvetica as default - it is a built-in ReportLab font, so no
	        # font file has to be registered before drawing.
	        self.font_name = 'Helvetica'

	    def extract_text_elements(self, file_path: Path) -> Generator[Tuple[str, dict], None, None]:
	        """Yield every non-empty text block of the PDF together with its metadata.

	        Args:
	            file_path: Path of the PDF to read.

	        Yields:
	            ``(text, metadata)`` tuples where ``text`` is the stripped block
	            text (spans concatenated, lines separated by a newline) and
	            ``metadata`` holds ``page_number`` (zero-based), ``block_index``,
	            ``bbox`` and ``original_text``.

	        Raises:
	            ProcessorError: If the document cannot be opened or parsed.
	        """
	        try:
	            pdf_document = fitz.open(file_path)
	            try:
	                for page_num, page in enumerate(pdf_document):
	                    text_blocks = page.get_text("dict")

	                    for block_idx, block in enumerate(text_blocks["blocks"]):
	                        if "lines" not in block:
	                            # Blocks without "lines" carry no text to translate.
	                            continue

	                        block_text = "\n".join(
	                            "".join(span["text"] for span in line["spans"])
	                            for line in block["lines"]
	                        ).strip()
	                        if not block_text:
	                            continue

	                        metadata = {
	                            'page_number': page_num,
	                            'block_index': block_idx,
	                            'bbox': block["bbox"],  # Bounding box for positioning
	                            'original_text': block_text,
	                        }
	                        yield block_text, metadata
	            finally:
	                # Runs on errors and when the consumer abandons the generator
	                # early, so the file handle is never leaked.
	                pdf_document.close()

	        except Exception as e:
	            raise ProcessorError(f"Failed to extract text from PDF: {e}") from e

	    @staticmethod
	    def _group_by_page(translations):
	        """Return ``{page_number: [translated_text, ...]}`` in input order."""
	        grouped = {}
	        for translated_text, metadata in translations:
	            grouped.setdefault(metadata['page_number'], []).append(translated_text)
	        return grouped

	    @classmethod
	    def _wrap_line(cls, line):
	        """Split ``line`` on whitespace into pieces shorter than ``_WRAP_WIDTH``."""
	        if len(line) <= cls._WRAP_WIDTH:
	            return [line.strip()]

	        pieces = []
	        current_line = ""
	        for word in line.split():
	            if len(current_line + word) < cls._WRAP_WIDTH:
	                current_line += word + " "
	            else:
	                if current_line.strip():
	                    pieces.append(current_line.strip())
	                # A single word longer than the limit is kept whole.
	                current_line = word + " "
	        if current_line.strip():
	            pieces.append(current_line.strip())
	        return pieces

	    def _draw_line(self, pdf_canvas, y_position, text):
	        """Draw ``text`` at the left margin, retrying as ASCII if encoding fails."""
	        try:
	            pdf_canvas.drawString(self._LEFT_MARGIN, y_position, text)
	        except UnicodeEncodeError:
	            # Fallback for problematic characters
	            safe_text = text.encode('ascii', 'ignore').decode('ascii')
	            pdf_canvas.drawString(self._LEFT_MARGIN, y_position, safe_text)

	    def _begin_translated_page(self, pdf_canvas, page_size, label):
	        """Size the current page, draw its header and return the first body y."""
	        width, height = page_size
	        # Match the source page so header and text stay inside the page even
	        # when the original is not letter-sized.
	        pdf_canvas.setPageSize((width, height))
	        # The font is set again for every page started after showPage().
	        pdf_canvas.setFont(self.font_name, 12)
	        pdf_canvas.drawString(self._LEFT_MARGIN, height - 30, label)
	        return height - 60  # Start below page number

	    def _begin_text_only_page(self, pdf_canvas, label):
	        """Draw the bold page title and return the first body y position."""
	        pdf_canvas.setFont('Helvetica-Bold', 14)
	        pdf_canvas.drawString(self._LEFT_MARGIN, 750, label)
	        pdf_canvas.setFont('Helvetica', 11)
	        return 720

	    def apply_translations(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
	        """Write a simplified translated copy of the PDF and return its path.

	        One output page is produced per source page, sized like the source
	        page and headed "Page N". Translated blocks are listed top to bottom
	        at the left margin; text that does not fit continues on an extra page
	        headed "Page N (continued)" instead of being dropped. Long lines are
	        not wrapped.

	        Args:
	            file_path: Path of the original PDF.
	            translations: ``(translated_text, metadata)`` tuples; each
	                ``metadata`` must contain ``page_number``.

	        Returns:
	            Path of the generated PDF.

	        Raises:
	            ProcessorError: If reading the original or writing the output fails.
	        """
	        try:
	            # Create output path
	            output_path = self.generate_output_path(file_path, "translated")

	            page_translations = self._group_by_page(translations)

	            # Create new PDF with translations
	            pdf_canvas = canvas.Canvas(str(output_path), pagesize=letter)

	            original_pdf = fitz.open(file_path)
	            try:
	                for page_num, page in enumerate(original_pdf):
	                    # Create a new page for each page in original
	                    if page_num > 0:
	                        pdf_canvas.showPage()

	                    page_size = (page.rect.width, page.rect.height)
	                    label = f"Page {page_num + 1}"
	                    top_y = self._begin_translated_page(pdf_canvas, page_size, label)

	                    blocks = page_translations.get(page_num)
	                    if not blocks:
	                        # Empty page - just show page number and a note
	                        pdf_canvas.drawString(
	                            self._LEFT_MARGIN,
	                            page.rect.height - 80,
	                            "(No translatable content on this page)",
	                        )
	                        continue

	                    y_position = top_y
	                    for text in blocks:
	                        for line in text.split('\n'):
	                            stripped = line.strip()
	                            if not stripped:
	                                continue
	                            # Break the page at the bottom margin. The second
	                            # test guarantees at least one line per page, so
	                            # very short pages cannot loop forever.
	                            if y_position <= self._BOTTOM_MARGIN and y_position < top_y:
	                                pdf_canvas.showPage()
	                                y_position = self._begin_translated_page(
	                                    pdf_canvas, page_size, f"{label} (continued)"
	                                )
	                            self._draw_line(pdf_canvas, y_position, stripped)
	                            y_position -= 15  # Line spacing

	                        y_position -= 10  # Block spacing
	            finally:
	                original_pdf.close()

	            pdf_canvas.save()
	            return output_path

	        except Exception as e:
	            raise ProcessorError(f"Failed to apply translations to PDF: {e}") from e

	    def create_text_only_pdf(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
	        """Write a letter-sized, text-only PDF of the translations and return its path.

	        This is a fallback for complex PDFs. Only pages that have
	        translations are emitted, in ascending page order, each headed
	        "Page N". Lines longer than 80 characters are wrapped on whitespace,
	        and text that does not fit continues on a page headed
	        "Page N (continued)".

	        Args:
	            file_path: Path of the original PDF (used only to derive the
	                output path).
	            translations: ``(translated_text, metadata)`` tuples; each
	                ``metadata`` must contain ``page_number``.

	        Returns:
	            Path of the generated PDF.

	        Raises:
	            ProcessorError: If writing the output fails.
	        """
	        try:
	            output_path = self.generate_output_path(file_path, "translated_text_only")

	            page_translations = self._group_by_page(translations)

	            pdf_canvas = canvas.Canvas(str(output_path), pagesize=letter)

	            for position, page_num in enumerate(sorted(page_translations)):
	                # Key the page break on the iteration position, not on the
	                # page number, so no blank leading page is produced when the
	                # first translated page is not page 0.
	                if position > 0:
	                    pdf_canvas.showPage()

	                label = f"Page {page_num + 1}"
	                y_position = self._begin_text_only_page(pdf_canvas, label)

	                for text_block in page_translations[page_num]:
	                    for line in text_block.split('\n'):
	                        if not line.strip():
	                            continue
	                        for piece in self._wrap_line(line):
	                            if y_position <= self._BOTTOM_MARGIN:
	                                pdf_canvas.showPage()
	                                y_position = self._begin_text_only_page(
	                                    pdf_canvas, f"{label} (continued)"
	                                )
	                            self._draw_line(pdf_canvas, y_position, piece)
	                            y_position -= 12
	                    y_position -= 8  # Block spacing

	            pdf_canvas.save()
	            return output_path

	        except Exception as e:
	            raise ProcessorError(f"Failed to create text-only PDF: {e}") from e

	    @property
	    def supported_extensions(self) -> List[str]:
	        """File extensions handled by this processor."""
	        return ['.pdf']