# BabelSlide_2.0 / processors / pptx_processor.py
# Marek4321's picture
# Upload 14 files
# 1df1e0b verified
from typing import List, Tuple, Generator
from pathlib import Path
from pptx import Presentation
from core.base_processor import DocumentProcessor
from core.exceptions import ProcessorError
class PPTXProcessor(DocumentProcessor):
    """PowerPoint presentation processor.

    Extracts the text of each slide shape for translation and writes the
    translated text back into a copy of the presentation. Shapes are
    identified purely by position: ``slide_index`` is the slide's position
    in ``prs.slides`` and ``shape_index`` is the shape's position in
    ``slide.shapes``. The indices produced by ``extract_text_elements`` are
    therefore only meaningful to ``apply_translations`` when both are run
    against the same, unmodified file.
    """

    def extract_text_elements(self, file_path: Path) -> Generator[Tuple[str, dict], None, None]:
        """Yield the text of every non-blank text shape in the presentation.

        Only shapes found directly in ``slide.shapes`` that expose a ``text``
        attribute are considered; shapes whose text is empty or whitespace
        only are skipped.
        NOTE(review): text inside tables or grouped shapes is presumably not
        reached by this check — confirm against python-pptx.

        Args:
            file_path: Path of the .pptx file to read.

        Yields:
            ``(text, metadata)`` tuples, where ``metadata`` contains
            ``slide_index``, ``shape_index``, ``shape_type`` (the ``str()``
            of the shape's Python type) and ``original_text``.

        Raises:
            ProcessorError: If loading or walking the presentation fails.
                Because this is a generator, the error surfaces while
                iterating, not when the method is called.
        """
        try:
            prs = Presentation(file_path)
            for slide_idx, slide in enumerate(prs.slides):
                for shape_idx, shape in enumerate(slide.shapes):
                    # Read the property once; shapes without a `text`
                    # attribute yield None and are skipped below.
                    text = getattr(shape, "text", None)
                    if not text or not text.strip():
                        continue
                    metadata = {
                        'slide_index': slide_idx,
                        'shape_index': shape_idx,
                        'shape_type': str(type(shape)),
                        'original_text': text,
                    }
                    yield text, metadata
        except Exception as e:
            # Chain the original exception so the root cause stays visible
            # in tracebacks.
            raise ProcessorError(f"Failed to extract text from PowerPoint: {e}") from e

    def apply_translations(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
        """Write translated text into a copy of the presentation.

        Args:
            file_path: Path of the original .pptx file.
            translations: ``(translated_text, metadata)`` tuples; each
                ``metadata`` must contain ``slide_index`` and ``shape_index``
                as produced by ``extract_text_elements``. If several entries
                target the same shape, the last one wins.

        Returns:
            The path the translated presentation was saved to, as returned
            by ``self.generate_output_path(file_path, "translated")``
            (not defined in this class; presumably inherited from
            ``DocumentProcessor``).

        Raises:
            ProcessorError: If loading, updating or saving the presentation
                fails, including when a metadata dict lacks a required key.
        """
        try:
            # Load the original presentation
            prs = Presentation(file_path)

            # Group translations as {slide_index: {shape_index: text}}
            translation_map = {}
            for translated_text, metadata in translations:
                slide_idx = metadata['slide_index']
                shape_idx = metadata['shape_index']
                translation_map.setdefault(slide_idx, {})[shape_idx] = translated_text

            # Apply translations, touching only slides that have any
            for slide_idx, slide in enumerate(prs.slides):
                slide_translations = translation_map.get(slide_idx)
                if not slide_translations:
                    continue
                for shape_idx, shape in enumerate(slide.shapes):
                    if shape_idx in slide_translations and hasattr(shape, "text"):
                        # NOTE(review): assigning `shape.text` replaces the
                        # shape's whole text content; per-run formatting is
                        # presumably not preserved — confirm.
                        shape.text = slide_translations[shape_idx]

            # Save translated presentation
            output_path = self.generate_output_path(file_path, "translated")
            prs.save(output_path)
            return output_path
        except Exception as e:
            # Chain the original exception so the root cause stays visible
            # in tracebacks.
            raise ProcessorError(f"Failed to apply translations to PowerPoint: {e}") from e

    @property
    def supported_extensions(self) -> List[str]:
        """File extensions (including the leading dot) this processor handles."""
        return ['.pptx']