Spaces:

Marek4321
/

BabelSlide_2.0

Sleeping

File size: 6,451 Bytes

1df1e0b

from typing import List, Tuple, Generator
from pathlib import Path
from docx import Document
from docx.shared import Inches
from core.base_processor import DocumentProcessor
from core.exceptions import ProcessorError

class DOCXProcessor(DocumentProcessor):
    """Microsoft Word document processor.

    Extracts translatable text from body paragraphs, the cells of the tables
    exposed by ``doc.tables``, and section headers/footers, and writes
    translated text back to the same positions. Positions are communicated
    through the metadata dict that accompanies every extracted text.
    """

    def extract_text_elements(self, file_path: Path) -> Generator[Tuple[str, dict], None, None]:
        """Extract text from Word document.

        Args:
            file_path: Path of the .docx file to read.

        Yields:
            ``(text, metadata)`` tuples for every element whose text is not
            blank. ``metadata['element_type']`` is one of ``'paragraph'``,
            ``'table_cell'``, ``'header'`` or ``'footer'``; the remaining keys
            hold the indices needed by :meth:`apply_translations` plus the
            untouched ``'original_text'``.

        Raises:
            ProcessorError: If the document cannot be opened or traversed.
        """
        try:
            doc = Document(file_path)

            # Body paragraphs
            for para_idx, paragraph in enumerate(doc.paragraphs):
                text = paragraph.text
                if text.strip():
                    yield text, {
                        'element_type': 'paragraph',
                        'index': para_idx,
                        'style': paragraph.style.name if paragraph.style else 'Normal',
                        'original_text': text,
                    }

            # Table cells
            for (table_idx, row_idx, cell_idx), cell in self._iter_table_cells(doc):
                text = cell.text
                if text.strip():
                    yield text, {
                        'element_type': 'table_cell',
                        'table_index': table_idx,
                        'row_index': row_idx,
                        'cell_index': cell_idx,
                        'original_text': text,
                    }

            # Headers and footers (only those owned by their section)
            for element_type, section_idx, para_idx, paragraph in (
                self._iter_header_footer_paragraphs(doc)
            ):
                text = paragraph.text
                if text.strip():
                    yield text, {
                        'element_type': element_type,
                        'section_index': section_idx,
                        'paragraph_index': para_idx,
                        'original_text': text,
                    }

        except Exception as e:
            # Chain the cause so the underlying python-docx/zip error stays visible.
            raise ProcessorError(f"Failed to extract text from Word document: {str(e)}") from e

    def apply_translations(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
        """Apply translations to Word document.

        Args:
            file_path: Path of the original .docx file; it is re-opened and
                the result is saved to a new path.
            translations: ``(translated_text, metadata)`` tuples where
                ``metadata`` is the dict produced by
                :meth:`extract_text_elements`. Entries with an unrecognised
                ``element_type`` are ignored.

        Returns:
            The path returned by ``self.generate_output_path(file_path,
            "translated")``, to which the modified document was saved.

        Raises:
            ProcessorError: If loading, updating or saving the document fails
                (including malformed metadata).
        """
        try:
            # Load the original document
            doc = Document(file_path)

            # Index translations by position; later duplicates win.
            translated = {}
            for translated_text, metadata in translations:
                key = self._translation_key(metadata)
                if key is not None:
                    translated[key] = translated_text

            # Body paragraphs
            for para_idx, paragraph in enumerate(doc.paragraphs):
                key = ('paragraph', para_idx)
                if key in translated:
                    self._set_paragraph_text(paragraph, translated[key])

            # Table cells
            for position, cell in self._iter_table_cells(doc):
                key = ('table_cell',) + position
                if key in translated:
                    self._set_cell_text(cell, translated[key])

            # Headers and footers
            for element_type, section_idx, para_idx, paragraph in (
                self._iter_header_footer_paragraphs(doc)
            ):
                key = (element_type, section_idx, para_idx)
                if key in translated:
                    self._set_paragraph_text(paragraph, translated[key])

            # Save translated document
            output_path = self.generate_output_path(file_path, "translated")
            doc.save(output_path)

            return output_path

        except Exception as e:
            raise ProcessorError(f"Failed to apply translations to Word document: {str(e)}") from e

    @staticmethod
    def _translation_key(metadata: dict):
        """Return the position key for a metadata dict, or None if its type is unknown."""
        element_type = metadata['element_type']
        if element_type == 'paragraph':
            return (element_type, metadata['index'])
        if element_type == 'table_cell':
            return (
                element_type,
                metadata['table_index'],
                metadata['row_index'],
                metadata['cell_index'],
            )
        if element_type in ('header', 'footer'):
            return (element_type, metadata['section_index'], metadata['paragraph_index'])
        return None

    @staticmethod
    def _iter_table_cells(doc):
        """Yield ``((table_idx, row_idx, cell_idx), cell)`` for every cell in ``doc.tables``."""
        # NOTE(review): python-docx reports a merged cell once per grid position
        # it spans, so merged cells may appear more than once here - confirm
        # whether de-duplication is wanted before changing the indexing scheme.
        for table_idx, table in enumerate(doc.tables):
            for row_idx, row in enumerate(table.rows):
                for cell_idx, cell in enumerate(row.cells):
                    yield (table_idx, row_idx, cell_idx), cell

    @staticmethod
    def _iter_header_footer_paragraphs(doc):
        """Yield ``(element_type, section_idx, para_idx, paragraph)`` for owned headers/footers.

        For each section the header paragraphs come first, then the footer
        paragraphs.
        """
        for section_idx, section in enumerate(doc.sections):
            for element_type, part in (('header', section.header), ('footer', section.footer)):
                # A header/footer linked to the previous section has no content
                # of its own: python-docx resolves it to the earlier section's
                # definition (creating an empty one if none exists). Skipping it
                # avoids handling the same paragraphs once per section and
                # avoids adding header/footer parts the document never had.
                if not part or part.is_linked_to_previous:
                    continue
                for para_idx, paragraph in enumerate(part.paragraphs):
                    yield element_type, section_idx, para_idx, paragraph

    @staticmethod
    def _set_paragraph_text(paragraph, text: str) -> None:
        """Replace a paragraph's text, keeping the first text run's formatting when safe."""
        runs = paragraph.runs
        text_runs = [run for run in runs if run.text]

        # Assigning ``paragraph.text`` swaps all content for one unformatted run.
        # Rewrite in place instead, but only when the runs account for the whole
        # paragraph text; otherwise (e.g. text living outside plain runs) fall
        # back to the setter so no stale text is left behind.
        if not text_runs or ''.join(run.text for run in runs) != paragraph.text:
            paragraph.text = text
            return

        anchor, *others = text_runs
        anchor.text = text
        # Runs without text are left untouched; only text-bearing runs are emptied.
        for run in others:
            run.text = ''

    @classmethod
    def _set_cell_text(cls, cell, text: str) -> None:
        """Replace a cell's text, preserving formatting for single-paragraph cells."""
        paragraphs = cell.paragraphs
        if len(paragraphs) == 1:
            cls._set_paragraph_text(paragraphs[0], text)
        else:
            # Multi-paragraph cells were extracted as one string; keep the
            # original behaviour of collapsing them via the cell-level setter.
            cell.text = text

    @property
    def supported_extensions(self) -> List[str]:
        """File extensions handled by this processor."""
        return ['.docx']