File size: 6,451 Bytes
1df1e0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
from typing import List, Tuple, Generator
from pathlib import Path
from docx import Document
from docx.shared import Inches
from core.base_processor import DocumentProcessor
from core.exceptions import ProcessorError

class DOCXProcessor(DocumentProcessor):
    """Microsoft Word (.docx) document processor.

    Extracts text from body paragraphs, table cells, and section
    headers/footers, and writes translated text back to the same positions.
    Every extracted element carries a metadata dict whose ``element_type``
    and index fields are what ``apply_translations`` uses to locate the
    element again.
    """

    @staticmethod
    def _iter_header_footer_paragraphs(doc) -> Generator[Tuple[str, int, int, object], None, None]:
        """Yield ``(element_type, section_idx, para_idx, paragraph)`` tuples.

        ``element_type`` is ``'header'`` or ``'footer'``. For each section all
        header paragraphs are produced before that section's footer paragraphs.

        Headers/footers that are linked to the previous section are skipped:
        they have no content of their own, so reading them would return the
        previous section's paragraphs a second time, and (per the python-docx
        documentation) touching the content of an undefined header/footer can
        add a new, empty definition to the document.
        """
        for section_idx, section in enumerate(doc.sections):
            for element_type, part in (('header', section.header), ('footer', section.footer)):
                if not part or part.is_linked_to_previous:
                    continue
                for para_idx, paragraph in enumerate(part.paragraphs):
                    yield element_type, section_idx, para_idx, paragraph

    def extract_text_elements(self, file_path: Path) -> Generator[Tuple[str, dict], None, None]:
        """Yield ``(text, metadata)`` for each non-blank text element.

        Elements are produced in this order: body paragraphs, table cells,
        then per-section headers and footers. Elements whose text is empty or
        whitespace-only are skipped, but indices in the metadata always refer
        to the position in the full (unfiltered) sequence.

        Metadata keys by ``element_type``:
            * ``paragraph``: ``index``, ``style``, ``original_text``
            * ``table_cell``: ``table_index``, ``row_index``, ``cell_index``,
              ``original_text``
            * ``header`` / ``footer``: ``section_index``, ``paragraph_index``,
              ``original_text``

        Args:
            file_path: Path of the Word document to read.

        Raises:
            ProcessorError: If the document cannot be opened or read. Because
                this is a generator, the error surfaces while iterating, not
                when the method is called.
        """
        try:
            doc = Document(file_path)

            # Body paragraphs
            for para_idx, paragraph in enumerate(doc.paragraphs):
                text = paragraph.text
                if text.strip():
                    yield text, {
                        'element_type': 'paragraph',
                        'index': para_idx,
                        'style': paragraph.style.name if paragraph.style else 'Normal',
                        'original_text': text,
                    }

            # Table cells
            for table_idx, table in enumerate(doc.tables):
                for row_idx, row in enumerate(table.rows):
                    for cell_idx, cell in enumerate(row.cells):
                        text = cell.text
                        if text.strip():
                            yield text, {
                                'element_type': 'table_cell',
                                'table_index': table_idx,
                                'row_index': row_idx,
                                'cell_index': cell_idx,
                                'original_text': text,
                            }

            # Headers and footers (only those that define their own content)
            for element_type, section_idx, para_idx, paragraph in self._iter_header_footer_paragraphs(doc):
                text = paragraph.text
                if text.strip():
                    yield text, {
                        'element_type': element_type,
                        'section_index': section_idx,
                        'paragraph_index': para_idx,
                        'original_text': text,
                    }

        except Exception as e:
            # Chain the original exception so the root cause stays in the traceback.
            raise ProcessorError(f"Failed to extract text from Word document: {str(e)}") from e

    def apply_translations(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
        """Write translated text into the document and save the result.

        Args:
            file_path: Path of the original Word document.
            translations: ``(translated_text, metadata)`` pairs, where
                ``metadata`` has the shape produced by
                ``extract_text_elements``. Entries with an unrecognised
                ``element_type`` are ignored; if the same position appears
                more than once, the last entry wins.

        Returns:
            The path the translated document was saved to, as returned by
            ``self.generate_output_path(file_path, "translated")`` (not defined
            in this class; presumably provided by ``DocumentProcessor``).

        Raises:
            ProcessorError: If loading, updating, or saving the document fails
                (including metadata that is missing a required key).
        """
        try:
            doc = Document(file_path)

            # Index translations by position for O(1) lookup while walking the document.
            paragraph_translations = {}
            table_translations = {}
            section_translations = {}  # (element_type, section_idx, para_idx) -> text

            for translated_text, metadata in translations:
                element_type = metadata['element_type']

                if element_type == 'paragraph':
                    paragraph_translations[metadata['index']] = translated_text
                elif element_type == 'table_cell':
                    table_key = (metadata['table_index'], metadata['row_index'], metadata['cell_index'])
                    table_translations[table_key] = translated_text
                elif element_type in ('header', 'footer'):
                    section_key = (element_type, metadata['section_index'], metadata['paragraph_index'])
                    section_translations[section_key] = translated_text

            # NOTE: per the python-docx documentation, assigning ``.text`` replaces
            # the existing runs, so run-level formatting (bold, italic, ...) is lost.

            # Body paragraphs
            for para_idx, paragraph in enumerate(doc.paragraphs):
                if para_idx in paragraph_translations:
                    paragraph.text = paragraph_translations[para_idx]

            # Table cells
            for table_idx, table in enumerate(doc.tables):
                for row_idx, row in enumerate(table.rows):
                    for cell_idx, cell in enumerate(row.cells):
                        table_key = (table_idx, row_idx, cell_idx)
                        if table_key in table_translations:
                            cell.text = table_translations[table_key]

            # Headers and footers. Linked ones are skipped by the helper; they
            # share the paragraphs of an earlier section, which is updated
            # through that section's own entry.
            if section_translations:
                for element_type, section_idx, para_idx, paragraph in self._iter_header_footer_paragraphs(doc):
                    section_key = (element_type, section_idx, para_idx)
                    if section_key in section_translations:
                        paragraph.text = section_translations[section_key]

            # Save translated document
            output_path = self.generate_output_path(file_path, "translated")
            doc.save(output_path)

            return output_path

        except Exception as e:
            # Chain the original exception so the root cause stays in the traceback.
            raise ProcessorError(f"Failed to apply translations to Word document: {str(e)}") from e

    @property
    def supported_extensions(self) -> List[str]:
        """File extensions this processor handles."""
        return ['.docx']