from typing import List, Tuple, Generator
from pathlib import Path

from docx import Document
from docx.shared import Inches

from core.base_processor import DocumentProcessor
from core.exceptions import ProcessorError


class DOCXProcessor(DocumentProcessor):
    """Microsoft Word document processor.

    Extracts translatable text from a ``.docx`` file (body paragraphs, table
    cells, section headers and footers) and writes translated text back to
    the same positions in a copy of the document.

    Elements are addressed purely by position (paragraph index, table/row/cell
    index, section/paragraph index), so the metadata yielded by
    ``extract_text_elements`` is only valid for the same, unmodified file when
    passed to ``apply_translations``.
    """

    @staticmethod
    def _owned_header_footer_parts(doc) -> Generator[Tuple[str, int, object], None, None]:
        """Yield ``(element_type, section_index, part)`` for owned headers/footers.

        ``element_type`` is ``'header'`` or ``'footer'``. For each section the
        header is yielded before the footer.

        Parts whose ``is_linked_to_previous`` flag is set are skipped: reading
        their content is redirected by python-docx to an earlier section's
        definition (so the same text would be visited once per section), and
        on a first section that has no definition the access creates a new
        empty one.
        """
        for section_idx, section in enumerate(doc.sections):
            candidates = (('header', section.header), ('footer', section.footer))
            for element_type, part in candidates:
                if part.is_linked_to_previous:
                    continue
                yield element_type, section_idx, part

    def extract_text_elements(self, file_path: Path) -> Generator[Tuple[str, dict], None, None]:
        """Extract text from Word document.

        Yields ``(text, metadata)`` pairs in this order: body paragraphs,
        table cells, then per section its header and footer paragraphs.
        Elements whose text is empty or whitespace-only are not yielded.

        Every ``metadata`` dict carries ``'element_type'`` and
        ``'original_text'`` plus the positional keys for its type:

        * ``'paragraph'``: ``'index'``, ``'style'``
        * ``'table_cell'``: ``'table_index'``, ``'row_index'``, ``'cell_index'``
        * ``'header'`` / ``'footer'``: ``'section_index'``, ``'paragraph_index'``

        Args:
            file_path: Location of the ``.docx`` file to read.

        Raises:
            ProcessorError: If anything fails while loading or walking the
                document. Because this is a generator, the error surfaces
                during iteration, not at call time.
        """
        try:
            doc = Document(file_path)

            # Body paragraphs.
            for para_idx, paragraph in enumerate(doc.paragraphs):
                text = paragraph.text
                if text.strip():
                    yield text, {
                        'element_type': 'paragraph',
                        'index': para_idx,
                        # Fall back to 'Normal' when no style object is available.
                        'style': paragraph.style.name if paragraph.style else 'Normal',
                        'original_text': text,
                    }

            # Table cells.
            # NOTE(review): merged cells may be reported more than once by
            # row.cells - confirm whether duplicates need filtering upstream.
            for table_idx, table in enumerate(doc.tables):
                for row_idx, row in enumerate(table.rows):
                    for cell_idx, cell in enumerate(row.cells):
                        text = cell.text
                        if text.strip():
                            yield text, {
                                'element_type': 'table_cell',
                                'table_index': table_idx,
                                'row_index': row_idx,
                                'cell_index': cell_idx,
                                'original_text': text,
                            }

            # Headers and footers that each section actually owns.
            for element_type, section_idx, part in self._owned_header_footer_parts(doc):
                for para_idx, paragraph in enumerate(part.paragraphs):
                    text = paragraph.text
                    if text.strip():
                        yield text, {
                            'element_type': element_type,
                            'section_index': section_idx,
                            'paragraph_index': para_idx,
                            'original_text': text,
                        }

        except Exception as e:
            # Chain explicitly so the underlying error stays attached as the cause.
            raise ProcessorError(f"Failed to extract text from Word document: {e}") from e

    def apply_translations(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
        """Apply translations to Word document.

        Loads the original document, replaces the text of every element
        addressed by ``translations`` and saves the result to the path
        returned by ``self.generate_output_path(file_path, "translated")``.

        Args:
            file_path: Location of the original ``.docx`` file.
            translations: ``(translated_text, metadata)`` pairs where
                ``metadata`` has the keys produced by
                ``extract_text_elements``. Entries with an unrecognised
                ``'element_type'`` are ignored; if two entries address the
                same position the later one wins.

        Returns:
            The path the translated document was saved to.

        Raises:
            ProcessorError: If loading, updating or saving the document fails
                (including a metadata dict that lacks a required key).
        """
        try:
            doc = Document(file_path)

            # Index translations by position so each element is a dict lookup.
            paragraph_translations = {}
            table_translations = {}
            header_footer_translations = {}

            for translated_text, metadata in translations:
                element_type = metadata['element_type']

                if element_type == 'paragraph':
                    paragraph_translations[metadata['index']] = translated_text
                elif element_type == 'table_cell':
                    table_key = (
                        metadata['table_index'],
                        metadata['row_index'],
                        metadata['cell_index'],
                    )
                    table_translations[table_key] = translated_text
                elif element_type in ('header', 'footer'):
                    part_key = (
                        element_type,
                        metadata['section_index'],
                        metadata['paragraph_index'],
                    )
                    header_footer_translations[part_key] = translated_text

            # Body paragraphs. Per the python-docx documentation, assigning
            # ``paragraph.text`` replaces the paragraph content with a single
            # run: paragraph-level style is kept, run-level formatting is not.
            for para_idx, paragraph in enumerate(doc.paragraphs):
                if para_idx in paragraph_translations:
                    paragraph.text = paragraph_translations[para_idx]

            # Table cells.
            for table_idx, table in enumerate(doc.tables):
                for row_idx, row in enumerate(table.rows):
                    for cell_idx, cell in enumerate(row.cells):
                        table_key = (table_idx, row_idx, cell_idx)
                        if table_key in table_translations:
                            cell.text = table_translations[table_key]

            # Headers and footers. Linked parts are skipped so that merely
            # looking at them cannot add an empty definition to the output.
            for element_type, section_idx, part in self._owned_header_footer_parts(doc):
                for para_idx, paragraph in enumerate(part.paragraphs):
                    part_key = (element_type, section_idx, para_idx)
                    if part_key in header_footer_translations:
                        paragraph.text = header_footer_translations[part_key]

            # Save translated document.
            output_path = self.generate_output_path(file_path, "translated")
            doc.save(output_path)

            return output_path

        except Exception as e:
            # Chain explicitly so the underlying error stays attached as the cause.
            raise ProcessorError(f"Failed to apply translations to Word document: {e}") from e

    @property
    def supported_extensions(self) -> List[str]:
        """File extensions handled by this processor."""
        return ['.docx']