Spaces:
Sleeping
Sleeping
| from typing import List, Tuple, Generator | |
| from pathlib import Path | |
| from docx import Document | |
| from docx.shared import Inches | |
| from core.base_processor import DocumentProcessor | |
| from core.exceptions import ProcessorError | |
class DOCXProcessor(DocumentProcessor):
    """Microsoft Word (.docx) document processor.

    Text is read from body paragraphs, table cells, and section
    headers/footers. Every extracted element carries positional metadata,
    and ``apply_translations`` uses that same metadata to write translated
    text back into a freshly loaded copy of the document.
    """

    @staticmethod
    def _iter_header_footer_parts(doc):
        """Yield ``(element_type, section_index, part)`` for own headers/footers.

        For each section the header is yielded before the footer. Parts that
        are linked to the previous section are skipped: python-docx resolves
        a linked part to the definition of an earlier section (creating one
        if none exists), so reading it would report the same text again for
        every linked section and could add an empty header/footer to the
        document as a side effect.
        """
        for section_idx, section in enumerate(doc.sections):
            candidates = (
                ('header', section.header),
                ('footer', section.footer),
            )
            for element_type, part in candidates:
                if part is None or part.is_linked_to_previous:
                    continue
                yield element_type, section_idx, part

    def extract_text_elements(self, file_path: Path) -> Generator[Tuple[str, dict], None, None]:
        """Yield ``(text, metadata)`` for each non-blank element of the document.

        Elements are produced in this order: body paragraphs, table cells,
        then, per section, header paragraphs followed by footer paragraphs.

        Args:
            file_path: Location of the Word document to read.

        Yields:
            Tuples of the element text and a metadata dict. The dict always
            contains ``element_type`` (``'paragraph'``, ``'table_cell'``,
            ``'header'`` or ``'footer'``) and ``original_text``, plus the
            index keys that locate the element (``index``; or
            ``table_index``/``row_index``/``cell_index``; or
            ``section_index``/``paragraph_index``).

        Raises:
            ProcessorError: If the document cannot be opened or read.
        """
        try:
            doc = Document(file_path)

            # Body paragraphs. The index counts blank paragraphs too, so it
            # stays valid when the document is re-enumerated on write-back.
            for para_idx, paragraph in enumerate(doc.paragraphs):
                text = paragraph.text
                if not text.strip():
                    continue
                yield text, {
                    'element_type': 'paragraph',
                    'index': para_idx,
                    'style': paragraph.style.name if paragraph.style else 'Normal',
                    'original_text': text,
                }

            # Table cells, addressed by (table, row, cell) position.
            for table_idx, table in enumerate(doc.tables):
                for row_idx, row in enumerate(table.rows):
                    for cell_idx, cell in enumerate(row.cells):
                        text = cell.text
                        if not text.strip():
                            continue
                        yield text, {
                            'element_type': 'table_cell',
                            'table_index': table_idx,
                            'row_index': row_idx,
                            'cell_index': cell_idx,
                            'original_text': text,
                        }

            # Headers and footers that are defined by their own section.
            for element_type, section_idx, part in self._iter_header_footer_parts(doc):
                for para_idx, paragraph in enumerate(part.paragraphs):
                    text = paragraph.text
                    if not text.strip():
                        continue
                    yield text, {
                        'element_type': element_type,
                        'section_index': section_idx,
                        'paragraph_index': para_idx,
                        'original_text': text,
                    }
        except Exception as e:
            # Chain the cause explicitly so the underlying failure stays
            # attached to the domain error.
            raise ProcessorError(f"Failed to extract text from Word document: {str(e)}") from e

    def apply_translations(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
        """Write translated text into a copy of the document and save it.

        Args:
            file_path: Location of the original Word document.
            translations: ``(translated_text, metadata)`` pairs, where the
                metadata has the shape produced by ``extract_text_elements``.
                Entries with an unrecognised ``element_type`` are ignored.

        Returns:
            The path returned by ``self.generate_output_path(file_path,
            "translated")``, to which the translated document was saved.

        Raises:
            ProcessorError: If loading, updating, or saving the document fails.
        """
        try:
            # Work on a fresh load of the original document.
            doc = Document(file_path)

            # Index the translations by the position keys used at extraction.
            paragraph_translations = {}
            table_translations = {}
            part_translations = {}  # (element_type, section_index, paragraph_index)
            for translated_text, metadata in translations:
                element_type = metadata['element_type']
                if element_type == 'paragraph':
                    paragraph_translations[metadata['index']] = translated_text
                elif element_type == 'table_cell':
                    table_key = (
                        metadata['table_index'],
                        metadata['row_index'],
                        metadata['cell_index'],
                    )
                    table_translations[table_key] = translated_text
                elif element_type in ('header', 'footer'):
                    part_key = (
                        element_type,
                        metadata['section_index'],
                        metadata['paragraph_index'],
                    )
                    part_translations[part_key] = translated_text

            # Body paragraphs.
            for para_idx, paragraph in enumerate(doc.paragraphs):
                if para_idx in paragraph_translations:
                    paragraph.text = paragraph_translations[para_idx]

            # Table cells.
            for table_idx, table in enumerate(doc.tables):
                for row_idx, row in enumerate(table.rows):
                    for cell_idx, cell in enumerate(row.cells):
                        table_key = (table_idx, row_idx, cell_idx)
                        if table_key in table_translations:
                            cell.text = table_translations[table_key]

            # Headers and footers. Skipped entirely when nothing targets
            # them, so those parts are not touched at all.
            if part_translations:
                for element_type, section_idx, part in self._iter_header_footer_parts(doc):
                    for para_idx, paragraph in enumerate(part.paragraphs):
                        part_key = (element_type, section_idx, para_idx)
                        if part_key in part_translations:
                            paragraph.text = part_translations[part_key]

            # Save the translated document next to the original naming scheme.
            output_path = self.generate_output_path(file_path, "translated")
            doc.save(output_path)
            return output_path
        except Exception as e:
            raise ProcessorError(f"Failed to apply translations to Word document: {str(e)}") from e

    def supported_extensions(self) -> List[str]:
        """Return the file extensions this processor handles."""
        return ['.docx']