# NOTE: page-scrape residue from the file viewer, not part of the module
# (the viewer's line-number gutter "1 ... 137" has been dropped).
# Spaces: Sleeping | File size: 6,451 Bytes | 1df1e0b
from typing import List, Tuple, Generator
from pathlib import Path
from docx import Document
from docx.shared import Inches
from core.base_processor import DocumentProcessor
from core.exceptions import ProcessorError
class DOCXProcessor(DocumentProcessor):
    """Microsoft Word (.docx) document processor.

    Extracts translatable text from body paragraphs, table cells and section
    headers/footers, and writes translated text back to the same locations.
    """

    def extract_text_elements(self, file_path: Path) -> Generator[Tuple[str, dict], None, None]:
        """Yield every non-blank text element of a Word document.

        Elements are produced in this order: body paragraphs, table cells,
        then, section by section, header paragraphs followed by footer
        paragraphs.

        Args:
            file_path: Path of the .docx file to read.

        Yields:
            ``(text, metadata)`` pairs. ``metadata['element_type']`` is one of
            ``'paragraph'``, ``'table_cell'``, ``'header'`` or ``'footer'``;
            the other keys hold the indices that locate the element, plus the
            untouched text under ``'original_text'``.

        Raises:
            ProcessorError: If anything fails while opening or reading the
                document. The underlying exception is attached as the cause.
        """
        try:
            doc = Document(file_path)

            yield from self._iter_body_paragraphs(doc)
            yield from self._iter_table_cells(doc)

            for section_idx, section in enumerate(doc.sections):
                yield from self._iter_header_footer(section.header, 'header', section_idx)
                yield from self._iter_header_footer(section.footer, 'footer', section_idx)
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise ProcessorError(f"Failed to extract text from Word document: {str(e)}") from e

    def apply_translations(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
        """Write translated text into a copy of a Word document.

        Args:
            file_path: Path of the original .docx file.
            translations: ``(translated_text, metadata)`` pairs whose metadata
                has the layout produced by ``extract_text_elements``. Entries
                with an unrecognised ``'element_type'`` are ignored.

        Returns:
            The path the translated document was saved to, as returned by
            ``self.generate_output_path(file_path, "translated")``.

        Raises:
            ProcessorError: If anything fails while loading, updating or
                saving the document. The underlying exception is attached as
                the cause.
        """
        try:
            doc = Document(file_path)

            # Index the translations by location. Header/footer translations
            # are nested as {section_index: {paragraph_index: text}} so that a
            # section's header/footer is only read when there is something to
            # write into it.
            paragraph_translations = {}
            table_translations = {}
            header_translations = {}
            footer_translations = {}
            for translated_text, metadata in translations:
                element_type = metadata['element_type']
                if element_type == 'paragraph':
                    paragraph_translations[metadata['index']] = translated_text
                elif element_type == 'table_cell':
                    table_key = (
                        metadata['table_index'],
                        metadata['row_index'],
                        metadata['cell_index'],
                    )
                    table_translations[table_key] = translated_text
                elif element_type == 'header':
                    per_section = header_translations.setdefault(metadata['section_index'], {})
                    per_section[metadata['paragraph_index']] = translated_text
                elif element_type == 'footer':
                    per_section = footer_translations.setdefault(metadata['section_index'], {})
                    per_section[metadata['paragraph_index']] = translated_text

            self._replace_paragraph_texts(doc.paragraphs, paragraph_translations)

            # Only walk the tables when there is at least one cell to update.
            if table_translations:
                for table_idx, table in enumerate(doc.tables):
                    for row_idx, row in enumerate(table.rows):
                        for cell_idx, cell in enumerate(row.cells):
                            table_key = (table_idx, row_idx, cell_idx)
                            if table_key in table_translations:
                                cell.text = table_translations[table_key]

            for section_idx, section in enumerate(doc.sections):
                header_map = header_translations.get(section_idx)
                if header_map and section.header:
                    self._replace_paragraph_texts(section.header.paragraphs, header_map)

                footer_map = footer_translations.get(section_idx)
                if footer_map and section.footer:
                    self._replace_paragraph_texts(section.footer.paragraphs, footer_map)

            output_path = self.generate_output_path(file_path, "translated")
            doc.save(output_path)
            return output_path
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise ProcessorError(
                f"Failed to apply translations to Word document: {str(e)}"
            ) from e

    @property
    def supported_extensions(self) -> List[str]:
        """File extensions handled by this processor."""
        return ['.docx']

    @staticmethod
    def _iter_body_paragraphs(doc) -> Generator[Tuple[str, dict], None, None]:
        """Yield ``(text, metadata)`` for each non-blank body paragraph of ``doc``."""
        for para_idx, paragraph in enumerate(doc.paragraphs):
            text = paragraph.text
            if text.strip():
                yield text, {
                    'element_type': 'paragraph',
                    'index': para_idx,
                    'style': paragraph.style.name if paragraph.style else 'Normal',
                    'original_text': text,
                }

    @staticmethod
    def _iter_table_cells(doc) -> Generator[Tuple[str, dict], None, None]:
        """Yield ``(text, metadata)`` for each non-blank table cell of ``doc``."""
        # NOTE(review): python-docx appears to report a merged cell once per
        # grid position it spans, so merged cells may be yielded more than
        # once - confirm whether de-duplication is wanted here.
        for table_idx, table in enumerate(doc.tables):
            for row_idx, row in enumerate(table.rows):
                for cell_idx, cell in enumerate(row.cells):
                    text = cell.text
                    if text.strip():
                        yield text, {
                            'element_type': 'table_cell',
                            'table_index': table_idx,
                            'row_index': row_idx,
                            'cell_index': cell_idx,
                            'original_text': text,
                        }

    @staticmethod
    def _iter_header_footer(
        container, element_type: str, section_idx: int
    ) -> Generator[Tuple[str, dict], None, None]:
        """Yield ``(text, metadata)`` for each non-blank paragraph of a header/footer.

        ``container`` is a section's ``header`` or ``footer`` object and
        ``element_type`` is the matching ``'header'`` / ``'footer'`` tag stored
        in the metadata.
        """
        # A header/footer linked to the previous section exposes that
        # section's paragraphs, which were already yielded; skipping it avoids
        # emitting (and translating) the same text once per section.
        if not container or container.is_linked_to_previous:
            return
        for para_idx, paragraph in enumerate(container.paragraphs):
            text = paragraph.text
            if text.strip():
                yield text, {
                    'element_type': element_type,
                    'section_index': section_idx,
                    'paragraph_index': para_idx,
                    'original_text': text,
                }

    @staticmethod
    def _replace_paragraph_texts(paragraphs, translations_by_index: dict) -> None:
        """Overwrite the text of each paragraph whose position has a translation."""
        # NOTE(review): per the python-docx docs, assigning ``paragraph.text``
        # replaces the paragraph content with a single run, so run-level
        # formatting (bold, italic, ...) is not kept - confirm this is
        # acceptable for translated output.
        for para_idx, paragraph in enumerate(paragraphs):
            if para_idx in translations_by_index:
                paragraph.text = translations_by_index[para_idx]