# BabelSlide_2.0/processors/docx_processor.py
# Marek4321's picture
# Upload 14 files
# 1df1e0b verified
from typing import List, Tuple, Generator
from pathlib import Path
from docx import Document
from docx.shared import Inches
from core.base_processor import DocumentProcessor
from core.exceptions import ProcessorError
class DOCXProcessor(DocumentProcessor):
    """Microsoft Word (.docx) document processor.

    Extracts translatable text from body paragraphs, table cells, headers
    and footers, and writes translated text back to the same positions.

    Elements are addressed purely by their enumeration indices, so the
    metadata passed to ``apply_translations`` must come from
    ``extract_text_elements`` run on the same, unmodified file.
    """

    @staticmethod
    def _iter_header_footer_paragraphs(doc) -> Generator[Tuple[str, int, int, object], None, None]:
        """Yield every header/footer paragraph that a section defines itself.

        Yields ``(element_type, section_index, paragraph_index, paragraph)``
        where ``element_type`` is ``'header'`` or ``'footer'``. For each
        section the header paragraphs come first, then the footer paragraphs.

        Parts linked to the previous section are skipped: python-docx resolves
        them to the earlier section's definition, so the same text would be
        reported again under another section index, and touching the content
        of a never-defined header/footer may add an empty definition to the
        document.
        """
        for section_idx, section in enumerate(doc.sections):
            parts = (('header', section.header), ('footer', section.footer))
            for element_type, part in parts:
                if part is None or part.is_linked_to_previous:
                    continue
                for para_idx, paragraph in enumerate(part.paragraphs):
                    yield element_type, section_idx, para_idx, paragraph

    def extract_text_elements(self, file_path: Path) -> Generator[Tuple[str, dict], None, None]:
        """Extract translatable text from a Word document.

        Args:
            file_path: Path of the .docx file to read.

        Yields:
            ``(text, metadata)`` pairs, in this order: body paragraphs, table
            cells, then per section its header and footer paragraphs.
            Elements whose text is empty or whitespace-only are skipped.
            ``metadata`` always holds ``'element_type'`` and
            ``'original_text'`` plus the indices locating the element:

            * ``'paragraph'``: ``'index'``, ``'style'``
            * ``'table_cell'``: ``'table_index'``, ``'row_index'``,
              ``'cell_index'``
            * ``'header'`` / ``'footer'``: ``'section_index'``,
              ``'paragraph_index'``

        Raises:
            ProcessorError: If the document cannot be opened or read; the
                underlying exception is attached as ``__cause__``.
        """
        try:
            doc = Document(file_path)

            # Body paragraphs
            for para_idx, paragraph in enumerate(doc.paragraphs):
                text = paragraph.text
                if text.strip():
                    metadata = {
                        'element_type': 'paragraph',
                        'index': para_idx,
                        'style': paragraph.style.name if paragraph.style else 'Normal',
                        'original_text': text,
                    }
                    yield text, metadata

            # Table cells
            for table_idx, table in enumerate(doc.tables):
                for row_idx, row in enumerate(table.rows):
                    for cell_idx, cell in enumerate(row.cells):
                        text = cell.text
                        if text.strip():
                            metadata = {
                                'element_type': 'table_cell',
                                'table_index': table_idx,
                                'row_index': row_idx,
                                'cell_index': cell_idx,
                                'original_text': text,
                            }
                            yield text, metadata

            # Headers and footers
            for element_type, section_idx, para_idx, paragraph in (
                self._iter_header_footer_paragraphs(doc)
            ):
                text = paragraph.text
                if text.strip():
                    metadata = {
                        'element_type': element_type,
                        'section_index': section_idx,
                        'paragraph_index': para_idx,
                        'original_text': text,
                    }
                    yield text, metadata

        except Exception as e:
            raise ProcessorError(f"Failed to extract text from Word document: {e}") from e

    def apply_translations(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
        """Write translated text into a copy of a Word document.

        Args:
            file_path: Path of the original .docx file; it is re-read and
                not modified in place.
            translations: ``(translated_text, metadata)`` pairs whose
                metadata has the shape produced by
                ``extract_text_elements``. Entries with an unrecognised
                ``'element_type'`` are ignored.

        Returns:
            The path returned by ``generate_output_path(file_path,
            "translated")``, to which the translated document was saved.

        Raises:
            ProcessorError: If loading, updating or saving the document
                fails (including metadata with missing keys); the underlying
                exception is attached as ``__cause__``.
        """
        try:
            doc = Document(file_path)

            # Index the translations by the position of their target element.
            paragraph_translations = {}
            table_translations = {}
            header_footer_translations = {}
            for translated_text, metadata in translations:
                element_type = metadata['element_type']
                if element_type == 'paragraph':
                    paragraph_translations[metadata['index']] = translated_text
                elif element_type == 'table_cell':
                    table_key = (
                        metadata['table_index'],
                        metadata['row_index'],
                        metadata['cell_index'],
                    )
                    table_translations[table_key] = translated_text
                elif element_type in ('header', 'footer'):
                    part_key = (
                        element_type,
                        metadata['section_index'],
                        metadata['paragraph_index'],
                    )
                    header_footer_translations[part_key] = translated_text

            # NOTE: per python-docx, assigning ``.text`` replaces the existing
            # runs with a single run, so run-level formatting (bold, italic,
            # ...) of the original text is not carried over.

            # Body paragraphs
            for para_idx, paragraph in enumerate(doc.paragraphs):
                if para_idx in paragraph_translations:
                    paragraph.text = paragraph_translations[para_idx]

            # Table cells
            for table_idx, table in enumerate(doc.tables):
                for row_idx, row in enumerate(table.rows):
                    for cell_idx, cell in enumerate(row.cells):
                        table_key = (table_idx, row_idx, cell_idx)
                        if table_key in table_translations:
                            cell.text = table_translations[table_key]

            # Headers and footers (same traversal as used for extraction, so
            # the indices line up)
            for element_type, section_idx, para_idx, paragraph in (
                self._iter_header_footer_paragraphs(doc)
            ):
                part_key = (element_type, section_idx, para_idx)
                if part_key in header_footer_translations:
                    paragraph.text = header_footer_translations[part_key]

            # Save the translated document next to / in place of nothing:
            # the destination is decided by the base-class helper.
            output_path = self.generate_output_path(file_path, "translated")
            doc.save(output_path)
            return output_path

        except Exception as e:
            raise ProcessorError(f"Failed to apply translations to Word document: {e}") from e

    @property
    def supported_extensions(self) -> List[str]:
        """File extensions handled by this processor."""
        return ['.docx']