Spaces:

Marek4321
/

BabelSlide_2.0

Sleeping

App Files Files Community

BabelSlide_2.0 / processors /docx_processor.py

Marek4321

Upload 14 files

1df1e0b verified 5 months ago

raw

history blame contribute delete

6.45 kB

	from typing import List, Tuple, Generator
	from pathlib import Path
	from docx import Document
	from docx.shared import Inches
	from core.base_processor import DocumentProcessor
	from core.exceptions import ProcessorError

	class DOCXProcessor(DocumentProcessor):
	    """Microsoft Word (.docx) document processor.

	    Extracts translatable text from, and writes translations back to, four
	    kinds of element: body paragraphs, body table cells, section header
	    paragraphs and section footer paragraphs.

	    Elements are identified purely by position (the indexes stored in each
	    element's metadata), so translations must be applied to the same,
	    unmodified file they were extracted from.
	    """

	    def extract_text_elements(self, file_path: Path) -> Generator[Tuple[str, dict], None, None]:
	        """Yield every non-blank piece of text found in a Word document.

	        Args:
	            file_path: Location of the .docx file to read.

	        Yields:
	            ``(text, metadata)`` pairs. ``metadata`` always contains
	            ``'element_type'`` and ``'original_text'`` plus the positional
	            keys for that type:

	            * ``'paragraph'``: ``'index'``, ``'style'``
	            * ``'table_cell'``: ``'table_index'``, ``'row_index'``,
	              ``'cell_index'``
	            * ``'header'`` / ``'footer'``: ``'section_index'``,
	              ``'paragraph_index'``

	        Raises:
	            ProcessorError: If the document cannot be opened or read.
	        """
	        try:
	            doc = Document(file_path)

	            # Body paragraphs.
	            for para_idx, paragraph in enumerate(doc.paragraphs):
	                text = paragraph.text
	                if text.strip():
	                    yield text, {
	                        'element_type': 'paragraph',
	                        'index': para_idx,
	                        'style': paragraph.style.name if paragraph.style else 'Normal',
	                        'original_text': text,
	                    }

	            # Body tables.
	            # NOTE(review): python-docx is understood to return a merged cell
	            # once per grid column it spans, so merged cells may be yielded
	            # more than once here -- confirm against the library docs.
	            for table_idx, table in enumerate(doc.tables):
	                for row_idx, row in enumerate(table.rows):
	                    for cell_idx, cell in enumerate(row.cells):
	                        text = cell.text
	                        if text.strip():
	                            yield text, {
	                                'element_type': 'table_cell',
	                                'table_index': table_idx,
	                                'row_index': row_idx,
	                                'cell_index': cell_idx,
	                                'original_text': text,
	                            }

	            # Section headers, then footers, in section order.
	            for section_idx, section in enumerate(doc.sections):
	                parts = (('header', section.header), ('footer', section.footer))
	                for element_type, part in parts:
	                    # A header/footer linked to the previous section has no
	                    # content of its own: python-docx resolves it to the
	                    # earlier section's definition. Reading it again would
	                    # yield the same text once per section, so only parts
	                    # with their own definition are extracted.
	                    if not part or part.is_linked_to_previous:
	                        continue
	                    for para_idx, paragraph in enumerate(part.paragraphs):
	                        text = paragraph.text
	                        if text.strip():
	                            yield text, {
	                                'element_type': element_type,
	                                'section_index': section_idx,
	                                'paragraph_index': para_idx,
	                                'original_text': text,
	                            }

	        except Exception as e:
	            # Chain explicitly so the underlying failure stays attached.
	            raise ProcessorError(f"Failed to extract text from Word document: {str(e)}") from e

	    def apply_translations(self, file_path: Path, translations: List[Tuple[str, dict]]) -> Path:
	        """Write translated text into a copy of a Word document.

	        Each translation replaces the text of the element addressed by its
	        metadata. Elements without a translation are left untouched, and
	        entries with an unrecognised ``'element_type'`` are ignored.

	        NOTE: text is replaced through python-docx's ``text`` setters, which
	        per the library documentation replace the existing content with a
	        single run; character-level formatting inside a translated paragraph
	        is therefore not preserved.

	        Args:
	            file_path: Location of the original .docx file.
	            translations: ``(translated_text, metadata)`` pairs, where
	                ``metadata`` has the shape produced by
	                ``extract_text_elements``.

	        Returns:
	            Path of the saved, translated document, as produced by
	            ``self.generate_output_path(file_path, "translated")``.

	        Raises:
	            ProcessorError: If the document cannot be loaded, updated or
	                saved, or if a metadata dict lacks a required key.
	        """
	        try:
	            doc = Document(file_path)

	            # Index the translations by element position for O(1) lookup.
	            paragraph_translations = {}
	            table_translations = {}
	            # element type -> section index -> {paragraph index: text}
	            part_translations = {'header': {}, 'footer': {}}

	            for translated_text, metadata in translations:
	                element_type = metadata['element_type']

	                if element_type == 'paragraph':
	                    paragraph_translations[metadata['index']] = translated_text
	                elif element_type == 'table_cell':
	                    table_key = (
	                        metadata['table_index'],
	                        metadata['row_index'],
	                        metadata['cell_index'],
	                    )
	                    table_translations[table_key] = translated_text
	                elif element_type in part_translations:
	                    by_section = part_translations[element_type]
	                    by_paragraph = by_section.setdefault(metadata['section_index'], {})
	                    by_paragraph[metadata['paragraph_index']] = translated_text

	            # Body paragraphs.
	            for para_idx, paragraph in enumerate(doc.paragraphs):
	                if para_idx in paragraph_translations:
	                    paragraph.text = paragraph_translations[para_idx]

	            # Body tables.
	            for table_idx, table in enumerate(doc.tables):
	                for row_idx, row in enumerate(table.rows):
	                    for cell_idx, cell in enumerate(row.cells):
	                        table_key = (table_idx, row_idx, cell_idx)
	                        if table_key in table_translations:
	                            cell.text = table_translations[table_key]

	            # Section headers, then footers, in section order.
	            for section_idx, section in enumerate(doc.sections):
	                parts = (('header', section.header), ('footer', section.footer))
	                for element_type, part in parts:
	                    by_paragraph = part_translations[element_type].get(section_idx)
	                    # Only touch parts that actually have translations:
	                    # reading the content of a first-section header/footer
	                    # that has no definition makes python-docx add an empty
	                    # one, which would then be written to the output file.
	                    if not by_paragraph or not part:
	                        continue
	                    for para_idx, paragraph in enumerate(part.paragraphs):
	                        if para_idx in by_paragraph:
	                            paragraph.text = by_paragraph[para_idx]

	            # Save under a new name so the original file is not overwritten
	            # (assuming generate_output_path returns a different path).
	            output_path = self.generate_output_path(file_path, "translated")
	            doc.save(output_path)

	            return output_path

	        except Exception as e:
	            # Chain explicitly so the underlying failure stays attached.
	            raise ProcessorError(f"Failed to apply translations to Word document: {str(e)}") from e

	    @property
	    def supported_extensions(self) -> List[str]:
	        """File extensions (with leading dot) handled by this processor."""
	        return ['.docx']