Spaces:

DrAbdulmalek
/

OmniFile-Processor

Sleeping

OmniFile-Processor / modules /export /layout_preserving.py

Dr. Abdulmalek

deploy: OmniFile AI Processor v4.3.0

900df0b 19 days ago

9.15 kB

	"""
	layout_preserving.py — تصدير DOCX/HTML مع الحفاظ على التخطيط البصري.
	يقبل layout_data بصيغة JSON ويُعيد مستنداً يحاكي الكتابة الحاسوبية.

	يدعم تنسيقين:
	1. التنسيق البسيط (layout_data): {"image_path": "...", "blocks": [...]}
	2. الهيكل القياسي (normalized): {"metadata": {...}, "pages": [{"blocks": [...]}]}

	المؤلف: Dr Abdulmalek Tamer Al-husseini
	الترخيص: MIT
	"""
	import json
	import os
	from pathlib import Path
	from docx import Document
	from docx.shared import Inches, Pt, Cm
	from docx.enum.text import WD_ALIGN_PARAGRAPH
	from docx.oxml.ns import qn


	def _set_rtl(paragraph):
	"""ضبط اتجاه الفقرة RTL."""
	pPr = paragraph._element.get_or_add_pPr()
	pPr.set(qn('w:bidi'), '1')


	def export_to_docx(layout_data: dict, output_path: str) -> str:
	"""
	تصدير layout_data إلى ملف DOCX يحافظ على البنية البصرية.
	يُرجع مسار الملف المُنشأ.

	التنسيق البسيط:
	layout_data = {
	"image_path": "...",
	"blocks": [{"type": "paragraph", "bbox": [...], "text": "..."}, ...]
	}
	"""
	doc = Document()

	# هوامش موحدة
	for section in doc.sections:
	section.top_margin = Cm(2)
	section.bottom_margin = Cm(2)
	section.left_margin = Cm(2.5)
	section.right_margin = Cm(2.5)

	# نمط افتراضي RTL
	style = doc.styles['Normal']
	style.font.size = Pt(12)
	rPr = style.element.get_or_add_rPr()
	rPr.set(qn('w:rtl'), '1')

	for block in layout_data.get('blocks', []):
	btype = block.get('type', 'paragraph')

	if btype in ('paragraph', 'caption'):
	p = doc.add_paragraph()
	p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
	_set_rtl(p)
	run = p.add_run(block.get('text', ''))
	run.font.size = Pt(10 if btype == 'caption' else 12)
	if btype == 'caption':
	run.italic = True

	elif btype == 'header':
	p = doc.add_heading(block.get('text', ''), level=2)
	p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
	_set_rtl(p)

	elif btype == 'table':
	cells = block.get('cells', [])
	if not cells:
	continue
	rows, cols = len(cells), max(len(r) for r in cells)
	tbl = doc.add_table(rows=rows, cols=cols, style='Table Grid')
	tbl.alignment = WD_ALIGN_PARAGRAPH.RIGHT
	for i, row in enumerate(cells):
	for j, cell_text in enumerate(row):
	if j < cols:
	c = tbl.cell(i, j)
	c.text = ''
	p = c.paragraphs[0]
	p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
	_set_rtl(p)
	p.add_run(str(cell_text)).font.size = Pt(11)
	doc.add_paragraph() # مسافة بعد الجدول

	elif btype == 'image':
	img_file = block.get('image_file', '')
	if img_file and os.path.exists(img_file):
	doc.add_picture(img_file, width=Inches(4.5))
	doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER

	doc.save(output_path)
	return output_path


	def layout_to_docx(layout_json_path: str, output_docx: str) -> str:
	"""
	تصدير النتائج إلى DOCX مع الحفاظ على التخطيط.
	يعمل على ملف JSON بالهيكل القياسي (metadata + pages + blocks).

	هذا هو التنسيق المُنتج من modules.vision.normalize.normalize_ocr_output().

	Args:
	layout_json_path: مسار ملف JSON بالهيكل القياسي
	output_docx: مسار ملف DOCX المطلوب

	Returns:
	مسار ملف DOCX المُنشأ
	"""
	with open(layout_json_path, 'r', encoding='utf-8') as f:
	data = json.load(f)

	doc = Document()

	# هوامش موحدة
	for section in doc.sections:
	section.top_margin = Cm(2)
	section.bottom_margin = Cm(2)
	section.left_margin = Cm(2.5)
	section.right_margin = Cm(2.5)

	# نمط افتراضي RTL
	style = doc.styles['Normal']
	style.font.size = Pt(12)
	rPr = style.element.get_or_add_rPr()
	rPr.set(qn('w:rtl'), '1')

	for page in data.get("pages", []):
	page_w = page.get("width", 2480)
	page_h = page.get("height", 3508)

	for block in page.get("blocks", []):
	b_type = block.get("type", "paragraph")

	if b_type == "paragraph":
	rtl = block.get("direction", "").lower() == "rtl"
	p = doc.add_paragraph()
	if rtl:
	p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
	_set_rtl(p)
	run = p.add_run(block.get("text", ""))
	run.font.size = Pt(12)

	elif b_type == "header":
	p = doc.add_heading(block.get("text", ''), level=2)
	p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
	_set_rtl(p)

	elif b_type == "table":
	cells_struct = block.get("structure", {}).get("cells", [])
	if not cells_struct:
	# محاولة استخدام cells البسيط (للتوافق)
	simple_cells = block.get("cells", [])
	if simple_cells:
	rows, cols = len(simple_cells), max(len(r) for r in simple_cells)
	tbl = doc.add_table(rows=rows, cols=cols, style='Table Grid')
	tbl.alignment = WD_ALIGN_PARAGRAPH.RIGHT
	for i, row in enumerate(simple_cells):
	for j, cell_text in enumerate(row):
	if j < cols:
	c = tbl.cell(i, j)
	c.text = ''
	p = c.paragraphs[0]
	p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
	_set_rtl(p)
	p.add_run(str(cell_text)).font.size = Pt(11)
	doc.add_paragraph()
	continue

	# إعادة بناء صفوف/أعمدة من الهيكل القياسي
	rows_dict: dict[int, dict[int, str]] = {}
	for cell in cells_struct:
	r = cell["row"]
	c = cell["col"]
	rows_dict.setdefault(r, {})[c] = cell["text"]

	if not rows_dict:
	continue

	max_col = max(max(row.keys()) for row in rows_dict.values()) + 1
	num_rows = len(rows_dict)

	tbl = doc.add_table(
	rows=num_rows, cols=max_col, style='Table Grid'
	)
	tbl.alignment = WD_ALIGN_PARAGRAPH.RIGHT

	for r in sorted(rows_dict.keys()):
	for c in rows_dict[r]:
	cell = tbl.cell(r, c)
	cell.text = ''
	p = cell.paragraphs[0]
	p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
	_set_rtl(p)
	run = p.add_run(str(rows_dict[r][c]))
	run.font.size = Pt(11)

	doc.add_paragraph()

	elif b_type == "image":
	img_file = block.get("image_file", "")
	if img_file and os.path.exists(img_file):
	doc.add_picture(img_file, width=Inches(4.5))
	last_paragraph = doc.paragraphs[-1]
	last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

	# التسمية
	if "caption" in block:
	caption = block["caption"]
	caption_text = (
	caption["text"]
	if isinstance(caption, dict)
	else str(caption)
	)
	p = doc.add_paragraph()
	p.alignment = WD_ALIGN_PARAGRAPH.CENTER
	run = p.add_run(caption_text)
	run.font.size = Pt(10)
	run.italic = True

	doc.save(output_docx)
	return output_docx


	def ocr_result_to_layout(ocr_json: dict, image_path: str = "") -> dict:
	"""
	تحويل مخرجات OCR القياسية إلى تنسيق layout_data.
	"""
	layout = {"image_path": image_path, "blocks": []}
	for block in ocr_json.get('blocks', []):
	nb = {
	"type": block.get('type', 'paragraph'),
	"bbox": block.get('bbox', [0, 0, 1, 1]),
	"text": block.get('text', ''),
	}
	if nb["type"] == 'table':
	nb["cells"] = block.get('cells', [])
	elif nb["type"] == 'image':
	nb["image_file"] = block.get('image_file', '')
	layout["blocks"].append(nb)
	return layout