"""
layout_preserving.py — تصدير DOCX/HTML مع الحفاظ على التخطيط البصري.
يقبل layout_data بصيغة JSON ويُعيد مستنداً يحاكي الكتابة الحاسوبية.
يدعم تنسيقين:
1. التنسيق البسيط (layout_data): {"image_path": "...", "blocks": [...]}
2. الهيكل القياسي (normalized): {"metadata": {...}, "pages": [{"blocks": [...]}]}
المؤلف: Dr Abdulmalek Tamer Al-husseini
الترخيص: MIT
"""
import json
import os
from pathlib import Path
from docx import Document
from docx.shared import Inches, Pt, Cm
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
def _set_rtl(paragraph):
"""ضبط اتجاه الفقرة RTL."""
pPr = paragraph._element.get_or_add_pPr()
pPr.set(qn('w:bidi'), '1')
def export_to_docx(layout_data: dict, output_path: str) -> str:
"""
تصدير layout_data إلى ملف DOCX يحافظ على البنية البصرية.
يُرجع مسار الملف المُنشأ.
التنسيق البسيط:
layout_data = {
"image_path": "...",
"blocks": [{"type": "paragraph", "bbox": [...], "text": "..."}, ...]
}
"""
doc = Document()
# هوامش موحدة
for section in doc.sections:
section.top_margin = Cm(2)
section.bottom_margin = Cm(2)
section.left_margin = Cm(2.5)
section.right_margin = Cm(2.5)
# نمط افتراضي RTL
style = doc.styles['Normal']
style.font.size = Pt(12)
rPr = style.element.get_or_add_rPr()
rPr.set(qn('w:rtl'), '1')
for block in layout_data.get('blocks', []):
btype = block.get('type', 'paragraph')
if btype in ('paragraph', 'caption'):
p = doc.add_paragraph()
p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
_set_rtl(p)
run = p.add_run(block.get('text', ''))
run.font.size = Pt(10 if btype == 'caption' else 12)
if btype == 'caption':
run.italic = True
elif btype == 'header':
p = doc.add_heading(block.get('text', ''), level=2)
p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
_set_rtl(p)
elif btype == 'table':
cells = block.get('cells', [])
if not cells:
continue
rows, cols = len(cells), max(len(r) for r in cells)
tbl = doc.add_table(rows=rows, cols=cols, style='Table Grid')
tbl.alignment = WD_ALIGN_PARAGRAPH.RIGHT
for i, row in enumerate(cells):
for j, cell_text in enumerate(row):
if j < cols:
c = tbl.cell(i, j)
c.text = ''
p = c.paragraphs[0]
p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
_set_rtl(p)
p.add_run(str(cell_text)).font.size = Pt(11)
doc.add_paragraph() # مسافة بعد الجدول
elif btype == 'image':
img_file = block.get('image_file', '')
if img_file and os.path.exists(img_file):
doc.add_picture(img_file, width=Inches(4.5))
doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
doc.save(output_path)
return output_path
def layout_to_docx(layout_json_path: str, output_docx: str) -> str:
"""
تصدير النتائج إلى DOCX مع الحفاظ على التخطيط.
يعمل على ملف JSON بالهيكل القياسي (metadata + pages + blocks).
هذا هو التنسيق المُنتج من modules.vision.normalize.normalize_ocr_output().
Args:
layout_json_path: مسار ملف JSON بالهيكل القياسي
output_docx: مسار ملف DOCX المطلوب
Returns:
مسار ملف DOCX المُنشأ
"""
with open(layout_json_path, 'r', encoding='utf-8') as f:
data = json.load(f)
doc = Document()
# هوامش موحدة
for section in doc.sections:
section.top_margin = Cm(2)
section.bottom_margin = Cm(2)
section.left_margin = Cm(2.5)
section.right_margin = Cm(2.5)
# نمط افتراضي RTL
style = doc.styles['Normal']
style.font.size = Pt(12)
rPr = style.element.get_or_add_rPr()
rPr.set(qn('w:rtl'), '1')
for page in data.get("pages", []):
page_w = page.get("width", 2480)
page_h = page.get("height", 3508)
for block in page.get("blocks", []):
b_type = block.get("type", "paragraph")
if b_type == "paragraph":
rtl = block.get("direction", "").lower() == "rtl"
p = doc.add_paragraph()
if rtl:
p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
_set_rtl(p)
run = p.add_run(block.get("text", ""))
run.font.size = Pt(12)
elif b_type == "header":
p = doc.add_heading(block.get("text", ''), level=2)
p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
_set_rtl(p)
elif b_type == "table":
cells_struct = block.get("structure", {}).get("cells", [])
if not cells_struct:
# محاولة استخدام cells البسيط (للتوافق)
simple_cells = block.get("cells", [])
if simple_cells:
rows, cols = len(simple_cells), max(len(r) for r in simple_cells)
tbl = doc.add_table(rows=rows, cols=cols, style='Table Grid')
tbl.alignment = WD_ALIGN_PARAGRAPH.RIGHT
for i, row in enumerate(simple_cells):
for j, cell_text in enumerate(row):
if j < cols:
c = tbl.cell(i, j)
c.text = ''
p = c.paragraphs[0]
p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
_set_rtl(p)
p.add_run(str(cell_text)).font.size = Pt(11)
doc.add_paragraph()
continue
# إعادة بناء صفوف/أعمدة من الهيكل القياسي
rows_dict: dict[int, dict[int, str]] = {}
for cell in cells_struct:
r = cell["row"]
c = cell["col"]
rows_dict.setdefault(r, {})[c] = cell["text"]
if not rows_dict:
continue
max_col = max(max(row.keys()) for row in rows_dict.values()) + 1
num_rows = len(rows_dict)
tbl = doc.add_table(
rows=num_rows, cols=max_col, style='Table Grid'
)
tbl.alignment = WD_ALIGN_PARAGRAPH.RIGHT
for r in sorted(rows_dict.keys()):
for c in rows_dict[r]:
cell = tbl.cell(r, c)
cell.text = ''
p = cell.paragraphs[0]
p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
_set_rtl(p)
run = p.add_run(str(rows_dict[r][c]))
run.font.size = Pt(11)
doc.add_paragraph()
elif b_type == "image":
img_file = block.get("image_file", "")
if img_file and os.path.exists(img_file):
doc.add_picture(img_file, width=Inches(4.5))
last_paragraph = doc.paragraphs[-1]
last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
# التسمية
if "caption" in block:
caption = block["caption"]
caption_text = (
caption["text"]
if isinstance(caption, dict)
else str(caption)
)
p = doc.add_paragraph()
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
run = p.add_run(caption_text)
run.font.size = Pt(10)
run.italic = True
doc.save(output_docx)
return output_docx
def ocr_result_to_layout(ocr_json: dict, image_path: str = "") -> dict:
"""
تحويل مخرجات OCR القياسية إلى تنسيق layout_data.
"""
layout = {"image_path": image_path, "blocks": []}
for block in ocr_json.get('blocks', []):
nb = {
"type": block.get('type', 'paragraph'),
"bbox": block.get('bbox', [0, 0, 1, 1]),
"text": block.get('text', ''),
}
if nb["type"] == 'table':
nb["cells"] = block.get('cells', [])
elif nb["type"] == 'image':
nb["image_file"] = block.get('image_file', '')
layout["blocks"].append(nb)
return layout