Spaces:

DrAbdulmalek
/

OmniFile-Processor

Sleeping

File size: 9,150 Bytes

900df0b

"""
layout_preserving.py — تصدير DOCX/HTML مع الحفاظ على التخطيط البصري.
يقبل layout_data بصيغة JSON ويُعيد مستنداً يحاكي الكتابة الحاسوبية.

يدعم تنسيقين:
1. التنسيق البسيط (layout_data): {"image_path": "...", "blocks": [...]}
2. الهيكل القياسي (normalized): {"metadata": {...}, "pages": [{"blocks": [...]}]}

المؤلف: Dr Abdulmalek Tamer Al-husseini
الترخيص: MIT
"""
import json
import os
from pathlib import Path
from docx import Document
from docx.shared import Inches, Pt, Cm
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn


def _set_rtl(paragraph):
    """ضبط اتجاه الفقرة RTL."""
    pPr = paragraph._element.get_or_add_pPr()
    pPr.set(qn('w:bidi'), '1')


def export_to_docx(layout_data: dict, output_path: str) -> str:
    """
    تصدير layout_data إلى ملف DOCX يحافظ على البنية البصرية.
    يُرجع مسار الملف المُنشأ.

    التنسيق البسيط:
        layout_data = {
            "image_path": "...",
            "blocks": [{"type": "paragraph", "bbox": [...], "text": "..."}, ...]
        }
    """
    doc = Document()

    # هوامش موحدة
    for section in doc.sections:
        section.top_margin = Cm(2)
        section.bottom_margin = Cm(2)
        section.left_margin = Cm(2.5)
        section.right_margin = Cm(2.5)

    # نمط افتراضي RTL
    style = doc.styles['Normal']
    style.font.size = Pt(12)
    rPr = style.element.get_or_add_rPr()
    rPr.set(qn('w:rtl'), '1')

    for block in layout_data.get('blocks', []):
        btype = block.get('type', 'paragraph')

        if btype in ('paragraph', 'caption'):
            p = doc.add_paragraph()
            p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
            _set_rtl(p)
            run = p.add_run(block.get('text', ''))
            run.font.size = Pt(10 if btype == 'caption' else 12)
            if btype == 'caption':
                run.italic = True

        elif btype == 'header':
            p = doc.add_heading(block.get('text', ''), level=2)
            p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
            _set_rtl(p)

        elif btype == 'table':
            cells = block.get('cells', [])
            if not cells:
                continue
            rows, cols = len(cells), max(len(r) for r in cells)
            tbl = doc.add_table(rows=rows, cols=cols, style='Table Grid')
            tbl.alignment = WD_ALIGN_PARAGRAPH.RIGHT
            for i, row in enumerate(cells):
                for j, cell_text in enumerate(row):
                    if j < cols:
                        c = tbl.cell(i, j)
                        c.text = ''
                        p = c.paragraphs[0]
                        p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
                        _set_rtl(p)
                        p.add_run(str(cell_text)).font.size = Pt(11)
            doc.add_paragraph()  # مسافة بعد الجدول

        elif btype == 'image':
            img_file = block.get('image_file', '')
            if img_file and os.path.exists(img_file):
                doc.add_picture(img_file, width=Inches(4.5))
                doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER

    doc.save(output_path)
    return output_path


def layout_to_docx(layout_json_path: str, output_docx: str) -> str:
    """
    تصدير النتائج إلى DOCX مع الحفاظ على التخطيط.
    يعمل على ملف JSON بالهيكل القياسي (metadata + pages + blocks).

    هذا هو التنسيق المُنتج من modules.vision.normalize.normalize_ocr_output().

    Args:
        layout_json_path: مسار ملف JSON بالهيكل القياسي
        output_docx: مسار ملف DOCX المطلوب

    Returns:
        مسار ملف DOCX المُنشأ
    """
    with open(layout_json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    doc = Document()

    # هوامش موحدة
    for section in doc.sections:
        section.top_margin = Cm(2)
        section.bottom_margin = Cm(2)
        section.left_margin = Cm(2.5)
        section.right_margin = Cm(2.5)

    # نمط افتراضي RTL
    style = doc.styles['Normal']
    style.font.size = Pt(12)
    rPr = style.element.get_or_add_rPr()
    rPr.set(qn('w:rtl'), '1')

    for page in data.get("pages", []):
        page_w = page.get("width", 2480)
        page_h = page.get("height", 3508)

        for block in page.get("blocks", []):
            b_type = block.get("type", "paragraph")

            if b_type == "paragraph":
                rtl = block.get("direction", "").lower() == "rtl"
                p = doc.add_paragraph()
                if rtl:
                    p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
                _set_rtl(p)
                run = p.add_run(block.get("text", ""))
                run.font.size = Pt(12)

            elif b_type == "header":
                p = doc.add_heading(block.get("text", ''), level=2)
                p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
                _set_rtl(p)

            elif b_type == "table":
                cells_struct = block.get("structure", {}).get("cells", [])
                if not cells_struct:
                    # محاولة استخدام cells البسيط (للتوافق)
                    simple_cells = block.get("cells", [])
                    if simple_cells:
                        rows, cols = len(simple_cells), max(len(r) for r in simple_cells)
                        tbl = doc.add_table(rows=rows, cols=cols, style='Table Grid')
                        tbl.alignment = WD_ALIGN_PARAGRAPH.RIGHT
                        for i, row in enumerate(simple_cells):
                            for j, cell_text in enumerate(row):
                                if j < cols:
                                    c = tbl.cell(i, j)
                                    c.text = ''
                                    p = c.paragraphs[0]
                                    p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
                                    _set_rtl(p)
                                    p.add_run(str(cell_text)).font.size = Pt(11)
                        doc.add_paragraph()
                    continue

                # إعادة بناء صفوف/أعمدة من الهيكل القياسي
                rows_dict: dict[int, dict[int, str]] = {}
                for cell in cells_struct:
                    r = cell["row"]
                    c = cell["col"]
                    rows_dict.setdefault(r, {})[c] = cell["text"]

                if not rows_dict:
                    continue

                max_col = max(max(row.keys()) for row in rows_dict.values()) + 1
                num_rows = len(rows_dict)

                tbl = doc.add_table(
                    rows=num_rows, cols=max_col, style='Table Grid'
                )
                tbl.alignment = WD_ALIGN_PARAGRAPH.RIGHT

                for r in sorted(rows_dict.keys()):
                    for c in rows_dict[r]:
                        cell = tbl.cell(r, c)
                        cell.text = ''
                        p = cell.paragraphs[0]
                        p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
                        _set_rtl(p)
                        run = p.add_run(str(rows_dict[r][c]))
                        run.font.size = Pt(11)

                doc.add_paragraph()

            elif b_type == "image":
                img_file = block.get("image_file", "")
                if img_file and os.path.exists(img_file):
                    doc.add_picture(img_file, width=Inches(4.5))
                    last_paragraph = doc.paragraphs[-1]
                    last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

                # التسمية
                if "caption" in block:
                    caption = block["caption"]
                    caption_text = (
                        caption["text"]
                        if isinstance(caption, dict)
                        else str(caption)
                    )
                    p = doc.add_paragraph()
                    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
                    run = p.add_run(caption_text)
                    run.font.size = Pt(10)
                    run.italic = True

    doc.save(output_docx)
    return output_docx


def ocr_result_to_layout(ocr_json: dict, image_path: str = "") -> dict:
    """
    تحويل مخرجات OCR القياسية إلى تنسيق layout_data.
    """
    layout = {"image_path": image_path, "blocks": []}
    for block in ocr_json.get('blocks', []):
        nb = {
            "type": block.get('type', 'paragraph'),
            "bbox": block.get('bbox', [0, 0, 1, 1]),
            "text": block.get('text', ''),
        }
        if nb["type"] == 'table':
            nb["cells"] = block.get('cells', [])
        elif nb["type"] == 'image':
            nb["image_file"] = block.get('image_file', '')
        layout["blocks"].append(nb)
    return layout