File size: 4,860 Bytes
dc4e6da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import json
import pathlib
import shutil

import fitz
from docgenie.generation.constants import IMAGE_RENDER_EXT, PDF_DPI
from docgenie.generation.models import (
    OCRBox,
    PipelineParameters,
    SynDatasetDefinition,
    SynDocumentLog,
)
from rich.progress import (
    Progress,
    TimeElapsedColumn,
    BarColumn,
    TaskProgressColumn,
    TimeRemainingColumn,
)
from PIL import Image

from docgenie.generation.utils.bboxes import (
    draw_bboxes_on_image,
    draw_bboxes_on_pdf,
    read_syn_dataset_bboxes,
    save_bboxes,
)
from docgenie.generation.utils.geos import rect_to_ocrbox
from docgenie.generation.utils.log import log_pipeline_level
from docgenie.generation.utils.status import get_progress_bar


def mm_to_px(mm: int | float):
    return mm * 72 / 25.4


def draw_visual_elements_debug(dsdef: SynDatasetDefinition, docid: str):
    """Draw the visual-element rectangles of one document onto a debug PDF.

    Reads ``<docid>.json`` from the visual element definitions directory,
    converts the ``rect`` of every entry whose ``error`` is ``None`` into a
    box, and hands the boxes to ``draw_bboxes_on_pdf`` with the final PDF as
    input and the visual-elements debug directory as output.

    Args:
        dsdef: Dataset definition providing the file structure.
        docid: Identifier of the document (used as the file stem).
    """
    files = dsdef.get_file_structure()
    definitions_file = files.visual_element_definitions_directory / f"{docid}.json"
    entries = json.loads(definitions_file.read_text(encoding="utf-8"))

    # Entries that recorded an error are left out of the drawing.
    boxes = [
        rect_to_ocrbox(entry["rect"]) for entry in entries if entry["error"] is None
    ]

    source_pdf = files.final_pdf_directory / f"{docid}.pdf"
    target_pdf = files.debug_pdf_visual_elements_directory / f"{docid}.pdf"
    # visual elements blue
    draw_bboxes_on_pdf(source_pdf, target_pdf, boxes, color=(0, 0, 1))


def unnormalize_bboxes(bboxes: list[OCRBox], width: float, height: float):
    """Lazily scale each box by the given width and height.

    For every input box a new ``OCRBox`` is yielded whose x coordinates are
    multiplied by ``width`` and whose y coordinates are multiplied by
    ``height``. The text and the block/line/word numbers are carried over
    unchanged. The input boxes are not modified.

    Args:
        bboxes: Boxes to scale (presumably normalized to [0, 1] -- confirm
            against the producer of these boxes).
        width: Factor applied to ``x0`` and ``x2``.
        height: Factor applied to ``y0`` and ``y2``.

    Yields:
        One scaled ``OCRBox`` per input box, in input order.
    """
    for box in bboxes:
        left = box.x0 * width
        top = box.y0 * height
        right = box.x2 * width
        bottom = box.y2 * height
        yield OCRBox(
            x0=left,
            y0=top,
            x2=right,
            y2=bottom,
            text=box.text,
            block_no=box.block_no,
            line_no=box.line_no,
            word_no=box.word_no,
        )


def draw_bbox_final_debug(dsdef: SynDatasetDefinition, docid: str):
    """Render the final segment-level boxes of a document onto its image.

    Loads the normalized segment-level boxes for ``docid``, scales them by
    the rendered image's width and height, draws them (with text) on the
    image and saves the result into the final-bboxes debug directory.

    Drawing or saving failures are reported on stdout and otherwise ignored.
    Failures while reading the boxes or opening the image propagate to the
    caller.

    Args:
        dsdef: Dataset definition providing the file structure.
        docid: Identifier of the document (used as the file stem).
    """
    dsfiles = dsdef.get_file_structure()

    bbox_norm_path = dsfiles.get_final_normalized_bbox_path(
        level="segment", doc_id=docid
    )
    bbox_norm = read_syn_dataset_bboxes(bbox_norm_path)

    img_path = dsfiles.img_directory / f"{docid}.{IMAGE_RENDER_EXT}"
    # Use the image as a context manager so its file handle is released
    # deterministically; this function runs once per document.
    with Image.open(img_path) as img:
        width, height = img.size
        bbox_unnorm = list(
            unnormalize_bboxes(bboxes=bbox_norm, width=width, height=height)
        )

        try:
            # Draw and save while the source image is still open.
            img_altered = draw_bboxes_on_image(img, bbox_unnorm, show_text=True)
            img_altered.save(
                dsfiles.debug_pdf_bboxes_final_directory
                / f"{docid}.{IMAGE_RENDER_EXT}"
            )
        except Exception as err:
            # Best effort: a failed debug rendering must not stop the pipeline.
            print(f"[ERROR]: Skipping debug PDF: {str(err)}")


def draw_bbox_debug(dsdef: SynDatasetDefinition, docid: str):
    """Draw the word-level PDF boxes of a document onto its initial PDF.

    Reads the word-level boxes for ``docid`` and passes them to
    ``draw_bboxes_on_pdf`` with the initial PDF as input and the bboxes debug
    directory as output. Any exception raised while drawing is printed and
    swallowed.

    Args:
        dsdef: Dataset definition providing the file structure.
        docid: Identifier of the document (used as the file stem).
    """
    files = dsdef.get_file_structure()

    word_boxes = read_syn_dataset_bboxes(
        files.get_pdf_bbox_path(level="word", doc_id=docid)
    )

    source_pdf = files.pdf_initial_directory / f"{docid}.pdf"
    target_pdf = files.debug_pdf_bboxes_directory / f"{docid}.pdf"

    try:
        draw_bboxes_on_pdf(pdf_path=source_pdf, outpath=target_pdf, bboxes=word_boxes)
    except Exception as exc:
        # Best effort: report the failure and carry on.
        print(f"[ERROR]: Skipping debug PDF: {str(exc)}")


def pipeline_create_debug_data(params: PipelineParameters):
    """Create the debug artefacts for every eligible document of a dataset.

    Only documents whose log reports exactly one PDF page are processed. For
    each of them the raw HTML is copied into the raw-HTML debug directory,
    the visual-element debug PDF is drawn when the document has visual
    elements, and the final bounding-box debug image is drawn when OCR
    results were found. Finally ``debug.js``, located next to this module, is
    copied into the raw-HTML debug directory.

    Args:
        params: Pipeline parameters; only ``params.dsdef`` is used here.
    """
    log_pipeline_level()

    dsdef = params.dsdef
    dsfiles = dsdef.get_file_structure()

    # Get valid documents: debug drawing is restricted to single-page PDFs.
    valid_documents: list[SynDocumentLog] = [
        doclog for doclog in dsdef.get_document_logs() if doclog.pdf_num_pages == 1
    ]

    print(f"Found {len(valid_documents)} documents valid for debug PDF/Img drawing.")

    with get_progress_bar() as progress:
        task = progress.add_task(
            "[white]Draw Debug PDF/Images...", total=len(valid_documents)
        )

        for doclog in valid_documents:
            docid = doclog.document_id

            # Copy raw HTML to debug directory
            src = dsfiles.raw_html_directory / f"{docid}.html"
            tgt = dsfiles.debug_html_raw_directory / f"{docid}.html"
            shutil.copy(src, tgt)

            if doclog.visual_elements_num_elements > 0:
                draw_visual_elements_debug(dsdef=dsdef, docid=docid)

            # Handwriting debug is created when handwriting is inserted

            if doclog.ocr_found:
                draw_bbox_final_debug(dsdef=dsdef, docid=docid)

            progress.update(task, advance=1)

        # Copy debug script into debug html directory
        debug_script_fname = "debug.js"
        src_path = pathlib.Path(__file__).parent / debug_script_fname
        dst_path = dsfiles.debug_html_raw_directory / debug_script_fname
        shutil.copy(src_path, dst_path)