File size: 3,885 Bytes
dc4e6da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
from pathlib import Path

from PIL import Image, ImageDraw, ImageFont
import pymupdf

from docgenie.generation.models import OCRBox
from docgenie.generation.models._bbox import LayoutBox


def is_in_rect(rect: dict, bbox: OCRBox, threshold: float, document_id: str | None = None):
    """
    Report whether ``bbox`` lies completely inside ``rect`` grown by ``threshold``.

    The rectangle is enlarged by ``threshold`` on every side before the
    comparison, so a box that overshoots an edge by at most ``threshold``
    still counts as contained. Edges that exactly touch count as inside.

    :param rect: Mapping with the keys ``"x"``, ``"y"``, ``"width"`` and ``"height"``
    :param bbox: Box whose ``x0``, ``y0``, ``x2`` and ``y2`` attributes are tested
    :param threshold: Tolerance added around the rectangle, in coordinate units
    :param document_id: Not used by the containment test itself
    :return: ``True`` if all four edges of ``bbox`` are within the enlarged rectangle
    """
    # Lower bounds of the enlarged rectangle.
    min_x = rect["x"] - threshold
    min_y = rect["y"] - threshold
    # Upper bounds: start from the shifted origin, so the tolerance has to be
    # added twice to end up ``threshold`` beyond the original far edge.
    max_x = min_x + rect["width"] + 2 * threshold
    max_y = min_y + rect["height"] + 2 * threshold

    edge_checks = (
        bbox.x0 >= min_x,
        bbox.y0 >= min_y,
        bbox.x2 <= max_x,
        bbox.y2 <= max_y,
    )
    return all(edge_checks)


def save_bboxes(
    bboxes: list[OCRBox],
    bbox_path: Path,
):
    """
    Write the boxes to ``bbox_path``, one ``as_string()`` result per line.

    Missing parent directories are created first. Lines are separated by
    ``"\\n"`` and the last line gets no trailing newline, so an empty list
    produces an empty file.

    :param bboxes: Boxes to serialize; each must provide ``as_string()``
    :param bbox_path: Destination file, written as UTF-8 and overwritten if present
    """
    bbox_path.parent.mkdir(parents=True, exist_ok=True)
    final_index = len(bboxes) - 1
    with bbox_path.open(mode="w", encoding="utf-8") as handle:
        # Every entry except the last one is terminated with a newline.
        handle.writelines(
            box.as_string() + ("\n" if index < final_index else "")
            for index, box in enumerate(bboxes)
        )


def read_syn_dataset_bbox_str(line: str) -> OCRBox:
    """
    Parse one serialized bounding-box line into an ``OCRBox``.

    The expected layout is ``x0,y0,x2,y2,text,block_no,line_no,word_no``.
    The four coordinates are split off from the left and the three integer
    indices from the right, so the text in between may itself contain commas.

    :param line: A single line without its trailing newline
    :return: The ``OCRBox`` described by the line
    """
    # Split at most four times from the left: everything after the fourth
    # comma (text plus trailing indices) stays together in the last field.
    head = line.split(",", 4)
    x0, y0, x2, y2 = (float(head[position]) for position in range(4))

    # Split at most three times from the right to peel off the indices
    # without touching commas that belong to the text.
    tail = head[4].rsplit(",", 3)
    block_no, line_no, word_no = (int(tail[position]) for position in (1, 2, 3))

    return OCRBox(
        x0=x0,
        y0=y0,
        x2=x2,
        y2=y2,
        text=tail[0],
        block_no=block_no,
        line_no=line_no,
        word_no=word_no,
    )


def read_syn_dataset_bboxes(box_path) -> list[OCRBox]:
    """
    Load all bounding boxes stored in a synthetic-dataset box file.

    The file is read as UTF-8 and every line is parsed with
    ``read_syn_dataset_bbox_str``.

    :param box_path: Path-like object providing ``read_text``
    :return: One ``OCRBox`` per line, in file order
    """
    content = box_path.read_text(encoding="utf-8")
    return [read_syn_dataset_bbox_str(entry) for entry in content.splitlines()]


def draw_pdf_bboxes_on_pdf(pdf_path, outpath: Path):
    """
    Outline every word PyMuPDF extracts from a PDF and save the result.

    For each page the entries of ``page.get_text("words")`` are read, their
    coordinates are rounded to integers, printed to stdout as a
    comma-separated line, and drawn as a red rectangle on the page.

    :param pdf_path: PDF to open
    :param outpath: Where the annotated PDF is written
    """
    # The context manager closes the document even if drawing or saving fails.
    with pymupdf.open(pdf_path) as doc:
        for page in doc.pages():
            for word in page.get_text("words"):
                # The first four items of a "words" entry are x0, y0, x1, y1.
                corners = tuple(round(value) for value in word[:4])
                print(",".join(str(value) for value in corners))
                page.draw_rect(pymupdf.Rect(corners), color=(1, 0, 0))  # red outline

        # Save once after all pages are annotated, not once per page.
        doc.save(outpath)


def draw_bboxes_on_pdf(
    pdf_path: Path, outpath: Path, bboxes: list[OCRBox], color=(1, 0, 0)
):
    """
    Draw the given boxes on every page of a PDF and save the result.

    Box coordinates are rounded to integers before drawing. The same list of
    boxes is drawn on each page of the document.
    NOTE(review): boxes carry no page association here, so this is presumably
    intended for single-page documents; confirm with callers.

    :param pdf_path: PDF to open
    :param outpath: Where the annotated PDF is written
    :param bboxes: Boxes to outline, read via ``x0``, ``y0``, ``x2``, ``y2``
    :param color: Outline colour passed to ``page.draw_rect`` (default: red)
    """
    # The rectangles do not depend on the page, so build them only once.
    rects = [
        pymupdf.Rect(round(bbox.x0), round(bbox.y0), round(bbox.x2), round(bbox.y2))
        for bbox in bboxes
    ]

    # The context manager closes the document even if drawing or saving fails.
    with pymupdf.open(pdf_path) as doc:
        for page in doc.pages():
            for rect in rects:
                page.draw_rect(rect, color=color)

        # Save once after all pages are annotated, not once per page.
        doc.save(outpath)


def draw_bboxes_on_image(
    image, bboxes: list[OCRBox], color="red", width=3, show_text=True
) -> Image.Image:
    """
    Draws bounding boxes, and optionally their text, on a given Pillow image.

    The image is modified in place and the same object is returned.

    :param image: Pillow Image object to draw on
    :param bboxes: Boxes to draw, read via ``x0``, ``y0``, ``x2``, ``y2`` and ``text``
    :param color: Color of the bounding box outline (default: red)
    :param width: Line width of the outline (default: 3)
    :param show_text: If true, write each box's text at its top-left corner
    :return: The input image with bounding boxes drawn on it
    """
    draw = ImageDraw.Draw(image)

    # The font is the same for every box, so load it once instead of per box.
    font = ImageFont.load_default(32) if show_text else None

    for bbox in bboxes:
        draw.rectangle((bbox.x0, bbox.y0, bbox.x2, bbox.y2), outline=color, width=width)

        if show_text:
            # ``text`` expects an (x, y) anchor point, not the full rectangle.
            draw.text((bbox.x0, bbox.y0), bbox.text, (255, 0, 255), font=font)

    return image