File size: 3,885 Bytes
dc4e6da | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 | from pathlib import Path
from PIL import Image, ImageDraw, ImageFont
import pymupdf
from docgenie.generation.models import OCRBox
from docgenie.generation.models._bbox import LayoutBox
def is_in_rect(rect: dict, bbox: OCRBox, threshold: float, document_id: str | None = None):
    """Tell whether ``bbox`` lies completely inside ``rect`` grown by ``threshold``.

    :param rect: Mapping with the keys ``"x"``, ``"y"``, ``"width"`` and
        ``"height"`` describing the rectangle.
    :param bbox: Box exposing ``x0``, ``y0`` (first corner) and ``x2``, ``y2``
        (opposite corner).
    :param threshold: Margin added on every side of ``rect`` before testing.
    :param document_id: Not used by the check itself; kept so existing callers
        can keep passing it.
    :return: Truthy only when all four edges of ``bbox`` are within the
        enlarged rectangle (edges touching the border count as inside).
    """
    # Enlarged rectangle: the origin moves out by ``threshold`` and the extent
    # grows by ``2 * threshold`` so the far edge moves out by the same margin.
    min_x = rect["x"] - threshold
    min_y = rect["y"] - threshold
    max_x = min_x + rect["width"] + 2 * threshold
    max_y = min_y + rect["height"] + 2 * threshold

    fits_left, fits_top, fits_right, fits_bottom = (
        bbox.x0 >= min_x,
        bbox.y0 >= min_y,
        bbox.x2 <= max_x,
        bbox.y2 <= max_y,
    )
    return fits_left and fits_top and fits_right and fits_bottom
def save_bboxes(
    bboxes: list[OCRBox],
    bbox_path: Path,
):
    """Write one serialized box per line to ``bbox_path`` (UTF-8).

    Missing parent directories are created first. Each box is serialized with
    its ``as_string()`` method; lines are separated by ``"\\n"`` and no
    newline is written after the last one.

    :param bboxes: Boxes to serialize, in output order.
    :param bbox_path: Destination file; overwritten if it already exists.
    """
    bbox_path.parent.mkdir(parents=True, exist_ok=True)
    with bbox_path.open(mode="w", encoding="utf-8") as out:
        # Joining puts the separator only *between* entries, which is exactly
        # the "no trailing newline" layout of the file format.
        out.write("\n".join(entry.as_string() for entry in bboxes))
def read_syn_dataset_bbox_str(line: str) -> OCRBox:
    """Parse one serialized box line into an :class:`OCRBox`.

    The expected layout is ``x0,y0,x2,y2,text,block_no,line_no,word_no``.
    The four coordinates are peeled off from the left and the three integer
    indices from the right, so ``text`` may itself contain commas.

    :param line: A single line without its line terminator.
    :return: The box described by ``line``.
    """
    # At most four splits from the left: everything after the coordinates
    # stays together in the fifth element.
    head = line.split(",", 4)
    coords = [float(head[idx]) for idx in range(4)]

    # At most three splits from the right: the remainder on the left is the
    # (possibly comma-containing) text.
    tail = head[4].rsplit(",", 3)
    text = tail[0]
    indices = [int(tail[idx]) for idx in (1, 2, 3)]

    return OCRBox(
        x0=coords[0],
        y0=coords[1],
        x2=coords[2],
        y2=coords[3],
        text=text,
        block_no=indices[0],
        line_no=indices[1],
        word_no=indices[2],
    )
def read_syn_dataset_bboxes(box_path) -> list[OCRBox]:
    """
    Load all boxes stored in a synthetic-dataset bbox file.

    The file is read as UTF-8 text and every line is parsed with
    ``read_syn_dataset_bbox_str``.

    :param box_path: Object providing ``read_text`` (e.g. a ``pathlib.Path``).
    :return: Parsed boxes in file order; an empty list for an empty file.
    """
    content = box_path.read_text(encoding="utf-8")
    return [read_syn_dataset_bbox_str(entry) for entry in content.splitlines()]
def draw_pdf_bboxes_on_pdf(pdf_path, outpath: Path):
    """Outline every word PyMuPDF extracts from a PDF and save the result.

    For each page, the words returned by ``page.get_text("words")`` are
    outlined in red using their coordinates rounded to whole numbers. The
    rounded coordinates of every word are also printed to stdout as
    ``x0,y0,x1,y1``.

    :param pdf_path: PDF to open with ``pymupdf.open``.
    :param outpath: Where the annotated document is saved.
    """
    # The context manager closes the document even if drawing or saving
    # fails; previously the handle was never released.
    with pymupdf.open(pdf_path) as doc:
        for page in doc.pages():
            for word in page.get_text("words"):
                # Only the first four entries (the coordinates) are needed.
                x0, y0, x1, y1 = word[:4]
                corners = (round(x0), round(y0), round(x1), round(y1))
                print(",".join(str(value) for value in corners))
                page.draw_rect(pymupdf.Rect(corners), color=(1, 0, 0))  # Red box
        doc.save(outpath)
def draw_bboxes_on_pdf(
    pdf_path: Path, outpath: Path, bboxes: list[OCRBox], color=(1, 0, 0)
):
    """Draw the given boxes on every page of a PDF and save the result.

    Box coordinates are rounded to whole numbers before drawing.

    NOTE(review): the same ``bboxes`` are drawn on *each* page of the
    document. That is what the code has always done, but it only looks right
    for single-page PDFs -- confirm whether per-page filtering is wanted.

    :param pdf_path: PDF to open with ``pymupdf.open``.
    :param outpath: Where the annotated document is saved.
    :param bboxes: Boxes providing ``x0``, ``y0``, ``x2`` and ``y2``.
    :param color: Stroke colour passed to ``page.draw_rect``
        (default ``(1, 0, 0)``, red).
    """
    # The rectangles do not depend on the page, so build them once instead of
    # once per page.
    rects = [
        pymupdf.Rect(
            (round(bbox.x0), round(bbox.y0), round(bbox.x2), round(bbox.y2))
        )
        for bbox in bboxes
    ]
    # The context manager closes the document even if drawing or saving
    # fails; previously the handle was never released.
    with pymupdf.open(pdf_path) as doc:
        for page in doc.pages():
            for rect in rects:
                page.draw_rect(rect, color=color)
        doc.save(outpath)
def draw_bboxes_on_image(
    image, bboxes: list[OCRBox], color="red", width=3, show_text=True
) -> Image.Image:
    """
    Draws bounding boxes (and optionally their text) on a given Pillow image.

    The image is modified in place and also returned for convenience.

    :param image: Pillow Image object to draw on
    :param bboxes: Boxes providing ``x0``, ``y0``, ``x2``, ``y2`` and ``text``
    :param color: Outline color of the bounding box (default: red)
    :param width: Outline width (default: 3)
    :param show_text: When true, write each box's ``text`` at the box's
        top-left corner in magenta (default: True)
    :return: The same image object, with the boxes drawn on it
    """
    draw = ImageDraw.Draw(image)
    # Load the label font once instead of once per box, and only when a label
    # will actually be drawn.
    font = ImageFont.load_default(32) if show_text and bboxes else None
    bbox: OCRBox
    for bbox in bboxes:
        corners = (bbox.x0, bbox.y0, bbox.x2, bbox.y2)
        draw.rectangle(corners, outline=color, width=width)
        if show_text:
            # ``ImageDraw.text`` expects an (x, y) anchor, not a four-value
            # box, so pass only the top-left corner.
            draw.text((bbox.x0, bbox.y0), bbox.text, (255, 0, 255), font=font)
    return image
|