# field_semantic_mapping / utils_geometry.py
# Tanishq Salkar
# initial visual mapping code added to hf
# db81e28
import os
import json
import shutil
import fitz # PyMuPDF
import io
from PIL import Image, ImageDraw, ImageFont
import config
def setup_debug_dir():
    """Reset the debug output directory to an empty state.

    Removes ``config.DEBUG_DIR`` together with its contents when it already
    exists, creates it again, and prints a confirmation line.
    """
    debug_dir = config.DEBUG_DIR

    # Drop leftovers from a previous run before recreating the folder.
    if os.path.exists(debug_dir):
        shutil.rmtree(debug_dir)

    os.makedirs(debug_dir)
    print(f"Debug directory cleared: {debug_dir}/")
def save_debug_image(image_bytes, name):
    """Write already-encoded image bytes to ``<config.DEBUG_DIR>/<name>.jpg``.

    Args:
        image_bytes: Bytes written to disk unchanged.
        name: File name without the extension.

    Returns:
        The path of the file that was written.
    """
    file_name = f"{name}.jpg"
    target = os.path.join(config.DEBUG_DIR, file_name)

    with open(target, "wb") as handle:
        handle.write(image_bytes)

    return target
def save_debug_json(data, name):
    """Dump ``data`` as indented JSON to ``<config.DEBUG_DIR>/<name>.json``.

    Args:
        data: Any object ``json.dump`` can serialize.
        name: File name without the extension.

    Returns:
        The path of the written file, matching what ``save_debug_image``
        returns for images.
    """
    path = os.path.join(config.DEBUG_DIR, f"{name}.json")

    # Explicit encoding keeps the output independent of the platform locale.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)

    return path
def normalize_bbox_to_top_left(bbox, page_height):
    """Flip a bottom-left-origin bounding box into top-left-origin space.

    Args:
        bbox: Mapping with ``x0``, ``y0``, ``x1`` and ``y1`` entries.
        page_height: Page height in the same units as ``bbox``.

    Returns:
        A new dict with the same four keys. The x values are copied; each y
        value is measured from the top of the page instead of the bottom, so
        the original ``y1`` edge becomes the new ``y0`` and vice versa.
    """
    left = bbox["x0"]
    top = page_height - bbox["y1"]
    right = bbox["x1"]
    bottom = page_height - bbox["y0"]
    return dict(x0=left, y0=top, x1=right, y1=bottom)
def get_words_from_page(page):
    """Return the word-level text extraction of ``page``.

    Thin wrapper: asks the page for its ``"words"`` text output and hands
    the result back untouched.
    """
    words = page.get_text("words")
    return words
def calculate_smart_anchors(
    field_bbox,
    words,
    page_height,
    *,
    search_radius=150,
    y_alignment_tolerance=12,
    max_words=4,
):
    """Collect the words nearest to a field as left/right/above text anchors.

    Args:
        field_bbox: Dict with ``x0``/``y0``/``x1``/``y1`` in bottom-left-origin
            coordinates; it is flipped with ``normalize_bbox_to_top_left``
            before being compared against the words.
        words: Iterable of sequences indexed as ``(x0, y0, x1, y1, text, ...)``.
            Presumably the output of ``get_words_from_page`` in top-left-origin
            coordinates -- confirm against the caller.
        page_height: Page height used for the y flip of ``field_bbox``.
        search_radius: Maximum gap between the field edge and a word for the
            word to be considered.
        y_alignment_tolerance: Maximum vertical distance between the word
            centre and the field centre for left/right candidates.
        max_words: Maximum number of words kept per direction.

    Returns:
        Dict with ``"left"``, ``"right"`` and ``"above"`` keys, each a
        space-joined string of at most ``max_words`` words (empty when no
        word qualifies). ``"left"`` and ``"right"`` are emitted in
        left-to-right order; ``"above"`` is ordered by increasing vertical
        gap to the field.
    """
    norm_bbox = normalize_bbox_to_top_left(field_bbox, page_height)
    fx0, fy0, fx1, fy1 = (norm_bbox[key] for key in ("x0", "y0", "x1", "y1"))
    # The field centre does not depend on the word, so compute it once.
    f_center_y = (fy0 + fy1) / 2

    closest_left = []
    closest_right = []
    closest_above = []

    for w in words:
        wx0, wy0, wx1, wy1, text = w[0], w[1], w[2], w[3], w[4]
        w_center_y = (wy0 + wy1) / 2
        same_row = abs(w_center_y - f_center_y) < y_alignment_tolerance

        # Left: word ends before the field starts, on roughly the same row.
        if same_row and wx1 < fx0 and fx0 - wx1 < search_radius:
            closest_left.append((fx0 - wx1, text))

        # Right: word starts after the field ends, on roughly the same row.
        if same_row and wx0 > fx1 and wx0 - fx1 < search_radius:
            closest_right.append((wx0 - fx1, text))

        # Above: word sits over the field and shares some horizontal extent.
        overlap = min(fx1, wx1) - max(fx0, wx0)
        if overlap > 0 and wy1 < fy0 and fy0 - wy1 < search_radius:
            closest_above.append((fy0 - wy1, text))

    def nearest_texts(candidates):
        # Stable sort on the gap only, so equally distant words keep the
        # order in which they appeared in ``words``.
        ranked = sorted(candidates, key=lambda item: item[0])
        return [text for _, text in ranked[:max_words]]

    # Nearest-first on the left side runs right-to-left, which would turn a
    # label such as "Date of Birth" into "Birth of Date"; reverse the selected
    # words so the label reads in its natural order.
    left_texts = nearest_texts(closest_left)
    left_texts.reverse()

    return {
        "left": " ".join(left_texts),
        "right": " ".join(nearest_texts(closest_right)),
        "above": " ".join(nearest_texts(closest_above)),
    }
def render_hollow_debug_image(doc, page_num, fields):
    """Render one page to JPEG with every field outlined and badge-labelled.

    Args:
        doc: Document supporting ``len()`` and indexing by page number
            (presumably a PyMuPDF document -- confirm against the caller).
        page_num: Zero-based index of the page to render.
        fields: Iterable of dicts, each with a ``temp_id`` and a ``bbox``
            dict (``x0``/``y0``/``x1``/``y1``). The bbox y values are flipped
            against the page height before drawing, i.e. they are treated as
            bottom-left-origin coordinates.

    Returns:
        The annotated page as JPEG bytes, or ``None`` when ``page_num`` is
        past the last page of ``doc``.
    """
    if page_num >= len(doc):
        return None

    page = doc[page_num]
    zoom = 2.0
    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    draw = ImageDraw.Draw(img)

    # Page units -> rendered pixels.
    scale_x = pix.width / page.rect.width
    scale_y = pix.height / page.rect.height
    page_h = page.rect.height

    # Best effort: fall back to the built-in font when Arial cannot be
    # loaded, without swallowing unrelated errors or interrupts.
    try:
        font = ImageFont.truetype("arial.ttf", 30)
    except (OSError, ImportError):
        font = ImageFont.load_default()

    badge_w, badge_h = 50, 35

    for f in fields:
        vis_id = f["temp_id"]
        bbox = f["bbox"]

        # Flip y to the image's top-left origin, then scale to pixels.
        x0_bl = bbox["x0"] * scale_x
        y0_bl = (page_h - bbox["y1"]) * scale_y
        x1_bl = bbox["x1"] * scale_x
        y1_bl = (page_h - bbox["y0"]) * scale_y

        # Outline only, so the page content under the field stays visible.
        draw.rectangle(
            [x0_bl, y0_bl, x1_bl, y1_bl],
            outline=config.BOX_COLOR,
            width=config.BOX_WIDTH,
        )

        # Badge with the field id, placed just above the box's top-left corner.
        bx0, by0 = x0_bl - 10, y0_bl - badge_h - 2
        draw.rectangle([bx0, by0, bx0 + badge_w, by0 + badge_h], fill=config.BADGE_BG)
        draw.text((bx0 + 10, by0 + 5), str(vis_id), fill=config.BADGE_COLOR, font=font)

    buffer = io.BytesIO()
    img.save(buffer, format="JPEG", quality=85)
    return buffer.getvalue()