|
|
"""Visualize OCR results — overlay recognized text directly on detected regions. |
|
|
|
|
|
Features: |
|
|
- Text overlaid on word bounding boxes, scaled to fit |
|
|
- Semi-transparent background behind text for readability |
|
|
- Color-coded line bounding boxes |
|
|
- Confidence heat-map coloring (green=high, red=low) |
|
|
- Summary panel with statistics |
|
|
""" |
|
|
import sys |
|
|
import time |
|
|
from pathlib import Path |
|
|
|
|
|
import numpy as np |
|
|
from PIL import Image, ImageDraw, ImageFont |
|
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent)) |
|
|
|
|
|
from ocr.engine_onnx import OcrEngineOnnx |
|
|
from ocr.models import BoundingRect |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _conf_color(conf: float) -> tuple[int, int, int]:
    """Map a confidence score to an RGB color for heat-map style display.

    Bands:
      * ``conf >= 0.85``: fixed green ``(40, 180, 40)``.
      * ``0.6 <= conf < 0.85``: linear ramp that starts at ``(220, 100, 40)``;
        red decreases and green increases as confidence grows.
      * anything else (including NaN, which fails both comparisons):
        fixed red ``(220, 60, 40)``.

    Args:
        conf: Confidence value, nominally in the range 0..1.

    Returns:
        An ``(r, g, b)`` tuple whose channels are all within 0..255.
    """
    if conf >= 0.85:
        return (40, 180, 40)
    if conf >= 0.6:
        t = (conf - 0.6) / 0.25  # position inside the middle band, 0 <= t < 1
        red = int(220 * (1 - t))
        # 180 * t + 100 grows past 255 near the top of the band (t > ~0.86),
        # which is not a valid 8-bit channel value, so clamp it.
        green = min(255, int(180 * t + 100))
        return (red, green, 40)
    return (220, 60, 40)
|
|
|
|
|
|
|
|
# RGB palette used for line outlines; entries are picked by line index
# modulo the palette length, so consecutive lines get different colors.
_LINE_COLORS = [
    (70, 130, 255),   # blue
    (255, 100, 70),   # orange-red
    (50, 200, 100),   # green
    (255, 180, 40),   # amber
    (180, 80, 255),   # purple
    (40, 200, 200),   # cyan
    (255, 80, 180),   # pink
    (160, 200, 60),   # lime
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Font files probed in order; the first one that loads wins.
_FONT_CANDIDATES = (
    "arial.ttf", "Arial.ttf", "segoeui.ttf", "msyh.ttc",
    "NotoSansCJK-Regular.ttc", "DejaVuSans.ttf",
)

# Fonts already resolved, keyed by requested size. Callers that ask for the
# same size repeatedly (or probe many sizes in a search) would otherwise
# re-open font files on every call.
_FONT_CACHE: dict[int, ImageFont.FreeTypeFont | ImageFont.ImageFont] = {}


def _load_font(size: int) -> ImageFont.FreeTypeFont | ImageFont.ImageFont:
    """Return a font of the given size, preferring a TrueType face.

    Each name in ``_FONT_CANDIDATES`` is tried in order with
    ``ImageFont.truetype``; if none can be loaded, Pillow's built-in default
    font is returned instead (best-effort: a missing font is never an error).
    Results are cached per size, so the returned object is shared between
    callers and must be treated as read-only.

    Args:
        size: Requested font size, as passed to ``ImageFont.truetype``.

    Returns:
        The first candidate font that loads, or ``ImageFont.load_default()``.
    """
    cached = _FONT_CACHE.get(size)
    if cached is not None:
        return cached

    font = None
    for name in _FONT_CANDIDATES:
        try:
            font = ImageFont.truetype(name, size)
        except (OSError, ImportError, ValueError):
            # OSError: font file not found / unreadable;
            # ImportError: Pillow built without FreeType support;
            # ValueError: size rejected by Pillow.
            # In every case fall through to the next candidate.
            continue
        break

    if font is None:
        font = ImageFont.load_default()
    _FONT_CACHE[size] = font
    return font
|
|
|
|
|
|
|
|
def _fit_font_size(
    text: str, box_w: float, box_h: float,
    min_size: int = 8, max_size: int = 120,
) -> int:
    """Binary-search the largest font size whose rendered *text* fits the box.

    A size "fits" when the text's bounding box is no wider than 95% of
    ``box_w`` and no taller than 88% of ``box_h``. The search assumes that
    fitting is monotonic in the font size.

    Args:
        text: String to measure.
        box_w: Available width.
        box_h: Available height.
        min_size: Smallest size to consider; also the result when nothing fits.
        max_size: Largest size to consider.

    Returns:
        The largest fitting size found in ``[min_size, max_size]``, or
        ``min_size`` if no probed size fits.
    """
    width_limit = box_w * 0.95
    height_limit = box_h * 0.88

    def fits(size: int) -> bool:
        # Measure the text at this size and compare against the margins.
        left, top, right, bottom = _load_font(size).getbbox(text)
        return right - left <= width_limit and bottom - top <= height_limit

    low, high = min_size, max_size
    chosen = min_size
    while low <= high:
        candidate = (low + high) // 2
        if fits(candidate):
            # Fits: remember it and try something larger.
            chosen = candidate
            low = candidate + 1
        else:
            high = candidate - 1
    return chosen
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _draw_quad(
    draw: ImageDraw.ImageDraw, b: BoundingRect,
    color: tuple, width: int = 2,
) -> None:
    """Outline the four-cornered region described by *b*.

    Args:
        draw: Drawing context to render into.
        b: Rectangle whose corners are read from ``x1..x4`` / ``y1..y4``,
            connected in that order.
        color: Outline color.
        width: Outline thickness passed to ``draw.polygon``.
    """
    corner_xs = (b.x1, b.x2, b.x3, b.x4)
    corner_ys = (b.y1, b.y2, b.y3, b.y4)
    corners = list(zip(corner_xs, corner_ys))
    draw.polygon(corners, outline=color, width=width)
|
|
|
|
|
|
|
|
def _overlay_text_on_word(
    overlay: Image.Image,
    word_text: str,
    b: BoundingRect,
    conf: float,
) -> None:
    """Draw *word_text* centered inside a word box over a translucent backdrop.

    The text is sized with ``_fit_font_size`` to the axis-aligned extent of
    the box corners, centered in that extent, backed by a semi-transparent
    white rectangle (2 px padding), and colored via ``_conf_color(conf)``.
    ``overlay`` is modified in place.

    Nothing is drawn when the text is empty or the box is narrower or
    shorter than 3 units.

    Args:
        overlay: Image to draw onto; it is alpha-composited in place, so it
            is expected to be an RGBA image.
        word_text: Recognized text for the word.
        b: Word bounding rectangle with corners ``x1..x4`` / ``y1..y4``.
        conf: Word confidence used to choose the text color.
    """
    xs = (b.x1, b.x2, b.x3, b.x4)
    ys = (b.y1, b.y2, b.y3, b.y4)
    x_min, y_min = min(xs), min(ys)
    box_w = max(xs) - x_min
    box_h = max(ys) - y_min

    if not word_text or box_w < 3 or box_h < 3:
        return

    font = _load_font(_fit_font_size(word_text, box_w, box_h))

    # getbbox() reports where the ink lies relative to the drawing origin.
    ink_left, ink_top, ink_right, ink_bottom = font.getbbox(word_text)
    tw, th = ink_right - ink_left, ink_bottom - ink_top

    # Top-left corner of the ink when centered in the word box.
    ink_x = x_min + (box_w - tw) / 2
    ink_y = y_min + (box_h - th) / 2

    # Backdrop rectangle around the ink.
    pad = 2
    rx0, ry0 = ink_x - pad, ink_y - pad
    rx1, ry1 = ink_x + tw + pad, y_min + (box_h + th) / 2 + pad

    # Composite only a small patch covering the backdrop instead of a
    # full-image layer per word. The patch origin is an integer offset that
    # is never negative, and the patch is clipped to the image bounds.
    px0 = max(0, int(rx0) - 2)
    py0 = max(0, int(ry0) - 2)
    px1 = min(overlay.width, int(rx1) + 3)
    py1 = min(overlay.height, int(ry1) + 3)
    if px1 > px0 and py1 > py0:
        patch = Image.new("RGBA", (px1 - px0, py1 - py0), (0, 0, 0, 0))
        ImageDraw.Draw(patch).rectangle(
            [rx0 - px0, ry0 - py0, rx1 - px0, ry1 - py0],
            fill=(255, 255, 255, 170),
        )
        overlay.alpha_composite(patch, dest=(px0, py0))

    # Shift the drawing origin by the ink offsets on BOTH axes so the ink
    # (not the origin) lands on the centered position and stays aligned
    # with the backdrop.
    ImageDraw.Draw(overlay).text(
        (ink_x - ink_left, ink_y - ink_top),
        word_text,
        fill=(*_conf_color(conf), 255),
        font=font,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _draw_summary(
    overlay: Image.Image,
    n_lines: int, n_words: int, avg_conf: float,
    angle: float, elapsed: float, img_size: tuple[int, int],
) -> None:
    """Render a one-line statistics banner across the top edge of *overlay*.

    A translucent black strip spanning the full image width is composited
    onto ``overlay`` in place, then the statistics text is drawn in white
    on top of it.

    Args:
        overlay: Image to annotate (alpha-composited in place).
        n_lines: Number of text lines to report.
        n_words: Number of words to report.
        avg_conf: Average confidence, formatted as a percentage.
        angle: Text angle, shown in degrees with one decimal.
        elapsed: Elapsed time, shown as whole milliseconds.
        img_size: ``(width, height)`` shown at the end of the banner.
    """
    panel_font = _load_font(16)
    stats = (
        f"Lines: {n_lines} | Words: {n_words} | "
        f"Conf: {avg_conf:.1%} | Angle: {angle:.1f}\u00b0 | "
        f"Time: {elapsed:.0f}ms | {img_size[0]}\u00d7{img_size[1]}"
    )

    # Strip height = text height plus 12 px of vertical padding.
    _, text_top, _, text_bottom = panel_font.getbbox(stats)
    panel_h = (text_bottom - text_top) + 12

    # Paint the strip on a transparent layer, then blend it over the image.
    shade = Image.new("RGBA", overlay.size, (0, 0, 0, 0))
    ImageDraw.Draw(shade).rectangle(
        [0, 0, overlay.width, panel_h], fill=(0, 0, 0, 180)
    )
    overlay.alpha_composite(shade)

    ImageDraw.Draw(overlay).text(
        (8, 4), stats, fill=(255, 255, 255, 255), font=panel_font
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def visualize(
    image_path: str,
    output_path: str = "result_ocr.png",
    show_word_boxes: bool = True,
    show_line_boxes: bool = True,
    show_text_overlay: bool = True,
    show_confidence: bool = True,
) -> None:
    """Run OCR on an image and save an annotated copy of it.

    The image is recognized with ``OcrEngineOnnx``; line boxes, word boxes,
    overlaid text and per-word confidence labels are drawn according to the
    flags, a summary banner is added, and the result is flattened onto a
    white background and written to ``output_path``. A textual report is
    printed to stdout. If the OCR result carries an error, the error is
    printed and nothing is written.

    Args:
        image_path: Input image path.
        output_path: Output path for the annotated image.
        show_word_boxes: Draw word-level bounding boxes, colored by
            confidence.
        show_line_boxes: Draw line-level bounding boxes.
        show_text_overlay: Overlay recognized text on words.
        show_confidence: Show confidence % below words.
    """
    # Close the source file as soon as the pixels are loaded; convert()
    # returns a copy that does not depend on the open file.
    with Image.open(image_path) as src:
        img = src.convert("RGBA")
    engine = OcrEngineOnnx()

    t0 = time.perf_counter()
    result = engine.recognize_pil(img.convert("RGB"))
    elapsed_ms = (time.perf_counter() - t0) * 1000

    if result.error:
        print(f"Error: {result.error}")
        return

    overlay = img.copy()
    draw = ImageDraw.Draw(overlay)
    n_words = sum(len(line.words) for line in result.lines)

    # The confidence-label font is the same for every word; load it once.
    conf_font = _load_font(11) if show_confidence else None

    for i, line in enumerate(result.lines):
        lc = _LINE_COLORS[i % len(_LINE_COLORS)]

        if show_line_boxes and line.bounding_rect:
            _draw_quad(draw, line.bounding_rect, color=lc, width=3)

        for word in line.words:
            if not word.bounding_rect:
                continue
            b = word.bounding_rect

            if show_word_boxes:
                wc = _conf_color(word.confidence)
                _draw_quad(draw, b, color=wc, width=2)

            if show_text_overlay:
                _overlay_text_on_word(overlay, word.text, b, word.confidence)
                # The overlay was composited in place; take a fresh
                # drawing context before drawing on it again.
                draw = ImageDraw.Draw(overlay)

            if show_confidence:
                # Center the label horizontally on the box, just below it.
                xs = [b.x1, b.x2, b.x3, b.x4]
                ys = [b.y1, b.y2, b.y3, b.y4]
                cx = sum(xs) / 4
                y_bot = max(ys) + 2
                label = f"{word.confidence:.0%}"
                lbbox = conf_font.getbbox(label)
                lw = lbbox[2] - lbbox[0]
                draw.text(
                    (cx - lw / 2, y_bot),
                    label,
                    fill=(*_conf_color(word.confidence), 220),
                    font=conf_font,
                )

    _draw_summary(
        overlay,
        n_lines=len(result.lines),
        n_words=n_words,
        avg_conf=result.average_confidence,
        angle=result.text_angle or 0.0,
        elapsed=elapsed_ms,
        img_size=(img.width, img.height),
    )

    # Flatten the RGBA overlay onto white, using its alpha channel as mask.
    final = Image.new("RGB", overlay.size, (255, 255, 255))
    final.paste(overlay, mask=overlay.split()[3])
    final.save(output_path, quality=95)

    print(f"\nSaved: {output_path}")
    print(f"Text: \"{result.text}\"")
    print(f"Lines: {len(result.lines)}, Words: {n_words}, "
          f"Conf: {result.average_confidence:.1%}, Time: {elapsed_ms:.0f}ms")

    for i, line in enumerate(result.lines):
        words_info = " ".join(
            f'"{w.text}"({w.confidence:.0%})' for w in line.words
        )
        print(f" L{i}: {words_info}")
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Usage: <script> [image_path] [output_path]
    # Missing arguments fall back to the defaults below.
    cli_args = sys.argv[1:]
    image_path = cli_args[0] if cli_args else "test3.png"
    output_path = cli_args[1] if len(cli_args) > 1 else "result_ocr.png"
    visualize(image_path, output_path)
|
|
|