oneocr

File size: 9,295 Bytes

ce847d4

"""Visualize OCR results — overlay recognized text directly on detected regions.

Features:
  - Text overlaid on word bounding boxes, scaled to fit
  - Semi-transparent background behind text for readability
  - Color-coded line bounding boxes
  - Confidence heat-map coloring (green=high, red=low)
  - Summary panel with statistics
"""
import sys
import time
from pathlib import Path

import numpy as np
from PIL import Image, ImageDraw, ImageFont

sys.path.insert(0, str(Path(__file__).parent.parent))

from ocr.engine_onnx import OcrEngineOnnx
from ocr.models import BoundingRect


# ─── Color helpers ────────────────────────────────────────────────────────

def _conf_color(conf: float) -> tuple[int, int, int]:
    """Map confidence 0..1 → red..yellow..green."""
    if conf >= 0.85:
        return (40, 180, 40)
    elif conf >= 0.6:
        t = (conf - 0.6) / 0.25
        return (int(220 * (1 - t)), int(180 * t + 100), 40)
    else:
        return (220, 60, 40)


_LINE_COLORS = [
    (70, 130, 255), (255, 100, 70), (50, 200, 100), (255, 180, 40),
    (180, 80, 255), (40, 200, 200), (255, 80, 180), (160, 200, 60),
]


# ─── Font helpers ────────────────────────────────────────────────────────

# Fonts already resolved, keyed by requested size.  Resolving a TrueType face
# probes the filesystem for each candidate name, and callers ask for the same
# sizes over and over (font fitting, per-word labels), so remember the result.
_FONT_CACHE: dict[int, ImageFont.FreeTypeFont | ImageFont.ImageFont] = {}

# Candidate font files, tried in order; the first one that loads wins.
_FONT_CANDIDATES = (
    "arial.ttf", "Arial.ttf", "segoeui.ttf", "msyh.ttc",
    "NotoSansCJK-Regular.ttc", "DejaVuSans.ttf",
)


def _load_font(size: int) -> ImageFont.FreeTypeFont | ImageFont.ImageFont:
    """Return a font of the given size, preferring TrueType over the default.

    Each name in ``_FONT_CANDIDATES`` is tried in order; if none can be
    loaded, Pillow's built-in default font is returned instead (which is not
    scaled to ``size``).  Results are cached per size, so the returned object
    is shared between callers and should be treated as read-only.

    Args:
        size: Requested font size.

    Returns:
        The first loadable TrueType font, or the default bitmap font.
    """
    cached = _FONT_CACHE.get(size)
    if cached is not None:
        return cached

    font = None
    for name in _FONT_CANDIDATES:
        try:
            font = ImageFont.truetype(name, size)
        except (OSError, ImportError, ValueError):
            # OSError: file missing/unreadable; ImportError: FreeType support
            # unavailable; ValueError: size rejected.  All mean "try the next
            # candidate" -- this lookup is deliberately best-effort.
            continue
        break

    if font is None:
        font = ImageFont.load_default()

    _FONT_CACHE[size] = font
    return font


def _fit_font_size(
    text: str, box_w: float, box_h: float,
    min_size: int = 8, max_size: int = 120,
) -> int:
    """Find, by bisection, a font size at which *text* fits the given box.

    A size "fits" when the text's measured width is at most 95% of ``box_w``
    and its measured height is at most 88% of ``box_h``.

    Args:
        text: String to measure.
        box_w: Available width.
        box_h: Available height.
        min_size: Smallest size considered; also the result when nothing fits.
        max_size: Largest size considered.

    Returns:
        The largest fitting size encountered during the search, or
        ``min_size`` if no probed size fits.
    """
    width_limit = box_w * 0.95
    height_limit = box_h * 0.88

    low, high = min_size, max_size
    chosen = min_size
    while low <= high:
        candidate = (low + high) // 2
        left, top, right, bottom = _load_font(candidate).getbbox(text)
        if right - left <= width_limit and bottom - top <= height_limit:
            # Fits: remember it and probe larger sizes.
            chosen = candidate
            low = candidate + 1
        else:
            # Too big: probe smaller sizes.
            high = candidate - 1
    return chosen


# ─── Drawing helpers ─────────────────────────────────────────────────────

def _draw_quad(
    draw: ImageDraw.ImageDraw, b: BoundingRect,
    color: tuple, width: int = 2,
) -> None:
    """Outline the four-cornered region described by *b*.

    Args:
        draw: Drawing context to render onto.
        b: Rectangle whose corners are read from ``x1..x4`` / ``y1..y4``.
        color: Outline color.
        width: Outline thickness passed through to the polygon call.
    """
    corner_xs = (b.x1, b.x2, b.x3, b.x4)
    corner_ys = (b.y1, b.y2, b.y3, b.y4)
    # Pair the coordinates back up as (x, y) vertices in corner order 1..4.
    draw.polygon(list(zip(corner_xs, corner_ys)), outline=color, width=width)


def _overlay_text_on_word(
    overlay: Image.Image,
    word_text: str,
    b: BoundingRect,
    conf: float,
) -> None:
    """Draw a word's text centered in its box, on a translucent white backdrop.

    The text is sized with ``_fit_font_size`` to fit the axis-aligned extent
    of the four corners of *b*, and colored by ``_conf_color(conf)``.  The
    image is modified in place.  Boxes narrower or shorter than 3 units are
    skipped entirely.

    Args:
        overlay: RGBA image to draw on (alpha compositing requires RGBA).
        word_text: Recognized text to render.
        b: Word rectangle; corners are read from ``x1..x4`` / ``y1..y4``.
        conf: Confidence used to pick the text color.
    """
    xs = (b.x1, b.x2, b.x3, b.x4)
    ys = (b.y1, b.y2, b.y3, b.y4)
    x_min, y_min = min(xs), min(ys)
    box_w = max(xs) - x_min
    box_h = max(ys) - y_min

    # Boxes this small cannot hold legible text.
    if box_w < 3 or box_h < 3:
        return

    font = _load_font(_fit_font_size(word_text, box_w, box_h))

    # getbbox() reports the ink extents relative to the drawing origin; the
    # left/top values are the offsets from that origin to the first ink.
    ink_left, ink_top, ink_right, ink_bottom = font.getbbox(word_text)
    tw, th = ink_right - ink_left, ink_bottom - ink_top

    # Top-left corner of the ink rectangle once centered in the word box.
    ink_x = x_min + (box_w - tw) / 2
    ink_y = y_min + (box_h - th) / 2

    # Backdrop: the ink rectangle grown by `pad`, in whole pixels (both
    # corners inclusive) and clipped to the image.
    pad = 2
    left = max(int(ink_x - pad), 0)
    top = max(int(ink_y - pad), 0)
    right = min(int(ink_x + tw + pad), overlay.width - 1)
    bottom = min(int(ink_y + th + pad), overlay.height - 1)
    if left <= right and top <= bottom:
        # Blend only the affected region instead of compositing a full-frame
        # layer for every word, which costs O(image area) per word.
        region = (left, top, right + 1, bottom + 1)
        patch = overlay.crop(region)
        veil = Image.new("RGBA", patch.size, (255, 255, 255, 170))
        overlay.paste(Image.alpha_composite(patch, veil), region)

    # Shift the drawing origin by the ink offsets on BOTH axes so the glyphs
    # land exactly on the centered ink rectangle (and thus on the backdrop).
    draw = ImageDraw.Draw(overlay)
    draw.text(
        (ink_x - ink_left, ink_y - ink_top),
        word_text,
        fill=(*_conf_color(conf), 255),
        font=font,
    )


# ─── Summary panel ───────────────────────────────────────────────────────

def _draw_summary(
    overlay: Image.Image,
    n_lines: int, n_words: int, avg_conf: float,
    angle: float, elapsed: float, img_size: tuple[int, int],
) -> None:
    """Render a one-line statistics banner across the top of *overlay*.

    The banner is a translucent dark strip spanning the full image width with
    white text on top; *overlay* is modified in place.

    Args:
        overlay: RGBA image to draw on.
        n_lines: Number of text lines to report.
        n_words: Number of words to report.
        avg_conf: Average confidence, formatted as a percentage.
        angle: Text angle, shown with a degree sign.
        elapsed: Elapsed time, shown with an ``ms`` suffix.
        img_size: ``(width, height)`` pair shown as ``W×H``.
    """
    stats = (
        f"Lines: {n_lines}  |  Words: {n_words}  |  "
        f"Conf: {avg_conf:.1%}  |  Angle: {angle:.1f}\u00b0  |  "
        f"Time: {elapsed:.0f}ms  |  {img_size[0]}\u00d7{img_size[1]}"
    )

    font = _load_font(16)
    _, ink_top, _, ink_bottom = font.getbbox(stats)
    # Strip height: measured text height plus 12 units of vertical slack.
    panel_h = (ink_bottom - ink_top) + 12

    # Paint the strip on a transparent layer, then blend it over the image.
    shade = Image.new("RGBA", overlay.size, (0, 0, 0, 0))
    ImageDraw.Draw(shade).rectangle(
        [0, 0, overlay.width, panel_h], fill=(0, 0, 0, 180),
    )
    overlay.alpha_composite(shade)

    ImageDraw.Draw(overlay).text(
        (8, 4), stats, fill=(255, 255, 255, 255), font=font,
    )


# ═══════════════════════════════════════════════════════════════════════════
# Main visualization
# ═══════════════════════════════════════════════════════════════════════════

def visualize(
    image_path: str,
    output_path: str = "result_ocr.png",
    show_word_boxes: bool = True,
    show_line_boxes: bool = True,
    show_text_overlay: bool = True,
    show_confidence: bool = True,
) -> None:
    """Run OCR on an image and save an annotated copy with the results.

    The annotated image is flattened onto a white background and written to
    ``output_path``; a textual report (full text, totals, and per-line word
    confidences) is printed to stdout.  If the OCR result carries an error,
    the error is printed and nothing is written.

    Args:
        image_path: Input image path.
        output_path: Output path for annotated image.
        show_word_boxes: Draw word-level bounding boxes (confidence-colored).
        show_line_boxes: Draw line-level bounding boxes (palette-colored).
        show_text_overlay: Overlay recognized text on words.
        show_confidence: Show confidence % below words.
    """
    # convert() produces an independent image, so the source file can be
    # released as soon as the conversion is done.
    with Image.open(image_path) as source:
        img = source.convert("RGBA")
    engine = OcrEngineOnnx()

    # Time only the recognition call itself.
    t0 = time.perf_counter()
    result = engine.recognize_pil(img.convert("RGB"))
    elapsed_ms = (time.perf_counter() - t0) * 1000

    if result.error:
        print(f"Error: {result.error}")
        return

    overlay = img.copy()
    draw = ImageDraw.Draw(overlay)
    n_words = sum(len(line.words) for line in result.lines)

    # The label font does not depend on the word, so resolve it once.
    conf_font = _load_font(11) if show_confidence else None

    for i, line in enumerate(result.lines):
        lc = _LINE_COLORS[i % len(_LINE_COLORS)]

        # Line-level bounding box (thicker)
        if show_line_boxes and line.bounding_rect:
            _draw_quad(draw, line.bounding_rect, color=lc, width=3)

        for word in line.words:
            b = word.bounding_rect
            if not b:
                continue
            conf_rgb = _conf_color(word.confidence)

            # Word-level bounding box (confidence-colored)
            if show_word_boxes:
                _draw_quad(draw, b, color=conf_rgb, width=2)

            # Overlay text inside the word box
            if show_text_overlay:
                _overlay_text_on_word(overlay, word.text, b, word.confidence)
                # The helper edits `overlay` in place; take a fresh drawing
                # context afterwards rather than reusing the earlier one.
                draw = ImageDraw.Draw(overlay)

            # Confidence label, horizontally centered just below the word box
            if show_confidence:
                cx = (b.x1 + b.x2 + b.x3 + b.x4) / 4
                y_bot = max(b.y1, b.y2, b.y3, b.y4) + 2
                label = f"{word.confidence:.0%}"
                lbbox = conf_font.getbbox(label)
                lw = lbbox[2] - lbbox[0]
                draw.text(
                    (cx - lw / 2, y_bot),
                    label,
                    fill=(*conf_rgb, 220),
                    font=conf_font,
                )

    # Summary panel
    _draw_summary(
        overlay,
        n_lines=len(result.lines),
        n_words=n_words,
        avg_conf=result.average_confidence,
        angle=result.text_angle or 0.0,
        elapsed=elapsed_ms,
        img_size=(img.width, img.height),
    )

    # Flatten onto white using the overlay's own alpha channel, then save as RGB
    final = Image.new("RGB", overlay.size, (255, 255, 255))
    final.paste(overlay, mask=overlay.split()[3])
    final.save(output_path, quality=95)

    print(f"\nSaved: {output_path}")
    print(f"Text:  \"{result.text}\"")
    print(f"Lines: {len(result.lines)}, Words: {n_words}, "
          f"Conf: {result.average_confidence:.1%}, Time: {elapsed_ms:.0f}ms")

    for i, line in enumerate(result.lines):
        words_info = "  ".join(
            f'"{w.text}"({w.confidence:.0%})' for w in line.words
        )
        print(f"  L{i}: {words_info}")


if __name__ == "__main__":
    # Usage: script [IMAGE_PATH [OUTPUT_PATH]] -- both arguments are optional.
    cli_args = sys.argv[1:]
    image_path = cli_args[0] if cli_args else "test3.png"
    output_path = cli_args[1] if len(cli_args) > 1 else "result_ocr.png"
    visualize(image_path, output_path)