oneocr / tools /visualize_ocr.py

OneOCR Dev

OneOCR - reverse engineering complete, ONNX pipeline 53% match rate

ce847d4 about 14 hours ago

9.3 kB

	"""Visualize OCR results — overlay recognized text directly on detected regions.

	Features:
	- Text overlaid on word bounding boxes, scaled to fit
	- Semi-transparent background behind text for readability
	- Color-coded line bounding boxes
	- Confidence heat-map coloring (green=high, red=low)
	- Summary panel with statistics
	"""
	import sys
	import time
	from pathlib import Path

	import numpy as np
	from PIL import Image, ImageDraw, ImageFont

	sys.path.insert(0, str(Path(__file__).parent.parent))

	from ocr.engine_onnx import OcrEngineOnnx
	from ocr.models import BoundingRect


	# ─── Color helpers ────────────────────────────────────────────────────────

	def _conf_color(conf: float) -> tuple[int, int, int]:
	"""Map confidence 0..1 → red..yellow..green."""
	if conf >= 0.85:
	return (40, 180, 40)
	elif conf >= 0.6:
	t = (conf - 0.6) / 0.25
	return (int(220 * (1 - t)), int(180 * t + 100), 40)
	else:
	return (220, 60, 40)


	_LINE_COLORS = [
	(70, 130, 255), (255, 100, 70), (50, 200, 100), (255, 180, 40),
	(180, 80, 255), (40, 200, 200), (255, 80, 180), (160, 200, 60),
	]


	# ─── Font helpers ────────────────────────────────────────────────────────

	# Fonts already resolved by _load_font, keyed by pixel size. The fitting
	# code asks for the same sizes over and over (several times per word), so
	# remembering them avoids repeated font-file lookups.
	_FONT_CACHE: dict[int, ImageFont.FreeTypeFont | ImageFont.ImageFont] = {}


	def _load_font(size: int) -> ImageFont.FreeTypeFont | ImageFont.ImageFont:
	    """Return a font of the requested size, preferring TrueType faces.

	    A fixed list of font file names is tried in order; the first one that
	    loads wins. If none can be loaded, Pillow's built-in default font is
	    returned instead (which is not scaled to ``size``).

	    Args:
	        size: Requested font size, passed to ``ImageFont.truetype``.

	    Returns:
	        The loaded font object. Results are cached per ``size``, so repeated
	        calls return the same object.
	    """
	    cached = _FONT_CACHE.get(size)
	    if cached is not None:
	        return cached

	    font = None
	    for name in ("arial.ttf", "Arial.ttf", "segoeui.ttf", "msyh.ttc",
	                 "NotoSansCJK-Regular.ttc", "DejaVuSans.ttf"):
	        try:
	            font = ImageFont.truetype(name, size)
	        except (OSError, ImportError, ValueError):
	            # Font file missing/unreadable, FreeType support unavailable, or
	            # size rejected — best effort: move on to the next candidate.
	            continue
	        break

	    if font is None:
	        font = ImageFont.load_default()
	    _FONT_CACHE[size] = font
	    return font


	def _fit_font_size(
	    text: str, box_w: float, box_h: float,
	    min_size: int = 8, max_size: int = 120,
	) -> int:
	    """Pick a font size for ``text`` that fits inside a box_w × box_h box.

	    Bisects the range ``min_size..max_size``. A size is accepted when the
	    text's measured width is at most 95% of ``box_w`` and its height at most
	    88% of ``box_h``. The last accepted size is returned; if no size is
	    accepted, ``min_size`` is returned.
	    """
	    width_limit = box_w * 0.95
	    height_limit = box_h * 0.88

	    low, high = min_size, max_size
	    chosen = min_size
	    while low <= high:
	        candidate = (low + high) // 2
	        extent = _load_font(candidate).getbbox(text)
	        text_w = extent[2] - extent[0]
	        text_h = extent[3] - extent[1]
	        fits = text_w <= width_limit and text_h <= height_limit
	        if not fits:
	            # Too big: continue in the lower half.
	            high = candidate - 1
	            continue
	        # Fits: remember it and probe the upper half for something larger.
	        chosen = candidate
	        low = candidate + 1
	    return chosen


	# ─── Drawing helpers ─────────────────────────────────────────────────────

	def _draw_quad(
	    draw: ImageDraw.ImageDraw, b: BoundingRect,
	    color: tuple, width: int = 2,
	) -> None:
	    """Outline the quadrilateral whose corners are (x1, y1)..(x4, y4) of ``b``.

	    Args:
	        draw: Drawing context to render onto.
	        b: Rectangle-like object exposing ``x1..x4`` and ``y1..y4``.
	        color: Outline colour passed through to ``draw.polygon``.
	        width: Outline thickness passed through to ``draw.polygon``.
	    """
	    corner_xs = (b.x1, b.x2, b.x3, b.x4)
	    corner_ys = (b.y1, b.y2, b.y3, b.y4)
	    corners = list(zip(corner_xs, corner_ys))
	    draw.polygon(corners, outline=color, width=width)


	def _overlay_text_on_word(
	    overlay: Image.Image,
	    word_text: str,
	    b: BoundingRect,
	    conf: float,
	) -> None:
	    """Draw ``word_text`` centred in the word's box on a translucent backing.

	    The axis-aligned extent of the four corners of ``b`` is used as the box.
	    Boxes narrower or shorter than 3 units are skipped. ``overlay`` is
	    modified in place.

	    Args:
	        overlay: RGBA image to draw on (``alpha_composite`` is applied to it).
	        word_text: Text to render.
	        b: Rectangle-like object exposing ``x1..x4`` and ``y1..y4``.
	        conf: Confidence used to pick the text colour via ``_conf_color``.
	    """
	    xs = [b.x1, b.x2, b.x3, b.x4]
	    ys = [b.y1, b.y2, b.y3, b.y4]
	    x_min, x_max = min(xs), max(xs)
	    y_min, y_max = min(ys), max(ys)
	    box_w = x_max - x_min
	    box_h = y_max - y_min

	    if box_w < 3 or box_h < 3:
	        return

	    # Fit font to box
	    font_size = _fit_font_size(word_text, box_w, box_h)
	    font = _load_font(font_size)

	    # Measure text
	    bbox = font.getbbox(word_text)
	    tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]

	    # Center text in box; subtracting bbox[1] compensates for the glyphs'
	    # top offset so the visible ink (not the text origin) is centred.
	    tx = x_min + (box_w - tw) / 2
	    text_top = y_min + (box_h - th) / 2
	    ty = text_top - bbox[1]

	    # Semi-transparent white background behind text
	    pad = 2
	    rect_left = tx - pad
	    rect_top = text_top - pad
	    rect_right = tx + tw + pad
	    rect_bottom = y_min + (box_h + th) / 2 + pad

	    # Composite only a small patch around the rectangle instead of a layer
	    # the size of the whole image: this runs once per word, so a full-size
	    # layer costs (words × image pixels). The patch origin is an integer,
	    # is clipped to the image, and has a 1px margin, so the rectangle lands
	    # on the same pixels as it would on a full-size layer.
	    patch_x0 = max(0, int(rect_left) - 1)
	    patch_y0 = max(0, int(rect_top) - 1)
	    patch_x1 = min(overlay.width, int(rect_right) + 2)
	    patch_y1 = min(overlay.height, int(rect_bottom) + 2)
	    if patch_x1 > patch_x0 and patch_y1 > patch_y0:
	        patch = Image.new(
	            "RGBA", (patch_x1 - patch_x0, patch_y1 - patch_y0), (0, 0, 0, 0)
	        )
	        ImageDraw.Draw(patch).rectangle(
	            [rect_left - patch_x0, rect_top - patch_y0,
	             rect_right - patch_x0, rect_bottom - patch_y0],
	            fill=(255, 255, 255, 170),
	        )
	        overlay.alpha_composite(patch, dest=(patch_x0, patch_y0))

	    # Text color based on confidence
	    text_color = _conf_color(conf)
	    draw = ImageDraw.Draw(overlay)
	    draw.text((tx, ty), word_text, fill=(*text_color, 255), font=font)


	# ─── Summary panel ───────────────────────────────────────────────────────

	def _draw_summary(
	    overlay: Image.Image,
	    n_lines: int, n_words: int, avg_conf: float,
	    angle: float, elapsed: float, img_size: tuple[int, int],
	) -> None:
	    """Draw a one-line statistics panel across the top of ``overlay``.

	    Args:
	        overlay: RGBA image to draw on; modified in place.
	        n_lines: Number of text lines, shown as ``Lines``.
	        n_words: Number of words, shown as ``Words``.
	        avg_conf: Average confidence, formatted as a percentage.
	        angle: Text angle, shown with a degree sign.
	        elapsed: Elapsed time, shown with an ``ms`` suffix.
	        img_size: ``(width, height)`` shown as ``W×H``.
	    """
	    font = _load_font(16)
	    # Fields are separated by a plain " | ". A backslash before the bar is
	    # not a valid escape and would be rendered literally in the panel.
	    stats = (
	        f"Lines: {n_lines} | Words: {n_words} | "
	        f"Conf: {avg_conf:.1%} | Angle: {angle:.1f}\u00b0 | "
	        f"Time: {elapsed:.0f}ms | {img_size[0]}\u00d7{img_size[1]}"
	    )

	    # Panel height follows the measured text height plus fixed padding.
	    bbox = font.getbbox(stats)
	    th = bbox[3] - bbox[1]
	    panel_h = th + 12

	    # Semi-transparent dark panel
	    bg = Image.new("RGBA", overlay.size, (0, 0, 0, 0))
	    bg_draw = ImageDraw.Draw(bg)
	    bg_draw.rectangle([0, 0, overlay.width, panel_h], fill=(0, 0, 0, 180))
	    overlay.alpha_composite(bg)

	    draw = ImageDraw.Draw(overlay)
	    draw.text((8, 4), stats, fill=(255, 255, 255, 255), font=font)


	# ═══════════════════════════════════════════════════════════════════════════
	# Main visualization
	# ═══════════════════════════════════════════════════════════════════════════

	def visualize(
	    image_path: str,
	    output_path: str = "result_ocr.png",
	    show_word_boxes: bool = True,
	    show_line_boxes: bool = True,
	    show_text_overlay: bool = True,
	    show_confidence: bool = True,
	) -> None:
	    """Run OCR and visualize results with text overlay.

	    The annotated image is written to ``output_path`` and a textual summary
	    is printed. If the OCR result carries an error, the error is printed and
	    nothing is written.

	    Args:
	        image_path: Input image path.
	        output_path: Output path for annotated image.
	        show_word_boxes: Draw word-level bounding boxes.
	        show_line_boxes: Draw line-level bounding boxes.
	        show_text_overlay: Overlay recognized text on words.
	        show_confidence: Show confidence % below words.
	    """
	    # Release the source file handle as soon as the pixels are converted.
	    with Image.open(image_path) as source:
	        img = source.convert("RGBA")
	    engine = OcrEngineOnnx()

	    # Time only the recognition call itself.
	    t0 = time.perf_counter()
	    result = engine.recognize_pil(img.convert("RGB"))
	    elapsed_ms = (time.perf_counter() - t0) * 1000

	    if result.error:
	        print(f"Error: {result.error}")
	        return

	    overlay = img.copy()
	    draw = ImageDraw.Draw(overlay)
	    n_words = sum(len(line.words) for line in result.lines)

	    # The confidence label font is the same for every word; load it once.
	    conf_font = _load_font(11) if show_confidence else None

	    for i, line in enumerate(result.lines):
	        lc = _LINE_COLORS[i % len(_LINE_COLORS)]

	        # Line-level bounding box (thicker)
	        if show_line_boxes and line.bounding_rect:
	            _draw_quad(draw, line.bounding_rect, color=lc, width=3)

	        for word in line.words:
	            if not word.bounding_rect:
	                continue
	            b = word.bounding_rect

	            # Word-level bounding box (confidence-colored)
	            if show_word_boxes:
	                wc = _conf_color(word.confidence)
	                _draw_quad(draw, b, color=wc, width=2)

	            # Overlay text inside the word box
	            if show_text_overlay:
	                _overlay_text_on_word(overlay, word.text, b, word.confidence)
	                draw = ImageDraw.Draw(overlay)  # refresh after alpha_composite

	            # Confidence label below word box, horizontally centred on the
	            # mean of the four corner x-coordinates.
	            if show_confidence:
	                xs = [b.x1, b.x2, b.x3, b.x4]
	                ys = [b.y1, b.y2, b.y3, b.y4]
	                cx = sum(xs) / 4
	                y_bot = max(ys) + 2
	                label = f"{word.confidence:.0%}"
	                lbbox = conf_font.getbbox(label)
	                lw = lbbox[2] - lbbox[0]
	                draw.text(
	                    (cx - lw / 2, y_bot),
	                    label,
	                    fill=(*_conf_color(word.confidence), 220),
	                    font=conf_font,
	                )

	    # Summary panel
	    _draw_summary(
	        overlay,
	        n_lines=len(result.lines),
	        n_words=n_words,
	        avg_conf=result.average_confidence,
	        angle=result.text_angle or 0.0,
	        elapsed=elapsed_ms,
	        img_size=(img.width, img.height),
	    )

	    # Save as RGB: flatten onto white using the overlay's alpha channel.
	    final = Image.new("RGB", overlay.size, (255, 255, 255))
	    final.paste(overlay, mask=overlay.split()[3])
	    final.save(output_path, quality=95)

	    print(f"\nSaved: {output_path}")
	    print(f"Text: \"{result.text}\"")
	    print(f"Lines: {len(result.lines)}, Words: {n_words}, "
	          f"Conf: {result.average_confidence:.1%}, Time: {elapsed_ms:.0f}ms")

	    for i, line in enumerate(result.lines):
	        words_info = " ".join(
	            f'"{w.text}"({w.confidence:.0%})' for w in line.words
	        )
	        print(f" L{i}: {words_info}")


	if __name__ == "__main__":
	    # Usage: visualize_ocr.py [IMAGE [OUTPUT]] — both arguments are optional.
	    cli_args = sys.argv[1:]
	    image_path = cli_args[0] if cli_args else "test3.png"
	    output_path = cli_args[1] if len(cli_args) >= 2 else "result_ocr.png"
	    visualize(image_path, output_path)