# Rendering dispatch — places translated text back onto the image.

import cv2
import numpy as np
from typing import List, Optional
from shapely import affinity
from shapely.geometry import Polygon

from . import text_render
from .bubble import detect_bubbles
from ..utils import TextBlock, color_difference, get_logger, rotate_polygons

logger = get_logger("render")


def _region_bubble_confidence(region) -> float:
    """Return ``region._bubble_confidence`` as a float.

    A missing attribute or a falsy value (``None``, ``0``) yields ``0.0`` so
    that every caller treats "unknown" the same way instead of some of them
    raising ``TypeError`` on ``float(None)``.
    """
    return float(getattr(region, "_bubble_confidence", 0.0) or 0.0)


def _resolve_render_horizontal(region: TextBlock) -> bool:
    """Decide whether the region is rendered horizontally.

    ``region._direction`` is used when present, otherwise ``region.direction``.
    Any value other than ``"auto"`` is an explicit override; ``"auto"`` defers
    to ``region.horizontal``.
    """
    forced_dir = region._direction if hasattr(region, "_direction") else region.direction
    if forced_dir != "auto":
        return forced_dir in ("horizontal", "h")
    return region.horizontal


def _fg_bg_compare(fg, bg):
    """Return ``(fg, bg)``, replacing ``bg`` when it is too close to ``fg``.

    When ``color_difference(fg, bg)`` is below 30 the background becomes white
    for a dark foreground (mean channel value <= 127) and black otherwise.
    """
    fg_avg = np.mean(fg)
    if color_difference(fg, bg) < 30:
        bg = (255, 255, 255) if fg_avg <= 127 else (0, 0, 0)
    return fg, bg


def _count_text_length(text: str) -> float:
    """Return a weighted character count of ``text`` (surrounding whitespace stripped).

    The small kana listed below count as 0.5; every other character counts 1.0.
    """
    half_width_chars = 'っッぁぃぅぇぉ'
    return sum(
        (0.5 if char in half_width_chars else 1.0 for char in text.strip()),
        0.0,
    )


def _fallback_scale_cap(region: TextBlock, severity: float) -> float:
    """Single dynamic cap for fallback expansion, confidence-driven.

    Lower bubble confidence allows a larger expansion cap (2.5 down to 1.5).
    A ``severity`` above 2.0 raises the cap by 0.1, never beyond 2.5.
    """
    bubble_conf = _region_bubble_confidence(region)
    if bubble_conf < 0.25:
        cap = 2.5
    elif bubble_conf < 0.45:
        cap = 2.2
    elif bubble_conf < 0.65:
        cap = 1.9
    elif bubble_conf < 0.80:
        cap = 1.7
    else:
        cap = 1.5
    if severity > 2.0:
        cap = min(2.5, cap + 0.1)
    return cap


def _early_fallback_bias(region: TextBlock) -> float:
    """Small optional bias for very low-confidence fallback only.

    Returns 1.2 or 1.1 when confidence is very low and the translation is
    notably longer than the source text, else 1.0.
    """
    bubble_conf = _region_bubble_confidence(region)
    ratio = _translation_length_ratio(region)
    if bubble_conf < 0.20 and ratio > 1.2:
        return 1.2
    if bubble_conf < 0.30 and ratio > 1.6:
        return 1.1
    return 1.0


def _translation_length_ratio(region: TextBlock) -> float:
    """Return translated length / original length, floored at 0.5.

    Lengths come from ``_count_text_length``. The original text is
    ``region.text_raw`` when that attribute exists, else ``region.text``.
    Returns 1.0 when the original length is not positive.
    """
    orig_text = getattr(region, "text_raw", region.text)
    char_count_orig = _count_text_length(orig_text)
    char_count_trans = _count_text_length((region.translation or "").strip())
    if char_count_orig <= 0:
        return 1.0
    return max(0.5, char_count_trans / char_count_orig)


def _rect_to_quad(x: int, y: int, w: int, h: int) -> np.ndarray:
    """Convert an ``(x, y, w, h)`` rectangle to a ``(1, 4, 2)`` int64 quad (TL, TR, BR, BL)."""
    return np.array([[[x, y], [x + w, y], [x + w, y + h], [x, y + h]]], dtype=np.int64)


def _quad_from_inpaint_bbox(region: TextBlock) -> Optional[np.ndarray]:
    """Try to read inpaint-aligned bbox if available on the region.

    Accepted forms of ``region.inpaint_bbox``:

    * dict with keys ``x``, ``y``, ``w``, ``h``
    * tuple/list of four scalars ``(x, y, w, h)``
    * four ``[x, y]`` points, or an array shaped ``(N, 4, 2)`` (first quad used)

    Rectangles must be wider and taller than 2 px. Returns a ``(1, 4, 2)``
    int64 array, or ``None`` when the attribute is absent or unusable.
    """
    bbox = getattr(region, "inpaint_bbox", None)
    if bbox is None:
        return None
    try:
        # dict style: {x, y, w, h}
        if isinstance(bbox, dict):
            x, y = int(bbox["x"]), int(bbox["y"])
            w, h = int(bbox["w"]), int(bbox["h"])
            if w > 2 and h > 2:
                return _rect_to_quad(x, y, w, h)

        # tuple/list style: (x, y, w, h). The scalar check keeps a list of four
        # [x, y] points (also length 4) out of this branch; previously int()
        # on a point raised and the quad was silently discarded.
        if (
            isinstance(bbox, (tuple, list))
            and len(bbox) == 4
            and all(np.ndim(v) == 0 for v in bbox)
        ):
            x, y, w, h = [int(v) for v in bbox]
            if w > 2 and h > 2:
                return _rect_to_quad(x, y, w, h)

        # 4-point quad style: [[x,y], ...]
        arr = np.array(bbox, dtype=np.int64)
        if arr.ndim == 2 and arr.shape == (4, 2):
            return arr.reshape(1, 4, 2)
        if arr.ndim == 3 and arr.shape[1:] == (4, 2):
            return arr[:1]
    except Exception:
        # Deliberate best-effort: any malformed bbox means "not available",
        # and the caller falls back to the region's own min rect.
        return None
    return None


def _get_region_base_quad(region: TextBlock) -> np.ndarray:
    """Preferred placement quad: inpaint-aligned bbox, then region min rect."""
    inpaint_quad = _quad_from_inpaint_bbox(region)
    if inpaint_quad is not None:
        return inpaint_quad.astype(np.int64)
    return region.min_rect.astype(np.int64)


def _resize_regions_to_font_size(
    img: np.ndarray,
    text_regions: List[TextBlock],
    font_size_offset: int,
    font_size_minimum: int,
):
    """Expand text bounding boxes when translated text is longer than the original.

    Args:
        img: Target image; only its height/width are read, to derive the
            minimum font size when ``font_size_minimum`` is ``-1``.
        text_regions: Regions to process. ``region.font_size`` is overwritten
            with the chosen target size.
        font_size_offset: Added to each region's original font size.
        font_size_minimum: Lower bound for the font size (``-1`` = derive from
            image size); always clamped to at least 1.

    Returns:
        One destination quad per region, in input order.
    """
    if font_size_minimum == -1:
        font_size_minimum = round((img.shape[0] + img.shape[1]) / 200)
    font_size_minimum = max(1, font_size_minimum)

    dst_points_list = []
    for region in text_regions:
        base_quad = _get_region_base_quad(region)
        _, _, base_w, base_h = cv2.boundingRect(base_quad[0].astype(np.int32))
        base_w = max(1, int(base_w))
        base_h = max(1, int(base_h))

        original_fs = region.font_size
        if original_fs <= 0:
            original_fs = font_size_minimum

        target_fs = original_fs + font_size_offset
        target_fs = max(target_fs, font_size_minimum, 1)

        # Keep font-size nudging mild; geometry scaling is driven by overflow estimates.
        ratio = _translation_length_ratio(region)
        if ratio > 1.0:
            target_fs = int(round(target_fs * min(1.35, 1.0 + 0.18 * (ratio - 1.0))))
            target_fs = max(target_fs, font_size_minimum, 1)

        # Single overflow-driven scaling decision on the true base region.
        base_scale = 1.0
        if original_fs > 0:
            fs_growth = max(0.0, (target_fs - original_fs) / float(original_fs))
            base_scale = 1.0 + 0.35 * fs_growth

        over_x, over_y = _estimate_overflow_scales(
            region,
            target_fs,
            avail_w=base_w,
            avail_h=base_h,
        )
        severity = max(base_scale, over_x, over_y)
        cap = _fallback_scale_cap(region, severity)
        final_scale_x = min(max(max(base_scale, over_x), 1.0), cap)
        final_scale_y = min(max(max(base_scale, over_y), 1.0), cap)

        if final_scale_x > 1.001 or final_scale_y > 1.001:
            try:
                poly = Polygon(base_quad[0])
                poly = affinity.scale(poly, xfact=final_scale_x, yfact=final_scale_y, origin='center')
                scaled_pts = np.array(poly.exterior.coords[:4])
                dst_points = scaled_pts.reshape(-1, 4, 2).astype(np.int64)
            except Exception:
                # Best-effort: keep the unscaled quad if the geometry is unusable.
                dst_points = base_quad
        else:
            dst_points = base_quad

        dst_points_list.append(dst_points)
        region.font_size = int(target_fs)

    return dst_points_list


def _order_points(pts: np.ndarray) -> np.ndarray:
    """Reorder 4 corner points to [TL, TR, BR, BL] regardless of input ordering.

    Uses the x+y sum (min = TL, max = BR) and the y-x difference
    (min = TR, max = BL). Returns a float64 array of shape ``(4, 2)``.
    """
    pts = pts.reshape(4, 2).astype(np.float64)
    s = pts.sum(axis=1)
    d = np.diff(pts, axis=1).flatten()  # y - x
    tl = pts[np.argmin(s)]
    br = pts[np.argmax(s)]
    tr = pts[np.argmin(d)]
    bl = pts[np.argmax(d)]
    return np.array([tl, tr, br, bl])


def _compute_inner_margin(target_w: int, target_h: int, text_len: int) -> int:
    """Compute smooth, text-aware inner margin for bubble rendering.

    The margin scales with the smaller box dimension, shrinks slightly for
    longer text, and is clamped to the range 2..10 for boxes of 120 px or more.
    """
    min_dim = max(1, min(target_w, target_h))
    # Tiny boxes: keep margins minimal to preserve usable layout space.
    if min_dim < 70:
        return max(1, int(round(min_dim * 0.02)))
    if min_dim < 120:
        return max(2, int(round(min_dim * 0.03)))
    base = 0.040 * min_dim + 1.0
    density_factor = max(0.86, min(1.08, 1.02 - 0.0022 * text_len))
    margin = int(round(base * density_factor))
    return max(2, min(10, margin))


def _center_text_in_box(temp_box: np.ndarray, target_w: int, target_h: int) -> np.ndarray:
    """Center rendered text in a target box without additional raster shrink.

    Content larger than the target is center-cropped; smaller content is
    centered on a zero-filled 4-channel uint8 canvas of the target size.
    """
    target_w = max(1, int(target_w))
    target_h = max(1, int(target_h))
    h, w, _ = temp_box.shape
    out = np.zeros((target_h, target_w, 4), dtype=np.uint8)

    draw_w = min(w, target_w)
    draw_h = min(h, target_h)
    x0 = max(0, (target_w - draw_w) // 2)
    y0 = max(0, (target_h - draw_h) // 2)
    src_x0 = max(0, (w - draw_w) // 2)
    src_y0 = max(0, (h - draw_h) // 2)

    out[y0:y0 + draw_h, x0:x0 + draw_w] = temp_box[src_y0:src_y0 + draw_h, src_x0:src_x0 + draw_w]
    return out


def _render_temp_text_box(
    region: TextBlock,
    font_size: int,
    width: int,
    height: int,
    fg,
    bg,
    hyphenate,
    line_spacing,
    render_h: bool,
):
    """Rasterize the region's translation via ``text_render``.

    Dispatches to ``put_text_horizontal`` or ``put_text_vertical`` depending
    on ``render_h`` and returns whatever that call returns (callers treat
    ``None`` as "nothing rendered").
    """
    if render_h:
        return text_render.put_text_horizontal(
            font_size,
            region.get_translation_for_rendering(),
            width,
            height,
            region.alignment,
            region.direction == 'hl',
            fg,
            bg,
            region.target_lang,
            hyphenate,
            line_spacing,
        )
    return text_render.put_text_vertical(
        font_size,
        region.get_translation_for_rendering(),
        height,
        region.alignment,
        fg,
        bg,
        line_spacing,
    )


def _sanitize_dst_quad(dst_points: np.ndarray, region: TextBlock) -> np.ndarray:
    """Sanitize destination quadrilateral to reduce homography placement errors.

    The quad is reordered to [TL, TR, BR, BL]. If it is tiny (area < 20),
    non-convex, or has an edge shorter than 4 px, the region's base quad is
    used instead. Returns a float32 array of shape ``(1, 4, 2)``.
    """
    pts = _order_points(dst_points[0]).astype(np.float32)

    # Use region min rect if the incoming quad is degenerate.
    def _region_fallback():
        return _order_points(_get_region_base_quad(region)[0]).astype(np.float32)

    area = abs(cv2.contourArea(pts.astype(np.int32)))
    if area < 20:
        pts = _region_fallback()

    if not cv2.isContourConvex(pts.astype(np.int32)):
        pts = _region_fallback()

    edges = [
        np.linalg.norm(pts[1] - pts[0]),
        np.linalg.norm(pts[2] - pts[1]),
        np.linalg.norm(pts[3] - pts[2]),
        np.linalg.norm(pts[0] - pts[3]),
    ]
    if min(edges) < 4.0:
        pts = _region_fallback()

    # If destination is near-axis aligned but region angle is not, nudge orientation.
    vec = pts[1] - pts[0]
    quad_angle = np.degrees(np.arctan2(float(vec[1]), float(vec[0])))
    region_angle = float(getattr(region, "angle", 0.0) or 0.0)
    if abs(region_angle) > 10.0 and abs(quad_angle) < 3.0 and abs(region_angle) < 45.0:
        # Rotate about the quad centroid by 35% of the region angle, at most 10 degrees.
        rot_deg = float(np.clip(region_angle * 0.35, -10.0, 10.0))
        theta = np.deg2rad(rot_deg)
        c, s = np.cos(theta), np.sin(theta)
        R = np.array([[c, -s], [s, c]], dtype=np.float32)
        center = np.mean(pts, axis=0, keepdims=True)
        pts = (pts - center) @ R.T + center

    return pts[np.newaxis].astype(np.float32)


def _estimate_overflow_scales(
    region: TextBlock,
    target_fs: int,
    avail_w: Optional[float] = None,
    avail_h: Optional[float] = None,
) -> tuple[float, float]:
    """Estimate overflow-driven expansion scales from the current base region size.

    Lays the translation out with ``text_render.calc_horizontal`` /
    ``calc_vertical`` at ``target_fs`` and compares the needed rows/columns
    and longest line against the available size (``avail_w``/``avail_h``,
    defaulting to ``region.unrotated_size``).

    Returns:
        ``(scale_x, scale_y)``, each clamped to the range 1.0..2.5.
    """
    translation = region.get_translation_for_rendering()
    lang = getattr(region, "target_lang", "en_US")
    base_w = float(avail_w if avail_w is not None else region.unrotated_size[0])
    base_h = float(avail_h if avail_h is not None else region.unrotated_size[1])
    base_w = max(base_w, 1.0)
    base_h = max(base_h, 1.0)

    if region.horizontal:
        lines, widths = text_render.calc_horizontal(
            target_fs,
            translation,
            max_width=base_w,
            max_height=base_h,
            language=lang,
        )
        used_rows = max(len(region.texts), 1)
        needed_rows = max(len(lines), 1)
        row_overflow = max(1.0, needed_rows / used_rows)
        width_overflow = max(1.0, (max(widths) if widths else 0) / base_w)
        # Extra rows mostly grow the box vertically, with a milder horizontal share.
        scale_x = max(width_overflow, 1.0 + 0.35 * (row_overflow - 1.0))
        scale_y = max(1.0, row_overflow)
    else:
        cols, col_heights = text_render.calc_vertical(
            target_fs,
            translation,
            max_height=base_h,
        )
        used_cols = max(len(region.texts), 1)
        needed_cols = max(len(cols), 1)
        col_overflow = max(1.0, needed_cols / used_cols)
        height_overflow = max(1.0, (max(col_heights) if col_heights else 0) / base_h)
        scale_x = max(1.0, col_overflow)
        scale_y = max(height_overflow, 1.0 + 0.35 * (col_overflow - 1.0))

    return min(max(scale_x, 1.0), 2.5), min(max(scale_y, 1.0), 2.5)


def _render_region(img, region: TextBlock, dst_points, hyphenate, line_spacing, disable_font_border):
    """Render one region's translation into ``img`` at ``dst_points``.

    The text is rasterized into an axis-aligned box sized from the quad's
    mid-edge distances, shrunk by font size (never by raster scaling) until it
    fits the inner margin or the step budget runs out, then warped onto the
    quad and alpha-blended into ``img`` in place.

    ``region.font_size`` is updated when the fit loop shrinks the font.

    Returns:
        ``img`` (modified in place; returned unchanged when nothing could be
        rendered or placed).
    """
    fg, bg = region.get_font_colors()
    fg, bg = _fg_bg_compare(fg, bg)
    if disable_font_border:
        bg = None

    # Sanitize destination points so homography does not amplify bad geometry.
    dst_points = _sanitize_dst_quad(dst_points, region)

    middle_pts = (dst_points[:, [1, 2, 3, 0]] + dst_points) / 2
    norm_h = np.linalg.norm(middle_pts[:, 1] - middle_pts[:, 3], axis=1)
    norm_v = np.linalg.norm(middle_pts[:, 2] - middle_pts[:, 0], axis=1)

    render_h = _resolve_render_horizontal(region)

    target_w = max(1, int(round(norm_h[0])))
    target_h = max(1, int(round(norm_v[0])))

    temp_box = _render_temp_text_box(
        region,
        region.font_size,
        target_w,
        target_h,
        fg,
        bg,
        hyphenate,
        line_spacing,
        render_h,
    )
    if temp_box is None:
        return img

    margin = _compute_inner_margin(target_w, target_h, len(region.get_translation_for_rendering()))
    inner_w = max(1, target_w - margin * 2)
    inner_h = max(1, target_h - margin * 2)

    def _overflow_ratio(rendered) -> float:
        # Worst relative overshoot of the rendered box versus the inner area.
        return max(
            max(rendered.shape[1] - inner_w, 0) / max(inner_w, 1),
            max(rendered.shape[0] - inner_h, 0) / max(inner_h, 1),
        )

    # Fit by font-size adjustment only. Avoid additional raster downscaling.
    bubble_conf = _region_bubble_confidence(region)
    shrink_step_ratio = 0.03 if bubble_conf < 0.45 else 0.05
    max_fit_steps = 4 if bubble_conf < 0.45 else 6
    fit_steps = 0
    while (
        (temp_box.shape[1] - inner_w > 2 or temp_box.shape[0] - inner_h > 2)
        and fit_steps < max_fit_steps
    ):
        fs0 = int(region.font_size)
        trial_fs = max(8, fs0 - max(1, int(round(fs0 * shrink_step_ratio))))
        if trial_fs >= fs0:
            break
        prev_over = _overflow_ratio(temp_box)
        trial_box = _render_temp_text_box(
            region,
            trial_fs,
            target_w,
            target_h,
            fg,
            bg,
            hyphenate,
            line_spacing,
            render_h,
        )
        if trial_box is None:
            break
        next_over = _overflow_ratio(trial_box)
        # Stop when shrinking no longer reduces the overflow meaningfully.
        if next_over > prev_over - 0.01:
            break
        temp_box = trial_box
        region.font_size = trial_fs
        fit_steps += 1

    centered_inner = _center_text_in_box(temp_box, inner_w, inner_h)
    box = np.zeros((target_h, target_w, 4), dtype=np.uint8)
    ox = max(0, (target_w - inner_w) // 2)
    oy = max(0, (target_h - inner_h) // 2)
    box[oy:oy + inner_h, ox:ox + inner_w] = centered_inner

    src_pts = np.array(
        [[0, 0], [box.shape[1], 0], [box.shape[1], box.shape[0]], [0, box.shape[0]]]
    ).astype(np.float32)

    M, _ = cv2.findHomography(src_pts, dst_points, cv2.RANSAC, 5.0)
    if M is None:
        # No usable transform for this quad; leave the image untouched
        # rather than crash inside warpPerspective.
        return img

    rgba = cv2.warpPerspective(
        box,
        M,
        (img.shape[1], img.shape[0]),
        flags=cv2.INTER_LINEAR,
        borderMode=cv2.BORDER_CONSTANT,
        borderValue=0,
    )

    x, y, rw, rh = cv2.boundingRect(dst_points.astype(np.int32))
    # Clamp to the image: a quad reaching past the top/left edge yields a
    # negative origin, which as a slice start would index from the far end
    # and select the wrong (usually empty) window.
    img_h, img_w = img.shape[:2]
    x0, y0 = max(0, x), max(0, y)
    x1, y1 = min(img_w, x + rw), min(img_h, y + rh)
    if x1 <= x0 or y1 <= y0:
        return img

    canvas = rgba[y0:y1, x0:x1, :3]
    mask = rgba[y0:y1, x0:x1, 3:4].astype(np.float32) / 255.0
    img[y0:y1, x0:x1] = np.clip(
        img[y0:y1, x0:x1].astype(np.float32) * (1 - mask) + canvas.astype(np.float32) * mask,
        0,
        255,
    ).astype(np.uint8)
    return img


def _find_optimal_font_size(
    text,
    box_w,
    box_h,
    initial_fs,
    lang,
    min_fs=10,
    render_h: bool = True,
    line_spacing: Optional[float] = None,
    region: Optional[TextBlock] = None,
):
    """Binary-search for the largest font size that fits text inside box.

    The search range is ``min_fs`` up to a cap derived from the smaller box
    dimension, the text density ("complexity") and ``initial_fs``. Fit is
    checked with ``text_render.calc_horizontal`` / ``calc_vertical`` against
    the box minus an adaptive margin; low bubble confidence (< 0.45) loosens
    margin, growth cap and slack.

    Args:
        text: Text to lay out.
        box_w, box_h: Box size in pixels.
        initial_fs: Current font size; bounds how far the result may grow.
        lang: Language passed to the horizontal layout calculation.
        min_fs: Smallest size considered; returned when nothing larger fits.
        render_h: True for horizontal layout, False for vertical.
        line_spacing: Spacing as a fraction of the font size; defaults to
            0.15 (horizontal) / 0.10 (vertical) when ``None``.
        region: Optional region supplying ``_bubble_confidence``.

    Returns:
        The largest fitting font size found, at least ``min_fs``.
    """
    # A None/missing confidence counts as 0.0, matching every other call site.
    bubble_conf = _region_bubble_confidence(region) if region is not None else 0.0
    low_conf = bubble_conf < 0.45
    text_len = _count_text_length(text)

    # Calculate adaptive safe margin (looser for low-confidence fallback regions)
    min_dim = max(1, min(box_w, box_h))

    # Higher text density in the same region is treated as a complex case and
    # gets a slightly more conservative fit target.
    density = text_len / float(max(min_dim, 1))
    complexity = max(0.0, min(1.0, (density - 0.22) / 0.35))

    margin_ratio = max(0.02, min(0.10, 7 / min_dim))
    if low_conf:
        margin_ratio *= 0.85
    margin_ratio *= 1.0 + 0.20 * complexity
    margin = max(4, int(min_dim * margin_ratio))

    horizontal_spacing = float(line_spacing) if line_spacing is not None else 0.15
    vertical_spacing = float(line_spacing) if line_spacing is not None else 0.10

    safe_w = max(margin * 2, box_w - margin * 2)
    safe_h = max(margin * 2, box_h - margin * 2)

    # Smooth formula for max font size based on smallest dimension
    if min_dim < 80:
        max_fs = min(int(min_dim * 0.56), 40)
    elif min_dim < 200:
        max_fs = int(40 + (min_dim - 80) * 0.30)
    else:
        max_fs = min(int(min_dim * 0.44), 168)

    # Tone down the upper bound further for dense/complex bubbles.
    max_fs = int(round(max_fs * (1.0 - 0.12 * complexity)))

    # Relax growth caps, especially for low-confidence fallback regions.
    growth_mult = 3.2 if low_conf else 2.6
    max_fs = min(max_fs, int(max(initial_fs * growth_mult, min_fs + 8)))
    max_fs = max(max_fs, min_fs + 6)

    # Tighten fit slack when complexity is high to avoid visually oversized text.
    slack_tighten = 0.03 * complexity
    # Loop-invariant tolerances: "stack" is the axis along which lines/columns
    # pile up, "run" is the axis along which a single line/column extends.
    stack_slack = max(1.0, (1.04 if low_conf else 1.01) - slack_tighten)
    run_slack = max(1.0, (1.05 if low_conf else 1.02) - slack_tighten)

    lo, hi = min_fs, max_fs
    best = min_fs

    # More iterations for precision
    for _ in range(14):
        if lo > hi:
            break
        mid = (lo + hi) // 2
        if mid < 1:
            break

        # Layout-aware fit check (horizontal and vertical text behave differently).
        if render_h:
            lines, widths = text_render.calc_horizontal(mid, text, safe_w, safe_h, lang)
            line_spacing_px = max(int(mid * horizontal_spacing), 3)
            total_h = mid * len(lines) + line_spacing_px * max(0, len(lines) - 1) + margin * 2
            max_line_w = max(widths) if widths else 0
            fits = (
                total_h <= box_h * stack_slack
                and (max_line_w + margin * 2) <= box_w * run_slack
            )
        else:
            cols, col_heights = text_render.calc_vertical(mid, text, safe_h)
            col_spacing = max(int(mid * vertical_spacing), 2)
            total_w = mid * len(cols) + col_spacing * max(0, len(cols) - 1) + margin * 2
            max_col_h = max(col_heights) if col_heights else 0
            fits = (
                total_w <= box_w * stack_slack
                and (max_col_h + margin * 2) <= box_h * run_slack
            )

        if fits:
            best = mid
            lo = mid + 1
        else:
            hi = mid - 1

    return best


async def dispatch(
    img: np.ndarray,
    text_regions: List[TextBlock],
    font_path: str = '',
    font_size_offset: int = 0,
    font_size_minimum: int = 0,
    hyphenate: bool = True,
    line_spacing: Optional[float] = None,
    disable_font_border: bool = False,
) -> np.ndarray:
    """Render all translated regions onto ``img`` and return it.

    Steps:
        1. Regions without a translation are dropped; ``detect_bubbles`` is
           asked for a bubble rectangle per remaining region. Where one is
           found, the font size is fitted to it and it becomes the
           destination quad.
        2. Regions without a bubble get an expanded text-line box from
           ``_resize_regions_to_font_size``.
        3. Every region is rendered into the image in order.

    Args:
        img: Image to draw on (modified in place by the render step).
        text_regions: Candidate regions; ``font_size`` and
            ``_bubble_confidence`` on them may be updated.
        font_path: Passed to ``text_render.set_font``.
        font_size_offset: Font size delta for non-bubble regions.
        font_size_minimum: Minimum font size for non-bubble regions
            (``-1`` = derive from image size).
        hyphenate: Forwarded to horizontal text rendering.
        line_spacing: Spacing forwarded to layout/rendering; ``None`` uses
            the downstream defaults.
        disable_font_border: When true, text is rendered without a
            background/border color.
    """
    text_render.set_font(font_path)
    text_regions = [r for r in text_regions if r.translation]

    # ── 1. Detect speech bubbles ─────────────────────────────────────
    bubble_rects = detect_bubbles(img, text_regions)

    dst_points_list: list = [None] * len(text_regions)
    non_bubble_indices: list[int] = []
    non_bubble_regions: list[TextBlock] = []

    for i, (region, bubble_rect) in enumerate(zip(text_regions, bubble_rects)):
        if bubble_rect is not None:
            bw = int(bubble_rect[0, 1, 0] - bubble_rect[0, 0, 0])
            bh = int(bubble_rect[0, 2, 1] - bubble_rect[0, 0, 1])
            render_h = _resolve_render_horizontal(region)
            optimal_fs = _find_optimal_font_size(
                region.get_translation_for_rendering(),
                bw,
                bh,
                region.font_size,
                getattr(region, "target_lang", "en_US"),
                render_h=render_h,
                line_spacing=line_spacing,
                region=region,
            )
            region.font_size = optimal_fs
            dst_points_list[i] = bubble_rect
        else:
            # Normalize the attribute so later steps always see a float.
            region._bubble_confidence = _region_bubble_confidence(region)
            non_bubble_indices.append(i)
            non_bubble_regions.append(region)

    # ── 2. Fallback: expand textline boxes for non-bubble regions ────
    if non_bubble_regions:
        fallback = _resize_regions_to_font_size(
            img,
            non_bubble_regions,
            font_size_offset,
            font_size_minimum,
        )
        for idx, pts in zip(non_bubble_indices, fallback):
            dst_points_list[idx] = pts

    # ── 3. Render ────────────────────────────────────────────────────
    for region, dst_points in zip(text_regions, dst_points_list):
        img = _render_region(img, region, dst_points, hyphenate, line_spacing, disable_font_border)

    return img