Spaces:

Snipshot
/

snipshot-backend

Running

File size: 21,909 Bytes

0f6f6c1

# Rendering dispatch — places translated text back onto the image.

import cv2
import numpy as np
from typing import List, Optional
from shapely import affinity
from shapely.geometry import Polygon

from . import text_render
from .bubble import detect_bubbles
from ..utils import TextBlock, color_difference, get_logger, rotate_polygons

logger = get_logger("render")


def _fg_bg_compare(fg, bg):
    fg_avg = np.mean(fg)
    if color_difference(fg, bg) < 30:
        bg = (255, 255, 255) if fg_avg <= 127 else (0, 0, 0)
    return fg, bg


def _count_text_length(text: str) -> float:
    half_width_chars = 'っッぁぃぅぇぉ'
    length = 0.0
    for char in text.strip():
        if char in half_width_chars:
            length += 0.5
        else:
            length += 1.0
    return length


def _fallback_scale_cap(region: TextBlock, severity: float) -> float:
    """Single dynamic cap for fallback expansion, confidence-driven."""
    bubble_conf = float(getattr(region, "_bubble_confidence", 0.0) or 0.0)

    if bubble_conf < 0.25:
        cap = 2.5
    elif bubble_conf < 0.45:
        cap = 2.2
    elif bubble_conf < 0.65:
        cap = 1.9
    elif bubble_conf < 0.80:
        cap = 1.7
    else:
        cap = 1.5

    if severity > 2.0:
        cap = min(2.5, cap + 0.1)
    return cap


def _early_fallback_bias(region: TextBlock) -> float:
    """Small optional bias for very low-confidence fallback only."""
    bubble_conf = float(getattr(region, "_bubble_confidence", 0.0) or 0.0)
    ratio = _translation_length_ratio(region)

    if bubble_conf < 0.20 and ratio > 1.2:
        return 1.2
    if bubble_conf < 0.30 and ratio > 1.6:
        return 1.1
    return 1.0


def _translation_length_ratio(region: TextBlock) -> float:
    orig_text = getattr(region, "text_raw", region.text)
    char_count_orig = _count_text_length(orig_text)
    char_count_trans = _count_text_length((region.translation or "").strip())
    if char_count_orig <= 0:
        return 1.0
    return max(0.5, char_count_trans / char_count_orig)


def _rect_to_quad(x: int, y: int, w: int, h: int) -> np.ndarray:
    return np.array([[[x, y], [x + w, y], [x + w, y + h], [x, y + h]]], dtype=np.int64)


def _quad_from_inpaint_bbox(region: TextBlock) -> Optional[np.ndarray]:
    """Try to read inpaint-aligned bbox if available on the region."""
    bbox = getattr(region, "inpaint_bbox", None)
    if bbox is None:
        return None

    try:
        # dict style: {x, y, w, h}
        if isinstance(bbox, dict):
            x, y = int(bbox["x"]), int(bbox["y"])
            w, h = int(bbox["w"]), int(bbox["h"])
            if w > 2 and h > 2:
                return _rect_to_quad(x, y, w, h)

        # tuple/list style: (x, y, w, h)
        if isinstance(bbox, (tuple, list)) and len(bbox) == 4:
            x, y, w, h = [int(v) for v in bbox]
            if w > 2 and h > 2:
                return _rect_to_quad(x, y, w, h)

        # 4-point quad style: [[x,y], ...]
        arr = np.array(bbox, dtype=np.int64)
        if arr.ndim == 2 and arr.shape == (4, 2):
            return arr.reshape(1, 4, 2)
        if arr.ndim == 3 and arr.shape[1:] == (4, 2):
            return arr[:1]
    except Exception:
        return None

    return None


def _get_region_base_quad(region: TextBlock) -> np.ndarray:
    """Preferred placement quad: inpaint-aligned bbox, then region min rect."""
    inpaint_quad = _quad_from_inpaint_bbox(region)
    if inpaint_quad is not None:
        return inpaint_quad.astype(np.int64)
    return region.min_rect.astype(np.int64)


def _resize_regions_to_font_size(
    img: np.ndarray,
    text_regions: List[TextBlock],
    font_size_offset: int,
    font_size_minimum: int,
):
    """Expand text bounding boxes when translated text is longer than the original."""
    if font_size_minimum == -1:
        font_size_minimum = round((img.shape[0] + img.shape[1]) / 200)
    font_size_minimum = max(1, font_size_minimum)

    dst_points_list = []
    for region in text_regions:
        base_quad = _get_region_base_quad(region)
        _, _, base_w, base_h = cv2.boundingRect(base_quad[0].astype(np.int32))
        base_w = max(1, int(base_w))
        base_h = max(1, int(base_h))

        original_fs = region.font_size
        if original_fs <= 0:
            original_fs = font_size_minimum

        target_fs = original_fs + font_size_offset
        target_fs = max(target_fs, font_size_minimum, 1)

        # Keep font-size nudging mild; geometry scaling is driven by overflow estimates.
        ratio = _translation_length_ratio(region)
        if ratio > 1.0:
            target_fs = int(round(target_fs * min(1.35, 1.0 + 0.18 * (ratio - 1.0))))
            target_fs = max(target_fs, font_size_minimum, 1)

        # Single overflow-driven scaling decision on the true base region.
        base_scale = 1.0
        if original_fs > 0:
            fs_growth = max(0.0, (target_fs - original_fs) / float(original_fs))
            base_scale = 1.0 + 0.35 * fs_growth

        over_x, over_y = _estimate_overflow_scales(
            region,
            target_fs,
            avail_w=base_w,
            avail_h=base_h,
        )
        severity = max(base_scale, over_x, over_y)
        cap = _fallback_scale_cap(region, severity)

        final_scale_x = min(max(max(base_scale, over_x), 1.0), cap)
        final_scale_y = min(max(max(base_scale, over_y), 1.0), cap)

        if final_scale_x > 1.001 or final_scale_y > 1.001:
            try:
                poly = Polygon(base_quad[0])
                poly = affinity.scale(poly, xfact=final_scale_x, yfact=final_scale_y, origin='center')
                scaled_pts = np.array(poly.exterior.coords[:4])
                dst_points = scaled_pts.reshape(-1, 4, 2).astype(np.int64)
            except Exception:
                dst_points = base_quad
        else:
            dst_points = base_quad

        dst_points_list.append(dst_points)
        region.font_size = int(target_fs)

    return dst_points_list


def _order_points(pts: np.ndarray) -> np.ndarray:
    """Reorder 4 corner points to [TL, TR, BR, BL] regardless of input ordering."""
    pts = pts.reshape(4, 2).astype(np.float64)
    s = pts.sum(axis=1)
    d = np.diff(pts, axis=1).flatten()  # y - x
    tl = pts[np.argmin(s)]
    br = pts[np.argmax(s)]
    tr = pts[np.argmin(d)]
    bl = pts[np.argmax(d)]
    return np.array([tl, tr, br, bl])


def _compute_inner_margin(target_w: int, target_h: int, text_len: int) -> int:
    """Compute smooth, text-aware inner margin for bubble rendering."""
    min_dim = max(1, min(target_w, target_h))

    # Tiny boxes: keep margins minimal to preserve usable layout space.
    if min_dim < 70:
        return max(1, int(round(min_dim * 0.02)))

    if min_dim < 120:
        return max(2, int(round(min_dim * 0.03)))

    base = 0.040 * min_dim + 1.0
    density_factor = max(0.86, min(1.08, 1.02 - 0.0022 * text_len))
    margin = int(round(base * density_factor))
    return max(2, min(10, margin))


def _center_text_in_box(temp_box: np.ndarray, target_w: int, target_h: int) -> np.ndarray:
    """Center rendered text in a target box without additional raster shrink."""
    target_w = max(1, int(target_w))
    target_h = max(1, int(target_h))

    h, w, _ = temp_box.shape
    content = temp_box

    out = np.zeros((target_h, target_w, 4), dtype=np.uint8)
    draw_w = min(w, target_w)
    draw_h = min(h, target_h)

    x0 = max(0, (target_w - draw_w) // 2)
    y0 = max(0, (target_h - draw_h) // 2)

    src_x0 = max(0, (w - draw_w) // 2)
    src_y0 = max(0, (h - draw_h) // 2)
    out[y0:y0 + draw_h, x0:x0 + draw_w] = content[src_y0:src_y0 + draw_h, src_x0:src_x0 + draw_w]
    return out


def _render_temp_text_box(
    region: TextBlock,
    font_size: int,
    width: int,
    height: int,
    fg,
    bg,
    hyphenate,
    line_spacing,
    render_h: bool,
):
    if render_h:
        return text_render.put_text_horizontal(
            font_size,
            region.get_translation_for_rendering(),
            width,
            height,
            region.alignment,
            region.direction == 'hl',
            fg,
            bg,
            region.target_lang,
            hyphenate,
            line_spacing,
        )
    return text_render.put_text_vertical(
        font_size,
        region.get_translation_for_rendering(),
        height,
        region.alignment,
        fg,
        bg,
        line_spacing,
    )


def _sanitize_dst_quad(dst_points: np.ndarray, region: TextBlock) -> np.ndarray:
    """Sanitize destination quadrilateral to reduce homography placement errors."""
    pts = _order_points(dst_points[0]).astype(np.float32)

    # Use region min rect if the incoming quad is degenerate.
    def _region_fallback():
        return _order_points(_get_region_base_quad(region)[0]).astype(np.float32)

    area = abs(cv2.contourArea(pts.astype(np.int32)))
    if area < 20:
        pts = _region_fallback()

    if not cv2.isContourConvex(pts.astype(np.int32)):
        pts = _region_fallback()

    edges = [
        np.linalg.norm(pts[1] - pts[0]),
        np.linalg.norm(pts[2] - pts[1]),
        np.linalg.norm(pts[3] - pts[2]),
        np.linalg.norm(pts[0] - pts[3]),
    ]
    if min(edges) < 4.0:
        pts = _region_fallback()

    # If destination is near-axis aligned but region angle is not, nudge orientation.
    vec = pts[1] - pts[0]
    quad_angle = np.degrees(np.arctan2(float(vec[1]), float(vec[0])))
    region_angle = float(getattr(region, "angle", 0.0) or 0.0)
    if abs(region_angle) > 10.0 and abs(quad_angle) < 3.0 and abs(region_angle) < 45.0:
        rot_deg = float(np.clip(region_angle * 0.35, -10.0, 10.0))
        theta = np.deg2rad(rot_deg)
        c, s = np.cos(theta), np.sin(theta)
        R = np.array([[c, -s], [s, c]], dtype=np.float32)
        center = np.mean(pts, axis=0, keepdims=True)
        pts = (pts - center) @ R.T + center

    return pts[np.newaxis].astype(np.float32)


def _estimate_overflow_scales(
    region: TextBlock,
    target_fs: int,
    avail_w: Optional[float] = None,
    avail_h: Optional[float] = None,
) -> tuple[float, float]:
    """Estimate overflow-driven expansion scales from the current base region size."""
    translation = region.get_translation_for_rendering()
    lang = getattr(region, "target_lang", "en_US")

    base_w = float(avail_w if avail_w is not None else region.unrotated_size[0])
    base_h = float(avail_h if avail_h is not None else region.unrotated_size[1])
    base_w = max(base_w, 1.0)
    base_h = max(base_h, 1.0)

    if region.horizontal:
        lines, widths = text_render.calc_horizontal(
            target_fs,
            translation,
            max_width=base_w,
            max_height=base_h,
            language=lang,
        )
        used_rows = max(len(region.texts), 1)
        needed_rows = max(len(lines), 1)
        row_overflow = max(1.0, needed_rows / used_rows)
        width_overflow = max(1.0, (max(widths) if widths else 0) / base_w)
        scale_x = max(width_overflow, 1.0 + 0.35 * (row_overflow - 1.0))
        scale_y = max(1.0, row_overflow)
    else:
        cols, col_heights = text_render.calc_vertical(
            target_fs,
            translation,
            max_height=base_h,
        )
        used_cols = max(len(region.texts), 1)
        needed_cols = max(len(cols), 1)
        col_overflow = max(1.0, needed_cols / used_cols)
        height_overflow = max(1.0, (max(col_heights) if col_heights else 0) / base_h)
        scale_x = max(1.0, col_overflow)
        scale_y = max(height_overflow, 1.0 + 0.35 * (col_overflow - 1.0))

    return min(max(scale_x, 1.0), 2.5), min(max(scale_y, 1.0), 2.5)


def _render_region(img, region: TextBlock, dst_points, hyphenate, line_spacing, disable_font_border):
    fg, bg = region.get_font_colors()
    fg, bg = _fg_bg_compare(fg, bg)
    if disable_font_border:
        bg = None

    # Sanitize destination points so homography does not amplify bad geometry.
    dst_points = _sanitize_dst_quad(dst_points, region)

    middle_pts = (dst_points[:, [1, 2, 3, 0]] + dst_points) / 2
    norm_h = np.linalg.norm(middle_pts[:, 1] - middle_pts[:, 3], axis=1)
    norm_v = np.linalg.norm(middle_pts[:, 2] - middle_pts[:, 0], axis=1)

    forced_dir = region._direction if hasattr(region, "_direction") else region.direction
    if forced_dir != "auto":
        render_h = forced_dir in ("horizontal", "h")
    else:
        render_h = region.horizontal

    target_w = max(1, int(round(norm_h[0])))
    target_h = max(1, int(round(norm_v[0])))

    temp_box = _render_temp_text_box(
        region,
        region.font_size,
        target_w,
        target_h,
        fg,
        bg,
        hyphenate,
        line_spacing,
        render_h,
    )

    if temp_box is None:
        return img

    margin = _compute_inner_margin(target_w, target_h, len(region.get_translation_for_rendering()))

    inner_w = max(1, target_w - margin * 2)
    inner_h = max(1, target_h - margin * 2)

    # Fit by font-size adjustment only. Avoid additional raster downscaling.
    bubble_conf = float(getattr(region, "_bubble_confidence", 0.0) or 0.0)
    shrink_step_ratio = 0.03 if bubble_conf < 0.45 else 0.05
    max_fit_steps = 4 if bubble_conf < 0.45 else 6

    fit_steps = 0
    while (
        (temp_box.shape[1] - inner_w > 2 or temp_box.shape[0] - inner_h > 2)
        and fit_steps < max_fit_steps
    ):
        fs0 = int(region.font_size)
        trial_fs = max(8, fs0 - max(1, int(round(fs0 * shrink_step_ratio))))
        if trial_fs >= fs0:
            break
        prev_over = max(
            max(temp_box.shape[1] - inner_w, 0) / max(inner_w, 1),
            max(temp_box.shape[0] - inner_h, 0) / max(inner_h, 1),
        )
        trial_box = _render_temp_text_box(
            region,
            trial_fs,
            target_w,
            target_h,
            fg,
            bg,
            hyphenate,
            line_spacing,
            render_h,
        )
        if trial_box is None:
            break
        next_over = max(
            max(trial_box.shape[1] - inner_w, 0) / max(inner_w, 1),
            max(trial_box.shape[0] - inner_h, 0) / max(inner_h, 1),
        )
        if next_over > prev_over - 0.01:
            break
        temp_box = trial_box
        region.font_size = trial_fs
        fit_steps += 1

    centered_inner = _center_text_in_box(temp_box, inner_w, inner_h)

    box = np.zeros((target_h, target_w, 4), dtype=np.uint8)
    ox = max(0, (target_w - inner_w) // 2)
    oy = max(0, (target_h - inner_h) // 2)
    box[oy:oy + inner_h, ox:ox + inner_w] = centered_inner

    src_pts = np.array([[0, 0], [box.shape[1], 0], [box.shape[1], box.shape[0]], [0, box.shape[0]]]).astype(np.float32)
    M, _ = cv2.findHomography(src_pts, dst_points, cv2.RANSAC, 5.0)
    rgba = cv2.warpPerspective(box, M, (img.shape[1], img.shape[0]),
                               flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_CONSTANT, borderValue=0)
    x, y, rw, rh = cv2.boundingRect(dst_points.astype(np.int32))
    canvas = rgba[y:y + rh, x:x + rw, :3]
    mask = rgba[y:y + rh, x:x + rw, 3:4].astype(np.float32) / 255.0
    img[y:y + rh, x:x + rw] = np.clip(
        img[y:y + rh, x:x + rw].astype(np.float32) * (1 - mask) + canvas.astype(np.float32) * mask,
        0, 255,
    ).astype(np.uint8)
    return img


def _find_optimal_font_size(
    text,
    box_w,
    box_h,
    initial_fs,
    lang,
    min_fs=10,
    render_h: bool = True,
    line_spacing: Optional[float] = None,
    region: Optional[TextBlock] = None,
):
    """
    Binary-search for the largest font size that fits text inside box.
    
    PHASE 1 IMPROVEMENTS:
    - Smooth formula based on min(box_w, box_h) instead of discrete classes
    - Accounts for padding and line spacing (15% instead of 1%)
    - More iterations for precision (14 instead of 12)
    - Adaptive margin calculation
    """
    bubble_conf = float(getattr(region, "_bubble_confidence", 0.0) if region is not None else 0.0)
    text_len = _count_text_length(text)

    # Calculate adaptive safe margin (looser for low-confidence fallback regions)
    min_dim = max(1, min(box_w, box_h))
    # Higher text density in the same region is treated as a complex case and
    # gets a slightly more conservative fit target.
    density = text_len / float(max(min_dim, 1))
    complexity = max(0.0, min(1.0, (density - 0.22) / 0.35))

    margin_ratio = max(0.02, min(0.10, 7 / min_dim))
    if bubble_conf < 0.45:
        margin_ratio *= 0.85
    margin_ratio *= 1.0 + 0.20 * complexity
    margin = max(4, int(min_dim * margin_ratio))

    horizontal_spacing = float(line_spacing) if line_spacing is not None else 0.15
    vertical_spacing = float(line_spacing) if line_spacing is not None else 0.10
    
    safe_w = max(margin * 2, box_w - margin * 2)
    safe_h = max(margin * 2, box_h - margin * 2)
    
    # Smooth formula for max font size based on smallest dimension
    if min_dim < 80:
        max_fs = min(int(min_dim * 0.56), 40)
    elif min_dim < 200:
        max_fs = int(40 + (min_dim - 80) * 0.30)
    else:
        max_fs = min(int(min_dim * 0.44), 168)

    # Tone down the upper bound further for dense/complex bubbles.
    max_fs = int(round(max_fs * (1.0 - 0.12 * complexity)))
    
    # Relax growth caps, especially for low-confidence fallback regions.
    growth_mult = 3.2 if bubble_conf < 0.45 else 2.6
    max_fs = min(max_fs, int(max(initial_fs * growth_mult, min_fs + 8)))
    max_fs = max(max_fs, min_fs + 6)

    # Tighten fit slack when complexity is high to avoid visually oversized text.
    slack_tighten = 0.03 * complexity
    
    lo, hi = min_fs, max_fs
    best = min_fs
    
    # More iterations for precision
    for _ in range(14):
        if lo > hi:
            break
        mid = (lo + hi) // 2
        if mid < 1:
            break
        
        # Layout-aware fit check (horizontal and vertical text behave differently).
        if render_h:
            lines, widths = text_render.calc_horizontal(mid, text, safe_w, safe_h, lang)
            line_spacing_px = max(int(mid * horizontal_spacing), 3)
            total_h = mid * len(lines) + line_spacing_px * max(0, len(lines) - 1) + margin * 2
            max_line_w = max(widths) if widths else 0
            h_slack = (1.04 if bubble_conf < 0.45 else 1.01) - slack_tighten
            w_slack = (1.05 if bubble_conf < 0.45 else 1.02) - slack_tighten
            h_slack = max(1.0, h_slack)
            w_slack = max(1.0, w_slack)
            fits = total_h <= box_h * h_slack and (max_line_w + margin * 2) <= box_w * w_slack
        else:
            cols, col_heights = text_render.calc_vertical(mid, text, safe_h)
            col_spacing = max(int(mid * vertical_spacing), 2)
            total_w = mid * len(cols) + col_spacing * max(0, len(cols) - 1) + margin * 2
            max_col_h = max(col_heights) if col_heights else 0
            w_slack = (1.04 if bubble_conf < 0.45 else 1.01) - slack_tighten
            h_slack = (1.05 if bubble_conf < 0.45 else 1.02) - slack_tighten
            w_slack = max(1.0, w_slack)
            h_slack = max(1.0, h_slack)
            fits = total_w <= box_w * w_slack and (max_col_h + margin * 2) <= box_h * h_slack

        if fits:
            best = mid
            lo = mid + 1
        else:
            hi = mid - 1
    
    return best


async def dispatch(
    img: np.ndarray,
    text_regions: List[TextBlock],
    font_path: str = '',
    font_size_offset: int = 0,
    font_size_minimum: int = 0,
    hyphenate: bool = True,
    line_spacing: int = None,
    disable_font_border: bool = False,
) -> np.ndarray:
    text_render.set_font(font_path)
    text_regions = [r for r in text_regions if r.translation]

    # ── 1. Detect speech bubbles ─────────────────────────────────────
    bubble_rects = detect_bubbles(img, text_regions)

    dst_points_list: list = [None] * len(text_regions)
    non_bubble_indices: list[int] = []
    non_bubble_regions: list[TextBlock] = []

    for i, (region, bubble_rect) in enumerate(zip(text_regions, bubble_rects)):
        if bubble_rect is not None:
            bw = int(bubble_rect[0, 1, 0] - bubble_rect[0, 0, 0])
            bh = int(bubble_rect[0, 2, 1] - bubble_rect[0, 0, 1])

            forced_dir = region._direction if hasattr(region, "_direction") else region.direction
            render_h = (forced_dir in ("horizontal", "h")) if forced_dir != "auto" else region.horizontal

            optimal_fs = _find_optimal_font_size(
                region.get_translation_for_rendering(),
                bw, bh,
                region.font_size,
                getattr(region, "target_lang", "en_US"),
                render_h=render_h,
                line_spacing=line_spacing,
                region=region,
            )
            region.font_size = optimal_fs
            dst_points_list[i] = bubble_rect
        else:
            region._bubble_confidence = float(getattr(region, "_bubble_confidence", 0.0) or 0.0)
            non_bubble_indices.append(i)
            non_bubble_regions.append(region)

    # ── 2. Fallback: expand textline boxes for non-bubble regions ────
    if non_bubble_regions:
        fallback = _resize_regions_to_font_size(
            img, non_bubble_regions, font_size_offset, font_size_minimum,
        )
        for idx, pts in zip(non_bubble_indices, fallback):
            dst_points_list[idx] = pts

    # ── 3. Render ────────────────────────────────────────────────────
    for region, dst_points in zip(text_regions, dst_points_list):
        img = _render_region(img, region, dst_points, hyphenate, line_spacing, disable_font_border)
    return img