# saliacoel / MyCustomNodes

from typing import Optional

import torch
import torch.nn.functional as F
# ============================================================
# Basic helpers (standalone)
# ============================================================
+def _bhwc_to_nchw(img: torch.Tensor) -> torch.Tensor:
+    if img.dim() != 4:
+        raise ValueError(f"Expected [B,H,W,C], got {tuple(img.shape)}")
+    return img.permute(0, 3, 1, 2).contiguous()
+def _ensure_rgba_bhwc(images: torch.Tensor) -> torch.Tensor:
+    if images.dim() != 4:
+        raise ValueError(f"Expected [B,H,W,C], got {tuple(images.shape)}")
+    b, h, w, c = images.shape
+    if c == 4:
+        return images
+    if c == 3:
+        alpha = torch.ones((b, h, w, 1), device=images.device, dtype=images.dtype)
+        return torch.cat([images, alpha], dim=3)
+    raise ValueError(f"Expected 3 or 4 channels, got {c}")
+def _to_luma(x: torch.Tensor) -> torch.Tensor:
+    # x: [B,3,H,W]
+    r = x[:, 0:1, :, :]
+    g = x[:, 1:2, :, :]
+    b = x[:, 2:3, :, :]
+    return (0.2989 * r + 0.5870 * g + 0.1140 * b)
+def _resize_max(x: torch.Tensor, max_size: int) -> torch.Tensor:
+    if max_size <= 0:
+        return x
+    b, c, h, w = x.shape
+    m = max(h, w)
+    if m <= max_size:
+        return x
+    scale = max_size / float(m)
+    nh = max(1, int(round(h * scale)))
+    nw = max(1, int(round(w * scale)))
+    return F.interpolate(x, size=(nh, nw), mode="bilinear", align_corners=False)
+def _gaussian_blur(x: torch.Tensor, sigma: float) -> torch.Tensor:
+    if sigma <= 0:
+        return x
+    radius = int(max(1, round(3.0 * sigma)))
+    ksize = 2 * radius + 1
+    device = x.device
+    dtype = x.dtype
+    coords = torch.arange(-radius, radius + 1, device=device, dtype=dtype)
+    kernel1d = torch.exp(-(coords * coords) / (2.0 * sigma * sigma))
+    kernel1d = kernel1d / (kernel1d.sum() + 1e-12)
+    c = x.shape[1]
+    kh = kernel1d.view(1, 1, 1, ksize).repeat(c, 1, 1, 1)
+    kv = kernel1d.view(1, 1, ksize, 1).repeat(c, 1, 1, 1)
+    out = F.conv2d(x, kh, padding=(0, radius), groups=c)
+    out = F.conv2d(out, kv, padding=(radius, 0), groups=c)
+    return out
+def _sobel_edges(y: torch.Tensor) -> torch.Tensor:
+    # y: [B,1,H,W]
+    device = y.device
+    dtype = y.dtype
+    kx = torch.tensor(
+        [[-1, 0, 1],
+         [-2, 0, 2],
+         [-1, 0, 1]],
+        device=device, dtype=dtype
+    ) / 8.0
+    ky = torch.tensor(
+        [[-1, -2, -1],
+         [ 0,  0,  0],
+         [ 1,  2,  1]],
+        device=device, dtype=dtype
+    ) / 8.0
+    kx = kx.view(1, 1, 3, 3)
+    ky = ky.view(1, 1, 3, 3)
+    gx = F.conv2d(y, kx, padding=1)
+    gy = F.conv2d(y, ky, padding=1)
+    return torch.sqrt(gx * gx + gy * gy + 1e-12)
+def _make_gaussian_window(window_size: int, sigma: float, device, dtype):
+    radius = window_size // 2
+    coords = torch.arange(window_size, device=device, dtype=dtype) - radius
+    g = torch.exp(-(coords * coords) / (2.0 * sigma * sigma))
+    g = g / (g.sum() + 1e-12)
+    w2d = (g[:, None] * g[None, :]).view(1, 1, window_size, window_size)
+    return w2d, radius
+def _ssim_fast(x: torch.Tensor, y: torch.Tensor, w2d: torch.Tensor, radius: int) -> torch.Tensor:
+    """
+    SSIM for luma only.
+    x,y: [B,1,H,W]
+    returns [B]
+    """
+    mu_x = F.conv2d(x, w2d, padding=radius)
+    mu_y = F.conv2d(y, w2d, padding=radius)
+    mu_x2 = mu_x * mu_x
+    mu_y2 = mu_y * mu_y
+    mu_xy = mu_x * mu_y
+    sigma_x2 = F.conv2d(x * x, w2d, padding=radius) - mu_x2
+    sigma_y2 = F.conv2d(y * y, w2d, padding=radius) - mu_y2
+    sigma_xy = F.conv2d(x * y, w2d, padding=radius) - mu_xy
+    C1 = (0.01) ** 2
+    C2 = (0.03) ** 2
+    num = (2.0 * mu_xy + C1) * (2.0 * sigma_xy + C2)
+    den = (mu_x2 + mu_y2 + C1) * (sigma_x2 + sigma_y2 + C2)
+    ssim_map = num / (den + 1e-12)
+    return ssim_map.mean(dim=[1, 2, 3])
+def _hist_chi2_from_hists(hx: torch.Tensor, hy: torch.Tensor) -> torch.Tensor:
+    """
+    hx,hy: [B,3,bins] normalized
+    returns [B]
+    """
+    eps = 1e-12
+    return 0.5 * (((hx - hy) ** 2) / (hx + hy + eps)).sum(dim=2).mean(dim=1)
# ============================================================
# Fast frozen-tail diff (cheap)
# ============================================================
def _fast_tail_diff_bhwc(a_bhwc: torch.Tensor, b_bhwc: torch.Tensor) -> float:
    """Cheap difference score between two [B,H,W,C] batches.

    Both inputs are clamped to [0,1] and converted to channels-first. When
    both carry at least four channels the RGB planes are multiplied by the
    fourth (alpha) channel. ``b`` is resized to ``a``'s spatial size if they
    differ, then both are downscaled (longer side <= 128), blurred
    (sigma 1.2) and compared by mean absolute difference.

    Returns:
        The mean absolute difference multiplied by 1000, as a Python float.
    """
    DOWNSCALE_MAX = 128
    BLUR_SIGMA = 1.2
    SCALE = 1000.0

    first = _bhwc_to_nchw(a_bhwc).clamp(0.0, 1.0)
    second = _bhwc_to_nchw(b_bhwc).clamp(0.0, 1.0)

    first_rgb = first[:, 0:3]
    second_rgb = second[:, 0:3]
    # Premultiply only when BOTH inputs carry an alpha channel.
    if first.shape[1] >= 4 and second.shape[1] >= 4:
        first_rgb = first_rgb * first[:, 3:4]
        second_rgb = second_rgb * second[:, 3:4]

    if first_rgb.shape[2:] != second_rgb.shape[2:]:
        second_rgb = F.interpolate(
            second_rgb, size=first_rgb.shape[2:], mode="bilinear", align_corners=False
        )

    first_rgb = _gaussian_blur(_resize_max(first_rgb, DOWNSCALE_MAX), BLUR_SIGMA)
    second_rgb = _gaussian_blur(_resize_max(second_rgb, DOWNSCALE_MAX), BLUR_SIGMA)

    per_frame = torch.mean(torch.abs(first_rgb - second_rgb), dim=[1, 2, 3])
    return float(per_frame.mean().item() * SCALE)
# ============================================================
# Waveform helpers (span widths in a y-band)
# ============================================================
+def _smooth_1d(values: list) -> list:
+    n = len(values)
+    if n < 3:
+        return list(values)
+    t = torch.tensor(values, dtype=torch.float32).view(1, 1, n)
+    k = torch.tensor([0.25, 0.50, 0.25], dtype=torch.float32).view(1, 1, 3)
+    tpad = F.pad(t, (1, 1), mode="replicate")
+    out = F.conv1d(tpad, k)
+    return out.view(n).tolist()
+def _is_local_min(values: list, i: int) -> bool:
+    n = len(values)
+    if n < 3:
+        return True
+    if i <= 0 or i >= n - 1:
+        return False
+    return (values[i] <= values[i - 1]) and (values[i] <= values[i + 1])
+def _percentile(values: list, q: float) -> float:
+    if not values:
+        return 0.0
+    s = sorted(values)
+    q = max(0.0, min(1.0, float(q)))
+    pos = q * (len(s) - 1)
+    idx = int(round(pos))
+    idx = max(0, min(len(s) - 1, idx))
+    return float(s[idx])
+def _compute_visible_y_bounds(images_bhwc: torch.Tensor, alpha_thr: float = 0.01):
+    b, h, w, c = images_bhwc.shape
+    if c < 4:
+        return (0, h - 1)
+    alpha = images_bhwc[:, :, :, 3]
+    vis_y = (alpha > alpha_thr).any(dim=2).any(dim=0)  # [H]
+    idx = torch.where(vis_y)[0]
+    if idx.numel() == 0:
+        return (0, h - 1)
+    y_min = int(idx[0].item())
+    y_max = int(idx[-1].item())
+    return (max(0, y_min), min(h - 1, y_max))
+def _compute_band_span_widths(images_bhwc: torch.Tensor,
+                              y0: int,
+                              y1: int,
+                              alpha_thr: float = 0.01,
+                              sample_rows: int = 32) -> list:
+    """
+    Robust span width per frame in a band:
+      - sample rows between y0..y1
+      - for each row, get left/right visible
+      - aggregate via 10% / 90% quantile using sorting (small row count)
+    """
+    b, h, w, c = images_bhwc.shape
+    if c < 4:
+        return [float(w)] * b
+    y0 = max(0, min(h - 1, int(y0)))
+    y1 = max(0, min(h - 1, int(y1)))
+    if y0 > y1:
+        y0, y1 = y1, y0
+    if sample_rows <= 1:
+        ys = [y0]
+    else:
+        ys_t = torch.linspace(y0, y1, steps=sample_rows)
+        ys = torch.unique(torch.round(ys_t).long()).tolist()
+        ys = [int(v) for v in ys]
+    widths = []
+    for i in range(b):
+        lefts = []
+        rights = []
+        for y in ys:
+            row_alpha = images_bhwc[i, y, :, 3]
+            vis = row_alpha > alpha_thr
+            if torch.any(vis):
+                idx = torch.where(vis)[0]
+                lefts.append(int(idx[0].item()))
+                rights.append(int(idx[-1].item()))
+        if not lefts:
+            widths.append(0.0)
+            continue
+        lefts.sort()
+        rights.sort()
+        # 10% and 90% quantiles (row count is small)
+        lq = lefts[int(round(0.10 * (len(lefts) - 1)))]
+        rq = rights[int(round(0.90 * (len(rights) - 1)))]
+        widths.append(float(max(0, rq - lq + 1)))
+    return widths
+def _valley_candidates(signal: list, max_k: int, min_sep: int) -> list:
+    """
+    Strong valley candidates:
+      - local minima
+      - in low quantile band
+      - greedy separation
+    """
+    n = len(signal)
+    if n < 6:
+        return list(range(n))
+    low_thr = _percentile(signal, 0.50)  # valley-like should be in lower half
+    mins = [i for i in range(1, n - 1) if _is_local_min(signal, i) and signal[i] <= low_thr]
+    if not mins:
+        # fallback: just take globally small points
+        mins = list(range(n))
+    mins.sort(key=lambda i: signal[i])  # deepest valleys first
+    chosen = []
+    for i in mins:
+        if all(abs(i - j) >= min_sep for j in chosen):
+            chosen.append(i)
+        if len(chosen) >= max_k:
+            break
+    return sorted(chosen)
# ============================================================
# Hybrid fixed precompute + batched scoring (this is the speed win)
# ============================================================
+class _HybridFixedBatchScorer:
+    """
+    Hardcoded hybrid:
+      downscale_max=256
+      blur_sigma=1.2
+      hist_bins=32
+      scale=1000
+      w_pixel=1.00
+      w_ssim=1.00
+      w_edge=0.5
+      w_hist=0.2
+    For sprites: premultiply alpha ON.
+    Precomputes per-frame features once, then scores many pairs at once.
+    """
+    DOWNSCALE_MAX = 256
+    BLUR_SIGMA = 1.2
+    HIST_BINS = 32
+    SCALE = 1000.0
+    W_PIXEL = 1.0
+    W_SSIM  = 1.0
+    W_EDGE  = 0.5
+    W_HIST  = 0.2
+    SSIM_W = 11
+    SSIM_SIGMA = 1.5
+    def __init__(self, images_bhwc: torch.Tensor, premultiply_alpha: bool = True):
+        # images_bhwc: [B,H,W,4]
+        self.device = images_bhwc.device
+        # keep float32 for stable ops
+        x = images_bhwc.clamp(0.0, 1.0).to(torch.float32)
+        x_nchw = _bhwc_to_nchw(x)
+        if x_nchw.shape[1] >= 4 and premultiply_alpha:
+            a = x_nchw[:, 3:4]
+            rgb = x_nchw[:, 0:3] * a
+        else:
+            rgb = x_nchw[:, 0:3]
+        # Downscale once
+        rgb_small = _resize_max(rgb, self.DOWNSCALE_MAX)
+        # Blur once
+        rgb_blur = _gaussian_blur(rgb_small, self.BLUR_SIGMA)
+        # Luma + edges once
+        luma_blur = _to_luma(rgb_blur)
+        edge = _sobel_edges(luma_blur)
+        # Histograms ONCE (no GPU->CPU ping-pong)
+        # Use histc (matches your original binning behavior closely)
+        b, c, h, w = rgb_small.shape
+        bins = self.HIST_BINS
+        eps = 1e-12
+        hists = torch.empty((b, 3, bins), device=rgb_small.device, dtype=torch.float32)
+        rgb_small = rgb_small.clamp(0.0, 1.0)
+        for i in range(b):
+            for ch in range(3):
+                hist = torch.histc(rgb_small[i, ch], bins=bins, min=0.0, max=1.0)
+                hist = hist / (hist.sum() + eps)
+                hists[i, ch] = hist
+        self.rgb_blur = rgb_blur
+        self.luma_blur = luma_blur
+        self.edge = edge
+        self.hists = hists
+        self.w2d, self.radius = _make_gaussian_window(self.SSIM_W, self.SSIM_SIGMA, self.device, torch.float32)
+    def scores_for_pairs(self, idx_i: list, idx_j: list) -> torch.Tensor:
+        """
+        idx_i, idx_j: python lists of same length M
+        returns: tensor [M] float32 (scaled by 1000)
+        """
+        if len(idx_i) != len(idx_j):
+            raise ValueError("idx_i and idx_j must have same length")
+        m = len(idx_i)
+        if m == 0:
+            return torch.zeros((0,), device=self.device, dtype=torch.float32)
+        ti = torch.tensor(idx_i, device=self.device, dtype=torch.long)
+        tj = torch.tensor(idx_j, device=self.device, dtype=torch.long)
+        a_rgb = self.rgb_blur.index_select(0, ti)
+        b_rgb = self.rgb_blur.index_select(0, tj)
+        pix = torch.mean(torch.abs(a_rgb - b_rgb), dim=[1, 2, 3])
+        a_y = self.luma_blur.index_select(0, ti)
+        b_y = self.luma_blur.index_select(0, tj)
+        ssim = _ssim_fast(a_y, b_y, self.w2d, self.radius)
+        ssim_diff = (1.0 - ssim).clamp(min=0.0)
+        a_e = self.edge.index_select(0, ti)
+        b_e = self.edge.index_select(0, tj)
+        ed = torch.mean(torch.abs(a_e - b_e), dim=[1, 2, 3])
+        ha = self.hists.index_select(0, ti)
+        hb = self.hists.index_select(0, tj)
+        hist = _hist_chi2_from_hists(ha, hb)
+        per = (self.W_PIXEL * pix) + (self.W_SSIM * ssim_diff) + (self.W_EDGE * ed) + (self.W_HIST * hist)
+        return per * self.SCALE
+    def score_one(self, i: int, j: int) -> float:
+        s = self.scores_for_pairs([i], [j])
+        return float(s[0].item())
# ============================================================
# Node 1: Hardcoded hybrid compare (2 images -> float)
# ============================================================
class ImageCompareHybrid:
    """
    Node that compares two images with the hardcoded hybrid score.

    Only the first frame of each input is used. Alpha is not applied to
    the colours (the scorer is created with premultiply_alpha=False).
    When the two images differ in height or width, image_b is bilinearly
    resized to image_a's size before scoring; without that step the two
    frames cannot be stacked into one batch.
    """
    CATEGORY = "image/analysis"
    RETURN_TYPES = ("FLOAT",)
    RETURN_NAMES = ("difference",)
    FUNCTION = "compare"

    @classmethod
    def INPUT_TYPES(cls):
        """Declare the two required IMAGE inputs."""
        return {"required": {"image_a": ("IMAGE",), "image_b": ("IMAGE",)}}

    @staticmethod
    def _resize_like(reference: torch.Tensor, other: torch.Tensor) -> torch.Tensor:
        """Return ``other`` ([B,H,W,C]) resized to ``reference``'s H and W.

        ``other`` is returned unchanged (same object) when the spatial sizes
        already agree. The interpolation settings mirror the ones used by
        ``_fast_tail_diff_bhwc`` for mismatched frames.
        """
        if reference.shape[1:3] == other.shape[1:3]:
            return other
        # F.interpolate expects channels-first, so round-trip through NCHW.
        resized = F.interpolate(
            other.permute(0, 3, 1, 2),
            size=tuple(reference.shape[1:3]),
            mode="bilinear",
            align_corners=False,
        )
        return resized.permute(0, 2, 3, 1)

    def compare(self, image_a, image_b):
        """Return a 1-tuple holding the difference score of the two images.

        Args:
            image_a: IMAGE tensor [B,H,W,C] with 3 or 4 channels.
            image_b: IMAGE tensor [B,H,W,C] with 3 or 4 channels; may have
                a different height/width than ``image_a``.

        Returns:
            ``(score,)`` where ``score`` is a Python float.
        """
        # Only the first frame of each input takes part in the comparison.
        a = _ensure_rgba_bhwc(image_a).to(torch.float32).clamp(0.0, 1.0)[0:1]
        b = _ensure_rgba_bhwc(image_b).to(torch.float32).clamp(0.0, 1.0)[0:1]

        # torch.cat requires equal H/W, so bring b onto a's pixel grid first.
        b = self._resize_like(a, b)

        x = torch.cat([a, b], dim=0)
        scorer = _HybridFixedBatchScorer(x, premultiply_alpha=False)
        score = scorer.score_one(0, 1)
        return (float(score),)
# ============================================================
# Node 2: FAST + VALLEY-TO-VALLEY auto loop
# ============================================================
class Salia_Extract_Loop:
    """
    Node that cuts a looping sub-sequence out of a frame batch.

    Steps performed by ``autoloop``:
    - trims a frozen tail (cheap consecutive-frame diff)
    - measures the visible span width in a "hands" and a "feet" row band
      per frame and smooths both waveforms
    - takes minima of the feet waveform as valley candidates
    - scores all candidate (start, end) valley pairs in one batched call
    - snaps both ends to nearby local minima of the feet waveform

    Whenever a step cannot proceed (too few frames, flat waveform, no
    usable valley pair) the trimmed batch is returned unchanged together
    with an explanatory debug string.
    """
    CATEGORY = "image/batch"
    RETURN_TYPES = ("IMAGE", "INT", "INT", "FLOAT", "STRING")
    RETURN_NAMES = ("loop_batch", "start_index", "end_index", "match_score", "debug")
    FUNCTION = "autoloop"

    @classmethod
    def INPUT_TYPES(cls):
        """Declare the single required IMAGE input."""
        return {"required": {"images": ("IMAGE",)}}

    def _trim_frozen_tail(self, images_bhwc: torch.Tensor):
        """Count near-identical frames at the end of the batch.

        Walks backwards from the last frame, comparing each frame with its
        predecessor, and stops at the first pair whose diff reaches
        FREEZE_THR.

        Returns:
            ``(effective_length, FREEZE_THR)``. The length is shortened
            (never below 2) only when at least MIN_CONSEC consecutive
            frozen pairs were found; otherwise it is the batch size.
        """
        FREEZE_THR = 3.0
        MIN_CONSEC = 2
        b = images_bhwc.shape[0]
        if b < 3:
            return b, FREEZE_THR
        tail = 0
        for t in range(b - 1, 0, -1):
            d = _fast_tail_diff_bhwc(images_bhwc[t - 1:t], images_bhwc[t:t + 1])
            if d < FREEZE_THR:
                tail += 1
            else:
                break
        if tail >= MIN_CONSEC:
            eff = max(2, b - tail)
            return eff, FREEZE_THR
        return b, FREEZE_THR

    def _snap_valley_to_valley(self, scorer, feet_s, start, end, min_len):
        """
        Move both ends onto local minima of ``feet_s`` found within 6 frames.

        All (start, end) candidate combinations that are at least
        ``min_len`` apart are scored in one batch and the lowest score
        wins. An end without a nearby minimum keeps its original index;
        when no combination is long enough the original pair is scored
        and returned.

        Returns ``(start, end, score)``.
        """
        n = len(feet_s)
        radius = 6

        def minima_near(center):
            # Interior indices only: the range never includes 0 or n - 1.
            lo = max(1, center - radius)
            hi = min(n - 1, center + radius + 1)
            return [k for k in range(lo, hi) if _is_local_min(feet_s, k)]

        s_cands = minima_near(start) or [start]
        e_cands = minima_near(end) or [end]

        pairs_i = []
        pairs_j = []
        for i in s_cands:
            for j in e_cands:
                if j - i >= min_len:
                    pairs_i.append(i)
                    pairs_j.append(j)
        if not pairs_i:
            return start, end, float(scorer.score_one(start, end))

        scores = scorer.scores_for_pairs(pairs_i, pairs_j)  # [M]
        k = int(torch.argmin(scores).item())
        return pairs_i[k], pairs_j[k], float(scores[k].item())

    @staticmethod
    def _collect_pairs(feet_s, hands_s, start_valleys, end_valleys, min_loop_len, strict):
        """Build candidate (start, end) pairs plus their pose tie-breaker.

        A pair is kept when ``end - start >= min_loop_len`` and, if
        ``strict`` is set, both indices are local minima of ``feet_s``.
        The tie-breaker is the range-normalised difference of the feet and
        hands waveforms between the two frames.

        Returns ``(pairs_i, pairs_j, feat_tie)`` as three parallel lists.
        """
        foot_rng = (max(feet_s) - min(feet_s)) + 1e-6
        hand_rng = (max(hands_s) - min(hands_s)) + 1e-6
        pairs_i = []
        pairs_j = []
        feat_tie = []
        for e in end_valleys:
            for s in start_valleys:
                if e - s < min_loop_len:
                    continue
                if strict and not (_is_local_min(feet_s, s) and _is_local_min(feet_s, e)):
                    continue
                pairs_i.append(s)
                pairs_j.append(e)
                feat = abs(feet_s[s] - feet_s[e]) / foot_rng + abs(hands_s[s] - hands_s[e]) / hand_rng
                feat_tie.append(float(feat))
        return pairs_i, pairs_j, feat_tie

    @staticmethod
    def _pick_best_pair(scores, feat_tie, pairs_j, good_thr=8.0, tie_w=10.0) -> Optional[int]:
        """Choose the index of the winning candidate pair.

        Args:
            scores: tensor [M] of pair scores.
            feat_tie: list of M tie-breaker values.
            pairs_j: list of M end indices.
            good_thr: scores at or below this value count as "good".
            tie_w: weight of the tie-breaker in the combined total.

        Returns:
            ``None`` when there are no candidates (argmin on an empty
            tensor would raise). Otherwise: among the good pairs, the one
            with the latest end index (lowest total on ties); if no pair
            is good, the pair with the lowest total.
        """
        if scores.numel() == 0:
            return None
        total = scores + tie_w * torch.tensor(feat_tie, device=scores.device, dtype=scores.dtype)
        good_idx = torch.where(scores <= good_thr)[0].tolist()
        if good_idx:
            max_end = max(pairs_j[k] for k in good_idx)
            best_pool = [k for k in good_idx if pairs_j[k] == max_end]
            return min(best_pool, key=lambda k: float(total[k].item()))
        return int(torch.argmin(total).item())

    def autoloop(self, images):
        """Extract a loop from ``images``.

        Args:
            images: IMAGE tensor [B,H,W,C] with 3 or 4 channels.

        Returns:
            ``(loop_batch, start_index, end_index, match_score, debug)``.
            ``loop_batch`` is ``frames[start:end]`` of the trimmed batch,
            i.e. the matched end frame itself is dropped and
            ``end_index`` is the last kept frame. On every early exit the
            (trimmed) batch is returned with score 0.0.

        Raises:
            TypeError: if ``images`` is not a tensor.
            ValueError: if ``images`` is not 4-dimensional (or has an
                unsupported channel count).
        """
        if not isinstance(images, torch.Tensor):
            raise TypeError(f"Expected IMAGE tensor, got {type(images)}")
        if images.ndim != 4:
            raise ValueError(f"Expected IMAGE [B,H,W,C], got {tuple(images.shape)}")
        images = _ensure_rgba_bhwc(images).to(torch.float32).clamp(0.0, 1.0)
        b, h, w, c = images.shape
        if b < 6:
            return (images, 0, max(0, b - 1), 0.0, f"Too few frames (B={b})")

        # 1) Trim frozen tail
        eff_len, freeze_thr = self._trim_frozen_tail(images)
        imgs = images[:eff_len]
        n = imgs.shape[0]
        if n < 6:
            return (imgs, 0, max(0, n - 1), 0.0, f"After trim too few frames (B={n})")

        # 2) Visible bounds + bands placed relative to the visible height
        alpha_thr = 0.01
        y_min, y_max = _compute_visible_y_bounds(imgs, alpha_thr=alpha_thr)
        vis_h = max(1, (y_max - y_min + 1))

        def band_row(fraction):
            # Row at the given fraction of the visible height, kept in-image.
            row = y_min + int(round(fraction * (vis_h - 1)))
            return max(0, min(h - 1, row))

        hands_y0 = band_row(0.45)
        hands_y1 = band_row(0.63)
        feet_y0 = band_row(0.70)
        feet_y1 = band_row(0.93)

        # 3) Waveforms (each smoothed twice)
        feet = _compute_band_span_widths(imgs, feet_y0, feet_y1, alpha_thr=alpha_thr, sample_rows=32)
        hands = _compute_band_span_widths(imgs, hands_y0, hands_y1, alpha_thr=alpha_thr, sample_rows=24)
        feet_s = _smooth_1d(_smooth_1d(feet))
        hands_s = _smooth_1d(_smooth_1d(hands))
        feet_range = max(feet_s) - min(feet_s)
        if feet_range < 4.0:
            # Valley detection is unreliable on a flat waveform.
            dbg = (
                f"Feet waveform too flat (range={feet_range:.2f}). "
                f"Returning trimmed batch.\norig_B={b}, eff_B={n}, freeze_thr={freeze_thr}"
            )
            return (imgs, 0, n - 1, 0.0, dbg)

        # 4) Valley candidates; end valleys come from the second half
        min_sep = max(2, int(round(0.08 * n)))
        valleys = _valley_candidates(feet_s, max_k=14, min_sep=min_sep)
        end_region = int(round(0.50 * (n - 1)))
        end_valleys = [v for v in valleys if v >= end_region]
        if not end_valleys:
            end_valleys = sorted(valleys)[-4:]
        end_valleys = sorted(end_valleys, reverse=True)[:4]
        # start valleys must leave room for a full loop before the last frame
        min_loop_len = max(8, int(round(0.18 * n)))  # prevents half-cycle accidental loops
        start_valleys = [v for v in valleys if v <= (n - 1) - min_loop_len]
        if not start_valleys or not end_valleys:
            dbg = (
                "No sufficient valley candidates. Returning trimmed batch.\n"
                f"orig_B={b}, eff_B={n}, valleys={valleys}"
            )
            return (imgs, 0, n - 1, 0.0, dbg)

        # 5) Candidate pairs: strict local minima first, then any candidate
        pairs_i, pairs_j, feat_tie = self._collect_pairs(
            feet_s, hands_s, start_valleys, end_valleys, min_loop_len, strict=True
        )
        if not pairs_i:
            pairs_i, pairs_j, feat_tie = self._collect_pairs(
                feet_s, hands_s, start_valleys, end_valleys, min_loop_len, strict=False
            )
        if not pairs_i:
            # No start/end combination is long enough; scoring an empty
            # candidate set would fail, so hand back the trimmed batch.
            dbg = (
                "No valley pair satisfies min_loop_len. Returning trimmed batch.\n"
                f"orig_B={b}, eff_B={n}, valleys={valleys}, min_loop_len={min_loop_len}"
            )
            return (imgs, 0, n - 1, 0.0, dbg)

        # 6) Precompute hybrid features once (alpha premultiplied) and score
        #    every candidate pair in a single batched call
        scorer = _HybridFixedBatchScorer(imgs, premultiply_alpha=True)
        scores = scorer.scores_for_pairs(pairs_i, pairs_j)  # [M]
        best_k = self._pick_best_pair(scores, feat_tie, pairs_j)
        start = int(pairs_i[best_k])
        end = int(pairs_j[best_k])

        # 7) Snap/refine both ends to nearby minima
        start, end, match_score = self._snap_valley_to_valley(scorer, feet_s, start, end, min_loop_len)

        # Slice end-exclusive: the matched end frame duplicates the start
        # pose, so the last returned frame is end - 1.
        end_out = end - 1
        loop = imgs[start:end]
        # Debug only: how well the last kept frame closes back onto the first
        closure_score = float(scorer.score_one(start, end_out)) if end_out > start else float(match_score)
        dbg = (
            "AutoLoopSpriteBatch FAST (valley-to-valley)\n"
            f"orig_B={b}, eff_B={n} (freeze_thr={freeze_thr})\n"
            f"start={start}, end={end} (dropped), output_end={end_out}, len={end_out-start+1}, match_score(dup)={match_score:.4f}\n"
            f"closure_score(last_kept->first)={closure_score:.4f}\n"
            f"visible_y=[{y_min}..{y_max}] hands_y=[{hands_y0}..{hands_y1}] feet_y=[{feet_y0}..{feet_y1}]\n"
            f"feet_range={feet_range:.2f}, min_sep={min_sep}, min_loop_len={min_loop_len}\n"
            f"valleys={valleys}, end_valleys={end_valleys}"
        )
        return (loop, int(start), int(end_out), float(match_score), dbg)
# ============================================================
# Register
# ============================================================
# Registration tables: node identifier -> class, and node identifier ->
# display name. Each node is registered under its own class name and shown
# under that same name.
# NOTE(review): presumably read by the host node framework on import - confirm.
NODE_CLASS_MAPPINGS = {
    node.__name__: node for node in (ImageCompareHybrid, Salia_Extract_Loop)
}
NODE_DISPLAY_NAME_MAPPINGS = {name: name for name in NODE_CLASS_MAPPINGS}