Spaces:

feng-x
/

ring-sizer

Running

File size: 7,908 Bytes

22df1ea

"""Compare hand-mask quality across backends on a single image.

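Runs MediaPipe (current pipeline), SAM 2.1 tiny, and SAM 2.1 small. The SAM
models are prompted with a positive point at the palm center (derived from the
MediaPipe landmarks) plus a negative point placed to the left of the hand.
Saves each panel at full resolution, a side-by-side comparison of up to four
panels, and a zoomed crop around the MediaPipe hand mask.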
"""
from __future__ import annotations

import sys
import time
from pathlib import Path
from typing import Tuple

import cv2
import numpy as np
from PIL import Image as PILImage

sys.path.insert(0, str(Path(__file__).resolve().parents[1]))

from src.finger_segmentation import segment_hand  # noqa: E402

# Input photo to evaluate. The path is relative, so it resolves against the
# current working directory of the process.
IMG_PATH = Path("input/sample-04-12/card_2.jpg")
# Directory that receives the rendered panels and comparison images.
OUT_DIR = Path("output/hand_sam_compare")

# (display name, model id) pairs. The model id is handed to
# ``from_pretrained``; the display name labels the corresponding panel.
SAM_MODELS = [
    ("sam2.1-tiny", "facebook/sam2.1-hiera-tiny"),
    ("sam2.1-small", "facebook/sam2.1-hiera-small"),
]


def palm_and_card_points(image_bgr: np.ndarray, hand_data: dict) -> Tuple[Tuple[int, int], Tuple[int, int]]:
    """Return (palm_center, card_center) pixel coords in the canonical image space.

    Palm center = rounded mean of wrist + MCPs (landmarks 0, 5, 9, 13, 17).
    Card center = a rough point to the left of the hand (negative prompt hint),
    placed at mid image height and clamped so it always lies inside the image.

    Args:
        image_bgr: Image the landmarks refer to; only its height and width
            are read.
        hand_data: Mapping with a ``"landmarks"`` entry: an array-like with at
            least 18 rows whose first two columns are x, y pixel coordinates.

    Returns:
        ``((palm_x, palm_y), (card_x, card_y))`` as plain Python ints.

    Raises:
        RuntimeError: If ``hand_data`` carries no landmarks.
    """
    landmarks = hand_data.get("landmarks")
    if landmarks is None:
        raise RuntimeError("MediaPipe returned no landmarks")

    # landmarks is (21, 2 or 3) in pixel coords; keep only x, y
    lm = np.asarray(landmarks)[:, :2]
    palm_ids = [0, 5, 9, 13, 17]
    palm_center = tuple(np.round(lm[palm_ids].mean(axis=0)).astype(int).tolist())

    # Card hint: 150 px left of the hand's leftmost landmark, but never closer
    # than 50 px to the left border.
    h, w = image_bgr.shape[:2]
    hand_x_min = int(lm[:, 0].min())
    card_x = max(50, hand_x_min - 150)
    # The 50 px floor can exceed the width of a very narrow image; clamp so
    # the negative prompt is always a valid pixel column.
    card_x = max(0, min(card_x, w - 1))
    card_y = h // 2
    return palm_center, (card_x, card_y)


def run_sam(
    model_id: str,
    image_rgb: np.ndarray,
    palm_xy: Tuple[int, int],
    negative_xy: Tuple[int, int],
) -> Tuple[np.ndarray, float, float]:
    """Segment the hand with a SAM 2.1 checkpoint using two point prompts.

    The palm point is sent as a positive prompt (label 1) and the card point
    as a negative prompt (label 0). The model runs on CPU with multi-mask
    output, and the candidate with the highest predicted IoU score wins.

    Args:
        model_id: Identifier passed to ``from_pretrained`` for both the
            processor and the model.
        image_rgb: RGB image array to segment.
        palm_xy: (x, y) of the positive prompt.
        negative_xy: (x, y) of the negative prompt.

    Returns:
        ``(mask, score, seconds)``: the boolean mask of the best candidate,
        its IoU score, and the elapsed wall-clock time. The timer starts
        after the weights are loaded, so it covers preprocessing, inference
        and mask post-processing only.
    """
    import torch
    from transformers import Sam2Model, Sam2Processor

    processor = Sam2Processor.from_pretrained(model_id)
    model = Sam2Model.from_pretrained(model_id).to("cpu").eval()

    pil_image = PILImage.fromarray(image_rgb)
    # Nesting follows the processor's (image, object, point) layout.
    prompt_points = [[[list(palm_xy), list(negative_xy)]]]
    prompt_labels = [[[1, 0]]]

    started = time.time()
    inputs = processor(
        images=pil_image,
        input_points=prompt_points,
        input_labels=prompt_labels,
        return_tensors="pt",
    )
    with torch.inference_mode():
        outputs = model(**inputs, multimask_output=True)

    upscaled = processor.post_process_masks(
        outputs.pred_masks.cpu(),
        inputs["original_sizes"],
        mask_threshold=0.0,
    )
    # First image, first prompt set -> (num_candidates, H, W)
    candidates = upscaled[0][0]
    iou_scores = outputs.iou_scores.cpu().numpy()[0, 0]

    winner = int(np.argmax(iou_scores))
    best_mask = candidates[winner].numpy().astype(bool)
    best_score = float(iou_scores[winner])
    return best_mask, best_score, time.time() - started


def mask_to_overlay(image_bgr: np.ndarray, mask: np.ndarray, color: Tuple[int, int, int]) -> np.ndarray:
    """Render ``mask`` on top of ``image_bgr`` as a translucent tint plus outline.

    The input image is not modified; a new BGR image is returned. ``mask`` is
    used as a boolean index into the image, so it is expected to be a boolean
    array with the image's height and width.
    """
    # Solid color layer that is non-zero only where the mask is set.
    tint_layer = np.zeros_like(image_bgr)
    tint_layer[mask] = color
    # addWeighted allocates its own output, so image_bgr stays untouched.
    blended = cv2.addWeighted(image_bgr, 1.0, tint_layer, 0.35, 0)

    binary = mask.astype(np.uint8) * 255
    outlines, _hierarchy = cv2.findContours(
        binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE
    )
    cv2.drawContours(blended, outlines, -1, color, 2, cv2.LINE_AA)
    return blended


def label_panel(img: np.ndarray, text: str) -> np.ndarray:
    """Draw a black banner with white caption ``text`` across the top of ``img``.

    The image is modified in place and the same array is returned, so the
    call can be used inline.
    """
    _, width = img.shape[:2]
    banner_start, banner_end = (0, 0), (width, 60)
    # Thickness -1 fills the rectangle.
    cv2.rectangle(img, banner_start, banner_end, (0, 0, 0), -1)
    cv2.putText(
        img,
        text,
        (20, 42),
        cv2.FONT_HERSHEY_SIMPLEX,
        1.3,
        (255, 255, 255),
        3,
        cv2.LINE_AA,
    )
    return img


def main() -> int:
    """Run the MediaPipe-vs-SAM hand-mask comparison on ``IMG_PATH``.

    Steps:
      1. Load the image and run the MediaPipe pipeline (``segment_hand``) to
         get the baseline mask, landmarks and (if provided) canonical image.
      2. Derive the positive/negative prompt points and run every model in
         ``SAM_MODELS``. A failing model is reported and skipped.
      3. Write one full-resolution PNG per panel, a side-by-side comparison
         and a zoomed crop around the MediaPipe mask into ``OUT_DIR``.

    Each panel carries its own file-name tag, so a skipped model can never
    cause another model's panel to be saved under the wrong name.

    Returns:
        0 on success; 1 if the image cannot be loaded, no hand is detected,
        or the baseline returns no mask.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    image_bgr = cv2.imread(str(IMG_PATH))
    if image_bgr is None:
        print(f"Failed to load {IMG_PATH}")
        return 1

    print(f"Image: {IMG_PATH} {image_bgr.shape}")

    # --- MediaPipe baseline ---
    t0 = time.time()
    hand_data = segment_hand(image_bgr, finger="index")
    mp_time = time.time() - t0
    if hand_data is None:
        print("MediaPipe detected no hand — aborting")
        return 1

    canonical_image = hand_data.get("canonical_image", image_bgr)
    mp_mask = hand_data.get("mask")
    if mp_mask is None:
        print("MediaPipe did not return a hand mask")
        return 1
    mp_mask = mp_mask.astype(bool)
    print(f"MediaPipe: {mp_time:.1f}s  mask_area={mp_mask.sum()}")

    # Work in the canonical image so the comparison is apples-to-apples
    image_for_sam = canonical_image.copy()
    palm_xy, card_xy = palm_and_card_points(image_for_sam, hand_data)
    print(f"Palm prompt: {palm_xy}  Negative hint: {card_xy}")

    image_rgb = cv2.cvtColor(image_for_sam, cv2.COLOR_BGR2RGB)

    # --- SAM models ---
    # Insertion order (baseline first, then SAM_MODELS order) is the panel order.
    results = {"mediapipe": (mp_mask, None, mp_time)}
    for name, model_id in SAM_MODELS:
        print(f"\n=== {name} ({model_id}) ===")
        try:
            mask, score, seconds = run_sam(model_id, image_rgb, palm_xy, card_xy)
            # Align shape (should already be canonical)
            if mask.shape != mp_mask.shape:
                mask = cv2.resize(
                    mask.astype(np.uint8),
                    (mp_mask.shape[1], mp_mask.shape[0]),
                    interpolation=cv2.INTER_NEAREST,
                ).astype(bool)
            print(f"  score={score:.3f}  time={seconds:.1f}s  area={mask.sum()}")
            results[name] = (mask, score, seconds)
        except Exception as e:
            # Best-effort comparison: one broken backend must not abort the
            # others, so report the failure and keep going.
            print(f"  FAILED: {e!r}")
            import traceback
            traceback.print_exc()

    # --- Render panels ---
    colors = {
        "mediapipe": (0, 165, 255),      # orange
        "sam2.1-tiny": (0, 255, 255),    # yellow
        "sam2.1-small": (0, 255, 0),     # green
    }
    fallback_color = (255, 0, 255)       # magenta, for models without an entry

    # Each entry is (file-name tag, image) so names stay correct when a
    # model was skipped.
    panels = []

    # Panel 0: original with prompt points
    orig = image_for_sam.copy()
    cv2.circle(orig, palm_xy, 18, (0, 255, 0), -1)
    cv2.circle(orig, palm_xy, 18, (0, 0, 0), 3)
    cv2.circle(orig, card_xy, 18, (0, 0, 255), -1)
    cv2.circle(orig, card_xy, 18, (0, 0, 0), 3)
    panels.append(("orig", label_panel(orig, "original + prompts")))

    for name, (mask, score, seconds) in results.items():
        panel = mask_to_overlay(image_for_sam, mask, colors.get(name, fallback_color))
        label = f"{name}  {seconds:.1f}s"
        if score is not None:
            label += f"  score={score:.2f}"
        # "sam2.1-tiny" -> "tiny", "mediapipe" -> "mediapipe"
        tag = name.rsplit("-", 1)[-1]
        panels.append((tag, label_panel(panel, label)))

    # Save individual panels full-res
    for i, (tag, p) in enumerate(panels):
        cv2.imwrite(str(OUT_DIR / f"panel_{i}_{tag}.png"), p)

    panel_images = [p for _, p in panels]

    # Build a single side-by-side at a readable size
    def resize_to_height(img: np.ndarray, H: int) -> np.ndarray:
        """Scale ``img`` to height ``H`` while keeping its aspect ratio."""
        h, w = img.shape[:2]
        scale = H / h
        return cv2.resize(img, (int(round(w * scale)), H), interpolation=cv2.INTER_AREA)

    target_h = 900
    resized = [resize_to_height(p, target_h) for p in panel_images]
    combined = np.hstack(resized)
    cv2.imwrite(str(OUT_DIR / "comparison_full.png"), combined)

    # Also zoom-crop around the hand for fine-detail inspection
    ys, xs = np.where(mp_mask)
    if len(xs) > 0:
        pad = 80
        x0, x1 = max(0, xs.min() - pad), min(image_for_sam.shape[1], xs.max() + pad)
        y0, y1 = max(0, ys.min() - pad), min(image_for_sam.shape[0], ys.max() + pad)
        crops = [resize_to_height(p[y0:y1, x0:x1], target_h) for p in panel_images]
        combined_zoom = np.hstack(crops)
        cv2.imwrite(str(OUT_DIR / "comparison_zoom.png"), combined_zoom)

    print(f"\nSaved panels to {OUT_DIR}/")
    return 0


# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())