Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- measure_finger.py +338 -42
- requirements.txt +5 -0
- script/compare_hand_sam.py +227 -0
- script/validate_sam_card.py +198 -0
- src/edge_refinement.py +168 -90
- src/finger_segmentation.py +32 -3
- src/geometry.py +27 -19
- src/sam_backend.py +50 -0
- src/sam_card_detection.py +614 -0
- src/sam_hand_segmentation.py +158 -0
- web_demo/README.md +1 -1
- web_demo/app.py +7 -1
- web_demo/static/app.js +1 -1
- web_demo/supabase_client.py +13 -1
measure_finger.py
CHANGED
|
@@ -13,13 +13,17 @@ import argparse
|
|
| 13 |
import json
|
| 14 |
import sys
|
| 15 |
from pathlib import Path
|
| 16 |
-
from typing import Optional, Dict, Any, Literal
|
| 17 |
|
| 18 |
import cv2
|
| 19 |
import numpy as np
|
| 20 |
|
| 21 |
from src.image_quality import assess_image_quality
|
| 22 |
from src.card_detection import detect_credit_card, compute_scale_factor
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
from src.finger_segmentation import segment_hand, isolate_finger, clean_mask, get_finger_contour
|
| 24 |
from src.geometry import estimate_finger_axis, localize_ring_zone, localize_ring_zone_from_landmarks, compute_cross_section_width
|
| 25 |
from src.edge_refinement import refine_edges_sobel, should_use_sobel_measurement, compare_edge_methods
|
|
@@ -30,7 +34,7 @@ from src.confidence import (
|
|
| 30 |
compute_edge_quality_confidence,
|
| 31 |
compute_overall_confidence,
|
| 32 |
)
|
| 33 |
-
from src.debug_observer import draw_comprehensive_edge_overlay
|
| 34 |
from src.ring_size import recommend_ring_size, aggregate_ring_sizes, VALID_RING_MODELS, DEFAULT_RING_MODEL
|
| 35 |
from src.image_quality import (
|
| 36 |
check_card_in_frame,
|
|
@@ -114,9 +118,9 @@ Examples:
|
|
| 114 |
parser.add_argument(
|
| 115 |
"--edge-method",
|
| 116 |
type=str,
|
| 117 |
-
default="
|
| 118 |
-
choices=["auto", "contour", "sobel", "compare"],
|
| 119 |
-
help="Edge detection method: auto (quality-based), contour (v0), sobel (
|
| 120 |
)
|
| 121 |
parser.add_argument(
|
| 122 |
"--sobel-threshold",
|
|
@@ -168,6 +172,20 @@ Examples:
|
|
| 168 |
action="store_true",
|
| 169 |
help="[TESTING ONLY] Skip card detection and use dummy scale (allows testing finger segmentation without card)",
|
| 170 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
return parser.parse_args()
|
| 173 |
|
|
@@ -270,6 +288,162 @@ def save_output(output: Dict[str, Any], output_path: str) -> None:
|
|
| 270 |
json.dump(output, f, indent=2)
|
| 271 |
|
| 272 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
def measure_finger(
|
| 274 |
image: np.ndarray,
|
| 275 |
finger_index: FingerIndex = "index",
|
|
@@ -277,11 +451,13 @@ def measure_finger(
|
|
| 277 |
save_intermediate: bool = False,
|
| 278 |
result_png_path: Optional[str] = None,
|
| 279 |
save_debug: bool = False,
|
| 280 |
-
edge_method: str = "
|
| 281 |
sobel_threshold: float = 15.0,
|
| 282 |
sobel_kernel_size: int = 3,
|
| 283 |
use_subpixel: bool = True,
|
| 284 |
skip_card_detection: bool = False,
|
|
|
|
|
|
|
| 285 |
ring_model: str = DEFAULT_RING_MODEL,
|
| 286 |
) -> Dict[str, Any]:
|
| 287 |
"""
|
|
@@ -302,16 +478,14 @@ def measure_finger(
|
|
| 302 |
Returns:
|
| 303 |
Output dictionary with measurement results
|
| 304 |
"""
|
| 305 |
-
# Phase 2: Image quality
|
| 306 |
quality = assess_image_quality(image)
|
| 307 |
print(f"Image quality: blur={quality['blur_score']:.1f}, "
|
| 308 |
f"brightness={quality['brightness']:.1f}, "
|
| 309 |
f"contrast={quality['contrast']:.1f}")
|
| 310 |
-
|
| 311 |
if not quality["passed"]:
|
| 312 |
for issue in quality["issues"]:
|
| 313 |
-
print(f"
|
| 314 |
-
return create_output(fail_reason=quality["fail_reason"])
|
| 315 |
|
| 316 |
# Phase 3: Hand & finger segmentation (MOVED BEFORE CARD DETECTION)
|
| 317 |
# This allows us to rotate the image to canonical orientation first
|
|
@@ -320,7 +494,12 @@ def measure_finger(
|
|
| 320 |
if save_debug and result_png_path is not None:
|
| 321 |
finger_debug_dir = str(Path(result_png_path).parent / "finger_segmentation_debug")
|
| 322 |
|
| 323 |
-
hand_data = segment_hand(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
|
| 325 |
if hand_data is None:
|
| 326 |
print("No hand detected in image")
|
|
@@ -358,7 +537,12 @@ def measure_finger(
|
|
| 358 |
view_angle_ok = True
|
| 359 |
card_detected = False
|
| 360 |
else:
|
| 361 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 362 |
|
| 363 |
if card_result is None:
|
| 364 |
print("Credit card not detected in image")
|
|
@@ -390,6 +574,12 @@ def measure_finger(
|
|
| 390 |
|
| 391 |
# Phase 5: Finger isolation (hand already segmented in Phase 3)
|
| 392 |
h_can, w_can = image_canonical.shape[:2]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
finger_data = isolate_finger(hand_data, finger=finger_index, image_shape=(h_can, w_can))
|
| 394 |
|
| 395 |
if finger_data is None:
|
|
@@ -497,6 +687,16 @@ def measure_finger(
|
|
| 497 |
borderValue=0
|
| 498 |
)
|
| 499 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 500 |
print(f"Rotation applied: {angle_from_vertical:.1f}° CW, finger now vertical")
|
| 501 |
else:
|
| 502 |
print(f"Finger axis is {angle_from_vertical:.1f}° from vertical (within {rotation_threshold}° threshold, no rotation needed)")
|
|
@@ -560,15 +760,33 @@ def measure_finger(
|
|
| 560 |
sobel_measurement = None
|
| 561 |
sobel_failed = False
|
| 562 |
|
| 563 |
-
if edge_method in ["sobel", "auto", "compare"]:
|
| 564 |
try:
|
| 565 |
-
|
| 566 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 567 |
# Create debug directory for edge refinement if debug enabled
|
| 568 |
edge_debug_dir = None
|
| 569 |
if save_debug and result_png_path is not None:
|
| 570 |
edge_debug_dir = str(Path(result_png_path).parent / "edge_refinement_debug")
|
| 571 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 572 |
sobel_measurement = refine_edges_sobel(
|
| 573 |
image=image_canonical, # Use canonical orientation
|
| 574 |
axis_data=axis_data,
|
|
@@ -578,27 +796,29 @@ def measure_finger(
|
|
| 578 |
sobel_threshold=sobel_threshold,
|
| 579 |
kernel_size=sobel_kernel_size,
|
| 580 |
use_subpixel=use_subpixel,
|
|
|
|
| 581 |
debug_dir=edge_debug_dir,
|
|
|
|
|
|
|
| 582 |
)
|
| 583 |
|
| 584 |
sobel_width_cm = sobel_measurement["median_width_cm"]
|
| 585 |
-
print(f"
|
| 586 |
f"({sobel_measurement['num_samples']} samples, "
|
| 587 |
f"std={sobel_measurement['std_width_px']:.2f}px, "
|
| 588 |
f"quality={sobel_measurement['edge_quality']['overall_score']:.3f})")
|
| 589 |
|
| 590 |
except Exception as e:
|
| 591 |
-
print(f"
|
| 592 |
sobel_failed = True
|
| 593 |
-
if edge_method
|
| 594 |
-
# User explicitly requested Sobel, fail if it doesn't work
|
| 595 |
return create_output(
|
| 596 |
card_detected=card_detected,
|
| 597 |
finger_detected=True,
|
| 598 |
scale_px_per_cm=px_per_cm,
|
| 599 |
view_angle_ok=view_angle_ok,
|
| 600 |
fail_reason="sobel_edge_refinement_failed",
|
| 601 |
-
edge_method_used=
|
| 602 |
)
|
| 603 |
|
| 604 |
# Select measurement method based on edge_method flag
|
|
@@ -616,6 +836,12 @@ def measure_finger(
|
|
| 616 |
median_width_cm = sobel_measurement["median_width_cm"]
|
| 617 |
edge_method_used = "sobel"
|
| 618 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 619 |
elif edge_method == "auto":
|
| 620 |
# Automatic selection based on quality
|
| 621 |
if sobel_measurement and not sobel_failed:
|
|
@@ -684,7 +910,7 @@ def measure_finger(
|
|
| 684 |
|
| 685 |
# Calculate edge quality confidence (v1)
|
| 686 |
edge_quality_conf = None
|
| 687 |
-
if edge_method_used in ["sobel", "compare"]:
|
| 688 |
edge_quality_conf = compute_edge_quality_confidence(
|
| 689 |
final_measurement.get("edge_quality")
|
| 690 |
)
|
|
@@ -694,7 +920,7 @@ def measure_finger(
|
|
| 694 |
card_conf,
|
| 695 |
finger_conf,
|
| 696 |
measurement_conf,
|
| 697 |
-
edge_method="sobel" if edge_method_used in ["sobel", "compare"] else "contour",
|
| 698 |
edge_quality_confidence=edge_quality_conf,
|
| 699 |
)
|
| 700 |
|
|
@@ -717,7 +943,7 @@ def measure_finger(
|
|
| 717 |
print(f"Generating result visualization...")
|
| 718 |
|
| 719 |
# Use comprehensive edge overlay (based on Sobel data) + card bounding box
|
| 720 |
-
if edge_method_used in ["sobel", "compare"] and sobel_measurement and not sobel_failed:
|
| 721 |
edge_data = sobel_measurement["edge_data"]
|
| 722 |
roi_bounds = sobel_measurement["roi_data"]["roi_bounds"]
|
| 723 |
width_data = sobel_measurement["width_data"]
|
|
@@ -747,6 +973,25 @@ def measure_finger(
|
|
| 747 |
# Fallback: plain image with axis/zone annotations when Sobel unavailable
|
| 748 |
debug_image = image_canonical.copy()
|
| 749 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 750 |
# Draw card bounding box (transform corners if image was rotated)
|
| 751 |
if card_result is not None and "corners" in card_result:
|
| 752 |
corners = card_result["corners"]
|
|
@@ -758,9 +1003,8 @@ def measure_finger(
|
|
| 758 |
cv2.polylines(debug_image, [pts], isClosed=True,
|
| 759 |
color=(0, 255, 0), thickness=3, lineType=cv2.LINE_AA)
|
| 760 |
|
| 761 |
-
# Save result image
|
| 762 |
-
|
| 763 |
-
cv2.imwrite(result_png_path, debug_image)
|
| 764 |
print(f"Result visualization saved to: {result_png_path}")
|
| 765 |
|
| 766 |
|
|
@@ -789,7 +1033,7 @@ def _measure_single_finger_from_shared(
|
|
| 789 |
view_angle_ok: bool,
|
| 790 |
card_result: Optional[Dict[str, Any]],
|
| 791 |
scale_confidence: float,
|
| 792 |
-
edge_method: str = "
|
| 793 |
sobel_threshold: float = 15.0,
|
| 794 |
sobel_kernel_size: int = 3,
|
| 795 |
use_subpixel: bool = True,
|
|
@@ -807,6 +1051,7 @@ def _measure_single_finger_from_shared(
|
|
| 807 |
)
|
| 808 |
|
| 809 |
h_can, w_can = image_canonical.shape[:2]
|
|
|
|
| 810 |
finger_data = isolate_finger(hand_data, finger=finger_name, image_shape=(h_can, w_can))
|
| 811 |
|
| 812 |
if finger_data is None:
|
|
@@ -858,6 +1103,11 @@ def _measure_single_finger_from_shared(
|
|
| 858 |
cleaned_mask, rotation_matrix, (w_can, h_can),
|
| 859 |
flags=cv2.INTER_NEAREST, borderMode=cv2.BORDER_CONSTANT, borderValue=0,
|
| 860 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 861 |
|
| 862 |
# Ring zone
|
| 863 |
try:
|
|
@@ -891,21 +1141,34 @@ def _measure_single_finger_from_shared(
|
|
| 891 |
# Sobel measurement
|
| 892 |
sobel_measurement = None
|
| 893 |
sobel_failed = False
|
| 894 |
-
if edge_method in ["sobel", "auto", "compare"]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 895 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 896 |
sobel_measurement = refine_edges_sobel(
|
| 897 |
image=img_work, axis_data=axis_data, zone_data=zone_data,
|
| 898 |
scale_px_per_cm=px_per_cm, finger_landmarks=finger_data.get("landmarks"),
|
| 899 |
sobel_threshold=sobel_threshold, kernel_size=sobel_kernel_size,
|
| 900 |
use_subpixel=use_subpixel,
|
|
|
|
|
|
|
|
|
|
| 901 |
)
|
| 902 |
except Exception:
|
| 903 |
sobel_failed = True
|
| 904 |
-
if edge_method
|
| 905 |
return create_output(
|
| 906 |
card_detected=card_detected, finger_detected=True,
|
| 907 |
scale_px_per_cm=px_per_cm, view_angle_ok=view_angle_ok,
|
| 908 |
-
fail_reason="sobel_edge_refinement_failed", edge_method_used=
|
| 909 |
)
|
| 910 |
|
| 911 |
# Select method
|
|
@@ -917,6 +1180,10 @@ def _measure_single_finger_from_shared(
|
|
| 917 |
median_width_cm = sobel_measurement["median_width_cm"]
|
| 918 |
edge_method_used = "sobel"
|
| 919 |
final_measurement = sobel_measurement
|
|
|
|
|
|
|
|
|
|
|
|
|
| 920 |
elif edge_method == "auto":
|
| 921 |
if sobel_measurement and not sobel_failed:
|
| 922 |
should_use, _ = should_use_sobel_measurement(sobel_measurement, contour_measurement)
|
|
@@ -947,11 +1214,11 @@ def _measure_single_finger_from_shared(
|
|
| 947 |
finger_conf = compute_finger_confidence(hand_data, finger_data, mask_area, image_area)
|
| 948 |
measurement_conf = compute_measurement_confidence(final_measurement, median_width_cm)
|
| 949 |
edge_quality_conf = None
|
| 950 |
-
if edge_method_used in ["sobel", "compare"]:
|
| 951 |
edge_quality_conf = compute_edge_quality_confidence(final_measurement.get("edge_quality"))
|
| 952 |
confidence_breakdown = compute_overall_confidence(
|
| 953 |
card_conf, finger_conf, measurement_conf,
|
| 954 |
-
edge_method="sobel" if edge_method_used in ["sobel", "compare"] else "contour",
|
| 955 |
edge_quality_confidence=edge_quality_conf,
|
| 956 |
)
|
| 957 |
|
|
@@ -978,12 +1245,14 @@ def measure_multi_finger(
|
|
| 978 |
confidence_threshold: float = 0.7,
|
| 979 |
result_png_path: Optional[str] = None,
|
| 980 |
save_debug: bool = False,
|
| 981 |
-
edge_method: str = "
|
| 982 |
sobel_threshold: float = 15.0,
|
| 983 |
sobel_kernel_size: int = 3,
|
| 984 |
use_subpixel: bool = True,
|
| 985 |
skip_card_detection: bool = False,
|
| 986 |
no_calibration: bool = False,
|
|
|
|
|
|
|
| 987 |
ring_model: str = DEFAULT_RING_MODEL,
|
| 988 |
) -> Dict[str, Any]:
|
| 989 |
"""Measure index, middle, and ring fingers from a single image.
|
|
@@ -996,14 +1265,13 @@ def measure_multi_finger(
|
|
| 996 |
"""
|
| 997 |
from src.finger_segmentation import FINGER_LANDMARKS
|
| 998 |
|
| 999 |
-
# Phase 1: Image quality
|
| 1000 |
quality = assess_image_quality(image)
|
| 1001 |
print(f"[multi] Image quality: blur={quality['blur_score']:.1f}, "
|
| 1002 |
f"brightness={quality['brightness']:.1f}, contrast={quality['contrast']:.1f}")
|
| 1003 |
if not quality["passed"]:
|
| 1004 |
for issue in quality["issues"]:
|
| 1005 |
-
print(f"
|
| 1006 |
-
return {"fail_reason": quality["fail_reason"], "per_finger": {}, "fingers_measured": 0, "fingers_succeeded": 0}
|
| 1007 |
|
| 1008 |
# Lighting uniformity check
|
| 1009 |
lighting = check_lighting_uniformity(image)
|
|
@@ -1015,7 +1283,12 @@ def measure_multi_finger(
|
|
| 1015 |
if save_debug and result_png_path is not None:
|
| 1016 |
finger_debug_dir = str(Path(result_png_path).parent / "finger_segmentation_debug")
|
| 1017 |
|
| 1018 |
-
hand_data = segment_hand(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1019 |
if hand_data is None:
|
| 1020 |
print("[multi] No hand detected")
|
| 1021 |
return {"fail_reason": "hand_not_detected", "per_finger": {}, "fingers_measured": 0, "fingers_succeeded": 0}
|
|
@@ -1035,7 +1308,12 @@ def measure_multi_finger(
|
|
| 1035 |
view_angle_ok = True
|
| 1036 |
card_detected = False
|
| 1037 |
else:
|
| 1038 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1039 |
if card_result is None:
|
| 1040 |
return {"fail_reason": "card_not_detected", "per_finger": {}, "fingers_measured": 0, "fingers_succeeded": 0}
|
| 1041 |
px_per_cm, scale_confidence = compute_scale_factor(card_result["corners"])
|
|
@@ -1113,6 +1391,8 @@ def measure_multi_finger(
|
|
| 1113 |
card_result=card_result,
|
| 1114 |
px_per_cm=px_per_cm,
|
| 1115 |
result_png_path=result_png_path,
|
|
|
|
|
|
|
| 1116 |
)
|
| 1117 |
|
| 1118 |
# Clean internal data from output
|
|
@@ -1136,6 +1416,8 @@ def _draw_multi_finger_debug(
|
|
| 1136 |
card_result: Optional[Dict[str, Any]],
|
| 1137 |
px_per_cm: float,
|
| 1138 |
result_png_path: str,
|
|
|
|
|
|
|
| 1139 |
) -> None:
|
| 1140 |
"""Generate debug visualization for multi-finger measurement.
|
| 1141 |
|
|
@@ -1154,7 +1436,18 @@ def _draw_multi_finger_debug(
|
|
| 1154 |
vis = image_canonical.copy()
|
| 1155 |
h, w = vis.shape[:2]
|
| 1156 |
|
| 1157 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1158 |
if card_result is not None:
|
| 1159 |
vis = draw_card_overlay(vis, card_result, px_per_cm)
|
| 1160 |
|
|
@@ -1227,8 +1520,7 @@ def _draw_multi_finger_debug(
|
|
| 1227 |
Color.GREEN, 1, cv2.LINE_AA)
|
| 1228 |
count += 1
|
| 1229 |
|
| 1230 |
-
|
| 1231 |
-
cv2.imwrite(result_png_path, vis)
|
| 1232 |
print(f"\n[multi] Debug visualization saved to: {result_png_path}")
|
| 1233 |
|
| 1234 |
|
|
@@ -1266,6 +1558,8 @@ def main() -> int:
|
|
| 1266 |
use_subpixel=not args.no_subpixel,
|
| 1267 |
skip_card_detection=args.skip_card_detection,
|
| 1268 |
no_calibration=args.no_calibration,
|
|
|
|
|
|
|
| 1269 |
ring_model=args.ring_model,
|
| 1270 |
)
|
| 1271 |
|
|
@@ -1301,6 +1595,8 @@ def main() -> int:
|
|
| 1301 |
sobel_kernel_size=args.sobel_kernel_size,
|
| 1302 |
use_subpixel=not args.no_subpixel,
|
| 1303 |
skip_card_detection=args.skip_card_detection,
|
|
|
|
|
|
|
| 1304 |
ring_model=args.ring_model,
|
| 1305 |
)
|
| 1306 |
|
|
|
|
| 13 |
import json
|
| 14 |
import sys
|
| 15 |
from pathlib import Path
|
| 16 |
+
from typing import Optional, Dict, Any, List, Literal, Tuple
|
| 17 |
|
| 18 |
import cv2
|
| 19 |
import numpy as np
|
| 20 |
|
| 21 |
from src.image_quality import assess_image_quality
|
| 22 |
from src.card_detection import detect_credit_card, compute_scale_factor
|
| 23 |
+
from src.sam_card_detection import (
|
| 24 |
+
detect_credit_card_sam_prompt,
|
| 25 |
+
suggest_card_seeds,
|
| 26 |
+
)
|
| 27 |
from src.finger_segmentation import segment_hand, isolate_finger, clean_mask, get_finger_contour
|
| 28 |
from src.geometry import estimate_finger_axis, localize_ring_zone, localize_ring_zone_from_landmarks, compute_cross_section_width
|
| 29 |
from src.edge_refinement import refine_edges_sobel, should_use_sobel_measurement, compare_edge_methods
|
|
|
|
| 34 |
compute_edge_quality_confidence,
|
| 35 |
compute_overall_confidence,
|
| 36 |
)
|
| 37 |
+
from src.debug_observer import draw_comprehensive_edge_overlay, draw_hand_skeleton
|
| 38 |
from src.ring_size import recommend_ring_size, aggregate_ring_sizes, VALID_RING_MODELS, DEFAULT_RING_MODEL
|
| 39 |
from src.image_quality import (
|
| 40 |
check_card_in_frame,
|
|
|
|
| 118 |
parser.add_argument(
|
| 119 |
"--edge-method",
|
| 120 |
type=str,
|
| 121 |
+
default="mask",
|
| 122 |
+
choices=["auto", "contour", "sobel", "mask", "compare"],
|
| 123 |
+
help="Edge detection method: auto (quality-based), contour (v0), sobel (pure Sobel gradient, no SAM mask), mask (SAM mask boundary only, no Sobel), compare (both) (default: mask)",
|
| 124 |
)
|
| 125 |
parser.add_argument(
|
| 126 |
"--sobel-threshold",
|
|
|
|
| 172 |
action="store_true",
|
| 173 |
help="[TESTING ONLY] Skip card detection and use dummy scale (allows testing finger segmentation without card)",
|
| 174 |
)
|
| 175 |
+
parser.add_argument(
|
| 176 |
+
"--card-method",
|
| 177 |
+
type=str,
|
| 178 |
+
choices=["classic", "sam"],
|
| 179 |
+
default="classic",
|
| 180 |
+
help="Card detection backend: 'classic' (Canny/adaptive/Otsu/color waterfall) or 'sam' (SAM 2.1 mask segmentation). Default: classic.",
|
| 181 |
+
)
|
| 182 |
+
parser.add_argument(
|
| 183 |
+
"--hand-mask",
|
| 184 |
+
type=str,
|
| 185 |
+
choices=["synthetic", "sam"],
|
| 186 |
+
default="sam",
|
| 187 |
+
help="Hand mask source: 'synthetic' (MediaPipe landmark convex hull) or 'sam' (SAM 2.1 pixel-accurate). Default: sam.",
|
| 188 |
+
)
|
| 189 |
|
| 190 |
return parser.parse_args()
|
| 191 |
|
|
|
|
| 288 |
json.dump(output, f, indent=2)
|
| 289 |
|
| 290 |
|
| 291 |
+
# Debug visualisations are for human inspection, so there's no reason to
|
| 292 |
+
# write a 12-megapixel PNG (encoding alone can take 1–2s on CPU). Cap the
|
| 293 |
+
# long side and encode as JPEG — the on-disk path keeps its .png extension
|
| 294 |
+
# for backwards compat with existing callers, but we always write JPEG bytes
|
| 295 |
+
# (whether or not a downscale happened) to keep encoding well under ~100ms.
|
| 296 |
+
_DEBUG_VIS_MAX_LONG_SIDE = 1600
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
def _overlay_hand_skeleton(
    image: np.ndarray,
    landmarks: Optional[np.ndarray],
    rotation_matrix: Optional[np.ndarray] = None,
) -> np.ndarray:
    """Render the 21-point MediaPipe hand skeleton on top of a debug image.

    The landmarks are expected in the canonical-image frame. When the caller
    rotated the image to make the finger vertical, the same
    ``rotation_matrix`` should be supplied so the landmarks are moved into the
    rotated frame before drawing.

    Args:
        image: Debug image handed to ``draw_hand_skeleton``.
        landmarks: Hand landmarks, or ``None``. Anything with fewer than 21
            entries is treated as unusable.
        rotation_matrix: Optional rotation applied to the landmarks via
            ``transform_points_rotation`` before drawing.

    Returns:
        ``image`` itself (untouched) when the landmarks are missing or
        incomplete; otherwise whatever ``draw_hand_skeleton`` returns.
    """
    has_full_hand = landmarks is not None and len(landmarks) >= 21
    if not has_full_hand:
        return image

    skeleton_pts = np.asarray(landmarks, dtype=np.float64)
    if rotation_matrix is None:
        return draw_hand_skeleton(image, skeleton_pts)

    # Imported lazily, only on the path that actually needs the transform.
    from src.geometry import transform_points_rotation

    rotated_pts = transform_points_rotation(skeleton_pts, rotation_matrix)
    return draw_hand_skeleton(image, rotated_pts)
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
def _overlay_sam_masks(
    image: np.ndarray,
    hand_mask: Optional[np.ndarray] = None,
    card_mask: Optional[np.ndarray] = None,
    rotation_matrix: Optional[np.ndarray] = None,
) -> np.ndarray:
    """Blend the SAM hand and card masks into a debug image.

    The hand mask is tinted cyan and the card mask green (BGR colours). Each
    mask gets a semi-transparent fill and a solid 2 px external contour, so
    the silhouettes stay visible under overlays drawn later.

    Masks whose size differs from ``image`` are resized with nearest-neighbour
    interpolation. If ``rotation_matrix`` is given (the caller rotated the
    canonical image to align the finger), each mask is warped with the same
    matrix so it stays registered with the image.

    Args:
        image: Image to draw on; it is copied, never modified in place.
        hand_mask: Optional hand mask; non-zero pixels count as foreground.
        card_mask: Optional card mask; non-zero pixels count as foreground.
        rotation_matrix: Optional affine matrix passed to ``cv2.warpAffine``.

    Returns:
        ``image`` itself when both masks are ``None``; otherwise a new image
        with the overlays applied.
    """
    if hand_mask is None and card_mask is None:
        return image

    height, width = image.shape[:2]
    canvas = image.copy()

    def _to_aligned_u8(raw: Optional[np.ndarray]) -> Optional[np.ndarray]:
        # Normalise a mask to uint8 at image size, rotated if requested.
        if raw is None:
            return None
        if raw.dtype == np.uint8:
            aligned = raw.copy()
        else:
            aligned = (raw > 0).astype(np.uint8) * 255
        if aligned.shape[:2] != (height, width):
            aligned = cv2.resize(aligned, (width, height), interpolation=cv2.INTER_NEAREST)
        if rotation_matrix is None:
            return aligned
        return cv2.warpAffine(
            aligned, rotation_matrix, (width, height),
            flags=cv2.INTER_NEAREST,
            borderMode=cv2.BORDER_CONSTANT,
            borderValue=0,
        )

    # (mask, BGR colour, fill opacity) -- hand first, then card.
    candidates = [
        (_to_aligned_u8(hand_mask), (255, 255, 0), 0.18),  # cyan in BGR
        (_to_aligned_u8(card_mask), (0, 255, 0), 0.22),  # green in BGR
    ]
    layers = [layer for layer in candidates if layer[0] is not None]

    # Pass 1: semi-transparent fills.
    for mask_u8, colour, opacity in layers:
        fill = np.zeros_like(canvas)
        fill[mask_u8 > 0] = colour
        canvas = cv2.addWeighted(canvas, 1.0, fill, opacity, 0)

    # Pass 2: solid outlines on top of both fills.
    for mask_u8, colour, _opacity in layers:
        outlines, _ = cv2.findContours(mask_u8, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        cv2.drawContours(canvas, outlines, -1, colour, 2, cv2.LINE_AA)

    return canvas
|
| 381 |
+
|
| 382 |
+
|
| 383 |
+
def _save_debug_visualization(path: str, image: np.ndarray) -> None:
    """Downscale and fast-encode a debug overlay image, then write it to ``path``.

    The output is meant for display only, so the image is shrunk until its
    long side is at most ``_DEBUG_VIS_MAX_LONG_SIDE`` pixels and is then
    always encoded as JPEG (quality 85), regardless of the extension carried
    by ``path``. The file path itself is never changed, so existing callers
    that pass a ``.png`` path keep working.

    Args:
        path: Destination file path. Missing parent directories are created.
        image: Image array to save; its first two dimensions are read as
            (height, width).
    """
    Path(path).parent.mkdir(parents=True, exist_ok=True)

    h, w = image.shape[:2]
    long_side = max(h, w)
    if long_side > _DEBUG_VIS_MAX_LONG_SIDE:
        scale = _DEBUG_VIS_MAX_LONG_SIDE / long_side
        # Clamp each side to >= 1 px: for an extremely elongated image the
        # short side would otherwise round down to 0, which cv2.resize rejects.
        new_size = (
            max(1, int(round(w * scale))),
            max(1, int(round(h * scale))),
        )
        image = cv2.resize(image, new_size, interpolation=cv2.INTER_AREA)

    # JPEG is much cheaper to encode than PNG at this size and is good enough
    # for debug overlays.
    ok, buf = cv2.imencode(".jpg", image, [int(cv2.IMWRITE_JPEG_QUALITY), 85])
    if not ok:
        # Fall back to whatever format imwrite picks from the extension.
        cv2.imwrite(path, image)
        return

    with open(path, "wb") as f:
        f.write(buf.tobytes())
|
| 405 |
+
|
| 406 |
+
|
| 407 |
+
def _sam_card_detect(
    image_canonical: np.ndarray,
    hand_data: Dict[str, Any],
    save_debug: bool,
    result_png_path: Optional[str],
) -> Optional[Dict[str, Any]]:
    """Detect the credit card with prompt-based SAM segmentation.

    Seed points come from ``suggest_card_seeds`` (hand mask + image size).
    When hand landmarks are available, the mean of landmarks 0, 5, 9, 13 and
    17 is passed as a single negative prompt point.

    There is deliberately no AMG fallback: empirically, if the 5x5 prompt
    grid doesn't find the card, AMG won't either, and the ~20s AMG retry is
    pure cost.

    Args:
        image_canonical: Image in canonical orientation.
        hand_data: Hand segmentation result; the ``"mask"`` and
            ``"landmarks"`` entries are read.
        save_debug: Whether debug artefacts should be written.
        result_png_path: Result image path; when debugging is on, a
            ``sam_card_prompt_debug`` directory next to it is used.

    Returns:
        The result of ``detect_credit_card_sam_prompt``, or ``None`` when the
        hand mask is missing or no seed points were suggested.
    """
    debug_root = None
    if save_debug and result_png_path is not None:
        debug_root = Path(result_png_path).parent

    hand_mask = hand_data.get("mask")
    landmarks = hand_data.get("landmarks")
    if hand_mask is None:
        return None

    seeds = suggest_card_seeds(hand_mask, image_canonical.shape[:2])
    if not seeds:
        return None

    negatives: List[Tuple[int, int]] = []
    if landmarks is not None:
        # Landmark indices averaged into one "not the card" prompt point.
        palm_center = np.mean(landmarks[[0, 5, 9, 13, 17], :2], axis=0)
        neg_x = int(round(palm_center[0]))
        neg_y = int(round(palm_center[1]))
        negatives.append((neg_x, neg_y))

    if debug_root:
        prompt_debug = str(debug_root / "sam_card_prompt_debug")
    else:
        prompt_debug = None

    return detect_credit_card_sam_prompt(
        image_canonical,
        seed_points=seeds,
        negative_points=negatives,
        debug_dir=prompt_debug,
    )
|
| 445 |
+
|
| 446 |
+
|
| 447 |
def measure_finger(
|
| 448 |
image: np.ndarray,
|
| 449 |
finger_index: FingerIndex = "index",
|
|
|
|
| 451 |
save_intermediate: bool = False,
|
| 452 |
result_png_path: Optional[str] = None,
|
| 453 |
save_debug: bool = False,
|
| 454 |
+
edge_method: str = "mask",
|
| 455 |
sobel_threshold: float = 15.0,
|
| 456 |
sobel_kernel_size: int = 3,
|
| 457 |
use_subpixel: bool = True,
|
| 458 |
skip_card_detection: bool = False,
|
| 459 |
+
card_method: str = "classic",
|
| 460 |
+
hand_mask_method: str = "sam",
|
| 461 |
ring_model: str = DEFAULT_RING_MODEL,
|
| 462 |
) -> Dict[str, Any]:
|
| 463 |
"""
|
|
|
|
| 478 |
Returns:
|
| 479 |
Output dictionary with measurement results
|
| 480 |
"""
|
| 481 |
+
# Phase 2: Image quality metrics (informational only — no hard fail)
|
| 482 |
quality = assess_image_quality(image)
|
| 483 |
print(f"Image quality: blur={quality['blur_score']:.1f}, "
|
| 484 |
f"brightness={quality['brightness']:.1f}, "
|
| 485 |
f"contrast={quality['contrast']:.1f}")
|
|
|
|
| 486 |
if not quality["passed"]:
|
| 487 |
for issue in quality["issues"]:
|
| 488 |
+
print(f" Note: {issue}")
|
|
|
|
| 489 |
|
| 490 |
# Phase 3: Hand & finger segmentation (MOVED BEFORE CARD DETECTION)
|
| 491 |
# This allows us to rotate the image to canonical orientation first
|
|
|
|
| 494 |
if save_debug and result_png_path is not None:
|
| 495 |
finger_debug_dir = str(Path(result_png_path).parent / "finger_segmentation_debug")
|
| 496 |
|
| 497 |
+
hand_data = segment_hand(
|
| 498 |
+
image,
|
| 499 |
+
finger=finger_index,
|
| 500 |
+
debug_dir=finger_debug_dir,
|
| 501 |
+
use_sam_mask=(hand_mask_method == "sam"),
|
| 502 |
+
)
|
| 503 |
|
| 504 |
if hand_data is None:
|
| 505 |
print("No hand detected in image")
|
|
|
|
| 537 |
view_angle_ok = True
|
| 538 |
card_detected = False
|
| 539 |
else:
|
| 540 |
+
if card_method == "sam":
|
| 541 |
+
card_result = _sam_card_detect(
|
| 542 |
+
image_canonical, hand_data, save_debug, result_png_path
|
| 543 |
+
)
|
| 544 |
+
else:
|
| 545 |
+
card_result = detect_credit_card(image_canonical, debug_dir=card_debug_dir)
|
| 546 |
|
| 547 |
if card_result is None:
|
| 548 |
print("Credit card not detected in image")
|
|
|
|
| 574 |
|
| 575 |
# Phase 5: Finger isolation (hand already segmented in Phase 3)
|
| 576 |
h_can, w_can = image_canonical.shape[:2]
|
| 577 |
+
# Keep a reference to the raw SAM hand mask (pre-isolation polygon clip).
|
| 578 |
+
# mask_only edge detection needs the untrimmed silhouette — the isolation
|
| 579 |
+
# polygon in _create_finger_roi_mask is only ~1.08x the landmark segment
|
| 580 |
+
# length and can cut into a wider-than-average finger, which would make
|
| 581 |
+
# the mask boundary narrower than the true SAM boundary.
|
| 582 |
+
raw_hand_mask = hand_data.get("mask")
|
| 583 |
finger_data = isolate_finger(hand_data, finger=finger_index, image_shape=(h_can, w_can))
|
| 584 |
|
| 585 |
if finger_data is None:
|
|
|
|
| 687 |
borderValue=0
|
| 688 |
)
|
| 689 |
|
| 690 |
+
# Also warp the raw SAM hand mask so mask_only mode can read the
|
| 691 |
+
# untrimmed silhouette in the same rotated frame as the image.
|
| 692 |
+
if raw_hand_mask is not None:
|
| 693 |
+
raw_hand_mask = cv2.warpAffine(
|
| 694 |
+
raw_hand_mask, rotation_matrix, (w_can, h_can),
|
| 695 |
+
flags=cv2.INTER_NEAREST,
|
| 696 |
+
borderMode=cv2.BORDER_CONSTANT,
|
| 697 |
+
borderValue=0,
|
| 698 |
+
)
|
| 699 |
+
|
| 700 |
print(f"Rotation applied: {angle_from_vertical:.1f}° CW, finger now vertical")
|
| 701 |
else:
|
| 702 |
print(f"Finger axis is {angle_from_vertical:.1f}° from vertical (within {rotation_threshold}° threshold, no rotation needed)")
|
|
|
|
| 760 |
sobel_measurement = None
|
| 761 |
sobel_failed = False
|
| 762 |
|
| 763 |
+
if edge_method in ["sobel", "mask", "auto", "compare"]:
|
| 764 |
try:
|
| 765 |
+
# Pure Sobel mode drops the SAM mask; pure mask mode uses the SAM
|
| 766 |
+
# boundary directly without gradient snapping; auto/compare stay
|
| 767 |
+
# on the legacy hybrid path that combines both.
|
| 768 |
+
if edge_method == "sobel":
|
| 769 |
+
mask_mode = "sobel_only"
|
| 770 |
+
elif edge_method == "mask":
|
| 771 |
+
mask_mode = "mask_only"
|
| 772 |
+
else:
|
| 773 |
+
mask_mode = "hybrid"
|
| 774 |
+
|
| 775 |
+
print(f"Running edge refinement (mode={mask_mode}, threshold={sobel_threshold}, kernel={sobel_kernel_size})...")
|
| 776 |
+
|
| 777 |
# Create debug directory for edge refinement if debug enabled
|
| 778 |
edge_debug_dir = None
|
| 779 |
if save_debug and result_png_path is not None:
|
| 780 |
edge_debug_dir = str(Path(result_png_path).parent / "edge_refinement_debug")
|
| 781 |
+
|
| 782 |
+
# mask_only reads boundaries directly from the mask, so it needs
|
| 783 |
+
# the *raw* SAM silhouette. The hybrid/sobel_only paths keep the
|
| 784 |
+
# isolation-trimmed mask they were validated against.
|
| 785 |
+
if mask_mode == "mask_only" and raw_hand_mask is not None:
|
| 786 |
+
edge_mask_input = raw_hand_mask
|
| 787 |
+
else:
|
| 788 |
+
edge_mask_input = cleaned_mask
|
| 789 |
+
|
| 790 |
sobel_measurement = refine_edges_sobel(
|
| 791 |
image=image_canonical, # Use canonical orientation
|
| 792 |
axis_data=axis_data,
|
|
|
|
| 796 |
sobel_threshold=sobel_threshold,
|
| 797 |
kernel_size=sobel_kernel_size,
|
| 798 |
use_subpixel=use_subpixel,
|
| 799 |
+
finger_mask=edge_mask_input,
|
| 800 |
debug_dir=edge_debug_dir,
|
| 801 |
+
mask_mode=mask_mode,
|
| 802 |
+
finger_name=finger_data.get("finger_name"),
|
| 803 |
)
|
| 804 |
|
| 805 |
sobel_width_cm = sobel_measurement["median_width_cm"]
|
| 806 |
+
print(f"Edge width: {sobel_width_cm:.4f}cm "
|
| 807 |
f"({sobel_measurement['num_samples']} samples, "
|
| 808 |
f"std={sobel_measurement['std_width_px']:.2f}px, "
|
| 809 |
f"quality={sobel_measurement['edge_quality']['overall_score']:.3f})")
|
| 810 |
|
| 811 |
except Exception as e:
|
| 812 |
+
print(f"Edge refinement failed: {e}")
|
| 813 |
sobel_failed = True
|
| 814 |
+
if edge_method in ("sobel", "mask"):
|
|
|
|
| 815 |
return create_output(
|
| 816 |
card_detected=card_detected,
|
| 817 |
finger_detected=True,
|
| 818 |
scale_px_per_cm=px_per_cm,
|
| 819 |
view_angle_ok=view_angle_ok,
|
| 820 |
fail_reason="sobel_edge_refinement_failed",
|
| 821 |
+
edge_method_used=edge_method,
|
| 822 |
)
|
| 823 |
|
| 824 |
# Select measurement method based on edge_method flag
|
|
|
|
| 836 |
median_width_cm = sobel_measurement["median_width_cm"]
|
| 837 |
edge_method_used = "sobel"
|
| 838 |
|
| 839 |
+
elif edge_method == "mask":
|
| 840 |
+
# Use SAM-mask boundary directly (already handled failure case above)
|
| 841 |
+
final_measurement = sobel_measurement
|
| 842 |
+
median_width_cm = sobel_measurement["median_width_cm"]
|
| 843 |
+
edge_method_used = "mask"
|
| 844 |
+
|
| 845 |
elif edge_method == "auto":
|
| 846 |
# Automatic selection based on quality
|
| 847 |
if sobel_measurement and not sobel_failed:
|
|
|
|
| 910 |
|
| 911 |
# Calculate edge quality confidence (v1)
|
| 912 |
edge_quality_conf = None
|
| 913 |
+
if edge_method_used in ["sobel", "mask", "compare"]:
|
| 914 |
edge_quality_conf = compute_edge_quality_confidence(
|
| 915 |
final_measurement.get("edge_quality")
|
| 916 |
)
|
|
|
|
| 920 |
card_conf,
|
| 921 |
finger_conf,
|
| 922 |
measurement_conf,
|
| 923 |
+
edge_method="sobel" if edge_method_used in ["sobel", "mask", "compare"] else "contour",
|
| 924 |
edge_quality_confidence=edge_quality_conf,
|
| 925 |
)
|
| 926 |
|
|
|
|
| 943 |
print(f"Generating result visualization...")
|
| 944 |
|
| 945 |
# Use comprehensive edge overlay (based on Sobel data) + card bounding box
|
| 946 |
+
if edge_method_used in ["sobel", "mask", "compare"] and sobel_measurement and not sobel_failed:
|
| 947 |
edge_data = sobel_measurement["edge_data"]
|
| 948 |
roi_bounds = sobel_measurement["roi_data"]["roi_bounds"]
|
| 949 |
width_data = sobel_measurement["width_data"]
|
|
|
|
| 973 |
# Fallback: plain image with axis/zone annotations when Sobel unavailable
|
| 974 |
debug_image = image_canonical.copy()
|
| 975 |
|
| 976 |
+
# Tint SAM hand + card masks as underlays. Both masks live in the
|
| 977 |
+
# pre-precise-rotation canonical frame, so apply the same rotation
|
| 978 |
+
# matrix that was used to align the finger.
|
| 979 |
+
debug_image = _overlay_sam_masks(
|
| 980 |
+
debug_image,
|
| 981 |
+
hand_mask=hand_data.get("mask") if hand_data else None,
|
| 982 |
+
card_mask=card_result.get("mask") if card_result else None,
|
| 983 |
+
rotation_matrix=rotation_matrix,
|
| 984 |
+
)
|
| 985 |
+
|
| 986 |
+
# Draw the MediaPipe hand skeleton so reviewers can see the detected
|
| 987 |
+
# landmarks. hand_data landmarks are in the pre-precise-rotation
|
| 988 |
+
# canonical frame, so apply the same rotation_matrix here.
|
| 989 |
+
debug_image = _overlay_hand_skeleton(
|
| 990 |
+
debug_image,
|
| 991 |
+
landmarks=hand_data.get("landmarks") if hand_data else None,
|
| 992 |
+
rotation_matrix=rotation_matrix,
|
| 993 |
+
)
|
| 994 |
+
|
| 995 |
# Draw card bounding box (transform corners if image was rotated)
|
| 996 |
if card_result is not None and "corners" in card_result:
|
| 997 |
corners = card_result["corners"]
|
|
|
|
| 1003 |
cv2.polylines(debug_image, [pts], isClosed=True,
|
| 1004 |
color=(0, 255, 0), thickness=3, lineType=cv2.LINE_AA)
|
| 1005 |
|
| 1006 |
+
# Save result image (downscaled + JPEG-encoded for speed)
|
| 1007 |
+
_save_debug_visualization(result_png_path, debug_image)
|
|
|
|
| 1008 |
print(f"Result visualization saved to: {result_png_path}")
|
| 1009 |
|
| 1010 |
|
|
|
|
| 1033 |
view_angle_ok: bool,
|
| 1034 |
card_result: Optional[Dict[str, Any]],
|
| 1035 |
scale_confidence: float,
|
| 1036 |
+
edge_method: str = "mask",
|
| 1037 |
sobel_threshold: float = 15.0,
|
| 1038 |
sobel_kernel_size: int = 3,
|
| 1039 |
use_subpixel: bool = True,
|
|
|
|
| 1051 |
)
|
| 1052 |
|
| 1053 |
h_can, w_can = image_canonical.shape[:2]
|
| 1054 |
+
raw_hand_mask = hand_data.get("mask")
|
| 1055 |
finger_data = isolate_finger(hand_data, finger=finger_name, image_shape=(h_can, w_can))
|
| 1056 |
|
| 1057 |
if finger_data is None:
|
|
|
|
| 1103 |
cleaned_mask, rotation_matrix, (w_can, h_can),
|
| 1104 |
flags=cv2.INTER_NEAREST, borderMode=cv2.BORDER_CONSTANT, borderValue=0,
|
| 1105 |
)
|
| 1106 |
+
if raw_hand_mask is not None:
|
| 1107 |
+
raw_hand_mask = cv2.warpAffine(
|
| 1108 |
+
raw_hand_mask, rotation_matrix, (w_can, h_can),
|
| 1109 |
+
flags=cv2.INTER_NEAREST, borderMode=cv2.BORDER_CONSTANT, borderValue=0,
|
| 1110 |
+
)
|
| 1111 |
|
| 1112 |
# Ring zone
|
| 1113 |
try:
|
|
|
|
| 1141 |
# Sobel measurement
|
| 1142 |
sobel_measurement = None
|
| 1143 |
sobel_failed = False
|
| 1144 |
+
if edge_method in ["sobel", "mask", "auto", "compare"]:
|
| 1145 |
+
if edge_method == "sobel":
|
| 1146 |
+
mask_mode = "sobel_only"
|
| 1147 |
+
elif edge_method == "mask":
|
| 1148 |
+
mask_mode = "mask_only"
|
| 1149 |
+
else:
|
| 1150 |
+
mask_mode = "hybrid"
|
| 1151 |
try:
|
| 1152 |
+
if mask_mode == "mask_only" and raw_hand_mask is not None:
|
| 1153 |
+
edge_mask_input = raw_hand_mask
|
| 1154 |
+
else:
|
| 1155 |
+
edge_mask_input = cleaned_mask
|
| 1156 |
sobel_measurement = refine_edges_sobel(
|
| 1157 |
image=img_work, axis_data=axis_data, zone_data=zone_data,
|
| 1158 |
scale_px_per_cm=px_per_cm, finger_landmarks=finger_data.get("landmarks"),
|
| 1159 |
sobel_threshold=sobel_threshold, kernel_size=sobel_kernel_size,
|
| 1160 |
use_subpixel=use_subpixel,
|
| 1161 |
+
finger_mask=edge_mask_input,
|
| 1162 |
+
mask_mode=mask_mode,
|
| 1163 |
+
finger_name=finger_name,
|
| 1164 |
)
|
| 1165 |
except Exception:
|
| 1166 |
sobel_failed = True
|
| 1167 |
+
if edge_method in ("sobel", "mask"):
|
| 1168 |
return create_output(
|
| 1169 |
card_detected=card_detected, finger_detected=True,
|
| 1170 |
scale_px_per_cm=px_per_cm, view_angle_ok=view_angle_ok,
|
| 1171 |
+
fail_reason="sobel_edge_refinement_failed", edge_method_used=edge_method,
|
| 1172 |
)
|
| 1173 |
|
| 1174 |
# Select method
|
|
|
|
| 1180 |
median_width_cm = sobel_measurement["median_width_cm"]
|
| 1181 |
edge_method_used = "sobel"
|
| 1182 |
final_measurement = sobel_measurement
|
| 1183 |
+
elif edge_method == "mask" and sobel_measurement:
|
| 1184 |
+
median_width_cm = sobel_measurement["median_width_cm"]
|
| 1185 |
+
edge_method_used = "mask"
|
| 1186 |
+
final_measurement = sobel_measurement
|
| 1187 |
elif edge_method == "auto":
|
| 1188 |
if sobel_measurement and not sobel_failed:
|
| 1189 |
should_use, _ = should_use_sobel_measurement(sobel_measurement, contour_measurement)
|
|
|
|
| 1214 |
finger_conf = compute_finger_confidence(hand_data, finger_data, mask_area, image_area)
|
| 1215 |
measurement_conf = compute_measurement_confidence(final_measurement, median_width_cm)
|
| 1216 |
edge_quality_conf = None
|
| 1217 |
+
if edge_method_used in ["sobel", "mask", "compare"]:
|
| 1218 |
edge_quality_conf = compute_edge_quality_confidence(final_measurement.get("edge_quality"))
|
| 1219 |
confidence_breakdown = compute_overall_confidence(
|
| 1220 |
card_conf, finger_conf, measurement_conf,
|
| 1221 |
+
edge_method="sobel" if edge_method_used in ["sobel", "mask", "compare"] else "contour",
|
| 1222 |
edge_quality_confidence=edge_quality_conf,
|
| 1223 |
)
|
| 1224 |
|
|
|
|
| 1245 |
confidence_threshold: float = 0.7,
|
| 1246 |
result_png_path: Optional[str] = None,
|
| 1247 |
save_debug: bool = False,
|
| 1248 |
+
edge_method: str = "mask",
|
| 1249 |
sobel_threshold: float = 15.0,
|
| 1250 |
sobel_kernel_size: int = 3,
|
| 1251 |
use_subpixel: bool = True,
|
| 1252 |
skip_card_detection: bool = False,
|
| 1253 |
no_calibration: bool = False,
|
| 1254 |
+
card_method: str = "classic",
|
| 1255 |
+
hand_mask_method: str = "sam",
|
| 1256 |
ring_model: str = DEFAULT_RING_MODEL,
|
| 1257 |
) -> Dict[str, Any]:
|
| 1258 |
"""Measure index, middle, and ring fingers from a single image.
|
|
|
|
| 1265 |
"""
|
| 1266 |
from src.finger_segmentation import FINGER_LANDMARKS
|
| 1267 |
|
| 1268 |
+
# Phase 1: Image quality metrics (informational only — no hard fail)
|
| 1269 |
quality = assess_image_quality(image)
|
| 1270 |
print(f"[multi] Image quality: blur={quality['blur_score']:.1f}, "
|
| 1271 |
f"brightness={quality['brightness']:.1f}, contrast={quality['contrast']:.1f}")
|
| 1272 |
if not quality["passed"]:
|
| 1273 |
for issue in quality["issues"]:
|
| 1274 |
+
print(f" Note: {issue}")
|
|
|
|
| 1275 |
|
| 1276 |
# Lighting uniformity check
|
| 1277 |
lighting = check_lighting_uniformity(image)
|
|
|
|
| 1283 |
if save_debug and result_png_path is not None:
|
| 1284 |
finger_debug_dir = str(Path(result_png_path).parent / "finger_segmentation_debug")
|
| 1285 |
|
| 1286 |
+
hand_data = segment_hand(
|
| 1287 |
+
image,
|
| 1288 |
+
finger="index",
|
| 1289 |
+
debug_dir=finger_debug_dir,
|
| 1290 |
+
use_sam_mask=(hand_mask_method == "sam"),
|
| 1291 |
+
)
|
| 1292 |
if hand_data is None:
|
| 1293 |
print("[multi] No hand detected")
|
| 1294 |
return {"fail_reason": "hand_not_detected", "per_finger": {}, "fingers_measured": 0, "fingers_succeeded": 0}
|
|
|
|
| 1308 |
view_angle_ok = True
|
| 1309 |
card_detected = False
|
| 1310 |
else:
|
| 1311 |
+
if card_method == "sam":
|
| 1312 |
+
card_result = _sam_card_detect(
|
| 1313 |
+
image_canonical, hand_data, save_debug, result_png_path
|
| 1314 |
+
)
|
| 1315 |
+
else:
|
| 1316 |
+
card_result = detect_credit_card(image_canonical, debug_dir=card_debug_dir)
|
| 1317 |
if card_result is None:
|
| 1318 |
return {"fail_reason": "card_not_detected", "per_finger": {}, "fingers_measured": 0, "fingers_succeeded": 0}
|
| 1319 |
px_per_cm, scale_confidence = compute_scale_factor(card_result["corners"])
|
|
|
|
| 1391 |
card_result=card_result,
|
| 1392 |
px_per_cm=px_per_cm,
|
| 1393 |
result_png_path=result_png_path,
|
| 1394 |
+
hand_mask=hand_data.get("mask") if hand_data else None,
|
| 1395 |
+
hand_landmarks=hand_data.get("landmarks") if hand_data else None,
|
| 1396 |
)
|
| 1397 |
|
| 1398 |
# Clean internal data from output
|
|
|
|
| 1416 |
card_result: Optional[Dict[str, Any]],
|
| 1417 |
px_per_cm: float,
|
| 1418 |
result_png_path: str,
|
| 1419 |
+
hand_mask: Optional[np.ndarray] = None,
|
| 1420 |
+
hand_landmarks: Optional[np.ndarray] = None,
|
| 1421 |
) -> None:
|
| 1422 |
"""Generate debug visualization for multi-finger measurement.
|
| 1423 |
|
|
|
|
| 1436 |
vis = image_canonical.copy()
|
| 1437 |
h, w = vis.shape[:2]
|
| 1438 |
|
| 1439 |
+
# SAM silhouettes (hand + card) as tinted underlays
|
| 1440 |
+
vis = _overlay_sam_masks(
|
| 1441 |
+
vis,
|
| 1442 |
+
hand_mask=hand_mask,
|
| 1443 |
+
card_mask=(card_result.get("mask") if card_result else None),
|
| 1444 |
+
)
|
| 1445 |
+
|
| 1446 |
+
# MediaPipe hand skeleton (canonical frame — no rotation needed since the
|
| 1447 |
+
# multi-finger viz composes per-finger overlays via inverse rotation).
|
| 1448 |
+
vis = _overlay_hand_skeleton(vis, landmarks=hand_landmarks)
|
| 1449 |
+
|
| 1450 |
+
# Draw card bounding box / dimensions on top of the tinted card mask
|
| 1451 |
if card_result is not None:
|
| 1452 |
vis = draw_card_overlay(vis, card_result, px_per_cm)
|
| 1453 |
|
|
|
|
| 1520 |
Color.GREEN, 1, cv2.LINE_AA)
|
| 1521 |
count += 1
|
| 1522 |
|
| 1523 |
+
_save_debug_visualization(result_png_path, vis)
|
|
|
|
| 1524 |
print(f"\n[multi] Debug visualization saved to: {result_png_path}")
|
| 1525 |
|
| 1526 |
|
|
|
|
| 1558 |
use_subpixel=not args.no_subpixel,
|
| 1559 |
skip_card_detection=args.skip_card_detection,
|
| 1560 |
no_calibration=args.no_calibration,
|
| 1561 |
+
card_method=args.card_method,
|
| 1562 |
+
hand_mask_method=args.hand_mask,
|
| 1563 |
ring_model=args.ring_model,
|
| 1564 |
)
|
| 1565 |
|
|
|
|
| 1595 |
sobel_kernel_size=args.sobel_kernel_size,
|
| 1596 |
use_subpixel=not args.no_subpixel,
|
| 1597 |
skip_card_detection=args.skip_card_detection,
|
| 1598 |
+
card_method=args.card_method,
|
| 1599 |
+
hand_mask_method=args.hand_mask,
|
| 1600 |
ring_model=args.ring_model,
|
| 1601 |
)
|
| 1602 |
|
requirements.txt
CHANGED
|
@@ -7,3 +7,8 @@ flask>=3.0.0
|
|
| 7 |
gunicorn>=21.2.0
|
| 8 |
openai>=1.0.0
|
| 9 |
supabase>=2.0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
gunicorn>=21.2.0
|
| 8 |
openai>=1.0.0
|
| 9 |
supabase>=2.0.0
|
| 10 |
+
# SAM 2.1 via HuggingFace transformers (card segmentation)
|
| 11 |
+
torch>=2.4.0
|
| 12 |
+
torchvision>=0.19.0
|
| 13 |
+
transformers>=4.47.0
|
| 14 |
+
pillow>=10.0.0
|
script/compare_hand_sam.py
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compare hand-mask quality across backends on a single image.
|
| 2 |
+
|
| 3 |
+
Runs MediaPipe (current pipeline), SAM 2.1 tiny, and SAM 2.1 small using
|
| 4 |
+
a point prompt at the palm center from MediaPipe landmarks. Saves a 4-panel
|
| 5 |
+
side-by-side comparison and also writes each mask's contour + edge crop.
|
| 6 |
+
"""
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import sys
|
| 10 |
+
import time
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import Tuple
|
| 13 |
+
|
| 14 |
+
import cv2
|
| 15 |
+
import numpy as np
|
| 16 |
+
from PIL import Image as PILImage
|
| 17 |
+
|
| 18 |
+
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
|
| 19 |
+
|
| 20 |
+
from src.finger_segmentation import segment_hand # noqa: E402
|
| 21 |
+
|
| 22 |
+
IMG_PATH = Path("input/sample-04-12/card_2.jpg")
|
| 23 |
+
OUT_DIR = Path("output/hand_sam_compare")
|
| 24 |
+
|
| 25 |
+
SAM_MODELS = [
|
| 26 |
+
("sam2.1-tiny", "facebook/sam2.1-hiera-tiny"),
|
| 27 |
+
("sam2.1-small", "facebook/sam2.1-hiera-small"),
|
| 28 |
+
]
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def palm_and_card_points(image_bgr: np.ndarray, hand_data: dict) -> Tuple[Tuple[int, int], Tuple[int, int]]:
    """Return (palm_center, card_center) pixel coords in the canonical image space.

    Palm center = mean of wrist + MCPs (landmarks 0, 5, 9, 13, 17).
    Card center = a rough point to the left of the hand (negative prompt hint),
    clamped so it always lies inside the image.

    Args:
        image_bgr: Image whose height/width bound the returned card point.
        hand_data: Hand segmentation result; must carry a "landmarks" entry.

    Returns:
        ``((palm_x, palm_y), (card_x, card_y))`` as plain Python ints.

    Raises:
        RuntimeError: If ``hand_data`` has no landmarks.
    """
    landmarks = hand_data.get("landmarks")
    if landmarks is None:
        raise RuntimeError("MediaPipe returned no landmarks")

    # landmarks is (21, 2 or 3) in pixel coords
    lm = np.asarray(landmarks)[:, :2]
    palm_ids = [0, 5, 9, 13, 17]
    palm_x, palm_y = np.round(lm[palm_ids].mean(axis=0)).astype(int).tolist()

    # Card hint: far from hand, toward image left
    h, w = image_bgr.shape[:2]
    hand_x_min = int(lm[:, 0].min())
    # Keep at least 50 px from the left border, but never step outside a
    # narrow image (the unclamped value could exceed the last column).
    card_x = min(max(50, hand_x_min - 150), max(0, w - 1))
    card_y = h // 2
    return (palm_x, palm_y), (card_x, card_y)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def run_sam(
    model_id: str,
    image_rgb: np.ndarray,
    palm_xy: Tuple[int, int],
    negative_xy: Tuple[int, int],
) -> Tuple[np.ndarray, float, float]:
    """Segment the hand with SAM 2.1 from one positive and one negative point.

    The palm point is labelled 1 (foreground) and the card hint 0
    (background). Of the candidate masks SAM proposes, the one with the
    highest predicted IoU score is kept.

    Args:
        model_id: Hugging Face model identifier to load.
        image_rgb: RGB image array handed to PIL.
        palm_xy: Positive prompt point.
        negative_xy: Negative prompt point.

    Returns:
        ``(mask, score, seconds)``: the boolean mask, its IoU score, and the
        wall time of preprocessing + inference + post-processing (model
        loading happens before the timer starts).
    """
    import torch
    from transformers import Sam2Model, Sam2Processor

    sam_processor = Sam2Processor.from_pretrained(model_id)
    sam_model = Sam2Model.from_pretrained(model_id)
    sam_model = sam_model.to("cpu").eval()

    pil_image = PILImage.fromarray(image_rgb)
    prompt_points = [[[list(palm_xy), list(negative_xy)]]]
    prompt_labels = [[[1, 0]]]

    started = time.time()
    model_inputs = sam_processor(
        images=pil_image,
        input_points=prompt_points,
        input_labels=prompt_labels,
        return_tensors="pt",
    )
    with torch.inference_mode():
        model_outputs = sam_model(**model_inputs, multimask_output=True)

    per_image_masks = sam_processor.post_process_masks(
        model_outputs.pred_masks.cpu(),
        model_inputs["original_sizes"],
        mask_threshold=0.0,
    )
    # (num_candidates, H, W) for first image, first prompt set
    candidates = per_image_masks[0][0]
    iou = model_outputs.iou_scores.cpu().numpy()[0, 0]

    winner = int(np.argmax(iou))
    best_mask = candidates[winner].numpy().astype(bool)
    best_score = float(iou[winner])
    return best_mask, best_score, time.time() - started
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def mask_to_overlay(image_bgr: np.ndarray, mask: np.ndarray, color: Tuple[int, int, int]) -> np.ndarray:
    """Return a BGR image with the mask tinted + contour drawn.

    Args:
        image_bgr: Source image; it is copied, never modified.
        mask: Per-pixel mask. Any non-zero / True entry counts as inside.
        color: BGR colour used for both the tint and the outline.

    Returns:
        A new image with the masked area tinted at 35% weight and the outer
        contours of the mask drawn 2 px thick.
    """
    out = image_bgr.copy()

    # Coerce to bool first: indexing with a uint8 0/255 mask would be read as
    # integer (fancy) indices rather than a per-pixel selection.
    region = np.asarray(mask).astype(bool)

    tint = np.zeros_like(out)
    tint[region] = color
    out = cv2.addWeighted(out, 1.0, tint, 0.35, 0)

    contours, _ = cv2.findContours(
        region.astype(np.uint8) * 255, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE
    )
    cv2.drawContours(out, contours, -1, color, 2, cv2.LINE_AA)
    return out
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def label_panel(img: np.ndarray, text: str) -> np.ndarray:
    """Draw a black caption banner with white *text* across the top of *img*.

    The image is modified in place; the same array is returned for chaining.
    """
    _, width = img.shape[:2]
    banner_corner = (width, 60)
    cv2.rectangle(img, (0, 0), banner_corner, (0, 0, 0), -1)
    cv2.putText(
        img,
        text,
        (20, 42),
        cv2.FONT_HERSHEY_SIMPLEX,
        1.3,
        (255, 255, 255),
        3,
        cv2.LINE_AA,
    )
    return img
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def main() -> int:
    """Compare the baseline hand mask with SAM 2.1 masks on ``IMG_PATH``.

    Runs ``segment_hand`` for the baseline mask, then every model in
    ``SAM_MODELS`` with a palm-positive / card-negative point prompt, and
    writes per-backend panels plus full and zoomed side-by-side images to
    ``OUT_DIR``. A SAM model that raises is reported and left out; the
    remaining panels are still written under their own backend names.

    Returns:
        0 on success; 1 when the image cannot be loaded, no hand is detected,
        or no hand mask is returned.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    image_bgr = cv2.imread(str(IMG_PATH))
    if image_bgr is None:
        print(f"Failed to load {IMG_PATH}")
        return 1

    print(f"Image: {IMG_PATH} {image_bgr.shape}")

    # --- MediaPipe baseline ---
    t0 = time.time()
    hand_data = segment_hand(image_bgr, finger="index")
    mp_time = time.time() - t0
    if hand_data is None:
        print("MediaPipe detected no hand — aborting")
        return 1

    canonical_image = hand_data.get("canonical_image", image_bgr)
    mp_mask = hand_data.get("mask")
    if mp_mask is None:
        print("MediaPipe did not return a hand mask")
        return 1
    mp_mask = mp_mask.astype(bool)
    print(f"MediaPipe: {mp_time:.1f}s mask_area={mp_mask.sum()}")

    # Work in the canonical image so the comparison is apples-to-apples
    image_for_sam = canonical_image.copy()
    palm_xy, card_xy = palm_and_card_points(image_for_sam, hand_data)
    print(f"Palm prompt: {palm_xy} Negative hint: {card_xy}")

    image_rgb = cv2.cvtColor(image_for_sam, cv2.COLOR_BGR2RGB)

    # --- SAM models ---
    results = {"mediapipe": (mp_mask, None, mp_time)}
    for name, model_id in SAM_MODELS:
        print(f"\n=== {name} ({model_id}) ===")
        try:
            mask, score, seconds = run_sam(model_id, image_rgb, palm_xy, card_xy)
            # Align shape (should already be canonical)
            if mask.shape != mp_mask.shape:
                mask = cv2.resize(
                    mask.astype(np.uint8),
                    (mp_mask.shape[1], mp_mask.shape[0]),
                    interpolation=cv2.INTER_NEAREST,
                ).astype(bool)
            print(f" score={score:.3f} time={seconds:.1f}s area={mask.sum()}")
            results[name] = (mask, score, seconds)
        except Exception as e:
            # Best effort: one failing backend must not abort the comparison.
            print(f" FAILED: {e!r}")
            import traceback
            traceback.print_exc()

    # --- Render panels ---
    colors = {
        "mediapipe": (0, 165, 255),  # orange
        "sam2.1-tiny": (0, 255, 255),  # yellow
        "sam2.1-small": (0, 255, 0),  # green
    }
    # (backend name, short file tag) in display order. The slot number of a
    # backend is fixed, so a failed model cannot shift another backend's
    # panel onto the wrong file name.
    backend_order = [
        ("mediapipe", "mediapipe"),
        ("sam2.1-tiny", "tiny"),
        ("sam2.1-small", "small"),
    ]

    # Panel 0: original with prompt points
    orig = image_for_sam.copy()
    cv2.circle(orig, palm_xy, 18, (0, 255, 0), -1)
    cv2.circle(orig, palm_xy, 18, (0, 0, 0), 3)
    cv2.circle(orig, card_xy, 18, (0, 0, 255), -1)
    cv2.circle(orig, card_xy, 18, (0, 0, 0), 3)
    named_panels = [("panel_0_orig.png", label_panel(orig, "original + prompts"))]

    for slot, (name, tag) in enumerate(backend_order, start=1):
        if name not in results:
            continue
        mask, score, seconds = results[name]
        panel = mask_to_overlay(image_for_sam, mask, colors[name])
        label = f"{name} {seconds:.1f}s"
        if score is not None:
            label += f" score={score:.2f}"
        named_panels.append((f"panel_{slot}_{tag}.png", label_panel(panel, label)))

    # Save individual panels full-res
    for filename, p in named_panels:
        cv2.imwrite(str(OUT_DIR / filename), p)
    panels = [p for _, p in named_panels]

    # Build a single side-by-side at a readable size
    def resize_to_height(img: np.ndarray, H: int) -> np.ndarray:
        h, w = img.shape[:2]
        scale = H / h
        return cv2.resize(img, (int(round(w * scale)), H), interpolation=cv2.INTER_AREA)

    target_h = 900
    resized = [resize_to_height(p, target_h) for p in panels]
    combined = np.hstack(resized)
    cv2.imwrite(str(OUT_DIR / "comparison_full.png"), combined)

    # Also zoom-crop around the hand for fine-detail inspection
    ys, xs = np.where(mp_mask)
    if len(xs) > 0:
        pad = 80
        x0, x1 = max(0, xs.min() - pad), min(image_for_sam.shape[1], xs.max() + pad)
        y0, y1 = max(0, ys.min() - pad), min(image_for_sam.shape[0], ys.max() + pad)
        crops = [resize_to_height(p[y0:y1, x0:x1], target_h) for p in panels]
        combined_zoom = np.hstack(crops)
        cv2.imwrite(str(OUT_DIR / "comparison_zoom.png"), combined_zoom)

    print(f"\nSaved panels to {OUT_DIR}/")
    return 0
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
if __name__ == "__main__":
|
| 227 |
+
raise SystemExit(main())
|
script/validate_sam_card.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Validate SAM card detection (classic vs AMG vs prompt) on sample-04-12.
|
| 2 |
+
|
| 3 |
+
Prompt-based SAM depends on MediaPipe running first to provide a hand mask
|
| 4 |
+
for seed derivation, so we run `segment_hand()` on each image before timing
|
| 5 |
+
the three detectors.
|
| 6 |
+
|
| 7 |
+
Outputs per-image rows and a summary with success counts + mean wall time.
|
| 8 |
+
Debug overlays saved under `output/sam_val/<stem>/`.
|
| 9 |
+
"""
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
import sys
|
| 14 |
+
import time
|
| 15 |
+
import traceback
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
|
| 18 |
+
SKIP_AMG = bool(os.environ.get("SKIP_AMG"))
|
| 19 |
+
|
| 20 |
+
import cv2
|
| 21 |
+
import numpy as np
|
| 22 |
+
|
| 23 |
+
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
|
| 24 |
+
|
| 25 |
+
from src.card_detection import compute_scale_factor, detect_credit_card # noqa: E402
|
| 26 |
+
from src.finger_segmentation import segment_hand # noqa: E402
|
| 27 |
+
from src.sam_card_detection import ( # noqa: E402
|
| 28 |
+
detect_credit_card_sam,
|
| 29 |
+
detect_credit_card_sam_prompt,
|
| 30 |
+
suggest_card_seeds,
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
SAMPLE_DIR = Path("input/sample-04-12")
|
| 34 |
+
OUT_DIR = Path("output/sam_val")
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def _negatives_from_landmarks(landmarks: np.ndarray) -> list[tuple[int, int]]:
    """Return the palm centre as a single negative prompt point.

    The centre is the mean of the first two coordinates of landmark rows
    0, 5, 9, 13 and 17 (presumably the MediaPipe wrist and the four finger
    MCP joints -- confirm against the landmark producer).

    Args:
        landmarks: Array-like with at least 18 rows and at least two columns
            (x, y, ...). Plain nested lists are accepted as well as arrays.

    Returns:
        A one-element list ``[(x, y)]`` with the centre rounded to integers.

    Raises:
        IndexError: If ``landmarks`` has fewer than 18 rows.
    """
    palm_idx = [0, 5, 9, 13, 17]
    # np.asarray is a no-op for arrays and lets list input use fancy indexing.
    points = np.asarray(landmarks)
    cx, cy = points[palm_idx, :2].mean(axis=0)
    return [(int(round(cx)), int(round(cy)))]
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _timed_detection(rec: dict, prefix: str, detect, *, log_traceback: bool = True) -> None:
    """Run one card detector and record scale, error and wall time in *rec*.

    Writes ``<prefix>_px_per_cm`` (``None`` unless the detector returned a
    result), ``<prefix>_time_s`` and, on failure, ``<prefix>_error``.

    Args:
        rec: Per-image result record, mutated in place.
        prefix: Key prefix for this detector ("classic", "amg", "prompt").
        detect: Zero-argument callable returning a detection dict with a
            "corners" entry, or ``None`` when no card was found.
        log_traceback: Whether to print the traceback when *detect* raises.
    """
    rec.setdefault(f"{prefix}_px_per_cm", None)
    # perf_counter is monotonic, so the interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    started = time.perf_counter()
    try:
        detection = detect()
        if detection is not None:
            px_per_cm, _ = compute_scale_factor(detection["corners"])
            rec[f"{prefix}_px_per_cm"] = px_per_cm
    except Exception as exc:
        # Deliberate best-effort: a failing detector is recorded, and the
        # remaining detectors/images still run.
        rec[f"{prefix}_error"] = repr(exc)[:120]
        if log_traceback:
            traceback.print_exc()
    rec[f"{prefix}_time_s"] = round(time.perf_counter() - started, 2)


def run_one(img_path: Path) -> dict:
    """Run hand segmentation and the three card detectors on one image.

    Args:
        img_path: Path of the image to load with ``cv2.imread``.

    Returns:
        A record dict. If the image cannot be loaded it contains only
        ``file`` and ``error``. Otherwise it contains ``file``, ``shape``,
        hand-segmentation status/timing, and for each of ``classic``, ``amg``
        and ``prompt`` a ``*_px_per_cm`` value (``None`` on failure or skip),
        a ``*_time_s`` value (``None`` when the detector was not run) and an
        optional ``*_error`` string truncated to 120 characters.
    """
    bgr = cv2.imread(str(img_path))
    if bgr is None:
        return {"file": img_path.name, "error": "load_failed"}

    rec = {"file": img_path.name, "shape": bgr.shape[:2]}

    # --- MediaPipe + SAM hand (needed for prompt-SAM seeds) ---
    started = time.perf_counter()
    try:
        hand_data = segment_hand(bgr, finger="index", use_sam_mask=True)
    except Exception as exc:
        hand_data = None
        rec["hand_error"] = repr(exc)[:120]
    rec["hand_time_s"] = round(time.perf_counter() - started, 2)

    rec["hand_detected"] = hand_data is not None
    # Detectors run on the canonical image when segmentation provides one,
    # otherwise on the raw image.
    canonical = bgr if hand_data is None else hand_data.get("canonical_image", bgr)

    # --- Classic ---
    _timed_detection(
        rec,
        "classic",
        lambda: detect_credit_card(canonical),
        log_traceback=False,
    )

    # --- SAM AMG ---
    rec["amg_px_per_cm"] = None
    rec["amg_time_s"] = None
    if not SKIP_AMG:
        amg_debug = OUT_DIR / img_path.stem / "sam_card_amg"
        _timed_detection(
            rec,
            "amg",
            lambda: detect_credit_card_sam(canonical, debug_dir=str(amg_debug)),
        )

    # --- SAM prompt ---
    rec["prompt_px_per_cm"] = None
    rec["prompt_time_s"] = None
    if hand_data is not None:
        prompt_debug = OUT_DIR / img_path.stem / "sam_card_prompt"
        try:
            # Prompt derivation is kept out of the timed section, but a
            # failure here is recorded like any detector failure instead of
            # aborting the whole validation run.
            seeds = suggest_card_seeds(hand_data["mask"], canonical.shape[:2])
            negs = _negatives_from_landmarks(hand_data["landmarks"])
        except Exception as exc:
            rec["prompt_error"] = repr(exc)[:120]
            traceback.print_exc()
        else:
            rec["prompt_n_seeds"] = len(seeds)
            _timed_detection(
                rec,
                "prompt",
                lambda: detect_credit_card_sam_prompt(
                    canonical,
                    seed_points=seeds,
                    negative_points=negs,
                    debug_dir=str(prompt_debug),
                ),
            )

    return rec
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def main() -> int:
    """Validate every ``*.jpg`` in ``SAMPLE_DIR`` and print a comparison table.

    Prints each per-image record, then a summary table with one scale/time
    column pair per detector, per-detector success counts with the mean time
    over successful runs, and the prompt-vs-AMG scale agreement when both
    succeeded on at least one image.

    Returns:
        Process exit code: 1 when no images were found, otherwise 0.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    images = sorted(SAMPLE_DIR.glob("*.jpg"))
    if not images:
        print(f"No images found in {SAMPLE_DIR}")
        return 1

    print(f"Validating {len(images)} images from {SAMPLE_DIR}\n")
    results = []
    for img in images:
        print(f"=== {img.name} ===")
        rec = run_one(img)
        results.append(rec)
        print(rec)
        print()

    # --- Summary table ---
    # (detector name, width of the scale column, width of the time column)
    columns = (("classic", 10, 10), ("amg", 10, 8), ("prompt", 10, 10))

    def _cell(value, width):
        # Missing values (failure, skip, or load error) render as FAIL.
        text = "FAIL" if value is None else "{:.2f}".format(value)
        return f"{text:>{width}}"

    print("\n===== SUMMARY =====")
    header = f"{'file':<18}" + "".join(
        f"{name:>{scale_w}}{name + 'T':>{time_w}}"
        for name, scale_w, time_w in columns
    )
    rule = "-" * len(header)
    print(header)
    print(rule)

    successes = {name: 0 for name, _, _ in columns}
    timings = {name: [] for name, _, _ in columns}

    for rec in results:
        cells = [f"{rec['file']:<18}"]
        for name, scale_w, time_w in columns:
            scale = rec.get(f"{name}_px_per_cm")
            elapsed = rec.get(f"{name}_time_s")
            cells.append(_cell(scale, scale_w) + _cell(elapsed, time_w))
            # Only successful detections contribute to the mean time.
            if scale is not None:
                successes[name] += 1
                timings[name].append(elapsed)
        print("".join(cells))

    total = len(results)
    print(rule)
    for name, _, _ in columns:
        samples = timings[name]
        mean_t = sum(samples) / len(samples) if samples else float("nan")
        print(f"{name:<8} success: {successes[name]}/{total} mean_time_s: {mean_t:.2f}")

    # Agreement check: when both prompt and amg succeeded, how close are scales?
    scale_pairs = (
        (rec.get("amg_px_per_cm"), rec.get("prompt_px_per_cm")) for rec in results
    )
    diffs = [
        100 * abs(amg - prompt) / max(amg, prompt)
        for amg, prompt in scale_pairs
        if amg is not None and prompt is not None
    ]
    if diffs:
        print(f"\nprompt vs amg scale agreement: mean diff {np.mean(diffs):.2f}%, "
              f"max {max(diffs):.2f}% (n={len(diffs)})")

    return 0
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
if __name__ == "__main__":
|
| 198 |
+
raise SystemExit(main())
|
src/edge_refinement.py
CHANGED
|
@@ -93,6 +93,7 @@ def _find_edges_from_axis(
|
|
| 93 |
row_mask: Optional[np.ndarray] = None,
|
| 94 |
row_gradient_left_to_right: Optional[np.ndarray] = None,
|
| 95 |
row_gradient_right_to_left: Optional[np.ndarray] = None,
|
|
|
|
| 96 |
) -> Optional[Tuple[float, float, float, float]]:
|
| 97 |
"""
|
| 98 |
Find left and right edges by expanding from axis position.
|
|
@@ -131,72 +132,90 @@ def _find_edges_from_axis(
|
|
| 131 |
left_search_gradient = row_gradient_right_to_left if row_gradient_right_to_left is not None else row_gradient
|
| 132 |
right_search_gradient = row_gradient_left_to_right if row_gradient_left_to_right is not None else row_gradient
|
| 133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
# MASK-CONSTRAINED MODE (preferred when available)
|
| 135 |
if row_mask is not None and np.any(row_mask):
|
| 136 |
-
# Strategy:
|
| 137 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
|
| 139 |
mask_indices = np.where(row_mask)[0]
|
| 140 |
if len(mask_indices) < 2:
|
| 141 |
return None # Mask too small
|
| 142 |
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
#
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
# Found a strong edge - update if stronger than previous
|
| 157 |
-
if left_search_gradient[x] > left_strength:
|
| 158 |
-
left_edge_x = x
|
| 159 |
-
left_strength = left_search_gradient[x]
|
| 160 |
-
|
| 161 |
-
# If no edge found with full threshold, try with relaxed threshold
|
| 162 |
-
if left_edge_x is None:
|
| 163 |
-
relaxed_threshold = threshold * 0.5
|
| 164 |
-
for x in range(search_start, left_mask_boundary - 1, -1):
|
| 165 |
-
if x < 0 or x >= len(row_gradient):
|
| 166 |
-
continue
|
| 167 |
-
if left_search_gradient[x] > relaxed_threshold:
|
| 168 |
-
if left_search_gradient[x] > left_strength:
|
| 169 |
-
left_edge_x = x
|
| 170 |
-
left_strength = left_search_gradient[x]
|
| 171 |
-
|
| 172 |
-
# Search RIGHT from axis, stopping at mask boundary
|
| 173 |
-
right_edge_x = None
|
| 174 |
-
right_strength = 0
|
| 175 |
|
| 176 |
-
#
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
#
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
if x < 0 or x >= len(row_gradient):
|
| 192 |
-
continue
|
| 193 |
-
if right_search_gradient[x] > relaxed_threshold:
|
| 194 |
-
if right_search_gradient[x] > right_strength:
|
| 195 |
-
right_edge_x = x
|
| 196 |
-
right_strength = right_search_gradient[x]
|
| 197 |
|
| 198 |
-
|
| 199 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
|
| 201 |
else:
|
| 202 |
# AXIS-EXPANSION MODE (fallback when no mask)
|
|
@@ -240,7 +259,9 @@ def extract_ring_zone_roi(
|
|
| 240 |
image: np.ndarray,
|
| 241 |
axis_data: Dict[str, Any],
|
| 242 |
zone_data: Dict[str, Any],
|
| 243 |
-
rotate_align: bool = False
|
|
|
|
|
|
|
| 244 |
) -> Dict[str, Any]:
|
| 245 |
"""
|
| 246 |
Extract ROI around ring zone.
|
|
@@ -274,7 +295,14 @@ def extract_ring_zone_roi(
|
|
| 274 |
zone_length = zone_data["length"]
|
| 275 |
center = zone_data["center_point"]
|
| 276 |
direction = axis_data["direction"]
|
| 277 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
half_width = zone_length * 0.6 # 1.5x / 2
|
| 279 |
|
| 280 |
x_min = int(np.clip(center[0] - half_width, 0, w - 1))
|
|
@@ -294,8 +322,22 @@ def extract_ring_zone_roi(
|
|
| 294 |
# Convert to grayscale for edge detection
|
| 295 |
roi_gray = cv2.cvtColor(roi_bgr, cv2.COLOR_BGR2GRAY)
|
| 296 |
|
| 297 |
-
#
|
| 298 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
|
| 300 |
# Create transform matrix (ROI coords -> original coords)
|
| 301 |
# Simple translation for non-rotated case
|
|
@@ -473,7 +515,8 @@ def detect_edges_per_row(
|
|
| 473 |
roi_data: Dict[str, Any],
|
| 474 |
threshold: float = DEFAULT_GRADIENT_THRESHOLD,
|
| 475 |
expected_width_px: Optional[float] = None,
|
| 476 |
-
scale_px_per_cm: Optional[float] = None
|
|
|
|
| 477 |
) -> Dict[str, Any]:
|
| 478 |
"""
|
| 479 |
Detect left and right finger edges for each row (cross-section).
|
|
@@ -535,7 +578,12 @@ def detect_edges_per_row(
|
|
| 535 |
|
| 536 |
# Get finger mask for constrained edge detection (if available)
|
| 537 |
roi_mask = roi_data.get("roi_mask")
|
| 538 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 539 |
|
| 540 |
if roi_mask is not None:
|
| 541 |
logger.debug(f"Using MASK-CONSTRAINED edge detection (mask shape: {roi_mask.shape})")
|
|
@@ -568,7 +616,8 @@ def detect_edges_per_row(
|
|
| 568 |
result = _find_edges_from_axis(row_gradient, row, axis_x, threshold,
|
| 569 |
min_width_px, max_width_px, row_mask,
|
| 570 |
row_gradient_left_to_right=row_gradient_l2r,
|
| 571 |
-
row_gradient_right_to_left=row_gradient_r2l
|
|
|
|
| 572 |
|
| 573 |
if result is None:
|
| 574 |
continue # No valid edges found
|
|
@@ -958,56 +1007,73 @@ def should_use_sobel_measurement(
|
|
| 958 |
"""
|
| 959 |
Decide whether to use Sobel measurement or fall back to contour.
|
| 960 |
|
| 961 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 962 |
1. Edge quality score > min_quality_score (default 0.7)
|
| 963 |
2. Edge consistency > min_consistency (default 0.5 = 50%)
|
| 964 |
-
3.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 965 |
|
| 966 |
Args:
|
| 967 |
sobel_result: Output from refine_edges_sobel()
|
| 968 |
contour_result: Optional output from compute_cross_section_width()
|
| 969 |
-
min_quality_score: Minimum acceptable quality score
|
| 970 |
-
min_consistency: Minimum edge detection success rate
|
| 971 |
max_difference_pct: Maximum allowed difference from contour (%)
|
| 972 |
|
| 973 |
Returns:
|
| 974 |
Tuple of (should_use_sobel, reason)
|
| 975 |
"""
|
| 976 |
-
# Check if edge quality data available
|
| 977 |
if "edge_quality" not in sobel_result:
|
| 978 |
return False, "edge_quality_data_missing"
|
| 979 |
|
| 980 |
edge_quality = sobel_result["edge_quality"]
|
|
|
|
|
|
|
| 981 |
|
| 982 |
-
# Check 1: Overall quality score
|
| 983 |
-
if edge_quality["overall_score"] < min_quality_score:
|
| 984 |
-
return False, f"quality_score_low_{edge_quality['overall_score']:.2f}"
|
| 985 |
-
|
| 986 |
-
# Check 2: Consistency (success rate)
|
| 987 |
-
if edge_quality["consistency_score"] < min_consistency:
|
| 988 |
-
return False, f"consistency_low_{edge_quality['consistency_score']:.2f}"
|
| 989 |
-
|
| 990 |
-
# Check 3: Measurement reasonableness
|
| 991 |
sobel_width = sobel_result.get("median_width_cm")
|
| 992 |
if sobel_width is None or sobel_width <= 0:
|
| 993 |
return False, "invalid_measurement"
|
| 994 |
-
|
| 995 |
-
# Typical finger width range
|
| 996 |
if sobel_width < MIN_REALISTIC_WIDTH_CM or sobel_width > MAX_REALISTIC_WIDTH_CM:
|
| 997 |
return False, f"unrealistic_width_{sobel_width:.2f}cm"
|
| 998 |
|
| 999 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1000 |
if contour_result is not None:
|
| 1001 |
contour_width = contour_result.get("median_width_px")
|
| 1002 |
sobel_width_px = sobel_result.get("median_width_px")
|
| 1003 |
-
|
| 1004 |
if contour_width and sobel_width_px:
|
| 1005 |
diff_pct = abs(sobel_width_px - contour_width) / contour_width * 100
|
| 1006 |
-
|
| 1007 |
if diff_pct > max_difference_pct:
|
| 1008 |
return False, f"disagrees_with_contour_{diff_pct:.1f}pct"
|
| 1009 |
|
| 1010 |
-
# All checks passed
|
| 1011 |
return True, "quality_acceptable"
|
| 1012 |
|
| 1013 |
|
|
@@ -1023,6 +1089,9 @@ def refine_edges_sobel(
|
|
| 1023 |
use_subpixel: bool = True,
|
| 1024 |
expected_width_px: Optional[float] = None,
|
| 1025 |
debug_dir: Optional[str] = None,
|
|
|
|
|
|
|
|
|
|
| 1026 |
) -> Dict[str, Any]:
|
| 1027 |
"""
|
| 1028 |
Main entry point for Sobel-based edge refinement.
|
|
@@ -1079,10 +1148,15 @@ def refine_edges_sobel(
|
|
| 1079 |
# A.2: Ring zone + ROI bounds (need to extract bounds first)
|
| 1080 |
# We'll save this after ROI extraction
|
| 1081 |
|
| 1082 |
-
# Step 1: Extract ROI
|
|
|
|
|
|
|
|
|
|
| 1083 |
roi_data = extract_ring_zone_roi(
|
| 1084 |
image, axis_data, zone_data,
|
| 1085 |
-
rotate_align=rotate_align
|
|
|
|
|
|
|
| 1086 |
)
|
| 1087 |
|
| 1088 |
logger.debug(f"ROI size: {roi_data['roi_width']}x{roi_data['roi_height']}px")
|
|
@@ -1117,12 +1191,14 @@ def refine_edges_sobel(
|
|
| 1117 |
grad_mag = draw_gradient_visualization(gradient_data["gradient_magnitude"], cv2.COLORMAP_HOT)
|
| 1118 |
observer.save_stage("06_gradient_magnitude", grad_mag)
|
| 1119 |
|
| 1120 |
-
# Step 3: Detect edges per row
|
|
|
|
| 1121 |
edge_data = detect_edges_per_row(
|
| 1122 |
gradient_data, roi_data,
|
| 1123 |
threshold=sobel_threshold,
|
| 1124 |
expected_width_px=expected_width_px,
|
| 1125 |
-
scale_px_per_cm=scale_px_per_cm
|
|
|
|
| 1126 |
)
|
| 1127 |
|
| 1128 |
logger.debug(f"Valid rows: {edge_data['num_valid_rows']}/{len(edge_data['valid_rows'])} ({edge_data['num_valid_rows']/len(edge_data['valid_rows'])*100:.1f}%)")
|
|
@@ -1151,11 +1227,13 @@ def refine_edges_sobel(
|
|
| 1151 |
# B.5: Selected edges (final detected edges)
|
| 1152 |
observer.draw_and_save("09_selected_edges", roi_data["roi_image"], draw_selected_edges, edge_data)
|
| 1153 |
|
| 1154 |
-
# Step 4: Measure width from edges (with sub-pixel refinement)
|
|
|
|
|
|
|
| 1155 |
width_data = measure_width_from_edges(
|
| 1156 |
edge_data, roi_data, scale_px_per_cm,
|
| 1157 |
gradient_data=gradient_data,
|
| 1158 |
-
use_subpixel=
|
| 1159 |
)
|
| 1160 |
|
| 1161 |
if debug_dir:
|
|
|
|
| 93 |
row_mask: Optional[np.ndarray] = None,
|
| 94 |
row_gradient_left_to_right: Optional[np.ndarray] = None,
|
| 95 |
row_gradient_right_to_left: Optional[np.ndarray] = None,
|
| 96 |
+
mask_only: bool = False,
|
| 97 |
) -> Optional[Tuple[float, float, float, float]]:
|
| 98 |
"""
|
| 99 |
Find left and right edges by expanding from axis position.
|
|
|
|
| 132 |
left_search_gradient = row_gradient_right_to_left if row_gradient_right_to_left is not None else row_gradient
|
| 133 |
right_search_gradient = row_gradient_left_to_right if row_gradient_left_to_right is not None else row_gradient
|
| 134 |
|
| 135 |
+
# In mask_only mode a row with no mask pixels (or a mask that doesn't
|
| 136 |
+
# contain the finger axis) must be dropped. Otherwise we would fall
|
| 137 |
+
# through to the gradient axis-expansion path below, which routinely
|
| 138 |
+
# returns ROI-edge coordinates as "edges" on empty rows and poisons the
|
| 139 |
+
# width median.
|
| 140 |
+
if mask_only and (row_mask is None or not np.any(row_mask)):
|
| 141 |
+
return None
|
| 142 |
+
|
| 143 |
# MASK-CONSTRAINED MODE (preferred when available)
|
| 144 |
if row_mask is not None and np.any(row_mask):
|
| 145 |
+
# Strategy: the SAM mask already knows where the finger boundary is
|
| 146 |
+
# to pixel accuracy. We anchor to the mask boundary by default, and
|
| 147 |
+
# only snap to a nearby gradient peak when one exceeds the threshold
|
| 148 |
+
# (for sub-pixel refinement). When contrast is weak (pale finger on
|
| 149 |
+
# light background) the gradient search yields nothing, so trusting
|
| 150 |
+
# the mask directly is what prevents "no valid widths" failures.
|
| 151 |
|
| 152 |
mask_indices = np.where(row_mask)[0]
|
| 153 |
if len(mask_indices) < 2:
|
| 154 |
return None # Mask too small
|
| 155 |
|
| 156 |
+
# Pick the contiguous run of mask pixels that contains the finger
|
| 157 |
+
# axis. This matters when the raw SAM hand mask is passed in (mask_only
|
| 158 |
+
# path): at the ring-zone rows the mask may include adjacent fingers,
|
| 159 |
+
# and np.where(...)[0][0]/[-1] would then span across fingers.
|
| 160 |
+
axis_col = int(round(axis_x))
|
| 161 |
+
axis_col = max(0, min(len(row_mask) - 1, axis_col))
|
| 162 |
+
if not row_mask[axis_col]:
|
| 163 |
+
# Axis is off the mask on this row — the ROI is clipping into
|
| 164 |
+
# background (e.g. ring/pinky ROI reaching the MCP webbing).
|
| 165 |
+
# Treat the row as invalid rather than snapping to whatever mask
|
| 166 |
+
# run happens to be nearest; otherwise the wrong run can pull
|
| 167 |
+
# the median width up.
|
| 168 |
+
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
+
# Expand left/right from the axis until we leave the mask run.
|
| 171 |
+
left_mask_boundary = axis_col
|
| 172 |
+
while left_mask_boundary > 0 and row_mask[left_mask_boundary - 1]:
|
| 173 |
+
left_mask_boundary -= 1
|
| 174 |
+
right_mask_boundary = axis_col
|
| 175 |
+
max_col = len(row_mask) - 1
|
| 176 |
+
while right_mask_boundary < max_col and row_mask[right_mask_boundary + 1]:
|
| 177 |
+
right_mask_boundary += 1
|
| 178 |
+
|
| 179 |
+
# If the contiguous run reaches either ROI edge the mask has bled
|
| 180 |
+
# out of the ROI laterally — almost always via the webbing into the
|
| 181 |
+
# adjacent finger. The column we stopped at is the ROI edge, not
|
| 182 |
+
# the true finger boundary, so the row is unreliable.
|
| 183 |
+
if mask_only and (left_mask_boundary == 0 or right_mask_boundary == max_col):
|
| 184 |
+
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
|
| 186 |
+
SEARCH_RANGE = 10 # px on either side of mask boundary to refine
|
| 187 |
+
|
| 188 |
+
def _snap_to_peak(
|
| 189 |
+
boundary: int,
|
| 190 |
+
gradient: np.ndarray,
|
| 191 |
+
) -> Tuple[float, float]:
|
| 192 |
+
"""Return (edge_x, strength). Snap to local gradient peak if
|
| 193 |
+
it exceeds threshold, otherwise fall back to boundary itself."""
|
| 194 |
+
lo = max(0, boundary - SEARCH_RANGE)
|
| 195 |
+
hi = min(len(gradient) - 1, boundary + SEARCH_RANGE)
|
| 196 |
+
if hi < lo:
|
| 197 |
+
return float(boundary), 0.0
|
| 198 |
+
window = gradient[lo:hi + 1]
|
| 199 |
+
best_rel = int(np.argmax(window))
|
| 200 |
+
best_val = float(window[best_rel])
|
| 201 |
+
if best_val > threshold:
|
| 202 |
+
return float(lo + best_rel), best_val
|
| 203 |
+
# Weak gradient: trust the SAM mask boundary directly.
|
| 204 |
+
return float(boundary), float(gradient[boundary])
|
| 205 |
+
|
| 206 |
+
if mask_only:
|
| 207 |
+
# Trust the SAM mask boundary exactly; no gradient snapping.
|
| 208 |
+
left_edge_x = float(left_mask_boundary)
|
| 209 |
+
right_edge_x = float(right_mask_boundary)
|
| 210 |
+
left_strength = float(left_search_gradient[left_mask_boundary])
|
| 211 |
+
right_strength = float(right_search_gradient[right_mask_boundary])
|
| 212 |
+
else:
|
| 213 |
+
left_edge_x, left_strength = _snap_to_peak(
|
| 214 |
+
left_mask_boundary, left_search_gradient
|
| 215 |
+
)
|
| 216 |
+
right_edge_x, right_strength = _snap_to_peak(
|
| 217 |
+
right_mask_boundary, right_search_gradient
|
| 218 |
+
)
|
| 219 |
|
| 220 |
else:
|
| 221 |
# AXIS-EXPANSION MODE (fallback when no mask)
|
|
|
|
| 259 |
image: np.ndarray,
|
| 260 |
axis_data: Dict[str, Any],
|
| 261 |
zone_data: Dict[str, Any],
|
| 262 |
+
rotate_align: bool = False,
|
| 263 |
+
finger_mask: Optional[np.ndarray] = None,
|
| 264 |
+
finger_name: Optional[str] = None,
|
| 265 |
) -> Dict[str, Any]:
|
| 266 |
"""
|
| 267 |
Extract ROI around ring zone.
|
|
|
|
| 295 |
zone_length = zone_data["length"]
|
| 296 |
center = zone_data["center_point"]
|
| 297 |
direction = axis_data["direction"]
|
| 298 |
+
# Ring and pinky have their proximal phalanx set lower on the palm than
|
| 299 |
+
# index/middle, so an ROI sized for the latter reaches down into the
|
| 300 |
+
# MCP webbing — those rows bleed into the adjacent finger's mask. Use
|
| 301 |
+
# half the vertical span (0.25x zone length) for those fingers.
|
| 302 |
+
if finger_name in ("ring", "pinky"):
|
| 303 |
+
half_height = zone_length * 0.25 # 0.25x / 2
|
| 304 |
+
else:
|
| 305 |
+
half_height = zone_length * 0.25 # 0.5x / 2
|
| 306 |
half_width = zone_length * 0.6 # 1.5x / 2
|
| 307 |
|
| 308 |
x_min = int(np.clip(center[0] - half_width, 0, w - 1))
|
|
|
|
| 322 |
# Convert to grayscale for edge detection
|
| 323 |
roi_gray = cv2.cvtColor(roi_bgr, cv2.COLOR_BGR2GRAY)
|
| 324 |
|
| 325 |
+
# Build the per-row search constraint. Prefer a pixel-accurate finger mask
|
| 326 |
+
# when one is supplied (e.g. SAM 2.1 output). Falling back to a full-rect
|
| 327 |
+
# all-ones mask preserves legacy behaviour.
|
| 328 |
+
if finger_mask is not None:
|
| 329 |
+
fm = finger_mask
|
| 330 |
+
if fm.dtype != np.uint8:
|
| 331 |
+
fm = (fm > 0).astype(np.uint8) * 255
|
| 332 |
+
if fm.shape[:2] != image.shape[:2]:
|
| 333 |
+
fm = cv2.resize(
|
| 334 |
+
fm, (image.shape[1], image.shape[0]), interpolation=cv2.INTER_NEAREST
|
| 335 |
+
)
|
| 336 |
+
roi_mask = fm[y_min:y_max, x_min:x_max].copy()
|
| 337 |
+
if not np.any(roi_mask):
|
| 338 |
+
roi_mask = np.ones((roi_height, roi_width), dtype=np.uint8) * 255
|
| 339 |
+
else:
|
| 340 |
+
roi_mask = np.ones((roi_height, roi_width), dtype=np.uint8) * 255
|
| 341 |
|
| 342 |
# Create transform matrix (ROI coords -> original coords)
|
| 343 |
# Simple translation for non-rotated case
|
|
|
|
| 515 |
roi_data: Dict[str, Any],
|
| 516 |
threshold: float = DEFAULT_GRADIENT_THRESHOLD,
|
| 517 |
expected_width_px: Optional[float] = None,
|
| 518 |
+
scale_px_per_cm: Optional[float] = None,
|
| 519 |
+
mask_only: bool = False,
|
| 520 |
) -> Dict[str, Any]:
|
| 521 |
"""
|
| 522 |
Detect left and right finger edges for each row (cross-section).
|
|
|
|
| 578 |
|
| 579 |
# Get finger mask for constrained edge detection (if available)
|
| 580 |
roi_mask = roi_data.get("roi_mask")
|
| 581 |
+
if mask_only and roi_mask is not None:
|
| 582 |
+
mode_used = "mask_only"
|
| 583 |
+
elif roi_mask is not None:
|
| 584 |
+
mode_used = "mask_constrained"
|
| 585 |
+
else:
|
| 586 |
+
mode_used = "axis_expansion"
|
| 587 |
|
| 588 |
if roi_mask is not None:
|
| 589 |
logger.debug(f"Using MASK-CONSTRAINED edge detection (mask shape: {roi_mask.shape})")
|
|
|
|
| 616 |
result = _find_edges_from_axis(row_gradient, row, axis_x, threshold,
|
| 617 |
min_width_px, max_width_px, row_mask,
|
| 618 |
row_gradient_left_to_right=row_gradient_l2r,
|
| 619 |
+
row_gradient_right_to_left=row_gradient_r2l,
|
| 620 |
+
mask_only=mask_only)
|
| 621 |
|
| 622 |
if result is None:
|
| 623 |
continue # No valid edges found
|
|
|
|
| 1007 |
"""
|
| 1008 |
Decide whether to use Sobel measurement or fall back to contour.
|
| 1009 |
|
| 1010 |
+
When the edge detection ran in ``mask_constrained`` mode (a pixel-accurate
|
| 1011 |
+
finger mask was supplied, e.g. from SAM 2.1), the SAM mask boundary IS
|
| 1012 |
+
the ground truth — a low gradient score just reflects weak finger/background
|
| 1013 |
+
contrast, not a bad measurement. In that case we skip the quality/gradient
|
| 1014 |
+
gates entirely and only enforce the safety checks (plausible width,
|
| 1015 |
+
a minimum number of valid samples); the contour comparison is not applied on this path.
|
| 1016 |
+
|
| 1017 |
+
Decision criteria (non-masked path):
|
| 1018 |
1. Edge quality score > min_quality_score (default 0.7)
|
| 1019 |
2. Edge consistency > min_consistency (default 0.5 = 50%)
|
| 1020 |
+
3. Realistic width range
|
| 1021 |
+
4. If contour available: agreement within max_difference_pct
|
| 1022 |
+
|
| 1023 |
+
Decision criteria (mask_constrained path):
|
| 1024 |
+
1. At least MIN_MASK_SAMPLES (20) valid samples
|
| 1025 |
+
2. Realistic width range
|
| 1026 |
|
| 1027 |
Args:
|
| 1028 |
sobel_result: Output from refine_edges_sobel()
|
| 1029 |
contour_result: Optional output from compute_cross_section_width()
|
| 1030 |
+
min_quality_score: Minimum acceptable quality score (ignored for masked)
|
| 1031 |
+
min_consistency: Minimum edge detection success rate (ignored for masked)
|
| 1032 |
max_difference_pct: Maximum allowed difference from contour (%)
|
| 1033 |
|
| 1034 |
Returns:
|
| 1035 |
Tuple of (should_use_sobel, reason)
|
| 1036 |
"""
|
|
|
|
| 1037 |
if "edge_quality" not in sobel_result:
|
| 1038 |
return False, "edge_quality_data_missing"
|
| 1039 |
|
| 1040 |
edge_quality = sobel_result["edge_quality"]
|
| 1041 |
+
mode_used = sobel_result.get("edge_data", {}).get("mode_used", "axis_expansion")
|
| 1042 |
+
mask_anchored = mode_used in ("mask_constrained", "mask_only")
|
| 1043 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1044 |
sobel_width = sobel_result.get("median_width_cm")
|
| 1045 |
if sobel_width is None or sobel_width <= 0:
|
| 1046 |
return False, "invalid_measurement"
|
|
|
|
|
|
|
| 1047 |
if sobel_width < MIN_REALISTIC_WIDTH_CM or sobel_width > MAX_REALISTIC_WIDTH_CM:
|
| 1048 |
return False, f"unrealistic_width_{sobel_width:.2f}cm"
|
| 1049 |
|
| 1050 |
+
if mask_anchored:
|
| 1051 |
+
# SAM mask is the source of truth, but we still require enough
|
| 1052 |
+
# valid rows to form a robust median. A low sample count with an
|
| 1053 |
+
# otherwise reasonable width usually indicates the per-finger mask
|
| 1054 |
+
# bled into an adjacent finger and width validation killed most
|
| 1055 |
+
# rows — contour is safer in that situation.
|
| 1056 |
+
MIN_MASK_SAMPLES = 20 # parity with the contour path's 20 samples
|
| 1057 |
+
num_samples = int(sobel_result.get("num_samples", 0))
|
| 1058 |
+
if num_samples < MIN_MASK_SAMPLES:
|
| 1059 |
+
return False, f"mask_samples_low_{num_samples}"
|
| 1060 |
+
return True, "mask_anchored"
|
| 1061 |
+
|
| 1062 |
+
# Non-masked path: preserve the original gradient-quality gates.
|
| 1063 |
+
if edge_quality["overall_score"] < min_quality_score:
|
| 1064 |
+
return False, f"quality_score_low_{edge_quality['overall_score']:.2f}"
|
| 1065 |
+
|
| 1066 |
+
if edge_quality["consistency_score"] < min_consistency:
|
| 1067 |
+
return False, f"consistency_low_{edge_quality['consistency_score']:.2f}"
|
| 1068 |
+
|
| 1069 |
if contour_result is not None:
|
| 1070 |
contour_width = contour_result.get("median_width_px")
|
| 1071 |
sobel_width_px = sobel_result.get("median_width_px")
|
|
|
|
| 1072 |
if contour_width and sobel_width_px:
|
| 1073 |
diff_pct = abs(sobel_width_px - contour_width) / contour_width * 100
|
|
|
|
| 1074 |
if diff_pct > max_difference_pct:
|
| 1075 |
return False, f"disagrees_with_contour_{diff_pct:.1f}pct"
|
| 1076 |
|
|
|
|
| 1077 |
return True, "quality_acceptable"
|
| 1078 |
|
| 1079 |
|
|
|
|
| 1089 |
use_subpixel: bool = True,
|
| 1090 |
expected_width_px: Optional[float] = None,
|
| 1091 |
debug_dir: Optional[str] = None,
|
| 1092 |
+
finger_mask: Optional[np.ndarray] = None,
|
| 1093 |
+
mask_mode: str = "hybrid",
|
| 1094 |
+
finger_name: Optional[str] = None,
|
| 1095 |
) -> Dict[str, Any]:
|
| 1096 |
"""
|
| 1097 |
Main entry point for Sobel-based edge refinement.
|
|
|
|
| 1148 |
# A.2: Ring zone + ROI bounds (need to extract bounds first)
|
| 1149 |
# We'll save this after ROI extraction
|
| 1150 |
|
| 1151 |
+
# Step 1: Extract ROI. Pure-Sobel mode drops the SAM mask so the gradient
|
| 1152 |
+
# search expands from the finger axis without any mask anchoring; the
|
| 1153 |
+
# "mask_only" and legacy "hybrid" modes both pass the mask through.
|
| 1154 |
+
roi_finger_mask = None if mask_mode == "sobel_only" else finger_mask
|
| 1155 |
roi_data = extract_ring_zone_roi(
|
| 1156 |
image, axis_data, zone_data,
|
| 1157 |
+
rotate_align=rotate_align,
|
| 1158 |
+
finger_mask=roi_finger_mask,
|
| 1159 |
+
finger_name=finger_name,
|
| 1160 |
)
|
| 1161 |
|
| 1162 |
logger.debug(f"ROI size: {roi_data['roi_width']}x{roi_data['roi_height']}px")
|
|
|
|
| 1191 |
grad_mag = draw_gradient_visualization(gradient_data["gradient_magnitude"], cv2.COLORMAP_HOT)
|
| 1192 |
observer.save_stage("06_gradient_magnitude", grad_mag)
|
| 1193 |
|
| 1194 |
+
# Step 3: Detect edges per row. In "mask_only" mode the mask boundary is
|
| 1195 |
+
# used verbatim so gradient snapping is disabled.
|
| 1196 |
edge_data = detect_edges_per_row(
|
| 1197 |
gradient_data, roi_data,
|
| 1198 |
threshold=sobel_threshold,
|
| 1199 |
expected_width_px=expected_width_px,
|
| 1200 |
+
scale_px_per_cm=scale_px_per_cm,
|
| 1201 |
+
mask_only=(mask_mode == "mask_only"),
|
| 1202 |
)
|
| 1203 |
|
| 1204 |
logger.debug(f"Valid rows: {edge_data['num_valid_rows']}/{len(edge_data['valid_rows'])} ({edge_data['num_valid_rows']/len(edge_data['valid_rows'])*100:.1f}%)")
|
|
|
|
| 1227 |
# B.5: Selected edges (final detected edges)
|
| 1228 |
observer.draw_and_save("09_selected_edges", roi_data["roi_image"], draw_selected_edges, edge_data)
|
| 1229 |
|
| 1230 |
+
# Step 4: Measure width from edges (with sub-pixel refinement).
|
| 1231 |
+
# Sub-pixel refinement is gradient-based, so it is skipped in mask_only.
|
| 1232 |
+
effective_subpixel = use_subpixel and mask_mode != "mask_only"
|
| 1233 |
width_data = measure_width_from_edges(
|
| 1234 |
edge_data, roi_data, scale_px_per_cm,
|
| 1235 |
gradient_data=gradient_data,
|
| 1236 |
+
use_subpixel=effective_subpixel,
|
| 1237 |
)
|
| 1238 |
|
| 1239 |
if debug_dir:
|
src/finger_segmentation.py
CHANGED
|
@@ -278,6 +278,7 @@ def segment_hand(
|
|
| 278 |
finger: FingerIndex = "index",
|
| 279 |
max_dimension: int = 1280,
|
| 280 |
debug_dir: Optional[str] = None,
|
|
|
|
| 281 |
) -> Optional[Dict[str, Any]]:
|
| 282 |
"""
|
| 283 |
Detect and segment hand from image using MediaPipe.
|
|
@@ -292,10 +293,17 @@ def segment_hand(
|
|
| 292 |
Dictionary containing:
|
| 293 |
- landmarks: 21x2 array of landmark positions (pixel coordinates)
|
| 294 |
- landmarks_normalized: 21x2 array of normalized coordinates [0-1]
|
| 295 |
-
- mask: Binary hand mask
|
|
|
|
|
|
|
| 296 |
- confidence: Detection confidence
|
| 297 |
- handedness: "Left" or "Right"
|
| 298 |
Or None if no hand detected
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
"""
|
| 300 |
# Create debug observer if debug mode enabled
|
| 301 |
observer = DebugObserver(debug_dir) if debug_dir else None
|
|
@@ -427,13 +435,34 @@ def segment_hand(
|
|
| 427 |
handedness[0].category_name,
|
| 428 |
f"det={rotation_code}, orient={orientation_rotation}")
|
| 429 |
|
| 430 |
-
#
|
| 431 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 432 |
|
| 433 |
return {
|
| 434 |
"landmarks": landmarks_canonical,
|
| 435 |
"landmarks_normalized": landmarks_normalized_canonical,
|
| 436 |
"mask": mask,
|
|
|
|
|
|
|
| 437 |
"confidence": handedness[0].score,
|
| 438 |
"handedness": handedness[0].category_name,
|
| 439 |
"rotation_applied": rotation_code,
|
|
|
|
| 278 |
finger: FingerIndex = "index",
|
| 279 |
max_dimension: int = 1280,
|
| 280 |
debug_dir: Optional[str] = None,
|
| 281 |
+
use_sam_mask: bool = True,
|
| 282 |
) -> Optional[Dict[str, Any]]:
|
| 283 |
"""
|
| 284 |
Detect and segment hand from image using MediaPipe.
|
|
|
|
| 293 |
Dictionary containing:
|
| 294 |
- landmarks: 21x2 array of landmark positions (pixel coordinates)
|
| 295 |
- landmarks_normalized: 21x2 array of normalized coordinates [0-1]
|
| 296 |
+
- mask: Binary hand mask (pixel-accurate SAM 2.1 mask if use_sam_mask=True,
|
| 297 |
+
else synthetic convex-hull fallback from landmarks)
|
| 298 |
+
- mask_synthetic: Synthetic fallback mask (always populated for debug)
|
| 299 |
- confidence: Detection confidence
|
| 300 |
- handedness: "Left" or "Right"
|
| 301 |
Or None if no hand detected
|
| 302 |
+
|
| 303 |
+
Args:
|
| 304 |
+
use_sam_mask: If True (default), call SAM 2.1 seeded by the palm-center
|
| 305 |
+
landmark to produce a pixel-accurate hand mask. If False, use the
|
| 306 |
+
legacy synthetic convex-hull mask built from landmarks only.
|
| 307 |
"""
|
| 308 |
# Create debug observer if debug mode enabled
|
| 309 |
observer = DebugObserver(debug_dir) if debug_dir else None
|
|
|
|
| 435 |
handedness[0].category_name,
|
| 436 |
f"det={rotation_code}, orient={orientation_rotation}")
|
| 437 |
|
| 438 |
+
# Legacy synthetic mask (convex hull of landmarks) — kept for fallback and debug
|
| 439 |
+
synthetic_mask = _create_hand_mask(landmarks_canonical, (can_full_h, can_full_w))
|
| 440 |
+
|
| 441 |
+
# SAM 2.1 pixel-accurate hand mask (default), seeded by palm center landmark
|
| 442 |
+
mask = synthetic_mask
|
| 443 |
+
mask_source = "synthetic"
|
| 444 |
+
if use_sam_mask:
|
| 445 |
+
try:
|
| 446 |
+
from .sam_hand_segmentation import segment_hand_sam, palm_center_from_landmarks
|
| 447 |
+
palm_xy = palm_center_from_landmarks(landmarks_canonical)
|
| 448 |
+
sam_debug_dir = str(Path(debug_dir) / "sam_hand") if debug_dir else None
|
| 449 |
+
sam_mask_bool = segment_hand_sam(
|
| 450 |
+
canonical_full, palm_xy, debug_dir=sam_debug_dir
|
| 451 |
+
)
|
| 452 |
+
if sam_mask_bool is not None:
|
| 453 |
+
mask = (sam_mask_bool.astype(np.uint8) * 255)
|
| 454 |
+
mask_source = "sam"
|
| 455 |
+
else:
|
| 456 |
+
print(" SAM hand mask returned None, falling back to synthetic")
|
| 457 |
+
except Exception as e:
|
| 458 |
+
print(f" SAM hand mask failed ({e!r}), falling back to synthetic")
|
| 459 |
|
| 460 |
return {
|
| 461 |
"landmarks": landmarks_canonical,
|
| 462 |
"landmarks_normalized": landmarks_normalized_canonical,
|
| 463 |
"mask": mask,
|
| 464 |
+
"mask_synthetic": synthetic_mask,
|
| 465 |
+
"mask_source": mask_source,
|
| 466 |
"confidence": handedness[0].score,
|
| 467 |
"handedness": handedness[0].category_name,
|
| 468 |
"rotation_applied": rotation_code,
|
src/geometry.py
CHANGED
|
@@ -92,22 +92,28 @@ def estimate_finger_axis_from_landmarks(
|
|
| 92 |
"""
|
| 93 |
Calculate finger axis directly from anatomical landmarks.
|
| 94 |
|
| 95 |
-
OPTIMIZED: Focuses on
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
Args:
|
| 98 |
landmarks: 4x2 array of finger landmarks [MCP, PIP, DIP, TIP]
|
| 99 |
method: Calculation method
|
| 100 |
- "endpoints": MCP to TIP vector (legacy, less accurate)
|
| 101 |
-
- "linear_fit":
|
| 102 |
- "median_direction": Median of 3 segment directions (robust to outliers)
|
| 103 |
|
| 104 |
Returns:
|
| 105 |
Dictionary containing:
|
| 106 |
-
- center: Axis center point at midpoint of
|
| 107 |
-
- direction: Unit direction vector (dx, dy)
|
| 108 |
- length: Full finger length in pixels (TIP to MCP, for reference)
|
| 109 |
-
- palm_end: Visualization endpoint (extended from
|
| 110 |
-
- tip_end: Visualization endpoint (extended from
|
| 111 |
- method: Method used ("landmarks")
|
| 112 |
"""
|
| 113 |
# Validate landmarks
|
|
@@ -122,7 +128,7 @@ def estimate_finger_axis_from_landmarks(
|
|
| 122 |
tip = landmarks[3] # Fingertip
|
| 123 |
|
| 124 |
# Calculate direction based on method
|
| 125 |
-
# OPTIMIZED: Focus on
|
| 126 |
if method == "endpoints":
|
| 127 |
# Simple: vector from MCP to TIP (legacy, less accurate for ring zone)
|
| 128 |
direction = tip - mcp
|
|
@@ -130,14 +136,16 @@ def estimate_finger_axis_from_landmarks(
|
|
| 130 |
direction = direction / direction_length
|
| 131 |
|
| 132 |
elif method == "linear_fit":
|
| 133 |
-
# OPTIMIZED: Use
|
| 134 |
-
#
|
| 135 |
-
|
|
|
|
|
|
|
| 136 |
direction_length = np.linalg.norm(direction)
|
| 137 |
direction = direction / direction_length
|
| 138 |
|
| 139 |
-
#
|
| 140 |
-
#
|
| 141 |
if np.dot(direction, tip - mcp) < 0:
|
| 142 |
direction = -direction
|
| 143 |
|
|
@@ -156,18 +164,18 @@ def estimate_finger_axis_from_landmarks(
|
|
| 156 |
else:
|
| 157 |
raise ValueError(f"Unknown method: {method}. Use 'endpoints', 'linear_fit', or 'median_direction'")
|
| 158 |
|
| 159 |
-
# OPTIMIZED: Center
|
| 160 |
-
center = (
|
| 161 |
|
| 162 |
# Calculate finger length (still use full finger for reference)
|
| 163 |
length = np.linalg.norm(tip - mcp)
|
| 164 |
|
| 165 |
-
# OPTIMIZED: Visual endpoints
|
| 166 |
-
#
|
| 167 |
-
segment_length = np.linalg.norm(
|
| 168 |
extension_factor = 0.5 # Extend 50% beyond each endpoint for visualization
|
| 169 |
-
palm_end =
|
| 170 |
-
tip_end =
|
| 171 |
|
| 172 |
return {
|
| 173 |
"center": center.astype(np.float32),
|
|
|
|
| 92 |
"""
|
| 93 |
Calculate finger axis directly from anatomical landmarks.
|
| 94 |
|
| 95 |
+
OPTIMIZED: Focuses on the PIP-MCP segment (proximal phalanx, where the
|
| 96 |
+
ring actually sits) for better accuracy. For straight fingers (index,
|
| 97 |
+
middle) this agrees with the DIP-PIP direction to within ~1°, but ring
|
| 98 |
+
and pinky often hold a visible PIP-joint curl, so the proximal phalanx
|
| 99 |
+
is at a different angle from the middle phalanx. Rotating by the
|
| 100 |
+
proximal-phalanx direction makes the ring zone exactly vertical and
|
| 101 |
+
cross-sections perpendicular to the bone we measure.
|
| 102 |
|
| 103 |
Args:
|
| 104 |
landmarks: 4x2 array of finger landmarks [MCP, PIP, DIP, TIP]
|
| 105 |
method: Calculation method
|
| 106 |
- "endpoints": MCP to TIP vector (legacy, less accurate)
|
| 107 |
+
- "linear_fit": MCP to PIP vector (DEFAULT, proximal phalanx)
|
| 108 |
- "median_direction": Median of 3 segment directions (robust to outliers)
|
| 109 |
|
| 110 |
Returns:
|
| 111 |
Dictionary containing:
|
| 112 |
+
- center: Axis center point at midpoint of MCP-PIP (x, y)
|
| 113 |
+
- direction: Unit direction vector (dx, dy) pointing palm→tip
|
| 114 |
- length: Full finger length in pixels (TIP to MCP, for reference)
|
| 115 |
+
- palm_end: Visualization endpoint (extended from MCP toward palm)
|
| 116 |
+
- tip_end: Visualization endpoint (extended from PIP toward tip)
|
| 117 |
- method: Method used ("landmarks")
|
| 118 |
"""
|
| 119 |
# Validate landmarks
|
|
|
|
| 128 |
tip = landmarks[3] # Fingertip
|
| 129 |
|
| 130 |
# Calculate direction based on method
|
| 131 |
+
# OPTIMIZED: Focus on the PIP-MCP segment (proximal phalanx = ring zone)
|
| 132 |
if method == "endpoints":
|
| 133 |
# Simple: vector from MCP to TIP (legacy, less accurate for ring zone)
|
| 134 |
direction = tip - mcp
|
|
|
|
| 136 |
direction = direction / direction_length
|
| 137 |
|
| 138 |
elif method == "linear_fit":
|
| 139 |
+
# OPTIMIZED: Use MCP→PIP, the proximal phalanx bone that a ring
|
| 140 |
+
# actually rests on. For ring and pinky this differs from the old
|
| 141 |
+
# DIP-PIP direction by the PIP-joint curl angle, which was
|
| 142 |
+
# silently tilting the measurement frame.
|
| 143 |
+
direction = pip - mcp # Vector from MCP to PIP (palm→tip)
|
| 144 |
direction_length = np.linalg.norm(direction)
|
| 145 |
direction = direction / direction_length
|
| 146 |
|
| 147 |
+
# Sanity check: direction should point palm→tip. (MCP→PIP already
|
| 148 |
+
# does, but verify in case landmarks are swapped.)
|
| 149 |
if np.dot(direction, tip - mcp) < 0:
|
| 150 |
direction = -direction
|
| 151 |
|
|
|
|
| 164 |
else:
|
| 165 |
raise ValueError(f"Unknown method: {method}. Use 'endpoints', 'linear_fit', or 'median_direction'")
|
| 166 |
|
| 167 |
+
# OPTIMIZED: Center on the proximal phalanx midpoint (the ring zone).
|
| 168 |
+
center = (mcp + pip) / 2.0
|
| 169 |
|
| 170 |
# Calculate finger length (still use full finger for reference)
|
| 171 |
length = np.linalg.norm(tip - mcp)
|
| 172 |
|
| 173 |
+
# OPTIMIZED: Visual endpoints span the proximal phalanx (MCP→PIP)
|
| 174 |
+
# extended slightly for visualization clarity.
|
| 175 |
+
segment_length = np.linalg.norm(pip - mcp)
|
| 176 |
extension_factor = 0.5 # Extend 50% beyond each endpoint for visualization
|
| 177 |
+
palm_end = mcp - direction * (segment_length * extension_factor)
|
| 178 |
+
tip_end = pip + direction * (segment_length * extension_factor)
|
| 179 |
|
| 180 |
return {
|
| 181 |
"center": center.astype(np.float32),
|
src/sam_backend.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared lazy singleton for SAM 2.1 Tiny (model + processor).
|
| 2 |
+
|
| 3 |
+
Both card detection (prompt-based) and hand segmentation use the same
|
| 4 |
+
HuggingFace weights, so loading them once per process halves cold-start
|
| 5 |
+
cost and keeps only one copy of the encoder in memory.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import time
|
| 12 |
+
from typing import Tuple
|
| 13 |
+
|
| 14 |
+
# Bump the default HF Hub HEAD/download timeout (10s) before transformers
# reads the env var. On flaky networks the 10s HEAD check fires a retry storm
# even when the weights are already cached locally.
# setdefault keeps any value the operator has already exported.
os.environ.setdefault("HF_HUB_DOWNLOAD_TIMEOUT", "60")

# SAM 2.1 Hiera *Tiny* checkpoint. This module is documented as the shared
# "SAM 2.1 Tiny" singleton and everything runs on CPU, so the id must name
# the tiny variant rather than the large one.
SAM2_MODEL_ID = "facebook/sam2.1-hiera-tiny"

# SAM resizes internally to 1024 — feeding >1024 wastes CPU on image encoding.
INFERENCE_MAX_SIDE = 1024

# Lazily populated by get_sam2(); None until the first successful load.
_model = None
_processor = None
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def get_sam2() -> Tuple[object, object]:
    """Return the process-wide ``(model, processor)`` pair for SAM 2.1.

    The pair is cached in the module globals ``_model`` / ``_processor`` and
    is only loaded when either of them is still ``None``.

    Loading is attempted with ``local_files_only=True`` first, so weights that
    are already on disk are used without contacting huggingface.co. If that
    attempt raises ``OSError`` or ``ValueError`` (treated here as a cache
    miss), a regular online ``from_pretrained`` load is performed instead.
    The model is moved to CPU and switched to eval mode in both cases.
    """
    global _model, _processor

    # Fast path: both halves of the singleton are already loaded.
    if _model is not None and _processor is not None:
        return _model, _processor

    # Imported lazily so that importing this module does not pull in transformers.
    from transformers import Sam2Model, Sam2Processor

    started = time.time()
    print(f" Loading SAM 2.1 ({SAM2_MODEL_ID})...")
    try:
        _processor = Sam2Processor.from_pretrained(SAM2_MODEL_ID, local_files_only=True)
        _model = (
            Sam2Model.from_pretrained(SAM2_MODEL_ID, local_files_only=True)
            .to("cpu")
            .eval()
        )
        print(f" SAM 2.1 loaded (offline cache) in {time.time() - started:.1f}s")
    except (OSError, ValueError):
        # Cache miss — fall back to online download.
        _processor = Sam2Processor.from_pretrained(SAM2_MODEL_ID)
        _model = Sam2Model.from_pretrained(SAM2_MODEL_ID).to("cpu").eval()
        print(f" SAM 2.1 loaded (online) in {time.time() - started:.1f}s")

    return _model, _processor
|
src/sam_card_detection.py
ADDED
|
@@ -0,0 +1,614 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SAM 2.1-based credit card detection.
|
| 3 |
+
|
| 4 |
+
Uses Meta's Segment Anything 2.1 (Hiera Tiny) via HuggingFace transformers
|
| 5 |
+
to produce a pixel-accurate card mask, then filters candidate masks by area,
|
| 6 |
+
rectangularity, and aspect ratio (~1.586) to pick the credit card.
|
| 7 |
+
|
| 8 |
+
Drop-in replacement for `card_detection.detect_credit_card`: returns a dict
|
| 9 |
+
with the same keys so the downstream pipeline is unchanged.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import os
|
| 15 |
+
import time
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 18 |
+
|
| 19 |
+
import cv2
|
| 20 |
+
import numpy as np
|
| 21 |
+
|
| 22 |
+
from .card_detection import (
|
| 23 |
+
CARD_ASPECT_RATIO,
|
| 24 |
+
MAX_CARD_AREA_RATIO,
|
| 25 |
+
MIN_CARD_AREA_RATIO,
|
| 26 |
+
get_quad_dimensions,
|
| 27 |
+
order_corners,
|
| 28 |
+
)
|
| 29 |
+
from .sam_backend import INFERENCE_MAX_SIDE as PROMPT_INFERENCE_MAX_SIDE, get_sam2
|
| 30 |
+
|
| 31 |
+
# HF Hub model id — small, fast SAM 2.1 variant that fits CPU / HF free Spaces.
# The module docstring names Hiera Tiny, so the id points at the tiny checkpoint.
SAM2_MODEL_ID = "facebook/sam2.1-hiera-tiny"

# Downscale large images before inference to keep CPU runtime tractable.
# SAM 2.1 internally resizes to 1024, so feeding >1024 is pure overhead.
INFERENCE_MAX_SIDE = 1024

# Automatic mask generation grid density. 16 gives ~256 prompts — enough to
# hit a credit card reliably without blowing up CPU time.
POINTS_PER_SIDE = 16
POINTS_PER_BATCH = 64

# Candidate filtering
MIN_RECTANGULARITY = 0.90  # largest-contour area / minAreaRect area; a card is a near-perfect rectangle
ASPECT_RATIO_TOLERANCE = 0.15  # fractional deviation from 1.586

_pipeline = None  # lazy singleton, populated by _get_pipeline()
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _get_pipeline():
    """Return the cached SAM 2.1 ``mask-generation`` pipeline, building it once.

    The pipeline is stored in the module global ``_pipeline``. On the first
    call it is created on CPU, first with ``local_files_only=True`` passed via
    ``model_kwargs`` (to avoid huggingface.co round-trips when the weights are
    cached), and, if that raises ``OSError`` or ``ValueError``, again without
    that restriction so the weights can be fetched online.
    """
    global _pipeline

    if _pipeline is not None:
        return _pipeline

    # Raise the hub timeout unless the environment already defines one.
    os.environ.setdefault("HF_HUB_DOWNLOAD_TIMEOUT", "60")

    # Lazy import: transformers is only needed once a pipeline is requested.
    from transformers import pipeline as hf_pipeline

    started = time.time()
    print(f" Loading SAM 2.1 ({SAM2_MODEL_ID})...")
    try:
        _pipeline = hf_pipeline(
            "mask-generation",
            model=SAM2_MODEL_ID,
            device="cpu",
            model_kwargs={"local_files_only": True},
        )
        print(f" SAM 2.1 loaded (offline cache) in {time.time() - started:.1f}s")
    except (OSError, ValueError):
        # Offline attempt failed; retry allowing a download.
        _pipeline = hf_pipeline("mask-generation", model=SAM2_MODEL_ID, device="cpu")
        print(f" SAM 2.1 loaded (online) in {time.time() - started:.1f}s")

    return _pipeline
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def _downscale_for_inference(image_bgr: np.ndarray) -> Tuple[np.ndarray, float]:
    """Shrink ``image_bgr`` so its longer side equals ``INFERENCE_MAX_SIDE``.

    Images whose longer side is already within the limit are returned
    unchanged with a factor of ``1.0``.

    Returns:
        ``(scaled, scale_factor)`` where ``scale_factor`` is original/scaled:
        multiplying a coordinate in the scaled image by it yields the
        coordinate in the original image.
    """
    height, width = image_bgr.shape[:2]
    longest = max(height, width)

    # Nothing to do for images that already fit.
    if longest <= INFERENCE_MAX_SIDE:
        return image_bgr, 1.0

    scale = INFERENCE_MAX_SIDE / longest
    target_size = (int(round(width * scale)), int(round(height * scale)))  # (w, h) for cv2
    scaled = cv2.resize(image_bgr, target_size, interpolation=cv2.INTER_AREA)

    # Inverse of the shrink factor maps scaled -> original coordinates.
    return scaled, 1.0 / scale
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def _mask_to_bool_array(mask: Any, target_shape: Optional[Tuple[int, int]] = None) -> np.ndarray:
|
| 94 |
+
"""Convert SAM output mask (torch tensor, PIL, or ndarray) to a bool ndarray.
|
| 95 |
+
|
| 96 |
+
If target_shape is given and differs, resize with nearest neighbor.
|
| 97 |
+
"""
|
| 98 |
+
if hasattr(mask, "cpu"):
|
| 99 |
+
arr = mask.cpu().numpy()
|
| 100 |
+
else:
|
| 101 |
+
arr = np.asarray(mask)
|
| 102 |
+
if arr.dtype != bool:
|
| 103 |
+
arr = arr > 0
|
| 104 |
+
if target_shape is not None and arr.shape != target_shape:
|
| 105 |
+
arr_u8 = arr.astype(np.uint8) * 255
|
| 106 |
+
resized = cv2.resize(
|
| 107 |
+
arr_u8, (target_shape[1], target_shape[0]), interpolation=cv2.INTER_NEAREST
|
| 108 |
+
)
|
| 109 |
+
arr = resized > 127
|
| 110 |
+
return arr
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def _score_card_mask(
    mask: np.ndarray,
    image_area: float,
) -> Optional[Dict[str, Any]]:
    """Rate how card-like a candidate mask is, or reject it.

    The mask is rejected (``None``) when any of these filters fails, in order:
    mask area as a fraction of ``image_area`` outside
    ``[MIN_CARD_AREA_RATIO, MAX_CARD_AREA_RATIO]``; no external contour;
    degenerate contour or bounding rectangle; rectangularity below
    ``MIN_RECTANGULARITY``; non-positive side length; aspect ratio deviating
    from ``CARD_ASPECT_RATIO`` by more than ``ASPECT_RATIO_TOLERANCE``.

    Returns:
        Dict with ``corners``, ``contour``, ``width``, ``height``, ``area``,
        ``aspect_ratio``, ``rectangularity``, ``score`` and ``mask``, or
        ``None`` if the mask is rejected.
    """
    pixel_count = float(mask.sum())
    area_ratio = pixel_count / image_area
    if area_ratio < MIN_CARD_AREA_RATIO or area_ratio > MAX_CARD_AREA_RATIO:
        return None

    found, _ = cv2.findContours(
        mask.astype(np.uint8) * 255, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )
    if not found:
        return None

    # SAM masks can be slightly disconnected; the largest piece is taken as the card body.
    outline = max(found, key=cv2.contourArea)
    outline_area = cv2.contourArea(outline)
    if outline_area <= 0:
        return None

    # Minimum-area rotated rectangle around the outline, as four corner points.
    box = cv2.boxPoints(cv2.minAreaRect(outline))
    box_area = cv2.contourArea(box.astype(np.float32))
    if box_area <= 0:
        return None

    # Fraction of the bounding rectangle filled by the outline.
    rectangularity = outline_area / box_area
    if rectangularity < MIN_RECTANGULARITY:
        return None

    corners = order_corners(box)
    width, height = get_quad_dimensions(corners)
    if width <= 0 or height <= 0:
        return None

    # Long side over short side, so the ratio is orientation-independent.
    aspect_ratio = max(width, height) / min(width, height)
    ratio_diff = abs(aspect_ratio - CARD_ASPECT_RATIO) / CARD_ASPECT_RATIO
    if ratio_diff > ASPECT_RATIO_TOLERANCE:
        return None

    # Each component is normalised to [0, 1] over its accepted range.
    ratio_score = 1.0 - ratio_diff / ASPECT_RATIO_TOLERANCE
    rect_score = (rectangularity - MIN_RECTANGULARITY) / (1.0 - MIN_RECTANGULARITY)
    area_score = min(area_ratio / 0.1, 1.0)  # saturates at 10% of the image area
    score = 0.4 * ratio_score + 0.4 * rect_score + 0.2 * area_score

    return {
        "corners": corners,
        "contour": outline,
        "width": width,
        "height": height,
        "area": pixel_count,
        "aspect_ratio": aspect_ratio,
        "rectangularity": rectangularity,
        "score": score,
        "mask": mask,
    }
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def _save_debug(
    debug_dir: str,
    image_bgr: np.ndarray,
    all_masks: List[np.ndarray],
    scored: List[Dict[str, Any]],
    best: Optional[Dict[str, Any]],
) -> None:
    """Write the SAM card-detection debug images into ``debug_dir``.

    Creates the directory if needed and saves:
      * ``01_all_sam_masks.png``  - every mask blended 50/50 with a random colour
      * ``02_card_candidates.png`` - accepted candidates with score / aspect labels
      * ``03_final_selection.png`` - the winning mask and corners (only if ``best``)
    """
    out_dir = Path(debug_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # 01: all SAM masks overlaid (colored). Fixed seed keeps colours stable between runs.
    blended = image_bgr.copy()
    rng = np.random.default_rng(42)
    for region in all_masks:
        color = rng.integers(64, 255, size=3).tolist()
        blended[region] = (0.5 * blended[region] + 0.5 * np.array(color)).astype(np.uint8)
    cv2.imwrite(str(out_dir / "01_all_sam_masks.png"), blended)

    # 02: scored card candidates (green contours, score labels)
    candidates_img = image_bgr.copy()
    for cand in scored:
        quad = cand["corners"].astype(np.int32)
        cv2.polylines(candidates_img, [quad], True, (0, 255, 0), 3)
        cv2.putText(
            candidates_img,
            f"{cand['score']:.2f} ar={cand['aspect_ratio']:.3f}",
            tuple(quad[0]),
            cv2.FONT_HERSHEY_SIMPLEX,
            1.2,
            (0, 255, 0),
            3,
            cv2.LINE_AA,
        )
    cv2.imwrite(str(out_dir / "02_card_candidates.png"), candidates_img)

    # 03: final selection
    if best is None:
        return

    # Tint the selected mask green by adding it to the green channel only.
    green_tint = np.zeros_like(image_bgr)
    green_tint[:, :, 1] = best["mask"].astype(np.uint8) * 255
    final = cv2.addWeighted(image_bgr.copy(), 1.0, green_tint, 0.35, 0)

    quad = best["corners"].astype(np.int32)
    cv2.polylines(final, [quad], True, (0, 255, 0), 4)
    for corner in quad:
        cv2.circle(final, tuple(corner), 10, (0, 0, 255), -1)

    label = (
        f"SAM card score={best['score']:.3f} "
        f"ar={best['aspect_ratio']:.3f} rect={best['rectangularity']:.3f}"
    )
    # Thick white pass first, then thin green on top, so the text has an outline.
    for text_color, thickness in (((255, 255, 255), 5), ((0, 255, 0), 2)):
        cv2.putText(final, label, (30, 60), cv2.FONT_HERSHEY_SIMPLEX, 1.1,
                    text_color, thickness, cv2.LINE_AA)
    cv2.imwrite(str(out_dir / "03_final_selection.png"), final)
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
def detect_credit_card_sam(
    image: np.ndarray,
    debug_dir: Optional[str] = None,
) -> Optional[Dict[str, Any]]:
    """Find a credit card in ``image`` with SAM 2.1 automatic mask generation.

    The image is downscaled for inference, every returned mask is resized back
    to the input resolution, each mask is scored with ``_score_card_mask`` and
    the highest-scoring accepted candidate wins.

    Args:
        image: Input BGR image (any size).
        debug_dir: Optional directory to save debug visualizations.

    Returns:
        Dict with ``corners``, ``contour``, ``confidence``, ``width_px``,
        ``height_px``, ``aspect_ratio``, ``mask`` and ``mask_source``
        (``"sam_amg"``), or ``None`` if no card-like mask was found.
    """
    from PIL import Image as PILImage

    if debug_dir:
        print(f" SAM card detection debug → {debug_dir}")

    rows, cols = image.shape[:2]
    image_area = float(rows * cols)

    # The scale-back factor is not needed: masks are resized to (rows, cols) directly.
    small_bgr, _ = _downscale_for_inference(image)
    pil_input = PILImage.fromarray(cv2.cvtColor(small_bgr, cv2.COLOR_BGR2RGB))

    sam = _get_pipeline()
    started = time.time()
    output = sam(
        pil_input,
        points_per_side=POINTS_PER_SIDE,
        points_per_batch=POINTS_PER_BATCH,
    )
    print(f" SAM inference: {time.time() - started:.1f}s → {len(output['masks'])} masks")

    # Upscale masks back to original resolution once, keep them for scoring + debug
    full_res_masks: List[np.ndarray] = [
        _mask_to_bool_array(m, target_shape=(rows, cols)) for m in output["masks"]
    ]

    # Keep only masks that pass every card filter, best score first.
    evaluated = (_score_card_mask(m, image_area) for m in full_res_masks)
    scored: List[Dict[str, Any]] = [cand for cand in evaluated if cand is not None]
    scored.sort(key=lambda d: d["score"], reverse=True)
    best = scored[0] if scored else None

    if debug_dir:
        _save_debug(debug_dir, image, full_res_masks, scored, best)

    if best is None:
        print(" SAM: no card-like mask found")
        return None

    print(
        f" SAM card: score={best['score']:.3f}, aspect={best['aspect_ratio']:.3f}, "
        f"rect={best['rectangularity']:.3f}, {best['width']:.0f}x{best['height']:.0f}px"
    )

    return {
        "corners": best["corners"],
        # The ordered corner quad is also what is reported under "contour".
        "contour": best["corners"],
        "confidence": float(best["score"]),
        "width_px": float(best["width"]),
        "height_px": float(best["height"]),
        "aspect_ratio": float(best["aspect_ratio"]),
        "mask": best["mask"],
        "mask_source": "sam_amg",
    }
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
# ---------------------------------------------------------------------------
|
| 312 |
+
# Prompt-based card detection (fast path)
|
| 313 |
+
# ---------------------------------------------------------------------------
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
def suggest_card_seeds(
    hand_mask: np.ndarray,
    image_shape: Tuple[int, int],
    max_seeds: int = 25,
) -> List[Tuple[int, int]]:
    """Uniform 5x5 grid of seed points, with hand-mask points dropped.

    Lay out a 5x5 lattice over the inner core of the image (outer 20% margin
    trimmed on each axis because cards never sit flush against the image
    edge), then discard any point that falls inside the hand mask. Points are
    emitted row by row (top to bottom, left to right within a row).

    Args:
        hand_mask: bool or uint8 hand mask, indexed as ``[y, x]``; non-bool
            input is converted with ``astype(bool)``.
        image_shape: (h, w) of the canonical image.
        max_seeds: hard cap on returned seeds (default 25 = full 5x5 grid).
            A value <= 0 yields an empty list.

    Returns:
        List of (x, y) pixel coordinates in the canonical image frame, at
        most ``max_seeds`` long.
    """
    # The cap used to be checked only after the first append, so a
    # non-positive cap still produced one seed.
    if max_seeds <= 0:
        return []

    h, w = image_shape
    mask_bool = hand_mask.astype(bool) if hand_mask.dtype != bool else hand_mask

    # 5x5 grid in [0.2, 0.8] × [0.2, 0.8] of the image, clamped to valid indices.
    fracs = (0.20, 0.35, 0.50, 0.65, 0.80)
    xs_grid = [max(0, min(w - 1, int(round(w * f)))) for f in fracs]
    ys_grid = [max(0, min(h - 1, int(round(h * f)))) for f in fracs]

    seeds: List[Tuple[int, int]] = [
        (px, py)
        for py in ys_grid
        for px in xs_grid
        if not mask_bool[py, px]  # skip points that land on the hand
    ]
    return seeds[:max_seeds]
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
def _downscale_prompt(image_bgr: np.ndarray) -> Tuple[np.ndarray, float]:
    """Shrink ``image_bgr`` for prompt-based inference.

    The longer side is reduced to ``PROMPT_INFERENCE_MAX_SIDE``; images already
    within that limit are returned untouched.

    Returns:
        ``(scaled, scale_back)`` where ``scale_back`` is the reciprocal of the
        applied shrink factor (``1.0`` when no resize happened).
    """
    height, width = image_bgr.shape[:2]
    longest = max(height, width)

    if longest <= PROMPT_INFERENCE_MAX_SIDE:
        return image_bgr, 1.0

    shrink = PROMPT_INFERENCE_MAX_SIDE / longest
    target_size = (int(round(width * shrink)), int(round(height * shrink)))  # (w, h) for cv2
    resized = cv2.resize(image_bgr, target_size, interpolation=cv2.INTER_AREA)
    return resized, 1.0 / shrink
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
def _save_prompt_debug(
    debug_dir: str,
    image_bgr: np.ndarray,
    seeds: List[Tuple[int, int]],
    negatives: List[Tuple[int, int]],
    candidate_masks: List[np.ndarray],
    scored: List[Dict[str, Any]],
    best: Optional[Dict[str, Any]],
) -> None:
    """Write debug PNGs for prompt-based card detection into ``debug_dir``.

    The directory is created if it does not exist. Files written:

    - ``01_prompt_points.png``: seeds as green discs, negatives as red discs.
    - ``02_candidate_masks.png``: every non-empty candidate mask blended
      50/50 with a pseudo-random colour (fixed RNG seed, so colours are
      reproducible between runs).
    - ``03_scored.png``: outline of each entry in ``scored`` labelled with
      its score and aspect ratio.
    - ``04_final_selection.png``: written only when ``best`` is not None;
      shows the winning mask tinted green, its outline and its corners.

    All points, corners and masks must be expressed in the pixel frame of
    ``image_bgr``; nothing is rescaled here.

    Args:
        debug_dir: Output directory.
        image_bgr: Background image; it is copied before drawing.
        seeds: Positive prompt points as (x, y).
        negatives: Negative prompt points as (x, y).
        candidate_masks: Candidate masks; ``None`` and empty masks are skipped.
        scored: Dicts providing ``corners``, ``score`` and ``aspect_ratio``.
        best: Optional dict providing ``mask``, ``corners``, ``score``,
            ``aspect_ratio`` and ``rectangularity``.
    """
    out_dir = Path(debug_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    def _as_point(pt) -> Tuple[int, int]:
        # OpenCV point arguments are safest as plain Python ints; tuples of
        # NumPy scalars are rejected by some OpenCV builds.
        return (int(pt[0]), int(pt[1]))

    def _as_corners(raw) -> np.ndarray:
        # Accept both (N, 2) and (N, 1, 2) layouts and hand int32 to OpenCV.
        return np.asarray(raw).astype(np.int32).reshape(-1, 2)

    def _draw_marker(canvas: np.ndarray, pt, fill: Tuple[int, int, int]) -> None:
        center = _as_point(pt)
        cv2.circle(canvas, center, 20, fill, -1)
        cv2.circle(canvas, center, 20, (0, 0, 0), 3)

    # 01: prompt points on the image
    pts_img = image_bgr.copy()
    for seed in seeds:
        _draw_marker(pts_img, seed, (0, 255, 0))
    for neg in negatives:
        _draw_marker(pts_img, neg, (0, 0, 255))
    cv2.imwrite(str(out_dir / "01_prompt_points.png"), pts_img)

    # 02: all candidate masks overlaid (one colour per mask)
    overlay = image_bgr.copy()
    rng = np.random.default_rng(7)
    for m in candidate_masks:
        if m is None or m.sum() == 0:
            continue
        # Boolean indexing is intended; a uint8 mask would otherwise be
        # interpreted as integer (fancy) indices.
        region = m.astype(bool)
        color = rng.integers(64, 255, size=3).tolist()
        overlay[region] = (0.5 * overlay[region] + 0.5 * np.array(color)).astype(np.uint8)
    cv2.imwrite(str(out_dir / "02_candidate_masks.png"), overlay)

    # 03: scored candidates
    cand_img = image_bgr.copy()
    for s in scored:
        corners = _as_corners(s["corners"])
        cv2.polylines(cand_img, [corners], True, (0, 255, 0), 3)
        cv2.putText(
            cand_img,
            f"{s['score']:.2f} ar={s['aspect_ratio']:.3f}",
            _as_point(corners[0]),
            cv2.FONT_HERSHEY_SIMPLEX,
            1.2,
            (0, 255, 0),
            3,
            cv2.LINE_AA,
        )
    cv2.imwrite(str(out_dir / "03_scored.png"), cand_img)

    # 04: final selection (only when a winner exists)
    if best is not None:
        final = image_bgr.copy()
        mask_u8 = best["mask"].astype(np.uint8) * 255
        tint = np.zeros_like(final)
        tint[:, :, 1] = mask_u8  # green channel only
        final = cv2.addWeighted(final, 1.0, tint, 0.35, 0)
        corners = _as_corners(best["corners"])
        cv2.polylines(final, [corners], True, (0, 255, 0), 4)
        for pt in corners:
            cv2.circle(final, _as_point(pt), 10, (0, 0, 255), -1)
        label = (
            f"SAM-prompt card score={best['score']:.3f} "
            f"ar={best['aspect_ratio']:.3f} rect={best['rectangularity']:.3f}"
        )
        # Thick white pass first, then a thin green pass, so the text stays
        # readable on any background.
        cv2.putText(final, label, (30, 60), cv2.FONT_HERSHEY_SIMPLEX, 1.1,
                    (255, 255, 255), 5, cv2.LINE_AA)
        cv2.putText(final, label, (30, 60), cv2.FONT_HERSHEY_SIMPLEX, 1.1,
                    (0, 255, 0), 2, cv2.LINE_AA)
        cv2.imwrite(str(out_dir / "04_final_selection.png"), final)
|
| 439 |
+
|
| 440 |
+
|
| 441 |
+
def detect_credit_card_sam_prompt(
    image: np.ndarray,
    seed_points: List[Tuple[int, int]],
    negative_points: Optional[List[Tuple[int, int]]] = None,
    debug_dir: Optional[str] = None,
) -> Optional[Dict[str, Any]]:
    """Prompt-based SAM credit card detection.

    Every seed point becomes one prompt (that seed as the positive point plus
    all negatives). All prompts are sent through the model in a single call
    with ``multimask_output=True``. Each returned mask is scored with
    `_score_card_mask` in the downscaled image space, and only the
    highest-scoring survivor is upscaled to the resolution of ``image``.

    Args:
        image: BGR image in the canonical orientation.
        seed_points: (x, y) positive-point candidates in ``image`` pixels.
            Each one is tried independently.
        negative_points: Optional (x, y) negative points in ``image`` pixels,
            attached to every seed's prompt (e.g. the palm center to steer
            SAM off the hand).
        debug_dir: Optional directory for debug visualizations. They are
            rendered in the downscaled space to keep debug runs cheap.

    Returns:
        Dict with keys ``corners``, ``contour``, ``confidence``, ``width_px``,
        ``height_px``, ``aspect_ratio``, ``mask`` (bool HxW in ``image``
        coordinates) and ``mask_source``; or None when ``seed_points`` is
        empty or no candidate mask passes `_score_card_mask`.
    """
    import torch
    from PIL import Image as PILImage

    if not seed_points:
        print(" SAM-prompt: no seed points provided")
        return None

    h, w = image.shape[:2]

    scaled_bgr, scale_back = _downscale_prompt(image)
    scaled_rgb = cv2.cvtColor(scaled_bgr, cv2.COLOR_BGR2RGB)
    pil = PILImage.fromarray(scaled_rgb)
    scale_down = 1.0 / scale_back  # original -> scaled

    def _to_scaled(pts: List[Tuple[int, int]]) -> List[List[int]]:
        return [[int(round(px * scale_down)), int(round(py * scale_down))] for px, py in pts]

    seeds_scaled = _to_scaled(seed_points)
    negatives_scaled = _to_scaled(negative_points) if negative_points else []

    # Build one prompt per seed; each prompt carries (1 positive + all negatives)
    #   input_points shape: [batch=1, num_prompts, points_per_prompt, 2]
    #   input_labels shape: [batch=1, num_prompts, points_per_prompt]
    input_points = [[[seed] + negatives_scaled for seed in seeds_scaled]]
    input_labels = [[[1] + [0] * len(negatives_scaled) for _ in seeds_scaled]]

    model, processor = get_sam2()

    t0 = time.time()
    inputs = processor(
        images=pil,
        input_points=input_points,
        input_labels=input_labels,
        return_tensors="pt",
    )
    with torch.inference_mode():
        # multimask_output=True gives 3 masks per seed (small / medium / large
        # disambiguation of the prompt). SAM's single-best IoU mask sometimes
        # latches onto a sub-region or a nearby distractor while one of the
        # other two candidates is the full card, so all of them are scored.
        outputs = model(**inputs, multimask_output=True)

    # Score masks in the downscaled space. Only the single winner is upscaled
    # afterwards, which avoids one full-resolution resize per candidate.
    scaled_h = inputs["original_sizes"][0][0].item()
    scaled_w = inputs["original_sizes"][0][1].item()
    scaled_area = float(scaled_h * scaled_w)

    masks_list = processor.post_process_masks(
        outputs.pred_masks.cpu(),
        inputs["original_sizes"],
        mask_threshold=0.0,
    )
    masks_tensor = masks_list[0]  # (num_prompts, num_candidates, H_s, W_s)
    iou_scores = outputs.iou_scores.cpu().numpy()[0]
    infer_time = time.time() - t0

    scored: List[Dict[str, Any]] = []
    scaled_candidate_masks: List[np.ndarray] = []
    for prompt_idx in range(masks_tensor.shape[0]):
        for cand_idx in range(masks_tensor.shape[1]):
            mask_scaled = masks_tensor[prompt_idx, cand_idx].numpy().astype(bool)
            scaled_candidate_masks.append(mask_scaled)
            result = _score_card_mask(mask_scaled, scaled_area)
            if result is not None:
                result["seed_idx"] = prompt_idx
                result["cand_idx"] = cand_idx
                result["iou_score"] = float(iou_scores[prompt_idx, cand_idx])
                result["mask_scaled"] = mask_scaled
                scored.append(result)

    scored.sort(key=lambda d: d["score"], reverse=True)
    # The winner stays in scaled space and is never mutated: it is the same
    # dict object as scored[0], so rescaling it in place would corrupt the
    # per-candidate debug overlay below.
    best_scaled = scored[0] if scored else None

    print(
        f" SAM-prompt: {len(seed_points)} seeds, "
        f"{masks_tensor.shape[0] * masks_tensor.shape[1]} candidates, "
        f"{len(scored)} passed filter, inference={infer_time:.2f}s"
    )

    if debug_dir:
        # Render debug overlays in the downscaled space; the images are for
        # human inspection only. Everything passed here (prompts, masks,
        # corners) is in scaled coordinates.
        viz_scored = [dict(s, mask=s["mask_scaled"]) for s in scored]
        viz_best = viz_scored[0] if viz_scored else None
        _save_prompt_debug(
            debug_dir,
            scaled_bgr,
            [(sx, sy) for sx, sy in seeds_scaled],
            [(nx, ny) for nx, ny in negatives_scaled],
            scaled_candidate_masks,
            viz_scored,
            viz_best,
        )

    if best_scaled is None:
        return None

    # Upscale only the winning mask + geometry to full resolution.
    mask_scaled_best = best_scaled["mask_scaled"]
    if mask_scaled_best.shape != (h, w):
        mask_full = cv2.resize(
            mask_scaled_best.astype(np.uint8), (w, h),
            interpolation=cv2.INTER_NEAREST,
        ).astype(bool)
    else:
        mask_full = mask_scaled_best
    corners_full = best_scaled["corners"] * scale_back
    width_full = best_scaled["width"] * scale_back
    height_full = best_scaled["height"] * scale_back

    print(
        f" SAM-prompt card: score={best_scaled['score']:.3f}, "
        f"aspect={best_scaled['aspect_ratio']:.3f}, rect={best_scaled['rectangularity']:.3f}, "
        f"{width_full:.0f}x{height_full:.0f}px (seed {best_scaled['seed_idx']})"
    )

    return {
        "corners": corners_full,
        "contour": corners_full,
        "confidence": float(best_scaled["score"]),
        "width_px": float(width_full),
        "height_px": float(height_full),
        "aspect_ratio": float(best_scaled["aspect_ratio"]),
        "mask": mask_full,  # bool HxW, canonical-image coords
        "mask_source": "sam_prompt",
    }
|
src/sam_hand_segmentation.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SAM 2.1-based hand segmentation.
|
| 3 |
+
|
| 4 |
+
Produces a pixel-accurate hand mask using Meta's Segment Anything 2.1
|
| 5 |
+
(Hiera Tiny) via HuggingFace transformers, seeded by a positive point
|
| 6 |
+
prompt at the palm center (derived from MediaPipe landmarks). Optional
|
| 7 |
+
negative points can steer SAM away from the credit card.
|
| 8 |
+
|
| 9 |
+
This replaces the synthetic convex-hull "mask" produced by
|
| 10 |
+
`finger_segmentation._create_hand_mask()`, which is built from the
|
| 11 |
+
21 hand landmarks and does not follow the true hand contour.
|
| 12 |
+
|
| 13 |
+
Prompt-based inference: ~0.6s per call on CPU (vs ~18s for AMG).
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
|
| 18 |
+
import time
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
from typing import List, Optional, Tuple
|
| 21 |
+
|
| 22 |
+
import cv2
|
| 23 |
+
import numpy as np
|
| 24 |
+
|
| 25 |
+
from .sam_backend import INFERENCE_MAX_SIDE, get_sam2
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _downscale(image_bgr: np.ndarray) -> Tuple[np.ndarray, float]:
    """Downscale so the long side is at most INFERENCE_MAX_SIDE.

    An image that already fits is returned as-is (same object, no copy).

    Args:
        image_bgr: Image array; only ``shape[:2]`` (height, width) is read.

    Returns:
        ``(scaled, scale_back)``. ``scale_back`` is the factor to multiply
        scaled coords by to get original coords (``1.0`` when no resize
        happened).
    """
    h, w = image_bgr.shape[:2]
    long_side = max(h, w)
    if long_side <= INFERENCE_MAX_SIDE:
        return image_bgr, 1.0

    scale = INFERENCE_MAX_SIDE / long_side
    # Clamp to at least 1 px: for extremely elongated inputs the short side
    # would otherwise round to 0, and cv2.resize rejects an empty target size.
    new_w = max(1, int(round(w * scale)))
    new_h = max(1, int(round(h * scale)))
    scaled = cv2.resize(image_bgr, (new_w, new_h), interpolation=cv2.INTER_AREA)
    return scaled, 1.0 / scale
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def segment_hand_sam(
    image_bgr: np.ndarray,
    palm_xy: Tuple[int, int],
    negative_points: Optional[List[Tuple[int, int]]] = None,
    debug_dir: Optional[str] = None,
) -> Optional[np.ndarray]:
    """Return a bool hand mask (H x W) produced by a point-prompted SAM pass.

    The image is downscaled with `_downscale`, and a single prompt (palm
    point as positive, plus any negatives) is run with
    ``multimask_output=True``. The candidate with the highest predicted IoU
    score is kept and resized back to the input resolution with
    nearest-neighbour interpolation.

    Args:
        image_bgr: BGR image in the canonical orientation.
        palm_xy: (x, y) pixel coordinates of the palm center (positive prompt).
        negative_points: Optional list of (x, y) points to steer SAM away from
            non-hand regions (e.g., credit card center).
        debug_dir: Optional directory; when set, ``sam_hand_mask.png`` and
            ``sam_hand_overlay.png`` are written there at full resolution.

    Returns:
        Bool mask with shape ``image_bgr.shape[:2]``. No code path in this
        function returns None; failures in the model or in OpenCV surface as
        exceptions. The Optional return annotation is kept so existing
        callers' None checks remain valid.
    """
    import torch
    from PIL import Image as PILImage

    h_full, w_full = image_bgr.shape[:2]

    scaled_bgr, scale_back = _downscale(image_bgr)
    scaled_rgb = cv2.cvtColor(scaled_bgr, cv2.COLOR_BGR2RGB)
    pil = PILImage.fromarray(scaled_rgb)

    # Map prompt points into the scaled image space
    scale_down = 1.0 / scale_back  # original -> scaled
    palm_scaled = (int(round(palm_xy[0] * scale_down)), int(round(palm_xy[1] * scale_down)))
    prompt_points = [list(palm_scaled)]
    prompt_labels = [1]
    if negative_points:
        for nx, ny in negative_points:
            prompt_points.append([int(round(nx * scale_down)), int(round(ny * scale_down))])
            prompt_labels.append(0)

    model, processor = get_sam2()

    t0 = time.time()
    inputs = processor(
        images=pil,
        input_points=[[prompt_points]],
        input_labels=[[prompt_labels]],
        return_tensors="pt",
    )
    with torch.inference_mode():
        outputs = model(**inputs, multimask_output=True)

    masks = processor.post_process_masks(
        outputs.pred_masks.cpu(),
        inputs["original_sizes"],
        mask_threshold=0.0,
    )[0][0]  # (num_candidates, H_scaled, W_scaled)
    scores = outputs.iou_scores.cpu().numpy()[0, 0]
    best_idx = int(np.argmax(scores))
    mask_scaled = masks[best_idx].numpy().astype(bool)
    best_score = float(scores[best_idx])
    infer_time = time.time() - t0

    # Upscale back to original resolution
    if mask_scaled.shape != (h_full, w_full):
        mask_full = cv2.resize(
            mask_scaled.astype(np.uint8),
            (w_full, h_full),
            interpolation=cv2.INTER_NEAREST,
        ).astype(bool)
    else:
        mask_full = mask_scaled

    print(
        f" SAM hand mask: score={best_score:.3f} time={infer_time:.1f}s "
        f"area={int(mask_full.sum())}px"
    )

    if debug_dir:
        out_dir = Path(debug_dir)
        out_dir.mkdir(parents=True, exist_ok=True)

        # Raw mask (built once, reused for contour extraction below)
        mask_u8 = mask_full.astype(np.uint8) * 255
        cv2.imwrite(str(out_dir / "sam_hand_mask.png"), mask_u8)

        # Overlay with prompt points
        overlay = image_bgr.copy()
        tint = np.zeros_like(overlay)
        tint[mask_full] = (0, 255, 255)
        overlay = cv2.addWeighted(overlay, 1.0, tint, 0.35, 0)

        contours, _ = cv2.findContours(
            mask_u8, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE
        )
        cv2.drawContours(overlay, contours, -1, (0, 255, 255), 3, cv2.LINE_AA)

        # Coerce to plain ints, matching how the negative points are handled:
        # OpenCV point arguments may not accept NumPy scalars or floats.
        palm_pt = (int(palm_xy[0]), int(palm_xy[1]))
        cv2.circle(overlay, palm_pt, 20, (0, 255, 0), -1)
        cv2.circle(overlay, palm_pt, 20, (0, 0, 0), 3)
        if negative_points:
            for nx, ny in negative_points:
                neg_pt = (int(nx), int(ny))
                cv2.circle(overlay, neg_pt, 20, (0, 0, 255), -1)
                cv2.circle(overlay, neg_pt, 20, (0, 0, 0), 3)

        label = f"SAM hand score={best_score:.2f} {infer_time:.1f}s"
        # Thick white pass, then thin coloured pass, for legibility.
        cv2.putText(overlay, label, (30, 60), cv2.FONT_HERSHEY_SIMPLEX, 1.1,
                    (255, 255, 255), 5, cv2.LINE_AA)
        cv2.putText(overlay, label, (30, 60), cv2.FONT_HERSHEY_SIMPLEX, 1.1,
                    (0, 255, 255), 2, cv2.LINE_AA)
        cv2.imwrite(str(out_dir / "sam_hand_overlay.png"), overlay)

    return mask_full
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def palm_center_from_landmarks(landmarks_px: np.ndarray) -> Tuple[int, int]:
    """Compute the palm-center pixel from a hand-landmark array.

    The center is the average of the first two columns (x, y) over rows
    0, 5, 9, 13 and 17 -- the wrist and the four MCP joints in MediaPipe's
    21-landmark layout -- with each coordinate rounded to the nearest int.

    Args:
        landmarks_px: 2-D array indexed as ``[landmark, coordinate]`` with at
            least 18 rows and 2 columns.

    Returns:
        ``(x, y)`` as Python ints.
    """
    palm_rows = landmarks_px[[0, 5, 9, 13, 17], :2]
    mean_xy = np.mean(palm_rows, axis=0)
    x_px = int(round(mean_xy[0]))
    y_px = int(round(mean_xy[1]))
    return (x_px, y_px)
|
web_demo/README.md
CHANGED
|
@@ -25,4 +25,4 @@ Open `http://localhost:8000`.
|
|
| 25 |
- Debug overlay auto-generated per request
|
| 26 |
- Default guided sample image is at `web_demo/static/examples/default_sample.jpg`
|
| 27 |
- `Start Measurement` uses the default sample image when no upload is selected
|
| 28 |
-
- Web demo enforces
|
|
|
|
| 25 |
- Debug overlay auto-generated per request
|
| 26 |
- Default guided sample image is at `web_demo/static/examples/default_sample.jpg`
|
| 27 |
- `Start Measurement` uses the default sample image when no upload is selected
|
| 28 |
+
- Web demo enforces SAM-mask boundary edge detection only (`edge_method=mask`)
|
web_demo/app.py
CHANGED
|
@@ -36,7 +36,9 @@ RESULTS_DIR = APP_ROOT / "results"
|
|
| 36 |
DEFAULT_SAMPLE_PATH = APP_ROOT / "static" / "examples" / "default_sample.jpg"
|
| 37 |
DEFAULT_SAMPLE_URL = "/static/examples/default_sample.jpg"
|
| 38 |
ALLOWED_EXTENSIONS = {".jpg", ".jpeg", ".png"}
|
| 39 |
-
DEMO_EDGE_METHOD = "
|
|
|
|
|
|
|
| 40 |
|
| 41 |
app = Flask(__name__)
|
| 42 |
|
|
@@ -233,6 +235,8 @@ def _run_measurement(
|
|
| 233 |
image=image,
|
| 234 |
finger_index=finger_index,
|
| 235 |
edge_method=DEMO_EDGE_METHOD,
|
|
|
|
|
|
|
| 236 |
result_png_path=str(result_png_path),
|
| 237 |
save_debug=False,
|
| 238 |
ring_model=ring_model,
|
|
@@ -313,6 +317,8 @@ def _run_multi_measurement(
|
|
| 313 |
result = measure_multi_finger(
|
| 314 |
image=image,
|
| 315 |
edge_method=DEMO_EDGE_METHOD,
|
|
|
|
|
|
|
| 316 |
result_png_path=str(result_png_path),
|
| 317 |
save_debug=False,
|
| 318 |
no_calibration=False,
|
|
|
|
| 36 |
DEFAULT_SAMPLE_PATH = APP_ROOT / "static" / "examples" / "default_sample.jpg"
|
| 37 |
DEFAULT_SAMPLE_URL = "/static/examples/default_sample.jpg"
|
| 38 |
ALLOWED_EXTENSIONS = {".jpg", ".jpeg", ".png"}
|
| 39 |
+
DEMO_EDGE_METHOD = "mask"
|
| 40 |
+
DEMO_CARD_METHOD = "sam"
|
| 41 |
+
DEMO_HAND_MASK_METHOD = "sam"
|
| 42 |
|
| 43 |
app = Flask(__name__)
|
| 44 |
|
|
|
|
| 235 |
image=image,
|
| 236 |
finger_index=finger_index,
|
| 237 |
edge_method=DEMO_EDGE_METHOD,
|
| 238 |
+
card_method=DEMO_CARD_METHOD,
|
| 239 |
+
hand_mask_method=DEMO_HAND_MASK_METHOD,
|
| 240 |
result_png_path=str(result_png_path),
|
| 241 |
save_debug=False,
|
| 242 |
ring_model=ring_model,
|
|
|
|
| 317 |
result = measure_multi_finger(
|
| 318 |
image=image,
|
| 319 |
edge_method=DEMO_EDGE_METHOD,
|
| 320 |
+
card_method=DEMO_CARD_METHOD,
|
| 321 |
+
hand_mask_method=DEMO_HAND_MASK_METHOD,
|
| 322 |
result_png_path=str(result_png_path),
|
| 323 |
save_debug=False,
|
| 324 |
no_calibration=False,
|
web_demo/static/app.js
CHANGED
|
@@ -112,7 +112,7 @@ const buildMeasureSettings = () => {
|
|
| 112 |
const aiOn = aiToggle ? (aiToggle.type === "checkbox" ? aiToggle.checked : true) : false;
|
| 113 |
return {
|
| 114 |
finger_index: fingerSelect ? fingerSelect.value : "index",
|
| 115 |
-
edge_method: "
|
| 116 |
mode: mode,
|
| 117 |
ring_model: ringModel,
|
| 118 |
ai_explain: aiOn ? "1" : "0",
|
|
|
|
| 112 |
const aiOn = aiToggle ? (aiToggle.type === "checkbox" ? aiToggle.checked : true) : false;
|
| 113 |
return {
|
| 114 |
finger_index: fingerSelect ? fingerSelect.value : "index",
|
| 115 |
+
edge_method: "mask",
|
| 116 |
mode: mode,
|
| 117 |
ring_model: ringModel,
|
| 118 |
ai_explain: aiOn ? "1" : "0",
|
web_demo/supabase_client.py
CHANGED
|
@@ -19,12 +19,24 @@ _initialized = False
|
|
| 19 |
|
| 20 |
|
| 21 |
def _get_client():
|
| 22 |
-
"""Lazy-init Supabase client. Returns None if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
global _client, _initialized
|
| 24 |
if _initialized:
|
| 25 |
return _client
|
| 26 |
_initialized = True
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
url = os.environ.get("SUPABASE_URL", "").strip()
|
| 29 |
key = os.environ.get("SUPABASE_SERVICE_KEY", "").strip()
|
| 30 |
if not url or not key:
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
def _get_client():
|
| 22 |
+
"""Lazy-init Supabase client. Returns None if persistence is disabled.
|
| 23 |
+
|
| 24 |
+
Persistence is disabled when either:
|
| 25 |
+
- SUPABASE_URL / SUPABASE_SERVICE_KEY is missing, or
|
| 26 |
+
- RING_DISABLE_SUPABASE is set to a truthy value (explicit opt-out, so
|
| 27 |
+
local dev sessions don't upload photos + result PNGs to the real
|
| 28 |
+
bucket on every request).
|
| 29 |
+
"""
|
| 30 |
global _client, _initialized
|
| 31 |
if _initialized:
|
| 32 |
return _client
|
| 33 |
_initialized = True
|
| 34 |
|
| 35 |
+
disable = os.environ.get("RING_DISABLE_SUPABASE", "").strip().lower()
|
| 36 |
+
if disable in ("1", "true", "yes", "on"):
|
| 37 |
+
logger.info("RING_DISABLE_SUPABASE set — persistence disabled")
|
| 38 |
+
return None
|
| 39 |
+
|
| 40 |
url = os.environ.get("SUPABASE_URL", "").strip()
|
| 41 |
key = os.environ.get("SUPABASE_SERVICE_KEY", "").strip()
|
| 42 |
if not url or not key:
|