Spaces:

feng-x
/

ring-sizer

Running

App Files Files Community

feng-x committed 17 days ago

Commit

6cd4ed9

verified ·

1 Parent(s): 947c3d4

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

src/sam_card_detection.py +63 -7
web_demo/static/mobile/steps/guide.js +1 -1

src/sam_card_detection.py CHANGED Viewed

@@ -48,12 +48,24 @@ MAX_HULL_HAND_FILL_RATIO = 0.05
 # aspect ratio purely by accident. A real credit card held alongside a hand
 # is ~5-15% of the frame; 25% is already 2× the realistic maximum.
 SAM_MAX_CARD_AREA_RATIO = 0.25
 def _score_card_mask(
     mask: np.ndarray,
     image_area: float,
     hand_mask: Optional[np.ndarray] = None,
 ) -> Optional[Dict[str, Any]]:
     """Score a candidate mask for being a credit card.
@@ -77,12 +89,21 @@ def _score_card_mask(
     # x86 vs Apple Silicon can bump `contour_area / rect_area` below 0.90 purely
     # from Torch CPU activation drift). Non-card shapes stay non-rectangular
     # under their hull, so this does not create false positives.
-    contour = max(contours, key=cv2.contourArea)
-    contour = cv2.convexHull(contour)
     contour_area = cv2.contourArea(contour)
     if contour_area <= 0:
         return None
     # Reject candidates whose convex hull engulfs the hand. When SAM is
     # prompted to segment the background paper, it returns the paper mask
     # with the hand carved *out* of it — so raw AND(mask, hand) is ~0
@@ -116,16 +137,43 @@ def _score_card_mask(
     if width <= 0 or height <= 0:
         return None
     aspect_ratio = max(width, height) / min(width, height)
     ratio_diff = abs(aspect_ratio - CARD_ASPECT_RATIO) / CARD_ASPECT_RATIO
     if ratio_diff > ASPECT_RATIO_TOLERANCE:
         return None
-    # Higher score: better rectangularity + tighter aspect ratio match + meaningful size
     ratio_score = 1.0 - ratio_diff / ASPECT_RATIO_TOLERANCE
     rect_score = (rectangularity - MIN_RECTANGULARITY) / (1.0 - MIN_RECTANGULARITY)
     area_score = min(area_ratio / 0.1, 1.0)  # caps at 10% of image area
-    score = 0.4 * ratio_score + 0.4 * rect_score + 0.2 * area_score
     return {
         "corners": corners,
@@ -400,12 +448,20 @@ def detect_credit_card_sam_prompt(
         for cand_idx in range(masks_tensor.shape[1]):
             mask_scaled = masks_tensor[prompt_idx, cand_idx].numpy().astype(bool)
             scaled_candidate_masks.append(mask_scaled)
-            result = _score_card_mask(mask_scaled, scaled_area, hand_mask=hand_mask_scaled)
             if result is not None:
                 result["seed_idx"] = prompt_idx
                 result["cand_idx"] = cand_idx
-                result["iou_score"] = float(iou_scores[prompt_idx, cand_idx])
-                result["mask_scaled"] = mask_scaled
                 scored.append(result)
     scored.sort(key=lambda d: d["score"], reverse=True)

 # aspect ratio purely by accident. A real credit card held alongside a hand
 # is ~5-15% of the frame; 25% is already 2× the realistic maximum.
 SAM_MAX_CARD_AREA_RATIO = 0.25
+# Reject candidates whose longer side spans more of the image short side
+# than any real card photo plausibly would. This catches the distinctive
+# SAM failure where a single-prompt mask grabs the entire background paper
+# / tabletop: the candidate is long and thin (so its mask area sneaks
+# under SAM_MAX_CARD_AREA_RATIO) but its bounding rectangle stretches
+# across nearly the full image short side (framing ratio ~0.99). Threshold
+# picked from doc/report/framing_ratio_survey.md: max observed in 47 KOL
+# successes is 0.532, max in calibration is 0.486; 0.70 leaves ≥30% margin
+# above legitimate framing while sitting well below the ~1.0 failure mode.
+MAX_CARD_FRAMING_RATIO = 0.70
 def _score_card_mask(
     mask: np.ndarray,
     image_area: float,
     hand_mask: Optional[np.ndarray] = None,
+    image_short_side: float = 0.0,
+    iou_score: float = 0.0,
 ) -> Optional[Dict[str, Any]]:
     """Score a candidate mask for being a credit card.
     # x86 vs Apple Silicon can bump `contour_area / rect_area` below 0.90 purely
     # from Torch CPU activation drift). Non-card shapes stay non-rectangular
     # under their hull, so this does not create false positives.
+    largest_contour = max(contours, key=cv2.contourArea)
+    contour = cv2.convexHull(largest_contour)
     contour_area = cv2.contourArea(contour)
     if contour_area <= 0:
         return None
+    # Replace the raw multi-blob SAM mask with just the largest connected
+    # component. The card prompt with multimask_output=True occasionally lassos
+    # background paper between fingers as part of the same candidate; those
+    # blobs pass scoring (we only check the largest contour) but pollute every
+    # downstream consumer of `result["mask"]` (debug overlays, the result PNG).
+    clean_mask_u8 = np.zeros_like(mask_u8)
+    cv2.drawContours(clean_mask_u8, [largest_contour], -1, 255, thickness=cv2.FILLED)
+    mask = clean_mask_u8.astype(bool)
     # Reject candidates whose convex hull engulfs the hand. When SAM is
     # prompted to segment the background paper, it returns the paper mask
     # with the hand carved *out* of it — so raw AND(mask, hand) is ~0
     if width <= 0 or height <= 0:
         return None
+    # Reject long-thin SAM false positives that span ~the entire image short
+    # side. These slip past SAM_MAX_CARD_AREA_RATIO because their pixel
+    # count is modest (the mask is hollow / not solidly filled), but their
+    # bounding rectangle gives them away.
+    if image_short_side > 0:
+        framing_ratio = max(width, height) / image_short_side
+        if framing_ratio > MAX_CARD_FRAMING_RATIO:
+            return None
     aspect_ratio = max(width, height) / min(width, height)
     ratio_diff = abs(aspect_ratio - CARD_ASPECT_RATIO) / CARD_ASPECT_RATIO
     if ratio_diff > ASPECT_RATIO_TOLERANCE:
         return None
+    # Score components — picking weights here is delicate because real
+    # photos have *perspective foreshortening* that pulls the apparent card
+    # aspect away from the flat-card ideal of 1.586. A mask that bleeds
+    # extra background paper onto the short edge can pull aspect *closer*
+    # to the ideal than a tight mask, so over-weighting ratio_score selects
+    # fattened masks (the Brooklyn Shields case). The current split:
+    #   * 0.3 ratio  — kept as a soft preference but no longer dominant
+    #   * 0.4 rect   — primary signal; tight cards are near-perfect rectangles,
+    #                  fattened SAM masks always lose a little here
+    #   * 0.1 area   — small reward for "actually card-sized"
+    #   * 0.2 iou    — SAM's own segmentation confidence; stable across
+    #                  platforms because it's decoder-internal, not derived
+    #                  from per-pixel boundary noise. Acts as a second opinion
+    #                  that breaks the tie when geometry is too close to call.
     ratio_score = 1.0 - ratio_diff / ASPECT_RATIO_TOLERANCE
     rect_score = (rectangularity - MIN_RECTANGULARITY) / (1.0 - MIN_RECTANGULARITY)
     area_score = min(area_ratio / 0.1, 1.0)  # caps at 10% of image area
+    score = (
+        0.3 * ratio_score
+        + 0.4 * rect_score
+        + 0.1 * area_score
+        + 0.2 * iou_score
+    )
     return {
         "corners": corners,
         for cand_idx in range(masks_tensor.shape[1]):
             mask_scaled = masks_tensor[prompt_idx, cand_idx].numpy().astype(bool)
             scaled_candidate_masks.append(mask_scaled)
+            iou = float(iou_scores[prompt_idx, cand_idx])
+            result = _score_card_mask(
+                mask_scaled, scaled_area, hand_mask=hand_mask_scaled,
+                image_short_side=float(min(scaled_h, scaled_w)),
+                iou_score=iou,
+            )
             if result is not None:
                 result["seed_idx"] = prompt_idx
                 result["cand_idx"] = cand_idx
+                result["iou_score"] = iou
+                # `result["mask"]` is the cleaned (largest-component) mask;
+                # keep that as the scaled-space mask so upscaling and debug
+                # rendering both see the cleaned version.
+                result["mask_scaled"] = result["mask"]
                 scored.append(result)
     scored.sort(key=lambda d: d["score"], reverse=True)

web_demo/static/mobile/steps/guide.js CHANGED Viewed

@@ -26,7 +26,7 @@ export default {
             <ul class="capture-tips">
               <li>Place a card of <strong>standard credit card size</strong> beside your hand.</li>
               <li>Hold phone <strong>directly above hand</strong>, parallel to table.</li>
-              <li>Use <strong>plain white background</strong>, a sheet of paper works great.</li>
             </ul>
             <figure class="guide-example">

             <ul class="capture-tips">
               <li>Place a card of <strong>standard credit card size</strong> beside your hand.</li>
               <li>Hold phone <strong>directly above hand</strong>, parallel to table.</li>
+              <li>Use <strong>flat, plain background</strong>, a sheet of paper works great.</li>
             </ul>
             <figure class="guide-example">