Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files
src/sam_card_detection.py
CHANGED
|
@@ -48,12 +48,24 @@ MAX_HULL_HAND_FILL_RATIO = 0.05
|
|
| 48 |
# aspect ratio purely by accident. A real credit card held alongside a hand
|
| 49 |
# is ~5-15% of the frame; 25% is already 2× the realistic maximum.
|
| 50 |
SAM_MAX_CARD_AREA_RATIO = 0.25
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
|
| 53 |
def _score_card_mask(
|
| 54 |
mask: np.ndarray,
|
| 55 |
image_area: float,
|
| 56 |
hand_mask: Optional[np.ndarray] = None,
|
|
|
|
|
|
|
| 57 |
) -> Optional[Dict[str, Any]]:
|
| 58 |
"""Score a candidate mask for being a credit card.
|
| 59 |
|
|
@@ -77,12 +89,21 @@ def _score_card_mask(
|
|
| 77 |
# x86 vs Apple Silicon can bump `contour_area / rect_area` below 0.90 purely
|
| 78 |
# from Torch CPU activation drift). Non-card shapes stay non-rectangular
|
| 79 |
# under their hull, so this does not create false positives.
|
| 80 |
-
|
| 81 |
-
contour = cv2.convexHull(
|
| 82 |
contour_area = cv2.contourArea(contour)
|
| 83 |
if contour_area <= 0:
|
| 84 |
return None
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
# Reject candidates whose convex hull engulfs the hand. When SAM is
|
| 87 |
# prompted to segment the background paper, it returns the paper mask
|
| 88 |
# with the hand carved *out* of it — so raw AND(mask, hand) is ~0
|
|
@@ -116,16 +137,43 @@ def _score_card_mask(
|
|
| 116 |
if width <= 0 or height <= 0:
|
| 117 |
return None
|
| 118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
aspect_ratio = max(width, height) / min(width, height)
|
| 120 |
ratio_diff = abs(aspect_ratio - CARD_ASPECT_RATIO) / CARD_ASPECT_RATIO
|
| 121 |
if ratio_diff > ASPECT_RATIO_TOLERANCE:
|
| 122 |
return None
|
| 123 |
|
| 124 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
ratio_score = 1.0 - ratio_diff / ASPECT_RATIO_TOLERANCE
|
| 126 |
rect_score = (rectangularity - MIN_RECTANGULARITY) / (1.0 - MIN_RECTANGULARITY)
|
| 127 |
area_score = min(area_ratio / 0.1, 1.0) # caps at 10% of image area
|
| 128 |
-
score =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
return {
|
| 131 |
"corners": corners,
|
|
@@ -400,12 +448,20 @@ def detect_credit_card_sam_prompt(
|
|
| 400 |
for cand_idx in range(masks_tensor.shape[1]):
|
| 401 |
mask_scaled = masks_tensor[prompt_idx, cand_idx].numpy().astype(bool)
|
| 402 |
scaled_candidate_masks.append(mask_scaled)
|
| 403 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
if result is not None:
|
| 405 |
result["seed_idx"] = prompt_idx
|
| 406 |
result["cand_idx"] = cand_idx
|
| 407 |
-
result["iou_score"] =
|
| 408 |
-
result["
|
|
|
|
|
|
|
|
|
|
| 409 |
scored.append(result)
|
| 410 |
|
| 411 |
scored.sort(key=lambda d: d["score"], reverse=True)
|
|
|
|
| 48 |
# aspect ratio purely by accident. A real credit card held alongside a hand
|
| 49 |
# is ~5-15% of the frame; 25% is already 2× the realistic maximum.
|
| 50 |
SAM_MAX_CARD_AREA_RATIO = 0.25
|
| 51 |
+
# Reject candidates whose longer side spans more of the image short side
|
| 52 |
+
# than any real card photo plausibly would. This catches the distinctive
|
| 53 |
+
# SAM failure where a single-prompt mask grabs the entire background paper
|
| 54 |
+
# / tabletop: the candidate is long and thin (so its mask area sneaks
|
| 55 |
+
# under SAM_MAX_CARD_AREA_RATIO) but its bounding rectangle stretches
|
| 56 |
+
# across nearly the full image short side (framing ratio ~0.99). Threshold
|
| 57 |
+
# picked from doc/report/framing_ratio_survey.md: max observed in 47 KOL
|
| 58 |
+
# successes is 0.532, max in calibration is 0.486; 0.70 leaves ≥30% margin
|
| 59 |
+
# above legitimate framing while sitting well below the ~1.0 failure mode.
|
| 60 |
+
MAX_CARD_FRAMING_RATIO = 0.70
|
| 61 |
|
| 62 |
|
| 63 |
def _score_card_mask(
|
| 64 |
mask: np.ndarray,
|
| 65 |
image_area: float,
|
| 66 |
hand_mask: Optional[np.ndarray] = None,
|
| 67 |
+
image_short_side: float = 0.0,
|
| 68 |
+
iou_score: float = 0.0,
|
| 69 |
) -> Optional[Dict[str, Any]]:
|
| 70 |
"""Score a candidate mask for being a credit card.
|
| 71 |
|
|
|
|
| 89 |
# x86 vs Apple Silicon can bump `contour_area / rect_area` below 0.90 purely
|
| 90 |
# from Torch CPU activation drift). Non-card shapes stay non-rectangular
|
| 91 |
# under their hull, so this does not create false positives.
|
| 92 |
+
largest_contour = max(contours, key=cv2.contourArea)
|
| 93 |
+
contour = cv2.convexHull(largest_contour)
|
| 94 |
contour_area = cv2.contourArea(contour)
|
| 95 |
if contour_area <= 0:
|
| 96 |
return None
|
| 97 |
|
| 98 |
+
# Replace the raw multi-blob SAM mask with just the largest connected
|
| 99 |
+
# component. The card prompt with multimask_output=True occasionally lassos
|
| 100 |
+
# background paper between fingers as part of the same candidate; those
|
| 101 |
+
# blobs pass scoring (we only check the largest contour) but pollute every
|
| 102 |
+
# downstream consumer of `result["mask"]` (debug overlays, the result PNG).
|
| 103 |
+
clean_mask_u8 = np.zeros_like(mask_u8)
|
| 104 |
+
cv2.drawContours(clean_mask_u8, [largest_contour], -1, 255, thickness=cv2.FILLED)
|
| 105 |
+
mask = clean_mask_u8.astype(bool)
|
| 106 |
+
|
| 107 |
# Reject candidates whose convex hull engulfs the hand. When SAM is
|
| 108 |
# prompted to segment the background paper, it returns the paper mask
|
| 109 |
# with the hand carved *out* of it — so raw AND(mask, hand) is ~0
|
|
|
|
| 137 |
if width <= 0 or height <= 0:
|
| 138 |
return None
|
| 139 |
|
| 140 |
+
# Reject long-thin SAM false positives that span ~the entire image short
|
| 141 |
+
# side. These slip past SAM_MAX_CARD_AREA_RATIO because their pixel
|
| 142 |
+
# count is modest (the mask is hollow / not solidly filled), but their
|
| 143 |
+
# bounding rectangle gives them away.
|
| 144 |
+
if image_short_side > 0:
|
| 145 |
+
framing_ratio = max(width, height) / image_short_side
|
| 146 |
+
if framing_ratio > MAX_CARD_FRAMING_RATIO:
|
| 147 |
+
return None
|
| 148 |
+
|
| 149 |
aspect_ratio = max(width, height) / min(width, height)
|
| 150 |
ratio_diff = abs(aspect_ratio - CARD_ASPECT_RATIO) / CARD_ASPECT_RATIO
|
| 151 |
if ratio_diff > ASPECT_RATIO_TOLERANCE:
|
| 152 |
return None
|
| 153 |
|
| 154 |
+
# Score components — picking weights here is delicate because real
|
| 155 |
+
# photos have *perspective foreshortening* that pulls the apparent card
|
| 156 |
+
# aspect away from the flat-card ideal of 1.586. A mask that bleeds
|
| 157 |
+
# extra background paper onto the short edge can pull aspect *closer*
|
| 158 |
+
# to the ideal than a tight mask, so over-weighting ratio_score selects
|
| 159 |
+
# fattened masks (the Brooklyn Shields case). The current split:
|
| 160 |
+
# * 0.3 ratio — kept as a soft preference but no longer dominant
|
| 161 |
+
# * 0.4 rect — primary signal; tight cards are near-perfect rectangles,
|
| 162 |
+
# fattened SAM masks always lose a little here
|
| 163 |
+
# * 0.1 area — small reward for "actually card-sized"
|
| 164 |
+
# * 0.2 iou — SAM's own segmentation confidence; stable across
|
| 165 |
+
# platforms because it's decoder-internal, not derived
|
| 166 |
+
# from per-pixel boundary noise. Acts as a second opinion
|
| 167 |
+
# that breaks the tie when geometry is too close to call.
|
| 168 |
ratio_score = 1.0 - ratio_diff / ASPECT_RATIO_TOLERANCE
|
| 169 |
rect_score = (rectangularity - MIN_RECTANGULARITY) / (1.0 - MIN_RECTANGULARITY)
|
| 170 |
area_score = min(area_ratio / 0.1, 1.0) # caps at 10% of image area
|
| 171 |
+
score = (
|
| 172 |
+
0.3 * ratio_score
|
| 173 |
+
+ 0.4 * rect_score
|
| 174 |
+
+ 0.1 * area_score
|
| 175 |
+
+ 0.2 * iou_score
|
| 176 |
+
)
|
| 177 |
|
| 178 |
return {
|
| 179 |
"corners": corners,
|
|
|
|
| 448 |
for cand_idx in range(masks_tensor.shape[1]):
|
| 449 |
mask_scaled = masks_tensor[prompt_idx, cand_idx].numpy().astype(bool)
|
| 450 |
scaled_candidate_masks.append(mask_scaled)
|
| 451 |
+
iou = float(iou_scores[prompt_idx, cand_idx])
|
| 452 |
+
result = _score_card_mask(
|
| 453 |
+
mask_scaled, scaled_area, hand_mask=hand_mask_scaled,
|
| 454 |
+
image_short_side=float(min(scaled_h, scaled_w)),
|
| 455 |
+
iou_score=iou,
|
| 456 |
+
)
|
| 457 |
if result is not None:
|
| 458 |
result["seed_idx"] = prompt_idx
|
| 459 |
result["cand_idx"] = cand_idx
|
| 460 |
+
result["iou_score"] = iou
|
| 461 |
+
# `result["mask"]` is the cleaned (largest-component) mask;
|
| 462 |
+
# keep that as the scaled-space mask so upscaling and debug
|
| 463 |
+
# rendering both see the cleaned version.
|
| 464 |
+
result["mask_scaled"] = result["mask"]
|
| 465 |
scored.append(result)
|
| 466 |
|
| 467 |
scored.sort(key=lambda d: d["score"], reverse=True)
|
web_demo/static/mobile/steps/guide.js
CHANGED
|
@@ -26,7 +26,7 @@ export default {
|
|
| 26 |
<ul class="capture-tips">
|
| 27 |
<li>Place a card of <strong>standard credit card size</strong> beside your hand.</li>
|
| 28 |
<li>Hold phone <strong>directly above hand</strong>, parallel to table.</li>
|
| 29 |
-
<li>Use <strong>plain
|
| 30 |
</ul>
|
| 31 |
|
| 32 |
<figure class="guide-example">
|
|
|
|
| 26 |
<ul class="capture-tips">
|
| 27 |
<li>Place a card of <strong>standard credit card size</strong> beside your hand.</li>
|
| 28 |
<li>Hold phone <strong>directly above hand</strong>, parallel to table.</li>
|
| 29 |
+
<li>Use <strong>flat, plain background</strong>, a sheet of paper works great.</li>
|
| 30 |
</ul>
|
| 31 |
|
| 32 |
<figure class="guide-example">
|