Spaces:

contacthamza91
/

SAM_three_UI

Sleeping

App Files Files Community

AI Agent committed on Mar 31

Commit

488f9ce

1 Parent(s): a387aca

Revert to SAM 3 mask pipeline with sigmoid logits + edge-only AA upscaler, remove rembg

Browse files

Files changed (2) hide show

app.py +78 -83
requirements.txt +0 -2

app.py CHANGED Viewed

@@ -7,13 +7,6 @@ import os
 import io
 import fitz  # PyMuPDF
-# Fix: HF Spaces sets OMP_NUM_THREADS to "3500m" which crashes onnxruntime/rembg
-omp_val = os.environ.get("OMP_NUM_THREADS", "")
-if not omp_val.isdigit():
-    os.environ["OMP_NUM_THREADS"] = "4"
-from rembg import remove as rembg_remove, new_session as rembg_new_session
 # ── UNCONDITIONAL BFloat16 → Float16 Patch for T4 Turing GPUs ────
 # CRITICAL: torch.cuda.is_bf16_supported() returns True on T4 because CUDA
 # can *emulate* bfloat16 in software, but the actual kernels crash on mixed
@@ -146,37 +139,53 @@ ASSET_LIBRARY_DIR = os.path.join(tempfile.gettempdir(), "sam3_library")
 os.makedirs(ASSET_LIBRARY_DIR, exist_ok=True)
 asset_counter = 0
-# Initialize rembg session (downloads model once, reuses for all requests)
-print("Loading background removal model...", flush=True)
-rembg_session = rembg_new_session(model_name="isnet-general-use")
-print("Background removal model loaded.", flush=True)
-def box_iou(b1, b2):
-    """IoU between two boxes [x0, y0, x1, y1]."""
-    x0 = max(b1[0], b2[0])
-    y0 = max(b1[1], b2[1])
-    x1 = min(b1[2], b2[2])
-    y1 = min(b1[3], b2[3])
-    inter = max(0, x1 - x0) * max(0, y1 - y0)
-    a1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
-    a2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
-    union = a1 + a2 - inter
-    return inter / union if union > 0 else 0.0
 def upscale_4x(rgba: np.ndarray) -> np.ndarray:
-    """4x Lanczos upscale with unsharp masking for crisp graphics."""
     h, w = rgba.shape[:2]
     new_w, new_h = w * 4, h * 4
-    # Upscale full RGBA together with Lanczos4
     upscaled = cv2.resize(rgba, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
-    # Unsharp mask on RGB only (preserve alpha from rembg)
     rgb = upscaled[:, :, :3]
     blurred = cv2.GaussianBlur(rgb, (0, 0), sigmaX=1.0)
     rgb_sharp = cv2.addWeighted(rgb, 1.5, blurred, -0.5, 0)
     upscaled[:, :, :3] = rgb_sharp
     return upscaled
 def extract_assets(input_image):
@@ -196,7 +205,7 @@ def extract_assets(input_image):
         print(f">>> Image size: {w}x{h}, area: {img_area}", flush=True)
         pil_img = Image.fromarray(orig_rgb)
-        all_boxes = []
         all_scores = []
         with torch.inference_mode():
@@ -210,97 +219,83 @@ def extract_assets(input_image):
                 masks = out["masks"]
                 scores = out["scores"]
-                boxes = out.get("boxes")
                 if masks is None or len(masks) == 0:
-                    print(f"  [{concept}] No detections", flush=True)
                     continue
                 if torch.is_tensor(masks): masks = masks.float().cpu().numpy()
                 if torch.is_tensor(scores): scores = scores.float().cpu().numpy()
-                if boxes is not None and torch.is_tensor(boxes): boxes = boxes.float().cpu().numpy()
-                print(f"  [{concept}] Found {len(masks)} detections, boxes: {boxes.shape if boxes is not None else 'None'}", flush=True)
                 for j in range(len(masks)):
-                    score = float(scores[j]) if scores.ndim > 0 else float(scores)
-                    # Get bounding box: prefer SAM 3's boxes, fallback to mask bbox
-                    if boxes is not None and j < len(boxes):
-                        box = boxes[j].flatten()
-                        # SAM 3 boxes format: get x0,y0,x1,y1
-                        if len(box) >= 4:
-                            x0, y0, x1, y1 = int(box[0]), int(box[1]), int(box[2]), int(box[3])
-                        else:
-                            continue
-                    else:
-                        # Fallback: derive box from mask
-                        m = masks[j]
-                        while m.ndim > 2: m = m[0]
-                        ys, xs = np.nonzero(m > 0.5)
-                        if len(ys) == 0: continue
-                        x0, y0, x1, y1 = int(xs.min()), int(ys.min()), int(xs.max()), int(ys.max())
-                    # Validate box
-                    box_w = x1 - x0
-                    box_h = y1 - y0
-                    box_area = box_w * box_h
-                    if score < 0.1 or box_area < 500 or box_area > img_area * 0.90:
-                        print(f"    det[{j}] SKIPPED: score={score:.4f}, area={box_area}", flush=True)
                         continue
-                    # Add padding (10% of box size)
-                    pad_x = max(10, int(box_w * 0.10))
-                    pad_y = max(10, int(box_h * 0.10))
-                    bx0 = max(0, x0 - pad_x)
-                    by0 = max(0, y0 - pad_y)
-                    bx1 = min(w, x1 + pad_x)
-                    by1 = min(h, y1 + pad_y)
-                    all_boxes.append([bx0, by0, bx1, by1])
                     all_scores.append(score)
-                    print(f"    det[{j}] KEPT: score={score:.4f}, box=[{bx0},{by0},{bx1},{by1}]", flush=True)
-        print(f">>> Total detections kept: {len(all_boxes)}", flush=True)
-        if not all_boxes:
             gr.Info("No assets found in this image. Try a different slide with more visual elements.")
-            print(">>> No detections passed filters, returning []", flush=True)
             return []
-        # Deduplicate by box IoU
-        order = sorted(range(len(all_boxes)), key=lambda i: all_scores[i], reverse=True)
         keep = []
         for i in order:
             dup = False
             for ki in keep:
-                if box_iou(all_boxes[i], all_boxes[ki]) > 0.5:
                     dup = True
                     break
             if not dup:
                 keep.append(i)
-        # For each kept box: crop → rembg → upscale → save
         results = []
         global asset_counter
         for idx, ki in enumerate(keep):
-            bx0, by0, bx1, by1 = all_boxes[ki]
-            crop_rgb = orig_rgb[by0:by1, bx0:bx1]
-            # Background removal with rembg (clean alpha matte)
-            crop_pil = Image.fromarray(crop_rgb)
-            rgba_pil = rembg_remove(crop_pil, session=rembg_session)
-            rgba_np = np.array(rgba_pil)
-            print(f"    crop[{idx}] rembg done: {rgba_np.shape}", flush=True)
-            # 4x upscale
-            rgba_np = upscale_4x(rgba_np)
-            # Save to library
             asset_counter += 1
             lib_path = os.path.join(ASSET_LIBRARY_DIR, f"asset_{asset_counter:04d}.png")
-            Image.fromarray(rgba_np, "RGBA").save(lib_path, format="PNG")
             results.append(lib_path)
         print(f">>> Returning {len(results)} assets (library total: {asset_counter})", flush=True)

 import io
 import fitz  # PyMuPDF
 # ── UNCONDITIONAL BFloat16 → Float16 Patch for T4 Turing GPUs ────
 # CRITICAL: torch.cuda.is_bf16_supported() returns True on T4 because CUDA
 # can *emulate* bfloat16 in software, but the actual kernels crash on mixed
 os.makedirs(ASSET_LIBRARY_DIR, exist_ok=True)
 asset_counter = 0
+def mask_iou(m1: np.ndarray, m2: np.ndarray) -> float:
+    """Return the intersection-over-union (IoU) of two masks.
+
+    uint8 masks are binarised with a ``> 128`` threshold. Arrays of any other
+    dtype are passed through unchanged, so the NumPy logical ops below treat
+    every non-zero element as foreground.
+
+    Args:
+        m1: First mask.
+        m2: Second mask; presumably the same shape as ``m1`` (no shape check
+            is done here -- confirm at callers).
+
+    Returns:
+        Foreground overlap divided by foreground union, or 0.0 when the
+        union is empty.
+    """
+    b1 = m1 > 128 if m1.dtype == np.uint8 else m1
+    b2 = m2 > 128 if m2.dtype == np.uint8 else m2
+    inter = np.logical_and(b1, b2).sum()
+    union = np.logical_or(b1, b2).sum()
+    # Two empty masks give union == 0; report 0.0 rather than dividing by zero.
+    return float(inter) / float(union) if union > 0 else 0.0
+def mask_to_crop(orig_rgb: np.ndarray, alpha: np.ndarray) -> np.ndarray:
+    """Attach ``alpha`` to ``orig_rgb`` and crop to the mask's padded bounding box.
+
+    No smoothing is applied here; the alpha values are copied as given.
+
+    Args:
+        orig_rgb: Source image; its first two dimensions define the canvas
+            size and its values fill the RGB channels.
+        alpha: Alpha channel written into channel 3. Presumably an (h, w)
+            uint8 array matching ``orig_rgb`` -- confirm at callers.
+
+    Returns:
+        A uint8 RGBA array cropped to the bounding box of pixels whose alpha
+        exceeds 10, grown by 6 px on each side and clamped to the image.
+        If no pixel exceeds that threshold, the full-size RGBA image is
+        returned uncropped.
+    """
+    h, w = orig_rgb.shape[:2]
+    rgba = np.zeros((h, w, 4), dtype=np.uint8)
+    rgba[:, :, :3] = orig_rgb
+    rgba[:, :, 3] = alpha
+    # Ignore near-transparent pixels (alpha <= 10) when locating the object.
+    ys, xs = np.nonzero(alpha > 10)
+    if len(ys) == 0:
+        return rgba
+    y0, y1 = int(ys.min()), int(ys.max())
+    x0, x1 = int(xs.min()), int(xs.max())
+    # Fixed margin around the bounding box, clamped to the image bounds.
+    pad = 6
+    y0, x0 = max(0, y0 - pad), max(0, x0 - pad)
+    y1, x1 = min(h - 1, y1 + pad), min(w - 1, x1 + pad)
+    # y1/x1 are inclusive pixel indices, hence the +1 in the slice.
+    return rgba[y0:y1+1, x0:x1+1]
 def upscale_4x(rgba: np.ndarray) -> np.ndarray:
+    """Upscale an RGBA image 4x, sharpen its RGB, and anti-alias its alpha edges.
+
+    Steps, in order:
+      1. Resize all channels together to 4x width and height with Lanczos4.
+      2. Unsharp-mask the RGB channels only (1.5 * rgb - 0.5 * blurred).
+      3. Gaussian-blur the alpha channel, but keep the blurred value only
+         where the upscaled alpha is <= 240; every pixel above 240 is set
+         to 255.
+
+    Presumably expects an (H, W, 4) uint8 array, since channel 3 is read as
+    alpha and written back as uint8 -- confirm at callers.
+
+    Returns:
+        The resized array with sharpened RGB and edge-smoothed alpha.
+    """
     h, w = rgba.shape[:2]
     new_w, new_h = w * 4, h * 4
+    # Upscale full RGBA with Lanczos4
     upscaled = cv2.resize(rgba, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
+    # Unsharp mask on RGB only
     rgb = upscaled[:, :, :3]
     blurred = cv2.GaussianBlur(rgb, (0, 0), sigmaX=1.0)
     rgb_sharp = cv2.addWeighted(rgb, 1.5, blurred, -0.5, 0)
     upscaled[:, :, :3] = rgb_sharp
+    # Edge-only AA on alpha: blur then re-harden interior
+    alpha = upscaled[:, :, 3].astype(np.float32)
+    alpha_blur = cv2.GaussianBlur(alpha, (5, 5), sigmaX=1.2)
+    # Keep interior fully opaque, only use blurred values at edges
+    interior = alpha > 240  # measured on the pre-blur upscaled alpha; 241..254 are also forced to 255
+    alpha_aa = np.where(interior, 255.0, alpha_blur)
+    upscaled[:, :, 3] = alpha_aa.clip(0, 255).astype(np.uint8)
     return upscaled
 def extract_assets(input_image):
         print(f">>> Image size: {w}x{h}, area: {img_area}", flush=True)
         pil_img = Image.fromarray(orig_rgb)
+        all_masks = []
         all_scores = []
         with torch.inference_mode():
                 masks = out["masks"]
                 scores = out["scores"]
+                # Check for raw logits
+                raw_logits = None
+                for logit_key in ["masks_logits", "low_res_masks", "logits", "mask_logits"]:
+                    val = out.get(logit_key)
+                    if val is not None and (not torch.is_tensor(val) or val.numel() > 0):
+                        raw_logits = val
+                        break
                 if masks is None or len(masks) == 0:
+                    print(f"  [{concept}] No masks returned", flush=True)
                     continue
                 if torch.is_tensor(masks): masks = masks.float().cpu().numpy()
                 if torch.is_tensor(scores): scores = scores.float().cpu().numpy()
+                if raw_logits is not None and torch.is_tensor(raw_logits):
+                    raw_logits = raw_logits.float().cpu().numpy()
+                print(f"  [{concept}] Found {len(masks)} masks", flush=True)
                 for j in range(len(masks)):
+                    m = masks[j]
+                    while m.ndim > 2: m = m[0]
+                    m_bool = m.astype(bool)
+                    score = float(scores[j]) if scores.ndim > 0 else float(scores)
+                    area = m_bool.sum()
+                    if score < 0.1 or area < 500 or area > img_area * 0.90:
+                        print(f"    mask[{j}] SKIPPED: score={score:.4f}, area={area}", flush=True)
                         continue
+                    # Build alpha: sigmoid logits for smooth edges + full opacity interior
+                    if raw_logits is not None and j < len(raw_logits):
+                        logit = raw_logits[j]
+                        while logit.ndim > 2: logit = logit[0]
+                        alpha_smooth = 1.0 / (1.0 + np.exp(-logit.astype(np.float32)))
+                        kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (7, 7))
+                        dilated = cv2.dilate(m_bool.astype(np.uint8), kernel, iterations=2)
+                        alpha_smooth = alpha_smooth * dilated
+                        alpha_uint8 = (alpha_smooth * 255).clip(0, 255).astype(np.uint8)
+                        alpha_mask = np.where(m_bool, np.uint8(255), alpha_uint8)
+                    else:
+                        alpha_mask = m_bool.astype(np.uint8) * 255
+                    all_masks.append(alpha_mask)
                     all_scores.append(score)
+                    print(f"    mask[{j}] KEPT: score={score:.4f}, area={area}", flush=True)
+        print(f">>> Total masks kept: {len(all_masks)}", flush=True)
+        if not all_masks:
             gr.Info("No assets found in this image. Try a different slide with more visual elements.")
+            print(">>> No masks passed filters, returning []", flush=True)
             return []
+        # Deduplicate by mask IoU
+        order = sorted(range(len(all_masks)), key=lambda i: all_scores[i], reverse=True)
         keep = []
         for i in order:
             dup = False
             for ki in keep:
+                if mask_iou(all_masks[i], all_masks[ki]) > 0.5:
                     dup = True
                     break
             if not dup:
                 keep.append(i)
+        # Crop → upscale (with edge AA) → save
         results = []
         global asset_counter
         for idx, ki in enumerate(keep):
+            crop = mask_to_crop(orig_rgb, all_masks[ki])
+            crop = upscale_4x(crop)
             asset_counter += 1
             lib_path = os.path.join(ASSET_LIBRARY_DIR, f"asset_{asset_counter:04d}.png")
+            Image.fromarray(crop, "RGBA").save(lib_path, format="PNG")
             results.append(lib_path)
         print(f">>> Returning {len(results)} assets (library total: {asset_counter})", flush=True)

requirements.txt CHANGED Viewed

@@ -11,5 +11,3 @@ gradio
 git+https://github.com/facebookresearch/sam3.git
 opencv-python-headless
 PyMuPDF
-rembg
-onnxruntime

 git+https://github.com/facebookresearch/sam3.git
 opencv-python-headless
 PyMuPDF