Spaces:

contacthamza91
/

SAM_three_UI

Sleeping

App Files Files Community

AI Agent committed on Mar 28

Commit

cb7cf0f

1 Parent(s): 4b23049

CRITICAL FIX: Remove is_bf16_supported gate - T4 reports True but crashes on bf16 ops. Patches now unconditional + model cast to fp16

Browse files

Files changed (1) hide show

app.py +25 -17

app.py CHANGED Viewed

@@ -6,27 +6,31 @@ from PIL import Image
 import os
 import io
-# ── Native Runtime BFloat16 Nullification for T4 Turing GPUs ────
-# Hugging Face containers lock `site-packages` on boot, dropping our dynamic writer.
-# Instead, we directly alias PyTorch's global datatypes and AMP engine modes to silently
-# intercept Meta's `bfloat16` requests anywhere in the active memory loop.
-if torch.cuda.is_available() and not torch.cuda.is_bf16_supported():
-    # 1. Bruteforce global datatype alias mapping
     torch.bfloat16 = torch.float16
-    # 2. Intercept inner PyTorch AMP decorators
     import torch.amp.autocast_mode
-    original_amp = torch.amp.autocast_mode.autocast
-    class PatchedAutocast(original_amp):
         def __init__(self, device_type, dtype=None, *args, **kwargs):
-            if dtype == torch.bfloat16 or dtype == torch.float16:
-                dtype = torch.float16  # Always force Turing FP16
-            super().__init__(device_type, dtype, *args, **kwargs)
-    torch.autocast = PatchedAutocast
-    torch.amp.autocast_mode.autocast = PatchedAutocast
     if hasattr(torch.amp, 'autocast'):
-        torch.amp.autocast = PatchedAutocast
 # ── Ensure SAM 3 Checkpoint is downloaded ────────────────────────
 # (HuggingFace Spaces can use the hf_hub_download mechanism)
@@ -86,7 +90,11 @@ if model_installed:
     model.load_state_dict(image_state_dict, strict=False)
     model.to(device)
-    model.to(torch.float32)  # Maintain standard Floats parameters; the patched Float16 autocast will natively handle precision math.
     processor = Sam3Processor(model)
     if not torch.cuda.is_available():

 import os
 import io
+# ── UNCONDITIONAL BFloat16 → Float16 Patch for T4 Turing GPUs ────
+# CRITICAL: torch.cuda.is_bf16_supported() returns True on T4 because CUDA
+# can *emulate* bfloat16 in software, but the actual kernels crash on mixed
+# dtype operations (linear, conv2d). We MUST patch unconditionally.
+if torch.cuda.is_available():
+    # 1. Globally alias bfloat16 → float16 so all future lookups resolve to fp16
+    _original_bf16 = torch.bfloat16
     torch.bfloat16 = torch.float16
+    # 2. Intercept ALL autocast entry points to force float16
     import torch.amp.autocast_mode
+    _OriginalAutocast = torch.amp.autocast_mode.autocast
+    class _Fp16Autocast(_OriginalAutocast):
         def __init__(self, device_type, dtype=None, *args, **kwargs):
+            # Intercept any bfloat16 request (original C enum or aliased)
+            if dtype is not None and dtype in (_original_bf16, torch.float16):
+                dtype = torch.float16
+            super().__init__(device_type, dtype=dtype, *args, **kwargs)
+    torch.autocast = _Fp16Autocast
+    torch.amp.autocast_mode.autocast = _Fp16Autocast
     if hasattr(torch.amp, 'autocast'):
+        torch.amp.autocast = _Fp16Autocast
+    if hasattr(torch.cuda.amp, 'autocast'):
+        torch.cuda.amp.autocast = _Fp16Autocast
 # ── Ensure SAM 3 Checkpoint is downloaded ────────────────────────
 # (HuggingFace Spaces can use the hf_hub_download mechanism)
     model.load_state_dict(image_state_dict, strict=False)
     model.to(device)
+    # Cast to float16 on GPU (matches our patched autocast dtype) or float32 on CPU
+    if torch.cuda.is_available():
+        model.to(torch.float16)
+    else:
+        model.to(torch.float32)
     processor = Sam3Processor(model)
     if not torch.cuda.is_available():