Spaces:

nvidia
/

PhysicsNeMo-Earth2Studio-Inference

Paused

App Files Files

carmelog committed on Jan 22

Commit

7a84f11

1 Parent(s): 156dc4d

fix: update some comments, env vars, and settings for h200 and HF backend issues

Browse files

Files changed (1) hide show

app.py +18 -76

app.py CHANGED Viewed

@@ -5,7 +5,7 @@ from typing import Optional, Tuple
 # Save original asyncio.run BEFORE any imports that might patch it (nest_asyncio)
 _ORIGINAL_ASYNCIO_RUN = asyncio.run
-# On ZeroGPU (shared A10G), TF32 matmul paths can occasionally trip cuBLAS errors in
 # some einsum-heavy models. Prefer full FP32 math for stability.
 os.environ.setdefault("NVIDIA_TF32_OVERRIDE", "0")
 # ZeroGPU H200-specific workarounds for cuBLAS strided-batch GEMM issues
@@ -126,70 +126,28 @@ def _run_inference(forecast_date: str, nsteps: int):
     _ensure_cache_dirs()
-    # Memory management for ZeroGPU
-    torch.backends.cudnn.benchmark = False  # More stable on shared GPU
-    # Prefer full FP32 math (avoid TF32) for stability on shared A10G
-    try:
-        torch.set_float32_matmul_precision("highest")
-    except Exception:
-        pass
-    try:
-        torch.backends.cuda.matmul.allow_tf32 = False
-    except Exception:
-        pass
-    try:
-        torch.backends.cudnn.allow_tf32 = False
-    except Exception:
-        pass
-    # Avoid reduced-precision reductions (guarded for older torch versions)
-    try:
-        torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
-    except Exception:
-        pass
-    try:
-        torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
-    except Exception:
-        pass
-    try:
-        torch.cuda.set_device(0)
-    except Exception:
-        pass
     torch.cuda.empty_cache()
-    # Some cuBLAS "INVALID_VALUE" failures originate from non-contiguous einsum operands
-    # producing unsupported strides for batched GEMM. Force contiguity at the einsum boundary.
     _orig_einsum = torch.einsum
-    def _einsum_contiguous(equation, *operands):
-        ops = []
-        for op in operands:
-            if isinstance(op, torch.Tensor) and not op.is_contiguous():
-                ops.append(op.contiguous())
-            else:
-                ops.append(op)
-        return _orig_einsum(equation, *ops)
-    torch.einsum = _einsum_contiguous  # type: ignore[assignment]
     # Load model inside GPU function (ZeroGPU requirement)
-    try:
-        from earth2studio.models.px.fcn import FCN
-    except Exception:
-        from earth2studio.models.px import FCN
     package = FCN.load_default_package()
     model = FCN.load_model(package)
     device = torch.device("cuda")
-    # Ensure FP32 weights for cuBLAS stability
-    try:
-        model = model.float()
-    except Exception:
-        pass
-    model = model.to(device)
-    model.eval()  # Ensure eval mode
-    # Clear memory after model load
     torch.cuda.empty_cache()
     # CRITICAL: Warmup CUDA/cuBLAS context on ZeroGPU's H200 before complex ops
@@ -221,27 +179,11 @@ def _run_inference(forecast_date: str, nsteps: int):
         return lon, lat, all_fields
     finally:
-        # Restore einsum in case this worker is reused
-        try:
-            torch.einsum = _orig_einsum  # type: ignore[assignment]
-        except Exception:
-            pass
-        # Free GPU memory aggressively
-        try:
-            model.to("cpu")
-        except Exception:
-            pass
-        try:
-            del model
-            del data
-            del io
-        except Exception:
-            pass
-        try:
-            torch.cuda.empty_cache()
-            torch.cuda.synchronize()
-        except Exception:
-            pass
 def run_forecast(forecast_date: str, nsteps: int):

 # Save original asyncio.run BEFORE any imports that might patch it (nest_asyncio)
 _ORIGINAL_ASYNCIO_RUN = asyncio.run
+# On ZeroGPU H200, TF32 matmul paths can occasionally trip cuBLAS errors in
 # some einsum-heavy models. Prefer full FP32 math for stability.
 os.environ.setdefault("NVIDIA_TF32_OVERRIDE", "0")
 # ZeroGPU H200-specific workarounds for cuBLAS strided-batch GEMM issues
     _ensure_cache_dirs()
+    # Critical precision settings for ZeroGPU H200 cuBLAS stability
+    torch.backends.cudnn.benchmark = False
+    torch.set_float32_matmul_precision("highest")  # Full FP32, no TF32
+    torch.backends.cuda.matmul.allow_tf32 = False
+    torch.backends.cudnn.allow_tf32 = False
     torch.cuda.empty_cache()
+    # Force einsum operand contiguity to avoid cuBLAS strided-batch GEMM errors
     _orig_einsum = torch.einsum
+    torch.einsum = lambda eq, *ops: _orig_einsum(
+        eq, *[op.contiguous() if torch.is_tensor(op) else op for op in ops]
+    )  # type: ignore[assignment]
     # Load model inside GPU function (ZeroGPU requirement)
+    from earth2studio.models.px import FCN
     package = FCN.load_default_package()
     model = FCN.load_model(package)
+    # Move to GPU with FP32 precision
     device = torch.device("cuda")
+    model = model.float().to(device).eval()
     torch.cuda.empty_cache()
     # CRITICAL: Warmup CUDA/cuBLAS context on ZeroGPU's H200 before complex ops
         return lon, lat, all_fields
     finally:
+        # Cleanup: restore einsum and free GPU memory
+        torch.einsum = _orig_einsum  # type: ignore[assignment]
+        del model, data, io
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
 def run_forecast(forecast_date: str, nsteps: int):