Upload StormCast CONUS regression checkpoint
Browse files
README.md
CHANGED
|
@@ -134,7 +134,7 @@ ERA5 data for the date range of 2023/01/01 to 2023/01/11, interpolated to the HR
|
|
| 134 |
| Optimizer | Adam (fused) |
|
| 135 |
| Learning rate | 4e-4 |
|
| 136 |
| LR rampup steps | 1,000 |
|
| 137 |
-
| Total steps |
|
| 138 |
| Effective batch size | 4 (gradient accumulation) |
|
| 139 |
| Batch size per GPU | 1 |
|
| 140 |
| Loss | MSE (regression) |
|
|
@@ -148,7 +148,9 @@ ERA5 data for the date range of 2023/01/01 to 2023/01/11, interpolated to the HR
|
|
| 148 |
| GPU | 1x NVIDIA H100 80GB |
|
| 149 |
| Peak GPU memory | ~29 GiB |
|
| 150 |
| Training speed | ~4.5 s/step (with grad accum) |
|
| 151 |
-
| Training time | ~
|
|
|
|
|
|
|
| 152 |
|
| 153 |
## Inference
|
| 154 |
|
|
@@ -191,7 +193,7 @@ class RegressionOnlyStormCast(StormCast):
|
|
| 191 |
|
| 192 |
|
| 193 |
# Download and load checkpoint
|
| 194 |
-
ckpt_path = hf_hub_download(REPO_ID, "StormCastUNet.0.
|
| 195 |
regression = PhysicsNemoModule.from_checkpoint(ckpt_path)
|
| 196 |
diffusion = torch.nn.Identity()
|
| 197 |
|
|
|
|
| 134 |
| Optimizer | Adam (fused) |
|
| 135 |
| Learning rate | 4e-4 |
|
| 136 |
| LR rampup steps | 1,000 |
|
| 137 |
+
| Total steps | 16,000 |
|
| 138 |
| Effective batch size | 4 (gradient accumulation) |
|
| 139 |
| Batch size per GPU | 1 |
|
| 140 |
| Loss | MSE (regression) |
|
|
|
|
| 148 |
| GPU | 1x NVIDIA H100 80GB |
|
| 149 |
| Peak GPU memory | ~29 GiB |
|
| 150 |
| Training speed | ~4.5 s/step (with grad accum) |
|
| 151 |
+
| Training time | ~21 hours (16,000 steps) |
|
| 152 |
+
| Final train loss | 0.0143 |
|
| 153 |
+
| Final val loss | 0.0125 |
|
| 154 |
|
| 155 |
## Inference
|
| 156 |
|
|
|
|
| 193 |
|
| 194 |
|
| 195 |
# Download and load checkpoint
|
| 196 |
+
ckpt_path = hf_hub_download(REPO_ID, "StormCastUNet.0.16000.mdlus")
|
| 197 |
regression = PhysicsNemoModule.from_checkpoint(ckpt_path)
|
| 198 |
diffusion = torch.nn.Identity()
|
| 199 |
|