robotflowlabs
/

FORGE-Nano-Benchmark

+{
+  "benchmark": "student_variants",
+  "timestamp": "2026-03-19T12:35:24.930881+00:00",
+  "device": "cuda",
+  "gpu": "NVIDIA L4",
+  "variants": {
+    "nano_baseline": {
+      "variant": "nano_baseline",
+      "config": {
+        "variant": "nano",
+        "language_model": "Qwen/Qwen2.5-0.5B",
+        "lora_rank": 32,
+        "action_head_type": "diffusion"
+      },
+      "total_params_m": 967.9,
+      "trainable_params_m": 495.6,
+      "frozen_params_m": 472.3,
+      "build_time_s": 5.9,
+      "inference": {
+        "fp32_p50_ms": 125.64,
+        "fp32_fps": 7.9,
+        "fp16_p50_ms": 90.41,
+        "fp16_fps": 11.0,
+        "fp16_speedup": 1.39,
+        "gpu_mem_gb": 4.32
+      },
+      "training": {
+        "n_steps": 30,
+        "loss_start": 3.2132,
+        "loss_end": 1.0615,
+        "loss_reduction_pct": 67.0,
+        "step_time_ms": 610.5,
+        "steps_per_sec": 1.64,
+        "gpu_mem_gb": 9.01,
+        "loss_curve": [
+          3.2132,
+          41.0657,
+          85.2859,
+          39.3644,
+          11.2475,
+          2.2388,
+          1.3854,
+          1.0555,
+          1.5505,
+          1.2637,
+          2.3099,
+          1.9009,
+          1.5492,
+          2.2762,
+          1.3436,
+          1.0509,
+          0.7195,
+          1.1853,
+          1.4664,
+          0.9882,
+          0.7097,
+          1.8235,
+          1.5141,
+          2.4983,
+          1.6145,
+          0.87,
+          1.5064,
+          1.5116,
+          1.2326,
+          1.0615
+        ]
+      }
+    },
+    "nano_lora64": {
+      "variant": "nano_lora64",
+      "config": {
+        "variant": "nano",
+        "language_model": "Qwen/Qwen2.5-0.5B",
+        "lora_rank": 64,
+        "action_head_type": "diffusion"
+      },
+      "total_params_m": 972.3,
+      "trainable_params_m": 500.0,
+      "frozen_params_m": 472.3,
+      "build_time_s": 2.9,
+      "inference": {
+        "fp32_p50_ms": 126.67,
+        "fp32_fps": 7.9,
+        "fp16_p50_ms": 92.33,
+        "fp16_fps": 10.8,
+        "fp16_speedup": 1.37,
+        "gpu_mem_gb": 7.66
+      },
+      "training": {
+        "n_steps": 30,
+        "loss_start": 5.4019,
+        "loss_end": 1.2458,
+        "loss_reduction_pct": 76.9,
+        "step_time_ms": 618.1,
+        "steps_per_sec": 1.62,
+        "gpu_mem_gb": 9.1,
+        "loss_curve": [
+          5.4019,
+          22.3632,
+          51.01,
+          42.1202,
+          46.8094,
+          3.5741,
+          7.4527,
+          4.3158,
+          1.9751,
+          1.9907,
+          1.8124,
+          2.6258,
+          2.0602,
+          3.2828,
+          1.5963,
+          1.2459,
+          1.539,
+          2.3277,
+          1.3788,
+          1.9966,
+          1.8496,
+          1.064,
+          1.7389,
+          0.6589,
+          1.3724,
+          1.2252,
+          1.164,
+          0.8581,
+          1.2236,
+          1.2458
+        ]
+      }
+    },
+    "nano_flow": {
+      "variant": "nano_flow",
+      "config": {
+        "variant": "nano",
+        "language_model": "Qwen/Qwen2.5-0.5B",
+        "lora_rank": 32,
+        "action_head_type": "flow"
+      },
+      "total_params_m": 967.9,
+      "trainable_params_m": 495.6,
+      "frozen_params_m": 472.3,
+      "build_time_s": 2.9,
+      "inference": {
+        "fp32_p50_ms": 121.72,
+        "fp32_fps": 8.2,
+        "fp16_p50_ms": 79.13,
+        "fp16_fps": 12.6,
+        "fp16_speedup": 1.54,
+        "gpu_mem_gb": 7.73
+      },
+      "training": {
+        "n_steps": 30,
+        "loss_start": 7.475,
+        "loss_end": 1.0583,
+        "loss_reduction_pct": 85.8,
+        "step_time_ms": 630.9,
+        "steps_per_sec": 1.58,
+        "gpu_mem_gb": 9.02,
+        "loss_curve": [
+          7.475,
+          52.8619,
+          27.0023,
+          48.6988,
+          4.5813,
+          4.4473,
+          2.7973,
+          2.4244,
+          2.0722,
+          2.7877,
+          2.2137,
+          1.4398,
+          1.8208,
+          2.5302,
+          2.0963,
+          1.8007,
+          2.9426,
+          1.9881,
+          1.8891,
+          2.3121,
+          2.3319,
+          2.6846,
+          3.4347,
+          2.3639,
+          3.0736,
+          3.5383,
+          3.0924,
+          2.5027,
+          2.4456,
+          1.0583
+        ]
+      }
+    },
+    "small_baseline": {
+      "error": "CUDA out of memory. Tried to allocate 54.00 MiB. GPU 0 has a total capacity of 22.03 GiB of which 5.06 MiB is free. Including non-PyTorch memory, this process has 22.02 GiB memory in use. Of the allocated memory 21.17 GiB is allocated by PyTorch, and 632.04 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)"
+    },
+    "small_flow": {
+      "error": "CUDA out of memory. Tried to allocate 54.00 MiB. GPU 0 has a total capacity of 22.03 GiB of which 37.06 MiB is free. Including non-PyTorch memory, this process has 21.99 GiB memory in use. Of the allocated memory 21.11 GiB is allocated by PyTorch, and 653.15 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)"
+    }
+  }
+}