Instructions to use Geometric-AI/geometric-ai-kernels with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Kernels
How to use Geometric-AI/geometric-ai-kernels with Kernels:
# !pip install kernels from kernels import get_kernel kernel = get_kernel("Geometric-AI/geometric-ai-kernels") - Notebooks
- Google Colab
- Kaggle
Upload folder using huggingface_hub
Browse files- benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_dark_animation.svg +123 -0
- benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_dark_latency.svg +0 -0
- benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_dark_throughput.svg +0 -0
- benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_light_animation.svg +123 -0
- benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_light_latency.svg +0 -0
- benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_light_throughput.svg +0 -0
- benchmark_results/bnpo_loss_compiled/results.json +206 -0
- benchmark_results/bnpo_loss_eager/bnpo_loss_eager_dark_animation.svg +123 -0
- benchmark_results/bnpo_loss_eager/bnpo_loss_eager_dark_latency.svg +0 -0
- benchmark_results/bnpo_loss_eager/bnpo_loss_eager_dark_throughput.svg +0 -0
- benchmark_results/bnpo_loss_eager/bnpo_loss_eager_light_animation.svg +123 -0
- benchmark_results/bnpo_loss_eager/bnpo_loss_eager_light_latency.svg +0 -0
- benchmark_results/bnpo_loss_eager/bnpo_loss_eager_light_throughput.svg +0 -0
- benchmark_results/bnpo_loss_eager/results.json +206 -0
- benchmark_results/grpo_loss_compiled/grpo_loss_compiled_dark_animation.svg +105 -0
- benchmark_results/grpo_loss_compiled/grpo_loss_compiled_dark_latency.svg +0 -0
- benchmark_results/grpo_loss_compiled/grpo_loss_compiled_dark_throughput.svg +0 -0
- benchmark_results/grpo_loss_compiled/grpo_loss_compiled_light_animation.svg +105 -0
- benchmark_results/grpo_loss_compiled/grpo_loss_compiled_light_latency.svg +0 -0
- benchmark_results/grpo_loss_compiled/grpo_loss_compiled_light_throughput.svg +0 -0
- benchmark_results/grpo_loss_compiled/results.json +174 -0
- benchmark_results/grpo_loss_eager/grpo_loss_eager_dark_animation.svg +105 -0
- benchmark_results/grpo_loss_eager/grpo_loss_eager_dark_latency.svg +0 -0
- benchmark_results/grpo_loss_eager/grpo_loss_eager_dark_throughput.svg +0 -0
- benchmark_results/grpo_loss_eager/grpo_loss_eager_light_animation.svg +105 -0
- benchmark_results/grpo_loss_eager/grpo_loss_eager_light_latency.svg +0 -0
- benchmark_results/grpo_loss_eager/grpo_loss_eager_light_throughput.svg +0 -0
- benchmark_results/grpo_loss_eager/results.json +174 -0
- benchmark_results/reverse_kl_compiled/results.json +206 -0
- benchmark_results/reverse_kl_compiled/reverse_kl_compiled_dark_animation.svg +123 -0
- benchmark_results/reverse_kl_compiled/reverse_kl_compiled_dark_latency.svg +0 -0
- benchmark_results/reverse_kl_compiled/reverse_kl_compiled_dark_throughput.svg +0 -0
- benchmark_results/reverse_kl_compiled/reverse_kl_compiled_light_animation.svg +123 -0
- benchmark_results/reverse_kl_compiled/reverse_kl_compiled_light_latency.svg +0 -0
- benchmark_results/reverse_kl_compiled/reverse_kl_compiled_light_throughput.svg +0 -0
- benchmark_results/reverse_kl_eager/results.json +206 -0
- benchmark_results/reverse_kl_eager/reverse_kl_eager_dark_animation.svg +123 -0
- benchmark_results/reverse_kl_eager/reverse_kl_eager_dark_latency.svg +0 -0
- benchmark_results/reverse_kl_eager/reverse_kl_eager_dark_throughput.svg +0 -0
- benchmark_results/reverse_kl_eager/reverse_kl_eager_light_animation.svg +123 -0
- benchmark_results/reverse_kl_eager/reverse_kl_eager_light_latency.svg +0 -0
- benchmark_results/reverse_kl_eager/reverse_kl_eager_light_throughput.svg +0 -0
benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_dark_animation.svg
ADDED
|
|
benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_dark_latency.svg
ADDED
|
|
benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_dark_throughput.svg
ADDED
|
|
benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_light_animation.svg
ADDED
|
|
benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_light_latency.svg
ADDED
|
|
benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_light_throughput.svg
ADDED
|
|
benchmark_results/bnpo_loss_compiled/results.json
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": [
|
| 3 |
+
{
|
| 4 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_batch128_seqlen02781_compiled",
|
| 5 |
+
"timingResults": {
|
| 6 |
+
"mean_ms": 0.0359,
|
| 7 |
+
"std_ms": 0.0038,
|
| 8 |
+
"min_ms": 0.0332,
|
| 9 |
+
"max_ms": 0.0701,
|
| 10 |
+
"q1_ms": 0.0344,
|
| 11 |
+
"q3_ms": 0.0357,
|
| 12 |
+
"iqr_ms": 0.0013,
|
| 13 |
+
"outliers": 20,
|
| 14 |
+
"iterations": 200,
|
| 15 |
+
"refMeanMs": 0.0771
|
| 16 |
+
},
|
| 17 |
+
"verified": true
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_batch128_seqlen08192_compiled",
|
| 21 |
+
"timingResults": {
|
| 22 |
+
"mean_ms": 0.0351,
|
| 23 |
+
"std_ms": 0.0033,
|
| 24 |
+
"min_ms": 0.0327,
|
| 25 |
+
"max_ms": 0.0557,
|
| 26 |
+
"q1_ms": 0.0336,
|
| 27 |
+
"q3_ms": 0.035,
|
| 28 |
+
"iqr_ms": 0.0014,
|
| 29 |
+
"outliers": 14,
|
| 30 |
+
"iterations": 200,
|
| 31 |
+
"refMeanMs": 0.0771
|
| 32 |
+
},
|
| 33 |
+
"verified": true
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_batch16_seqlen01024_compiled",
|
| 37 |
+
"timingResults": {
|
| 38 |
+
"mean_ms": 0.0355,
|
| 39 |
+
"std_ms": 0.0042,
|
| 40 |
+
"min_ms": 0.0331,
|
| 41 |
+
"max_ms": 0.0706,
|
| 42 |
+
"q1_ms": 0.034,
|
| 43 |
+
"q3_ms": 0.0351,
|
| 44 |
+
"iqr_ms": 0.0011,
|
| 45 |
+
"outliers": 21,
|
| 46 |
+
"iterations": 200,
|
| 47 |
+
"refMeanMs": 0.0811
|
| 48 |
+
},
|
| 49 |
+
"verified": true
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_batch16_seqlen02781_compiled",
|
| 53 |
+
"timingResults": {
|
| 54 |
+
"mean_ms": 0.0355,
|
| 55 |
+
"std_ms": 0.004,
|
| 56 |
+
"min_ms": 0.0319,
|
| 57 |
+
"max_ms": 0.0591,
|
| 58 |
+
"q1_ms": 0.0338,
|
| 59 |
+
"q3_ms": 0.0352,
|
| 60 |
+
"iqr_ms": 0.0014,
|
| 61 |
+
"outliers": 24,
|
| 62 |
+
"iterations": 200,
|
| 63 |
+
"refMeanMs": 0.0709
|
| 64 |
+
},
|
| 65 |
+
"verified": true
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_batch32_seqlen02048_compiled",
|
| 69 |
+
"timingResults": {
|
| 70 |
+
"mean_ms": 0.0358,
|
| 71 |
+
"std_ms": 0.0042,
|
| 72 |
+
"min_ms": 0.032,
|
| 73 |
+
"max_ms": 0.0569,
|
| 74 |
+
"q1_ms": 0.0338,
|
| 75 |
+
"q3_ms": 0.0355,
|
| 76 |
+
"iqr_ms": 0.0017,
|
| 77 |
+
"outliers": 27,
|
| 78 |
+
"iterations": 200,
|
| 79 |
+
"refMeanMs": 0.0763
|
| 80 |
+
},
|
| 81 |
+
"verified": true
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_batch64_seqlen04096_compiled",
|
| 85 |
+
"timingResults": {
|
| 86 |
+
"mean_ms": 0.0344,
|
| 87 |
+
"std_ms": 0.0031,
|
| 88 |
+
"min_ms": 0.032,
|
| 89 |
+
"max_ms": 0.0557,
|
| 90 |
+
"q1_ms": 0.0331,
|
| 91 |
+
"q3_ms": 0.0341,
|
| 92 |
+
"iqr_ms": 0.001,
|
| 93 |
+
"outliers": 32,
|
| 94 |
+
"iterations": 200,
|
| 95 |
+
"refMeanMs": 0.0739
|
| 96 |
+
},
|
| 97 |
+
"verified": true
|
| 98 |
+
},
|
| 99 |
+
{
|
| 100 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch128_seqlen02781_compiled",
|
| 101 |
+
"timingResults": {
|
| 102 |
+
"mean_ms": 0.0323,
|
| 103 |
+
"std_ms": 0.0034,
|
| 104 |
+
"min_ms": 0.03,
|
| 105 |
+
"max_ms": 0.053,
|
| 106 |
+
"q1_ms": 0.0311,
|
| 107 |
+
"q3_ms": 0.0318,
|
| 108 |
+
"iqr_ms": 0.0007,
|
| 109 |
+
"outliers": 25,
|
| 110 |
+
"iterations": 200,
|
| 111 |
+
"refMeanMs": 0.0808
|
| 112 |
+
},
|
| 113 |
+
"verified": true
|
| 114 |
+
},
|
| 115 |
+
{
|
| 116 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch128_seqlen08192_compiled",
|
| 117 |
+
"timingResults": {
|
| 118 |
+
"mean_ms": 0.0318,
|
| 119 |
+
"std_ms": 0.0032,
|
| 120 |
+
"min_ms": 0.0293,
|
| 121 |
+
"max_ms": 0.0502,
|
| 122 |
+
"q1_ms": 0.0304,
|
| 123 |
+
"q3_ms": 0.0317,
|
| 124 |
+
"iqr_ms": 0.0013,
|
| 125 |
+
"outliers": 17,
|
| 126 |
+
"iterations": 200,
|
| 127 |
+
"refMeanMs": 0.0845
|
| 128 |
+
},
|
| 129 |
+
"verified": true
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch16_seqlen01024_compiled",
|
| 133 |
+
"timingResults": {
|
| 134 |
+
"mean_ms": 0.0317,
|
| 135 |
+
"std_ms": 0.0031,
|
| 136 |
+
"min_ms": 0.0293,
|
| 137 |
+
"max_ms": 0.0593,
|
| 138 |
+
"q1_ms": 0.0304,
|
| 139 |
+
"q3_ms": 0.0317,
|
| 140 |
+
"iqr_ms": 0.0013,
|
| 141 |
+
"outliers": 17,
|
| 142 |
+
"iterations": 200,
|
| 143 |
+
"refMeanMs": 0.079
|
| 144 |
+
},
|
| 145 |
+
"verified": true
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch16_seqlen02781_compiled",
|
| 149 |
+
"timingResults": {
|
| 150 |
+
"mean_ms": 0.0306,
|
| 151 |
+
"std_ms": 0.0035,
|
| 152 |
+
"min_ms": 0.0279,
|
| 153 |
+
"max_ms": 0.0534,
|
| 154 |
+
"q1_ms": 0.0289,
|
| 155 |
+
"q3_ms": 0.0306,
|
| 156 |
+
"iqr_ms": 0.0017,
|
| 157 |
+
"outliers": 20,
|
| 158 |
+
"iterations": 200,
|
| 159 |
+
"refMeanMs": 0.084
|
| 160 |
+
},
|
| 161 |
+
"verified": true
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch32_seqlen02048_compiled",
|
| 165 |
+
"timingResults": {
|
| 166 |
+
"mean_ms": 0.0305,
|
| 167 |
+
"std_ms": 0.0035,
|
| 168 |
+
"min_ms": 0.0279,
|
| 169 |
+
"max_ms": 0.051,
|
| 170 |
+
"q1_ms": 0.0288,
|
| 171 |
+
"q3_ms": 0.0308,
|
| 172 |
+
"iqr_ms": 0.002,
|
| 173 |
+
"outliers": 15,
|
| 174 |
+
"iterations": 200,
|
| 175 |
+
"refMeanMs": 0.0764
|
| 176 |
+
},
|
| 177 |
+
"verified": true
|
| 178 |
+
},
|
| 179 |
+
{
|
| 180 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch64_seqlen04096_compiled",
|
| 181 |
+
"timingResults": {
|
| 182 |
+
"mean_ms": 0.0315,
|
| 183 |
+
"std_ms": 0.0033,
|
| 184 |
+
"min_ms": 0.0293,
|
| 185 |
+
"max_ms": 0.0543,
|
| 186 |
+
"q1_ms": 0.0302,
|
| 187 |
+
"q3_ms": 0.0311,
|
| 188 |
+
"iqr_ms": 0.0009,
|
| 189 |
+
"outliers": 21,
|
| 190 |
+
"iterations": 200,
|
| 191 |
+
"refMeanMs": 0.0739
|
| 192 |
+
},
|
| 193 |
+
"verified": true
|
| 194 |
+
}
|
| 195 |
+
],
|
| 196 |
+
"machineInfo": {
|
| 197 |
+
"gpu": "NVIDIA H100 80GB HBM3",
|
| 198 |
+
"backend": "CUDA 13.0",
|
| 199 |
+
"pytorchVersion": "2.11.0+cu130",
|
| 200 |
+
"os": "Linux 6.11.0-1016-nvidia",
|
| 201 |
+
"cpu": "x86_64"
|
| 202 |
+
},
|
| 203 |
+
"kernelCommitSha": "7972ab0e834be24d",
|
| 204 |
+
"benchmarkScriptPath": "benchmarks",
|
| 205 |
+
"benchmarkScriptSha": "68426064f76adff2066ad365f6c97be3fe279bd6b20d025b3dc5614f9b2da449"
|
| 206 |
+
}
|
benchmark_results/bnpo_loss_eager/bnpo_loss_eager_dark_animation.svg
ADDED
|
|
benchmark_results/bnpo_loss_eager/bnpo_loss_eager_dark_latency.svg
ADDED
|
|
benchmark_results/bnpo_loss_eager/bnpo_loss_eager_dark_throughput.svg
ADDED
|
|
benchmark_results/bnpo_loss_eager/bnpo_loss_eager_light_animation.svg
ADDED
|
|
benchmark_results/bnpo_loss_eager/bnpo_loss_eager_light_latency.svg
ADDED
|
|
benchmark_results/bnpo_loss_eager/bnpo_loss_eager_light_throughput.svg
ADDED
|
|
benchmark_results/bnpo_loss_eager/results.json
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": [
|
| 3 |
+
{
|
| 4 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_batch128_seqlen02781_eager",
|
| 5 |
+
"timingResults": {
|
| 6 |
+
"mean_ms": 0.0358,
|
| 7 |
+
"std_ms": 0.0035,
|
| 8 |
+
"min_ms": 0.0323,
|
| 9 |
+
"max_ms": 0.0536,
|
| 10 |
+
"q1_ms": 0.0342,
|
| 11 |
+
"q3_ms": 0.0358,
|
| 12 |
+
"iqr_ms": 0.0017,
|
| 13 |
+
"outliers": 17,
|
| 14 |
+
"iterations": 200,
|
| 15 |
+
"refMeanMs": 0.5552
|
| 16 |
+
},
|
| 17 |
+
"verified": true
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_batch128_seqlen08192_eager",
|
| 21 |
+
"timingResults": {
|
| 22 |
+
"mean_ms": 0.0344,
|
| 23 |
+
"std_ms": 0.0031,
|
| 24 |
+
"min_ms": 0.0314,
|
| 25 |
+
"max_ms": 0.0537,
|
| 26 |
+
"q1_ms": 0.0329,
|
| 27 |
+
"q3_ms": 0.0345,
|
| 28 |
+
"iqr_ms": 0.0015,
|
| 29 |
+
"outliers": 20,
|
| 30 |
+
"iterations": 200,
|
| 31 |
+
"refMeanMs": 0.6466
|
| 32 |
+
},
|
| 33 |
+
"verified": true
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_batch16_seqlen01024_eager",
|
| 37 |
+
"timingResults": {
|
| 38 |
+
"mean_ms": 0.0345,
|
| 39 |
+
"std_ms": 0.0171,
|
| 40 |
+
"min_ms": 0.0305,
|
| 41 |
+
"max_ms": 0.2718,
|
| 42 |
+
"q1_ms": 0.0319,
|
| 43 |
+
"q3_ms": 0.033,
|
| 44 |
+
"iqr_ms": 0.0011,
|
| 45 |
+
"outliers": 23,
|
| 46 |
+
"iterations": 200,
|
| 47 |
+
"refMeanMs": 0.5868
|
| 48 |
+
},
|
| 49 |
+
"verified": true
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_batch16_seqlen02781_eager",
|
| 53 |
+
"timingResults": {
|
| 54 |
+
"mean_ms": 0.0324,
|
| 55 |
+
"std_ms": 0.0027,
|
| 56 |
+
"min_ms": 0.0301,
|
| 57 |
+
"max_ms": 0.0508,
|
| 58 |
+
"q1_ms": 0.0312,
|
| 59 |
+
"q3_ms": 0.0324,
|
| 60 |
+
"iqr_ms": 0.0012,
|
| 61 |
+
"outliers": 17,
|
| 62 |
+
"iterations": 200,
|
| 63 |
+
"refMeanMs": 0.5832
|
| 64 |
+
},
|
| 65 |
+
"verified": true
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_batch32_seqlen02048_eager",
|
| 69 |
+
"timingResults": {
|
| 70 |
+
"mean_ms": 0.0343,
|
| 71 |
+
"std_ms": 0.0033,
|
| 72 |
+
"min_ms": 0.031,
|
| 73 |
+
"max_ms": 0.0513,
|
| 74 |
+
"q1_ms": 0.0325,
|
| 75 |
+
"q3_ms": 0.0346,
|
| 76 |
+
"iqr_ms": 0.0021,
|
| 77 |
+
"outliers": 19,
|
| 78 |
+
"iterations": 200,
|
| 79 |
+
"refMeanMs": 0.6265
|
| 80 |
+
},
|
| 81 |
+
"verified": true
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_batch64_seqlen04096_eager",
|
| 85 |
+
"timingResults": {
|
| 86 |
+
"mean_ms": 0.0328,
|
| 87 |
+
"std_ms": 0.0029,
|
| 88 |
+
"min_ms": 0.0306,
|
| 89 |
+
"max_ms": 0.0499,
|
| 90 |
+
"q1_ms": 0.0317,
|
| 91 |
+
"q3_ms": 0.0326,
|
| 92 |
+
"iqr_ms": 0.0009,
|
| 93 |
+
"outliers": 20,
|
| 94 |
+
"iterations": 200,
|
| 95 |
+
"refMeanMs": 0.5698
|
| 96 |
+
},
|
| 97 |
+
"verified": true
|
| 98 |
+
},
|
| 99 |
+
{
|
| 100 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch128_seqlen02781_eager",
|
| 101 |
+
"timingResults": {
|
| 102 |
+
"mean_ms": 0.0317,
|
| 103 |
+
"std_ms": 0.0034,
|
| 104 |
+
"min_ms": 0.0285,
|
| 105 |
+
"max_ms": 0.052,
|
| 106 |
+
"q1_ms": 0.0305,
|
| 107 |
+
"q3_ms": 0.0314,
|
| 108 |
+
"iqr_ms": 0.0009,
|
| 109 |
+
"outliers": 22,
|
| 110 |
+
"iterations": 200,
|
| 111 |
+
"refMeanMs": 0.1858
|
| 112 |
+
},
|
| 113 |
+
"verified": true
|
| 114 |
+
},
|
| 115 |
+
{
|
| 116 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch128_seqlen08192_eager",
|
| 117 |
+
"timingResults": {
|
| 118 |
+
"mean_ms": 0.0292,
|
| 119 |
+
"std_ms": 0.0028,
|
| 120 |
+
"min_ms": 0.0273,
|
| 121 |
+
"max_ms": 0.0455,
|
| 122 |
+
"q1_ms": 0.0281,
|
| 123 |
+
"q3_ms": 0.0289,
|
| 124 |
+
"iqr_ms": 0.0008,
|
| 125 |
+
"outliers": 23,
|
| 126 |
+
"iterations": 200,
|
| 127 |
+
"refMeanMs": 0.1633
|
| 128 |
+
},
|
| 129 |
+
"verified": true
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch16_seqlen01024_eager",
|
| 133 |
+
"timingResults": {
|
| 134 |
+
"mean_ms": 0.0311,
|
| 135 |
+
"std_ms": 0.0267,
|
| 136 |
+
"min_ms": 0.0256,
|
| 137 |
+
"max_ms": 0.4049,
|
| 138 |
+
"q1_ms": 0.0276,
|
| 139 |
+
"q3_ms": 0.0295,
|
| 140 |
+
"iqr_ms": 0.0018,
|
| 141 |
+
"outliers": 18,
|
| 142 |
+
"iterations": 200,
|
| 143 |
+
"refMeanMs": 0.1761
|
| 144 |
+
},
|
| 145 |
+
"verified": true
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch16_seqlen02781_eager",
|
| 149 |
+
"timingResults": {
|
| 150 |
+
"mean_ms": 0.0288,
|
| 151 |
+
"std_ms": 0.003,
|
| 152 |
+
"min_ms": 0.027,
|
| 153 |
+
"max_ms": 0.0554,
|
| 154 |
+
"q1_ms": 0.0278,
|
| 155 |
+
"q3_ms": 0.0284,
|
| 156 |
+
"iqr_ms": 0.0006,
|
| 157 |
+
"outliers": 22,
|
| 158 |
+
"iterations": 200,
|
| 159 |
+
"refMeanMs": 0.1755
|
| 160 |
+
},
|
| 161 |
+
"verified": true
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch32_seqlen02048_eager",
|
| 165 |
+
"timingResults": {
|
| 166 |
+
"mean_ms": 0.031,
|
| 167 |
+
"std_ms": 0.0034,
|
| 168 |
+
"min_ms": 0.0281,
|
| 169 |
+
"max_ms": 0.0484,
|
| 170 |
+
"q1_ms": 0.0296,
|
| 171 |
+
"q3_ms": 0.0306,
|
| 172 |
+
"iqr_ms": 0.0009,
|
| 173 |
+
"outliers": 27,
|
| 174 |
+
"iterations": 200,
|
| 175 |
+
"refMeanMs": 0.1533
|
| 176 |
+
},
|
| 177 |
+
"verified": true
|
| 178 |
+
},
|
| 179 |
+
{
|
| 180 |
+
"workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch64_seqlen04096_eager",
|
| 181 |
+
"timingResults": {
|
| 182 |
+
"mean_ms": 0.031,
|
| 183 |
+
"std_ms": 0.0041,
|
| 184 |
+
"min_ms": 0.0286,
|
| 185 |
+
"max_ms": 0.0625,
|
| 186 |
+
"q1_ms": 0.0294,
|
| 187 |
+
"q3_ms": 0.0305,
|
| 188 |
+
"iqr_ms": 0.0011,
|
| 189 |
+
"outliers": 22,
|
| 190 |
+
"iterations": 200,
|
| 191 |
+
"refMeanMs": 0.1678
|
| 192 |
+
},
|
| 193 |
+
"verified": true
|
| 194 |
+
}
|
| 195 |
+
],
|
| 196 |
+
"machineInfo": {
|
| 197 |
+
"gpu": "NVIDIA H100 80GB HBM3",
|
| 198 |
+
"backend": "CUDA 13.0",
|
| 199 |
+
"pytorchVersion": "2.11.0+cu130",
|
| 200 |
+
"os": "Linux 6.11.0-1016-nvidia",
|
| 201 |
+
"cpu": "x86_64"
|
| 202 |
+
},
|
| 203 |
+
"kernelCommitSha": "84e79b2f3ee3088a",
|
| 204 |
+
"benchmarkScriptPath": "benchmarks",
|
| 205 |
+
"benchmarkScriptSha": "68426064f76adff2066ad365f6c97be3fe279bd6b20d025b3dc5614f9b2da449"
|
| 206 |
+
}
|
benchmark_results/grpo_loss_compiled/grpo_loss_compiled_dark_animation.svg
ADDED
|
|
benchmark_results/grpo_loss_compiled/grpo_loss_compiled_dark_latency.svg
ADDED
|
|
benchmark_results/grpo_loss_compiled/grpo_loss_compiled_dark_throughput.svg
ADDED
|
|
benchmark_results/grpo_loss_compiled/grpo_loss_compiled_light_animation.svg
ADDED
|
|
benchmark_results/grpo_loss_compiled/grpo_loss_compiled_light_latency.svg
ADDED
|
|
benchmark_results/grpo_loss_compiled/grpo_loss_compiled_light_throughput.svg
ADDED
|
|
benchmark_results/grpo_loss_compiled/results.json
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": [
|
| 3 |
+
{
|
| 4 |
+
"workload": "GrpoLossBenchmark.grpo_loss_batch128_seqlen02781_compiled",
|
| 5 |
+
"timingResults": {
|
| 6 |
+
"mean_ms": 0.0329,
|
| 7 |
+
"std_ms": 0.0042,
|
| 8 |
+
"min_ms": 0.0301,
|
| 9 |
+
"max_ms": 0.0632,
|
| 10 |
+
"q1_ms": 0.031,
|
| 11 |
+
"q3_ms": 0.0326,
|
| 12 |
+
"iqr_ms": 0.0016,
|
| 13 |
+
"outliers": 22,
|
| 14 |
+
"iterations": 200,
|
| 15 |
+
"refMeanMs": 0.0874
|
| 16 |
+
},
|
| 17 |
+
"verified": true
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"workload": "GrpoLossBenchmark.grpo_loss_batch128_seqlen08192_compiled",
|
| 21 |
+
"timingResults": {
|
| 22 |
+
"mean_ms": 0.0337,
|
| 23 |
+
"std_ms": 0.0045,
|
| 24 |
+
"min_ms": 0.0305,
|
| 25 |
+
"max_ms": 0.065,
|
| 26 |
+
"q1_ms": 0.0318,
|
| 27 |
+
"q3_ms": 0.0333,
|
| 28 |
+
"iqr_ms": 0.0015,
|
| 29 |
+
"outliers": 23,
|
| 30 |
+
"iterations": 200,
|
| 31 |
+
"refMeanMs": 0.0824
|
| 32 |
+
},
|
| 33 |
+
"verified": true
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"workload": "GrpoLossBenchmark.grpo_loss_batch16_seqlen01024_compiled",
|
| 37 |
+
"timingResults": {
|
| 38 |
+
"mean_ms": 0.0323,
|
| 39 |
+
"std_ms": 0.0045,
|
| 40 |
+
"min_ms": 0.0286,
|
| 41 |
+
"max_ms": 0.0621,
|
| 42 |
+
"q1_ms": 0.0306,
|
| 43 |
+
"q3_ms": 0.0321,
|
| 44 |
+
"iqr_ms": 0.0015,
|
| 45 |
+
"outliers": 24,
|
| 46 |
+
"iterations": 200,
|
| 47 |
+
"refMeanMs": 0.0626
|
| 48 |
+
},
|
| 49 |
+
"verified": true
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"workload": "GrpoLossBenchmark.grpo_loss_batch32_seqlen02048_compiled",
|
| 53 |
+
"timingResults": {
|
| 54 |
+
"mean_ms": 0.0324,
|
| 55 |
+
"std_ms": 0.0046,
|
| 56 |
+
"min_ms": 0.0286,
|
| 57 |
+
"max_ms": 0.0688,
|
| 58 |
+
"q1_ms": 0.0305,
|
| 59 |
+
"q3_ms": 0.0321,
|
| 60 |
+
"iqr_ms": 0.0016,
|
| 61 |
+
"outliers": 22,
|
| 62 |
+
"iterations": 200,
|
| 63 |
+
"refMeanMs": 0.0633
|
| 64 |
+
},
|
| 65 |
+
"verified": true
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"workload": "GrpoLossBenchmark.grpo_loss_batch64_seqlen04096_compiled",
|
| 69 |
+
"timingResults": {
|
| 70 |
+
"mean_ms": 0.0349,
|
| 71 |
+
"std_ms": 0.0058,
|
| 72 |
+
"min_ms": 0.0315,
|
| 73 |
+
"max_ms": 0.0814,
|
| 74 |
+
"q1_ms": 0.0325,
|
| 75 |
+
"q3_ms": 0.0341,
|
| 76 |
+
"iqr_ms": 0.0016,
|
| 77 |
+
"outliers": 26,
|
| 78 |
+
"iterations": 200,
|
| 79 |
+
"refMeanMs": 0.0869
|
| 80 |
+
},
|
| 81 |
+
"verified": true
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"workload": "GrpoLossBenchmark.grpo_loss_fwd_batch128_seqlen02781_compiled",
|
| 85 |
+
"timingResults": {
|
| 86 |
+
"mean_ms": 0.033,
|
| 87 |
+
"std_ms": 0.0038,
|
| 88 |
+
"min_ms": 0.0295,
|
| 89 |
+
"max_ms": 0.0543,
|
| 90 |
+
"q1_ms": 0.0313,
|
| 91 |
+
"q3_ms": 0.0333,
|
| 92 |
+
"iqr_ms": 0.0019,
|
| 93 |
+
"outliers": 16,
|
| 94 |
+
"iterations": 200,
|
| 95 |
+
"refMeanMs": 0.0772
|
| 96 |
+
},
|
| 97 |
+
"verified": true
|
| 98 |
+
},
|
| 99 |
+
{
|
| 100 |
+
"workload": "GrpoLossBenchmark.grpo_loss_fwd_batch128_seqlen08192_compiled",
|
| 101 |
+
"timingResults": {
|
| 102 |
+
"mean_ms": 0.0331,
|
| 103 |
+
"std_ms": 0.0032,
|
| 104 |
+
"min_ms": 0.0295,
|
| 105 |
+
"max_ms": 0.0535,
|
| 106 |
+
"q1_ms": 0.0316,
|
| 107 |
+
"q3_ms": 0.0331,
|
| 108 |
+
"iqr_ms": 0.0015,
|
| 109 |
+
"outliers": 19,
|
| 110 |
+
"iterations": 200,
|
| 111 |
+
"refMeanMs": 0.0767
|
| 112 |
+
},
|
| 113 |
+
"verified": true
|
| 114 |
+
},
|
| 115 |
+
{
|
| 116 |
+
"workload": "GrpoLossBenchmark.grpo_loss_fwd_batch16_seqlen01024_compiled",
|
| 117 |
+
"timingResults": {
|
| 118 |
+
"mean_ms": 0.033,
|
| 119 |
+
"std_ms": 0.0032,
|
| 120 |
+
"min_ms": 0.029,
|
| 121 |
+
"max_ms": 0.051,
|
| 122 |
+
"q1_ms": 0.0315,
|
| 123 |
+
"q3_ms": 0.0332,
|
| 124 |
+
"iqr_ms": 0.0016,
|
| 125 |
+
"outliers": 17,
|
| 126 |
+
"iterations": 200,
|
| 127 |
+
"refMeanMs": 0.0845
|
| 128 |
+
},
|
| 129 |
+
"verified": true
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"workload": "GrpoLossBenchmark.grpo_loss_fwd_batch32_seqlen02048_compiled",
|
| 133 |
+
"timingResults": {
|
| 134 |
+
"mean_ms": 0.0339,
|
| 135 |
+
"std_ms": 0.006,
|
| 136 |
+
"min_ms": 0.03,
|
| 137 |
+
"max_ms": 0.0674,
|
| 138 |
+
"q1_ms": 0.0314,
|
| 139 |
+
"q3_ms": 0.0331,
|
| 140 |
+
"iqr_ms": 0.0017,
|
| 141 |
+
"outliers": 23,
|
| 142 |
+
"iterations": 200,
|
| 143 |
+
"refMeanMs": 0.1052
|
| 144 |
+
},
|
| 145 |
+
"verified": true
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"workload": "GrpoLossBenchmark.grpo_loss_fwd_batch64_seqlen04096_compiled",
|
| 149 |
+
"timingResults": {
|
| 150 |
+
"mean_ms": 0.034,
|
| 151 |
+
"std_ms": 0.004,
|
| 152 |
+
"min_ms": 0.031,
|
| 153 |
+
"max_ms": 0.0623,
|
| 154 |
+
"q1_ms": 0.0323,
|
| 155 |
+
"q3_ms": 0.0339,
|
| 156 |
+
"iqr_ms": 0.0016,
|
| 157 |
+
"outliers": 20,
|
| 158 |
+
"iterations": 200,
|
| 159 |
+
"refMeanMs": 0.0796
|
| 160 |
+
},
|
| 161 |
+
"verified": true
|
| 162 |
+
}
|
| 163 |
+
],
|
| 164 |
+
"machineInfo": {
|
| 165 |
+
"gpu": "NVIDIA H100 80GB HBM3",
|
| 166 |
+
"backend": "CUDA 13.0",
|
| 167 |
+
"pytorchVersion": "2.11.0+cu130",
|
| 168 |
+
"os": "Linux 6.11.0-1016-nvidia",
|
| 169 |
+
"cpu": "x86_64"
|
| 170 |
+
},
|
| 171 |
+
"kernelCommitSha": "ad285d68b8c8c0ff",
|
| 172 |
+
"benchmarkScriptPath": "benchmarks",
|
| 173 |
+
"benchmarkScriptSha": "ff35d63fbca37cfcbf5c94f067c930adc2bd0043ce6788f286dbad5a4f9b9d4a"
|
| 174 |
+
}
|
benchmark_results/grpo_loss_eager/grpo_loss_eager_dark_animation.svg
ADDED
|
|
benchmark_results/grpo_loss_eager/grpo_loss_eager_dark_latency.svg
ADDED
|
|
benchmark_results/grpo_loss_eager/grpo_loss_eager_dark_throughput.svg
ADDED
|
|
benchmark_results/grpo_loss_eager/grpo_loss_eager_light_animation.svg
ADDED
|
|
benchmark_results/grpo_loss_eager/grpo_loss_eager_light_latency.svg
ADDED
|
|
benchmark_results/grpo_loss_eager/grpo_loss_eager_light_throughput.svg
ADDED
|
|
benchmark_results/grpo_loss_eager/results.json
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": [
|
| 3 |
+
{
|
| 4 |
+
"workload": "GrpoLossBenchmark.grpo_loss_batch128_seqlen02781_eager",
|
| 5 |
+
"timingResults": {
|
| 6 |
+
"mean_ms": 0.0313,
|
| 7 |
+
"std_ms": 0.0029,
|
| 8 |
+
"min_ms": 0.0281,
|
| 9 |
+
"max_ms": 0.0482,
|
| 10 |
+
"q1_ms": 0.03,
|
| 11 |
+
"q3_ms": 0.0314,
|
| 12 |
+
"iqr_ms": 0.0013,
|
| 13 |
+
"outliers": 16,
|
| 14 |
+
"iterations": 200,
|
| 15 |
+
"refMeanMs": 0.6643
|
| 16 |
+
},
|
| 17 |
+
"verified": true
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"workload": "GrpoLossBenchmark.grpo_loss_batch128_seqlen08192_eager",
|
| 21 |
+
"timingResults": {
|
| 22 |
+
"mean_ms": 0.0309,
|
| 23 |
+
"std_ms": 0.0031,
|
| 24 |
+
"min_ms": 0.0285,
|
| 25 |
+
"max_ms": 0.0477,
|
| 26 |
+
"q1_ms": 0.0298,
|
| 27 |
+
"q3_ms": 0.0306,
|
| 28 |
+
"iqr_ms": 0.0008,
|
| 29 |
+
"outliers": 19,
|
| 30 |
+
"iterations": 200,
|
| 31 |
+
"refMeanMs": 0.5961
|
| 32 |
+
},
|
| 33 |
+
"verified": true
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"workload": "GrpoLossBenchmark.grpo_loss_batch16_seqlen01024_eager",
|
| 37 |
+
"timingResults": {
|
| 38 |
+
"mean_ms": 0.0315,
|
| 39 |
+
"std_ms": 0.0033,
|
| 40 |
+
"min_ms": 0.0293,
|
| 41 |
+
"max_ms": 0.0507,
|
| 42 |
+
"q1_ms": 0.0302,
|
| 43 |
+
"q3_ms": 0.0311,
|
| 44 |
+
"iqr_ms": 0.0009,
|
| 45 |
+
"outliers": 23,
|
| 46 |
+
"iterations": 200,
|
| 47 |
+
"refMeanMs": 0.6132
|
| 48 |
+
},
|
| 49 |
+
"verified": true
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"workload": "GrpoLossBenchmark.grpo_loss_batch32_seqlen02048_eager",
|
| 53 |
+
"timingResults": {
|
| 54 |
+
"mean_ms": 0.0302,
|
| 55 |
+
"std_ms": 0.0029,
|
| 56 |
+
"min_ms": 0.028,
|
| 57 |
+
"max_ms": 0.0467,
|
| 58 |
+
"q1_ms": 0.029,
|
| 59 |
+
"q3_ms": 0.0299,
|
| 60 |
+
"iqr_ms": 0.0008,
|
| 61 |
+
"outliers": 20,
|
| 62 |
+
"iterations": 200,
|
| 63 |
+
"refMeanMs": 0.6043
|
| 64 |
+
},
|
| 65 |
+
"verified": true
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"workload": "GrpoLossBenchmark.grpo_loss_batch64_seqlen04096_eager",
|
| 69 |
+
"timingResults": {
|
| 70 |
+
"mean_ms": 0.0295,
|
| 71 |
+
"std_ms": 0.003,
|
| 72 |
+
"min_ms": 0.0268,
|
| 73 |
+
"max_ms": 0.0465,
|
| 74 |
+
"q1_ms": 0.0279,
|
| 75 |
+
"q3_ms": 0.03,
|
| 76 |
+
"iqr_ms": 0.002,
|
| 77 |
+
"outliers": 12,
|
| 78 |
+
"iterations": 200,
|
| 79 |
+
"refMeanMs": 0.5798
|
| 80 |
+
},
|
| 81 |
+
"verified": true
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"workload": "GrpoLossBenchmark.grpo_loss_fwd_batch128_seqlen02781_eager",
|
| 85 |
+
"timingResults": {
|
| 86 |
+
"mean_ms": 0.0306,
|
| 87 |
+
"std_ms": 0.0032,
|
| 88 |
+
"min_ms": 0.0281,
|
| 89 |
+
"max_ms": 0.0513,
|
| 90 |
+
"q1_ms": 0.0293,
|
| 91 |
+
"q3_ms": 0.0302,
|
| 92 |
+
"iqr_ms": 0.0009,
|
| 93 |
+
"outliers": 24,
|
| 94 |
+
"iterations": 200,
|
| 95 |
+
"refMeanMs": 0.1716
|
| 96 |
+
},
|
| 97 |
+
"verified": true
|
| 98 |
+
},
|
| 99 |
+
{
|
| 100 |
+
"workload": "GrpoLossBenchmark.grpo_loss_fwd_batch128_seqlen08192_eager",
|
| 101 |
+
"timingResults": {
|
| 102 |
+
"mean_ms": 0.0302,
|
| 103 |
+
"std_ms": 0.0031,
|
| 104 |
+
"min_ms": 0.0284,
|
| 105 |
+
"max_ms": 0.0594,
|
| 106 |
+
"q1_ms": 0.0291,
|
| 107 |
+
"q3_ms": 0.0299,
|
| 108 |
+
"iqr_ms": 0.0008,
|
| 109 |
+
"outliers": 21,
|
| 110 |
+
"iterations": 200,
|
| 111 |
+
"refMeanMs": 0.1701
|
| 112 |
+
},
|
| 113 |
+
"verified": true
|
| 114 |
+
},
|
| 115 |
+
{
|
| 116 |
+
"workload": "GrpoLossBenchmark.grpo_loss_fwd_batch16_seqlen01024_eager",
|
| 117 |
+
"timingResults": {
|
| 118 |
+
"mean_ms": 0.0306,
|
| 119 |
+
"std_ms": 0.0027,
|
| 120 |
+
"min_ms": 0.0286,
|
| 121 |
+
"max_ms": 0.0455,
|
| 122 |
+
"q1_ms": 0.0294,
|
| 123 |
+
"q3_ms": 0.0304,
|
| 124 |
+
"iqr_ms": 0.001,
|
| 125 |
+
"outliers": 16,
|
| 126 |
+
"iterations": 200,
|
| 127 |
+
"refMeanMs": 0.1741
|
| 128 |
+
},
|
| 129 |
+
"verified": true
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"workload": "GrpoLossBenchmark.grpo_loss_fwd_batch32_seqlen02048_eager",
|
| 133 |
+
"timingResults": {
|
| 134 |
+
"mean_ms": 0.0299,
|
| 135 |
+
"std_ms": 0.0029,
|
| 136 |
+
"min_ms": 0.0269,
|
| 137 |
+
"max_ms": 0.0488,
|
| 138 |
+
"q1_ms": 0.0287,
|
| 139 |
+
"q3_ms": 0.0301,
|
| 140 |
+
"iqr_ms": 0.0015,
|
| 141 |
+
"outliers": 14,
|
| 142 |
+
"iterations": 200,
|
| 143 |
+
"refMeanMs": 0.1647
|
| 144 |
+
},
|
| 145 |
+
"verified": true
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"workload": "GrpoLossBenchmark.grpo_loss_fwd_batch64_seqlen04096_eager",
|
| 149 |
+
"timingResults": {
|
| 150 |
+
"mean_ms": 0.0314,
|
| 151 |
+
"std_ms": 0.0028,
|
| 152 |
+
"min_ms": 0.0289,
|
| 153 |
+
"max_ms": 0.0465,
|
| 154 |
+
"q1_ms": 0.0301,
|
| 155 |
+
"q3_ms": 0.0312,
|
| 156 |
+
"iqr_ms": 0.0011,
|
| 157 |
+
"outliers": 22,
|
| 158 |
+
"iterations": 200,
|
| 159 |
+
"refMeanMs": 0.1751
|
| 160 |
+
},
|
| 161 |
+
"verified": true
|
| 162 |
+
}
|
| 163 |
+
],
|
| 164 |
+
"machineInfo": {
|
| 165 |
+
"gpu": "NVIDIA H100 80GB HBM3",
|
| 166 |
+
"backend": "CUDA 13.0",
|
| 167 |
+
"pytorchVersion": "2.11.0+cu130",
|
| 168 |
+
"os": "Linux 6.11.0-1016-nvidia",
|
| 169 |
+
"cpu": "x86_64"
|
| 170 |
+
},
|
| 171 |
+
"kernelCommitSha": "87ec9b61421d0121",
|
| 172 |
+
"benchmarkScriptPath": "benchmarks",
|
| 173 |
+
"benchmarkScriptSha": "ff35d63fbca37cfcbf5c94f067c930adc2bd0043ce6788f286dbad5a4f9b9d4a"
|
| 174 |
+
}
|
benchmark_results/reverse_kl_compiled/results.json
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": [
|
| 3 |
+
{
|
| 4 |
+
"workload": "ReverseKLBenchmark.reverse_kl_batch01_seqlen064_vocab248320_compiled",
|
| 5 |
+
"timingResults": {
|
| 6 |
+
"mean_ms": 0.1039,
|
| 7 |
+
"std_ms": 0.0035,
|
| 8 |
+
"min_ms": 0.1,
|
| 9 |
+
"max_ms": 0.1229,
|
| 10 |
+
"q1_ms": 0.1018,
|
| 11 |
+
"q3_ms": 0.104,
|
| 12 |
+
"iqr_ms": 0.0022,
|
| 13 |
+
"outliers": 28,
|
| 14 |
+
"iterations": 200,
|
| 15 |
+
"refMeanMs": 0.2322
|
| 16 |
+
},
|
| 17 |
+
"verified": true
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"workload": "ReverseKLBenchmark.reverse_kl_batch02_seqlen128_vocab248320_compiled",
|
| 21 |
+
"timingResults": {
|
| 22 |
+
"mean_ms": 0.2483,
|
| 23 |
+
"std_ms": 0.0035,
|
| 24 |
+
"min_ms": 0.2418,
|
| 25 |
+
"max_ms": 0.2612,
|
| 26 |
+
"q1_ms": 0.2457,
|
| 27 |
+
"q3_ms": 0.2513,
|
| 28 |
+
"iqr_ms": 0.0057,
|
| 29 |
+
"outliers": 2,
|
| 30 |
+
"iterations": 200,
|
| 31 |
+
"refMeanMs": 0.6455
|
| 32 |
+
},
|
| 33 |
+
"verified": true
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"workload": "ReverseKLBenchmark.reverse_kl_batch04_seqlen256_vocab248320_compiled",
|
| 37 |
+
"timingResults": {
|
| 38 |
+
"mean_ms": 0.8322,
|
| 39 |
+
"std_ms": 0.0044,
|
| 40 |
+
"min_ms": 0.8232,
|
| 41 |
+
"max_ms": 0.8623,
|
| 42 |
+
"q1_ms": 0.8303,
|
| 43 |
+
"q3_ms": 0.8335,
|
| 44 |
+
"iqr_ms": 0.0032,
|
| 45 |
+
"outliers": 18,
|
| 46 |
+
"iterations": 200,
|
| 47 |
+
"refMeanMs": 2.2082
|
| 48 |
+
},
|
| 49 |
+
"verified": true
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"workload": "ReverseKLBenchmark.reverse_kl_batch08_seqlen1024_vocab248320_compiled",
|
| 53 |
+
"timingResults": {
|
| 54 |
+
"mean_ms": 6.1083,
|
| 55 |
+
"std_ms": 0.0054,
|
| 56 |
+
"min_ms": 6.097,
|
| 57 |
+
"max_ms": 6.1513,
|
| 58 |
+
"q1_ms": 6.1054,
|
| 59 |
+
"q3_ms": 6.11,
|
| 60 |
+
"iqr_ms": 0.0046,
|
| 61 |
+
"outliers": 13,
|
| 62 |
+
"iterations": 200,
|
| 63 |
+
"refMeanMs": 16.4779
|
| 64 |
+
},
|
| 65 |
+
"verified": true
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"workload": "ReverseKLBenchmark.reverse_kl_batch08_seqlen512_vocab248320_compiled",
|
| 69 |
+
"timingResults": {
|
| 70 |
+
"mean_ms": 3.0861,
|
| 71 |
+
"std_ms": 0.0045,
|
| 72 |
+
"min_ms": 3.0769,
|
| 73 |
+
"max_ms": 3.1155,
|
| 74 |
+
"q1_ms": 3.0832,
|
| 75 |
+
"q3_ms": 3.0883,
|
| 76 |
+
"iqr_ms": 0.0051,
|
| 77 |
+
"outliers": 5,
|
| 78 |
+
"iterations": 200,
|
| 79 |
+
"refMeanMs": 8.3849
|
| 80 |
+
},
|
| 81 |
+
"verified": true
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"workload": "ReverseKLBenchmark.reverse_kl_batch08_seqlen981_vocab248320_compiled",
|
| 85 |
+
"timingResults": {
|
| 86 |
+
"mean_ms": 5.8622,
|
| 87 |
+
"std_ms": 0.0044,
|
| 88 |
+
"min_ms": 5.8544,
|
| 89 |
+
"max_ms": 5.8821,
|
| 90 |
+
"q1_ms": 5.859,
|
| 91 |
+
"q3_ms": 5.8646,
|
| 92 |
+
"iqr_ms": 0.0056,
|
| 93 |
+
"outliers": 6,
|
| 94 |
+
"iterations": 200,
|
| 95 |
+
"refMeanMs": 15.8101
|
| 96 |
+
},
|
| 97 |
+
"verified": true
|
| 98 |
+
},
|
| 99 |
+
{
|
| 100 |
+
"workload": "ReverseKLBenchmark.reverse_kl_fwd_batch01_seqlen064_vocab248320_compiled",
|
| 101 |
+
"timingResults": {
|
| 102 |
+
"mean_ms": 0.0657,
|
| 103 |
+
"std_ms": 0.0041,
|
| 104 |
+
"min_ms": 0.0619,
|
| 105 |
+
"max_ms": 0.093,
|
| 106 |
+
"q1_ms": 0.0635,
|
| 107 |
+
"q3_ms": 0.0656,
|
| 108 |
+
"iqr_ms": 0.0021,
|
| 109 |
+
"outliers": 24,
|
| 110 |
+
"iterations": 200,
|
| 111 |
+
"refMeanMs": 0.1434
|
| 112 |
+
},
|
| 113 |
+
"verified": true
|
| 114 |
+
},
|
| 115 |
+
{
|
| 116 |
+
"workload": "ReverseKLBenchmark.reverse_kl_fwd_batch02_seqlen128_vocab248320_compiled",
|
| 117 |
+
"timingResults": {
|
| 118 |
+
"mean_ms": 0.1234,
|
| 119 |
+
"std_ms": 0.0041,
|
| 120 |
+
"min_ms": 0.1187,
|
| 121 |
+
"max_ms": 0.1464,
|
| 122 |
+
"q1_ms": 0.1208,
|
| 123 |
+
"q3_ms": 0.1244,
|
| 124 |
+
"iqr_ms": 0.0036,
|
| 125 |
+
"outliers": 16,
|
| 126 |
+
"iterations": 200,
|
| 127 |
+
"refMeanMs": 0.3277
|
| 128 |
+
},
|
| 129 |
+
"verified": true
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"workload": "ReverseKLBenchmark.reverse_kl_fwd_batch04_seqlen256_vocab248320_compiled",
|
| 133 |
+
"timingResults": {
|
| 134 |
+
"mean_ms": 0.3764,
|
| 135 |
+
"std_ms": 0.0037,
|
| 136 |
+
"min_ms": 0.3699,
|
| 137 |
+
"max_ms": 0.3926,
|
| 138 |
+
"q1_ms": 0.3733,
|
| 139 |
+
"q3_ms": 0.3787,
|
| 140 |
+
"iqr_ms": 0.0054,
|
| 141 |
+
"outliers": 2,
|
| 142 |
+
"iterations": 200,
|
| 143 |
+
"refMeanMs": 0.9228
|
| 144 |
+
},
|
| 145 |
+
"verified": true
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"workload": "ReverseKLBenchmark.reverse_kl_fwd_batch08_seqlen1024_vocab248320_compiled",
|
| 149 |
+
"timingResults": {
|
| 150 |
+
"mean_ms": 2.658,
|
| 151 |
+
"std_ms": 0.0089,
|
| 152 |
+
"min_ms": 2.6359,
|
| 153 |
+
"max_ms": 2.6859,
|
| 154 |
+
"q1_ms": 2.6524,
|
| 155 |
+
"q3_ms": 2.663,
|
| 156 |
+
"iqr_ms": 0.0106,
|
| 157 |
+
"outliers": 4,
|
| 158 |
+
"iterations": 200,
|
| 159 |
+
"refMeanMs": 6.6033
|
| 160 |
+
},
|
| 161 |
+
"verified": true
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
"workload": "ReverseKLBenchmark.reverse_kl_fwd_batch08_seqlen512_vocab248320_compiled",
|
| 165 |
+
"timingResults": {
|
| 166 |
+
"mean_ms": 1.38,
|
| 167 |
+
"std_ms": 0.0035,
|
| 168 |
+
"min_ms": 1.37,
|
| 169 |
+
"max_ms": 1.3924,
|
| 170 |
+
"q1_ms": 1.3776,
|
| 171 |
+
"q3_ms": 1.3818,
|
| 172 |
+
"iqr_ms": 0.0042,
|
| 173 |
+
"outliers": 6,
|
| 174 |
+
"iterations": 200,
|
| 175 |
+
"refMeanMs": 3.3854
|
| 176 |
+
},
|
| 177 |
+
"verified": true
|
| 178 |
+
},
|
| 179 |
+
{
|
| 180 |
+
"workload": "ReverseKLBenchmark.reverse_kl_fwd_batch08_seqlen981_vocab248320_compiled",
|
| 181 |
+
"timingResults": {
|
| 182 |
+
"mean_ms": 2.5422,
|
| 183 |
+
"std_ms": 0.0091,
|
| 184 |
+
"min_ms": 2.5286,
|
| 185 |
+
"max_ms": 2.5773,
|
| 186 |
+
"q1_ms": 2.5356,
|
| 187 |
+
"q3_ms": 2.5455,
|
| 188 |
+
"iqr_ms": 0.0099,
|
| 189 |
+
"outliers": 9,
|
| 190 |
+
"iterations": 200,
|
| 191 |
+
"refMeanMs": 6.2191
|
| 192 |
+
},
|
| 193 |
+
"verified": true
|
| 194 |
+
}
|
| 195 |
+
],
|
| 196 |
+
"machineInfo": {
|
| 197 |
+
"gpu": "NVIDIA H100 80GB HBM3",
|
| 198 |
+
"backend": "CUDA 13.0",
|
| 199 |
+
"pytorchVersion": "2.11.0+cu130",
|
| 200 |
+
"os": "Linux 6.11.0-1016-nvidia",
|
| 201 |
+
"cpu": "x86_64"
|
| 202 |
+
},
|
| 203 |
+
"kernelCommitSha": "ca5cbc20b4d2c7d8",
|
| 204 |
+
"benchmarkScriptPath": "benchmarks",
|
| 205 |
+
"benchmarkScriptSha": "690eea1f54f31bef1ad248380201005fd667d4b9c535f92f06eb6a5a33380d22"
|
| 206 |
+
}
|
benchmark_results/reverse_kl_compiled/reverse_kl_compiled_dark_animation.svg
ADDED
|
|
benchmark_results/reverse_kl_compiled/reverse_kl_compiled_dark_latency.svg
ADDED
|
|
benchmark_results/reverse_kl_compiled/reverse_kl_compiled_dark_throughput.svg
ADDED
|
|
benchmark_results/reverse_kl_compiled/reverse_kl_compiled_light_animation.svg
ADDED
|
|
benchmark_results/reverse_kl_compiled/reverse_kl_compiled_light_latency.svg
ADDED
|
|
benchmark_results/reverse_kl_compiled/reverse_kl_compiled_light_throughput.svg
ADDED
|
|
benchmark_results/reverse_kl_eager/results.json
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"results": [
|
| 3 |
+
{
|
| 4 |
+
"workload": "ReverseKLBenchmark.reverse_kl_batch01_seqlen064_vocab248320_eager",
|
| 5 |
+
"timingResults": {
|
| 6 |
+
"mean_ms": 0.1029,
|
| 7 |
+
"std_ms": 0.0032,
|
| 8 |
+
"min_ms": 0.0982,
|
| 9 |
+
"max_ms": 0.1129,
|
| 10 |
+
"q1_ms": 0.101,
|
| 11 |
+
"q3_ms": 0.1036,
|
| 12 |
+
"iqr_ms": 0.0026,
|
| 13 |
+
"outliers": 27,
|
| 14 |
+
"iterations": 200,
|
| 15 |
+
"refMeanMs": 0.5293
|
| 16 |
+
},
|
| 17 |
+
"verified": true
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"workload": "ReverseKLBenchmark.reverse_kl_batch02_seqlen128_vocab248320_eager",
|
| 21 |
+
"timingResults": {
|
| 22 |
+
"mean_ms": 0.248,
|
| 23 |
+
"std_ms": 0.0037,
|
| 24 |
+
"min_ms": 0.2417,
|
| 25 |
+
"max_ms": 0.2592,
|
| 26 |
+
"q1_ms": 0.2451,
|
| 27 |
+
"q3_ms": 0.251,
|
| 28 |
+
"iqr_ms": 0.0058,
|
| 29 |
+
"outliers": 0,
|
| 30 |
+
"iterations": 200,
|
| 31 |
+
"refMeanMs": 1.624
|
| 32 |
+
},
|
| 33 |
+
"verified": true
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"workload": "ReverseKLBenchmark.reverse_kl_batch04_seqlen256_vocab248320_eager",
|
| 37 |
+
"timingResults": {
|
| 38 |
+
"mean_ms": 0.8321,
|
| 39 |
+
"std_ms": 0.0035,
|
| 40 |
+
"min_ms": 0.8234,
|
| 41 |
+
"max_ms": 0.854,
|
| 42 |
+
"q1_ms": 0.8306,
|
| 43 |
+
"q3_ms": 0.8335,
|
| 44 |
+
"iqr_ms": 0.003,
|
| 45 |
+
"outliers": 20,
|
| 46 |
+
"iterations": 200,
|
| 47 |
+
"refMeanMs": 6.174
|
| 48 |
+
},
|
| 49 |
+
"verified": true
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"workload": "ReverseKLBenchmark.reverse_kl_batch08_seqlen1024_vocab248320_eager",
|
| 53 |
+
"timingResults": {
|
| 54 |
+
"mean_ms": 6.1046,
|
| 55 |
+
"std_ms": 0.0041,
|
| 56 |
+
"min_ms": 6.0961,
|
| 57 |
+
"max_ms": 6.1376,
|
| 58 |
+
"q1_ms": 6.1023,
|
| 59 |
+
"q3_ms": 6.106,
|
| 60 |
+
"iqr_ms": 0.0037,
|
| 61 |
+
"outliers": 9,
|
| 62 |
+
"iterations": 200,
|
| 63 |
+
"refMeanMs": 48.4051
|
| 64 |
+
},
|
| 65 |
+
"verified": true
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"workload": "ReverseKLBenchmark.reverse_kl_batch08_seqlen512_vocab248320_eager",
|
| 69 |
+
"timingResults": {
|
| 70 |
+
"mean_ms": 3.0816,
|
| 71 |
+
"std_ms": 0.0035,
|
| 72 |
+
"min_ms": 3.0743,
|
| 73 |
+
"max_ms": 3.0939,
|
| 74 |
+
"q1_ms": 3.0794,
|
| 75 |
+
"q3_ms": 3.0832,
|
| 76 |
+
"iqr_ms": 0.0038,
|
| 77 |
+
"outliers": 8,
|
| 78 |
+
"iterations": 200,
|
| 79 |
+
"refMeanMs": 24.3385
|
| 80 |
+
},
|
| 81 |
+
"verified": true
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"workload": "ReverseKLBenchmark.reverse_kl_batch08_seqlen981_vocab248320_eager",
|
| 85 |
+
"timingResults": {
|
| 86 |
+
"mean_ms": 5.8549,
|
| 87 |
+
"std_ms": 0.0045,
|
| 88 |
+
"min_ms": 5.8459,
|
| 89 |
+
"max_ms": 5.8819,
|
| 90 |
+
"q1_ms": 5.8524,
|
| 91 |
+
"q3_ms": 5.8561,
|
| 92 |
+
"iqr_ms": 0.0037,
|
| 93 |
+
"outliers": 14,
|
| 94 |
+
"iterations": 200,
|
| 95 |
+
"refMeanMs": 46.4274
|
| 96 |
+
},
|
| 97 |
+
"verified": true
|
| 98 |
+
},
|
| 99 |
+
{
|
| 100 |
+
"workload": "ReverseKLBenchmark.reverse_kl_fwd_batch01_seqlen064_vocab248320_eager",
|
| 101 |
+
"timingResults": {
|
| 102 |
+
"mean_ms": 0.0638,
|
| 103 |
+
"std_ms": 0.0027,
|
| 104 |
+
"min_ms": 0.0604,
|
| 105 |
+
"max_ms": 0.0787,
|
| 106 |
+
"q1_ms": 0.0624,
|
| 107 |
+
"q3_ms": 0.064,
|
| 108 |
+
"iqr_ms": 0.0016,
|
| 109 |
+
"outliers": 20,
|
| 110 |
+
"iterations": 200,
|
| 111 |
+
"refMeanMs": 0.2532
|
| 112 |
+
},
|
| 113 |
+
"verified": true
|
| 114 |
+
},
|
| 115 |
+
{
|
| 116 |
+
"workload": "ReverseKLBenchmark.reverse_kl_fwd_batch02_seqlen128_vocab248320_eager",
|
| 117 |
+
"timingResults": {
|
| 118 |
+
"mean_ms": 0.1217,
|
| 119 |
+
"std_ms": 0.0038,
|
| 120 |
+
"min_ms": 0.1166,
|
| 121 |
+
"max_ms": 0.1428,
|
| 122 |
+
"q1_ms": 0.1193,
|
| 123 |
+
"q3_ms": 0.1227,
|
| 124 |
+
"iqr_ms": 0.0034,
|
| 125 |
+
"outliers": 19,
|
| 126 |
+
"iterations": 200,
|
| 127 |
+
"refMeanMs": 0.7671
|
| 128 |
+
},
|
| 129 |
+
"verified": true
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"workload": "ReverseKLBenchmark.reverse_kl_fwd_batch04_seqlen256_vocab248320_eager",
|
| 133 |
+
"timingResults": {
|
| 134 |
+
"mean_ms": 0.3753,
|
| 135 |
+
"std_ms": 0.0033,
|
| 136 |
+
"min_ms": 0.3695,
|
| 137 |
+
"max_ms": 0.3843,
|
| 138 |
+
"q1_ms": 0.3726,
|
| 139 |
+
"q3_ms": 0.3779,
|
| 140 |
+
"iqr_ms": 0.0053,
|
| 141 |
+
"outliers": 0,
|
| 142 |
+
"iterations": 200,
|
| 143 |
+
"refMeanMs": 2.869
|
| 144 |
+
},
|
| 145 |
+
"verified": true
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"workload": "ReverseKLBenchmark.reverse_kl_fwd_batch08_seqlen1024_vocab248320_eager",
|
| 149 |
+
"timingResults": {
|
| 150 |
+
"mean_ms": 2.6484,
|
| 151 |
+
"std_ms": 0.0065,
|
| 152 |
+
"min_ms": 2.6364,
|
| 153 |
+
"max_ms": 2.7044,
|
| 154 |
+
"q1_ms": 2.6449,
|
| 155 |
+
"q3_ms": 2.6515,
|
| 156 |
+
"iqr_ms": 0.0067,
|
| 157 |
+
"outliers": 3,
|
| 158 |
+
"iterations": 200,
|
| 159 |
+
"refMeanMs": 22.3336
|
| 160 |
+
},
|
| 161 |
+
"verified": true
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
"workload": "ReverseKLBenchmark.reverse_kl_fwd_batch08_seqlen512_vocab248320_eager",
|
| 165 |
+
"timingResults": {
|
| 166 |
+
"mean_ms": 1.365,
|
| 167 |
+
"std_ms": 0.0046,
|
| 168 |
+
"min_ms": 1.3548,
|
| 169 |
+
"max_ms": 1.3865,
|
| 170 |
+
"q1_ms": 1.3618,
|
| 171 |
+
"q3_ms": 1.3675,
|
| 172 |
+
"iqr_ms": 0.0057,
|
| 173 |
+
"outliers": 4,
|
| 174 |
+
"iterations": 200,
|
| 175 |
+
"refMeanMs": 11.2401
|
| 176 |
+
},
|
| 177 |
+
"verified": true
|
| 178 |
+
},
|
| 179 |
+
{
|
| 180 |
+
"workload": "ReverseKLBenchmark.reverse_kl_fwd_batch08_seqlen981_vocab248320_eager",
|
| 181 |
+
"timingResults": {
|
| 182 |
+
"mean_ms": 2.5316,
|
| 183 |
+
"std_ms": 0.0059,
|
| 184 |
+
"min_ms": 2.5203,
|
| 185 |
+
"max_ms": 2.5523,
|
| 186 |
+
"q1_ms": 2.5272,
|
| 187 |
+
"q3_ms": 2.5355,
|
| 188 |
+
"iqr_ms": 0.0083,
|
| 189 |
+
"outliers": 3,
|
| 190 |
+
"iterations": 200,
|
| 191 |
+
"refMeanMs": 21.4099
|
| 192 |
+
},
|
| 193 |
+
"verified": true
|
| 194 |
+
}
|
| 195 |
+
],
|
| 196 |
+
"machineInfo": {
|
| 197 |
+
"gpu": "NVIDIA H100 80GB HBM3",
|
| 198 |
+
"backend": "CUDA 13.0",
|
| 199 |
+
"pytorchVersion": "2.11.0+cu130",
|
| 200 |
+
"os": "Linux 6.11.0-1016-nvidia",
|
| 201 |
+
"cpu": "x86_64"
|
| 202 |
+
},
|
| 203 |
+
"kernelCommitSha": "3e023eb5121761b8",
|
| 204 |
+
"benchmarkScriptPath": "benchmarks",
|
| 205 |
+
"benchmarkScriptSha": "690eea1f54f31bef1ad248380201005fd667d4b9c535f92f06eb6a5a33380d22"
|
| 206 |
+
}
|
benchmark_results/reverse_kl_eager/reverse_kl_eager_dark_animation.svg
ADDED
|
|
benchmark_results/reverse_kl_eager/reverse_kl_eager_dark_latency.svg
ADDED
|
|
benchmark_results/reverse_kl_eager/reverse_kl_eager_dark_throughput.svg
ADDED
|
|
benchmark_results/reverse_kl_eager/reverse_kl_eager_light_animation.svg
ADDED
|
|
benchmark_results/reverse_kl_eager/reverse_kl_eager_light_latency.svg
ADDED
|
|
benchmark_results/reverse_kl_eager/reverse_kl_eager_light_throughput.svg
ADDED
|
|