Pramodith commited on
Commit
55142ca
·
verified ·
1 Parent(s): 43849eb

Upload folder using huggingface_hub

Browse files
Files changed (42) hide show
  1. benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_dark_animation.svg +123 -0
  2. benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_dark_latency.svg +0 -0
  3. benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_dark_throughput.svg +0 -0
  4. benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_light_animation.svg +123 -0
  5. benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_light_latency.svg +0 -0
  6. benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_light_throughput.svg +0 -0
  7. benchmark_results/bnpo_loss_compiled/results.json +206 -0
  8. benchmark_results/bnpo_loss_eager/bnpo_loss_eager_dark_animation.svg +123 -0
  9. benchmark_results/bnpo_loss_eager/bnpo_loss_eager_dark_latency.svg +0 -0
  10. benchmark_results/bnpo_loss_eager/bnpo_loss_eager_dark_throughput.svg +0 -0
  11. benchmark_results/bnpo_loss_eager/bnpo_loss_eager_light_animation.svg +123 -0
  12. benchmark_results/bnpo_loss_eager/bnpo_loss_eager_light_latency.svg +0 -0
  13. benchmark_results/bnpo_loss_eager/bnpo_loss_eager_light_throughput.svg +0 -0
  14. benchmark_results/bnpo_loss_eager/results.json +206 -0
  15. benchmark_results/grpo_loss_compiled/grpo_loss_compiled_dark_animation.svg +105 -0
  16. benchmark_results/grpo_loss_compiled/grpo_loss_compiled_dark_latency.svg +0 -0
  17. benchmark_results/grpo_loss_compiled/grpo_loss_compiled_dark_throughput.svg +0 -0
  18. benchmark_results/grpo_loss_compiled/grpo_loss_compiled_light_animation.svg +105 -0
  19. benchmark_results/grpo_loss_compiled/grpo_loss_compiled_light_latency.svg +0 -0
  20. benchmark_results/grpo_loss_compiled/grpo_loss_compiled_light_throughput.svg +0 -0
  21. benchmark_results/grpo_loss_compiled/results.json +174 -0
  22. benchmark_results/grpo_loss_eager/grpo_loss_eager_dark_animation.svg +105 -0
  23. benchmark_results/grpo_loss_eager/grpo_loss_eager_dark_latency.svg +0 -0
  24. benchmark_results/grpo_loss_eager/grpo_loss_eager_dark_throughput.svg +0 -0
  25. benchmark_results/grpo_loss_eager/grpo_loss_eager_light_animation.svg +105 -0
  26. benchmark_results/grpo_loss_eager/grpo_loss_eager_light_latency.svg +0 -0
  27. benchmark_results/grpo_loss_eager/grpo_loss_eager_light_throughput.svg +0 -0
  28. benchmark_results/grpo_loss_eager/results.json +174 -0
  29. benchmark_results/reverse_kl_compiled/results.json +206 -0
  30. benchmark_results/reverse_kl_compiled/reverse_kl_compiled_dark_animation.svg +123 -0
  31. benchmark_results/reverse_kl_compiled/reverse_kl_compiled_dark_latency.svg +0 -0
  32. benchmark_results/reverse_kl_compiled/reverse_kl_compiled_dark_throughput.svg +0 -0
  33. benchmark_results/reverse_kl_compiled/reverse_kl_compiled_light_animation.svg +123 -0
  34. benchmark_results/reverse_kl_compiled/reverse_kl_compiled_light_latency.svg +0 -0
  35. benchmark_results/reverse_kl_compiled/reverse_kl_compiled_light_throughput.svg +0 -0
  36. benchmark_results/reverse_kl_eager/results.json +206 -0
  37. benchmark_results/reverse_kl_eager/reverse_kl_eager_dark_animation.svg +123 -0
  38. benchmark_results/reverse_kl_eager/reverse_kl_eager_dark_latency.svg +0 -0
  39. benchmark_results/reverse_kl_eager/reverse_kl_eager_dark_throughput.svg +0 -0
  40. benchmark_results/reverse_kl_eager/reverse_kl_eager_light_animation.svg +123 -0
  41. benchmark_results/reverse_kl_eager/reverse_kl_eager_light_latency.svg +0 -0
  42. benchmark_results/reverse_kl_eager/reverse_kl_eager_light_throughput.svg +0 -0
benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_dark_animation.svg ADDED
benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_dark_latency.svg ADDED
benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_dark_throughput.svg ADDED
benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_light_animation.svg ADDED
benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_light_latency.svg ADDED
benchmark_results/bnpo_loss_compiled/bnpo_loss_compiled_light_throughput.svg ADDED
benchmark_results/bnpo_loss_compiled/results.json ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": [
3
+ {
4
+ "workload": "bnpoLossBenchmark.bnpo_loss_batch128_seqlen02781_compiled",
5
+ "timingResults": {
6
+ "mean_ms": 0.0359,
7
+ "std_ms": 0.0038,
8
+ "min_ms": 0.0332,
9
+ "max_ms": 0.0701,
10
+ "q1_ms": 0.0344,
11
+ "q3_ms": 0.0357,
12
+ "iqr_ms": 0.0013,
13
+ "outliers": 20,
14
+ "iterations": 200,
15
+ "refMeanMs": 0.0771
16
+ },
17
+ "verified": true
18
+ },
19
+ {
20
+ "workload": "bnpoLossBenchmark.bnpo_loss_batch128_seqlen08192_compiled",
21
+ "timingResults": {
22
+ "mean_ms": 0.0351,
23
+ "std_ms": 0.0033,
24
+ "min_ms": 0.0327,
25
+ "max_ms": 0.0557,
26
+ "q1_ms": 0.0336,
27
+ "q3_ms": 0.035,
28
+ "iqr_ms": 0.0014,
29
+ "outliers": 14,
30
+ "iterations": 200,
31
+ "refMeanMs": 0.0771
32
+ },
33
+ "verified": true
34
+ },
35
+ {
36
+ "workload": "bnpoLossBenchmark.bnpo_loss_batch16_seqlen01024_compiled",
37
+ "timingResults": {
38
+ "mean_ms": 0.0355,
39
+ "std_ms": 0.0042,
40
+ "min_ms": 0.0331,
41
+ "max_ms": 0.0706,
42
+ "q1_ms": 0.034,
43
+ "q3_ms": 0.0351,
44
+ "iqr_ms": 0.0011,
45
+ "outliers": 21,
46
+ "iterations": 200,
47
+ "refMeanMs": 0.0811
48
+ },
49
+ "verified": true
50
+ },
51
+ {
52
+ "workload": "bnpoLossBenchmark.bnpo_loss_batch16_seqlen02781_compiled",
53
+ "timingResults": {
54
+ "mean_ms": 0.0355,
55
+ "std_ms": 0.004,
56
+ "min_ms": 0.0319,
57
+ "max_ms": 0.0591,
58
+ "q1_ms": 0.0338,
59
+ "q3_ms": 0.0352,
60
+ "iqr_ms": 0.0014,
61
+ "outliers": 24,
62
+ "iterations": 200,
63
+ "refMeanMs": 0.0709
64
+ },
65
+ "verified": true
66
+ },
67
+ {
68
+ "workload": "bnpoLossBenchmark.bnpo_loss_batch32_seqlen02048_compiled",
69
+ "timingResults": {
70
+ "mean_ms": 0.0358,
71
+ "std_ms": 0.0042,
72
+ "min_ms": 0.032,
73
+ "max_ms": 0.0569,
74
+ "q1_ms": 0.0338,
75
+ "q3_ms": 0.0355,
76
+ "iqr_ms": 0.0017,
77
+ "outliers": 27,
78
+ "iterations": 200,
79
+ "refMeanMs": 0.0763
80
+ },
81
+ "verified": true
82
+ },
83
+ {
84
+ "workload": "bnpoLossBenchmark.bnpo_loss_batch64_seqlen04096_compiled",
85
+ "timingResults": {
86
+ "mean_ms": 0.0344,
87
+ "std_ms": 0.0031,
88
+ "min_ms": 0.032,
89
+ "max_ms": 0.0557,
90
+ "q1_ms": 0.0331,
91
+ "q3_ms": 0.0341,
92
+ "iqr_ms": 0.001,
93
+ "outliers": 32,
94
+ "iterations": 200,
95
+ "refMeanMs": 0.0739
96
+ },
97
+ "verified": true
98
+ },
99
+ {
100
+ "workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch128_seqlen02781_compiled",
101
+ "timingResults": {
102
+ "mean_ms": 0.0323,
103
+ "std_ms": 0.0034,
104
+ "min_ms": 0.03,
105
+ "max_ms": 0.053,
106
+ "q1_ms": 0.0311,
107
+ "q3_ms": 0.0318,
108
+ "iqr_ms": 0.0007,
109
+ "outliers": 25,
110
+ "iterations": 200,
111
+ "refMeanMs": 0.0808
112
+ },
113
+ "verified": true
114
+ },
115
+ {
116
+ "workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch128_seqlen08192_compiled",
117
+ "timingResults": {
118
+ "mean_ms": 0.0318,
119
+ "std_ms": 0.0032,
120
+ "min_ms": 0.0293,
121
+ "max_ms": 0.0502,
122
+ "q1_ms": 0.0304,
123
+ "q3_ms": 0.0317,
124
+ "iqr_ms": 0.0013,
125
+ "outliers": 17,
126
+ "iterations": 200,
127
+ "refMeanMs": 0.0845
128
+ },
129
+ "verified": true
130
+ },
131
+ {
132
+ "workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch16_seqlen01024_compiled",
133
+ "timingResults": {
134
+ "mean_ms": 0.0317,
135
+ "std_ms": 0.0031,
136
+ "min_ms": 0.0293,
137
+ "max_ms": 0.0593,
138
+ "q1_ms": 0.0304,
139
+ "q3_ms": 0.0317,
140
+ "iqr_ms": 0.0013,
141
+ "outliers": 17,
142
+ "iterations": 200,
143
+ "refMeanMs": 0.079
144
+ },
145
+ "verified": true
146
+ },
147
+ {
148
+ "workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch16_seqlen02781_compiled",
149
+ "timingResults": {
150
+ "mean_ms": 0.0306,
151
+ "std_ms": 0.0035,
152
+ "min_ms": 0.0279,
153
+ "max_ms": 0.0534,
154
+ "q1_ms": 0.0289,
155
+ "q3_ms": 0.0306,
156
+ "iqr_ms": 0.0017,
157
+ "outliers": 20,
158
+ "iterations": 200,
159
+ "refMeanMs": 0.084
160
+ },
161
+ "verified": true
162
+ },
163
+ {
164
+ "workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch32_seqlen02048_compiled",
165
+ "timingResults": {
166
+ "mean_ms": 0.0305,
167
+ "std_ms": 0.0035,
168
+ "min_ms": 0.0279,
169
+ "max_ms": 0.051,
170
+ "q1_ms": 0.0288,
171
+ "q3_ms": 0.0308,
172
+ "iqr_ms": 0.002,
173
+ "outliers": 15,
174
+ "iterations": 200,
175
+ "refMeanMs": 0.0764
176
+ },
177
+ "verified": true
178
+ },
179
+ {
180
+ "workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch64_seqlen04096_compiled",
181
+ "timingResults": {
182
+ "mean_ms": 0.0315,
183
+ "std_ms": 0.0033,
184
+ "min_ms": 0.0293,
185
+ "max_ms": 0.0543,
186
+ "q1_ms": 0.0302,
187
+ "q3_ms": 0.0311,
188
+ "iqr_ms": 0.0009,
189
+ "outliers": 21,
190
+ "iterations": 200,
191
+ "refMeanMs": 0.0739
192
+ },
193
+ "verified": true
194
+ }
195
+ ],
196
+ "machineInfo": {
197
+ "gpu": "NVIDIA H100 80GB HBM3",
198
+ "backend": "CUDA 13.0",
199
+ "pytorchVersion": "2.11.0+cu130",
200
+ "os": "Linux 6.11.0-1016-nvidia",
201
+ "cpu": "x86_64"
202
+ },
203
+ "kernelCommitSha": "7972ab0e834be24d",
204
+ "benchmarkScriptPath": "benchmarks",
205
+ "benchmarkScriptSha": "68426064f76adff2066ad365f6c97be3fe279bd6b20d025b3dc5614f9b2da449"
206
+ }
benchmark_results/bnpo_loss_eager/bnpo_loss_eager_dark_animation.svg ADDED
benchmark_results/bnpo_loss_eager/bnpo_loss_eager_dark_latency.svg ADDED
benchmark_results/bnpo_loss_eager/bnpo_loss_eager_dark_throughput.svg ADDED
benchmark_results/bnpo_loss_eager/bnpo_loss_eager_light_animation.svg ADDED
benchmark_results/bnpo_loss_eager/bnpo_loss_eager_light_latency.svg ADDED
benchmark_results/bnpo_loss_eager/bnpo_loss_eager_light_throughput.svg ADDED
benchmark_results/bnpo_loss_eager/results.json ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": [
3
+ {
4
+ "workload": "bnpoLossBenchmark.bnpo_loss_batch128_seqlen02781_eager",
5
+ "timingResults": {
6
+ "mean_ms": 0.0358,
7
+ "std_ms": 0.0035,
8
+ "min_ms": 0.0323,
9
+ "max_ms": 0.0536,
10
+ "q1_ms": 0.0342,
11
+ "q3_ms": 0.0358,
12
+ "iqr_ms": 0.0017,
13
+ "outliers": 17,
14
+ "iterations": 200,
15
+ "refMeanMs": 0.5552
16
+ },
17
+ "verified": true
18
+ },
19
+ {
20
+ "workload": "bnpoLossBenchmark.bnpo_loss_batch128_seqlen08192_eager",
21
+ "timingResults": {
22
+ "mean_ms": 0.0344,
23
+ "std_ms": 0.0031,
24
+ "min_ms": 0.0314,
25
+ "max_ms": 0.0537,
26
+ "q1_ms": 0.0329,
27
+ "q3_ms": 0.0345,
28
+ "iqr_ms": 0.0015,
29
+ "outliers": 20,
30
+ "iterations": 200,
31
+ "refMeanMs": 0.6466
32
+ },
33
+ "verified": true
34
+ },
35
+ {
36
+ "workload": "bnpoLossBenchmark.bnpo_loss_batch16_seqlen01024_eager",
37
+ "timingResults": {
38
+ "mean_ms": 0.0345,
39
+ "std_ms": 0.0171,
40
+ "min_ms": 0.0305,
41
+ "max_ms": 0.2718,
42
+ "q1_ms": 0.0319,
43
+ "q3_ms": 0.033,
44
+ "iqr_ms": 0.0011,
45
+ "outliers": 23,
46
+ "iterations": 200,
47
+ "refMeanMs": 0.5868
48
+ },
49
+ "verified": true
50
+ },
51
+ {
52
+ "workload": "bnpoLossBenchmark.bnpo_loss_batch16_seqlen02781_eager",
53
+ "timingResults": {
54
+ "mean_ms": 0.0324,
55
+ "std_ms": 0.0027,
56
+ "min_ms": 0.0301,
57
+ "max_ms": 0.0508,
58
+ "q1_ms": 0.0312,
59
+ "q3_ms": 0.0324,
60
+ "iqr_ms": 0.0012,
61
+ "outliers": 17,
62
+ "iterations": 200,
63
+ "refMeanMs": 0.5832
64
+ },
65
+ "verified": true
66
+ },
67
+ {
68
+ "workload": "bnpoLossBenchmark.bnpo_loss_batch32_seqlen02048_eager",
69
+ "timingResults": {
70
+ "mean_ms": 0.0343,
71
+ "std_ms": 0.0033,
72
+ "min_ms": 0.031,
73
+ "max_ms": 0.0513,
74
+ "q1_ms": 0.0325,
75
+ "q3_ms": 0.0346,
76
+ "iqr_ms": 0.0021,
77
+ "outliers": 19,
78
+ "iterations": 200,
79
+ "refMeanMs": 0.6265
80
+ },
81
+ "verified": true
82
+ },
83
+ {
84
+ "workload": "bnpoLossBenchmark.bnpo_loss_batch64_seqlen04096_eager",
85
+ "timingResults": {
86
+ "mean_ms": 0.0328,
87
+ "std_ms": 0.0029,
88
+ "min_ms": 0.0306,
89
+ "max_ms": 0.0499,
90
+ "q1_ms": 0.0317,
91
+ "q3_ms": 0.0326,
92
+ "iqr_ms": 0.0009,
93
+ "outliers": 20,
94
+ "iterations": 200,
95
+ "refMeanMs": 0.5698
96
+ },
97
+ "verified": true
98
+ },
99
+ {
100
+ "workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch128_seqlen02781_eager",
101
+ "timingResults": {
102
+ "mean_ms": 0.0317,
103
+ "std_ms": 0.0034,
104
+ "min_ms": 0.0285,
105
+ "max_ms": 0.052,
106
+ "q1_ms": 0.0305,
107
+ "q3_ms": 0.0314,
108
+ "iqr_ms": 0.0009,
109
+ "outliers": 22,
110
+ "iterations": 200,
111
+ "refMeanMs": 0.1858
112
+ },
113
+ "verified": true
114
+ },
115
+ {
116
+ "workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch128_seqlen08192_eager",
117
+ "timingResults": {
118
+ "mean_ms": 0.0292,
119
+ "std_ms": 0.0028,
120
+ "min_ms": 0.0273,
121
+ "max_ms": 0.0455,
122
+ "q1_ms": 0.0281,
123
+ "q3_ms": 0.0289,
124
+ "iqr_ms": 0.0008,
125
+ "outliers": 23,
126
+ "iterations": 200,
127
+ "refMeanMs": 0.1633
128
+ },
129
+ "verified": true
130
+ },
131
+ {
132
+ "workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch16_seqlen01024_eager",
133
+ "timingResults": {
134
+ "mean_ms": 0.0311,
135
+ "std_ms": 0.0267,
136
+ "min_ms": 0.0256,
137
+ "max_ms": 0.4049,
138
+ "q1_ms": 0.0276,
139
+ "q3_ms": 0.0295,
140
+ "iqr_ms": 0.0018,
141
+ "outliers": 18,
142
+ "iterations": 200,
143
+ "refMeanMs": 0.1761
144
+ },
145
+ "verified": true
146
+ },
147
+ {
148
+ "workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch16_seqlen02781_eager",
149
+ "timingResults": {
150
+ "mean_ms": 0.0288,
151
+ "std_ms": 0.003,
152
+ "min_ms": 0.027,
153
+ "max_ms": 0.0554,
154
+ "q1_ms": 0.0278,
155
+ "q3_ms": 0.0284,
156
+ "iqr_ms": 0.0006,
157
+ "outliers": 22,
158
+ "iterations": 200,
159
+ "refMeanMs": 0.1755
160
+ },
161
+ "verified": true
162
+ },
163
+ {
164
+ "workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch32_seqlen02048_eager",
165
+ "timingResults": {
166
+ "mean_ms": 0.031,
167
+ "std_ms": 0.0034,
168
+ "min_ms": 0.0281,
169
+ "max_ms": 0.0484,
170
+ "q1_ms": 0.0296,
171
+ "q3_ms": 0.0306,
172
+ "iqr_ms": 0.0009,
173
+ "outliers": 27,
174
+ "iterations": 200,
175
+ "refMeanMs": 0.1533
176
+ },
177
+ "verified": true
178
+ },
179
+ {
180
+ "workload": "bnpoLossBenchmark.bnpo_loss_fwd_batch64_seqlen04096_eager",
181
+ "timingResults": {
182
+ "mean_ms": 0.031,
183
+ "std_ms": 0.0041,
184
+ "min_ms": 0.0286,
185
+ "max_ms": 0.0625,
186
+ "q1_ms": 0.0294,
187
+ "q3_ms": 0.0305,
188
+ "iqr_ms": 0.0011,
189
+ "outliers": 22,
190
+ "iterations": 200,
191
+ "refMeanMs": 0.1678
192
+ },
193
+ "verified": true
194
+ }
195
+ ],
196
+ "machineInfo": {
197
+ "gpu": "NVIDIA H100 80GB HBM3",
198
+ "backend": "CUDA 13.0",
199
+ "pytorchVersion": "2.11.0+cu130",
200
+ "os": "Linux 6.11.0-1016-nvidia",
201
+ "cpu": "x86_64"
202
+ },
203
+ "kernelCommitSha": "84e79b2f3ee3088a",
204
+ "benchmarkScriptPath": "benchmarks",
205
+ "benchmarkScriptSha": "68426064f76adff2066ad365f6c97be3fe279bd6b20d025b3dc5614f9b2da449"
206
+ }
benchmark_results/grpo_loss_compiled/grpo_loss_compiled_dark_animation.svg ADDED
benchmark_results/grpo_loss_compiled/grpo_loss_compiled_dark_latency.svg ADDED
benchmark_results/grpo_loss_compiled/grpo_loss_compiled_dark_throughput.svg ADDED
benchmark_results/grpo_loss_compiled/grpo_loss_compiled_light_animation.svg ADDED
benchmark_results/grpo_loss_compiled/grpo_loss_compiled_light_latency.svg ADDED
benchmark_results/grpo_loss_compiled/grpo_loss_compiled_light_throughput.svg ADDED
benchmark_results/grpo_loss_compiled/results.json ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": [
3
+ {
4
+ "workload": "GrpoLossBenchmark.grpo_loss_batch128_seqlen02781_compiled",
5
+ "timingResults": {
6
+ "mean_ms": 0.0329,
7
+ "std_ms": 0.0042,
8
+ "min_ms": 0.0301,
9
+ "max_ms": 0.0632,
10
+ "q1_ms": 0.031,
11
+ "q3_ms": 0.0326,
12
+ "iqr_ms": 0.0016,
13
+ "outliers": 22,
14
+ "iterations": 200,
15
+ "refMeanMs": 0.0874
16
+ },
17
+ "verified": true
18
+ },
19
+ {
20
+ "workload": "GrpoLossBenchmark.grpo_loss_batch128_seqlen08192_compiled",
21
+ "timingResults": {
22
+ "mean_ms": 0.0337,
23
+ "std_ms": 0.0045,
24
+ "min_ms": 0.0305,
25
+ "max_ms": 0.065,
26
+ "q1_ms": 0.0318,
27
+ "q3_ms": 0.0333,
28
+ "iqr_ms": 0.0015,
29
+ "outliers": 23,
30
+ "iterations": 200,
31
+ "refMeanMs": 0.0824
32
+ },
33
+ "verified": true
34
+ },
35
+ {
36
+ "workload": "GrpoLossBenchmark.grpo_loss_batch16_seqlen01024_compiled",
37
+ "timingResults": {
38
+ "mean_ms": 0.0323,
39
+ "std_ms": 0.0045,
40
+ "min_ms": 0.0286,
41
+ "max_ms": 0.0621,
42
+ "q1_ms": 0.0306,
43
+ "q3_ms": 0.0321,
44
+ "iqr_ms": 0.0015,
45
+ "outliers": 24,
46
+ "iterations": 200,
47
+ "refMeanMs": 0.0626
48
+ },
49
+ "verified": true
50
+ },
51
+ {
52
+ "workload": "GrpoLossBenchmark.grpo_loss_batch32_seqlen02048_compiled",
53
+ "timingResults": {
54
+ "mean_ms": 0.0324,
55
+ "std_ms": 0.0046,
56
+ "min_ms": 0.0286,
57
+ "max_ms": 0.0688,
58
+ "q1_ms": 0.0305,
59
+ "q3_ms": 0.0321,
60
+ "iqr_ms": 0.0016,
61
+ "outliers": 22,
62
+ "iterations": 200,
63
+ "refMeanMs": 0.0633
64
+ },
65
+ "verified": true
66
+ },
67
+ {
68
+ "workload": "GrpoLossBenchmark.grpo_loss_batch64_seqlen04096_compiled",
69
+ "timingResults": {
70
+ "mean_ms": 0.0349,
71
+ "std_ms": 0.0058,
72
+ "min_ms": 0.0315,
73
+ "max_ms": 0.0814,
74
+ "q1_ms": 0.0325,
75
+ "q3_ms": 0.0341,
76
+ "iqr_ms": 0.0016,
77
+ "outliers": 26,
78
+ "iterations": 200,
79
+ "refMeanMs": 0.0869
80
+ },
81
+ "verified": true
82
+ },
83
+ {
84
+ "workload": "GrpoLossBenchmark.grpo_loss_fwd_batch128_seqlen02781_compiled",
85
+ "timingResults": {
86
+ "mean_ms": 0.033,
87
+ "std_ms": 0.0038,
88
+ "min_ms": 0.0295,
89
+ "max_ms": 0.0543,
90
+ "q1_ms": 0.0313,
91
+ "q3_ms": 0.0333,
92
+ "iqr_ms": 0.0019,
93
+ "outliers": 16,
94
+ "iterations": 200,
95
+ "refMeanMs": 0.0772
96
+ },
97
+ "verified": true
98
+ },
99
+ {
100
+ "workload": "GrpoLossBenchmark.grpo_loss_fwd_batch128_seqlen08192_compiled",
101
+ "timingResults": {
102
+ "mean_ms": 0.0331,
103
+ "std_ms": 0.0032,
104
+ "min_ms": 0.0295,
105
+ "max_ms": 0.0535,
106
+ "q1_ms": 0.0316,
107
+ "q3_ms": 0.0331,
108
+ "iqr_ms": 0.0015,
109
+ "outliers": 19,
110
+ "iterations": 200,
111
+ "refMeanMs": 0.0767
112
+ },
113
+ "verified": true
114
+ },
115
+ {
116
+ "workload": "GrpoLossBenchmark.grpo_loss_fwd_batch16_seqlen01024_compiled",
117
+ "timingResults": {
118
+ "mean_ms": 0.033,
119
+ "std_ms": 0.0032,
120
+ "min_ms": 0.029,
121
+ "max_ms": 0.051,
122
+ "q1_ms": 0.0315,
123
+ "q3_ms": 0.0332,
124
+ "iqr_ms": 0.0016,
125
+ "outliers": 17,
126
+ "iterations": 200,
127
+ "refMeanMs": 0.0845
128
+ },
129
+ "verified": true
130
+ },
131
+ {
132
+ "workload": "GrpoLossBenchmark.grpo_loss_fwd_batch32_seqlen02048_compiled",
133
+ "timingResults": {
134
+ "mean_ms": 0.0339,
135
+ "std_ms": 0.006,
136
+ "min_ms": 0.03,
137
+ "max_ms": 0.0674,
138
+ "q1_ms": 0.0314,
139
+ "q3_ms": 0.0331,
140
+ "iqr_ms": 0.0017,
141
+ "outliers": 23,
142
+ "iterations": 200,
143
+ "refMeanMs": 0.1052
144
+ },
145
+ "verified": true
146
+ },
147
+ {
148
+ "workload": "GrpoLossBenchmark.grpo_loss_fwd_batch64_seqlen04096_compiled",
149
+ "timingResults": {
150
+ "mean_ms": 0.034,
151
+ "std_ms": 0.004,
152
+ "min_ms": 0.031,
153
+ "max_ms": 0.0623,
154
+ "q1_ms": 0.0323,
155
+ "q3_ms": 0.0339,
156
+ "iqr_ms": 0.0016,
157
+ "outliers": 20,
158
+ "iterations": 200,
159
+ "refMeanMs": 0.0796
160
+ },
161
+ "verified": true
162
+ }
163
+ ],
164
+ "machineInfo": {
165
+ "gpu": "NVIDIA H100 80GB HBM3",
166
+ "backend": "CUDA 13.0",
167
+ "pytorchVersion": "2.11.0+cu130",
168
+ "os": "Linux 6.11.0-1016-nvidia",
169
+ "cpu": "x86_64"
170
+ },
171
+ "kernelCommitSha": "ad285d68b8c8c0ff",
172
+ "benchmarkScriptPath": "benchmarks",
173
+ "benchmarkScriptSha": "ff35d63fbca37cfcbf5c94f067c930adc2bd0043ce6788f286dbad5a4f9b9d4a"
174
+ }
benchmark_results/grpo_loss_eager/grpo_loss_eager_dark_animation.svg ADDED
benchmark_results/grpo_loss_eager/grpo_loss_eager_dark_latency.svg ADDED
benchmark_results/grpo_loss_eager/grpo_loss_eager_dark_throughput.svg ADDED
benchmark_results/grpo_loss_eager/grpo_loss_eager_light_animation.svg ADDED
benchmark_results/grpo_loss_eager/grpo_loss_eager_light_latency.svg ADDED
benchmark_results/grpo_loss_eager/grpo_loss_eager_light_throughput.svg ADDED
benchmark_results/grpo_loss_eager/results.json ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": [
3
+ {
4
+ "workload": "GrpoLossBenchmark.grpo_loss_batch128_seqlen02781_eager",
5
+ "timingResults": {
6
+ "mean_ms": 0.0313,
7
+ "std_ms": 0.0029,
8
+ "min_ms": 0.0281,
9
+ "max_ms": 0.0482,
10
+ "q1_ms": 0.03,
11
+ "q3_ms": 0.0314,
12
+ "iqr_ms": 0.0013,
13
+ "outliers": 16,
14
+ "iterations": 200,
15
+ "refMeanMs": 0.6643
16
+ },
17
+ "verified": true
18
+ },
19
+ {
20
+ "workload": "GrpoLossBenchmark.grpo_loss_batch128_seqlen08192_eager",
21
+ "timingResults": {
22
+ "mean_ms": 0.0309,
23
+ "std_ms": 0.0031,
24
+ "min_ms": 0.0285,
25
+ "max_ms": 0.0477,
26
+ "q1_ms": 0.0298,
27
+ "q3_ms": 0.0306,
28
+ "iqr_ms": 0.0008,
29
+ "outliers": 19,
30
+ "iterations": 200,
31
+ "refMeanMs": 0.5961
32
+ },
33
+ "verified": true
34
+ },
35
+ {
36
+ "workload": "GrpoLossBenchmark.grpo_loss_batch16_seqlen01024_eager",
37
+ "timingResults": {
38
+ "mean_ms": 0.0315,
39
+ "std_ms": 0.0033,
40
+ "min_ms": 0.0293,
41
+ "max_ms": 0.0507,
42
+ "q1_ms": 0.0302,
43
+ "q3_ms": 0.0311,
44
+ "iqr_ms": 0.0009,
45
+ "outliers": 23,
46
+ "iterations": 200,
47
+ "refMeanMs": 0.6132
48
+ },
49
+ "verified": true
50
+ },
51
+ {
52
+ "workload": "GrpoLossBenchmark.grpo_loss_batch32_seqlen02048_eager",
53
+ "timingResults": {
54
+ "mean_ms": 0.0302,
55
+ "std_ms": 0.0029,
56
+ "min_ms": 0.028,
57
+ "max_ms": 0.0467,
58
+ "q1_ms": 0.029,
59
+ "q3_ms": 0.0299,
60
+ "iqr_ms": 0.0008,
61
+ "outliers": 20,
62
+ "iterations": 200,
63
+ "refMeanMs": 0.6043
64
+ },
65
+ "verified": true
66
+ },
67
+ {
68
+ "workload": "GrpoLossBenchmark.grpo_loss_batch64_seqlen04096_eager",
69
+ "timingResults": {
70
+ "mean_ms": 0.0295,
71
+ "std_ms": 0.003,
72
+ "min_ms": 0.0268,
73
+ "max_ms": 0.0465,
74
+ "q1_ms": 0.0279,
75
+ "q3_ms": 0.03,
76
+ "iqr_ms": 0.002,
77
+ "outliers": 12,
78
+ "iterations": 200,
79
+ "refMeanMs": 0.5798
80
+ },
81
+ "verified": true
82
+ },
83
+ {
84
+ "workload": "GrpoLossBenchmark.grpo_loss_fwd_batch128_seqlen02781_eager",
85
+ "timingResults": {
86
+ "mean_ms": 0.0306,
87
+ "std_ms": 0.0032,
88
+ "min_ms": 0.0281,
89
+ "max_ms": 0.0513,
90
+ "q1_ms": 0.0293,
91
+ "q3_ms": 0.0302,
92
+ "iqr_ms": 0.0009,
93
+ "outliers": 24,
94
+ "iterations": 200,
95
+ "refMeanMs": 0.1716
96
+ },
97
+ "verified": true
98
+ },
99
+ {
100
+ "workload": "GrpoLossBenchmark.grpo_loss_fwd_batch128_seqlen08192_eager",
101
+ "timingResults": {
102
+ "mean_ms": 0.0302,
103
+ "std_ms": 0.0031,
104
+ "min_ms": 0.0284,
105
+ "max_ms": 0.0594,
106
+ "q1_ms": 0.0291,
107
+ "q3_ms": 0.0299,
108
+ "iqr_ms": 0.0008,
109
+ "outliers": 21,
110
+ "iterations": 200,
111
+ "refMeanMs": 0.1701
112
+ },
113
+ "verified": true
114
+ },
115
+ {
116
+ "workload": "GrpoLossBenchmark.grpo_loss_fwd_batch16_seqlen01024_eager",
117
+ "timingResults": {
118
+ "mean_ms": 0.0306,
119
+ "std_ms": 0.0027,
120
+ "min_ms": 0.0286,
121
+ "max_ms": 0.0455,
122
+ "q1_ms": 0.0294,
123
+ "q3_ms": 0.0304,
124
+ "iqr_ms": 0.001,
125
+ "outliers": 16,
126
+ "iterations": 200,
127
+ "refMeanMs": 0.1741
128
+ },
129
+ "verified": true
130
+ },
131
+ {
132
+ "workload": "GrpoLossBenchmark.grpo_loss_fwd_batch32_seqlen02048_eager",
133
+ "timingResults": {
134
+ "mean_ms": 0.0299,
135
+ "std_ms": 0.0029,
136
+ "min_ms": 0.0269,
137
+ "max_ms": 0.0488,
138
+ "q1_ms": 0.0287,
139
+ "q3_ms": 0.0301,
140
+ "iqr_ms": 0.0015,
141
+ "outliers": 14,
142
+ "iterations": 200,
143
+ "refMeanMs": 0.1647
144
+ },
145
+ "verified": true
146
+ },
147
+ {
148
+ "workload": "GrpoLossBenchmark.grpo_loss_fwd_batch64_seqlen04096_eager",
149
+ "timingResults": {
150
+ "mean_ms": 0.0314,
151
+ "std_ms": 0.0028,
152
+ "min_ms": 0.0289,
153
+ "max_ms": 0.0465,
154
+ "q1_ms": 0.0301,
155
+ "q3_ms": 0.0312,
156
+ "iqr_ms": 0.0011,
157
+ "outliers": 22,
158
+ "iterations": 200,
159
+ "refMeanMs": 0.1751
160
+ },
161
+ "verified": true
162
+ }
163
+ ],
164
+ "machineInfo": {
165
+ "gpu": "NVIDIA H100 80GB HBM3",
166
+ "backend": "CUDA 13.0",
167
+ "pytorchVersion": "2.11.0+cu130",
168
+ "os": "Linux 6.11.0-1016-nvidia",
169
+ "cpu": "x86_64"
170
+ },
171
+ "kernelCommitSha": "87ec9b61421d0121",
172
+ "benchmarkScriptPath": "benchmarks",
173
+ "benchmarkScriptSha": "ff35d63fbca37cfcbf5c94f067c930adc2bd0043ce6788f286dbad5a4f9b9d4a"
174
+ }
benchmark_results/reverse_kl_compiled/results.json ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": [
3
+ {
4
+ "workload": "ReverseKLBenchmark.reverse_kl_batch01_seqlen064_vocab248320_compiled",
5
+ "timingResults": {
6
+ "mean_ms": 0.1039,
7
+ "std_ms": 0.0035,
8
+ "min_ms": 0.1,
9
+ "max_ms": 0.1229,
10
+ "q1_ms": 0.1018,
11
+ "q3_ms": 0.104,
12
+ "iqr_ms": 0.0022,
13
+ "outliers": 28,
14
+ "iterations": 200,
15
+ "refMeanMs": 0.2322
16
+ },
17
+ "verified": true
18
+ },
19
+ {
20
+ "workload": "ReverseKLBenchmark.reverse_kl_batch02_seqlen128_vocab248320_compiled",
21
+ "timingResults": {
22
+ "mean_ms": 0.2483,
23
+ "std_ms": 0.0035,
24
+ "min_ms": 0.2418,
25
+ "max_ms": 0.2612,
26
+ "q1_ms": 0.2457,
27
+ "q3_ms": 0.2513,
28
+ "iqr_ms": 0.0057,
29
+ "outliers": 2,
30
+ "iterations": 200,
31
+ "refMeanMs": 0.6455
32
+ },
33
+ "verified": true
34
+ },
35
+ {
36
+ "workload": "ReverseKLBenchmark.reverse_kl_batch04_seqlen256_vocab248320_compiled",
37
+ "timingResults": {
38
+ "mean_ms": 0.8322,
39
+ "std_ms": 0.0044,
40
+ "min_ms": 0.8232,
41
+ "max_ms": 0.8623,
42
+ "q1_ms": 0.8303,
43
+ "q3_ms": 0.8335,
44
+ "iqr_ms": 0.0032,
45
+ "outliers": 18,
46
+ "iterations": 200,
47
+ "refMeanMs": 2.2082
48
+ },
49
+ "verified": true
50
+ },
51
+ {
52
+ "workload": "ReverseKLBenchmark.reverse_kl_batch08_seqlen1024_vocab248320_compiled",
53
+ "timingResults": {
54
+ "mean_ms": 6.1083,
55
+ "std_ms": 0.0054,
56
+ "min_ms": 6.097,
57
+ "max_ms": 6.1513,
58
+ "q1_ms": 6.1054,
59
+ "q3_ms": 6.11,
60
+ "iqr_ms": 0.0046,
61
+ "outliers": 13,
62
+ "iterations": 200,
63
+ "refMeanMs": 16.4779
64
+ },
65
+ "verified": true
66
+ },
67
+ {
68
+ "workload": "ReverseKLBenchmark.reverse_kl_batch08_seqlen512_vocab248320_compiled",
69
+ "timingResults": {
70
+ "mean_ms": 3.0861,
71
+ "std_ms": 0.0045,
72
+ "min_ms": 3.0769,
73
+ "max_ms": 3.1155,
74
+ "q1_ms": 3.0832,
75
+ "q3_ms": 3.0883,
76
+ "iqr_ms": 0.0051,
77
+ "outliers": 5,
78
+ "iterations": 200,
79
+ "refMeanMs": 8.3849
80
+ },
81
+ "verified": true
82
+ },
83
+ {
84
+ "workload": "ReverseKLBenchmark.reverse_kl_batch08_seqlen981_vocab248320_compiled",
85
+ "timingResults": {
86
+ "mean_ms": 5.8622,
87
+ "std_ms": 0.0044,
88
+ "min_ms": 5.8544,
89
+ "max_ms": 5.8821,
90
+ "q1_ms": 5.859,
91
+ "q3_ms": 5.8646,
92
+ "iqr_ms": 0.0056,
93
+ "outliers": 6,
94
+ "iterations": 200,
95
+ "refMeanMs": 15.8101
96
+ },
97
+ "verified": true
98
+ },
99
+ {
100
+ "workload": "ReverseKLBenchmark.reverse_kl_fwd_batch01_seqlen064_vocab248320_compiled",
101
+ "timingResults": {
102
+ "mean_ms": 0.0657,
103
+ "std_ms": 0.0041,
104
+ "min_ms": 0.0619,
105
+ "max_ms": 0.093,
106
+ "q1_ms": 0.0635,
107
+ "q3_ms": 0.0656,
108
+ "iqr_ms": 0.0021,
109
+ "outliers": 24,
110
+ "iterations": 200,
111
+ "refMeanMs": 0.1434
112
+ },
113
+ "verified": true
114
+ },
115
+ {
116
+ "workload": "ReverseKLBenchmark.reverse_kl_fwd_batch02_seqlen128_vocab248320_compiled",
117
+ "timingResults": {
118
+ "mean_ms": 0.1234,
119
+ "std_ms": 0.0041,
120
+ "min_ms": 0.1187,
121
+ "max_ms": 0.1464,
122
+ "q1_ms": 0.1208,
123
+ "q3_ms": 0.1244,
124
+ "iqr_ms": 0.0036,
125
+ "outliers": 16,
126
+ "iterations": 200,
127
+ "refMeanMs": 0.3277
128
+ },
129
+ "verified": true
130
+ },
131
+ {
132
+ "workload": "ReverseKLBenchmark.reverse_kl_fwd_batch04_seqlen256_vocab248320_compiled",
133
+ "timingResults": {
134
+ "mean_ms": 0.3764,
135
+ "std_ms": 0.0037,
136
+ "min_ms": 0.3699,
137
+ "max_ms": 0.3926,
138
+ "q1_ms": 0.3733,
139
+ "q3_ms": 0.3787,
140
+ "iqr_ms": 0.0054,
141
+ "outliers": 2,
142
+ "iterations": 200,
143
+ "refMeanMs": 0.9228
144
+ },
145
+ "verified": true
146
+ },
147
+ {
148
+ "workload": "ReverseKLBenchmark.reverse_kl_fwd_batch08_seqlen1024_vocab248320_compiled",
149
+ "timingResults": {
150
+ "mean_ms": 2.658,
151
+ "std_ms": 0.0089,
152
+ "min_ms": 2.6359,
153
+ "max_ms": 2.6859,
154
+ "q1_ms": 2.6524,
155
+ "q3_ms": 2.663,
156
+ "iqr_ms": 0.0106,
157
+ "outliers": 4,
158
+ "iterations": 200,
159
+ "refMeanMs": 6.6033
160
+ },
161
+ "verified": true
162
+ },
163
+ {
164
+ "workload": "ReverseKLBenchmark.reverse_kl_fwd_batch08_seqlen512_vocab248320_compiled",
165
+ "timingResults": {
166
+ "mean_ms": 1.38,
167
+ "std_ms": 0.0035,
168
+ "min_ms": 1.37,
169
+ "max_ms": 1.3924,
170
+ "q1_ms": 1.3776,
171
+ "q3_ms": 1.3818,
172
+ "iqr_ms": 0.0042,
173
+ "outliers": 6,
174
+ "iterations": 200,
175
+ "refMeanMs": 3.3854
176
+ },
177
+ "verified": true
178
+ },
179
+ {
180
+ "workload": "ReverseKLBenchmark.reverse_kl_fwd_batch08_seqlen981_vocab248320_compiled",
181
+ "timingResults": {
182
+ "mean_ms": 2.5422,
183
+ "std_ms": 0.0091,
184
+ "min_ms": 2.5286,
185
+ "max_ms": 2.5773,
186
+ "q1_ms": 2.5356,
187
+ "q3_ms": 2.5455,
188
+ "iqr_ms": 0.0099,
189
+ "outliers": 9,
190
+ "iterations": 200,
191
+ "refMeanMs": 6.2191
192
+ },
193
+ "verified": true
194
+ }
195
+ ],
196
+ "machineInfo": {
197
+ "gpu": "NVIDIA H100 80GB HBM3",
198
+ "backend": "CUDA 13.0",
199
+ "pytorchVersion": "2.11.0+cu130",
200
+ "os": "Linux 6.11.0-1016-nvidia",
201
+ "cpu": "x86_64"
202
+ },
203
+ "kernelCommitSha": "ca5cbc20b4d2c7d8",
204
+ "benchmarkScriptPath": "benchmarks",
205
+ "benchmarkScriptSha": "690eea1f54f31bef1ad248380201005fd667d4b9c535f92f06eb6a5a33380d22"
206
+ }
benchmark_results/reverse_kl_compiled/reverse_kl_compiled_dark_animation.svg ADDED
benchmark_results/reverse_kl_compiled/reverse_kl_compiled_dark_latency.svg ADDED
benchmark_results/reverse_kl_compiled/reverse_kl_compiled_dark_throughput.svg ADDED
benchmark_results/reverse_kl_compiled/reverse_kl_compiled_light_animation.svg ADDED
benchmark_results/reverse_kl_compiled/reverse_kl_compiled_light_latency.svg ADDED
benchmark_results/reverse_kl_compiled/reverse_kl_compiled_light_throughput.svg ADDED
benchmark_results/reverse_kl_eager/results.json ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": [
3
+ {
4
+ "workload": "ReverseKLBenchmark.reverse_kl_batch01_seqlen064_vocab248320_eager",
5
+ "timingResults": {
6
+ "mean_ms": 0.1029,
7
+ "std_ms": 0.0032,
8
+ "min_ms": 0.0982,
9
+ "max_ms": 0.1129,
10
+ "q1_ms": 0.101,
11
+ "q3_ms": 0.1036,
12
+ "iqr_ms": 0.0026,
13
+ "outliers": 27,
14
+ "iterations": 200,
15
+ "refMeanMs": 0.5293
16
+ },
17
+ "verified": true
18
+ },
19
+ {
20
+ "workload": "ReverseKLBenchmark.reverse_kl_batch02_seqlen128_vocab248320_eager",
21
+ "timingResults": {
22
+ "mean_ms": 0.248,
23
+ "std_ms": 0.0037,
24
+ "min_ms": 0.2417,
25
+ "max_ms": 0.2592,
26
+ "q1_ms": 0.2451,
27
+ "q3_ms": 0.251,
28
+ "iqr_ms": 0.0058,
29
+ "outliers": 0,
30
+ "iterations": 200,
31
+ "refMeanMs": 1.624
32
+ },
33
+ "verified": true
34
+ },
35
+ {
36
+ "workload": "ReverseKLBenchmark.reverse_kl_batch04_seqlen256_vocab248320_eager",
37
+ "timingResults": {
38
+ "mean_ms": 0.8321,
39
+ "std_ms": 0.0035,
40
+ "min_ms": 0.8234,
41
+ "max_ms": 0.854,
42
+ "q1_ms": 0.8306,
43
+ "q3_ms": 0.8335,
44
+ "iqr_ms": 0.003,
45
+ "outliers": 20,
46
+ "iterations": 200,
47
+ "refMeanMs": 6.174
48
+ },
49
+ "verified": true
50
+ },
51
+ {
52
+ "workload": "ReverseKLBenchmark.reverse_kl_batch08_seqlen1024_vocab248320_eager",
53
+ "timingResults": {
54
+ "mean_ms": 6.1046,
55
+ "std_ms": 0.0041,
56
+ "min_ms": 6.0961,
57
+ "max_ms": 6.1376,
58
+ "q1_ms": 6.1023,
59
+ "q3_ms": 6.106,
60
+ "iqr_ms": 0.0037,
61
+ "outliers": 9,
62
+ "iterations": 200,
63
+ "refMeanMs": 48.4051
64
+ },
65
+ "verified": true
66
+ },
67
+ {
68
+ "workload": "ReverseKLBenchmark.reverse_kl_batch08_seqlen512_vocab248320_eager",
69
+ "timingResults": {
70
+ "mean_ms": 3.0816,
71
+ "std_ms": 0.0035,
72
+ "min_ms": 3.0743,
73
+ "max_ms": 3.0939,
74
+ "q1_ms": 3.0794,
75
+ "q3_ms": 3.0832,
76
+ "iqr_ms": 0.0038,
77
+ "outliers": 8,
78
+ "iterations": 200,
79
+ "refMeanMs": 24.3385
80
+ },
81
+ "verified": true
82
+ },
83
+ {
84
+ "workload": "ReverseKLBenchmark.reverse_kl_batch08_seqlen981_vocab248320_eager",
85
+ "timingResults": {
86
+ "mean_ms": 5.8549,
87
+ "std_ms": 0.0045,
88
+ "min_ms": 5.8459,
89
+ "max_ms": 5.8819,
90
+ "q1_ms": 5.8524,
91
+ "q3_ms": 5.8561,
92
+ "iqr_ms": 0.0037,
93
+ "outliers": 14,
94
+ "iterations": 200,
95
+ "refMeanMs": 46.4274
96
+ },
97
+ "verified": true
98
+ },
99
+ {
100
+ "workload": "ReverseKLBenchmark.reverse_kl_fwd_batch01_seqlen064_vocab248320_eager",
101
+ "timingResults": {
102
+ "mean_ms": 0.0638,
103
+ "std_ms": 0.0027,
104
+ "min_ms": 0.0604,
105
+ "max_ms": 0.0787,
106
+ "q1_ms": 0.0624,
107
+ "q3_ms": 0.064,
108
+ "iqr_ms": 0.0016,
109
+ "outliers": 20,
110
+ "iterations": 200,
111
+ "refMeanMs": 0.2532
112
+ },
113
+ "verified": true
114
+ },
115
+ {
116
+ "workload": "ReverseKLBenchmark.reverse_kl_fwd_batch02_seqlen128_vocab248320_eager",
117
+ "timingResults": {
118
+ "mean_ms": 0.1217,
119
+ "std_ms": 0.0038,
120
+ "min_ms": 0.1166,
121
+ "max_ms": 0.1428,
122
+ "q1_ms": 0.1193,
123
+ "q3_ms": 0.1227,
124
+ "iqr_ms": 0.0034,
125
+ "outliers": 19,
126
+ "iterations": 200,
127
+ "refMeanMs": 0.7671
128
+ },
129
+ "verified": true
130
+ },
131
+ {
132
+ "workload": "ReverseKLBenchmark.reverse_kl_fwd_batch04_seqlen256_vocab248320_eager",
133
+ "timingResults": {
134
+ "mean_ms": 0.3753,
135
+ "std_ms": 0.0033,
136
+ "min_ms": 0.3695,
137
+ "max_ms": 0.3843,
138
+ "q1_ms": 0.3726,
139
+ "q3_ms": 0.3779,
140
+ "iqr_ms": 0.0053,
141
+ "outliers": 0,
142
+ "iterations": 200,
143
+ "refMeanMs": 2.869
144
+ },
145
+ "verified": true
146
+ },
147
+ {
148
+ "workload": "ReverseKLBenchmark.reverse_kl_fwd_batch08_seqlen1024_vocab248320_eager",
149
+ "timingResults": {
150
+ "mean_ms": 2.6484,
151
+ "std_ms": 0.0065,
152
+ "min_ms": 2.6364,
153
+ "max_ms": 2.7044,
154
+ "q1_ms": 2.6449,
155
+ "q3_ms": 2.6515,
156
+ "iqr_ms": 0.0067,
157
+ "outliers": 3,
158
+ "iterations": 200,
159
+ "refMeanMs": 22.3336
160
+ },
161
+ "verified": true
162
+ },
163
+ {
164
+ "workload": "ReverseKLBenchmark.reverse_kl_fwd_batch08_seqlen512_vocab248320_eager",
165
+ "timingResults": {
166
+ "mean_ms": 1.365,
167
+ "std_ms": 0.0046,
168
+ "min_ms": 1.3548,
169
+ "max_ms": 1.3865,
170
+ "q1_ms": 1.3618,
171
+ "q3_ms": 1.3675,
172
+ "iqr_ms": 0.0057,
173
+ "outliers": 4,
174
+ "iterations": 200,
175
+ "refMeanMs": 11.2401
176
+ },
177
+ "verified": true
178
+ },
179
+ {
180
+ "workload": "ReverseKLBenchmark.reverse_kl_fwd_batch08_seqlen981_vocab248320_eager",
181
+ "timingResults": {
182
+ "mean_ms": 2.5316,
183
+ "std_ms": 0.0059,
184
+ "min_ms": 2.5203,
185
+ "max_ms": 2.5523,
186
+ "q1_ms": 2.5272,
187
+ "q3_ms": 2.5355,
188
+ "iqr_ms": 0.0083,
189
+ "outliers": 3,
190
+ "iterations": 200,
191
+ "refMeanMs": 21.4099
192
+ },
193
+ "verified": true
194
+ }
195
+ ],
196
+ "machineInfo": {
197
+ "gpu": "NVIDIA H100 80GB HBM3",
198
+ "backend": "CUDA 13.0",
199
+ "pytorchVersion": "2.11.0+cu130",
200
+ "os": "Linux 6.11.0-1016-nvidia",
201
+ "cpu": "x86_64"
202
+ },
203
+ "kernelCommitSha": "3e023eb5121761b8",
204
+ "benchmarkScriptPath": "benchmarks",
205
+ "benchmarkScriptSha": "690eea1f54f31bef1ad248380201005fd667d4b9c535f92f06eb6a5a33380d22"
206
+ }
benchmark_results/reverse_kl_eager/reverse_kl_eager_dark_animation.svg ADDED
benchmark_results/reverse_kl_eager/reverse_kl_eager_dark_latency.svg ADDED
benchmark_results/reverse_kl_eager/reverse_kl_eager_dark_throughput.svg ADDED
benchmark_results/reverse_kl_eager/reverse_kl_eager_light_animation.svg ADDED
benchmark_results/reverse_kl_eager/reverse_kl_eager_light_latency.svg ADDED
benchmark_results/reverse_kl_eager/reverse_kl_eager_light_throughput.svg ADDED