Upload folder using huggingface_hub
Browse files- flash_attn/artifacts/benchmark/Attention Benchmark.csv +7 -0
- flash_attn/artifacts/benchmark/Attention Benchmark.png +3 -0
- flash_attn/artifacts/benchmark/results.html +3 -0
- flash_attn/benchmark.html +92 -88
- flash_attn/cells/benchmark.py +4 -2
- moe_benchmarks/megablocks/cells/nv.py +3 -0
- moe_benchmarks/megablocks/megablocks_only.html +70 -3
- moe_benchmarks/megablocks_yamoe/artifacts/binned_run/binned_results.json +9 -9
- moe_benchmarks/megablocks_yamoe/artifacts/gptoss_run/gptoss_results.json +9 -9
- moe_benchmarks/megablocks_yamoe/artifacts/gptoss_training_run/gptoss_training_results.json +9 -9
- moe_benchmarks/megablocks_yamoe/artifacts/megablocks_run/megablocks_results.json +9 -9
- moe_benchmarks/megablocks_yamoe/artifacts/visualization/moe_performance_comparison.png +2 -2
- moe_benchmarks/megablocks_yamoe/artifacts/yamoe_run/yamoe_results.json +9 -9
- moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc +0 -0
- moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc +0 -0
- moe_benchmarks/megablocks_yamoe/megablocks_yamoe.html +200 -132
- moe_benchmarks/megablocks_yamoe/torch_profile.html +216 -217
flash_attn/artifacts/benchmark/Attention Benchmark.csv
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
seq_len,torch_cudnn,torch_cudnn_compile_d,torch_cudnn_compile_ma,torch_flash,torch_flash_compile_d,torch_flash_compile_ma,hf_flash_attn,hf_flash_attn3
|
| 2 |
+
4224.000000,3.802832,3.792784,4.181488,3.966576,3.956640,4.313024,3.396816,3.333200
|
| 3 |
+
4352.000000,4.081776,4.086624,4.433040,4.399632,4.392240,4.736416,3.837312,3.758016
|
| 4 |
+
4416.000000,4.146080,4.139200,4.479680,4.456032,4.446992,4.795904,3.893088,3.864576
|
| 5 |
+
4480.000000,4.211200,4.203072,4.555296,4.529248,4.523104,4.877248,3.951152,3.871312
|
| 6 |
+
4544.000000,4.436080,4.432784,4.789248,4.585120,4.580192,4.938464,4.010128,3.978448
|
| 7 |
+
4608.000000,4.504256,4.497184,4.872832,4.662272,4.654272,5.030304,4.065760,3.986496
|
flash_attn/artifacts/benchmark/Attention Benchmark.png
ADDED
|
Git LFS Details
|
flash_attn/artifacts/benchmark/results.html
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<html><body>
|
| 2 |
+
<image src="Attention Benchmark.png"/>
|
| 3 |
+
</body></html>
|
flash_attn/benchmark.html
CHANGED
|
@@ -3722,12 +3722,12 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3722 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3723 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3724 |
</span> |
|
| 3725 |
-
Cell: benchmark |
|
| 3726 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3727 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3728 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 3729 |
</div>
|
| 3730 |
-
<div id="code-benchmark" class="cell-code" data-lines="
|
| 3731 |
<div class="highlight-with-lines">
|
| 3732 |
<div class="line-numbers" id="lines-benchmark">
|
| 3733 |
<a class="line-number" data-cell="benchmark" data-line="1" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 1, true);">1</a>
|
|
@@ -4071,6 +4071,8 @@ Cell: benchmark | 80.35s
|
|
| 4071 |
<a class="line-number" data-cell="benchmark" data-line="339" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 339, true);">339</a>
|
| 4072 |
<a class="line-number" data-cell="benchmark" data-line="340" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 340, true);">340</a>
|
| 4073 |
<a class="line-number" data-cell="benchmark" data-line="341" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 341, true);">341</a>
|
|
|
|
|
|
|
| 4074 |
</div>
|
| 4075 |
<div class="code-wrap">
|
| 4076 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
|
@@ -4165,8 +4167,10 @@ Cell: benchmark | 80.35s
|
|
| 4165 |
<span class="c1"># I can't seem to get it to work any other way under torch.compile, so any suggestions are welcome!</span>
|
| 4166 |
<span class="n">torch</span><span class="o">.</span><span class="n">_dynamo</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">suppress_errors</span> <span class="o">=</span> <span class="kc">True</span>
|
| 4167 |
|
| 4168 |
-
<span class="
|
| 4169 |
-
<span class="
|
|
|
|
|
|
|
| 4170 |
|
| 4171 |
<span class="n">batch_size</span> <span class="o">=</span> <span class="mi">1</span>
|
| 4172 |
<span class="n">num_attention_heads</span> <span class="o">=</span> <span class="mi">24</span>
|
|
@@ -4429,105 +4433,105 @@ xFormers not found.
|
|
| 4429 |
|
| 4430 |
|
| 4431 |
===== Testing shape: (1, 4224, 24, 128) =====
|
| 4432 |
-
torch_cudnn : absmax=0.
|
| 4433 |
-
torch_cudnn_compile_d : absmax=0.
|
| 4434 |
-
torch_cudnn_compile_ma : absmax=0.
|
| 4435 |
-
torch_flash : absmax=0.
|
| 4436 |
-
torch_flash_compile_d : absmax=0.
|
| 4437 |
-
torch_flash_compile_ma : absmax=0.
|
| 4438 |
-
hf_flash_attn : absmax=0.
|
| 4439 |
-
hf_flash_attn3 : absmax=0.
|
| 4440 |
|
| 4441 |
|
| 4442 |
===== Testing shape: (1, 4352, 24, 128) =====
|
| 4443 |
-
torch_cudnn : absmax=0.
|
| 4444 |
-
torch_cudnn_compile_d : absmax=0.
|
| 4445 |
-
torch_cudnn_compile_ma : absmax=0.
|
| 4446 |
-
torch_flash : absmax=0.
|
| 4447 |
-
torch_flash_compile_d : absmax=0.
|
| 4448 |
-
torch_flash_compile_ma : absmax=0.
|
| 4449 |
-
hf_flash_attn : absmax=0.
|
| 4450 |
-
hf_flash_attn3 : absmax=0.
|
| 4451 |
|
| 4452 |
|
| 4453 |
===== Testing shape: (1, 4416, 24, 128) =====
|
| 4454 |
-
torch_cudnn : absmax=0.
|
| 4455 |
-
torch_cudnn_compile_d : absmax=0.
|
| 4456 |
-
torch_cudnn_compile_ma : absmax=0.
|
| 4457 |
-
torch_flash : absmax=0.
|
| 4458 |
-
torch_flash_compile_d : absmax=0.
|
| 4459 |
-
torch_flash_compile_ma : absmax=0.
|
| 4460 |
-
hf_flash_attn : absmax=0.
|
| 4461 |
-
hf_flash_attn3 : absmax=0.
|
| 4462 |
|
| 4463 |
|
| 4464 |
===== Testing shape: (1, 4480, 24, 128) =====
|
| 4465 |
-
torch_cudnn : absmax=0.
|
| 4466 |
-
torch_cudnn_compile_d : absmax=0.
|
| 4467 |
-
torch_cudnn_compile_ma : absmax=0.
|
| 4468 |
-
torch_flash : absmax=0.
|
| 4469 |
-
torch_flash_compile_d : absmax=0.
|
| 4470 |
-
torch_flash_compile_ma : absmax=0.
|
| 4471 |
-
hf_flash_attn : absmax=0.
|
| 4472 |
-
hf_flash_attn3 : absmax=0.
|
| 4473 |
|
| 4474 |
|
| 4475 |
===== Testing shape: (1, 4544, 24, 128) =====
|
| 4476 |
-
torch_cudnn : absmax=0.
|
| 4477 |
-
torch_cudnn_compile_d : absmax=0.
|
| 4478 |
-
torch_cudnn_compile_ma : absmax=0.
|
| 4479 |
-
torch_flash : absmax=0.
|
| 4480 |
-
torch_flash_compile_d : absmax=0.
|
| 4481 |
-
torch_flash_compile_ma : absmax=0.
|
| 4482 |
-
hf_flash_attn : absmax=0.
|
| 4483 |
-
hf_flash_attn3 : absmax=0.
|
| 4484 |
|
| 4485 |
|
| 4486 |
===== Testing shape: (1, 4608, 24, 128) =====
|
| 4487 |
-
torch_cudnn : absmax=0.
|
| 4488 |
-
torch_cudnn_compile_d : absmax=0.
|
| 4489 |
-
torch_cudnn_compile_ma : absmax=0.
|
| 4490 |
-
torch_flash : absmax=0.
|
| 4491 |
-
torch_flash_compile_d : absmax=0.
|
| 4492 |
-
torch_flash_compile_ma : absmax=0.
|
| 4493 |
-
hf_flash_attn : absmax=0.
|
| 4494 |
-
hf_flash_attn3 : absmax=0.
|
| 4495 |
Attention Benchmark:
|
| 4496 |
seq_len torch_cudnn torch_cudnn_compile_d torch_cudnn_compile_ma torch_flash torch_flash_compile_d torch_flash_compile_ma hf_flash_attn hf_flash_attn3
|
| 4497 |
-
0 4224.0 3.
|
| 4498 |
-
1 4352.0 4.
|
| 4499 |
-
2 4416.0 4.
|
| 4500 |
-
3 4480.0 4.
|
| 4501 |
-
4 4544.0 4.
|
| 4502 |
-
5 4608.0 4.
|
| 4503 |
</div>
|
| 4504 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4505 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4506 |
<div class="uv-logs-content" style="display: none;">
|
| 4507 |
-
Downloading hf-xet (3.0MiB)
|
| 4508 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4509 |
-
Downloading sympy (6.0MiB)
|
| 4510 |
Downloading pandas (11.8MiB)
|
| 4511 |
-
Downloading
|
| 4512 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4513 |
-
Downloading networkx (1.9MiB)
|
| 4514 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4515 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4516 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4517 |
-
Downloading matplotlib (8.3MiB)
|
| 4518 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4519 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4520 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4521 |
-
Downloading pillow (6.3MiB)
|
| 4522 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4523 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4524 |
-
Downloading nvidia-
|
| 4525 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4526 |
Downloading fonttools (4.7MiB)
|
| 4527 |
-
Downloading
|
| 4528 |
Downloading triton (148.3MiB)
|
| 4529 |
-
Downloading
|
|
|
|
|
|
|
| 4530 |
Downloading kiwisolver (1.4MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4531 |
Downloading nvidia-cufile-cu12
|
| 4532 |
Downloading kiwisolver
|
| 4533 |
Downloading hf-xet
|
|
@@ -4540,37 +4544,37 @@ Downloading kiwisolver (1.4MiB)
|
|
| 4540 |
Downloading sympy
|
| 4541 |
Downloading numpy
|
| 4542 |
Downloading nvidia-nvjitlink-cu12
|
| 4543 |
-
Downloading nvidia-curand-cu12
|
| 4544 |
Downloading pandas
|
|
|
|
| 4545 |
Downloading nvidia-cuda-nvrtc-cu12
|
| 4546 |
Downloading triton
|
| 4547 |
Downloading nvidia-cufft-cu12
|
| 4548 |
Downloading nvidia-cusolver-cu12
|
| 4549 |
-
Downloading nvidia-cusparselt-cu12
|
| 4550 |
Downloading nvidia-cusparse-cu12
|
|
|
|
| 4551 |
Downloading nvidia-nccl-cu12
|
| 4552 |
Downloading nvidia-cublas-cu12
|
| 4553 |
Downloading nvidia-cudnn-cu12
|
| 4554 |
Downloading torch
|
| 4555 |
-
Installed 49 packages in
|
| 4556 |
</div>
|
| 4557 |
</div>
|
| 4558 |
<div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
|
| 4559 |
-
Fetching 20 files: 5%|▌ | 1/20 [00:00<00:04, 4.
|
| 4560 |
-
Fetching 20 files: 10%|█ | 2/20 [00:
|
| 4561 |
-
Fetching 20 files: 100%|██████████| 20/20 [00:
|
| 4562 |
|
| 4563 |
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4564 |
-
Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:
|
| 4565 |
-
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.
|
| 4566 |
-
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.
|
| 4567 |
<div class="cell-artifacts">
|
| 4568 |
<h4>Artifacts:</h4>
|
| 4569 |
-
<a href="artifacts/benchmark/
|
| 4570 |
-
<a href="artifacts/benchmark/
|
| 4571 |
-
<a href="artifacts/benchmark/
|
| 4572 |
<div class="artifact-preview">
|
| 4573 |
-
<img src="artifacts/benchmark/
|
| 4574 |
</div>
|
| 4575 |
</div>
|
| 4576 |
</div>
|
|
|
|
| 3722 |
<span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
|
| 3723 |
<span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3724 |
</span> |
|
| 3725 |
+
Cell: benchmark | 77.66s
|
| 3726 |
| <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
|
| 3727 |
<button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
|
| 3728 |
<a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
|
| 3729 |
</div>
|
| 3730 |
+
<div id="code-benchmark" class="cell-code" data-lines="343">
|
| 3731 |
<div class="highlight-with-lines">
|
| 3732 |
<div class="line-numbers" id="lines-benchmark">
|
| 3733 |
<a class="line-number" data-cell="benchmark" data-line="1" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 1, true);">1</a>
|
|
|
|
| 4071 |
<a class="line-number" data-cell="benchmark" data-line="339" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 339, true);">339</a>
|
| 4072 |
<a class="line-number" data-cell="benchmark" data-line="340" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 340, true);">340</a>
|
| 4073 |
<a class="line-number" data-cell="benchmark" data-line="341" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 341, true);">341</a>
|
| 4074 |
+
<a class="line-number" data-cell="benchmark" data-line="342" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 342, true);">342</a>
|
| 4075 |
+
<a class="line-number" data-cell="benchmark" data-line="343" href="#cell-benchmark" onclick="event.preventDefault(); selectCellLine('benchmark', 343, true);">343</a>
|
| 4076 |
</div>
|
| 4077 |
<div class="code-wrap">
|
| 4078 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
|
|
|
| 4167 |
<span class="c1"># I can't seem to get it to work any other way under torch.compile, so any suggestions are welcome!</span>
|
| 4168 |
<span class="n">torch</span><span class="o">.</span><span class="n">_dynamo</span><span class="o">.</span><span class="n">config</span><span class="o">.</span><span class="n">suppress_errors</span> <span class="o">=</span> <span class="kc">True</span>
|
| 4169 |
|
| 4170 |
+
<span class="c1"># output_dir = pathlib.Path("dump_attention_benchmark")</span>
|
| 4171 |
+
<span class="c1"># output_dir.mkdir(parents=True, exist_ok=True)</span>
|
| 4172 |
+
|
| 4173 |
+
<span class="n">output_dir</span> <span class="o">=</span> <span class="n">pathlib</span><span class="o">.</span><span class="n">Path</span><span class="p">(</span><span class="s2">"."</span><span class="p">)</span> <span class="c1"># output to current directory for upload</span>
|
| 4174 |
|
| 4175 |
<span class="n">batch_size</span> <span class="o">=</span> <span class="mi">1</span>
|
| 4176 |
<span class="n">num_attention_heads</span> <span class="o">=</span> <span class="mi">24</span>
|
|
|
|
| 4433 |
|
| 4434 |
|
| 4435 |
===== Testing shape: (1, 4224, 24, 128) =====
|
| 4436 |
+
torch_cudnn : absmax=0.000871, mae=0.000075, mse=0.000000
|
| 4437 |
+
torch_cudnn_compile_d : absmax=0.000871, mae=0.000075, mse=0.000000
|
| 4438 |
+
torch_cudnn_compile_ma : absmax=0.000871, mae=0.000075, mse=0.000000
|
| 4439 |
+
torch_flash : absmax=0.000947, mae=0.000075, mse=0.000000
|
| 4440 |
+
torch_flash_compile_d : absmax=0.000947, mae=0.000075, mse=0.000000
|
| 4441 |
+
torch_flash_compile_ma : absmax=0.000947, mae=0.000075, mse=0.000000
|
| 4442 |
+
hf_flash_attn : absmax=0.000947, mae=0.000075, mse=0.000000
|
| 4443 |
+
hf_flash_attn3 : absmax=0.000842, mae=0.000075, mse=0.000000
|
| 4444 |
|
| 4445 |
|
| 4446 |
===== Testing shape: (1, 4352, 24, 128) =====
|
| 4447 |
+
torch_cudnn : absmax=0.001069, mae=0.000073, mse=0.000000
|
| 4448 |
+
torch_cudnn_compile_d : absmax=0.001069, mae=0.000073, mse=0.000000
|
| 4449 |
+
torch_cudnn_compile_ma : absmax=0.001069, mae=0.000073, mse=0.000000
|
| 4450 |
+
torch_flash : absmax=0.000963, mae=0.000073, mse=0.000000
|
| 4451 |
+
torch_flash_compile_d : absmax=0.000963, mae=0.000073, mse=0.000000
|
| 4452 |
+
torch_flash_compile_ma : absmax=0.000963, mae=0.000073, mse=0.000000
|
| 4453 |
+
hf_flash_attn : absmax=0.000963, mae=0.000073, mse=0.000000
|
| 4454 |
+
hf_flash_attn3 : absmax=0.001069, mae=0.000073, mse=0.000000
|
| 4455 |
|
| 4456 |
|
| 4457 |
===== Testing shape: (1, 4416, 24, 128) =====
|
| 4458 |
+
torch_cudnn : absmax=0.001802, mae=0.000073, mse=0.000000
|
| 4459 |
+
torch_cudnn_compile_d : absmax=0.001802, mae=0.000073, mse=0.000000
|
| 4460 |
+
torch_cudnn_compile_ma : absmax=0.001802, mae=0.000073, mse=0.000000
|
| 4461 |
+
torch_flash : absmax=0.001802, mae=0.000073, mse=0.000000
|
| 4462 |
+
torch_flash_compile_d : absmax=0.001802, mae=0.000073, mse=0.000000
|
| 4463 |
+
torch_flash_compile_ma : absmax=0.001802, mae=0.000073, mse=0.000000
|
| 4464 |
+
hf_flash_attn : absmax=0.001802, mae=0.000073, mse=0.000000
|
| 4465 |
+
hf_flash_attn3 : absmax=0.001802, mae=0.000073, mse=0.000000
|
| 4466 |
|
| 4467 |
|
| 4468 |
===== Testing shape: (1, 4480, 24, 128) =====
|
| 4469 |
+
torch_cudnn : absmax=0.001438, mae=0.000073, mse=0.000000
|
| 4470 |
+
torch_cudnn_compile_d : absmax=0.001438, mae=0.000073, mse=0.000000
|
| 4471 |
+
torch_cudnn_compile_ma : absmax=0.001438, mae=0.000073, mse=0.000000
|
| 4472 |
+
torch_flash : absmax=0.001438, mae=0.000073, mse=0.000000
|
| 4473 |
+
torch_flash_compile_d : absmax=0.001438, mae=0.000073, mse=0.000000
|
| 4474 |
+
torch_flash_compile_ma : absmax=0.001438, mae=0.000073, mse=0.000000
|
| 4475 |
+
hf_flash_attn : absmax=0.001438, mae=0.000073, mse=0.000000
|
| 4476 |
+
hf_flash_attn3 : absmax=0.001438, mae=0.000073, mse=0.000000
|
| 4477 |
|
| 4478 |
|
| 4479 |
===== Testing shape: (1, 4544, 24, 128) =====
|
| 4480 |
+
torch_cudnn : absmax=0.000976, mae=0.000072, mse=0.000000
|
| 4481 |
+
torch_cudnn_compile_d : absmax=0.000976, mae=0.000072, mse=0.000000
|
| 4482 |
+
torch_cudnn_compile_ma : absmax=0.000976, mae=0.000072, mse=0.000000
|
| 4483 |
+
torch_flash : absmax=0.000976, mae=0.000072, mse=0.000000
|
| 4484 |
+
torch_flash_compile_d : absmax=0.000976, mae=0.000072, mse=0.000000
|
| 4485 |
+
torch_flash_compile_ma : absmax=0.000976, mae=0.000072, mse=0.000000
|
| 4486 |
+
hf_flash_attn : absmax=0.000976, mae=0.000072, mse=0.000000
|
| 4487 |
+
hf_flash_attn3 : absmax=0.000976, mae=0.000072, mse=0.000000
|
| 4488 |
|
| 4489 |
|
| 4490 |
===== Testing shape: (1, 4608, 24, 128) =====
|
| 4491 |
+
torch_cudnn : absmax=0.000937, mae=0.000072, mse=0.000000
|
| 4492 |
+
torch_cudnn_compile_d : absmax=0.000937, mae=0.000072, mse=0.000000
|
| 4493 |
+
torch_cudnn_compile_ma : absmax=0.000937, mae=0.000072, mse=0.000000
|
| 4494 |
+
torch_flash : absmax=0.000937, mae=0.000072, mse=0.000000
|
| 4495 |
+
torch_flash_compile_d : absmax=0.000937, mae=0.000072, mse=0.000000
|
| 4496 |
+
torch_flash_compile_ma : absmax=0.000937, mae=0.000072, mse=0.000000
|
| 4497 |
+
hf_flash_attn : absmax=0.000937, mae=0.000072, mse=0.000000
|
| 4498 |
+
hf_flash_attn3 : absmax=0.000937, mae=0.000072, mse=0.000000
|
| 4499 |
Attention Benchmark:
|
| 4500 |
seq_len torch_cudnn torch_cudnn_compile_d torch_cudnn_compile_ma torch_flash torch_flash_compile_d torch_flash_compile_ma hf_flash_attn hf_flash_attn3
|
| 4501 |
+
0 4224.0 3.802832 3.792784 4.181488 3.966576 3.956640 4.313024 3.396816 3.333200
|
| 4502 |
+
1 4352.0 4.081776 4.086624 4.433040 4.399632 4.392240 4.736416 3.837312 3.758016
|
| 4503 |
+
2 4416.0 4.146080 4.139200 4.479680 4.456032 4.446992 4.795904 3.893088 3.864576
|
| 4504 |
+
3 4480.0 4.211200 4.203072 4.555296 4.529248 4.523104 4.877248 3.951152 3.871312
|
| 4505 |
+
4 4544.0 4.436080 4.432784 4.789248 4.585120 4.580192 4.938464 4.010128 3.978448
|
| 4506 |
+
5 4608.0 4.504256 4.497184 4.872832 4.662272 4.654272 5.030304 4.065760 3.986496
|
| 4507 |
</div>
|
| 4508 |
<div class="uv-install-logs" id="uv-logs-benchmark">
|
| 4509 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4510 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
| 4511 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
|
|
|
| 4512 |
Downloading pandas (11.8MiB)
|
| 4513 |
+
Downloading sympy (6.0MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4514 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4515 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4516 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
|
|
|
| 4517 |
Downloading fonttools (4.7MiB)
|
| 4518 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4519 |
Downloading triton (148.3MiB)
|
| 4520 |
+
Downloading setuptools (1.1MiB)
|
| 4521 |
+
Downloading pillow (6.3MiB)
|
| 4522 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4523 |
Downloading kiwisolver (1.4MiB)
|
| 4524 |
+
Downloading matplotlib (8.3MiB)
|
| 4525 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4526 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4527 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4528 |
+
Downloading torch (846.9MiB)
|
| 4529 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4530 |
+
Downloading numpy (16.2MiB)
|
| 4531 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4532 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4533 |
+
Downloading networkx (1.9MiB)
|
| 4534 |
+
Downloading hf-xet (3.0MiB)
|
| 4535 |
Downloading nvidia-cufile-cu12
|
| 4536 |
Downloading kiwisolver
|
| 4537 |
Downloading hf-xet
|
|
|
|
| 4544 |
Downloading sympy
|
| 4545 |
Downloading numpy
|
| 4546 |
Downloading nvidia-nvjitlink-cu12
|
|
|
|
| 4547 |
Downloading pandas
|
| 4548 |
+
Downloading nvidia-curand-cu12
|
| 4549 |
Downloading nvidia-cuda-nvrtc-cu12
|
| 4550 |
Downloading triton
|
| 4551 |
Downloading nvidia-cufft-cu12
|
| 4552 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 4553 |
Downloading nvidia-cusparse-cu12
|
| 4554 |
+
Downloading nvidia-cusparselt-cu12
|
| 4555 |
Downloading nvidia-nccl-cu12
|
| 4556 |
Downloading nvidia-cublas-cu12
|
| 4557 |
Downloading nvidia-cudnn-cu12
|
| 4558 |
Downloading torch
|
| 4559 |
+
Installed 49 packages in 617ms
|
| 4560 |
</div>
|
| 4561 |
</div>
|
| 4562 |
<div class="cell-stderr">Fetching 20 files: 0%| | 0/20 [00:00<?, ?it/s]
|
| 4563 |
+
Fetching 20 files: 5%|▌ | 1/20 [00:00<00:04, 4.35it/s]
|
| 4564 |
+
Fetching 20 files: 10%|█ | 2/20 [00:02<00:23, 1.31s/it]
|
| 4565 |
+
Fetching 20 files: 100%|██████████| 20/20 [00:02<00:00, 8.73it/s]
|
| 4566 |
|
| 4567 |
Fetching 4 files: 0%| | 0/4 [00:00<?, ?it/s]
|
| 4568 |
+
Fetching 4 files: 25%|██▌ | 1/4 [00:00<00:00, 5.70it/s]
|
| 4569 |
+
Fetching 4 files: 50%|█████ | 2/4 [00:01<00:01, 1.31it/s]
|
| 4570 |
+
Fetching 4 files: 100%|██████████| 4/4 [00:01<00:00, 2.96it/s]</div>
|
| 4571 |
<div class="cell-artifacts">
|
| 4572 |
<h4>Artifacts:</h4>
|
| 4573 |
+
<a href="artifacts/benchmark/Attention Benchmark.png" class="artifact" target="_blank">Attention Benchmark.png</a>
|
| 4574 |
+
<a href="artifacts/benchmark/Attention Benchmark.csv" class="artifact" target="_blank">Attention Benchmark.csv</a>
|
| 4575 |
+
<a href="artifacts/benchmark/results.html" class="artifact" target="_blank">results.html</a>
|
| 4576 |
<div class="artifact-preview">
|
| 4577 |
+
<img src="artifacts/benchmark/Attention Benchmark.png" alt="Attention Benchmark.png">
|
| 4578 |
</div>
|
| 4579 |
</div>
|
| 4580 |
</div>
|
flash_attn/cells/benchmark.py
CHANGED
|
@@ -90,8 +90,10 @@ torch._dynamo.config.cache_size_limit = 10000
|
|
| 90 |
# I can't seem to get it to work any other way under torch.compile, so any suggestions are welcome!
|
| 91 |
torch._dynamo.config.suppress_errors = True
|
| 92 |
|
| 93 |
-
output_dir = pathlib.Path("dump_attention_benchmark")
|
| 94 |
-
output_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
|
| 95 |
|
| 96 |
batch_size = 1
|
| 97 |
num_attention_heads = 24
|
|
|
|
| 90 |
# I can't seem to get it to work any other way under torch.compile, so any suggestions are welcome!
|
| 91 |
torch._dynamo.config.suppress_errors = True
|
| 92 |
|
| 93 |
+
# output_dir = pathlib.Path("dump_attention_benchmark")
|
| 94 |
+
# output_dir.mkdir(parents=True, exist_ok=True)
|
| 95 |
+
|
| 96 |
+
output_dir = pathlib.Path(".") # output to current directory for upload
|
| 97 |
|
| 98 |
batch_size = 1
|
| 99 |
num_attention_heads = 24
|
moe_benchmarks/megablocks/cells/nv.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import subprocess
|
| 2 |
+
|
| 3 |
+
print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
|
moe_benchmarks/megablocks/megablocks_only.html
CHANGED
|
@@ -3715,7 +3715,74 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3715 |
</div>
|
| 3716 |
|
| 3717 |
<div class="main-content">
|
| 3718 |
-
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3719 |
<p>First, we run the model without any custom kernels to get a reference point.</p>
|
| 3720 |
<h2>Forward</h2>
|
| 3721 |
<h2>Forward and Backward</h2>
|
|
@@ -3727,7 +3794,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3727 |
<span onclick="toggleOutput('forward_and_backward_no_kernel')" style="cursor: pointer;">▼ output</span>
|
| 3728 |
<span id="uv-indicator-forward_and_backward_no_kernel" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3729 |
</span> |
|
| 3730 |
-
Cell: forward_and_backward_no_kernel |
|
| 3731 |
| <button class="run-btn" onclick="runCell('forward_and_backward_no_kernel')">▶ run</button>
|
| 3732 |
<button class="copy-btn" onclick="copyCell('forward_and_backward_no_kernel')">Copy</button>
|
| 3733 |
<a href="cells/forward_and_backward_no_kernel.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4137,7 +4204,7 @@ Cell: forward_and_backward_no_kernel | 17.02s | FAILED
|
|
| 4137 |
<div id="output-forward_and_backward_no_kernel" class="cell-output">
|
| 4138 |
<div class="cell-stderr">warning: The requested interpreter resolved to Python 3.11.13, which is incompatible with the script's Python requirement: `>=3.12`
|
| 4139 |
Updating https://github.com/huggingface/transformers.git (HEAD)
|
| 4140 |
-
Updated https://github.com/huggingface/transformers.git (
|
| 4141 |
× No solution found when resolving script dependencies:
|
| 4142 |
╰─▶ Because only transformers==4.57.0.dev0 is available and
|
| 4143 |
transformers==4.57.0.dev0 depends on huggingface-hub==1.0.0rc1,
|
|
|
|
| 3715 |
</div>
|
| 3716 |
|
| 3717 |
<div class="main-content">
|
| 3718 |
+
<div class="cell" id="cell-nv">
|
| 3719 |
+
<div class="cell-header">
|
| 3720 |
+
<span class="collapse-indicators">
|
| 3721 |
+
<span onclick="toggleCode('nv')" style="cursor: pointer;">▼ code</span>
|
| 3722 |
+
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3723 |
+
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3724 |
+
</span> |
|
| 3725 |
+
Cell: nv | 0.67s
|
| 3726 |
+
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3727 |
+
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3728 |
+
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
| 3729 |
+
</div>
|
| 3730 |
+
<div id="code-nv" class="cell-code" data-lines="3">
|
| 3731 |
+
<div class="highlight-with-lines">
|
| 3732 |
+
<div class="line-numbers" id="lines-nv">
|
| 3733 |
+
<a class="line-number" data-cell="nv" data-line="1" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 1, true);">1</a>
|
| 3734 |
+
<a class="line-number" data-cell="nv" data-line="2" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 2, true);">2</a>
|
| 3735 |
+
<a class="line-number" data-cell="nv" data-line="3" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 3, true);">3</a>
|
| 3736 |
+
</div>
|
| 3737 |
+
<div class="code-wrap">
|
| 3738 |
+
<div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">subprocess</span>
|
| 3739 |
+
|
| 3740 |
+
<span class="nb">print</span><span class="p">(</span><span class="n">subprocess</span><span class="o">.</span><span class="n">run</span><span class="p">([</span><span class="s2">"nvidia-smi"</span><span class="p">],</span> <span class="n">capture_output</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">text</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">stdout</span><span class="p">)</span>
|
| 3741 |
+
</pre></div>
|
| 3742 |
+
|
| 3743 |
+
<div class="code-line-highlight" id="line-highlight-nv"></div>
|
| 3744 |
+
</div>
|
| 3745 |
+
</div>
|
| 3746 |
+
</div>
|
| 3747 |
+
<div id="output-nv" class="cell-output">
|
| 3748 |
+
<div class="cell-stdout">Thu Sep 25 20:02:38 2025
|
| 3749 |
+
+-----------------------------------------------------------------------------------------+
|
| 3750 |
+
| NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
|
| 3751 |
+
|-----------------------------------------+------------------------+----------------------+
|
| 3752 |
+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3753 |
+
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3754 |
+
| | | MIG M. |
|
| 3755 |
+
|=========================================+========================+======================|
|
| 3756 |
+
| 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
|
| 3757 |
+
| 0% 40C P0 49W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3758 |
+
| | | N/A |
|
| 3759 |
+
+-----------------------------------------+------------------------+----------------------+
|
| 3760 |
+
| 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
|
| 3761 |
+
| 0% 33C P8 23W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3762 |
+
| | | N/A |
|
| 3763 |
+
+-----------------------------------------+------------------------+----------------------+
|
| 3764 |
+
| 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
|
| 3765 |
+
| 0% 33C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3766 |
+
| | | N/A |
|
| 3767 |
+
+-----------------------------------------+------------------------+----------------------+
|
| 3768 |
+
| 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
|
| 3769 |
+
| 0% 34C P8 26W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3770 |
+
| | | N/A |
|
| 3771 |
+
+-----------------------------------------+------------------------+----------------------+
|
| 3772 |
+
|
| 3773 |
+
+-----------------------------------------------------------------------------------------+
|
| 3774 |
+
| Processes: |
|
| 3775 |
+
| GPU GI CI PID Type Process name GPU Memory |
|
| 3776 |
+
| ID ID Usage |
|
| 3777 |
+
|=========================================================================================|
|
| 3778 |
+
| No running processes found |
|
| 3779 |
+
+-----------------------------------------------------------------------------------------+
|
| 3780 |
+
|
| 3781 |
+
</div>
|
| 3782 |
+
</div>
|
| 3783 |
+
</div>
|
| 3784 |
+
|
| 3785 |
+
<h1>No Kernels</h1>
|
| 3786 |
<p>First, we run the model without any custom kernels to get a reference point.</p>
|
| 3787 |
<h2>Forward</h2>
|
| 3788 |
<h2>Forward and Backward</h2>
|
|
|
|
| 3794 |
<span onclick="toggleOutput('forward_and_backward_no_kernel')" style="cursor: pointer;">▼ output</span>
|
| 3795 |
<span id="uv-indicator-forward_and_backward_no_kernel" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3796 |
</span> |
|
| 3797 |
+
Cell: forward_and_backward_no_kernel | 16.89s | FAILED
|
| 3798 |
| <button class="run-btn" onclick="runCell('forward_and_backward_no_kernel')">▶ run</button>
|
| 3799 |
<button class="copy-btn" onclick="copyCell('forward_and_backward_no_kernel')">Copy</button>
|
| 3800 |
<a href="cells/forward_and_backward_no_kernel.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4204 |
<div id="output-forward_and_backward_no_kernel" class="cell-output">
|
| 4205 |
<div class="cell-stderr">warning: The requested interpreter resolved to Python 3.11.13, which is incompatible with the script's Python requirement: `>=3.12`
|
| 4206 |
Updating https://github.com/huggingface/transformers.git (HEAD)
|
| 4207 |
+
Updated https://github.com/huggingface/transformers.git (53838edde77cb10f3a360150aa85a457637e9ac3)
|
| 4208 |
× No solution found when resolving script dependencies:
|
| 4209 |
╰─▶ Because only transformers==4.57.0.dev0 is available and
|
| 4210 |
transformers==4.57.0.dev0 depends on huggingface-hub==1.0.0rc1,
|
moe_benchmarks/megablocks_yamoe/artifacts/binned_run/binned_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms": 36.
|
| 13 |
-
"min_ms": 33.
|
| 14 |
-
"max_ms":
|
| 15 |
-
"std_ms": 1.
|
| 16 |
-
"p50_ms": 36.
|
| 17 |
-
"p95_ms":
|
| 18 |
-
"p99_ms": 38.
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 3.97190523147583
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 36.1408544400183,
|
| 13 |
+
"min_ms": 33.21830599998066,
|
| 14 |
+
"max_ms": 38.347281000142175,
|
| 15 |
+
"std_ms": 1.386811930577117,
|
| 16 |
+
"p50_ms": 36.57941149992894,
|
| 17 |
+
"p95_ms": 37.79091359995164,
|
| 18 |
+
"p99_ms": 38.30271452006173,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 2766.951737844673,
|
| 21 |
+
"throughput_variance": 108.07582031577446
|
| 22 |
},
|
| 23 |
"output_sum": 3.97190523147583
|
| 24 |
}
|
moe_benchmarks/megablocks_yamoe/artifacts/gptoss_run/gptoss_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms":
|
| 13 |
-
"min_ms":
|
| 14 |
-
"max_ms":
|
| 15 |
-
"std_ms":
|
| 16 |
-
"p50_ms":
|
| 17 |
-
"p95_ms": 50.
|
| 18 |
-
"p99_ms":
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 46.66910740000276,
|
| 13 |
+
"min_ms": 40.354887999910716,
|
| 14 |
+
"max_ms": 50.56437000007463,
|
| 15 |
+
"std_ms": 2.944349756547624,
|
| 16 |
+
"p50_ms": 47.07003099997564,
|
| 17 |
+
"p95_ms": 50.338209400013056,
|
| 18 |
+
"p99_ms": 50.5430893000721,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 2142.7450742285696,
|
| 21 |
+
"throughput_variance": 139.2849368532802
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
moe_benchmarks/megablocks_yamoe/artifacts/gptoss_training_run/gptoss_training_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms": 47.
|
| 13 |
-
"min_ms": 40.
|
| 14 |
-
"max_ms": 51.
|
| 15 |
-
"std_ms":
|
| 16 |
-
"p50_ms": 47.
|
| 17 |
-
"p95_ms":
|
| 18 |
-
"p99_ms": 51.
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 47.25082628000564,
|
| 13 |
+
"min_ms": 40.695745999983046,
|
| 14 |
+
"max_ms": 51.11116500006574,
|
| 15 |
+
"std_ms": 2.9791735891229654,
|
| 16 |
+
"p50_ms": 47.64148850006222,
|
| 17 |
+
"p95_ms": 50.98971859999892,
|
| 18 |
+
"p99_ms": 51.07645830010824,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 2116.365106663021,
|
| 21 |
+
"throughput_variance": 137.95784254249725
|
| 22 |
},
|
| 23 |
"output_sum": 11.53223705291748
|
| 24 |
}
|
moe_benchmarks/megablocks_yamoe/artifacts/megablocks_run/megablocks_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms": 3.
|
| 13 |
-
"min_ms": 0.
|
| 14 |
-
"max_ms": 8.
|
| 15 |
-
"std_ms": 3.
|
| 16 |
-
"p50_ms": 0.
|
| 17 |
-
"p95_ms": 8.
|
| 18 |
-
"p99_ms": 8.
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 6.4738850593566895
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 3.824586980035747,
|
| 13 |
+
"min_ms": 0.8051279999108374,
|
| 14 |
+
"max_ms": 8.439711999926658,
|
| 15 |
+
"std_ms": 3.6657124717057186,
|
| 16 |
+
"p50_ms": 0.8526945000539854,
|
| 17 |
+
"p95_ms": 8.437759499952335,
|
| 18 |
+
"p99_ms": 8.439305299993975,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 26146.614136898344,
|
| 21 |
+
"throughput_variance": 52691.24007431396
|
| 22 |
},
|
| 23 |
"output_sum": 6.4738850593566895
|
| 24 |
}
|
moe_benchmarks/megablocks_yamoe/artifacts/visualization/moe_performance_comparison.png
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
moe_benchmarks/megablocks_yamoe/artifacts/yamoe_run/yamoe_results.json
CHANGED
|
@@ -9,16 +9,16 @@
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
-
"avg_ms": 4.
|
| 13 |
-
"min_ms": 4.
|
| 14 |
-
"max_ms": 4.
|
| 15 |
-
"std_ms": 0.
|
| 16 |
-
"p50_ms": 4.
|
| 17 |
-
"p95_ms": 4.
|
| 18 |
-
"p99_ms": 4.
|
| 19 |
"num_iters": 50,
|
| 20 |
-
"tokens_per_s":
|
| 21 |
-
"throughput_variance":
|
| 22 |
},
|
| 23 |
"output_sum": 3.97190523147583
|
| 24 |
}
|
|
|
|
| 9 |
"vary_inputs": true
|
| 10 |
},
|
| 11 |
"stats": {
|
| 12 |
+
"avg_ms": 4.249273639984494,
|
| 13 |
+
"min_ms": 4.135004000090703,
|
| 14 |
+
"max_ms": 4.295798000157447,
|
| 15 |
+
"std_ms": 0.022830750834695483,
|
| 16 |
+
"p50_ms": 4.2523765000623825,
|
| 17 |
+
"p95_ms": 4.274072999987766,
|
| 18 |
+
"p99_ms": 4.289211910063386,
|
| 19 |
"num_iters": 50,
|
| 20 |
+
"tokens_per_s": 23533.433822436742,
|
| 21 |
+
"throughput_variance": 128.24246969319347
|
| 22 |
},
|
| 23 |
"output_sum": 3.97190523147583
|
| 24 |
}
|
moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc
CHANGED
|
Binary files a/moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc and b/moe_benchmarks/megablocks_yamoe/cells/__pycache__/bench_utils.cpython-311.pyc differ
|
|
|
moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc
CHANGED
|
Binary files a/moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc and b/moe_benchmarks/megablocks_yamoe/cells/__pycache__/config.cpython-311.pyc differ
|
|
|
moe_benchmarks/megablocks_yamoe/megablocks_yamoe.html
CHANGED
|
@@ -3715,139 +3715,208 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3715 |
</div>
|
| 3716 |
|
| 3717 |
<div class="main-content">
|
| 3718 |
-
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3719 |
<p>This note compares the performance of the Megablocks and Yamoe kernels on the GPT-OSS-20B model.</p>
|
| 3720 |
<h2>Megablocks kernel</h2>
|
| 3721 |
-
<
|
|
|
|
| 3722 |
<div class="cell-header">
|
| 3723 |
<span class="collapse-indicators">
|
| 3724 |
-
<span onclick="toggleCode('
|
| 3725 |
-
<span onclick="toggleOutput('
|
| 3726 |
-
<span id="uv-indicator-
|
| 3727 |
</span> |
|
| 3728 |
-
Cell:
|
| 3729 |
-
| <button class="run-btn" onclick="runCell('
|
| 3730 |
-
<button class="copy-btn" onclick="copyCell('
|
| 3731 |
-
<a href="cells/
|
| 3732 |
</div>
|
| 3733 |
-
<div id="code-
|
| 3734 |
<div class="highlight-with-lines">
|
| 3735 |
-
<div class="line-numbers" id="lines-
|
| 3736 |
-
<a class="line-number" data-cell="
|
| 3737 |
-
<a class="line-number" data-cell="
|
| 3738 |
-
<a class="line-number" data-cell="
|
| 3739 |
-
<a class="line-number" data-cell="
|
| 3740 |
-
<a class="line-number" data-cell="
|
| 3741 |
-
<a class="line-number" data-cell="
|
| 3742 |
-
<a class="line-number" data-cell="
|
| 3743 |
-
<a class="line-number" data-cell="
|
| 3744 |
-
<a class="line-number" data-cell="
|
| 3745 |
-
<a class="line-number" data-cell="
|
| 3746 |
-
<a class="line-number" data-cell="
|
| 3747 |
-
<a class="line-number" data-cell="
|
| 3748 |
-
<a class="line-number" data-cell="
|
| 3749 |
-
<a class="line-number" data-cell="
|
| 3750 |
-
<a class="line-number" data-cell="
|
| 3751 |
-
<a class="line-number" data-cell="
|
| 3752 |
-
<a class="line-number" data-cell="
|
| 3753 |
-
<a class="line-number" data-cell="
|
| 3754 |
-
<a class="line-number" data-cell="
|
| 3755 |
-
<a class="line-number" data-cell="
|
| 3756 |
-
<a class="line-number" data-cell="
|
| 3757 |
-
<a class="line-number" data-cell="
|
| 3758 |
-
<a class="line-number" data-cell="
|
| 3759 |
-
<a class="line-number" data-cell="
|
| 3760 |
-
<a class="line-number" data-cell="
|
| 3761 |
-
<a class="line-number" data-cell="
|
| 3762 |
-
<a class="line-number" data-cell="
|
| 3763 |
-
<a class="line-number" data-cell="
|
| 3764 |
-
<a class="line-number" data-cell="
|
| 3765 |
-
<a class="line-number" data-cell="
|
| 3766 |
-
<a class="line-number" data-cell="
|
| 3767 |
-
<a class="line-number" data-cell="
|
| 3768 |
-
<a class="line-number" data-cell="
|
| 3769 |
-
<a class="line-number" data-cell="
|
| 3770 |
-
<a class="line-number" data-cell="
|
| 3771 |
-
<a class="line-number" data-cell="
|
| 3772 |
-
<a class="line-number" data-cell="
|
| 3773 |
-
<a class="line-number" data-cell="
|
| 3774 |
-
<a class="line-number" data-cell="
|
| 3775 |
-
<a class="line-number" data-cell="
|
| 3776 |
-
<a class="line-number" data-cell="
|
| 3777 |
-
<a class="line-number" data-cell="
|
| 3778 |
-
<a class="line-number" data-cell="
|
| 3779 |
-
<a class="line-number" data-cell="
|
| 3780 |
-
<a class="line-number" data-cell="
|
| 3781 |
-
<a class="line-number" data-cell="
|
| 3782 |
-
<a class="line-number" data-cell="
|
| 3783 |
-
<a class="line-number" data-cell="
|
| 3784 |
-
<a class="line-number" data-cell="
|
| 3785 |
-
<a class="line-number" data-cell="
|
| 3786 |
-
<a class="line-number" data-cell="
|
| 3787 |
-
<a class="line-number" data-cell="
|
| 3788 |
-
<a class="line-number" data-cell="
|
| 3789 |
-
<a class="line-number" data-cell="
|
| 3790 |
-
<a class="line-number" data-cell="
|
| 3791 |
-
<a class="line-number" data-cell="
|
| 3792 |
-
<a class="line-number" data-cell="
|
| 3793 |
-
<a class="line-number" data-cell="
|
| 3794 |
-
<a class="line-number" data-cell="
|
| 3795 |
-
<a class="line-number" data-cell="
|
| 3796 |
-
<a class="line-number" data-cell="
|
| 3797 |
-
<a class="line-number" data-cell="
|
| 3798 |
-
<a class="line-number" data-cell="
|
| 3799 |
-
<a class="line-number" data-cell="
|
| 3800 |
-
<a class="line-number" data-cell="
|
| 3801 |
-
<a class="line-number" data-cell="
|
| 3802 |
-
<a class="line-number" data-cell="
|
| 3803 |
-
<a class="line-number" data-cell="
|
| 3804 |
-
<a class="line-number" data-cell="
|
| 3805 |
-
<a class="line-number" data-cell="
|
| 3806 |
-
<a class="line-number" data-cell="
|
| 3807 |
-
<a class="line-number" data-cell="
|
| 3808 |
-
<a class="line-number" data-cell="
|
| 3809 |
-
<a class="line-number" data-cell="
|
| 3810 |
-
<a class="line-number" data-cell="
|
| 3811 |
-
<a class="line-number" data-cell="
|
| 3812 |
-
<a class="line-number" data-cell="
|
| 3813 |
-
<a class="line-number" data-cell="
|
| 3814 |
-
<a class="line-number" data-cell="
|
| 3815 |
-
<a class="line-number" data-cell="
|
| 3816 |
-
<a class="line-number" data-cell="
|
| 3817 |
-
<a class="line-number" data-cell="
|
| 3818 |
-
<a class="line-number" data-cell="
|
| 3819 |
-
<a class="line-number" data-cell="
|
| 3820 |
-
<a class="line-number" data-cell="
|
| 3821 |
-
<a class="line-number" data-cell="
|
| 3822 |
-
<a class="line-number" data-cell="
|
| 3823 |
-
<a class="line-number" data-cell="
|
| 3824 |
-
<a class="line-number" data-cell="
|
| 3825 |
-
<a class="line-number" data-cell="
|
| 3826 |
-
<a class="line-number" data-cell="
|
| 3827 |
-
<a class="line-number" data-cell="
|
| 3828 |
-
<a class="line-number" data-cell="
|
| 3829 |
-
<a class="line-number" data-cell="
|
| 3830 |
-
<a class="line-number" data-cell="
|
| 3831 |
-
<a class="line-number" data-cell="
|
| 3832 |
-
<a class="line-number" data-cell="
|
| 3833 |
-
<a class="line-number" data-cell="
|
| 3834 |
-
<a class="line-number" data-cell="
|
| 3835 |
-
<a class="line-number" data-cell="
|
| 3836 |
-
<a class="line-number" data-cell="
|
| 3837 |
-
<a class="line-number" data-cell="
|
| 3838 |
-
<a class="line-number" data-cell="
|
| 3839 |
-
<a class="line-number" data-cell="
|
| 3840 |
-
<a class="line-number" data-cell="
|
| 3841 |
-
<a class="line-number" data-cell="
|
| 3842 |
-
<a class="line-number" data-cell="
|
| 3843 |
-
<a class="line-number" data-cell="
|
| 3844 |
-
<a class="line-number" data-cell="
|
| 3845 |
-
<a class="line-number" data-cell="
|
| 3846 |
-
<a class="line-number" data-cell="
|
| 3847 |
-
<a class="line-number" data-cell="
|
| 3848 |
-
<a class="line-number" data-cell="
|
| 3849 |
-
<a class="line-number" data-cell="
|
| 3850 |
-
<a class="line-number" data-cell="
|
|
|
|
| 3851 |
</div>
|
| 3852 |
<div class="code-wrap">
|
| 3853 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
|
@@ -3916,7 +3985,8 @@ Cell: setup2 | 16.98s | FAILED
|
|
| 3916 |
|
| 3917 |
<span class="kn">from</span><span class="w"> </span><span class="nn">transformers.models.gpt_oss.modeling_gpt_oss</span><span class="w"> </span><span class="kn">import</span> <span class="n">GptOssMLP</span><span class="p">,</span> <span class="n">GptOssRMSNorm</span>
|
| 3918 |
|
| 3919 |
-
<span class="n">replace_kernel_forward_from_hub</span><span class="p">(</span><span class="n">
|
|
|
|
| 3920 |
<span class="n">custom_mapping</span> <span class="o">=</span> <span class="p">{</span>
|
| 3921 |
<span class="s2">"Yamoe"</span><span class="p">:</span> <span class="p">{</span>
|
| 3922 |
<span class="s2">"cuda"</span><span class="p">:</span> <span class="p">{</span>
|
|
@@ -3967,14 +4037,14 @@ Cell: setup2 | 16.98s | FAILED
|
|
| 3967 |
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Generation took </span><span class="si">{</span><span class="n">end_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">start_time</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2"> seconds"</span><span class="p">)</span>
|
| 3968 |
</pre></div>
|
| 3969 |
|
| 3970 |
-
<div class="code-line-highlight" id="line-highlight-
|
| 3971 |
</div>
|
| 3972 |
</div>
|
| 3973 |
</div>
|
| 3974 |
-
<div id="output-
|
| 3975 |
<div class="cell-stderr">warning: The requested interpreter resolved to Python 3.11.13, which is incompatible with the script's Python requirement: `>=3.12`
|
| 3976 |
Updating https://github.com/huggingface/transformers.git (HEAD)
|
| 3977 |
-
Updated https://github.com/huggingface/transformers.git (
|
| 3978 |
× No solution found when resolving script dependencies:
|
| 3979 |
╰─▶ Because only transformers==4.57.0.dev0 is available and
|
| 3980 |
transformers==4.57.0.dev0 depends on huggingface-hub==1.0.0rc1,
|
|
@@ -3988,8 +4058,6 @@ Cell: setup2 | 16.98s | FAILED
|
|
| 3988 |
</div>
|
| 3989 |
</div>
|
| 3990 |
</div>
|
| 3991 |
-
|
| 3992 |
-
<h2>Yamoe Kernel</h2>
|
| 3993 |
</div>
|
| 3994 |
|
| 3995 |
</body>
|
|
|
|
| 3715 |
</div>
|
| 3716 |
|
| 3717 |
<div class="main-content">
|
| 3718 |
+
<div class="cell" id="cell-nv">
|
| 3719 |
+
<div class="cell-header">
|
| 3720 |
+
<span class="collapse-indicators">
|
| 3721 |
+
<span onclick="toggleCode('nv')" style="cursor: pointer;">▼ code</span>
|
| 3722 |
+
<span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
|
| 3723 |
+
<span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3724 |
+
</span> |
|
| 3725 |
+
Cell: nv | 0.71s
|
| 3726 |
+
| <button class="run-btn" onclick="runCell('nv')">▶ run</button>
|
| 3727 |
+
<button class="copy-btn" onclick="copyCell('nv')">Copy</button>
|
| 3728 |
+
<a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
|
| 3729 |
+
</div>
|
| 3730 |
+
<div id="code-nv" class="cell-code" data-lines="3">
|
| 3731 |
+
<div class="highlight-with-lines">
|
| 3732 |
+
<div class="line-numbers" id="lines-nv">
|
| 3733 |
+
<a class="line-number" data-cell="nv" data-line="1" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 1, true);">1</a>
|
| 3734 |
+
<a class="line-number" data-cell="nv" data-line="2" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 2, true);">2</a>
|
| 3735 |
+
<a class="line-number" data-cell="nv" data-line="3" href="#cell-nv" onclick="event.preventDefault(); selectCellLine('nv', 3, true);">3</a>
|
| 3736 |
+
</div>
|
| 3737 |
+
<div class="code-wrap">
|
| 3738 |
+
<div class="highlight"><pre><span></span><span class="kn">import</span><span class="w"> </span><span class="nn">subprocess</span>
|
| 3739 |
+
|
| 3740 |
+
<span class="nb">print</span><span class="p">(</span><span class="n">subprocess</span><span class="o">.</span><span class="n">run</span><span class="p">([</span><span class="s2">"nvidia-smi"</span><span class="p">],</span> <span class="n">capture_output</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">text</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span><span class="o">.</span><span class="n">stdout</span><span class="p">)</span>
|
| 3741 |
+
</pre></div>
|
| 3742 |
+
|
| 3743 |
+
<div class="code-line-highlight" id="line-highlight-nv"></div>
|
| 3744 |
+
</div>
|
| 3745 |
+
</div>
|
| 3746 |
+
</div>
|
| 3747 |
+
<div id="output-nv" class="cell-output">
|
| 3748 |
+
<div class="cell-stdout">Thu Sep 25 20:02:55 2025
|
| 3749 |
+
+-----------------------------------------------------------------------------------------+
|
| 3750 |
+
| NVIDIA-SMI 570.172.08 Driver Version: 570.172.08 CUDA Version: 12.8 |
|
| 3751 |
+
|-----------------------------------------+------------------------+----------------------+
|
| 3752 |
+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
|
| 3753 |
+
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
|
| 3754 |
+
| | | MIG M. |
|
| 3755 |
+
|=========================================+========================+======================|
|
| 3756 |
+
| 0 NVIDIA A10G On | 00000000:00:1B.0 Off | 0 |
|
| 3757 |
+
| 0% 36C P8 25W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3758 |
+
| | | N/A |
|
| 3759 |
+
+-----------------------------------------+------------------------+----------------------+
|
| 3760 |
+
| 1 NVIDIA A10G On | 00000000:00:1C.0 Off | 0 |
|
| 3761 |
+
| 0% 33C P8 23W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3762 |
+
| | | N/A |
|
| 3763 |
+
+-----------------------------------------+------------------------+----------------------+
|
| 3764 |
+
| 2 NVIDIA A10G On | 00000000:00:1D.0 Off | 0 |
|
| 3765 |
+
| 0% 33C P8 24W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3766 |
+
| | | N/A |
|
| 3767 |
+
+-----------------------------------------+------------------------+----------------------+
|
| 3768 |
+
| 3 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 |
|
| 3769 |
+
| 0% 33C P8 26W / 300W | 0MiB / 23028MiB | 0% Default |
|
| 3770 |
+
| | | N/A |
|
| 3771 |
+
+-----------------------------------------+------------------------+----------------------+
|
| 3772 |
+
|
| 3773 |
+
+-----------------------------------------------------------------------------------------+
|
| 3774 |
+
| Processes: |
|
| 3775 |
+
| GPU GI CI PID Type Process name GPU Memory |
|
| 3776 |
+
| ID ID Usage |
|
| 3777 |
+
|=========================================================================================|
|
| 3778 |
+
| No running processes found |
|
| 3779 |
+
+-----------------------------------------------------------------------------------------+
|
| 3780 |
+
|
| 3781 |
+
</div>
|
| 3782 |
+
</div>
|
| 3783 |
+
</div>
|
| 3784 |
+
|
| 3785 |
+
<h1>Comparison of Megablocks and Yamoe Kernels</h1>
|
| 3786 |
<p>This note compares the performance of the Megablocks and Yamoe kernels on the GPT-OSS-20B model.</p>
|
| 3787 |
<h2>Megablocks kernel</h2>
|
| 3788 |
+
<h2>Yamoe Kernel</h2>
|
| 3789 |
+
<div class="cell cell-failed" id="cell-setup">
|
| 3790 |
<div class="cell-header">
|
| 3791 |
<span class="collapse-indicators">
|
| 3792 |
+
<span onclick="toggleCode('setup')" style="cursor: pointer;">▼ code</span>
|
| 3793 |
+
<span onclick="toggleOutput('setup')" style="cursor: pointer;">▼ output</span>
|
| 3794 |
+
<span id="uv-indicator-setup" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
|
| 3795 |
</span> |
|
| 3796 |
+
Cell: setup | 16.83s | FAILED
|
| 3797 |
+
| <button class="run-btn" onclick="runCell('setup')">▶ run</button>
|
| 3798 |
+
<button class="copy-btn" onclick="copyCell('setup')">Copy</button>
|
| 3799 |
+
<a href="cells/setup.py" target="_blank" class="raw-btn">Raw</a>
|
| 3800 |
</div>
|
| 3801 |
+
<div id="code-setup" class="cell-code" data-lines="116">
|
| 3802 |
<div class="highlight-with-lines">
|
| 3803 |
+
<div class="line-numbers" id="lines-setup">
|
| 3804 |
+
<a class="line-number" data-cell="setup" data-line="1" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 1, true);">1</a>
|
| 3805 |
+
<a class="line-number" data-cell="setup" data-line="2" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 2, true);">2</a>
|
| 3806 |
+
<a class="line-number" data-cell="setup" data-line="3" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 3, true);">3</a>
|
| 3807 |
+
<a class="line-number" data-cell="setup" data-line="4" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 4, true);">4</a>
|
| 3808 |
+
<a class="line-number" data-cell="setup" data-line="5" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 5, true);">5</a>
|
| 3809 |
+
<a class="line-number" data-cell="setup" data-line="6" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 6, true);">6</a>
|
| 3810 |
+
<a class="line-number" data-cell="setup" data-line="7" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 7, true);">7</a>
|
| 3811 |
+
<a class="line-number" data-cell="setup" data-line="8" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 8, true);">8</a>
|
| 3812 |
+
<a class="line-number" data-cell="setup" data-line="9" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 9, true);">9</a>
|
| 3813 |
+
<a class="line-number" data-cell="setup" data-line="10" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 10, true);">10</a>
|
| 3814 |
+
<a class="line-number" data-cell="setup" data-line="11" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 11, true);">11</a>
|
| 3815 |
+
<a class="line-number" data-cell="setup" data-line="12" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 12, true);">12</a>
|
| 3816 |
+
<a class="line-number" data-cell="setup" data-line="13" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 13, true);">13</a>
|
| 3817 |
+
<a class="line-number" data-cell="setup" data-line="14" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 14, true);">14</a>
|
| 3818 |
+
<a class="line-number" data-cell="setup" data-line="15" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 15, true);">15</a>
|
| 3819 |
+
<a class="line-number" data-cell="setup" data-line="16" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 16, true);">16</a>
|
| 3820 |
+
<a class="line-number" data-cell="setup" data-line="17" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 17, true);">17</a>
|
| 3821 |
+
<a class="line-number" data-cell="setup" data-line="18" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 18, true);">18</a>
|
| 3822 |
+
<a class="line-number" data-cell="setup" data-line="19" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 19, true);">19</a>
|
| 3823 |
+
<a class="line-number" data-cell="setup" data-line="20" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 20, true);">20</a>
|
| 3824 |
+
<a class="line-number" data-cell="setup" data-line="21" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 21, true);">21</a>
|
| 3825 |
+
<a class="line-number" data-cell="setup" data-line="22" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 22, true);">22</a>
|
| 3826 |
+
<a class="line-number" data-cell="setup" data-line="23" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 23, true);">23</a>
|
| 3827 |
+
<a class="line-number" data-cell="setup" data-line="24" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 24, true);">24</a>
|
| 3828 |
+
<a class="line-number" data-cell="setup" data-line="25" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 25, true);">25</a>
|
| 3829 |
+
<a class="line-number" data-cell="setup" data-line="26" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 26, true);">26</a>
|
| 3830 |
+
<a class="line-number" data-cell="setup" data-line="27" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 27, true);">27</a>
|
| 3831 |
+
<a class="line-number" data-cell="setup" data-line="28" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 28, true);">28</a>
|
| 3832 |
+
<a class="line-number" data-cell="setup" data-line="29" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 29, true);">29</a>
|
| 3833 |
+
<a class="line-number" data-cell="setup" data-line="30" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 30, true);">30</a>
|
| 3834 |
+
<a class="line-number" data-cell="setup" data-line="31" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 31, true);">31</a>
|
| 3835 |
+
<a class="line-number" data-cell="setup" data-line="32" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 32, true);">32</a>
|
| 3836 |
+
<a class="line-number" data-cell="setup" data-line="33" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 33, true);">33</a>
|
| 3837 |
+
<a class="line-number" data-cell="setup" data-line="34" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 34, true);">34</a>
|
| 3838 |
+
<a class="line-number" data-cell="setup" data-line="35" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 35, true);">35</a>
|
| 3839 |
+
<a class="line-number" data-cell="setup" data-line="36" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 36, true);">36</a>
|
| 3840 |
+
<a class="line-number" data-cell="setup" data-line="37" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 37, true);">37</a>
|
| 3841 |
+
<a class="line-number" data-cell="setup" data-line="38" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 38, true);">38</a>
|
| 3842 |
+
<a class="line-number" data-cell="setup" data-line="39" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 39, true);">39</a>
|
| 3843 |
+
<a class="line-number" data-cell="setup" data-line="40" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 40, true);">40</a>
|
| 3844 |
+
<a class="line-number" data-cell="setup" data-line="41" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 41, true);">41</a>
|
| 3845 |
+
<a class="line-number" data-cell="setup" data-line="42" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 42, true);">42</a>
|
| 3846 |
+
<a class="line-number" data-cell="setup" data-line="43" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 43, true);">43</a>
|
| 3847 |
+
<a class="line-number" data-cell="setup" data-line="44" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 44, true);">44</a>
|
| 3848 |
+
<a class="line-number" data-cell="setup" data-line="45" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 45, true);">45</a>
|
| 3849 |
+
<a class="line-number" data-cell="setup" data-line="46" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 46, true);">46</a>
|
| 3850 |
+
<a class="line-number" data-cell="setup" data-line="47" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 47, true);">47</a>
|
| 3851 |
+
<a class="line-number" data-cell="setup" data-line="48" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 48, true);">48</a>
|
| 3852 |
+
<a class="line-number" data-cell="setup" data-line="49" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 49, true);">49</a>
|
| 3853 |
+
<a class="line-number" data-cell="setup" data-line="50" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 50, true);">50</a>
|
| 3854 |
+
<a class="line-number" data-cell="setup" data-line="51" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 51, true);">51</a>
|
| 3855 |
+
<a class="line-number" data-cell="setup" data-line="52" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 52, true);">52</a>
|
| 3856 |
+
<a class="line-number" data-cell="setup" data-line="53" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 53, true);">53</a>
|
| 3857 |
+
<a class="line-number" data-cell="setup" data-line="54" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 54, true);">54</a>
|
| 3858 |
+
<a class="line-number" data-cell="setup" data-line="55" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 55, true);">55</a>
|
| 3859 |
+
<a class="line-number" data-cell="setup" data-line="56" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 56, true);">56</a>
|
| 3860 |
+
<a class="line-number" data-cell="setup" data-line="57" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 57, true);">57</a>
|
| 3861 |
+
<a class="line-number" data-cell="setup" data-line="58" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 58, true);">58</a>
|
| 3862 |
+
<a class="line-number" data-cell="setup" data-line="59" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 59, true);">59</a>
|
| 3863 |
+
<a class="line-number" data-cell="setup" data-line="60" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 60, true);">60</a>
|
| 3864 |
+
<a class="line-number" data-cell="setup" data-line="61" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 61, true);">61</a>
|
| 3865 |
+
<a class="line-number" data-cell="setup" data-line="62" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 62, true);">62</a>
|
| 3866 |
+
<a class="line-number" data-cell="setup" data-line="63" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 63, true);">63</a>
|
| 3867 |
+
<a class="line-number" data-cell="setup" data-line="64" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 64, true);">64</a>
|
| 3868 |
+
<a class="line-number" data-cell="setup" data-line="65" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 65, true);">65</a>
|
| 3869 |
+
<a class="line-number" data-cell="setup" data-line="66" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 66, true);">66</a>
|
| 3870 |
+
<a class="line-number" data-cell="setup" data-line="67" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 67, true);">67</a>
|
| 3871 |
+
<a class="line-number" data-cell="setup" data-line="68" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 68, true);">68</a>
|
| 3872 |
+
<a class="line-number" data-cell="setup" data-line="69" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 69, true);">69</a>
|
| 3873 |
+
<a class="line-number" data-cell="setup" data-line="70" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 70, true);">70</a>
|
| 3874 |
+
<a class="line-number" data-cell="setup" data-line="71" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 71, true);">71</a>
|
| 3875 |
+
<a class="line-number" data-cell="setup" data-line="72" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 72, true);">72</a>
|
| 3876 |
+
<a class="line-number" data-cell="setup" data-line="73" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 73, true);">73</a>
|
| 3877 |
+
<a class="line-number" data-cell="setup" data-line="74" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 74, true);">74</a>
|
| 3878 |
+
<a class="line-number" data-cell="setup" data-line="75" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 75, true);">75</a>
|
| 3879 |
+
<a class="line-number" data-cell="setup" data-line="76" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 76, true);">76</a>
|
| 3880 |
+
<a class="line-number" data-cell="setup" data-line="77" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 77, true);">77</a>
|
| 3881 |
+
<a class="line-number" data-cell="setup" data-line="78" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 78, true);">78</a>
|
| 3882 |
+
<a class="line-number" data-cell="setup" data-line="79" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 79, true);">79</a>
|
| 3883 |
+
<a class="line-number" data-cell="setup" data-line="80" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 80, true);">80</a>
|
| 3884 |
+
<a class="line-number" data-cell="setup" data-line="81" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 81, true);">81</a>
|
| 3885 |
+
<a class="line-number" data-cell="setup" data-line="82" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 82, true);">82</a>
|
| 3886 |
+
<a class="line-number" data-cell="setup" data-line="83" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 83, true);">83</a>
|
| 3887 |
+
<a class="line-number" data-cell="setup" data-line="84" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 84, true);">84</a>
|
| 3888 |
+
<a class="line-number" data-cell="setup" data-line="85" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 85, true);">85</a>
|
| 3889 |
+
<a class="line-number" data-cell="setup" data-line="86" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 86, true);">86</a>
|
| 3890 |
+
<a class="line-number" data-cell="setup" data-line="87" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 87, true);">87</a>
|
| 3891 |
+
<a class="line-number" data-cell="setup" data-line="88" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 88, true);">88</a>
|
| 3892 |
+
<a class="line-number" data-cell="setup" data-line="89" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 89, true);">89</a>
|
| 3893 |
+
<a class="line-number" data-cell="setup" data-line="90" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 90, true);">90</a>
|
| 3894 |
+
<a class="line-number" data-cell="setup" data-line="91" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 91, true);">91</a>
|
| 3895 |
+
<a class="line-number" data-cell="setup" data-line="92" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 92, true);">92</a>
|
| 3896 |
+
<a class="line-number" data-cell="setup" data-line="93" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 93, true);">93</a>
|
| 3897 |
+
<a class="line-number" data-cell="setup" data-line="94" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 94, true);">94</a>
|
| 3898 |
+
<a class="line-number" data-cell="setup" data-line="95" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 95, true);">95</a>
|
| 3899 |
+
<a class="line-number" data-cell="setup" data-line="96" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 96, true);">96</a>
|
| 3900 |
+
<a class="line-number" data-cell="setup" data-line="97" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 97, true);">97</a>
|
| 3901 |
+
<a class="line-number" data-cell="setup" data-line="98" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 98, true);">98</a>
|
| 3902 |
+
<a class="line-number" data-cell="setup" data-line="99" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 99, true);">99</a>
|
| 3903 |
+
<a class="line-number" data-cell="setup" data-line="100" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 100, true);">100</a>
|
| 3904 |
+
<a class="line-number" data-cell="setup" data-line="101" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 101, true);">101</a>
|
| 3905 |
+
<a class="line-number" data-cell="setup" data-line="102" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 102, true);">102</a>
|
| 3906 |
+
<a class="line-number" data-cell="setup" data-line="103" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 103, true);">103</a>
|
| 3907 |
+
<a class="line-number" data-cell="setup" data-line="104" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 104, true);">104</a>
|
| 3908 |
+
<a class="line-number" data-cell="setup" data-line="105" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 105, true);">105</a>
|
| 3909 |
+
<a class="line-number" data-cell="setup" data-line="106" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 106, true);">106</a>
|
| 3910 |
+
<a class="line-number" data-cell="setup" data-line="107" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 107, true);">107</a>
|
| 3911 |
+
<a class="line-number" data-cell="setup" data-line="108" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 108, true);">108</a>
|
| 3912 |
+
<a class="line-number" data-cell="setup" data-line="109" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 109, true);">109</a>
|
| 3913 |
+
<a class="line-number" data-cell="setup" data-line="110" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 110, true);">110</a>
|
| 3914 |
+
<a class="line-number" data-cell="setup" data-line="111" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 111, true);">111</a>
|
| 3915 |
+
<a class="line-number" data-cell="setup" data-line="112" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 112, true);">112</a>
|
| 3916 |
+
<a class="line-number" data-cell="setup" data-line="113" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 113, true);">113</a>
|
| 3917 |
+
<a class="line-number" data-cell="setup" data-line="114" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 114, true);">114</a>
|
| 3918 |
+
<a class="line-number" data-cell="setup" data-line="115" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 115, true);">115</a>
|
| 3919 |
+
<a class="line-number" data-cell="setup" data-line="116" href="#cell-setup" onclick="event.preventDefault(); selectCellLine('setup', 116, true);">116</a>
|
| 3920 |
</div>
|
| 3921 |
<div class="code-wrap">
|
| 3922 |
<div class="highlight"><pre><span></span><span class="c1"># /// script</span>
|
|
|
|
| 3985 |
|
| 3986 |
<span class="kn">from</span><span class="w"> </span><span class="nn">transformers.models.gpt_oss.modeling_gpt_oss</span><span class="w"> </span><span class="kn">import</span> <span class="n">GptOssMLP</span><span class="p">,</span> <span class="n">GptOssRMSNorm</span>
|
| 3987 |
|
| 3988 |
+
<span class="n">replace_kernel_forward_from_hub</span><span class="p">(</span><span class="n">GptOssMLP</span><span class="p">,</span> <span class="s2">"Yamoe"</span><span class="p">)</span>
|
| 3989 |
+
<span class="n">replace_kernel_forward_from_hub</span><span class="p">(</span><span class="n">GptOssRMSNorm</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
|
| 3990 |
<span class="n">custom_mapping</span> <span class="o">=</span> <span class="p">{</span>
|
| 3991 |
<span class="s2">"Yamoe"</span><span class="p">:</span> <span class="p">{</span>
|
| 3992 |
<span class="s2">"cuda"</span><span class="p">:</span> <span class="p">{</span>
|
|
|
|
| 4037 |
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Generation took </span><span class="si">{</span><span class="n">end_time</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">start_time</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2"> seconds"</span><span class="p">)</span>
|
| 4038 |
</pre></div>
|
| 4039 |
|
| 4040 |
+
<div class="code-line-highlight" id="line-highlight-setup"></div>
|
| 4041 |
</div>
|
| 4042 |
</div>
|
| 4043 |
</div>
|
| 4044 |
+
<div id="output-setup" class="cell-output">
|
| 4045 |
<div class="cell-stderr">warning: The requested interpreter resolved to Python 3.11.13, which is incompatible with the script's Python requirement: `>=3.12`
|
| 4046 |
Updating https://github.com/huggingface/transformers.git (HEAD)
|
| 4047 |
+
Updated https://github.com/huggingface/transformers.git (53838edde77cb10f3a360150aa85a457637e9ac3)
|
| 4048 |
× No solution found when resolving script dependencies:
|
| 4049 |
╰─▶ Because only transformers==4.57.0.dev0 is available and
|
| 4050 |
transformers==4.57.0.dev0 depends on huggingface-hub==1.0.0rc1,
|
|
|
|
| 4058 |
</div>
|
| 4059 |
</div>
|
| 4060 |
</div>
|
|
|
|
|
|
|
| 4061 |
</div>
|
| 4062 |
|
| 4063 |
</body>
|
moe_benchmarks/megablocks_yamoe/torch_profile.html
CHANGED
|
@@ -3720,7 +3720,7 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left:
|
|
| 3720 |
<span onclick="toggleOutput('utils')" style="cursor: pointer;">▼ output</span>
|
| 3721 |
<span id="uv-indicator-utils" onclick="toggleUvLogsFromHeader('utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3722 |
</span> |
|
| 3723 |
-
Cell: utils | deps: torch, numpy |
|
| 3724 |
| <button class="run-btn" onclick="runCell('utils')">▶ run</button>
|
| 3725 |
<button class="copy-btn" onclick="copyCell('utils')">Copy</button>
|
| 3726 |
<a href="cells/utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -3794,24 +3794,24 @@ Cell: utils | deps: torch, numpy | 34.73s
|
|
| 3794 |
<div class="uv-install-logs" id="uv-logs-utils">
|
| 3795 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3796 |
<div class="uv-logs-content" style="display: none;">
|
| 3797 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3798 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3799 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3800 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3801 |
Downloading networkx (1.9MiB)
|
| 3802 |
-
Downloading sympy (6.0MiB)
|
| 3803 |
Downloading setuptools (1.1MiB)
|
| 3804 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3805 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3806 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3807 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3808 |
Downloading numpy (16.2MiB)
|
| 3809 |
-
Downloading nvidia-
|
|
|
|
|
|
|
|
|
|
| 3810 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3811 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 3812 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3813 |
Downloading torch (846.9MiB)
|
| 3814 |
-
Downloading
|
| 3815 |
Downloading nvidia-cufile-cu12
|
| 3816 |
Downloading setuptools
|
| 3817 |
Downloading networkx
|
|
@@ -3830,7 +3830,7 @@ Downloading triton (148.3MiB)
|
|
| 3830 |
Downloading nvidia-cublas-cu12
|
| 3831 |
Downloading nvidia-cudnn-cu12
|
| 3832 |
Downloading torch
|
| 3833 |
-
Installed 26 packages in
|
| 3834 |
</div>
|
| 3835 |
</div>
|
| 3836 |
</div>
|
|
@@ -3843,7 +3843,7 @@ Installed 26 packages in 456ms
|
|
| 3843 |
<span onclick="toggleOutput('bench_utils')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
<span id="uv-indicator-bench_utils" onclick="toggleUvLogsFromHeader('bench_utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
-
Cell: bench_utils | deps: torch, numpy | 34.
|
| 3847 |
| <button class="run-btn" onclick="runCell('bench_utils')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('bench_utils')">Copy</button>
|
| 3849 |
<a href="cells/bench_utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4331,24 +4331,24 @@ Cell: bench_utils | deps: torch, numpy | 34.06s
|
|
| 4331 |
<div class="uv-install-logs" id="uv-logs-bench_utils">
|
| 4332 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4333 |
<div class="uv-logs-content" style="display: none;">
|
| 4334 |
-
Downloading
|
| 4335 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4336 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4337 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4338 |
Downloading setuptools (1.1MiB)
|
| 4339 |
-
Downloading nvidia-
|
| 4340 |
-
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4341 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4342 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4343 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4344 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4345 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4346 |
Downloading sympy (6.0MiB)
|
| 4347 |
-
Downloading
|
| 4348 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4349 |
Downloading numpy (16.2MiB)
|
| 4350 |
-
Downloading
|
| 4351 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4352 |
Downloading nvidia-cufile-cu12
|
| 4353 |
Downloading setuptools
|
| 4354 |
Downloading networkx
|
|
@@ -4361,13 +4361,13 @@ Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
|
| 4361 |
Downloading triton
|
| 4362 |
Downloading nvidia-cufft-cu12
|
| 4363 |
Downloading nvidia-cusolver-cu12
|
| 4364 |
-
Downloading nvidia-cusparselt-cu12
|
| 4365 |
Downloading nvidia-cusparse-cu12
|
|
|
|
| 4366 |
Downloading nvidia-nccl-cu12
|
| 4367 |
Downloading nvidia-cublas-cu12
|
| 4368 |
Downloading nvidia-cudnn-cu12
|
| 4369 |
Downloading torch
|
| 4370 |
-
Installed 26 packages in
|
| 4371 |
</div>
|
| 4372 |
</div>
|
| 4373 |
</div>
|
|
@@ -4381,7 +4381,7 @@ Installed 26 packages in 448ms
|
|
| 4381 |
<span onclick="toggleOutput('config')" style="cursor: pointer;">▼ output</span>
|
| 4382 |
<span id="uv-indicator-config" onclick="toggleUvLogsFromHeader('config')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4383 |
</span> |
|
| 4384 |
-
Cell: config | deps: torch, numpy |
|
| 4385 |
| <button class="run-btn" onclick="runCell('config')">▶ run</button>
|
| 4386 |
<button class="copy-btn" onclick="copyCell('config')">Copy</button>
|
| 4387 |
<a href="cells/config.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4441,24 +4441,24 @@ Cell: config | deps: torch, numpy | 35.66s
|
|
| 4441 |
<div class="uv-install-logs" id="uv-logs-config">
|
| 4442 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4443 |
<div class="uv-logs-content" style="display: none;">
|
| 4444 |
-
Downloading
|
| 4445 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4446 |
Downloading sympy (6.0MiB)
|
| 4447 |
-
Downloading nvidia-
|
| 4448 |
-
Downloading
|
|
|
|
|
|
|
| 4449 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4450 |
-
Downloading
|
| 4451 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4452 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4453 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4454 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4455 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4456 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4457 |
-
Downloading networkx (1.9MiB)
|
| 4458 |
-
Downloading torch (846.9MiB)
|
| 4459 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4460 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4461 |
-
Downloading
|
| 4462 |
Downloading nvidia-cufile-cu12
|
| 4463 |
Downloading setuptools
|
| 4464 |
Downloading networkx
|
|
@@ -4477,7 +4477,7 @@ Downloading triton (148.3MiB)
|
|
| 4477 |
Downloading nvidia-cublas-cu12
|
| 4478 |
Downloading nvidia-cudnn-cu12
|
| 4479 |
Downloading torch
|
| 4480 |
-
Installed 26 packages in
|
| 4481 |
</div>
|
| 4482 |
</div>
|
| 4483 |
</div>
|
|
@@ -4490,7 +4490,7 @@ Installed 26 packages in 456ms
|
|
| 4490 |
<span onclick="toggleOutput('save_data')" style="cursor: pointer;">▼ output</span>
|
| 4491 |
<span id="uv-indicator-save_data" onclick="toggleUvLogsFromHeader('save_data')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4492 |
</span> |
|
| 4493 |
-
Cell: save_data | deps: torch, numpy |
|
| 4494 |
| <button class="run-btn" onclick="runCell('save_data')">▶ run</button>
|
| 4495 |
<button class="copy-btn" onclick="copyCell('save_data')">Copy</button>
|
| 4496 |
<a href="cells/save_data.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4585,24 +4585,24 @@ Down sum: 206.729263
|
|
| 4585 |
<div class="uv-install-logs" id="uv-logs-save_data">
|
| 4586 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4587 |
<div class="uv-logs-content" style="display: none;">
|
| 4588 |
-
Downloading
|
| 4589 |
-
Downloading nvidia-
|
|
|
|
| 4590 |
Downloading numpy (16.2MiB)
|
| 4591 |
-
Downloading
|
| 4592 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4593 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4594 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4595 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4596 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4597 |
Downloading setuptools (1.1MiB)
|
|
|
|
| 4598 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4599 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4600 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4601 |
-
Downloading networkx (1.9MiB)
|
| 4602 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4603 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4604 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4605 |
Downloading triton (148.3MiB)
|
|
|
|
| 4606 |
Downloading nvidia-cufile-cu12
|
| 4607 |
Downloading setuptools
|
| 4608 |
Downloading networkx
|
|
@@ -4615,23 +4615,23 @@ Downloading triton (148.3MiB)
|
|
| 4615 |
Downloading triton
|
| 4616 |
Downloading nvidia-cufft-cu12
|
| 4617 |
Downloading nvidia-cusolver-cu12
|
| 4618 |
-
Downloading nvidia-cusparse-cu12
|
| 4619 |
Downloading nvidia-cusparselt-cu12
|
|
|
|
| 4620 |
Downloading nvidia-nccl-cu12
|
| 4621 |
Downloading nvidia-cublas-cu12
|
| 4622 |
Downloading nvidia-cudnn-cu12
|
| 4623 |
Downloading torch
|
| 4624 |
-
Installed 26 packages in
|
| 4625 |
</div>
|
| 4626 |
</div>
|
| 4627 |
<div class="cell-artifacts">
|
| 4628 |
<h4>Artifacts:</h4>
|
| 4629 |
-
<a href="artifacts/save_data/
|
| 4630 |
-
<a href="artifacts/save_data/router_weight.pt" class="artifact" target="_blank">router_weight.pt</a>
|
| 4631 |
<a href="artifacts/save_data/down_proj_bias.pt" class="artifact" target="_blank">down_proj_bias.pt</a>
|
| 4632 |
-
<a href="artifacts/save_data/gate_up_proj.pt" class="artifact" target="_blank">gate_up_proj.pt</a>
|
| 4633 |
<a href="artifacts/save_data/gate_up_proj_bias.pt" class="artifact" target="_blank">gate_up_proj_bias.pt</a>
|
| 4634 |
-
<a href="artifacts/save_data/
|
|
|
|
|
|
|
| 4635 |
</div>
|
| 4636 |
</div>
|
| 4637 |
</div>
|
|
@@ -4645,7 +4645,7 @@ Installed 26 packages in 453ms
|
|
| 4645 |
<span onclick="toggleOutput('yamoe_run')" style="cursor: pointer;">▼ output</span>
|
| 4646 |
<span id="uv-indicator-yamoe_run" onclick="toggleUvLogsFromHeader('yamoe_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4647 |
</span> |
|
| 4648 |
-
Cell: yamoe_run | deps: torch, kernels, numpy | 38.
|
| 4649 |
| <button class="run-btn" onclick="runCell('yamoe_run')">▶ run</button>
|
| 4650 |
<button class="copy-btn" onclick="copyCell('yamoe_run')">Copy</button>
|
| 4651 |
<a href="cells/yamoe_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -4938,10 +4938,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 4938 |
|
| 4939 |
Warming up (10 iterations)...
|
| 4940 |
Benchmarking (50 iterations)...
|
| 4941 |
-
Progress: 20% complete (avg: 4.
|
| 4942 |
-
Progress: 40% complete (avg: 4.
|
| 4943 |
-
Progress: 60% complete (avg: 4.
|
| 4944 |
-
Progress: 80% complete (avg: 4.
|
| 4945 |
|
| 4946 |
Output tensors:
|
| 4947 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
|
|
@@ -4952,18 +4952,18 @@ Iterations: 50
|
|
| 4952 |
|
| 4953 |
Latency Statistics:
|
| 4954 |
Average: 4.249 ms
|
| 4955 |
-
Min: 4.
|
| 4956 |
-
Max: 4.
|
| 4957 |
-
Std Dev: 0.
|
| 4958 |
|
| 4959 |
Percentiles:
|
| 4960 |
-
P50 (median): 4.
|
| 4961 |
-
P95: 4.
|
| 4962 |
-
P99: 4.
|
| 4963 |
|
| 4964 |
Throughput:
|
| 4965 |
-
Tokens/sec:
|
| 4966 |
-
Std Dev:
|
| 4967 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 4968 |
|
| 4969 |
Saved benchmark results to yamoe_results.json
|
|
@@ -4973,25 +4973,25 @@ Output sum: 3.971905
|
|
| 4973 |
<div class="uv-install-logs" id="uv-logs-yamoe_run">
|
| 4974 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4975 |
<div class="uv-logs-content" style="display: none;">
|
| 4976 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4977 |
-
Downloading setuptools (1.1MiB)
|
| 4978 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4979 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4980 |
Downloading sympy (6.0MiB)
|
| 4981 |
-
Downloading nvidia-
|
| 4982 |
-
Downloading
|
| 4983 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
|
|
|
|
|
|
| 4984 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
|
|
|
|
|
| 4985 |
Downloading triton (148.3MiB)
|
| 4986 |
-
Downloading
|
| 4987 |
-
Downloading nvidia-
|
| 4988 |
-
Downloading numpy (16.2MiB)
|
| 4989 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4990 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4991 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4992 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4993 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
| 4994 |
Downloading torch (846.9MiB)
|
|
|
|
|
|
|
|
|
|
| 4995 |
Downloading nvidia-cufile-cu12
|
| 4996 |
Downloading hf-xet
|
| 4997 |
Downloading setuptools
|
|
@@ -5005,20 +5005,19 @@ Downloading torch (846.9MiB)
|
|
| 5005 |
Downloading triton
|
| 5006 |
Downloading nvidia-cufft-cu12
|
| 5007 |
Downloading nvidia-cusolver-cu12
|
| 5008 |
-
Downloading nvidia-cusparselt-cu12
|
| 5009 |
Downloading nvidia-cusparse-cu12
|
|
|
|
| 5010 |
Downloading nvidia-nccl-cu12
|
| 5011 |
Downloading nvidia-cublas-cu12
|
| 5012 |
Downloading nvidia-cudnn-cu12
|
| 5013 |
Downloading torch
|
| 5014 |
-
Installed 37 packages in
|
| 5015 |
</div>
|
| 5016 |
</div>
|
| 5017 |
<div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s]
|
| 5018 |
-
Fetching 6 files: 17%|█▋ | 1/6 [00:00<00:01,
|
| 5019 |
-
Fetching 6 files:
|
| 5020 |
-
Fetching 6 files:
|
| 5021 |
-
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 7.04it/s]</div>
|
| 5022 |
<div class="cell-artifacts">
|
| 5023 |
<h4>Artifacts:</h4>
|
| 5024 |
<a href="artifacts/yamoe_run/yamoe_results.json" class="artifact" target="_blank">yamoe_results.json</a>
|
|
@@ -5035,7 +5034,7 @@ Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 7.0
|
|
| 5035 |
<span onclick="toggleOutput('binned_run')" style="cursor: pointer;">▼ output</span>
|
| 5036 |
<span id="uv-indicator-binned_run" onclick="toggleUvLogsFromHeader('binned_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5037 |
</span> |
|
| 5038 |
-
Cell: binned_run | deps: torch, numpy | 39.
|
| 5039 |
| <button class="run-btn" onclick="runCell('binned_run')">▶ run</button>
|
| 5040 |
<button class="copy-btn" onclick="copyCell('binned_run')">Copy</button>
|
| 5041 |
<a href="cells/binned_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -5449,10 +5448,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 5449 |
|
| 5450 |
Warming up (10 iterations)...
|
| 5451 |
Benchmarking (50 iterations)...
|
| 5452 |
-
Progress: 20% complete (avg:
|
| 5453 |
-
Progress: 40% complete (avg:
|
| 5454 |
-
Progress: 60% complete (avg: 37.
|
| 5455 |
-
Progress: 80% complete (avg:
|
| 5456 |
|
| 5457 |
Output tensors:
|
| 5458 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
|
|
@@ -5462,19 +5461,19 @@ Output tensors:
|
|
| 5462 |
Iterations: 50
|
| 5463 |
|
| 5464 |
Latency Statistics:
|
| 5465 |
-
Average: 36.
|
| 5466 |
-
Min: 33.
|
| 5467 |
-
Max:
|
| 5468 |
-
Std Dev: 1.
|
| 5469 |
|
| 5470 |
Percentiles:
|
| 5471 |
-
P50 (median): 36.
|
| 5472 |
-
P95:
|
| 5473 |
-
P99: 38.
|
| 5474 |
|
| 5475 |
Throughput:
|
| 5476 |
-
Tokens/sec:
|
| 5477 |
-
Std Dev:
|
| 5478 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 5479 |
|
| 5480 |
Saved benchmark results to binned_results.json
|
|
@@ -5484,24 +5483,24 @@ Output sum: 3.971905
|
|
| 5484 |
<div class="uv-install-logs" id="uv-logs-binned_run">
|
| 5485 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5486 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
| 5487 |
Downloading networkx (1.9MiB)
|
| 5488 |
-
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 5489 |
-
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 5490 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 5491 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 5492 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 5493 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 5494 |
-
Downloading
|
| 5495 |
-
Downloading
|
|
|
|
| 5496 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 5497 |
-
Downloading nvidia-
|
|
|
|
| 5498 |
Downloading sympy (6.0MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5499 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 5500 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 5501 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 5502 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 5503 |
-
Downloading triton (148.3MiB)
|
| 5504 |
Downloading torch (846.9MiB)
|
|
|
|
| 5505 |
Downloading nvidia-cufile-cu12
|
| 5506 |
Downloading setuptools
|
| 5507 |
Downloading networkx
|
|
@@ -5520,7 +5519,7 @@ Downloading torch (846.9MiB)
|
|
| 5520 |
Downloading nvidia-cublas-cu12
|
| 5521 |
Downloading nvidia-cudnn-cu12
|
| 5522 |
Downloading torch
|
| 5523 |
-
Installed 26 packages in
|
| 5524 |
</div>
|
| 5525 |
</div>
|
| 5526 |
<div class="cell-artifacts">
|
|
@@ -5539,7 +5538,7 @@ Installed 26 packages in 450ms
|
|
| 5539 |
<span onclick="toggleOutput('gptoss_run')" style="cursor: pointer;">▼ output</span>
|
| 5540 |
<span id="uv-indicator-gptoss_run" onclick="toggleUvLogsFromHeader('gptoss_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5541 |
</span> |
|
| 5542 |
-
Cell: gptoss_run | deps: torch, numpy |
|
| 5543 |
| <button class="run-btn" onclick="runCell('gptoss_run')">▶ run</button>
|
| 5544 |
<button class="copy-btn" onclick="copyCell('gptoss_run')">Copy</button>
|
| 5545 |
<a href="cells/gptoss_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -5857,10 +5856,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 5857 |
|
| 5858 |
Warming up (10 iterations)...
|
| 5859 |
Benchmarking (50 iterations)...
|
| 5860 |
-
Progress: 20% complete (avg: 50.
|
| 5861 |
-
Progress: 40% complete (avg: 49.
|
| 5862 |
-
Progress: 60% complete (avg:
|
| 5863 |
-
Progress: 80% complete (avg:
|
| 5864 |
|
| 5865 |
Output tensors:
|
| 5866 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
@@ -5870,19 +5869,19 @@ Output tensors:
|
|
| 5870 |
Iterations: 50
|
| 5871 |
|
| 5872 |
Latency Statistics:
|
| 5873 |
-
Average:
|
| 5874 |
-
Min:
|
| 5875 |
-
Max:
|
| 5876 |
-
Std Dev:
|
| 5877 |
|
| 5878 |
Percentiles:
|
| 5879 |
-
P50 (median):
|
| 5880 |
-
P95: 50.
|
| 5881 |
-
P99:
|
| 5882 |
|
| 5883 |
Throughput:
|
| 5884 |
-
Tokens/sec:
|
| 5885 |
-
Std Dev:
|
| 5886 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 5887 |
|
| 5888 |
Saved benchmark results to gptoss_results.json
|
|
@@ -5892,24 +5891,24 @@ Output sum: 11.532237
|
|
| 5892 |
<div class="uv-install-logs" id="uv-logs-gptoss_run">
|
| 5893 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5894 |
<div class="uv-logs-content" style="display: none;">
|
| 5895 |
-
Downloading
|
|
|
|
| 5896 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 5897 |
-
Downloading numpy (16.2MiB)
|
| 5898 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 5899 |
-
Downloading setuptools (1.1MiB)
|
| 5900 |
-
Downloading triton (148.3MiB)
|
| 5901 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 5902 |
-
Downloading
|
|
|
|
|
|
|
| 5903 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 5904 |
-
Downloading
|
| 5905 |
-
Downloading networkx (1.9MiB)
|
| 5906 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 5907 |
-
Downloading
|
| 5908 |
-
Downloading
|
| 5909 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 5910 |
-
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 5911 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
|
|
|
|
|
| 5912 |
Downloading torch (846.9MiB)
|
|
|
|
| 5913 |
Downloading nvidia-cufile-cu12
|
| 5914 |
Downloading setuptools
|
| 5915 |
Downloading networkx
|
|
@@ -5928,7 +5927,7 @@ Downloading torch (846.9MiB)
|
|
| 5928 |
Downloading nvidia-cublas-cu12
|
| 5929 |
Downloading nvidia-cudnn-cu12
|
| 5930 |
Downloading torch
|
| 5931 |
-
Installed 26 packages in
|
| 5932 |
</div>
|
| 5933 |
</div>
|
| 5934 |
<div class="cell-artifacts">
|
|
@@ -5947,7 +5946,7 @@ Installed 26 packages in 443ms
|
|
| 5947 |
<span onclick="toggleOutput('gptoss_training_run')" style="cursor: pointer;">▼ output</span>
|
| 5948 |
<span id="uv-indicator-gptoss_training_run" onclick="toggleUvLogsFromHeader('gptoss_training_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5949 |
</span> |
|
| 5950 |
-
Cell: gptoss_training_run | deps: torch, numpy | 40.
|
| 5951 |
| <button class="run-btn" onclick="runCell('gptoss_training_run')">▶ run</button>
|
| 5952 |
<button class="copy-btn" onclick="copyCell('gptoss_training_run')">Copy</button>
|
| 5953 |
<a href="cells/gptoss_training_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -6248,10 +6247,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 6248 |
|
| 6249 |
Warming up (10 iterations)...
|
| 6250 |
Benchmarking (50 iterations)...
|
| 6251 |
-
Progress: 20% complete (avg: 50.
|
| 6252 |
-
Progress: 40% complete (avg: 50.
|
| 6253 |
-
Progress: 60% complete (avg: 49.
|
| 6254 |
-
Progress: 80% complete (avg: 48.
|
| 6255 |
|
| 6256 |
Output tensors:
|
| 6257 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
@@ -6261,19 +6260,19 @@ Output tensors:
|
|
| 6261 |
Iterations: 50
|
| 6262 |
|
| 6263 |
Latency Statistics:
|
| 6264 |
-
Average: 47.
|
| 6265 |
-
Min: 40.
|
| 6266 |
-
Max: 51.
|
| 6267 |
-
Std Dev:
|
| 6268 |
|
| 6269 |
Percentiles:
|
| 6270 |
-
P50 (median): 47.
|
| 6271 |
-
P95:
|
| 6272 |
-
P99: 51.
|
| 6273 |
|
| 6274 |
Throughput:
|
| 6275 |
-
Tokens/sec:
|
| 6276 |
-
Std Dev:
|
| 6277 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 6278 |
|
| 6279 |
Saved benchmark results to gptoss_training_results.json
|
|
@@ -6283,30 +6282,30 @@ Output sum: 11.532237
|
|
| 6283 |
<div class="uv-install-logs" id="uv-logs-gptoss_training_run">
|
| 6284 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6285 |
<div class="uv-logs-content" style="display: none;">
|
| 6286 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 6287 |
-
Downloading numpy (16.2MiB)
|
| 6288 |
-
Downloading sympy (6.0MiB)
|
| 6289 |
-
Downloading networkx (1.9MiB)
|
| 6290 |
Downloading setuptools (1.1MiB)
|
| 6291 |
-
Downloading nvidia-
|
| 6292 |
-
Downloading torch (846.9MiB)
|
| 6293 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 6294 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6295 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 6296 |
-
Downloading nvidia-
|
| 6297 |
-
Downloading nvidia-
|
|
|
|
| 6298 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 6299 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 6300 |
-
Downloading nvidia-
|
| 6301 |
-
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 6302 |
Downloading triton (148.3MiB)
|
| 6303 |
-
Downloading
|
|
|
|
| 6304 |
Downloading nvidia-cufile-cu12
|
| 6305 |
Downloading setuptools
|
| 6306 |
Downloading networkx
|
| 6307 |
Downloading nvidia-cuda-cupti-cu12
|
| 6308 |
-
Downloading sympy
|
| 6309 |
Downloading numpy
|
|
|
|
| 6310 |
Downloading nvidia-nvjitlink-cu12
|
| 6311 |
Downloading nvidia-curand-cu12
|
| 6312 |
Downloading nvidia-cuda-nvrtc-cu12
|
|
@@ -6319,7 +6318,7 @@ Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
| 6319 |
Downloading nvidia-cublas-cu12
|
| 6320 |
Downloading nvidia-cudnn-cu12
|
| 6321 |
Downloading torch
|
| 6322 |
-
Installed 26 packages in
|
| 6323 |
</div>
|
| 6324 |
</div>
|
| 6325 |
<div class="cell-artifacts">
|
|
@@ -6338,7 +6337,7 @@ Installed 26 packages in 451ms
|
|
| 6338 |
<span onclick="toggleOutput('megablocks_run')" style="cursor: pointer;">▼ output</span>
|
| 6339 |
<span id="uv-indicator-megablocks_run" onclick="toggleUvLogsFromHeader('megablocks_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 6340 |
</span> |
|
| 6341 |
-
Cell: megablocks_run | deps: torch, numpy, kernels | 47.
|
| 6342 |
| <button class="run-btn" onclick="runCell('megablocks_run')">▶ run</button>
|
| 6343 |
<button class="copy-btn" onclick="copyCell('megablocks_run')">Copy</button>
|
| 6344 |
<a href="cells/megablocks_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
@@ -6567,10 +6566,10 @@ Input Variation: +0.001 * iteration (deterministic)
|
|
| 6567 |
|
| 6568 |
Warming up (10 iterations)...
|
| 6569 |
Benchmarking (50 iterations)...
|
| 6570 |
-
Progress: 20% complete (avg: 0.
|
| 6571 |
-
Progress: 40% complete (avg: 0.
|
| 6572 |
-
Progress: 60% complete (avg: 0.
|
| 6573 |
-
Progress: 80% complete (avg: 2.
|
| 6574 |
|
| 6575 |
Output tensors:
|
| 6576 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.061104, 0.055115], mean=0.000056, std=0.013535, norm=4.593927
|
|
@@ -6580,19 +6579,19 @@ Output tensors:
|
|
| 6580 |
Iterations: 50
|
| 6581 |
|
| 6582 |
Latency Statistics:
|
| 6583 |
-
Average: 3.
|
| 6584 |
-
Min: 0.
|
| 6585 |
Max: 8.440 ms
|
| 6586 |
-
Std Dev: 3.
|
| 6587 |
|
| 6588 |
Percentiles:
|
| 6589 |
-
P50 (median): 0.
|
| 6590 |
-
P95: 8.
|
| 6591 |
P99: 8.439 ms
|
| 6592 |
|
| 6593 |
Throughput:
|
| 6594 |
-
Tokens/sec:
|
| 6595 |
-
Std Dev:
|
| 6596 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 6597 |
|
| 6598 |
Saved benchmark results to megablocks_results.json
|
|
@@ -6602,25 +6601,25 @@ Output sum: 6.473885
|
|
| 6602 |
<div class="uv-install-logs" id="uv-logs-megablocks_run">
|
| 6603 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6604 |
<div class="uv-logs-content" style="display: none;">
|
| 6605 |
-
Downloading sympy (6.0MiB)
|
| 6606 |
-
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 6607 |
-
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 6608 |
-
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 6609 |
-
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 6610 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 6611 |
-
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 6612 |
-
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 6613 |
-
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 6614 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 6615 |
-
Downloading
|
| 6616 |
-
Downloading networkx (1.9MiB)
|
| 6617 |
Downloading hf-xet (3.0MiB)
|
|
|
|
| 6618 |
Downloading numpy (16.2MiB)
|
| 6619 |
-
Downloading
|
| 6620 |
-
Downloading
|
| 6621 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 6622 |
Downloading torch (846.9MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6623 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6624 |
Downloading nvidia-cufile-cu12
|
| 6625 |
Downloading hf-xet
|
| 6626 |
Downloading setuptools
|
|
@@ -6640,19 +6639,19 @@ Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
| 6640 |
Downloading nvidia-cublas-cu12
|
| 6641 |
Downloading nvidia-cudnn-cu12
|
| 6642 |
Downloading torch
|
| 6643 |
-
Installed 37 packages in
|
| 6644 |
</div>
|
| 6645 |
</div>
|
| 6646 |
<div class="cell-stderr">Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s]
|
| 6647 |
-
Fetching 66 files: 2%|▏ | 1/66 [00:00<00:
|
| 6648 |
-
Fetching 66 files:
|
| 6649 |
-
Fetching 66 files:
|
| 6650 |
-
Fetching 66 files:
|
| 6651 |
-
Fetching 66 files:
|
| 6652 |
-
Fetching 66 files:
|
| 6653 |
-
Fetching 66 files:
|
| 6654 |
-
Fetching 66 files:
|
| 6655 |
-
Fetching 66 files: 100%|██████████| 66/66 [00:
|
| 6656 |
<div class="cell-artifacts">
|
| 6657 |
<h4>Artifacts:</h4>
|
| 6658 |
<a href="artifacts/megablocks_run/megablocks_results.json" class="artifact" target="_blank">megablocks_results.json</a>
|
|
@@ -6915,23 +6914,23 @@ Loaded /repo/moe_benchmarks/megablocks_yamoe/.uvnote/cache/0febdf3420999533bc2e1
|
|
| 6915 |
Performance Summary:
|
| 6916 |
Implementation Avg (ms) P95 (ms) Tokens/sec Relative Speed
|
| 6917 |
--------------------------------------------------------------------------------
|
| 6918 |
-
megablocks_results 3.
|
| 6919 |
-
yamoe_results 4.25 4.
|
| 6920 |
-
binned_results 36.
|
| 6921 |
-
gptoss_results
|
| 6922 |
-
gptoss_training_results 47.
|
| 6923 |
-
|
| 6924 |
-
Fastest: megablocks_results (3.
|
| 6925 |
-
Slowest: gptoss_training_results (47.
|
| 6926 |
-
Max Speedup: 12.
|
| 6927 |
</div>
|
| 6928 |
<div class="uv-install-logs" id="uv-logs-visualization">
|
| 6929 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6930 |
<div class="uv-logs-content" style="display: none;">
|
| 6931 |
-
Downloading matplotlib (8.3MiB)
|
| 6932 |
-
Downloading numpy (16.2MiB)
|
| 6933 |
Downloading pillow (6.3MiB)
|
|
|
|
| 6934 |
Downloading fonttools (4.7MiB)
|
|
|
|
| 6935 |
Downloading kiwisolver (1.4MiB)
|
| 6936 |
Downloading kiwisolver
|
| 6937 |
Downloading pillow
|
|
|
|
| 3720 |
<span onclick="toggleOutput('utils')" style="cursor: pointer;">▼ output</span>
|
| 3721 |
<span id="uv-indicator-utils" onclick="toggleUvLogsFromHeader('utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3722 |
</span> |
|
| 3723 |
+
Cell: utils | deps: torch, numpy | 36.15s
|
| 3724 |
| <button class="run-btn" onclick="runCell('utils')">▶ run</button>
|
| 3725 |
<button class="copy-btn" onclick="copyCell('utils')">Copy</button>
|
| 3726 |
<a href="cells/utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 3794 |
<div class="uv-install-logs" id="uv-logs-utils">
|
| 3795 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 3796 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3797 |
Downloading networkx (1.9MiB)
|
|
|
|
| 3798 |
Downloading setuptools (1.1MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3799 |
Downloading numpy (16.2MiB)
|
| 3800 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 3801 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 3802 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 3803 |
+
Downloading triton (148.3MiB)
|
| 3804 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 3805 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 3806 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 3807 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 3808 |
+
Downloading sympy (6.0MiB)
|
| 3809 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 3810 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 3811 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 3812 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
| 3813 |
Downloading torch (846.9MiB)
|
| 3814 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 3815 |
Downloading nvidia-cufile-cu12
|
| 3816 |
Downloading setuptools
|
| 3817 |
Downloading networkx
|
|
|
|
| 3830 |
Downloading nvidia-cublas-cu12
|
| 3831 |
Downloading nvidia-cudnn-cu12
|
| 3832 |
Downloading torch
|
| 3833 |
+
Installed 26 packages in 452ms
|
| 3834 |
</div>
|
| 3835 |
</div>
|
| 3836 |
</div>
|
|
|
|
| 3843 |
<span onclick="toggleOutput('bench_utils')" style="cursor: pointer;">▼ output</span>
|
| 3844 |
<span id="uv-indicator-bench_utils" onclick="toggleUvLogsFromHeader('bench_utils')" style="cursor: pointer;">▶ uv-logs</span>
|
| 3845 |
</span> |
|
| 3846 |
+
Cell: bench_utils | deps: torch, numpy | 34.88s
|
| 3847 |
| <button class="run-btn" onclick="runCell('bench_utils')">▶ run</button>
|
| 3848 |
<button class="copy-btn" onclick="copyCell('bench_utils')">Copy</button>
|
| 3849 |
<a href="cells/bench_utils.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4331 |
<div class="uv-install-logs" id="uv-logs-bench_utils">
|
| 4332 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4333 |
<div class="uv-logs-content" style="display: none;">
|
| 4334 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
|
|
|
|
|
|
| 4335 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4336 |
Downloading setuptools (1.1MiB)
|
| 4337 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
|
|
|
| 4338 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
|
|
|
|
|
|
| 4339 |
Downloading sympy (6.0MiB)
|
| 4340 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
|
|
|
| 4341 |
Downloading numpy (16.2MiB)
|
| 4342 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4343 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4344 |
+
Downloading torch (846.9MiB)
|
| 4345 |
+
Downloading triton (148.3MiB)
|
| 4346 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4347 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4348 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4349 |
+
Downloading networkx (1.9MiB)
|
| 4350 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4351 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4352 |
Downloading nvidia-cufile-cu12
|
| 4353 |
Downloading setuptools
|
| 4354 |
Downloading networkx
|
|
|
|
| 4361 |
Downloading triton
|
| 4362 |
Downloading nvidia-cufft-cu12
|
| 4363 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 4364 |
Downloading nvidia-cusparse-cu12
|
| 4365 |
+
Downloading nvidia-cusparselt-cu12
|
| 4366 |
Downloading nvidia-nccl-cu12
|
| 4367 |
Downloading nvidia-cublas-cu12
|
| 4368 |
Downloading nvidia-cudnn-cu12
|
| 4369 |
Downloading torch
|
| 4370 |
+
Installed 26 packages in 453ms
|
| 4371 |
</div>
|
| 4372 |
</div>
|
| 4373 |
</div>
|
|
|
|
| 4381 |
<span onclick="toggleOutput('config')" style="cursor: pointer;">▼ output</span>
|
| 4382 |
<span id="uv-indicator-config" onclick="toggleUvLogsFromHeader('config')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4383 |
</span> |
|
| 4384 |
+
Cell: config | deps: torch, numpy | 37.12s
|
| 4385 |
| <button class="run-btn" onclick="runCell('config')">▶ run</button>
|
| 4386 |
<button class="copy-btn" onclick="copyCell('config')">Copy</button>
|
| 4387 |
<a href="cells/config.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4441 |
<div class="uv-install-logs" id="uv-logs-config">
|
| 4442 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4443 |
<div class="uv-logs-content" style="display: none;">
|
| 4444 |
+
Downloading networkx (1.9MiB)
|
|
|
|
| 4445 |
Downloading sympy (6.0MiB)
|
| 4446 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4447 |
+
Downloading numpy (16.2MiB)
|
| 4448 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4449 |
+
Downloading triton (148.3MiB)
|
| 4450 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4451 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4452 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4453 |
+
Downloading setuptools (1.1MiB)
|
| 4454 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 4455 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4456 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4457 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4458 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
|
|
|
| 4459 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4460 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4461 |
+
Downloading torch (846.9MiB)
|
| 4462 |
Downloading nvidia-cufile-cu12
|
| 4463 |
Downloading setuptools
|
| 4464 |
Downloading networkx
|
|
|
|
| 4477 |
Downloading nvidia-cublas-cu12
|
| 4478 |
Downloading nvidia-cudnn-cu12
|
| 4479 |
Downloading torch
|
| 4480 |
+
Installed 26 packages in 453ms
|
| 4481 |
</div>
|
| 4482 |
</div>
|
| 4483 |
</div>
|
|
|
|
| 4490 |
<span onclick="toggleOutput('save_data')" style="cursor: pointer;">▼ output</span>
|
| 4491 |
<span id="uv-indicator-save_data" onclick="toggleUvLogsFromHeader('save_data')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4492 |
</span> |
|
| 4493 |
+
Cell: save_data | deps: torch, numpy | 39.39s
|
| 4494 |
| <button class="run-btn" onclick="runCell('save_data')">▶ run</button>
|
| 4495 |
<button class="copy-btn" onclick="copyCell('save_data')">Copy</button>
|
| 4496 |
<a href="cells/save_data.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4585 |
<div class="uv-install-logs" id="uv-logs-save_data">
|
| 4586 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4587 |
<div class="uv-logs-content" style="display: none;">
|
| 4588 |
+
Downloading networkx (1.9MiB)
|
| 4589 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4590 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 4591 |
Downloading numpy (16.2MiB)
|
| 4592 |
+
Downloading sympy (6.0MiB)
|
| 4593 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4594 |
Downloading setuptools (1.1MiB)
|
| 4595 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4596 |
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4597 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
|
|
|
|
|
|
|
|
|
| 4598 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4599 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4600 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4601 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4602 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4603 |
+
Downloading torch (846.9MiB)
|
| 4604 |
Downloading triton (148.3MiB)
|
| 4605 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4606 |
Downloading nvidia-cufile-cu12
|
| 4607 |
Downloading setuptools
|
| 4608 |
Downloading networkx
|
|
|
|
| 4615 |
Downloading triton
|
| 4616 |
Downloading nvidia-cufft-cu12
|
| 4617 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 4618 |
Downloading nvidia-cusparselt-cu12
|
| 4619 |
+
Downloading nvidia-cusparse-cu12
|
| 4620 |
Downloading nvidia-nccl-cu12
|
| 4621 |
Downloading nvidia-cublas-cu12
|
| 4622 |
Downloading nvidia-cudnn-cu12
|
| 4623 |
Downloading torch
|
| 4624 |
+
Installed 26 packages in 464ms
|
| 4625 |
</div>
|
| 4626 |
</div>
|
| 4627 |
<div class="cell-artifacts">
|
| 4628 |
<h4>Artifacts:</h4>
|
| 4629 |
+
<a href="artifacts/save_data/router_bias.pt" class="artifact" target="_blank">router_bias.pt</a>
|
|
|
|
| 4630 |
<a href="artifacts/save_data/down_proj_bias.pt" class="artifact" target="_blank">down_proj_bias.pt</a>
|
|
|
|
| 4631 |
<a href="artifacts/save_data/gate_up_proj_bias.pt" class="artifact" target="_blank">gate_up_proj_bias.pt</a>
|
| 4632 |
+
<a href="artifacts/save_data/router_weight.pt" class="artifact" target="_blank">router_weight.pt</a>
|
| 4633 |
+
<a href="artifacts/save_data/down_proj.pt" class="artifact" target="_blank">down_proj.pt</a>
|
| 4634 |
+
<a href="artifacts/save_data/gate_up_proj.pt" class="artifact" target="_blank">gate_up_proj.pt</a>
|
| 4635 |
</div>
|
| 4636 |
</div>
|
| 4637 |
</div>
|
|
|
|
| 4645 |
<span onclick="toggleOutput('yamoe_run')" style="cursor: pointer;">▼ output</span>
|
| 4646 |
<span id="uv-indicator-yamoe_run" onclick="toggleUvLogsFromHeader('yamoe_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 4647 |
</span> |
|
| 4648 |
+
Cell: yamoe_run | deps: torch, kernels, numpy | 38.20s
|
| 4649 |
| <button class="run-btn" onclick="runCell('yamoe_run')">▶ run</button>
|
| 4650 |
<button class="copy-btn" onclick="copyCell('yamoe_run')">Copy</button>
|
| 4651 |
<a href="cells/yamoe_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 4938 |
|
| 4939 |
Warming up (10 iterations)...
|
| 4940 |
Benchmarking (50 iterations)...
|
| 4941 |
+
Progress: 20% complete (avg: 4.253 ms)
|
| 4942 |
+
Progress: 40% complete (avg: 4.249 ms)
|
| 4943 |
+
Progress: 60% complete (avg: 4.249 ms)
|
| 4944 |
+
Progress: 80% complete (avg: 4.249 ms)
|
| 4945 |
|
| 4946 |
Output tensors:
|
| 4947 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
|
|
|
|
| 4952 |
|
| 4953 |
Latency Statistics:
|
| 4954 |
Average: 4.249 ms
|
| 4955 |
+
Min: 4.135 ms
|
| 4956 |
+
Max: 4.296 ms
|
| 4957 |
+
Std Dev: 0.023 ms
|
| 4958 |
|
| 4959 |
Percentiles:
|
| 4960 |
+
P50 (median): 4.252 ms
|
| 4961 |
+
P95: 4.274 ms
|
| 4962 |
+
P99: 4.289 ms
|
| 4963 |
|
| 4964 |
Throughput:
|
| 4965 |
+
Tokens/sec: 23533.4
|
| 4966 |
+
Std Dev: 128.2
|
| 4967 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 4968 |
|
| 4969 |
Saved benchmark results to yamoe_results.json
|
|
|
|
| 4973 |
<div class="uv-install-logs" id="uv-logs-yamoe_run">
|
| 4974 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 4975 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4976 |
Downloading sympy (6.0MiB)
|
| 4977 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 4978 |
+
Downloading hf-xet (3.0MiB)
|
| 4979 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 4980 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 4981 |
+
Downloading numpy (16.2MiB)
|
| 4982 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 4983 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 4984 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 4985 |
Downloading triton (148.3MiB)
|
| 4986 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 4987 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
|
|
|
|
|
|
| 4988 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
|
|
|
|
|
|
| 4989 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 4990 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 4991 |
Downloading torch (846.9MiB)
|
| 4992 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 4993 |
+
Downloading setuptools (1.1MiB)
|
| 4994 |
+
Downloading networkx (1.9MiB)
|
| 4995 |
Downloading nvidia-cufile-cu12
|
| 4996 |
Downloading hf-xet
|
| 4997 |
Downloading setuptools
|
|
|
|
| 5005 |
Downloading triton
|
| 5006 |
Downloading nvidia-cufft-cu12
|
| 5007 |
Downloading nvidia-cusolver-cu12
|
|
|
|
| 5008 |
Downloading nvidia-cusparse-cu12
|
| 5009 |
+
Downloading nvidia-cusparselt-cu12
|
| 5010 |
Downloading nvidia-nccl-cu12
|
| 5011 |
Downloading nvidia-cublas-cu12
|
| 5012 |
Downloading nvidia-cudnn-cu12
|
| 5013 |
Downloading torch
|
| 5014 |
+
Installed 37 packages in 458ms
|
| 5015 |
</div>
|
| 5016 |
</div>
|
| 5017 |
<div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s]
|
| 5018 |
+
Fetching 6 files: 17%|█▋ | 1/6 [00:00<00:01, 4.00it/s]
|
| 5019 |
+
Fetching 6 files: 50%|█████ | 3/6 [00:00<00:00, 4.44it/s]
|
| 5020 |
+
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 8.79it/s]</div>
|
|
|
|
| 5021 |
<div class="cell-artifacts">
|
| 5022 |
<h4>Artifacts:</h4>
|
| 5023 |
<a href="artifacts/yamoe_run/yamoe_results.json" class="artifact" target="_blank">yamoe_results.json</a>
|
|
|
|
| 5034 |
<span onclick="toggleOutput('binned_run')" style="cursor: pointer;">▼ output</span>
|
| 5035 |
<span id="uv-indicator-binned_run" onclick="toggleUvLogsFromHeader('binned_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5036 |
</span> |
|
| 5037 |
+
Cell: binned_run | deps: torch, numpy | 39.24s
|
| 5038 |
| <button class="run-btn" onclick="runCell('binned_run')">▶ run</button>
|
| 5039 |
<button class="copy-btn" onclick="copyCell('binned_run')">Copy</button>
|
| 5040 |
<a href="cells/binned_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 5448 |
|
| 5449 |
Warming up (10 iterations)...
|
| 5450 |
Benchmarking (50 iterations)...
|
| 5451 |
+
Progress: 20% complete (avg: 37.466 ms)
|
| 5452 |
+
Progress: 40% complete (avg: 37.465 ms)
|
| 5453 |
+
Progress: 60% complete (avg: 37.162 ms)
|
| 5454 |
+
Progress: 80% complete (avg: 36.629 ms)
|
| 5455 |
|
| 5456 |
Output tensors:
|
| 5457 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.049506, 0.054984], mean=0.000034, std=0.006508, norm=2.208791
|
|
|
|
| 5461 |
Iterations: 50
|
| 5462 |
|
| 5463 |
Latency Statistics:
|
| 5464 |
+
Average: 36.141 ms
|
| 5465 |
+
Min: 33.218 ms
|
| 5466 |
+
Max: 38.347 ms
|
| 5467 |
+
Std Dev: 1.387 ms
|
| 5468 |
|
| 5469 |
Percentiles:
|
| 5470 |
+
P50 (median): 36.579 ms
|
| 5471 |
+
P95: 37.791 ms
|
| 5472 |
+
P99: 38.303 ms
|
| 5473 |
|
| 5474 |
Throughput:
|
| 5475 |
+
Tokens/sec: 2767.0
|
| 5476 |
+
Std Dev: 108.1
|
| 5477 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 5478 |
|
| 5479 |
Saved benchmark results to binned_results.json
|
|
|
|
| 5483 |
<div class="uv-install-logs" id="uv-logs-binned_run">
|
| 5484 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5485 |
<div class="uv-logs-content" style="display: none;">
|
| 5486 |
+
Downloading setuptools (1.1MiB)
|
| 5487 |
+
Downloading numpy (16.2MiB)
|
| 5488 |
Downloading networkx (1.9MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5489 |
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 5490 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 5491 |
+
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 5492 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 5493 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 5494 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 5495 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 5496 |
Downloading sympy (6.0MiB)
|
| 5497 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 5498 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 5499 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 5500 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 5501 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5502 |
Downloading torch (846.9MiB)
|
| 5503 |
+
Downloading triton (148.3MiB)
|
| 5504 |
Downloading nvidia-cufile-cu12
|
| 5505 |
Downloading setuptools
|
| 5506 |
Downloading networkx
|
|
|
|
| 5519 |
Downloading nvidia-cublas-cu12
|
| 5520 |
Downloading nvidia-cudnn-cu12
|
| 5521 |
Downloading torch
|
| 5522 |
+
Installed 26 packages in 444ms
|
| 5523 |
</div>
|
| 5524 |
</div>
|
| 5525 |
<div class="cell-artifacts">
|
|
|
|
| 5538 |
<span onclick="toggleOutput('gptoss_run')" style="cursor: pointer;">▼ output</span>
|
| 5539 |
<span id="uv-indicator-gptoss_run" onclick="toggleUvLogsFromHeader('gptoss_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5540 |
</span> |
|
| 5541 |
+
Cell: gptoss_run | deps: torch, numpy | 43.23s
|
| 5542 |
| <button class="run-btn" onclick="runCell('gptoss_run')">▶ run</button>
|
| 5543 |
<button class="copy-btn" onclick="copyCell('gptoss_run')">Copy</button>
|
| 5544 |
<a href="cells/gptoss_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 5856 |
|
| 5857 |
Warming up (10 iterations)...
|
| 5858 |
Benchmarking (50 iterations)...
|
| 5859 |
+
Progress: 20% complete (avg: 50.062 ms)
|
| 5860 |
+
Progress: 40% complete (avg: 49.677 ms)
|
| 5861 |
+
Progress: 60% complete (avg: 48.802 ms)
|
| 5862 |
+
Progress: 80% complete (avg: 47.718 ms)
|
| 5863 |
|
| 5864 |
Output tensors:
|
| 5865 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
|
|
| 5869 |
Iterations: 50
|
| 5870 |
|
| 5871 |
Latency Statistics:
|
| 5872 |
+
Average: 46.669 ms
|
| 5873 |
+
Min: 40.355 ms
|
| 5874 |
+
Max: 50.564 ms
|
| 5875 |
+
Std Dev: 2.944 ms
|
| 5876 |
|
| 5877 |
Percentiles:
|
| 5878 |
+
P50 (median): 47.070 ms
|
| 5879 |
+
P95: 50.338 ms
|
| 5880 |
+
P99: 50.543 ms
|
| 5881 |
|
| 5882 |
Throughput:
|
| 5883 |
+
Tokens/sec: 2142.7
|
| 5884 |
+
Std Dev: 139.3
|
| 5885 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 5886 |
|
| 5887 |
Saved benchmark results to gptoss_results.json
|
|
|
|
| 5891 |
<div class="uv-install-logs" id="uv-logs-gptoss_run">
|
| 5892 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 5893 |
<div class="uv-logs-content" style="display: none;">
|
| 5894 |
+
Downloading networkx (1.9MiB)
|
| 5895 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 5896 |
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5897 |
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 5898 |
+
Downloading triton (148.3MiB)
|
| 5899 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 5900 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 5901 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 5902 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
|
|
|
| 5903 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 5904 |
+
Downloading sympy (6.0MiB)
|
| 5905 |
+
Downloading setuptools (1.1MiB)
|
| 5906 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
|
|
|
| 5907 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 5908 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 5909 |
+
Downloading numpy (16.2MiB)
|
| 5910 |
Downloading torch (846.9MiB)
|
| 5911 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 5912 |
Downloading nvidia-cufile-cu12
|
| 5913 |
Downloading setuptools
|
| 5914 |
Downloading networkx
|
|
|
|
| 5927 |
Downloading nvidia-cublas-cu12
|
| 5928 |
Downloading nvidia-cudnn-cu12
|
| 5929 |
Downloading torch
|
| 5930 |
+
Installed 26 packages in 455ms
|
| 5931 |
</div>
|
| 5932 |
</div>
|
| 5933 |
<div class="cell-artifacts">
|
|
|
|
| 5946 |
<span onclick="toggleOutput('gptoss_training_run')" style="cursor: pointer;">▼ output</span>
|
| 5947 |
<span id="uv-indicator-gptoss_training_run" onclick="toggleUvLogsFromHeader('gptoss_training_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 5948 |
</span> |
|
| 5949 |
+
Cell: gptoss_training_run | deps: torch, numpy | 40.10s
|
| 5950 |
| <button class="run-btn" onclick="runCell('gptoss_training_run')">▶ run</button>
|
| 5951 |
<button class="copy-btn" onclick="copyCell('gptoss_training_run')">Copy</button>
|
| 5952 |
<a href="cells/gptoss_training_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 6247 |
|
| 6248 |
Warming up (10 iterations)...
|
| 6249 |
Benchmarking (50 iterations)...
|
| 6250 |
+
Progress: 20% complete (avg: 50.696 ms)
|
| 6251 |
+
Progress: 40% complete (avg: 50.262 ms)
|
| 6252 |
+
Progress: 60% complete (avg: 49.357 ms)
|
| 6253 |
+
Progress: 80% complete (avg: 48.257 ms)
|
| 6254 |
|
| 6255 |
Output tensors:
|
| 6256 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.064982, 0.061193], mean=0.000100, std=0.013510, norm=4.585560
|
|
|
|
| 6260 |
Iterations: 50
|
| 6261 |
|
| 6262 |
Latency Statistics:
|
| 6263 |
+
Average: 47.251 ms
|
| 6264 |
+
Min: 40.696 ms
|
| 6265 |
+
Max: 51.111 ms
|
| 6266 |
+
Std Dev: 2.979 ms
|
| 6267 |
|
| 6268 |
Percentiles:
|
| 6269 |
+
P50 (median): 47.641 ms
|
| 6270 |
+
P95: 50.990 ms
|
| 6271 |
+
P99: 51.076 ms
|
| 6272 |
|
| 6273 |
Throughput:
|
| 6274 |
+
Tokens/sec: 2116.4
|
| 6275 |
+
Std Dev: 138.0
|
| 6276 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 6277 |
|
| 6278 |
Saved benchmark results to gptoss_training_results.json
|
|
|
|
| 6282 |
<div class="uv-install-logs" id="uv-logs-gptoss_training_run">
|
| 6283 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6284 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6285 |
Downloading setuptools (1.1MiB)
|
| 6286 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
|
|
|
|
|
|
| 6287 |
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 6288 |
+
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 6289 |
+
Downloading numpy (16.2MiB)
|
| 6290 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 6291 |
+
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 6292 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 6293 |
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 6294 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 6295 |
+
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
| 6296 |
+
Downloading networkx (1.9MiB)
|
| 6297 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 6298 |
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 6299 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
|
|
|
| 6300 |
Downloading triton (148.3MiB)
|
| 6301 |
+
Downloading sympy (6.0MiB)
|
| 6302 |
+
Downloading torch (846.9MiB)
|
| 6303 |
Downloading nvidia-cufile-cu12
|
| 6304 |
Downloading setuptools
|
| 6305 |
Downloading networkx
|
| 6306 |
Downloading nvidia-cuda-cupti-cu12
|
|
|
|
| 6307 |
Downloading numpy
|
| 6308 |
+
Downloading sympy
|
| 6309 |
Downloading nvidia-nvjitlink-cu12
|
| 6310 |
Downloading nvidia-curand-cu12
|
| 6311 |
Downloading nvidia-cuda-nvrtc-cu12
|
|
|
|
| 6318 |
Downloading nvidia-cublas-cu12
|
| 6319 |
Downloading nvidia-cudnn-cu12
|
| 6320 |
Downloading torch
|
| 6321 |
+
Installed 26 packages in 444ms
|
| 6322 |
</div>
|
| 6323 |
</div>
|
| 6324 |
<div class="cell-artifacts">
|
|
|
|
| 6337 |
<span onclick="toggleOutput('megablocks_run')" style="cursor: pointer;">▼ output</span>
|
| 6338 |
<span id="uv-indicator-megablocks_run" onclick="toggleUvLogsFromHeader('megablocks_run')" style="cursor: pointer;">▶ uv-logs</span>
|
| 6339 |
</span> |
|
| 6340 |
+
Cell: megablocks_run | deps: torch, numpy, kernels | 47.19s
|
| 6341 |
| <button class="run-btn" onclick="runCell('megablocks_run')">▶ run</button>
|
| 6342 |
<button class="copy-btn" onclick="copyCell('megablocks_run')">Copy</button>
|
| 6343 |
<a href="cells/megablocks_run.py" target="_blank" class="raw-btn">Raw</a>
|
|
|
|
| 6566 |
|
| 6567 |
Warming up (10 iterations)...
|
| 6568 |
Benchmarking (50 iterations)...
|
| 6569 |
+
Progress: 20% complete (avg: 0.854 ms)
|
| 6570 |
+
Progress: 40% complete (avg: 0.841 ms)
|
| 6571 |
+
Progress: 60% complete (avg: 0.843 ms)
|
| 6572 |
+
Progress: 80% complete (avg: 2.673 ms)
|
| 6573 |
|
| 6574 |
Output tensors:
|
| 6575 |
Primary: shape=(1, 100, 1152), dtype=torch.float32, device=cuda:0, range=[-0.061104, 0.055115], mean=0.000056, std=0.013535, norm=4.593927
|
|
|
|
| 6579 |
Iterations: 50
|
| 6580 |
|
| 6581 |
Latency Statistics:
|
| 6582 |
+
Average: 3.825 ms
|
| 6583 |
+
Min: 0.805 ms
|
| 6584 |
Max: 8.440 ms
|
| 6585 |
+
Std Dev: 3.666 ms
|
| 6586 |
|
| 6587 |
Percentiles:
|
| 6588 |
+
P50 (median): 0.853 ms
|
| 6589 |
+
P95: 8.438 ms
|
| 6590 |
P99: 8.439 ms
|
| 6591 |
|
| 6592 |
Throughput:
|
| 6593 |
+
Tokens/sec: 26146.6
|
| 6594 |
+
Std Dev: 52691.2
|
| 6595 |
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 6596 |
|
| 6597 |
Saved benchmark results to megablocks_results.json
|
|
|
|
| 6601 |
<div class="uv-install-logs" id="uv-logs-megablocks_run">
|
| 6602 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6603 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6604 |
Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
|
|
|
|
|
|
|
|
|
|
| 6605 |
Downloading nvidia-cusparse-cu12 (274.9MiB)
|
| 6606 |
+
Downloading setuptools (1.1MiB)
|
|
|
|
| 6607 |
Downloading hf-xet (3.0MiB)
|
| 6608 |
+
Downloading nvidia-cufile-cu12 (1.1MiB)
|
| 6609 |
Downloading numpy (16.2MiB)
|
| 6610 |
+
Downloading networkx (1.9MiB)
|
| 6611 |
+
Downloading nvidia-nvjitlink-cu12 (37.4MiB)
|
| 6612 |
Downloading nvidia-cusparselt-cu12 (273.9MiB)
|
| 6613 |
Downloading torch (846.9MiB)
|
| 6614 |
+
Downloading nvidia-cudnn-cu12 (674.0MiB)
|
| 6615 |
+
Downloading sympy (6.0MiB)
|
| 6616 |
+
Downloading nvidia-cublas-cu12 (566.8MiB)
|
| 6617 |
+
Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
|
| 6618 |
Downloading nvidia-cusolver-cu12 (255.1MiB)
|
| 6619 |
+
Downloading nvidia-cufft-cu12 (184.2MiB)
|
| 6620 |
+
Downloading nvidia-nccl-cu12 (307.4MiB)
|
| 6621 |
+
Downloading nvidia-curand-cu12 (60.7MiB)
|
| 6622 |
+
Downloading triton (148.3MiB)
|
| 6623 |
Downloading nvidia-cufile-cu12
|
| 6624 |
Downloading hf-xet
|
| 6625 |
Downloading setuptools
|
|
|
|
| 6639 |
Downloading nvidia-cublas-cu12
|
| 6640 |
Downloading nvidia-cudnn-cu12
|
| 6641 |
Downloading torch
|
| 6642 |
+
Installed 37 packages in 459ms
|
| 6643 |
</div>
|
| 6644 |
</div>
|
| 6645 |
<div class="cell-stderr">Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s]
|
| 6646 |
+
Fetching 66 files: 2%|▏ | 1/66 [00:00<00:15, 4.22it/s]
|
| 6647 |
+
Fetching 66 files: 5%|▍ | 3/66 [00:00<00:07, 8.32it/s]
|
| 6648 |
+
Fetching 66 files: 21%|██ | 14/66 [00:00<00:01, 35.22it/s]
|
| 6649 |
+
Fetching 66 files: 29%|██▉ | 19/66 [00:01<00:02, 18.12it/s]
|
| 6650 |
+
Fetching 66 files: 55%|█████▍ | 36/66 [00:01<00:01, 29.43it/s]
|
| 6651 |
+
Fetching 66 files: 70%|██████▉ | 46/66 [00:01<00:00, 38.98it/s]
|
| 6652 |
+
Fetching 66 files: 82%|████████▏ | 54/66 [00:01<00:00, 42.11it/s]
|
| 6653 |
+
Fetching 66 files: 95%|█████████▌| 63/66 [00:01<00:00, 48.66it/s]
|
| 6654 |
+
Fetching 66 files: 100%|██████████| 66/66 [00:01<00:00, 33.84it/s]</div>
|
| 6655 |
<div class="cell-artifacts">
|
| 6656 |
<h4>Artifacts:</h4>
|
| 6657 |
<a href="artifacts/megablocks_run/megablocks_results.json" class="artifact" target="_blank">megablocks_results.json</a>
|
|
|
|
| 6914 |
Performance Summary:
|
| 6915 |
Implementation Avg (ms) P95 (ms) Tokens/sec Relative Speed
|
| 6916 |
--------------------------------------------------------------------------------
|
| 6917 |
+
megablocks_results 3.82 8.44 26147 1.00x
|
| 6918 |
+
yamoe_results 4.25 4.27 23533 0.90x
|
| 6919 |
+
binned_results 36.14 37.79 2767 0.11x
|
| 6920 |
+
gptoss_results 46.67 50.34 2143 0.08x
|
| 6921 |
+
gptoss_training_results 47.25 50.99 2116 0.08x
|
| 6922 |
+
|
| 6923 |
+
Fastest: megablocks_results (3.82ms avg)
|
| 6924 |
+
Slowest: gptoss_training_results (47.25ms avg)
|
| 6925 |
+
Max Speedup: 12.4x
|
| 6926 |
</div>
|
| 6927 |
<div class="uv-install-logs" id="uv-logs-visualization">
|
| 6928 |
<div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
|
| 6929 |
<div class="uv-logs-content" style="display: none;">
|
|
|
|
|
|
|
| 6930 |
Downloading pillow (6.3MiB)
|
| 6931 |
+
Downloading matplotlib (8.3MiB)
|
| 6932 |
Downloading fonttools (4.7MiB)
|
| 6933 |
+
Downloading numpy (16.2MiB)
|
| 6934 |
Downloading kiwisolver (1.4MiB)
|
| 6935 |
Downloading kiwisolver
|
| 6936 |
Downloading pillow
|