Cell: setup | 304.89s
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "accelerate>=1.10.1",
#     "torch>=2.7.0",
#     "kernels==0.10.0",
#     "transformers@https://github.com/huggingface/transformers.git",
#     "ipdb>=0.13.13",
#     "matplotlib>=3.7.2",
#     "numpy>=1.24.3",
# ]
# ///

import torch
from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
import time
import torch.nn as nn
from kernels import register_kernel_mapping, Mode, LayerRepository
import sys
import torch.profiler
import gc

def reset_peak_memory_stats():
    """Clear the CUDA cache and reset memory allocation counters."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
    gc.collect()

def get_memory_stats():
    """Get current and peak CUDA memory usage."""
    if not torch.cuda.is_available():
        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
    return {
        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
    }

def override_kernel_layer_name(cls_name: str, value) -> bool:
    """Helper to dynamically override the kernel_layer_name in a model class."""
    for mod in sys.modules.values():
        if mod is None:
            continue
        obj = getattr(mod, cls_name, None)
        if isinstance(obj, type) and issubclass(obj, nn.Module):
            setattr(obj, "kernel_layer_name", value)
            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
            return True
    return False

def run_generation(model, inputs, max_tokens=64):
    """Run a single generation pass and measure its duration."""
    with torch.inference_mode():
        start_time = time.perf_counter()
        generated = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False,
            temperature=None,
        )
        end_time = time.perf_counter()
    return generated, end_time - start_time


# Init the model the normal way
model_id = "openai/gpt-oss-20b"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
quantization_config = Mxfp4Config(dequantize=True)

# Now we want to add some custom kernel mapping
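# The mapping shape follows the `kernels` API: kernel-layer name -> device
# type -> Mode -> LayerRepository, pinned to a Hub repo and revision. Any
# nn.Module whose `kernel_layer_name` attribute is "Yamoe" then has its
# inference forward served from drbh/yamoe at v0.3.0.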
custom_mapping = dict(
    Yamoe=dict(
        cuda={
            Mode.INFERENCE: LayerRepository(
                repo_id="drbh/yamoe",
                layer_name="Yamoe",
                revision="v0.3.0",
            ),
        },
    )
)
# First add the mapping
register_kernel_mapping(custom_mapping)
# Then override the layer name in the model class
override_kernel_layer_name("GptOssMLP", "Yamoe")

# TODO: remove this line once RMSNorm is working
override_kernel_layer_name("GptOssRMSNorm", None)
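# Setting the layer name to None means no mapping matches, so `kernels` emits
# a UserWarning ("No kernel mapping found for layer `None`") and falls back to
# the original RMSNorm forward, as seen in the logs below.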

## Normal model stuff

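# Note: with use_kernels=True and full-precision kernels, the loader
# dequantizes the MXFP4 checkpoint to bf16 at load time, matching
# Mxfp4Config(dequantize=True) above (it logs a warning to this effect below).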
model = GptOssForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    device_map="auto",
    use_kernels=True,
    quantization_config=quantization_config,
).eval()


messages = [
    {"role": "system", "content": "What is Tensor Parallelism?"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="low",
).to("cuda")


print("\n=== Running Benchmarks ===")
print(f"Model: {model_id}")
print(f"Device: {torch.cuda.get_device_name()}")
print(f"Initial memory: {get_memory_stats()}\n")

# Warmup
print("Running warmup...")
for _ in range(2):
    _ = run_generation(model, inputs, max_tokens=16)

reset_peak_memory_stats()

# Benchmark runs
num_runs = 5
max_tokens = 64
times = []

print(f"\nRunning {num_runs} benchmark iterations with {max_tokens} tokens...")
for i in range(num_runs):
    reset_peak_memory_stats()
    generated, elapsed = run_generation(model, inputs, max_tokens)
    times.append(elapsed)
    mem_stats = get_memory_stats()
    tokens_per_sec = max_tokens / elapsed
    print(f"Run {i+1}: {elapsed:.3f}s ({tokens_per_sec:.1f} tok/s) | Peak: {mem_stats['peak_gb']:.2f}GB")

# Statistics
avg_time = sum(times) / len(times)
min_time = min(times)
max_time = max(times)
avg_tokens_per_sec = max_tokens / avg_time

print(f"\n=== Benchmark Results ===")
print(f"Average: {avg_time:.3f}s ({avg_tokens_per_sec:.1f} tok/s)")
print(f"Min: {min_time:.3f}s | Max: {max_time:.3f}s")

# Final memory stats
final_mem = get_memory_stats()
print(f"\nFinal Memory:")
print(f"  Allocated: {final_mem['allocated_gb']:.2f}GB")
print(f"  Peak: {final_mem['peak_gb']:.2f}GB")
print(f"  Reserved: {final_mem['reserved_gb']:.2f}GB")


print("\n=== Running with Profiler ===")
reset_peak_memory_stats()

with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
) as prof:
    generated, elapsed = run_generation(model, inputs, max_tokens=64)

print(f"Generation time: {elapsed:.2f} seconds")

# Print profiler results
print("\n=== Top 10 CUDA operations by time ===")
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

print("\n=== Top 10 operations by memory ===")
print(prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=10))

# Memory stats
mem_stats = get_memory_stats()
print(f"\nPeak Memory: {mem_stats['peak_gb']:.2f}GB")

# Save trace if needed
prof.export_chrome_trace("trace.json")
print("\nProfile trace saved to trace.json")


# Decode and print output
print("\nGenerated text:")
print(tokenizer.decode(generated[0][inputs["input_ids"].shape[-1] :]))


# save times and memory stats for charting
with open("benchmark_times.txt", "w") as f:
    for t in times:
        f.write(f"{t}\n")
with open("benchmark_memory.txt", "w") as f:
    f.write(f"{final_mem['allocated_gb']},{final_mem['peak_gb']},{final_mem['reserved_gb']}\n")

# save avg_tokens_per_sec for charting
with open("benchmark_avg_tokens_per_sec.txt", "w") as f:
    f.write(f"{avg_tokens_per_sec}\n")
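# These files are consumed by the downstream `charts` cell, which locates this
# cell's output directory via the UVNOTE_INPUT_SETUP environment variable.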
Output:

Overrode GptOssMLP.kernel_layer_name to Yamoe
Overrode GptOssRMSNorm.kernel_layer_name to None

=== Running Benchmarks ===
Model: openai/gpt-oss-20b
Device: NVIDIA L4
Initial memory: {'allocated_gb': 9.390148608, 'peak_gb': 15.5643264, 'reserved_gb': 17.177772032}

Running warmup...

Running 5 benchmark iterations with 64 tokens...
Run 1: 12.075s (5.3 tok/s) | Peak: 9.41GB
Run 2: 12.071s (5.3 tok/s) | Peak: 9.41GB
Run 3: 12.070s (5.3 tok/s) | Peak: 9.41GB
Run 4: 12.071s (5.3 tok/s) | Peak: 9.41GB
Run 5: 12.071s (5.3 tok/s) | Peak: 9.41GB

=== Benchmark Results ===
Average: 12.072s (5.3 tok/s)
Min: 12.070s | Max: 12.075s

Final Memory:
  Allocated: 9.40GB
  Peak: 9.41GB
  Reserved: 10.33GB

=== Running with Profiler ===
Generation time: 12.73 seconds

=== Top 10 CUDA operations by time ===
Name | Self CPU % | Self CPU | CPU total % | CPU total | CPU time avg | Self CUDA | Self CUDA % | CUDA total | CUDA time avg | CPU Mem | Self CPU Mem | CUDA Mem | Self CUDA Mem | # of Calls
_yamoe_74a2acb_dirty::experts | 1.40% | 148.156ms | 66.87% | 7.074s | 4.606ms | 52.388ms | 0.46% | 10.583s | 6.890ms | 0 B | -2.98 KB | 18.88 MB | -2.11 GB | 1536
aten::bmm | 1.25% | 132.560ms | 1.75% | 185.015ms | 29.803us | 10.486s | 91.79% | 10.486s | 1.689ms | 0 B | 0 B | 63.12 MB | 63.12 MB | 6208
void cutlass::Kernel2<cutlass_80_wmma_tensorop_bf16_... | 0.00% | 0.000us | 0.00% | 0.000us | 0.000us | 10.319s | 90.32% | 10.319s | 3.412ms | 0 B | 0 B | 0 B | 0 B | 3024
aten::linear | 0.54% | 57.566ms | 3.78% | 399.802ms | 51.627us | 0.000us | 0.00% | 645.165ms | 83.312us | 0 B | 0 B | 76.88 MB | 0 B | 7744
aten::addmm | 1.81% | 191.354ms | 2.57% | 272.095ms | 35.429us | 352.039ms | 3.08% | 352.151ms | 45.853us | 0 B | 0 B | 52.31 MB | 52.31 MB | 7680
std::enable_if<!(false), void>::type internal::gemvx... | 0.00% | 0.000us | 0.00% | 0.000us | 0.000us | 344.917ms | 3.02% | 344.917ms | 74.982us | 0 B | 0 B | 0 B | 0 B | 4600
aten::matmul | 0.31% | 32.441ms | 1.72% | 181.712ms | 56.785us | 0.000us | 0.00% | 303.821ms | 94.944us | 0 B | 0 B | 87.68 MB | 0 B | 3200
std::enable_if<!(false), void>::type internal::gemvx... | 0.00% | 0.000us | 0.00% | 0.000us | 0.000us | 293.850ms | 2.57% | 293.850ms | 97.173us | 0 B | 0 B | 0 B | 0 B | 3024
aten::mm | 0.01% | 1.506ms | 0.02% | 2.161ms | 33.768us | 293.014ms | 2.56% | 293.014ms | 4.578ms | 0 B | 0 B | 24.56 MB | 24.56 MB | 64
ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn | 0.00% | 0.000us | 0.00% | 0.000us | 0.000us | 102.278ms | 0.90% | 102.278ms | 4.262ms | 0 B | 0 B | 0 B | 0 B | 24
Self CPU time total: 10.579s
Self CUDA time total: 11.424s

=== Top 10 operations by memory ===
Name | Self CPU % | Self CPU | CPU total % | CPU total | CPU time avg | Self CUDA | Self CUDA % | CUDA total | CUDA time avg | CPU Mem | Self CPU Mem | CUDA Mem | Self CUDA Mem | # of Calls
aten::empty | 0.68% | 72.026ms | 0.68% | 72.026ms | 4.244us | 0.000us | 0.00% | 0.000us | 0.000us | 296 B | 296 B | 3.49 GB | 3.49 GB | 16973
aten::clamp | 0.46% | 48.185ms | 0.69% | 72.630ms | 15.762us | 10.269ms | 0.09% | 10.269ms | 2.229us | 0 B | 0 B | 616.69 MB | 616.69 MB | 4608
aten::mul | 1.76% | 186.048ms | 2.93% | 310.383ms | 14.181us | 47.780ms | 0.42% | 47.792ms | 2.184us | 784 B | 784 B | 554.93 MB | 554.93 MB | 21888
aten::cat | 0.78% | 82.030ms | 1.22% | 129.113ms | 16.536us | 17.028ms | 0.15% | 17.030ms | 2.181us | 0 B | 0 B | 387.88 MB | 387.88 MB | 7808
aten::sigmoid | 0.09% | 9.855ms | 0.16% | 16.652ms | 10.841us | 2.889ms | 0.03% | 2.889ms | 1.881us | 0 B | 0 B | 307.97 MB | 307.97 MB | 1536
aten::empty_strided | 1.08% | 114.498ms | 1.10% | 116.720ms | 5.564us | 0.000us | 0.00% | 0.000us | 0.000us | 0 B | 0 B | 216.60 MB | 216.60 MB | 20979
aten::add | 0.93% | 97.861ms | 1.56% | 164.673ms | 15.047us | 16.394ms | 0.14% | 16.395ms | 1.498us | 0 B | 0 B | 91.03 MB | 91.03 MB | 10944
aten::pow | 0.36% | 38.271ms | 0.55% | 58.020ms | 18.501us | 4.117ms | 0.04% | 4.117ms | 1.313us | 0 B | 0 B | 75.58 MB | 75.58 MB | 3136
aten::bmm | 1.25% | 132.560ms | 1.75% | 185.015ms | 29.803us | 10.486s | 91.79% | 10.486s | 1.689ms | 0 B | 0 B | 63.12 MB | 63.12 MB | 6208
aten::sub | 0.51% | 53.869ms | 0.82% | 87.218ms | 13.626us | 9.277ms | 0.08% | 9.355ms | 1.461us | 0 B | 0 B | 53.04 MB | 53.01 MB | 6401
Self CPU time total: 10.579s
Self CUDA time total: 11.424s

Peak Memory: 9.41GB

Profile trace saved to trace.json

Generated text:
<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices, so each device holds a slice of the matrix. During forward/backward passes, each device computes partial results and then they are

UV Install Logs:

Fetching 3 files: 100%|██████████| 3/3 [00:16<00:00, 5.54s/it]
You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00, 2.68s/it]
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 12.15it/s]
/tmp/uvnote-run-hjgpkuq6/home/.cache/uv/environments-v2/setup-30bb029f3f83f37d/lib/python3.12/site-packages/kernels/layer.py:868: UserWarning:
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
  warnings.warn(
Cell: charts | deps: matplotlib, numpy | 3.51s
import matplotlib.pyplot as plt
import numpy as np
import os

# Get the path to the setup cell's outputs from the UVNOTE_INPUT_SETUP env var
setup_path = os.getenv("UVNOTE_INPUT_SETUP", ".")
print(f"Reading benchmark data from: {setup_path}")

num_runs = 5
max_tokens = 64
times = []
with open(os.path.join(setup_path, "benchmark_times.txt"), "r") as f:
    for line in f:
        times.append(float(line.strip()))

# Derive summary stats from the saved benchmark files
avg_time = sum(times) / len(times)
min_time = min(times)
max_time = max(times)

with open(os.path.join(setup_path, "benchmark_memory.txt"), "r") as f:
    allocated_gb, peak_gb, reserved_gb = (float(x) for x in f.read().strip().split(","))
final_mem = {"allocated_gb": allocated_gb, "peak_gb": peak_gb, "reserved_gb": reserved_gb}

with open(os.path.join(setup_path, "benchmark_avg_tokens_per_sec.txt"), "r") as f:
    avg_tokens_per_sec = float(f.read().strip())


# Minimal brutalist palette (dark theme): grayscale + 1 accent
ACCENT = '#5ec8f8'   # calm cyan-blue accent
FG = '#e6e6e6'       # light gray text/lines
MUTED = '#9aa0a6'    # muted gray for secondary
GRID = '#333333'     # grid lines

# Styling tuned for clarity, high contrast, few colors
plt.style.use('dark_background')
plt.rcParams['figure.facecolor'] = 'none'
plt.rcParams['axes.facecolor'] = 'none'
plt.rcParams['savefig.facecolor'] = 'none'
plt.rcParams['savefig.transparent'] = True
plt.rcParams['font.family'] = 'monospace'
plt.rcParams['font.weight'] = 'bold'
plt.rcParams['axes.linewidth'] = 3
plt.rcParams['grid.linewidth'] = 2
plt.rcParams['lines.linewidth'] = 3
plt.rcParams['patch.linewidth'] = 2

# Prepare data
runs = list(range(1, len(times) + 1))
tokens_per_sec_all = [max_tokens / t for t in times]

# Chart 1: Throughput Performance
fig1, ax1 = plt.subplots(1, 1, figsize=(12, 6))
fig1.patch.set_alpha(0)
ax1.patch.set_alpha(0)

ax1.plot(runs, tokens_per_sec_all, color=ACCENT, marker='o', markersize=12,
         markerfacecolor=ACCENT, markeredgecolor=FG, markeredgewidth=3, linewidth=5, label='tok/s')
ax1.fill_between(runs, 0, tokens_per_sec_all, alpha=0.2, color=ACCENT)
ax1.axhline(y=avg_tokens_per_sec, color=FG, linestyle='--', linewidth=3,
            label=f'AVG: {avg_tokens_per_sec:.1f}')
ax1.set_title('THROUGHPUT PERFORMANCE', color=FG, fontsize=18, pad=20, fontweight='bold')
ax1.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold')
ax1.set_ylabel('TOKENS/SEC', color=FG, fontsize=14, fontweight='bold')
ax1.grid(True, color=GRID, alpha=0.5, linewidth=2)
ax1.tick_params(colors=FG, labelsize=12)
legend1 = ax1.legend(frameon=False, loc='lower right')
for text in legend1.get_texts():
    text.set_color(FG)
    text.set_fontweight('bold')
plt.tight_layout()
plt.savefig('throughput.png', dpi=150, bbox_inches='tight', transparent=True)
plt.show()

# Chart 2: Generation Latency
fig2, ax2 = plt.subplots(1, 1, figsize=(12, 6))
fig2.patch.set_alpha(0)
ax2.patch.set_alpha(0)

bar_colors = [ACCENT if i % 2 == 0 else MUTED for i in range(len(times))]
bars = ax2.bar(runs, times, color=bar_colors, edgecolor=FG, linewidth=3, width=0.6)
ax2.axhline(y=avg_time, color=FG, linestyle='--', linewidth=3,
            label=f'AVG: {avg_time:.2f}s')
for i, (run, time, bar) in enumerate(zip(runs, times, bars)):
    ax2.text(run, time + 0.02, f'{time:.2f}s', ha='center', va='bottom',
             color=FG, fontweight='bold', fontsize=11)
ax2.set_title('GENERATION LATENCY', color=FG, fontsize=18, pad=20, fontweight='bold')
ax2.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold')
ax2.set_ylabel('TIME (SECONDS)', color=FG, fontsize=14, fontweight='bold')
ax2.grid(True, axis='y', color=GRID, alpha=0.5, linewidth=2)
ax2.tick_params(colors=FG, labelsize=12)
ax2.set_ylim(0, max(times) * 1.15)
legend2 = ax2.legend(frameon=False, loc='upper right')
for text in legend2.get_texts():
    text.set_color(FG)
    text.set_fontweight('bold')
plt.tight_layout()
plt.savefig('latency.png', dpi=150, bbox_inches='tight', transparent=True)
plt.show()

# Chart 3: Memory Usage
fig3, ax3 = plt.subplots(1, 1, figsize=(12, 6))
fig3.patch.set_alpha(0)
ax3.patch.set_alpha(0)

memory_labels = ['ALLOCATED', 'PEAK', 'RESERVED']
memory_values = [final_mem['allocated_gb'], final_mem['peak_gb'], final_mem['reserved_gb']]
colors_mem = [MUTED, ACCENT, FG]
bars = ax3.barh(memory_labels, memory_values, color=colors_mem, edgecolor=FG, linewidth=3, height=0.5)
for i, (label, value, bar) in enumerate(zip(memory_labels, memory_values, bars)):
    ax3.text(value + 0.5, i, f'{value:.1f} GB', va='center',
             color=FG, fontweight='bold', fontsize=13)
ax3.set_title('MEMORY USAGE', color=FG, fontsize=18, pad=20, fontweight='bold')
ax3.set_xlabel('GIGABYTES', color=FG, fontsize=14, fontweight='bold')
ax3.set_xlim(0, max(memory_values) * 1.3)
ax3.grid(True, axis='x', color=GRID, alpha=0.5, linewidth=2)
ax3.tick_params(colors=FG, labelsize=12)
ax3.set_yticks(range(len(memory_labels)))
ax3.set_yticklabels(memory_labels, fontweight='bold')
plt.tight_layout()
plt.savefig('memory.png', dpi=150, bbox_inches='tight', transparent=True)
plt.show()

print(f"\n📊 Charts saved as:")
print(f"  • throughput.png")
print(f"  • latency.png")
print(f"  • memory.png")
print(f"\nBenchmark Summary:")
print(f"  avg tokens/sec: {avg_tokens_per_sec:.1f}")
print(f"  min time: {min_time:.3f}s")
print(f"  max time: {max_time:.3f}s")
print(f"  peak memory: {final_mem['peak_gb']:.2f}GB")
Output:

Reading benchmark data from: /home/ubuntu/Projects/yamoe-gpt-integration/.uvnote/cache/0e89c413a25ded7b4d6fab2a010f0538ba2b35fb5f619a0dfced3121d3ccf879

📊 Charts saved as:
  • throughput.png
  • latency.png
  • memory.png

Benchmark Summary:
  avg tokens/sec: 5.3
  min time: 12.070s
  max time: 12.075s
  peak memory: 9.41GB

Artifacts:

latency.png
memory.png
throughput.png