Cell: setup (304.89s)

```python
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "accelerate>=1.10.1",
#     "torch>=2.7.0",
#     "kernels==0.10.0",
#     "transformers@https://github.com/huggingface/transformers.git",
#     "ipdb>=0.13.13",
#     "matplotlib>=3.7.2",
#     "numpy>=1.24.3",
# ]
# ///

import gc
import sys
import time

import torch
import torch.nn as nn
import torch.profiler
from kernels import register_kernel_mapping, Mode, LayerRepository
from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config


def reset_peak_memory_stats():
    """Clear the CUDA cache and reset peak-memory counters."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
    gc.collect()


def get_memory_stats():
    """Return current and peak CUDA memory usage in GB."""
    if not torch.cuda.is_available():
        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
    return {
        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
    }


def override_kernel_layer_name(cls_name: str, value) -> bool:
    """Dynamically override `kernel_layer_name` on a model class found via sys.modules."""
    for mod in sys.modules.values():
        if mod is None:
            continue
        obj = getattr(mod, cls_name, None)
        if isinstance(obj, type) and issubclass(obj, nn.Module):
            setattr(obj, "kernel_layer_name", value)
            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
            return True
    return False


def run_generation(model, inputs, max_tokens=64):
    """Run a single generation pass and measure its duration."""
    with torch.inference_mode():
        start_time = time.perf_counter()
        generated = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False,
            temperature=None,
        )
        end_time = time.perf_counter()
    return generated, end_time - start_time


# Initialize the model the usual way
model_id = "openai/gpt-oss-20b"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
quantization_config = Mxfp4Config(dequantize=True)

# Add a custom kernel mapping that routes the MoE MLP to the Yamoe kernel
custom_mapping = dict(
    Yamoe=dict(
        cuda={
            Mode.INFERENCE: LayerRepository(
                repo_id="drbh/yamoe",
                layer_name="Yamoe",
                revision="v0.3.0",
            ),
        },
    )
)
# First register the mapping...
register_kernel_mapping(custom_mapping)
# ...then point the model class at it
override_kernel_layer_name("GptOssMLP", "Yamoe")

# TODO: remove this line once RMSNorm is working
override_kernel_layer_name("GptOssRMSNorm", None)

# Load the model as usual
model = GptOssForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    device_map="auto",
    use_kernels=True,
    quantization_config=quantization_config,
).eval()

messages = [
    {"role": "system", "content": "What is Tensor Parallelism?"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="low",
).to("cuda")

print("\n=== Running Benchmarks ===")
print(f"Model: {model_id}")
print(f"Device: {torch.cuda.get_device_name()}")
print(f"Initial memory: {get_memory_stats()}\n")

# Warmup
print("Running warmup...")
for _ in range(2):
    _ = run_generation(model, inputs, max_tokens=16)

reset_peak_memory_stats()

# Benchmark runs
num_runs = 5
max_tokens = 64
times = []

print(f"\nRunning {num_runs} benchmark iterations with {max_tokens} tokens...")
for i in range(num_runs):
    reset_peak_memory_stats()
    generated, elapsed = run_generation(model, inputs, max_tokens)
    times.append(elapsed)
    mem_stats = get_memory_stats()
    tokens_per_sec = max_tokens / elapsed
    print(f"Run {i+1}: {elapsed:.3f}s ({tokens_per_sec:.1f} tok/s) | Peak: {mem_stats['peak_gb']:.2f}GB")

# Statistics
avg_time = sum(times) / len(times)
min_time = min(times)
max_time = max(times)
avg_tokens_per_sec = max_tokens / avg_time

print("\n=== Benchmark Results ===")
print(f"Average: {avg_time:.3f}s ({avg_tokens_per_sec:.1f} tok/s)")
print(f"Min: {min_time:.3f}s | Max: {max_time:.3f}s")

# Final memory stats
final_mem = get_memory_stats()
print("\nFinal Memory:")
print(f"  Allocated: {final_mem['allocated_gb']:.2f}GB")
print(f"  Peak: {final_mem['peak_gb']:.2f}GB")
print(f"  Reserved: {final_mem['reserved_gb']:.2f}GB")

print("\n=== Running with Profiler ===")
reset_peak_memory_stats()

with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
) as prof:
    generated, elapsed = run_generation(model, inputs, max_tokens=64)

print(f"Generation time: {elapsed:.2f} seconds")

# Print profiler results
print("\n=== Top 10 CUDA operations by time ===")
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

print("\n=== Top 10 operations by memory ===")
print(prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=10))

# Memory stats
mem_stats = get_memory_stats()
print(f"\nPeak Memory: {mem_stats['peak_gb']:.2f}GB")

# Save trace for offline inspection
prof.export_chrome_trace("trace.json")
print("\nProfile trace saved to trace.json")

# Decode and print only the newly generated tokens
print("\nGenerated text:")
print(tokenizer.decode(generated[0][inputs["input_ids"].shape[-1] :]))

# Save times and memory stats for the charts cell
with open("benchmark_times.txt", "w") as f:
    for t in times:
        f.write(f"{t}\n")
with open("benchmark_memory.txt", "w") as f:
    f.write(f"{final_mem['allocated_gb']},{final_mem['peak_gb']},{final_mem['reserved_gb']}\n")

# Save avg_tokens_per_sec for charting
with open("benchmark_avg_tokens_per_sec.txt", "w") as f:
    f.write(f"{avg_tokens_per_sec}\n")
```
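Since the override works by patching a class attribute located through `sys.modules`, it is worth confirming after loading that the mapping actually took effect. A minimal sketch, assuming the standard transformers module layout (`model.model.layers[i].mlp`); this check is not part of the original script:

```python
# Hypothetical sanity check, run after from_pretrained(): the override above set
# GptOssMLP.kernel_layer_name = "Yamoe", so every MLP instance should report it.
mlp = model.model.layers[0].mlp
print(type(mlp).__name__)                        # MLP class actually in use
print(getattr(mlp, "kernel_layer_name", None))   # expected: "Yamoe"
```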
Output:

```
Overrode GptOssMLP.kernel_layer_name to Yamoe
Overrode GptOssRMSNorm.kernel_layer_name to None

=== Running Benchmarks ===
Model: openai/gpt-oss-20b
Device: NVIDIA L4
Initial memory: {'allocated_gb': 9.390148608, 'peak_gb': 15.5643264, 'reserved_gb': 17.177772032}

Running warmup...

Running 5 benchmark iterations with 64 tokens...
Run 1: 12.075s (5.3 tok/s) | Peak: 9.41GB
Run 2: 12.071s (5.3 tok/s) | Peak: 9.41GB
Run 3: 12.070s (5.3 tok/s) | Peak: 9.41GB
Run 4: 12.071s (5.3 tok/s) | Peak: 9.41GB
Run 5: 12.071s (5.3 tok/s) | Peak: 9.41GB

=== Benchmark Results ===
Average: 12.072s (5.3 tok/s)
Min: 12.070s | Max: 12.075s

Final Memory:
  Allocated: 9.40GB
  Peak: 9.41GB
  Reserved: 10.33GB

=== Running with Profiler ===
Generation time: 12.73 seconds

=== Top 10 CUDA operations by time ===
Name                                                      Self CPU %  Self CPU   CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  CPU Mem  Self CPU Mem  CUDA Mem  Self CUDA Mem  # of Calls
_yamoe_74a2acb_dirty::experts                             1.40%       148.156ms  66.87%       7.074s     4.606ms       52.388ms   0.46%        10.583s     6.890ms        0 B      -2.98 KB      18.88 MB  -2.11 GB       1536
aten::bmm                                                 1.25%       132.560ms  1.75%        185.015ms  29.803us      10.486s    91.79%       10.486s     1.689ms        0 B      0 B           63.12 MB  63.12 MB       6208
void cutlass::Kernel2<cutlass_80_wmma_tensorop_bf16_...   0.00%       0.000us    0.00%        0.000us    0.000us       10.319s    90.32%       10.319s     3.412ms        0 B      0 B           0 B       0 B            3024
aten::linear                                              0.54%       57.566ms   3.78%        399.802ms  51.627us      0.000us    0.00%        645.165ms   83.312us       0 B      0 B           76.88 MB  0 B            7744
aten::addmm                                               1.81%       191.354ms  2.57%        272.095ms  35.429us      352.039ms  3.08%        352.151ms   45.853us       0 B      0 B           52.31 MB  52.31 MB       7680
std::enable_if<!(false), void>::type internal::gemvx...   0.00%       0.000us    0.00%        0.000us    0.000us       344.917ms  3.02%        344.917ms   74.982us       0 B      0 B           0 B       0 B            4600
aten::matmul                                              0.31%       32.441ms   1.72%        181.712ms  56.785us      0.000us    0.00%        303.821ms   94.944us       0 B      0 B           87.68 MB  0 B            3200
std::enable_if<!(false), void>::type internal::gemvx...   0.00%       0.000us    0.00%        0.000us    0.000us       293.850ms  2.57%        293.850ms   97.173us       0 B      0 B           0 B       0 B            3024
aten::mm                                                  0.01%       1.506ms    0.02%        2.161ms    33.768us      293.014ms  2.56%        293.014ms   4.578ms        0 B      0 B           24.56 MB  24.56 MB       64
ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn            0.00%       0.000us    0.00%        0.000us    0.000us       102.278ms  0.90%        102.278ms   4.262ms        0 B      0 B           0 B       0 B            24
Self CPU time total: 10.579s
Self CUDA time total: 11.424s

=== Top 10 operations by memory ===
Name                 Self CPU %  Self CPU   CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  CPU Mem  Self CPU Mem  CUDA Mem   Self CUDA Mem  # of Calls
aten::empty          0.68%       72.026ms   0.68%        72.026ms   4.244us       0.000us    0.00%        0.000us     0.000us        296 B    296 B         3.49 GB    3.49 GB        16973
aten::clamp          0.46%       48.185ms   0.69%        72.630ms   15.762us      10.269ms   0.09%        10.269ms    2.229us        0 B      0 B           616.69 MB  616.69 MB      4608
aten::mul            1.76%       186.048ms  2.93%        310.383ms  14.181us      47.780ms   0.42%        47.792ms    2.184us        784 B    784 B         554.93 MB  554.93 MB      21888
aten::cat            0.78%       82.030ms   1.22%        129.113ms  16.536us      17.028ms   0.15%        17.030ms    2.181us        0 B      0 B           387.88 MB  387.88 MB      7808
aten::sigmoid        0.09%       9.855ms    0.16%        16.652ms   10.841us      2.889ms    0.03%        2.889ms     1.881us        0 B      0 B           307.97 MB  307.97 MB      1536
aten::empty_strided  1.08%       114.498ms  1.10%        116.720ms  5.564us       0.000us    0.00%        0.000us     0.000us        0 B      0 B           216.60 MB  216.60 MB      20979
aten::add            0.93%       97.861ms   1.56%        164.673ms  15.047us      16.394ms   0.14%        16.395ms    1.498us        0 B      0 B           91.03 MB   91.03 MB       10944
aten::pow            0.36%       38.271ms   0.55%        58.020ms   18.501us      4.117ms    0.04%        4.117ms     1.313us        0 B      0 B           75.58 MB   75.58 MB       3136
aten::bmm            1.25%       132.560ms  1.75%        185.015ms  29.803us      10.486s    91.79%       10.486s     1.689ms        0 B      0 B           63.12 MB   63.12 MB       6208
aten::sub            0.51%       53.869ms   0.82%        87.218ms   13.626us      9.277ms    0.08%        9.355ms     1.461us        0 B      0 B           53.04 MB   53.01 MB       6401
Self CPU time total: 10.579s
Self CUDA time total: 11.424s

Peak Memory: 9.41GB

Profile trace saved to trace.json

Generated text:
<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices, so each device holds a slice of the matrix. During forward/backward passes, each device computes partial results and then they are
```
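The exported `trace.json` is a standard Chrome trace, so it can be inspected interactively at `chrome://tracing` or https://ui.perfetto.dev. For a quick programmatic look, a sketch along these lines works; it assumes only the standard Chrome trace JSON layout that `export_chrome_trace` produces:

```python
# Print the five longest complete events ("ph" == "X", duration "dur" in microseconds)
# from the exported trace. Sketch only; not part of the original cell.
import json

with open("trace.json") as f:
    trace = json.load(f)

events = [e for e in trace.get("traceEvents", []) if e.get("ph") == "X" and "dur" in e]
for e in sorted(events, key=lambda ev: ev["dur"], reverse=True)[:5]:
    print(f'{e["dur"] / 1e3:10.3f} ms  {e.get("name", "?")}')
```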
The repeated `No kernel mapping found for layer `None`` warnings below are expected: the setup cell deliberately sets `GptOssRMSNorm.kernel_layer_name` to `None`, so those layers fall back to the original forward implementation.

UV install logs:

```
Fetching 3 files: 100%|██████████| 3/3 [00:16<00:00,  5.54s/it]
You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00,  2.68s/it]
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 12.15it/s]
/tmp/uvnote-run-hjgpkuq6/home/.cache/uv/environments-v2/setup-30bb029f3f83f37d/lib/python3.12/site-packages/kernels/layer.py:868: UserWarning:
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
  warnings.warn(
```
Cell: charts (deps: matplotlib, numpy | 3.51s)

```python
import os

import matplotlib.pyplot as plt
import numpy as np

# Get the path from the UVNOTE_INPUT_SETUP env var (where the setup cell wrote its files)
setup_path = os.getenv("UVNOTE_INPUT_SETUP", ".")
print(f"Reading benchmark data from: {setup_path}")

max_tokens = 64
times_file = os.path.join(setup_path, "benchmark_times.txt")
memory_file = os.path.join(setup_path, "benchmark_memory.txt")

times = []
with open(times_file, "r") as f:
    for line in f:
        times.append(float(line.strip()))

avg_time = sum(times) / len(times)
min_time = min(times)
max_time = max(times)

with open(memory_file, "r") as f:
    allocated_gb, peak_gb, reserved_gb = (float(x) for x in f.read().strip().split(","))
final_mem = {"allocated_gb": allocated_gb, "peak_gb": peak_gb, "reserved_gb": reserved_gb}

with open(os.path.join(setup_path, "benchmark_avg_tokens_per_sec.txt"), "r") as f:
    avg_tokens_per_sec = float(f.read().strip())

# Minimal brutalist palette (dark theme): grayscale + 1 accent
ACCENT = '#5ec8f8'   # calm cyan-blue accent
FG = '#e6e6e6'       # light gray text/lines
MUTED = '#9aa0a6'    # muted gray for secondary
GRID = '#333333'     # grid lines

# Styling tuned for clarity, high contrast, few colors
plt.style.use('dark_background')
plt.rcParams['figure.facecolor'] = 'none'
plt.rcParams['axes.facecolor'] = 'none'
plt.rcParams['savefig.facecolor'] = 'none'
plt.rcParams['savefig.transparent'] = True
plt.rcParams['font.family'] = 'monospace'
plt.rcParams['font.weight'] = 'bold'
plt.rcParams['axes.linewidth'] = 3
plt.rcParams['grid.linewidth'] = 2
plt.rcParams['lines.linewidth'] = 3
plt.rcParams['patch.linewidth'] = 2

# Prepare data
runs = list(range(1, len(times) + 1))
tokens_per_sec_all = [max_tokens / t for t in times]

# Chart 1: Throughput Performance
fig1, ax1 = plt.subplots(1, 1, figsize=(12, 6))
fig1.patch.set_alpha(0)
ax1.patch.set_alpha(0)

ax1.plot(runs, tokens_per_sec_all, color=ACCENT, marker='o', markersize=12,
         markerfacecolor=ACCENT, markeredgecolor=FG, markeredgewidth=3, linewidth=5, label='tok/s')
ax1.fill_between(runs, 0, tokens_per_sec_all, alpha=0.2, color=ACCENT)
ax1.axhline(y=avg_tokens_per_sec, color=FG, linestyle='--', linewidth=3,
            label=f'AVG: {avg_tokens_per_sec:.1f}')
ax1.set_title('THROUGHPUT PERFORMANCE', color=FG, fontsize=18, pad=20, fontweight='bold')
ax1.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold')
ax1.set_ylabel('TOKENS/SEC', color=FG, fontsize=14, fontweight='bold')
ax1.grid(True, color=GRID, alpha=0.5, linewidth=2)
ax1.tick_params(colors=FG, labelsize=12)
legend1 = ax1.legend(frameon=False, loc='lower right')
for text in legend1.get_texts():
    text.set_color(FG)
    text.set_fontweight('bold')
plt.tight_layout()
plt.savefig('throughput.png', dpi=150, bbox_inches='tight', transparent=True)
plt.show()

# Chart 2: Generation Latency
fig2, ax2 = plt.subplots(1, 1, figsize=(12, 6))
fig2.patch.set_alpha(0)
ax2.patch.set_alpha(0)

bar_colors = [ACCENT if i % 2 == 0 else MUTED for i in range(len(times))]
bars = ax2.bar(runs, times, color=bar_colors, edgecolor=FG, linewidth=3, width=0.6)
ax2.axhline(y=avg_time, color=FG, linestyle='--', linewidth=3,
            label=f'AVG: {avg_time:.2f}s')
for run, t in zip(runs, times):
    ax2.text(run, t + 0.02, f'{t:.2f}s', ha='center', va='bottom',
             color=FG, fontweight='bold', fontsize=11)
ax2.set_title('GENERATION LATENCY', color=FG, fontsize=18, pad=20, fontweight='bold')
ax2.set_xlabel('RUN NUMBER', color=FG, fontsize=14, fontweight='bold')
ax2.set_ylabel('TIME (SECONDS)', color=FG, fontsize=14, fontweight='bold')
ax2.grid(True, axis='y', color=GRID, alpha=0.5, linewidth=2)
ax2.tick_params(colors=FG, labelsize=12)
ax2.set_ylim(0, max(times) * 1.15)
legend2 = ax2.legend(frameon=False, loc='upper right')
for text in legend2.get_texts():
    text.set_color(FG)
    text.set_fontweight('bold')
plt.tight_layout()
plt.savefig('latency.png', dpi=150, bbox_inches='tight', transparent=True)
plt.show()

# Chart 3: Memory Usage
fig3, ax3 = plt.subplots(1, 1, figsize=(12, 6))
fig3.patch.set_alpha(0)
ax3.patch.set_alpha(0)

memory_labels = ['ALLOCATED', 'PEAK', 'RESERVED']
memory_values = [final_mem['allocated_gb'], final_mem['peak_gb'], final_mem['reserved_gb']]
colors_mem = [MUTED, ACCENT, FG]
bars = ax3.barh(memory_labels, memory_values, color=colors_mem, edgecolor=FG, linewidth=3, height=0.5)
for i, value in enumerate(memory_values):
    ax3.text(value + 0.5, i, f'{value:.1f} GB', va='center',
             color=FG, fontweight='bold', fontsize=13)
ax3.set_title('MEMORY USAGE', color=FG, fontsize=18, pad=20, fontweight='bold')
ax3.set_xlabel('GIGABYTES', color=FG, fontsize=14, fontweight='bold')
ax3.set_xlim(0, max(memory_values) * 1.3)
ax3.grid(True, axis='x', color=GRID, alpha=0.5, linewidth=2)
ax3.tick_params(colors=FG, labelsize=12)
ax3.set_yticks(range(len(memory_labels)))
ax3.set_yticklabels(memory_labels, fontweight='bold')
plt.tight_layout()
plt.savefig('memory.png', dpi=150, bbox_inches='tight', transparent=True)
plt.show()

print("\n📊 Charts saved as:")
print("  • throughput.png")
print("  • latency.png")
print("  • memory.png")
print("\nBenchmark Summary:")
print(f"  avg tokens/sec: {avg_tokens_per_sec:.1f}")
print(f"  min time: {min_time:.3f}s")
print(f"  max time: {max_time:.3f}s")
print(f"  peak memory: {final_mem['peak_gb']:.2f}GB")
```
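The two cells communicate only through the three text files the setup cell writes, so the file formats are worth pinning down if they are ever consumed elsewhere. A minimal sketch of that contract; the helper name is hypothetical, not part of the notebook:

```python
# Hypothetical loader for the setup cell's artifacts:
#   benchmark_times.txt              one generation latency in seconds per line
#   benchmark_memory.txt             "allocated_gb,peak_gb,reserved_gb" on one line
#   benchmark_avg_tokens_per_sec.txt a single float
import os

def load_benchmark_artifacts(setup_path: str):
    with open(os.path.join(setup_path, "benchmark_times.txt")) as f:
        times = [float(line) for line in f if line.strip()]
    with open(os.path.join(setup_path, "benchmark_memory.txt")) as f:
        allocated_gb, peak_gb, reserved_gb = (float(x) for x in f.read().strip().split(","))
    with open(os.path.join(setup_path, "benchmark_avg_tokens_per_sec.txt")) as f:
        avg_tokens_per_sec = float(f.read().strip())
    memory = {"allocated_gb": allocated_gb, "peak_gb": peak_gb, "reserved_gb": reserved_gb}
    return times, memory, avg_tokens_per_sec
```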
Output:

```
Reading benchmark data from: /home/ubuntu/Projects/yamoe-gpt-integration/.uvnote/cache/0e89c413a25ded7b4d6fab2a010f0538ba2b35fb5f619a0dfced3121d3ccf879

📊 Charts saved as:
  • throughput.png
  • latency.png
  • memory.png

Benchmark Summary:
  avg tokens/sec: 5.3
  min time: 0.000s
  max time: 0.000s
  peak memory: 0.00GB
```
UV install logs:

```
/home/ubuntu/Projects/yamoe-gpt-integration/.uvnote/cells/charts.py:123: UserWarning: Attempting to set identical low and high xlims makes transformation singular; automatically expanding.
  ax3.set_xlim(0, max(memory_values) * 1.3)
/home/ubuntu/Projects/yamoe-gpt-integration/.uvnote/cells/charts.py:128: UserWarning: Tight layout not applied. The left and right margins cannot be made large enough to accommodate all Axes decorations.
  plt.tight_layout()
```

Artifacts:

- latency.png
- memory.png
- throughput.png