Cell: setup | 304.89s

code:
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "accelerate>=1.10.1",
#     "torch>=2.7.0",
#     "kernels==0.10.0",
#     "transformers@https://github.com/huggingface/transformers.git",
#     "ipdb>=0.13.13",
#     "matplotlib>=3.7.2",
#     "numpy>=1.24.3",
# ]
# ///

import gc
import sys
import time

import torch
import torch.nn as nn
import torch.profiler
from kernels import register_kernel_mapping, Mode, LayerRepository
from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config


def reset_peak_memory_stats():
    """Clear the CUDA cache and reset peak-memory counters."""
    torch.cuda.empty_cache()
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    gc.collect()


def get_memory_stats():
    """Get current, peak, and reserved CUDA memory usage in GB."""
    if not torch.cuda.is_available():
        return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0}
    return {
        "allocated_gb": torch.cuda.memory_allocated() / 1e9,
        "peak_gb": torch.cuda.max_memory_allocated() / 1e9,
        "reserved_gb": torch.cuda.memory_reserved() / 1e9,
    }


def override_kernel_layer_name(cls_name: str, value) -> bool:
    """Dynamically override kernel_layer_name on a model class found in loaded modules."""
    for mod in sys.modules.values():
        if mod is None:
            continue
        obj = getattr(mod, cls_name, None)
        if isinstance(obj, type) and issubclass(obj, nn.Module):
            setattr(obj, "kernel_layer_name", value)
            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
            return True
    return False


def run_generation(model, inputs, max_tokens=64):
    """Run a single greedy generation pass and measure its duration."""
    with torch.inference_mode():
        start_time = time.perf_counter()
        generated = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False,
            temperature=None,
        )
        end_time = time.perf_counter()
    return generated, end_time - start_time


# Initialize the model the normal way.
model_id = "openai/gpt-oss-20b"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
quantization_config = Mxfp4Config(dequantize=True)

# Add a custom kernel mapping: route the MoE MLP to the Yamoe kernel.
custom_mapping = dict(
    Yamoe=dict(
        cuda={
            Mode.INFERENCE: LayerRepository(
                repo_id="drbh/yamoe",
                layer_name="Yamoe",
                revision="v0.3.0",
            ),
        },
    )
)
# First register the mapping, then point the model class at it.
register_kernel_mapping(custom_mapping)
override_kernel_layer_name("GptOssMLP", "Yamoe")

# TODO: remove this line once RMSNorm is working
override_kernel_layer_name("GptOssRMSNorm", None)

## Normal model stuff

model = GptOssForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    device_map="auto",
    use_kernels=True,
    quantization_config=quantization_config,
).eval()

messages = [
    {"role": "system", "content": "What is Tensor Parallelism?"},
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="low",
).to("cuda")

print("\n=== Running Benchmarks ===")
print(f"Model: {model_id}")
print(f"Device: {torch.cuda.get_device_name()}")
print(f"Initial memory: {get_memory_stats()}\n")

# Warmup
print("Running warmup...")
for _ in range(2):
    _ = run_generation(model, inputs, max_tokens=16)

reset_peak_memory_stats()

# Benchmark runs
num_runs = 5
max_tokens = 64
times = []

print(f"\nRunning {num_runs} benchmark iterations with {max_tokens} tokens...")
for i in range(num_runs):
    reset_peak_memory_stats()
    generated, elapsed = run_generation(model, inputs, max_tokens)
    times.append(elapsed)
    mem_stats = get_memory_stats()
    tokens_per_sec = max_tokens / elapsed
    print(f"Run {i+1}: {elapsed:.3f}s ({tokens_per_sec:.1f} tok/s) | Peak: {mem_stats['peak_gb']:.2f}GB")

# Statistics
avg_time = sum(times) / len(times)
min_time = min(times)
max_time = max(times)
avg_tokens_per_sec = max_tokens / avg_time

print(f"\n=== Benchmark Results ===")
print(f"Average: {avg_time:.3f}s ({avg_tokens_per_sec:.1f} tok/s)")
print(f"Min: {min_time:.3f}s | Max: {max_time:.3f}s")

# Final memory stats
final_mem = get_memory_stats()
print(f"\nFinal Memory:")
print(f"  Allocated: {final_mem['allocated_gb']:.2f}GB")
print(f"  Peak: {final_mem['peak_gb']:.2f}GB")
print(f"  Reserved: {final_mem['reserved_gb']:.2f}GB")

print("\n=== Running with Profiler ===")
reset_peak_memory_stats()

with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
) as prof:
    generated, elapsed = run_generation(model, inputs, max_tokens=64)

print(f"Generation time: {elapsed:.2f} seconds")

# Print profiler results
print("\n=== Top 10 CUDA operations by time ===")
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

print("\n=== Top 10 operations by memory ===")
print(prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=10))

# Memory stats
mem_stats = get_memory_stats()
print(f"\nPeak Memory: {mem_stats['peak_gb']:.2f}GB")

# Save trace if needed
prof.export_chrome_trace("trace.json")
print("\nProfile trace saved to trace.json")

# Decode and print only the newly generated tokens
print("\nGenerated text:")
print(tokenizer.decode(generated[0][inputs["input_ids"].shape[-1]:]))

# Save times and memory stats for charting
with open("benchmark_times.txt", "w") as f:
    for t in times:
        f.write(f"{t}\n")
with open("benchmark_memory.txt", "w") as f:
    f.write(f"{final_mem['allocated_gb']},{final_mem['peak_gb']},{final_mem['reserved_gb']}\n")

# Save avg_tokens_per_sec for charting
with open("benchmark_avg_tokens_per_sec.txt", "w") as f:
    f.write(f"{avg_tokens_per_sec}\n")

output:
Overrode GptOssMLP.kernel_layer_name to Yamoe
Overrode GptOssRMSNorm.kernel_layer_name to None

=== Running Benchmarks ===
Model: openai/gpt-oss-20b
Device: NVIDIA L4
Initial memory: {'allocated_gb': 9.390148608, 'peak_gb': 15.5643264, 'reserved_gb': 17.177772032}

Running warmup...

Running 5 benchmark iterations with 64 tokens...
Run 1: 12.075s (5.3 tok/s) | Peak: 9.41GB
Run 2: 12.071s (5.3 tok/s) | Peak: 9.41GB
Run 3: 12.070s (5.3 tok/s) | Peak: 9.41GB
Run 4: 12.071s (5.3 tok/s) | Peak: 9.41GB
Run 5: 12.071s (5.3 tok/s) | Peak: 9.41GB

=== Benchmark Results ===
Average: 12.072s (5.3 tok/s)
Min: 12.070s | Max: 12.075s

Final Memory:
  Allocated: 9.40GB
  Peak: 9.41GB
  Reserved: 10.33GB

=== Running with Profiler ===
Generation time: 12.73 seconds

=== Top 10 CUDA operations by time ===
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg CPU Mem Self CPU Mem CUDA Mem Self CUDA Mem # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 _yamoe_74a2acb_dirty::experts 1.40% 148.156ms 66.87% 7.074s 4.606ms 52.388ms 0.46% 10.583s 6.890ms 0 B -2.98 KB 18.88 MB -2.11 GB 1536
 aten::bmm 1.25% 132.560ms 1.75% 185.015ms 29.803us 10.486s 91.79% 10.486s 1.689ms 0 B 0 B 63.12 MB 63.12 MB 6208
void cutlass::Kernel2<cutlass_80_wmma_tensorop_bf16_... 0.00% 0.000us 0.00% 0.000us 0.000us 10.319s 90.32% 10.319s 3.412ms 0 B 0 B 0 B 0 B 3024
 aten::linear 0.54% 57.566ms 3.78% 399.802ms 51.627us 0.000us 0.00% 645.165ms 83.312us 0 B 0 B 76.88 MB 0 B 7744
 aten::addmm 1.81% 191.354ms 2.57% 272.095ms 35.429us 352.039ms 3.08% 352.151ms 45.853us 0 B 0 B 52.31 MB 52.31 MB 7680
std::enable_if<!(false), void>::type internal::gemvx... 0.00% 0.000us 0.00% 0.000us 0.000us 344.917ms 3.02% 344.917ms 74.982us 0 B 0 B 0 B 0 B 4600
 aten::matmul 0.31% 32.441ms 1.72% 181.712ms 56.785us 0.000us 0.00% 303.821ms 94.944us 0 B 0 B 87.68 MB 0 B 3200
std::enable_if<!(false), void>::type internal::gemvx... 0.00% 0.000us 0.00% 0.000us 0.000us 293.850ms 2.57% 293.850ms 97.173us 0 B 0 B 0 B 0 B 3024
 aten::mm 0.01% 1.506ms 0.02% 2.161ms 33.768us 293.014ms 2.56% 293.014ms 4.578ms 0 B 0 B 24.56 MB 24.56 MB 64
 ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn 0.00% 0.000us 0.00% 0.000us 0.000us 102.278ms 0.90% 102.278ms 4.262ms 0 B 0 B 0 B 0 B 24
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 10.579s
Self CUDA time total: 11.424s


=== Top 10 operations by memory ===
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg CPU Mem Self CPU Mem CUDA Mem Self CUDA Mem # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
 aten::empty 0.68% 72.026ms 0.68% 72.026ms 4.244us 0.000us 0.00% 0.000us 0.000us 296 B 296 B 3.49 GB 3.49 GB 16973
 aten::clamp 0.46% 48.185ms 0.69% 72.630ms 15.762us 10.269ms 0.09% 10.269ms 2.229us 0 B 0 B 616.69 MB 616.69 MB 4608
 aten::mul 1.76% 186.048ms 2.93% 310.383ms 14.181us 47.780ms 0.42% 47.792ms 2.184us 784 B 784 B 554.93 MB 554.93 MB 21888
 aten::cat 0.78% 82.030ms 1.22% 129.113ms 16.536us 17.028ms 0.15% 17.030ms 2.181us 0 B 0 B 387.88 MB 387.88 MB 7808
 aten::sigmoid 0.09% 9.855ms 0.16% 16.652ms 10.841us 2.889ms 0.03% 2.889ms 1.881us 0 B 0 B 307.97 MB 307.97 MB 1536
 aten::empty_strided 1.08% 114.498ms 1.10% 116.720ms 5.564us 0.000us 0.00% 0.000us 0.000us 0 B 0 B 216.60 MB 216.60 MB 20979
 aten::add 0.93% 97.861ms 1.56% 164.673ms 15.047us 16.394ms 0.14% 16.395ms 1.498us 0 B 0 B 91.03 MB 91.03 MB 10944
 aten::pow 0.36% 38.271ms 0.55% 58.020ms 18.501us 4.117ms 0.04% 4.117ms 1.313us 0 B 0 B 75.58 MB 75.58 MB 3136
 aten::bmm 1.25% 132.560ms 1.75% 185.015ms 29.803us 10.486s 91.79% 10.486s 1.689ms 0 B 0 B 63.12 MB 63.12 MB 6208
 aten::sub 0.51% 53.869ms 0.82% 87.218ms 13.626us 9.277ms 0.08% 9.355ms 1.461us 0 B 0 B 53.04 MB 53.01 MB 6401
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 10.579s
Self CUDA time total: 11.424s


Peak Memory: 9.41GB

Profile trace saved to trace.json

Generated text:
<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices, so each device holds a slice of the matrix. During forward/backward passes, each device computes partial results and then they are
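The answer is cut off at max_new_tokens=64, but the mechanism it starts to describe (sharding a weight matrix so each device computes a partial result) is easy to verify numerically. As a toy single-device illustration, not part of the benchmark; the shapes and the column-wise split are arbitrary assumptions:

import torch

torch.manual_seed(0)
x = torch.randn(4, 8)   # activations
W = torch.randn(8, 6)   # full weight matrix

# Column-parallel split: each "device" holds half of W's output columns.
W0, W1 = W.chunk(2, dim=1)

# Each shard computes a partial result; concatenating the partials
# along the feature dimension reproduces the full matmul.
y_parallel = torch.cat([x @ W0, x @ W1], dim=-1)
assert torch.allclose(x @ W, y_parallel)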

UV Install Logs:
Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
Fetching 3 files: 33%|███▎ | 1/3 [00:11<00:23, 11.59s/it]
Fetching 3 files: 67%|██████▋ | 2/3 [00:16<00:07, 7.73s/it]
Fetching 3 files: 100%|██████████| 3/3 [00:16<00:00, 5.54s/it]
You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False

Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
Loading checkpoint shards: 33%|███▎ | 1/3 [00:03<00:06, 3.23s/it]
Loading checkpoint shards: 67%|██████▋ | 2/3 [00:06<00:03, 3.15s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00, 2.50s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00, 2.68s/it]

Fetching 6 files: 0%| | 0/6 [00:00<?, ?it/s]
Fetching 6 files: 17%|█▋ | 1/6 [00:00<00:00, 5.23it/s]
Fetching 6 files: 50%|█████ | 3/6 [00:00<00:00, 6.19it/s]
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 12.15it/s]
/tmp/uvnote-run-hjgpkuq6/home/.cache/uv/environments-v2/setup-30bb029f3f83f37d/lib/python3.12/site-packages/kernels/layer.py:868: UserWarning:
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
  warnings.warn(
/tmp/uvnote-run-hjgpkuq6/home/.cache/uv/environments-v2/setup-30bb029f3f83f37d/lib/python3.12/site-packages/kernels/layer.py:868: UserWarning:
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
  warnings.warn(