Cell: setup | 304.89s

Code:
-1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - 10 - 11 - 12 - 13 - 14 - 15 - 16 - 17 - 18 - 19 - 20 - 21 - 22 - 23 - 24 - 25 - 26 - 27 - 28 - 29 - 30 - 31 - 32 - 33 - 34 - 35 - 36 - 37 - 38 - 39 - 40 - 41 - 42 - 43 - 44 - 45 - 46 - 47 - 48 - 49 - 50 - 51 - 52 - 53 - 54 - 55 - 56 - 57 - 58 - 59 - 60 - 61 - 62 - 63 - 64 - 65 - 66 - 67 - 68 - 69 - 70 - 71 - 72 - 73 - 74 - 75 - 76 - 77 - 78 - 79 - 80 - 81 - 82 - 83 - 84 - 85 - 86 - 87 - 88 - 89 - 90 - 91 - 92 - 93 - 94 - 95 - 96 - 97 - 98 - 99 -100 -101 -102 -103 -104 -105 -106 -107 -108 -109 -110 -111 -112 -113 -114 -115 -116 -117 -118 -119 -120 -121 -122 -123 -124 -125 -126 -127 -128 -129 -130 -131 -132 -133 -134 -135 -136 -137 -138 -139 -140 -141 -142 -143 -144 -145 -146 -147 -148 -149 -150 -151 -152 -153 -154 -155 -156 -157 -158 -159 -160 -161 -162 -163 -164 -165 -166 -167 -168 -169 -170 -171 -172 -173 -174 -175 -176 -177 -178 -179 -180 -181 -182 -183 -184 -185 -186 -187 -188 -189 -190 -191 -192 -193 -194 -195 -196 -197 -198 -199 -200 -201 -202 -203 -204 -205 -206 -207 -208 -209 -210 -211 -212 -213 -214 -215 -216 -217 -218 -219 -220 | # /// script -# requires-python = ">=3.12" -# dependencies = [ -# "accelerate>=1.10.1", -# "torch>=2.7.0", -# "kernels==0.10.0", -# "transformers@https://github.com/huggingface/transformers.git", -# "ipdb>=0.13.13", -# "matplotlib>=3.7.2", -# "numpy>=1.24.3", -# ] -# /// - -import torch -from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config -import time -import torch.nn as nn -from kernels import register_kernel_mapping, Mode, LayerRepository -import sys -import torch.profiler -import gc - -def reset_peak_memory_stats(): - """Clear CUDA cache and reset memory allocation counters.""" - torch.cuda.empty_cache() - if torch.cuda.is_available(): - torch.cuda.reset_peak_memory_stats() - gc.collect() - -def get_memory_stats(): - """Get current and peak CUDA memory usage.""" - if not torch.cuda.is_available(): - return {"allocated_gb": 0, "peak_gb": 0, "reserved_gb": 0} - return { - "allocated_gb": torch.cuda.memory_allocated() / 1e9, - "peak_gb": torch.cuda.max_memory_allocated() / 1e9, - "reserved_gb": torch.cuda.memory_reserved() / 1e9, - } - -def override_kernel_layer_name(cls_name: str, value) -> bool: - """Helper to dynamically override the kernel_layer_name in a model class.""" - for mod in sys.modules.values(): - if mod is None: - continue - obj = getattr(mod, cls_name, None) - if isinstance(obj, type) and issubclass(obj, nn.Module): - setattr(obj, "kernel_layer_name", value) - print(f"Overrode {cls_name}.kernel_layer_name to {value}") - return True - return False - -def run_generation(model, inputs, max_tokens=64): - """Run a single generation pass and measure its duration.""" - with torch.inference_mode(): - start_time = time.perf_counter() - generated = model.generate( - **inputs, - max_new_tokens=max_tokens, - do_sample=False, - temperature=None, - ) - end_time = time.perf_counter() - return generated, end_time - start_time - - -# Init the model the normal way -model_id = "openai/gpt-oss-20b" -tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id) -quantization_config = Mxfp4Config(dequantize=True) - -# Now we want to add some custom kernel mapping -custom_mapping = dict( - Yamoe=dict( - cuda={ - Mode.INFERENCE: LayerRepository( - repo_id="drbh/yamoe", - layer_name="Yamoe", - revision="v0.3.0", - ), - }, - ) -) -# First add the mapping -register_kernel_mapping(custom_mapping) -# Then override the layer name in the model class -override_kernel_layer_name("GptOssMLP", "Yamoe") - -# 
TODO: remove this line once RMSNorm is working -override_kernel_layer_name("GptOssRMSNorm", None) - -## Normal model stuff - -model = GptOssForCausalLM.from_pretrained( - model_id, - dtype="bfloat16", - device_map="auto", - use_kernels=True, - quantization_config=quantization_config, -).eval() - - -messages = [ - {"role": "system", "content": "What is Tensor Parallelism?"}, -] - -inputs = tokenizer.apply_chat_template( - messages, - add_generation_prompt=True, - return_tensors="pt", - return_dict=True, - reasoning_effort="low", -).to("cuda") - - - -def run_generation(model, inputs, max_tokens=64): - with torch.inference_mode(): - start_time = time.perf_counter() - generated = model.generate( - **inputs, - max_new_tokens=max_tokens, - do_sample=False, - temperature=None, - ) - end_time = time.perf_counter() - return generated, end_time - start_time - - -print("\n=== Running Benchmarks ===") -print(f"Model: {model_id}") -print(f"Device: {torch.cuda.get_device_name()}") -print(f"Initial memory: {get_memory_stats()}\n") - -# Warmup -print("Running warmup...") -for _ in range(2): - _ = run_generation(model, inputs, max_tokens=16) - -reset_peak_memory_stats() - -# Benchmark runs -num_runs = 5 -max_tokens = 64 -times = [] - -print(f"\nRunning {num_runs} benchmark iterations with {max_tokens} tokens...") -for i in range(num_runs): - reset_peak_memory_stats() - generated, elapsed = run_generation(model, inputs, max_tokens) - times.append(elapsed) - mem_stats = get_memory_stats() - tokens_per_sec = max_tokens / elapsed - print(f"Run {i+1}: {elapsed:.3f}s ({tokens_per_sec:.1f} tok/s) | Peak: {mem_stats['peak_gb']:.2f}GB") - -# Statistics -avg_time = sum(times) / len(times) -min_time = min(times) -max_time = max(times) -avg_tokens_per_sec = max_tokens / avg_time - -print(f"\n=== Benchmark Results ===") -print(f"Average: {avg_time:.3f}s ({avg_tokens_per_sec:.1f} tok/s)") -print(f"Min: {min_time:.3f}s | Max: {max_time:.3f}s") - -# Final memory stats -final_mem = get_memory_stats() -print(f"\nFinal Memory:") -print(f" Allocated: {final_mem['allocated_gb']:.2f}GB") -print(f" Peak: {final_mem['peak_gb']:.2f}GB") -print(f" Reserved: {final_mem['reserved_gb']:.2f}GB") - - -print("\n=== Running with Profiler ===") -reset_peak_memory_stats() - -with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - record_shapes=True, - profile_memory=True, - with_stack=True, -) as prof: - generated, elapsed = run_generation(model, inputs, max_tokens=64) - -print(f"Generation time: {elapsed:.2f} seconds") - -# Print profiler results -print("\n=== Top 10 CUDA operations by time ===") -print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)) - -print("\n=== Top 10 operations by memory ===") -print(prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=10)) - -# Memory stats -mem_stats = get_memory_stats() -print(f"\nPeak Memory: {mem_stats['peak_gb']:.2f}GB") - -# Save trace if needed -prof.export_chrome_trace("trace.json") -print("\nProfile trace saved to trace.json") - - -# Decode and print output -print("\nGenerated text:") -print(tokenizer.decode(generated[0][inputs["input_ids"].shape[-1] :])) - - -# save times and memory stats for charting -with open("benchmark_times.txt", "w") as f: - for t in times: - f.write(f"{t}\n") -with open("benchmark_memory.txt", "w") as f: - f.write(f"{final_mem['allocated_gb']},{final_mem['peak_gb']},{final_mem['reserved_gb']}\n") - -# save avg_tokens_per_sec for charting -with 
open("benchmark_avg_tokens_per_sec.txt", "w") as f: - f.write(f"{avg_tokens_per_sec}\n") - - |
Output:

```
Overrode GptOssMLP.kernel_layer_name to Yamoe
Overrode GptOssRMSNorm.kernel_layer_name to None

=== Running Benchmarks ===
Model: openai/gpt-oss-20b
Device: NVIDIA L4
Initial memory: {'allocated_gb': 9.390148608, 'peak_gb': 15.5643264, 'reserved_gb': 17.177772032}

Running warmup...

Running 5 benchmark iterations with 64 tokens...
Run 1: 12.075s (5.3 tok/s) | Peak: 9.41GB
Run 2: 12.071s (5.3 tok/s) | Peak: 9.41GB
Run 3: 12.070s (5.3 tok/s) | Peak: 9.41GB
Run 4: 12.071s (5.3 tok/s) | Peak: 9.41GB
Run 5: 12.071s (5.3 tok/s) | Peak: 9.41GB

=== Benchmark Results ===
Average: 12.072s (5.3 tok/s)
Min: 12.070s | Max: 12.075s

Final Memory:
  Allocated: 9.40GB
  Peak: 9.41GB
  Reserved: 10.33GB

=== Running with Profiler ===
Generation time: 12.73 seconds

=== Top 10 CUDA operations by time ===
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  -------------  ------------  ------------  ------------  -------------  ------------
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  -------------  ------------  ------------  ------------  -------------  ------------
                          _yamoe_74a2acb_dirty::experts         1.40%     148.156ms        66.87%        7.074s       4.606ms      52.388ms         0.46%       10.583s        6.890ms           0 B      -2.98 KB      18.88 MB       -2.11 GB          1536
                                              aten::bmm         1.25%     132.560ms         1.75%     185.015ms      29.803us       10.486s        91.79%       10.486s        1.689ms           0 B           0 B      63.12 MB       63.12 MB          6208
   void cutlass::Kernel2<cutlass_80_wmma_tensorop_bf16_...      0.00%       0.000us         0.00%       0.000us       0.000us       10.319s        90.32%       10.319s        3.412ms           0 B           0 B           0 B            0 B          3024
                                           aten::linear         0.54%      57.566ms         3.78%     399.802ms      51.627us       0.000us         0.00%     645.165ms       83.312us           0 B           0 B      76.88 MB            0 B          7744
                                            aten::addmm         1.81%     191.354ms         2.57%     272.095ms      35.429us     352.039ms         3.08%     352.151ms       45.853us           0 B           0 B      52.31 MB       52.31 MB          7680
   std::enable_if<!(false), void>::type internal::gemvx...      0.00%       0.000us         0.00%       0.000us       0.000us     344.917ms         3.02%     344.917ms       74.982us           0 B           0 B           0 B            0 B          4600
                                           aten::matmul         0.31%      32.441ms         1.72%     181.712ms      56.785us       0.000us         0.00%     303.821ms       94.944us           0 B           0 B      87.68 MB            0 B          3200
   std::enable_if<!(false), void>::type internal::gemvx...      0.00%       0.000us         0.00%       0.000us       0.000us     293.850ms         2.57%     293.850ms       97.173us           0 B           0 B           0 B            0 B          3024
                                               aten::mm         0.01%       1.506ms         0.02%       2.161ms      33.768us     293.014ms         2.56%     293.014ms        4.578ms           0 B           0 B      24.56 MB       24.56 MB            64
         ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn         0.00%       0.000us         0.00%       0.000us       0.000us     102.278ms         0.90%     102.278ms        4.262ms           0 B           0 B           0 B            0 B            24
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  -------------  ------------  ------------  ------------  -------------  ------------
Self CPU time total: 10.579s
Self CUDA time total: 11.424s

=== Top 10 operations by memory ===
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  -------------  ------------  ------------  ------------  -------------  ------------
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  -------------  ------------  ------------  ------------  -------------  ------------
                                            aten::empty         0.68%      72.026ms         0.68%      72.026ms       4.244us       0.000us         0.00%       0.000us        0.000us         296 B         296 B       3.49 GB        3.49 GB         16973
                                            aten::clamp         0.46%      48.185ms         0.69%      72.630ms      15.762us      10.269ms         0.09%      10.269ms        2.229us           0 B           0 B     616.69 MB      616.69 MB          4608
                                              aten::mul         1.76%     186.048ms         2.93%     310.383ms      14.181us      47.780ms         0.42%      47.792ms        2.184us         784 B         784 B     554.93 MB      554.93 MB         21888
                                              aten::cat         0.78%      82.030ms         1.22%     129.113ms      16.536us      17.028ms         0.15%      17.030ms        2.181us           0 B           0 B     387.88 MB      387.88 MB          7808
                                          aten::sigmoid         0.09%       9.855ms         0.16%      16.652ms      10.841us       2.889ms         0.03%       2.889ms        1.881us           0 B           0 B     307.97 MB      307.97 MB          1536
                                    aten::empty_strided         1.08%     114.498ms         1.10%     116.720ms       5.564us       0.000us         0.00%       0.000us        0.000us           0 B           0 B     216.60 MB      216.60 MB         20979
                                              aten::add         0.93%      97.861ms         1.56%     164.673ms      15.047us      16.394ms         0.14%      16.395ms        1.498us           0 B           0 B      91.03 MB       91.03 MB         10944
                                              aten::pow         0.36%      38.271ms         0.55%      58.020ms      18.501us       4.117ms         0.04%       4.117ms        1.313us           0 B           0 B      75.58 MB       75.58 MB          3136
                                              aten::bmm         1.25%     132.560ms         1.75%     185.015ms      29.803us       10.486s        91.79%       10.486s        1.689ms           0 B           0 B      63.12 MB       63.12 MB          6208
                                              aten::sub         0.51%      53.869ms         0.82%      87.218ms      13.626us       9.277ms         0.08%       9.355ms        1.461us           0 B           0 B      53.04 MB       53.01 MB          6401
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  -------------  ------------  ------------  ------------  -------------  ------------
Self CPU time total: 10.579s
Self CUDA time total: 11.424s

Peak Memory: 9.41GB

Profile trace saved to trace.json

Generated text:
<|channel|>analysis<|message|>We need to explain what Tensor Parallelism is. It's a concept in distributed training of large language models. It refers to splitting the weight matrices (tensors) across multiple devices, so each device holds a slice of the matrix. During forward/backward passes, each device computes partial results and then they are
```
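The exported `trace.json` can be opened interactively in `chrome://tracing` or ui.perfetto.dev. For a quick look without a browser, here is a minimal sketch that lists the longest events straight from the file; it assumes the standard Chrome trace format that `torch.profiler.export_chrome_trace` emits (a top-level `traceEvents` list whose entries carry a `name` and a `dur` field in microseconds):

```python
# Sketch: summarize the ten longest events in a Chrome trace file.
# Assumption: trace.json was written by prof.export_chrome_trace above
# and follows the Chrome trace format ("traceEvents" with "dur" in us).
import json

with open("trace.json") as f:
    trace = json.load(f)

# Keep only complete events that report a duration, longest first.
events = [e for e in trace.get("traceEvents", []) if e.get("dur")]
events.sort(key=lambda e: e["dur"], reverse=True)

for e in events[:10]:
    print(f'{e["dur"] / 1e3:10.3f} ms  {e["name"]}')
```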
UV Install Logs:

```
Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]
Fetching 3 files:  33%|███▎      | 1/3 [00:11<00:23, 11.59s/it]
Fetching 3 files:  67%|██████▋   | 2/3 [00:16<00:07,  7.73s/it]
Fetching 3 files: 100%|██████████| 3/3 [00:16<00:00,  5.54s/it]
You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]
Loading checkpoint shards:  33%|███▎      | 1/3 [00:03<00:06,  3.23s/it]
Loading checkpoint shards:  67%|██████▋   | 2/3 [00:06<00:03,  3.15s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00,  2.50s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00,  2.68s/it]

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]
Fetching 6 files:  17%|█▋        | 1/6 [00:00<00:00,  5.23it/s]
Fetching 6 files:  50%|█████     | 3/6 [00:00<00:00,  6.19it/s]
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 12.15it/s]
/tmp/uvnote-run-hjgpkuq6/home/.cache/uv/environments-v2/setup-30bb029f3f83f37d/lib/python3.12/site-packages/kernels/layer.py:868: UserWarning:
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
  warnings.warn(
/tmp/uvnote-run-hjgpkuq6/home/.cache/uv/environments-v2/setup-30bb029f3f83f37d/lib/python3.12/site-packages/kernels/layer.py:868: UserWarning:
No kernel mapping found for layer `None`. Check if the layer name matches one of the kernels in the mapping or add the kernel you want to use to the mapping. Defaulting to original forward implementation.
  warnings.warn(
```
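The `No kernel mapping found for layer `None`` warnings are expected with this cell: the script deliberately sets `GptOssRMSNorm.kernel_layer_name` to `None` (see the TODO in the code above), so the RMSNorm layers fall back to their original forward implementation and only the MLP is dispatched to the Yamoe kernel.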
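The cell's final lines save the per-run times, memory stats, and average throughput to plain-text files "for charting". A minimal plotting sketch using the matplotlib dependency already declared in the script header, assuming the files sit in the working directory (the output name `benchmark_times.png` is illustrative, not part of the original cell):

```python
# Sketch: chart per-run latency from benchmark_times.txt.
# Assumption: the file written by the cell above is in the working directory.
import matplotlib

matplotlib.use("Agg")  # headless backend; must be set before importing pyplot
import matplotlib.pyplot as plt

with open("benchmark_times.txt") as f:
    times = [float(line) for line in f if line.strip()]

runs = list(range(1, len(times) + 1))
plt.figure(figsize=(6, 3))
plt.plot(runs, times, marker="o")
plt.xticks(runs)
plt.xlabel("Run")
plt.ylabel("Generation time (s)")
plt.title("64-token generation latency per benchmark run")
plt.tight_layout()
plt.savefig("benchmark_times.png")  # hypothetical output file
```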