Reference kernel
+
+
+
+
+
+▼ code
+▼ output
+ ▶ uv-logs
+ |
+Cell: forward_only | 100.45s | FAILED
+ |
+
+Raw
+
+
+
+
+
+
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
+85
+86
+87
+88
+89
+90
+91
+92
+93
+94
+95
+96
+
+
+
+
+
+
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+# "accelerate>=1.10.1",
+# "torch>=2.7.0",
+# "kernels==0.10.0",
+# "transformers@https://github.com/huggingface/transformers.git",
+# "ipdb>=0.13.13",
+# "matplotlib>=3.7.2",
+# "numpy>=1.24.3",
+# ]
+# ///
+
+import torch
+from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
+import time
+import torch.nn as nn
+from kernels import register_kernel_mapping, Mode, LayerRepository
+import sys
+import torch.profiler
+import gc
+import logging
+
+# Log at INFO level so the kernel-mapping messages are shown
+logging.basicConfig(level=logging.INFO)
+
def reset_peak_memory_stats():
    """Free unreferenced memory and reset the CUDA peak-memory counters.

    The Python garbage collector runs first so that tensors kept alive only
    by reference cycles are released before the CUDA caching allocator is
    asked to give up its unoccupied blocks. Every CUDA call is skipped when
    CUDA is unavailable, so the function is a plain ``gc.collect()`` on
    CPU-only machines.
    """
    # Collect before emptying the cache: memory freed by the collector would
    # otherwise remain held in the allocator cache.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
+
def get_memory_stats():
    """Report CUDA memory usage in decimal gigabytes (bytes / 1e9).

    Returns:
        A dict with the keys ``allocated_gb``, ``peak_gb`` and
        ``reserved_gb``. All three values are ``0`` when CUDA is unavailable.
    """
    if torch.cuda.is_available():
        bytes_per_gb = 1e9
        return {
            "allocated_gb": torch.cuda.memory_allocated() / bytes_per_gb,
            "peak_gb": torch.cuda.max_memory_allocated() / bytes_per_gb,
            "reserved_gb": torch.cuda.memory_reserved() / bytes_per_gb,
        }
    # No GPU: report zero for every counter.
    return dict.fromkeys(("allocated_gb", "peak_gb", "reserved_gb"), 0)
+
def override_kernel_layer_name(cls_name: str, value) -> bool:
    """Set ``kernel_layer_name`` on the first loaded ``nn.Module`` subclass named *cls_name*.

    Every module currently in ``sys.modules`` is searched for an attribute
    called *cls_name*; the first one that is an ``nn.Module`` subclass is
    patched and the search stops.

    Args:
        cls_name: Attribute name to look up on each loaded module.
        value: Value assigned to the class attribute ``kernel_layer_name``.

    Returns:
        True if a matching class was found and patched, False otherwise.
    """
    # Iterate over a snapshot: getattr() on a lazily-loading module can import
    # further modules, and sys.modules changing size while its live view is
    # being iterated raises RuntimeError.
    for mod in list(sys.modules.values()):
        if mod is None:
            continue
        obj = getattr(mod, cls_name, None)
        if isinstance(obj, type) and issubclass(obj, nn.Module):
            obj.kernel_layer_name = value
            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
            return True
    return False
+
+
# The hub kernels selected by use_kernels=True (e.g. LigerRMSNorm) launch
# Triton kernels, which reject CPU tensors. Fail early with a clear message
# rather than deep inside a kernel launch.
if not torch.cuda.is_available():
    raise RuntimeError("CUDA is required: use_kernels=True dispatches to GPU-only kernels")

# Init the model the normal way
model_id = "openai/gpt-oss-20b"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
# Dequantize the MXFP4 checkpoint at load time; the weights are then held in
# the dtype requested below.
quantization_config = Mxfp4Config(dequantize=True)


model = GptOssForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    # Keep the whole model on the GPU. The recorded run with
    # device_map="auto" failed in the RMSNorm Triton kernel with
    # "cannot be accessed from Triton (cpu tensor?)" -- presumably because
    # part of the model was placed off-GPU. If the model does not fit on one
    # GPU this now surfaces as an out-of-memory error at load time instead.
    device_map="cuda",
    use_kernels=True,
    quantization_config=quantization_config,
).eval()

# NOTE(review): the question is sent with the "system" role; confirm this is
# intended rather than "user".
messages = [
    {"role": "system", "content": "What is Tensor Parallelism?"},
]

# Move the prompt to wherever the model lives instead of a hard-coded device.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="low",
).to(model.device)

max_tokens = 512

# Greedy decoding; temperature=None goes with do_sample=False since no
# sampling takes place.
with torch.inference_mode():
    start_time = time.perf_counter()
    generated = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=False,
        temperature=None,
    )
    end_time = time.perf_counter()

print(tokenizer.decode(generated[0], skip_special_tokens=False))
print(f"Generation took {end_time - start_time:.2f} seconds")
+
+
+
+
+▶ UV Install Logs
+
+Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s]
+Fetching 3 files: 33%|███▎ | 1/3 [00:13<00:27, 13.93s/it]
+Fetching 3 files: 67%|██████▋ | 2/3 [00:17<00:08, 8.08s/it]
+Fetching 3 files: 100%|██████████| 3/3 [00:17<00:00, 5.97s/it]
+You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False
+
+Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]
+Loading checkpoint shards: 33%|███▎ | 1/3 [00:03<00:06, 3.23s/it]
+Loading checkpoint shards: 67%|██████▋ | 2/3 [00:06<00:03, 3.14s/it]
+Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00, 2.49s/it]
+Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00, 2.68s/it]
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+
+Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s]
+Fetching 66 files: 2%|▏ | 1/66 [00:00<00:15, 4.28it/s]
+Fetching 66 files: 26%|██▌ | 17/66 [00:01<00:03, 12.73it/s]
+Fetching 66 files: 100%|██████████| 66/66 [00:01<00:00, 47.76it/s]
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+
+Fetching 17 files: 0%| | 0/17 [00:00<?, ?it/s]
+Fetching 17 files: 65%|██████▍ | 11/17 [00:00<00:00, 104.99it/s]
+Fetching 17 files: 100%|██████████| 17/17 [00:00<00:00, 128.06it/s]
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm`
+Traceback (most recent call last):
+ File "/home/ubuntu/Projects/yamoe-gpt-integration/.uvnote/cells/forward_only.py", line 87, in <module>
+ generated = model.generate(
+ ^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context
+ return func(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/generation/utils.py", line 2546, in generate
+ result = decoding_method(
+ ^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/generation/utils.py", line 2766, in _sample
+ outputs = self(**model_inputs, return_dict=True)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
+ return self._call_impl(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
+ return forward_call(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/accelerate/hooks.py", line 175, in new_forward
+ output = module._old_forward(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/utils/generic.py", line 783, in wrapper
+ output = func(self, *args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/models/gpt_oss/modeling_gpt_oss.py", line 668, in forward
+ outputs: MoeModelOutputWithPast = self.model(
+ ^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
+ return self._call_impl(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
+ return forward_call(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/utils/generic.py", line 929, in wrapper
+ outputs = func(self, *args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/models/gpt_oss/modeling_gpt_oss.py", line 507, in forward
+ hidden_states = decoder_layer(
+ ^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/modeling_layers.py", line 94, in __call__
+ return super().__call__(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
+ return self._call_impl(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
+ return forward_call(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/accelerate/hooks.py", line 175, in new_forward
+ output = module._old_forward(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
+ return func(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/models/gpt_oss/modeling_gpt_oss.py", line 369, in forward
+ hidden_states = self.input_layernorm(hidden_states)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
+ return self._call_impl(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
+ return forward_call(*args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/huggingface/hub/models--kernels-community--liger_kernels/snapshots/7435d25a8faf175758be14046371a5b0c686f94c/build/torch-universal/liger_kernels/layers.py", line 30, in forward
+ return LigerRMSNormFunction.apply(
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/autograd/function.py", line 576, in apply
+ return super().apply(*args, **kwargs) # type: ignore[misc]
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/huggingface/hub/models--kernels-community--liger_kernels/snapshots/7435d25a8faf175758be14046371a5b0c686f94c/build/torch-universal/liger_kernels/utils.py", line 48, in wrapper
+ return fn(ctx, *args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/huggingface/hub/models--kernels-community--liger_kernels/snapshots/7435d25a8faf175758be14046371a5b0c686f94c/build/torch-universal/liger_kernels/rms_norm.py", line 338, in forward
+ Y, X, RSTD, BLOCK_SIZE, num_warps, casting_mode = rms_norm_forward(X, W, eps, offset, casting_mode)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/huggingface/hub/models--kernels-community--liger_kernels/snapshots/7435d25a8faf175758be14046371a5b0c686f94c/build/torch-universal/liger_kernels/rms_norm.py", line 230, in rms_norm_forward
+ _rms_norm_forward_kernel[(n_rows,)](
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/triton/runtime/jit.py", line 390, in <lambda>
+ return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/triton/runtime/jit.py", line 617, in run
+ kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata,
+ File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 708, in __call__
+ self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,
+ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?)
+