+

Reference kernel

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: forward_only | 100.45s | FAILED + | + +Raw +
+
+
+
+1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +
+
+
# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "accelerate>=1.10.1",
+#     "torch>=2.7.0",
+#     "kernels==0.10.0",
+#     "transformers@https://github.com/huggingface/transformers.git",
+#     "ipdb>=0.13.13",
+#     "matplotlib>=3.7.2",
+#     "numpy>=1.24.3",
+# ]
+# ///
+
+import torch
+from transformers import GptOssForCausalLM, PreTrainedTokenizerFast, Mxfp4Config
+import time
+import torch.nn as nn
+from kernels import register_kernel_mapping, Mode, LayerRepository
+import sys
+import torch.profiler
+import gc
+import logging
+
+# log at INFO level so the kernel-selection messages ("Using layer ... from repo ...") are shown
+logging.basicConfig(level=logging.INFO)
+
def reset_peak_memory_stats():
    """Free unreferenced memory and reset the CUDA peak-memory counters.

    The Python garbage collector runs first so that tensors kept alive only
    by reference cycles are handed back to PyTorch's caching allocator
    *before* the cache is emptied; emptying the cache first would leave that
    memory reserved. All CUDA calls are skipped when CUDA is unavailable, so
    the function is a safe no-op (apart from the GC pass) on CPU-only hosts.
    """
    # Collect first: empty_cache() can only release blocks that are already free.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
+
def get_memory_stats(device=None) -> dict[str, float]:
    """Return current, peak and reserved CUDA memory in gigabytes (1e9 bytes).

    Args:
        device: Optional CUDA device (index, string or ``torch.device``) to
            query. ``None`` (the default) queries the current device, which
            matches the previous behaviour of this helper.

    Returns:
        A dict with the keys ``"allocated_gb"``, ``"peak_gb"`` and
        ``"reserved_gb"``. Every value is a float; all are ``0.0`` when CUDA
        is not available.
    """
    if not torch.cuda.is_available():
        # Floats here keep the value type identical to the CUDA branch.
        return {"allocated_gb": 0.0, "peak_gb": 0.0, "reserved_gb": 0.0}

    bytes_per_gb = 1e9
    return {
        "allocated_gb": torch.cuda.memory_allocated(device) / bytes_per_gb,
        "peak_gb": torch.cuda.max_memory_allocated(device) / bytes_per_gb,
        "reserved_gb": torch.cuda.memory_reserved(device) / bytes_per_gb,
    }
+
def override_kernel_layer_name(cls_name: str, value) -> bool:
    """Set ``kernel_layer_name`` on the first ``nn.Module`` subclass named *cls_name*.

    Every module currently present in ``sys.modules`` is probed for an
    attribute called *cls_name*. The first one that is a class deriving from
    ``torch.nn.Module`` gets its ``kernel_layer_name`` class attribute set to
    *value*, and the search stops there.

    Args:
        cls_name: Name of the model class to look for.
        value: New value for the class attribute ``kernel_layer_name``.

    Returns:
        ``True`` if a matching class was found and patched, ``False`` otherwise.
    """
    # Iterate over a snapshot: getattr() on a module can trigger (lazy) imports
    # that add entries to sys.modules, and mutating the dict while iterating
    # its live view raises "dictionary changed size during iteration".
    for mod in tuple(sys.modules.values()):
        if mod is None:
            continue
        obj = getattr(mod, cls_name, None)
        if isinstance(obj, type) and issubclass(obj, nn.Module):
            setattr(obj, "kernel_layer_name", value)
            print(f"Overrode {cls_name}.kernel_layer_name to {value}")
            return True
    return False
+
+
# Init the model the normal way
model_id = "openai/gpt-oss-20b"
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_id)
# dequantize=True loads the MXFP4 checkpoint as bf16 weights.
quantization_config = Mxfp4Config(dequantize=True)

# Place the whole model on the GPU. The kernels enabled by use_kernels=True
# are Triton/CUDA kernels and cannot read CPU tensors; the recorded run of
# this cell failed with "Pointer argument (at 0) cannot be accessed from
# Triton (cpu tensor?)", which is consistent with device_map="auto" placing
# some layers on the CPU. With an explicit CUDA placement a model that does
# not fit fails at load time with an out-of-memory error instead.
model = GptOssForCausalLM.from_pretrained(
    model_id,
    dtype="bfloat16",
    device_map="cuda",
    use_kernels=True,
    quantization_config=quantization_config,
).eval()

# NOTE(review): the question is sent with the "system" role and there is no
# "user" turn. Left unchanged so the output stays comparable with other
# cells; confirm this is intended.
messages = [
    {"role": "system", "content": "What is Tensor Parallelism?"},
]

# Move the prompt to wherever the model actually lives rather than assuming
# a hard-coded device.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="low",
).to(model.device)

max_tokens = 512

with torch.inference_mode():
    # Synchronize around the timed region so pending GPU work from loading is
    # not counted and all generation kernels have finished when we stop.
    torch.cuda.synchronize()
    start_time = time.perf_counter()
    generated = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=False,  # greedy decoding
        temperature=None,  # unset the sampling temperature, unused when greedy
    )
    torch.cuda.synchronize()
    end_time = time.perf_counter()

print(tokenizer.decode(generated[0], skip_special_tokens=False))
print(f"Generation took {end_time - start_time:.2f} seconds")
+
+ +
+
+
+
+
+
+
▶ UV Install Logs
+ +
+
Fetching 3 files: 0%| | 0/3 [00:00<?, ?it/s] +Fetching 3 files: 33%|███▎ | 1/3 [00:13<00:27, 13.93s/it] +Fetching 3 files: 67%|██████▋ | 2/3 [00:17<00:08, 8.08s/it] +Fetching 3 files: 100%|██████████| 3/3 [00:17<00:00, 5.97s/it] +You are using full precision kernels, we will dequantize the model to bf16. To use the quantized model with quantization kernels, please set use_kernels=False + +Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s] +Loading checkpoint shards: 33%|███▎ | 1/3 [00:03<00:06, 3.23s/it] +Loading checkpoint shards: 67%|██████▋ | 2/3 [00:06<00:03, 3.14s/it] +Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00, 2.49s/it] +Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00, 2.68s/it] +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` + +Fetching 66 files: 0%| | 0/66 [00:00<?, ?it/s] +Fetching 66 files: 2%|▏ | 1/66 [00:00<00:15, 4.28it/s] +Fetching 66 files: 26%|██▌ | 17/66 [00:01<00:03, 12.73it/s] +Fetching 66 files: 100%|██████████| 66/66 [00:01<00:00, 47.76it/s] +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` + +Fetching 17 files: 0%| | 0/17 [00:00<?, ?it/s] +Fetching 17 files: 65%|██████▍ | 11/17 [00:00<00:00, 104.99it/s] +Fetching 17 files: 100%|██████████| 17/17 [00:00<00:00, 128.06it/s] +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: 
main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo 
`kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` 
from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer 
`MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` 
+INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer 
`MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: 
main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` 
(revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo 
`kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `MegaBlocksMoeMLP` from repo `kernels-community/megablocks` (revision: main) for layer `MegaBlocksMoeMLP` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +INFO:root:Using layer `LigerRMSNorm` from repo `kernels-community/liger_kernels` (revision: main) for layer `LigerRMSNorm` +Traceback (most recent call last): + File "/home/ubuntu/Projects/yamoe-gpt-integration/.uvnote/cells/forward_only.py", line 87, in <module> + generated = model.generate( + ^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context + return func(*args, **kwargs) + 
^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/generation/utils.py", line 2546, in generate + result = decoding_method( + ^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/generation/utils.py", line 2766, in _sample + outputs = self(**model_inputs, return_dict=True) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/accelerate/hooks.py", line 175, in new_forward + output = module._old_forward(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/utils/generic.py", line 783, in wrapper + output = func(self, *args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/models/gpt_oss/modeling_gpt_oss.py", line 668, in forward + outputs: MoeModelOutputWithPast = self.model( + ^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl + return 
self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/utils/generic.py", line 929, in wrapper + outputs = func(self, *args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/models/gpt_oss/modeling_gpt_oss.py", line 507, in forward + hidden_states = decoder_layer( + ^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/modeling_layers.py", line 94, in __call__ + return super().__call__(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/accelerate/hooks.py", line 175, in new_forward + output = module._old_forward(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/utils/deprecation.py", line 172, 
in wrapped_func + return func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/transformers/models/gpt_oss/modeling_gpt_oss.py", line 369, in forward + hidden_states = self.input_layernorm(hidden_states) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl + return self._call_impl(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl + return forward_call(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/huggingface/hub/models--kernels-community--liger_kernels/snapshots/7435d25a8faf175758be14046371a5b0c686f94c/build/torch-universal/liger_kernels/layers.py", line 30, in forward + return LigerRMSNormFunction.apply( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/torch/autograd/function.py", line 576, in apply + return super().apply(*args, **kwargs) # type: ignore[misc] + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/huggingface/hub/models--kernels-community--liger_kernels/snapshots/7435d25a8faf175758be14046371a5b0c686f94c/build/torch-universal/liger_kernels/utils.py", line 48, in wrapper + return fn(ctx, *args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/huggingface/hub/models--kernels-community--liger_kernels/snapshots/7435d25a8faf175758be14046371a5b0c686f94c/build/torch-universal/liger_kernels/rms_norm.py", line 338, in forward + Y, X, RSTD, BLOCK_SIZE, num_warps, casting_mode = rms_norm_forward(X, W, 
eps, offset, casting_mode) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/huggingface/hub/models--kernels-community--liger_kernels/snapshots/7435d25a8faf175758be14046371a5b0c686f94c/build/torch-universal/liger_kernels/rms_norm.py", line 230, in rms_norm_forward + _rms_norm_forward_kernel[(n_rows,)]( + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/triton/runtime/jit.py", line 390, in <lambda> + return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/triton/runtime/jit.py", line 617, in run + kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata, launch_metadata, + File "/tmp/uvnote-run-b1sqpts5/home/.cache/uv/environments-v2/forward-only-e48b1c4099197fb9/lib/python3.12/site-packages/triton/backends/nvidia/driver.py", line 708, in __call__ + self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl, +ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?)
+
+
+ +

Backwards

+