+

on_github: huggingface/kernels-uvnotes

+

Torch LayerNorm Implementation

+

GPU Info

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: nv | 0.22s + | + +Raw +
+
+
+
import subprocess

# Print the `nvidia-smi` report so the page records which GPU and driver
# were present when the cells ran.
try:
    smi = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
except FileNotFoundError:
    # Hosts without the NVIDIA driver have no `nvidia-smi` binary; say so
    # instead of aborting the cell with a traceback.
    print("nvidia-smi not found; no NVIDIA GPU information available")
else:
    print(smi.stdout)
+
+ +
+
+
+
+
Wed Oct 22 08:58:23 2025 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 570.195.03 Driver Version: 570.195.03 CUDA Version: 12.8 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+========================+======================| +| 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 | +| N/A 26C P8 22W / 350W | 0MiB / 46068MiB | 0% Default | +| | | N/A | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| +| No running processes found | ++-----------------------------------------------------------------------------------------+ + +
+
+
+ +

LayerNorm Benchmark (PyTorch)

+
+
+ +▼ code +▼ output + ▶ uv-logs + | +Cell: benchmark | 0.01s | FAILED + | + +Raw +
+
+
+
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "numpy",
#     "torch",
#     "kernels-benchmark-tools",
# ]
#
# [tool.uv.sources]
# # NOTE(review): absolute, machine-specific path; dependency resolution fails
# # with "Distribution not found" on any host where this directory is missing.
# kernels-benchmark-tools = { path = "/home/ubuntu/Projects/kernels-benchmarks-consolidated/tools", editable = true }
# ///
+import torch
+import kernels_benchmark_tools as kbt
+
+
+def torch_layer_norm(x, weight, bias, eps: float = 1e-5):
+    return torch.nn.functional.layer_norm(x, (x.shape[-1],), weight, bias, eps)
+
# Hand the PyTorch implementation to kernels_benchmark_tools under the name
# "torch_layer_norm".
# NOTE(review): the tags presumably let the tooling group results by
# family/op — confirm against kernels_benchmark_tools.
kbt.add(
    "torch_layer_norm",
    torch_layer_norm,
    tags={"family": "torch", "op": "layer_norm"},
)
+
if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Pick dtype and workload set together so the two cannot drift apart:
    # bfloat16 with the Llama workloads on CUDA, float32 with the CPU
    # workloads otherwise.
    if device == "cuda":
        dtype = "bfloat16"
        wl = list(kbt.layer_norm.llama_workloads(dtype))
    else:
        dtype = "float32"
        wl = list(kbt.layer_norm.cpu_workloads(dtype))

    # The same file name is passed to both `run` and `summarize`.
    # NOTE(review): presumably `run` writes the records that `summarize`
    # reads back — confirm against kernels_benchmark_tools.
    results_path = "ln.jsonl"

    kbt.run(
        wl,
        jsonl=results_path,
        reps=5,
        warmup=2,
        gen=kbt.layer_norm.gen_inputs,
        ref=kbt.layer_norm.ref_layer_norm,
        cmp=kbt.layer_norm.cmp_allclose,
        profile_trace=False,
    )
    kbt.summarize([results_path])
+
+ +
+
+
+
+
× Failed to resolve script requirement + ╰─▶ Distribution not found at: + file:///home/ubuntu/Projects/kernels-benchmarks-consolidated/tools +
+
+
+