robtaylor-chipflow committed on
Commit
6c81401
·
0 Parent(s):

Add Metal fused_add_rms_norm + rms_norm kernels for vLLM

Browse files

New Metal kernels implementing both rms_norm and fused_add_rms_norm
with the exact signatures vLLM expects.

rms_norm(out, input, weight, epsilon):
out = (input / RMS(input)) * weight

fused_add_rms_norm(input, residual, weight, epsilon):
residual += input
input = (residual / RMS(residual)) * weight

The fused variant saves memory bandwidth by combining residual
addition and variance accumulation into a single pass. Every
transformer layer calls this operation.

Features:
- Supports fp16, bf16, fp32 dtypes
- Threadgroup-wide reduction using simd_sum + shared memory
- Float32 accumulation for numerical stability
- Handles strided input layouts (input_stride parameter)
- Comprehensive tests with property-based checks

Co-developed-by: Claude Code v2.1.50 (claude-opus-4-6)

build.toml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [general]
2
+ name = "fused_rms_norm"
3
+ backends = ["metal"]
4
+
5
+ [torch]
6
+ src = [
7
+ "torch-ext/torch_binding.cpp",
8
+ "torch-ext/torch_binding.h",
9
+ ]
10
+
11
+ [kernel.fused_rms_norm_metal]
12
+ backend = "metal"
13
+ src = [
14
+ "fused-rms-norm-metal/rms_norm.metal",
15
+ "fused-rms-norm-metal/rms_norm.mm",
16
+ "fused-rms-norm-metal/utils.metal",
17
+ ]
18
+ depends = ["torch"]
flake.nix ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ description = "Flake for fused RMS normalization kernel";
3
+
4
+ inputs = {
5
+ kernel-builder.url = "github:huggingface/kernel-builder";
6
+ };
7
+
8
+ outputs =
9
+ {
10
+ self,
11
+ kernel-builder,
12
+ }:
13
+ kernel-builder.lib.genFlakeOutputs {
14
+ inherit self;
15
+ path = ./.;
16
+ };
17
+ }
fused-rms-norm-metal/rms_norm.metal ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <metal_stdlib>
2
+ #include "utils.metal"
3
+
4
+ using namespace metal;
5
+
6
+ // Maximum number of simdgroups per threadgroup for reduction.
7
+ // 512 threads / 32 threads per simdgroup = 16 simdgroups max.
8
+ constant constexpr int MAX_SIMDGROUPS = 16;
9
+
10
+ // Threadgroup-wide sum reduction using simdgroups.
11
+ // Each thread contributes a value; returns the total sum to all threads.
12
+ static inline float threadgroup_reduce_sum(
13
+ float value,
14
+ threadgroup float *shared [[threadgroup(0)]],
15
+ uint tid [[thread_position_in_threadgroup]],
16
+ uint tg_size [[threads_per_threadgroup]]) {
17
+
18
+ // Phase 1: reduce within each simdgroup.
19
+ float simd_val = simd_sum(value);
20
+
21
+ // Phase 2: first thread of each simdgroup writes to shared memory.
22
+ uint simdgroup_id = tid / 32;
23
+ uint lane_id = tid % 32;
24
+ if (lane_id == 0) {
25
+ shared[simdgroup_id] = simd_val;
26
+ }
27
+ threadgroup_barrier(mem_flags::mem_threadgroup);
28
+
29
+ // Phase 3: first simdgroup reduces across simdgroup partial sums.
30
+ uint num_simdgroups = (tg_size + 31) / 32;
31
+ float result = 0.0f;
32
+ if (tid < num_simdgroups) {
33
+ result = shared[tid];
34
+ }
35
+ result = simd_sum(result);
36
+
37
+ // Broadcast result to all threads via shared memory.
38
+ if (tid == 0) {
39
+ shared[0] = result;
40
+ }
41
+ threadgroup_barrier(mem_flags::mem_threadgroup);
42
+ return shared[0];
43
+ }
44
+
45
+ // RMS normalization kernel.
46
+ // out[token, i] = (input[token, i] / RMS(input[token, :])) * weight[i]
47
+ // where RMS = sqrt(mean(x^2) + epsilon)
48
+ //
49
+ // One threadgroup per token. Threads stride across hidden_size.
50
+ template <typename scalar_t>
51
+ kernel void rms_norm_kernel(
52
+ device scalar_t *out [[buffer(0)]],
53
+ const device scalar_t *input [[buffer(1)]],
54
+ const device scalar_t *weight [[buffer(2)]],
55
+ const device float &epsilon [[buffer(3)]],
56
+ const device int &num_tokens [[buffer(4)]],
57
+ const device int &hidden_size [[buffer(5)]],
58
+ const device int64_t &input_stride [[buffer(6)]],
59
+ threadgroup float *shared [[threadgroup(0)]],
60
+ uint token_idx [[threadgroup_position_in_grid]],
61
+ uint tid [[thread_position_in_threadgroup]],
62
+ uint tg_size [[threads_per_threadgroup]]) {
63
+
64
+ // Phase 1: accumulate sum of squares for variance.
65
+ float variance = 0.0f;
66
+ for (int i = tid; i < hidden_size; i += tg_size) {
67
+ float x = static_cast<float>(input[token_idx * input_stride + i]);
68
+ variance += x * x;
69
+ }
70
+
71
+ // Phase 2: reduce variance across threadgroup.
72
+ variance = threadgroup_reduce_sum(variance, shared, tid, tg_size);
73
+
74
+ // Phase 3: compute scaling factor.
75
+ float s_variance = rsqrt(variance / static_cast<float>(hidden_size) + epsilon);
76
+
77
+ // Phase 4: normalize and scale.
78
+ for (int i = tid; i < hidden_size; i += tg_size) {
79
+ float x = static_cast<float>(input[token_idx * input_stride + i]);
80
+ float w = static_cast<float>(weight[i]);
81
+ out[token_idx * hidden_size + i] = static_cast<scalar_t>(x * s_variance * w);
82
+ }
83
+ }
84
+
85
+ // Fused residual addition + RMS normalization kernel.
86
+ //
87
+ // After execution:
88
+ // residual[token, i] = old_residual[token, i] + old_input[token, i]
89
+ // input[token, i] = rms_norm(new_residual[token, :]) * weight[i]
90
+ //
91
+ // This fuses two memory passes into one: the residual addition and variance
92
+ // accumulation happen in the same loop, saving memory bandwidth.
93
+ template <typename scalar_t>
94
+ kernel void fused_add_rms_norm_kernel(
95
+ device scalar_t *input [[buffer(0)]],
96
+ device scalar_t *residual [[buffer(1)]],
97
+ const device scalar_t *weight [[buffer(2)]],
98
+ const device float &epsilon [[buffer(3)]],
99
+ const device int &num_tokens [[buffer(4)]],
100
+ const device int &hidden_size [[buffer(5)]],
101
+ const device int64_t &input_stride [[buffer(6)]],
102
+ threadgroup float *shared [[threadgroup(0)]],
103
+ uint token_idx [[threadgroup_position_in_grid]],
104
+ uint tid [[thread_position_in_threadgroup]],
105
+ uint tg_size [[threads_per_threadgroup]]) {
106
+
107
+ // Phase 1: add residual and accumulate variance in one pass.
108
+ float variance = 0.0f;
109
+ for (int i = tid; i < hidden_size; i += tg_size) {
110
+ float inp = static_cast<float>(input[token_idx * input_stride + i]);
111
+ float res = static_cast<float>(residual[token_idx * hidden_size + i]);
112
+ float z = inp + res;
113
+ variance += z * z;
114
+ residual[token_idx * hidden_size + i] = static_cast<scalar_t>(z);
115
+ }
116
+
117
+ // Phase 2: reduce variance across threadgroup.
118
+ variance = threadgroup_reduce_sum(variance, shared, tid, tg_size);
119
+
120
+ // Phase 3: compute scaling factor.
121
+ float s_variance = rsqrt(variance / static_cast<float>(hidden_size) + epsilon);
122
+
123
+ // Phase 4: read updated residual, normalize, and write to input.
124
+ for (int i = tid; i < hidden_size; i += tg_size) {
125
+ float x = static_cast<float>(residual[token_idx * hidden_size + i]);
126
+ float w = static_cast<float>(weight[i]);
127
+ input[token_idx * input_stride + i] = static_cast<scalar_t>(x * s_variance * w);
128
+ }
129
+ }
130
+
131
+ // Instantiate kernel variants.
132
+ #define instantiate_rms_norm(type) \
133
+ template [[host_name("rms_norm_" #type)]] [[kernel]] void \
134
+ rms_norm_kernel<type>( \
135
+ device type *out [[buffer(0)]], \
136
+ const device type *input [[buffer(1)]], \
137
+ const device type *weight [[buffer(2)]], \
138
+ const device float &epsilon [[buffer(3)]], \
139
+ const device int &num_tokens [[buffer(4)]], \
140
+ const device int &hidden_size [[buffer(5)]], \
141
+ const device int64_t &input_stride [[buffer(6)]], \
142
+ threadgroup float *shared [[threadgroup(0)]], \
143
+ uint token_idx [[threadgroup_position_in_grid]], \
144
+ uint tid [[thread_position_in_threadgroup]], \
145
+ uint tg_size [[threads_per_threadgroup]]);
146
+
147
+ #define instantiate_fused_add_rms_norm(type) \
148
+ template [[host_name("fused_add_rms_norm_" #type)]] [[kernel]] void \
149
+ fused_add_rms_norm_kernel<type>( \
150
+ device type *input [[buffer(0)]], \
151
+ device type *residual [[buffer(1)]], \
152
+ const device type *weight [[buffer(2)]], \
153
+ const device float &epsilon [[buffer(3)]], \
154
+ const device int &num_tokens [[buffer(4)]], \
155
+ const device int &hidden_size [[buffer(5)]], \
156
+ const device int64_t &input_stride [[buffer(6)]], \
157
+ threadgroup float *shared [[threadgroup(0)]], \
158
+ uint token_idx [[threadgroup_position_in_grid]], \
159
+ uint tid [[thread_position_in_threadgroup]], \
160
+ uint tg_size [[threads_per_threadgroup]]);
161
+
162
+ instantiate_rms_norm(float);
163
+ instantiate_rms_norm(half);
164
+ instantiate_rms_norm(bfloat16_t);
165
+
166
+ instantiate_fused_add_rms_norm(float);
167
+ instantiate_fused_add_rms_norm(half);
168
+ instantiate_fused_add_rms_norm(bfloat16_t);
fused-rms-norm-metal/rms_norm.mm ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <ATen/mps/MPSDevice.h>
2
+ #include <ATen/mps/MPSStream.h>
3
+ #include <torch/torch.h>
4
+
5
+ #import <Foundation/Foundation.h>
6
+ #import <Metal/Metal.h>
7
+ #include <dlfcn.h>
8
+ #include <string>
9
+
10
+ static inline id<MTLBuffer> getMTLBufferStorage(const torch::Tensor &tensor) {
11
+ return __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
12
+ }
13
+
14
+ static std::string getModuleDirectory() {
15
+ Dl_info dl_info;
16
+ if (dladdr((void *)getModuleDirectory, &dl_info)) {
17
+ std::string path(dl_info.dli_fname);
18
+ size_t pos = path.find_last_of('/');
19
+ if (pos != std::string::npos) {
20
+ return path.substr(0, pos);
21
+ }
22
+ }
23
+ return ".";
24
+ }
25
+
26
+ // Helper to select kernel name by dtype.
27
+ static NSString *kernelNameForDtype(const char *prefix,
28
+ torch::ScalarType dtype) {
29
+ switch (dtype) {
30
+ case torch::kFloat:
31
+ return [NSString stringWithFormat:@"%s_float", prefix];
32
+ case torch::kHalf:
33
+ return [NSString stringWithFormat:@"%s_half", prefix];
34
+ case torch::kBFloat16:
35
+ return [NSString stringWithFormat:@"%s_bfloat16_t", prefix];
36
+ default:
37
+ TORCH_CHECK(false, "Unsupported dtype: ", dtype);
38
+ return nil;
39
+ }
40
+ }
41
+
42
+ // Helper to load metallib and create pipeline state.
43
+ static id<MTLComputePipelineState>
44
+ createPipeline(id<MTLDevice> device, NSString *kernName, NSError **error) {
45
+ std::string moduleDir = getModuleDirectory();
46
+ std::string metallibPath = moduleDir + "/" + METALLIB_PATH;
47
+
48
+ NSString *metallibPathStr =
49
+ [NSString stringWithUTF8String:metallibPath.c_str()];
50
+ NSURL *metallibURL = [NSURL fileURLWithPath:metallibPathStr];
51
+
52
+ id<MTLLibrary> lib = [device newLibraryWithURL:metallibURL error:error];
53
+ TORCH_CHECK(lib, "Failed to load Metal library at ", metallibPath,
54
+ *error ? [NSString stringWithFormat:@": %@",
55
+ (*error).localizedDescription]
56
+ .UTF8String
57
+ : "");
58
+
59
+ id<MTLFunction> fn = [lib newFunctionWithName:kernName];
60
+ TORCH_CHECK(fn, "Missing Metal kernel function: ", kernName.UTF8String);
61
+
62
+ return [device newComputePipelineStateWithFunction:fn error:error];
63
+ }
64
+
65
+ // Dispatch a layernorm kernel with 7 buffer bindings + threadgroup memory.
66
+ static void dispatchNormKernel(id<MTLComputePipelineState> pso,
67
+ at::mps::MPSStream *stream,
68
+ id<MTLCommandBuffer> cmdBuf,
69
+ // Buffers 0-2: tensor buffers with offsets
70
+ id<MTLBuffer> buf0, NSUInteger off0,
71
+ id<MTLBuffer> buf1, NSUInteger off1,
72
+ id<MTLBuffer> buf2, NSUInteger off2,
73
+ // Scalars
74
+ float epsilon, int32_t num_tokens,
75
+ int32_t hidden_size, int64_t input_stride,
76
+ // Grid
77
+ uint32_t threadgroups,
78
+ uint32_t threads_per_tg) {
79
+ // Shared memory: MAX_SIMDGROUPS (16) floats for reduction.
80
+ const uint32_t shared_mem_size = 16 * sizeof(float);
81
+
82
+ dispatch_queue_t q = stream->queue();
83
+ dispatch_sync(q, ^{
84
+ id<MTLComputeCommandEncoder> enc = [cmdBuf computeCommandEncoder];
85
+ TORCH_CHECK(enc, "Failed to create compute encoder");
86
+
87
+ [enc setComputePipelineState:pso];
88
+
89
+ [enc setBuffer:buf0 offset:off0 atIndex:0];
90
+ [enc setBuffer:buf1 offset:off1 atIndex:1];
91
+ [enc setBuffer:buf2 offset:off2 atIndex:2];
92
+
93
+ [enc setBytes:&epsilon length:sizeof(float) atIndex:3];
94
+ [enc setBytes:&num_tokens length:sizeof(int32_t) atIndex:4];
95
+ [enc setBytes:&hidden_size length:sizeof(int32_t) atIndex:5];
96
+ [enc setBytes:&input_stride length:sizeof(int64_t) atIndex:6];
97
+
98
+ [enc setThreadgroupMemoryLength:shared_mem_size atIndex:0];
99
+
100
+ MTLSize grid = MTLSizeMake(threadgroups, 1, 1);
101
+ MTLSize tg = MTLSizeMake(threads_per_tg, 1, 1);
102
+ [enc dispatchThreadgroups:grid threadsPerThreadgroup:tg];
103
+ [enc endEncoding];
104
+ });
105
+
106
+ stream->synchronize(at::mps::SyncType::COMMIT);
107
+ }
108
+
109
+ void rms_norm(torch::Tensor &out, torch::Tensor &input,
110
+ torch::Tensor &weight, double epsilon) {
111
+ TORCH_CHECK(out.is_contiguous(), "out must be contiguous");
112
+ TORCH_CHECK(weight.is_contiguous(), "weight must be contiguous");
113
+ TORCH_CHECK(input.device().is_mps(), "input must be on MPS device");
114
+
115
+ const int hidden_size = input.size(-1);
116
+ const int64_t input_stride = input.stride(-2);
117
+ const int num_tokens =
118
+ static_cast<int>(input.numel() / hidden_size);
119
+
120
+ @autoreleasepool {
121
+ at::mps::MPSStream *stream = at::mps::getCurrentMPSStream();
122
+ TORCH_CHECK(stream, "Failed to get MPS stream");
123
+
124
+ id<MTLDevice> device = stream->device();
125
+ id<MTLCommandBuffer> cmdBuf = stream->commandBuffer();
126
+ NSError *error = nil;
127
+
128
+ NSString *kernName = kernelNameForDtype("rms_norm", input.scalar_type());
129
+ id<MTLComputePipelineState> pso = createPipeline(device, kernName, &error);
130
+ TORCH_CHECK(pso, "Pipeline creation failed",
131
+ error ? [NSString stringWithFormat:@": %@",
132
+ error.localizedDescription]
133
+ .UTF8String
134
+ : "");
135
+
136
+ const uint32_t threads_per_tg =
137
+ std::min<uint32_t>(512, hidden_size);
138
+
139
+ dispatchNormKernel(
140
+ pso, stream, cmdBuf,
141
+ getMTLBufferStorage(out),
142
+ out.storage_offset() * out.element_size(),
143
+ getMTLBufferStorage(input),
144
+ input.storage_offset() * input.element_size(),
145
+ getMTLBufferStorage(weight),
146
+ weight.storage_offset() * weight.element_size(),
147
+ static_cast<float>(epsilon), static_cast<int32_t>(num_tokens),
148
+ static_cast<int32_t>(hidden_size), input_stride,
149
+ static_cast<uint32_t>(num_tokens), threads_per_tg);
150
+ }
151
+ }
152
+
153
+ void fused_add_rms_norm(torch::Tensor &input, torch::Tensor &residual,
154
+ torch::Tensor &weight, double epsilon) {
155
+ TORCH_CHECK(residual.is_contiguous(), "residual must be contiguous");
156
+ TORCH_CHECK(weight.is_contiguous(), "weight must be contiguous");
157
+ TORCH_CHECK(input.device().is_mps(), "input must be on MPS device");
158
+ TORCH_CHECK(input.scalar_type() == residual.scalar_type(),
159
+ "input and residual must have same dtype");
160
+
161
+ const int hidden_size = input.size(-1);
162
+ const int64_t input_stride = input.stride(-2);
163
+ const int num_tokens =
164
+ static_cast<int>(input.numel() / hidden_size);
165
+
166
+ @autoreleasepool {
167
+ at::mps::MPSStream *stream = at::mps::getCurrentMPSStream();
168
+ TORCH_CHECK(stream, "Failed to get MPS stream");
169
+
170
+ id<MTLDevice> device = stream->device();
171
+ id<MTLCommandBuffer> cmdBuf = stream->commandBuffer();
172
+ NSError *error = nil;
173
+
174
+ NSString *kernName =
175
+ kernelNameForDtype("fused_add_rms_norm", input.scalar_type());
176
+ id<MTLComputePipelineState> pso = createPipeline(device, kernName, &error);
177
+ TORCH_CHECK(pso, "Pipeline creation failed",
178
+ error ? [NSString stringWithFormat:@": %@",
179
+ error.localizedDescription]
180
+ .UTF8String
181
+ : "");
182
+
183
+ const uint32_t threads_per_tg =
184
+ std::min<uint32_t>(512, hidden_size);
185
+
186
+ dispatchNormKernel(
187
+ pso, stream, cmdBuf,
188
+ getMTLBufferStorage(input),
189
+ input.storage_offset() * input.element_size(),
190
+ getMTLBufferStorage(residual),
191
+ residual.storage_offset() * residual.element_size(),
192
+ getMTLBufferStorage(weight),
193
+ weight.storage_offset() * weight.element_size(),
194
+ static_cast<float>(epsilon), static_cast<int32_t>(num_tokens),
195
+ static_cast<int32_t>(hidden_size), input_stride,
196
+ static_cast<uint32_t>(num_tokens), threads_per_tg);
197
+ }
198
+ }
fused-rms-norm-metal/utils.metal ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <metal_stdlib>
2
+ using namespace metal;
3
+
4
+ #if defined(__HAVE_BFLOAT__)
5
+
6
+ typedef bfloat bfloat16_t;
7
+
8
+ #else
9
+
10
+ constexpr METAL_FUNC uint16_t float_to_bfloat_bits(float x) {
11
+ if ((as_type<uint32_t>(x) & ~_fp_encoding_traits<float>::sign_mask) >
12
+ _fp_encoding_traits<float>::inf_mask) {
13
+ return uint16_t(as_type<uint32_t>(0x7FC0));
14
+ }
15
+ uint32_t float_bits = as_type<uint32_t>(x);
16
+ float_bits += ((float_bits >> 16) & 1) + as_type<uint32_t>(0x7FFF);
17
+ return float_bits >> 16;
18
+ }
19
+
20
+ constexpr METAL_FUNC float bfloat_bits_to_float(uint16_t x) {
21
+ return as_type<float>((uint32_t)x << 16);
22
+ }
23
+
24
+ struct _MLX_BFloat16;
25
+
26
+ template <typename T>
27
+ static constexpr constant bool can_convert_to_bfloat =
28
+ !is_same_v<T, _MLX_BFloat16> && is_convertible_v<T, float>;
29
+
30
+ template <typename T>
31
+ static constexpr constant bool can_convert_from_bfloat =
32
+ !is_same_v<T, _MLX_BFloat16> && is_convertible_v<float, T>;
33
+
34
+ struct _MLX_BFloat16 {
35
+ uint16_t bits_;
36
+ _MLX_BFloat16() thread = default;
37
+ _MLX_BFloat16() threadgroup = default;
38
+ _MLX_BFloat16() device = default;
39
+ _MLX_BFloat16() constant = default;
40
+
41
+ struct bits_to_bfloat_struct {};
42
+ static constexpr METAL_FUNC bits_to_bfloat_struct bits_to_bfloat() {
43
+ return bits_to_bfloat_struct();
44
+ }
45
+ constexpr METAL_FUNC _MLX_BFloat16(uint16_t bits, bits_to_bfloat_struct)
46
+ : bits_(bits) {}
47
+
48
+ template <typename T,
49
+ typename = typename enable_if<can_convert_to_bfloat<T>>::type>
50
+ constexpr METAL_FUNC _MLX_BFloat16(T x) thread
51
+ : bits_(float_to_bfloat_bits(static_cast<float>(x))) {}
52
+
53
+ template <typename T,
54
+ typename = typename enable_if<can_convert_to_bfloat<T>>::type>
55
+ constexpr METAL_FUNC _MLX_BFloat16(T x) threadgroup
56
+ : bits_(float_to_bfloat_bits(static_cast<float>(x))) {}
57
+
58
+ template <typename T,
59
+ typename = typename enable_if<can_convert_to_bfloat<T>>::type>
60
+ constexpr METAL_FUNC _MLX_BFloat16(T x) device
61
+ : bits_(float_to_bfloat_bits(static_cast<float>(x))) {}
62
+
63
+ template <typename T,
64
+ typename = typename enable_if<can_convert_to_bfloat<T>>::type>
65
+ constexpr METAL_FUNC _MLX_BFloat16(T x) constant
66
+ : bits_(float_to_bfloat_bits(static_cast<float>(x))) {}
67
+
68
+ template <typename T,
69
+ typename = typename enable_if<can_convert_from_bfloat<T>>::type>
70
+ constexpr METAL_FUNC operator T() const thread {
71
+ return static_cast<T>(bfloat_bits_to_float(bits_));
72
+ }
73
+
74
+ template <typename T,
75
+ typename = typename enable_if<can_convert_from_bfloat<T>>::type>
76
+ constexpr METAL_FUNC operator T() const threadgroup {
77
+ return static_cast<T>(bfloat_bits_to_float(bits_));
78
+ }
79
+
80
+ template <typename T,
81
+ typename = typename enable_if<can_convert_from_bfloat<T>>::type>
82
+ constexpr METAL_FUNC operator T() const device {
83
+ return static_cast<T>(bfloat_bits_to_float(bits_));
84
+ }
85
+
86
+ template <typename T,
87
+ typename = typename enable_if<can_convert_from_bfloat<T>>::type>
88
+ constexpr METAL_FUNC operator T() constant {
89
+ return static_cast<T>(bfloat_bits_to_float(bits_));
90
+ }
91
+ };
92
+
93
+ constexpr METAL_FUNC _MLX_BFloat16 operator-(_MLX_BFloat16 x) {
94
+ return -static_cast<float>(x);
95
+ }
96
+
97
+ #define bfloat_binop_base(__op__, __operator__, otype, atype, btype, ctype) \
98
+ constexpr METAL_FUNC otype __operator__(atype lhs, btype rhs) { \
99
+ return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs); \
100
+ }
101
+
102
+ #define bfloat_binop_helper(__op__, __operator__, otype, itype, ctype) \
103
+ constexpr METAL_FUNC otype __operator__(_MLX_BFloat16 lhs, itype rhs) { \
104
+ return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs); \
105
+ } \
106
+ constexpr METAL_FUNC otype __operator__(itype lhs, _MLX_BFloat16 rhs) { \
107
+ return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs); \
108
+ }
109
+
110
+ #define bfloat_binop(_op_, _operator_) \
111
+ bfloat_binop_base(_op_, _operator_, _MLX_BFloat16, _MLX_BFloat16, \
112
+ _MLX_BFloat16, float); \
113
+ bfloat_binop_helper(_op_, _operator_, float, float, float); \
114
+ bfloat_binop_helper(_op_, _operator_, float, half, float); \
115
+ bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, int32_t, float); \
116
+ bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, uint32_t, float); \
117
+ bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, int64_t, float); \
118
+ bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, uint64_t, float);
119
+
120
+ bfloat_binop(+, operator+);
121
+ bfloat_binop(-, operator-);
122
+ bfloat_binop(*, operator*);
123
+ bfloat_binop(/, operator/);
124
+
125
+ #undef bfloat_binop_base
126
+ #undef bfloat_binop_helper
127
+ #undef bfloat_binop
128
+
129
+ typedef struct _MLX_BFloat16 bfloat16_t;
130
+
131
+ #endif
tests/test_rms_norm.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for Metal RMS normalization kernels.
2
+
3
+ Validates rms_norm and fused_add_rms_norm against PyTorch reference
4
+ implementations across dtypes and hidden sizes.
5
+ """
6
+
7
+ import pytest
8
+ import torch
9
+
10
+ import fused_rms_norm as ops
11
+
12
+
13
+ def _is_mps_available() -> bool:
14
+ return hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
15
+
16
+
17
+ if _is_mps_available():
18
+ DEVICES = ["mps"]
19
+ else:
20
+ DEVICES = [f"cuda:{i}" for i in range(max(1, torch.cuda.device_count()))]
21
+
22
+ DTYPES = [torch.float32, torch.float16, torch.bfloat16]
23
+ HIDDEN_SIZES = [128, 768, 2048, 4096]
24
+ NUM_TOKENS = [1, 7, 32]
25
+ EPSILON = 1e-6
26
+
27
+
28
+ def _ref_rms_norm(
29
+ input: torch.Tensor,
30
+ weight: torch.Tensor,
31
+ epsilon: float,
32
+ ) -> torch.Tensor:
33
+ """Pure-PyTorch reference for RMS normalization."""
34
+ variance = input.float().pow(2).mean(dim=-1, keepdim=True)
35
+ inv_rms = torch.rsqrt(variance + epsilon)
36
+ return (input.float() * inv_rms * weight.float()).to(input.dtype)
37
+
38
+
39
+ def _ref_fused_add_rms_norm(
40
+ input: torch.Tensor,
41
+ residual: torch.Tensor,
42
+ weight: torch.Tensor,
43
+ epsilon: float,
44
+ ) -> tuple[torch.Tensor, torch.Tensor]:
45
+ """Pure-PyTorch reference for fused residual add + RMS norm.
46
+
47
+ Returns (normalized_output, updated_residual).
48
+ """
49
+ new_residual = residual.float() + input.float()
50
+ variance = new_residual.pow(2).mean(dim=-1, keepdim=True)
51
+ inv_rms = torch.rsqrt(variance + epsilon)
52
+ normalized = (new_residual * inv_rms * weight.float()).to(input.dtype)
53
+ return normalized, new_residual.to(residual.dtype)
54
+
55
+
56
+ @pytest.mark.parametrize("device", DEVICES)
57
+ @pytest.mark.parametrize("dtype", DTYPES)
58
+ @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
59
+ @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
60
+ @torch.inference_mode()
61
+ def test_rms_norm(
62
+ device: str,
63
+ dtype: torch.dtype,
64
+ hidden_size: int,
65
+ num_tokens: int,
66
+ ) -> None:
67
+ torch.manual_seed(42)
68
+ input = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
69
+ weight = torch.randn(hidden_size, dtype=dtype, device=device)
70
+ out = torch.empty_like(input)
71
+
72
+ # Run kernel.
73
+ ops.rms_norm(out, input, weight, EPSILON)
74
+
75
+ # Run reference on CPU.
76
+ ref = _ref_rms_norm(input.cpu(), weight.cpu(), EPSILON)
77
+
78
+ # Compare.
79
+ if dtype == torch.float32:
80
+ atol, rtol = 1e-5, 1e-5
81
+ elif dtype == torch.float16:
82
+ atol, rtol = 1e-3, 1e-3
83
+ else: # bfloat16
84
+ atol, rtol = 2e-2, 2e-2
85
+
86
+ torch.testing.assert_close(out.cpu(), ref, atol=atol, rtol=rtol)
87
+
88
+
89
+ @pytest.mark.parametrize("device", DEVICES)
90
+ @pytest.mark.parametrize("dtype", DTYPES)
91
+ @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
92
+ @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
93
+ @torch.inference_mode()
94
+ def test_fused_add_rms_norm(
95
+ device: str,
96
+ dtype: torch.dtype,
97
+ hidden_size: int,
98
+ num_tokens: int,
99
+ ) -> None:
100
+ torch.manual_seed(42)
101
+ input = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
102
+ residual = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
103
+ weight = torch.randn(hidden_size, dtype=dtype, device=device)
104
+
105
+ # Compute reference on CPU BEFORE running kernel (kernel modifies in-place).
106
+ ref_normalized, ref_residual = _ref_fused_add_rms_norm(
107
+ input.cpu(), residual.cpu(), weight.cpu(), EPSILON
108
+ )
109
+
110
+ # Run kernel (modifies input and residual in-place).
111
+ ops.fused_add_rms_norm(input, residual, weight, EPSILON)
112
+
113
+ # Compare.
114
+ if dtype == torch.float32:
115
+ atol, rtol = 1e-5, 1e-5
116
+ elif dtype == torch.float16:
117
+ atol, rtol = 1e-3, 1e-3
118
+ else: # bfloat16
119
+ atol, rtol = 2e-2, 2e-2
120
+
121
+ torch.testing.assert_close(
122
+ residual.cpu(), ref_residual, atol=atol, rtol=rtol
123
+ )
124
+ torch.testing.assert_close(
125
+ input.cpu(), ref_normalized, atol=atol, rtol=rtol
126
+ )
127
+
128
+
129
+ @pytest.mark.parametrize("device", DEVICES)
130
+ @pytest.mark.parametrize("dtype", [torch.float32])
131
+ @torch.inference_mode()
132
+ def test_rms_norm_weight_scaling(
133
+ device: str,
134
+ dtype: torch.dtype,
135
+ ) -> None:
136
+ """Verify that weight=1 gives pure RMS normalization."""
137
+ hidden_size = 256
138
+ num_tokens = 4
139
+ input = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
140
+ weight_ones = torch.ones(hidden_size, dtype=dtype, device=device)
141
+ weight_twos = 2.0 * torch.ones(hidden_size, dtype=dtype, device=device)
142
+
143
+ out_ones = torch.empty_like(input)
144
+ out_twos = torch.empty_like(input)
145
+
146
+ ops.rms_norm(out_ones, input, weight_ones, EPSILON)
147
+ ops.rms_norm(out_twos, input, weight_twos, EPSILON)
148
+
149
+ # weight=2 should produce exactly 2x the weight=1 result.
150
+ torch.testing.assert_close(
151
+ out_twos.cpu(), 2.0 * out_ones.cpu(), atol=1e-5, rtol=1e-5
152
+ )
153
+
154
+
155
+ @pytest.mark.parametrize("device", DEVICES)
156
+ @pytest.mark.parametrize("dtype", [torch.float32])
157
+ @torch.inference_mode()
158
+ def test_fused_add_rms_norm_residual_accumulation(
159
+ device: str,
160
+ dtype: torch.dtype,
161
+ ) -> None:
162
+ """Verify residual is correctly accumulated (residual += input)."""
163
+ hidden_size = 128
164
+ num_tokens = 2
165
+ input = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
166
+ residual = torch.randn(num_tokens, hidden_size, dtype=dtype, device=device)
167
+ weight = torch.ones(hidden_size, dtype=dtype, device=device)
168
+
169
+ expected_residual = (residual + input).cpu()
170
+
171
+ ops.fused_add_rms_norm(input, residual, weight, EPSILON)
172
+
173
+ torch.testing.assert_close(
174
+ residual.cpu(), expected_residual, atol=1e-5, rtol=1e-5
175
+ )
torch-ext/fused_rms_norm/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from ._custom_ops import fused_add_rms_norm, rms_norm
2
+ from ._ops import ops
3
+
4
+ __all__ = [
5
+ "fused_add_rms_norm",
6
+ "ops",
7
+ "rms_norm",
8
+ ]
torch-ext/fused_rms_norm/_custom_ops.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from ._ops import ops
4
+
5
+
6
+ def rms_norm(
7
+ out: torch.Tensor,
8
+ input: torch.Tensor,
9
+ weight: torch.Tensor,
10
+ epsilon: float,
11
+ ) -> None:
12
+ ops.rms_norm(out, input, weight, epsilon)
13
+
14
+
15
+ def fused_add_rms_norm(
16
+ input: torch.Tensor,
17
+ residual: torch.Tensor,
18
+ weight: torch.Tensor,
19
+ epsilon: float,
20
+ ) -> None:
21
+ ops.fused_add_rms_norm(input, residual, weight, epsilon)
torch-ext/torch_binding.cpp ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <torch/library.h>
2
+
3
+ #include "registration.h"
4
+ #include "torch_binding.h"
5
+
6
+ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
7
+ ops.def(
8
+ "rms_norm(Tensor! result, Tensor input, Tensor weight, float epsilon) -> ()");
9
+ #if defined(METAL_KERNEL)
10
+ ops.impl("rms_norm", torch::kMPS, rms_norm);
11
+ #endif
12
+
13
+ ops.def(
14
+ "fused_add_rms_norm(Tensor! input, Tensor! residual, Tensor weight, float epsilon) -> ()");
15
+ #if defined(METAL_KERNEL)
16
+ ops.impl("fused_add_rms_norm", torch::kMPS, fused_add_rms_norm);
17
+ #endif
18
+ }
19
+
20
+ REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
torch-ext/torch_binding.h ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <torch/torch.h>
4
+
5
+ void rms_norm(torch::Tensor& out, torch::Tensor& input,
6
+ torch::Tensor& weight, double epsilon);
7
+
8
+ void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
9
+ torch::Tensor& weight, double epsilon);