Add Metal rotary embedding kernel matching vLLM interface

New Metal kernel implementing rotary_embedding(positions, query, key,
head_size, cos_sin_cache, is_neox) with the exact signature vLLM expects.

Features:
- Supports fp16, bf16, fp32 dtypes
- NeoX style (Llama, Mistral) and GPT-J style rotation
- Arbitrary head dims (64, 128, 256)
- GQA support (separate num_heads / num_kv_heads)
- Optional key tensor (key=None skips key rotation)
- Function constants for IS_NEOX (zero-cost specialization)
- Precomputed cos_sin_cache lookup (not on-the-fly frequency computation)

Includes comprehensive tests against pure-PyTorch reference implementation.

Co-developed-by: Claude Code v2.1.50 (claude-opus-4-6)

Files changed (10) hide show

build.toml +18 -0
flake.nix +17 -0
rotary-embedding-metal/rotary_embedding.metal +118 -0
rotary-embedding-metal/rotary_embedding.mm +198 -0
rotary-embedding-metal/utils.metal +131 -0
tests/test_rotary_embedding.py +210 -0
torch-ext/rotary_embedding/__init__.py +7 -0
torch-ext/rotary_embedding/_custom_ops.py +17 -0
torch-ext/torch_binding.cpp +15 -0
torch-ext/torch_binding.h +7 -0

build.toml ADDED Viewed

	@@ -0,0 +1,18 @@

+[general]
+name = "rotary_embedding"
+backends = ["metal"]
+[torch]
+src = [
+  "torch-ext/torch_binding.cpp",
+  "torch-ext/torch_binding.h",
+]
+[kernel.rotary_embedding_metal]
+backend = "metal"
+src = [
+  "rotary-embedding-metal/rotary_embedding.metal",
+  "rotary-embedding-metal/rotary_embedding.mm",
+  "rotary-embedding-metal/utils.metal",
+]
+depends = ["torch"]

flake.nix ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  description = "Flake for rotary embedding kernel";
+  inputs = {
+    kernel-builder.url = "github:huggingface/kernel-builder";
+  };
+  outputs =
+    {
+      self,
+      kernel-builder,
+    }:
+    kernel-builder.lib.genFlakeOutputs {
+      inherit self;
+      path = ./.;
+    };
+}

rotary-embedding-metal/rotary_embedding.metal ADDED Viewed

	@@ -0,0 +1,118 @@

+#include <metal_stdlib>
+#include "utils.metal"
+using namespace metal;
+// Function constants for compile-time specialization.
+// IS_NEOX: true for GPT-NeoX style (Llama, Mistral), false for GPT-J style.
+constant bool IS_NEOX [[function_constant(0)]];
+// Rotary embedding kernel.
+//
+// Each threadgroup processes one token. Threads within the threadgroup
+// are mapped to (head_idx, rot_offset) pairs covering both query and key.
+//
+// The cos_sin_cache layout is [max_position, rot_dim] where:
+//   cache[pos, 0:rot_dim/2] = cos values
+//   cache[pos, rot_dim/2:rot_dim] = sin values
+//
+// For NeoX style (IS_NEOX=true):
+//   x_index = rot_offset, y_index = embed_dim + rot_offset
+// For GPT-J style (IS_NEOX=false):
+//   x_index = 2 * rot_offset, y_index = 2 * rot_offset + 1
+template <typename scalar_t>
+kernel void rotary_embedding_kernel(
+    const device int64_t *positions [[buffer(0)]],
+    device scalar_t *query [[buffer(1)]],
+    device scalar_t *key [[buffer(2)]],
+    const device scalar_t *cos_sin_cache [[buffer(3)]],
+    const device int &rot_dim [[buffer(4)]],
+    const device int64_t &query_stride [[buffer(5)]],
+    const device int64_t &key_stride [[buffer(6)]],
+    const device int &head_size [[buffer(7)]],
+    const device int &num_heads [[buffer(8)]],
+    const device int &num_kv_heads [[buffer(9)]],
+    const device int &has_key [[buffer(10)]],
+    uint token_idx [[threadgroup_position_in_grid]],
+    uint tid [[thread_position_in_threadgroup]],
+    uint threads_per_tg [[threads_per_threadgroup]]) {
+  const int embed_dim = rot_dim / 2;
+  const int64_t pos = positions[token_idx];
+  const device scalar_t *cache_ptr = cos_sin_cache + pos * rot_dim;
+  // Process query heads.
+  for (int i = tid; i < num_heads * embed_dim; i += threads_per_tg) {
+    const int head_idx = i / embed_dim;
+    const int rot_offset = i % embed_dim;
+    int x_index, y_index;
+    if (IS_NEOX) {
+      x_index = rot_offset;
+      y_index = embed_dim + rot_offset;
+    } else {
+      x_index = 2 * rot_offset;
+      y_index = 2 * rot_offset + 1;
+    }
+    const int64_t token_head = token_idx * query_stride + head_idx * head_size;
+    const float cos_val = static_cast<float>(cache_ptr[rot_offset]);
+    const float sin_val = static_cast<float>(cache_ptr[embed_dim + rot_offset]);
+    const float x = static_cast<float>(query[token_head + x_index]);
+    const float y = static_cast<float>(query[token_head + y_index]);
+    query[token_head + x_index] = static_cast<scalar_t>(x * cos_val - y * sin_val);
+    query[token_head + y_index] = static_cast<scalar_t>(y * cos_val + x * sin_val);
+  }
+  // Process key heads (if key is provided).
+  if (has_key) {
+    for (int i = tid; i < num_kv_heads * embed_dim; i += threads_per_tg) {
+      const int head_idx = i / embed_dim;
+      const int rot_offset = i % embed_dim;
+      int x_index, y_index;
+      if (IS_NEOX) {
+        x_index = rot_offset;
+        y_index = embed_dim + rot_offset;
+      } else {
+        x_index = 2 * rot_offset;
+        y_index = 2 * rot_offset + 1;
+      }
+      const int64_t token_head = token_idx * key_stride + head_idx * head_size;
+      const float cos_val = static_cast<float>(cache_ptr[rot_offset]);
+      const float sin_val = static_cast<float>(cache_ptr[embed_dim + rot_offset]);
+      const float x = static_cast<float>(key[token_head + x_index]);
+      const float y = static_cast<float>(key[token_head + y_index]);
+      key[token_head + x_index] = static_cast<scalar_t>(x * cos_val - y * sin_val);
+      key[token_head + y_index] = static_cast<scalar_t>(y * cos_val + x * sin_val);
+    }
+  }
+}
+// Instantiate kernel variants for each dtype.
+#define instantiate_rotary_embedding(type)                                     \
+  template [[host_name("rotary_embedding_" #type)]] [[kernel]] void            \
+  rotary_embedding_kernel<type>(                                               \
+      const device int64_t *positions [[buffer(0)]],                           \
+      device type *query [[buffer(1)]],                                        \
+      device type *key [[buffer(2)]],                                          \
+      const device type *cos_sin_cache [[buffer(3)]],                          \
+      const device int &rot_dim [[buffer(4)]],                                 \
+      const device int64_t &query_stride [[buffer(5)]],                        \
+      const device int64_t &key_stride [[buffer(6)]],                          \
+      const device int &head_size [[buffer(7)]],                               \
+      const device int &num_heads [[buffer(8)]],                               \
+      const device int &num_kv_heads [[buffer(9)]],                            \
+      const device int &has_key [[buffer(10)]],                                \
+      uint token_idx [[threadgroup_position_in_grid]],                         \
+      uint tid [[thread_position_in_threadgroup]],                             \
+      uint threads_per_tg [[threads_per_threadgroup]]);
+instantiate_rotary_embedding(float);
+instantiate_rotary_embedding(half);
+instantiate_rotary_embedding(bfloat16_t);

rotary-embedding-metal/rotary_embedding.mm ADDED Viewed

	@@ -0,0 +1,198 @@

+#include <ATen/mps/MPSDevice.h>
+#include <ATen/mps/MPSStream.h>
+#include <torch/torch.h>
+#import <Foundation/Foundation.h>
+#import <Metal/Metal.h>
+#include <dlfcn.h>
+#include <string>
+static inline id<MTLBuffer> getMTLBufferStorage(const torch::Tensor &tensor) {
+  return __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
+}
+static std::string getModuleDirectory() {
+  Dl_info dl_info;
+  if (dladdr((void *)getModuleDirectory, &dl_info)) {
+    std::string path(dl_info.dli_fname);
+    size_t pos = path.find_last_of('/');
+    if (pos != std::string::npos) {
+      return path.substr(0, pos);
+    }
+  }
+  return ".";
+}
+void rotary_embedding(torch::Tensor &positions, torch::Tensor &query,
+                      std::optional<torch::Tensor> key, int64_t head_size,
+                      torch::Tensor &cos_sin_cache, bool is_neox) {
+  TORCH_CHECK(query.device().is_mps(), "query must be on MPS device");
+  TORCH_CHECK(positions.device().is_mps(), "positions must be on MPS device");
+  TORCH_CHECK(cos_sin_cache.device().is_mps(),
+              "cos_sin_cache must be on MPS device");
+  // Determine tensor dimensions.
+  // positions: [num_tokens] or [batch, seq_len]
+  // query:     [num_tokens, num_heads * head_size] or
+  //            [num_tokens, num_heads, head_size]
+  const int64_t num_tokens = positions.numel();
+  // Flatten positions to 1D for kernel simplicity.
+  torch::Tensor positions_flat = positions.reshape({-1});
+  // Compute query/key strides along the token dimension.
+  // Standard layout: [num_tokens, num_heads, head_size]
+  // Batched layout: [batch, seq_len, num_heads, head_size]
+  // The token dim is at index (positions.dim() - 1) in query after
+  // accounting for batch dims, but we flatten positions so use stride(0)
+  // relative to the flattened view.
+  //
+  // For standard [num_tokens, num_heads, head_size]:
+  //   query_stride = num_heads * head_size (stride along dim 0)
+  // For flattened [num_tokens, num_heads * head_size]:
+  //   query_stride = num_heads * head_size (stride along dim 0)
+  int64_t query_stride = query.stride(0);
+  int64_t key_stride = key.has_value() ? key->stride(0) : 0;
+  // Compute num_heads from tensor size. Works for both flat and split layouts.
+  const int num_heads =
+      static_cast<int>(query.numel() / (num_tokens * head_size));
+  const int num_kv_heads =
+      key.has_value()
+          ? static_cast<int>(key->numel() / (num_tokens * head_size))
+          : 0;
+  const int rot_dim = cos_sin_cache.size(-1);
+  const int embed_dim = rot_dim / 2;
+  const int has_key = key.has_value() ? 1 : 0;
+  @autoreleasepool {
+    at::mps::MPSStream *stream = at::mps::getCurrentMPSStream();
+    TORCH_CHECK(stream, "Failed to get current MPS stream");
+    id<MTLDevice> device = stream->device();
+    id<MTLCommandBuffer> cmdBuf = stream->commandBuffer();
+    TORCH_CHECK(cmdBuf, "Failed to get command buffer");
+    // Load metallib.
+    std::string moduleDir = getModuleDirectory();
+    std::string metallibPath = moduleDir + "/" + METALLIB_PATH;
+    NSString *metallibPathStr =
+        [NSString stringWithUTF8String:metallibPath.c_str()];
+    NSURL *metallibURL = [NSURL fileURLWithPath:metallibPathStr];
+    NSError *error = nil;
+    id<MTLLibrary> lib = [device newLibraryWithURL:metallibURL error:&error];
+    TORCH_CHECK(lib, "Failed to load Metal library at ", metallibPath,
+                error ? [NSString stringWithFormat:@": %@",
+                                                   error.localizedDescription]
+                            .UTF8String
+                      : "");
+    // Select kernel variant based on dtype.
+    NSString *kernName = nil;
+    switch (query.scalar_type()) {
+    case torch::kFloat:
+      kernName = @"rotary_embedding_float";
+      break;
+    case torch::kHalf:
+      kernName = @"rotary_embedding_half";
+      break;
+    case torch::kBFloat16:
+      kernName = @"rotary_embedding_bfloat16_t";
+      break;
+    default:
+      TORCH_CHECK(false, "Unsupported dtype for rotary_embedding: ",
+                  query.scalar_type());
+    }
+    // Set function constant for IS_NEOX.
+    MTLFunctionConstantValues *constants =
+        [[MTLFunctionConstantValues alloc] init];
+    [constants setConstantValue:&is_neox type:MTLDataTypeBool atIndex:0];
+    id<MTLFunction> fn = [lib newFunctionWithName:kernName
+                                   constantValues:constants
+                                            error:&error];
+    TORCH_CHECK(fn, "Missing Metal kernel function: ", kernName.UTF8String,
+                error ? [NSString stringWithFormat:@": %@",
+                                                   error.localizedDescription]
+                            .UTF8String
+                      : "");
+    id<MTLComputePipelineState> pso =
+        [device newComputePipelineStateWithFunction:fn error:&error];
+    TORCH_CHECK(pso, "Failed to create pipeline state",
+                error ? [NSString stringWithFormat:@": %@",
+                                                   error.localizedDescription]
+                            .UTF8String
+                      : "");
+    dispatch_queue_t q = stream->queue();
+    dispatch_sync(q, ^{
+      id<MTLComputeCommandEncoder> enc = [cmdBuf computeCommandEncoder];
+      TORCH_CHECK(enc, "Failed to create compute encoder");
+      [enc setComputePipelineState:pso];
+      // Buffer 0: positions (flattened)
+      [enc setBuffer:getMTLBufferStorage(positions_flat)
+              offset:positions_flat.storage_offset() *
+                     positions_flat.element_size()
+             atIndex:0];
+      // Buffer 1: query
+      [enc setBuffer:getMTLBufferStorage(query)
+              offset:query.storage_offset() * query.element_size()
+             atIndex:1];
+      // Buffer 2: key (or query as dummy if no key)
+      if (key.has_value()) {
+        [enc setBuffer:getMTLBufferStorage(*key)
+                offset:key->storage_offset() * key->element_size()
+               atIndex:2];
+      } else {
+        // Pass query buffer as dummy; has_key=0 ensures it's never accessed
+        [enc setBuffer:getMTLBufferStorage(query)
+                offset:query.storage_offset() * query.element_size()
+               atIndex:2];
+      }
+      // Buffer 3: cos_sin_cache
+      [enc setBuffer:getMTLBufferStorage(cos_sin_cache)
+              offset:cos_sin_cache.storage_offset() *
+                     cos_sin_cache.element_size()
+             atIndex:3];
+      // Scalar parameters via setBytes.
+      const int32_t rot_dim_i32 = static_cast<int32_t>(rot_dim);
+      [enc setBytes:&rot_dim_i32 length:sizeof(int32_t) atIndex:4];
+      [enc setBytes:&query_stride length:sizeof(int64_t) atIndex:5];
+      [enc setBytes:&key_stride length:sizeof(int64_t) atIndex:6];
+      const int32_t head_size_i32 = static_cast<int32_t>(head_size);
+      [enc setBytes:&head_size_i32 length:sizeof(int32_t) atIndex:7];
+      const int32_t num_heads_i32 = static_cast<int32_t>(num_heads);
+      [enc setBytes:&num_heads_i32 length:sizeof(int32_t) atIndex:8];
+      const int32_t num_kv_heads_i32 = static_cast<int32_t>(num_kv_heads);
+      [enc setBytes:&num_kv_heads_i32 length:sizeof(int32_t) atIndex:9];
+      const int32_t has_key_i32 = static_cast<int32_t>(has_key);
+      [enc setBytes:&has_key_i32 length:sizeof(int32_t) atIndex:10];
+      // Dispatch: one threadgroup per token.
+      const uint32_t threads_per_tg =
+          std::min<uint32_t>(512, std::max(num_heads, num_kv_heads) * embed_dim);
+      MTLSize grid = MTLSizeMake(num_tokens, 1, 1);
+      MTLSize tg = MTLSizeMake(threads_per_tg, 1, 1);
+      [enc dispatchThreadgroups:grid threadsPerThreadgroup:tg];
+      [enc endEncoding];
+    });
+    stream->synchronize(at::mps::SyncType::COMMIT);
+  }
+}

rotary-embedding-metal/utils.metal ADDED Viewed

	@@ -0,0 +1,131 @@

+#include <metal_stdlib>
+using namespace metal;
+#if defined(__HAVE_BFLOAT__)
+typedef bfloat bfloat16_t;
+#else
+constexpr METAL_FUNC uint16_t float_to_bfloat_bits(float x) {
+  if ((as_type<uint32_t>(x) & ~_fp_encoding_traits<float>::sign_mask) >
+      _fp_encoding_traits<float>::inf_mask) {
+    return uint16_t(as_type<uint32_t>(0x7FC0));
+  }
+  uint32_t float_bits = as_type<uint32_t>(x);
+  float_bits += ((float_bits >> 16) & 1) + as_type<uint32_t>(0x7FFF);
+  return float_bits >> 16;
+}
+constexpr METAL_FUNC float bfloat_bits_to_float(uint16_t x) {
+  return as_type<float>((uint32_t)x << 16);
+}
+struct _MLX_BFloat16;
+template <typename T>
+static constexpr constant bool can_convert_to_bfloat =
+    !is_same_v<T, _MLX_BFloat16> && is_convertible_v<T, float>;
+template <typename T>
+static constexpr constant bool can_convert_from_bfloat =
+    !is_same_v<T, _MLX_BFloat16> && is_convertible_v<float, T>;
+struct _MLX_BFloat16 {
+  uint16_t bits_;
+  _MLX_BFloat16() thread = default;
+  _MLX_BFloat16() threadgroup = default;
+  _MLX_BFloat16() device = default;
+  _MLX_BFloat16() constant = default;
+  struct bits_to_bfloat_struct {};
+  static constexpr METAL_FUNC bits_to_bfloat_struct bits_to_bfloat() {
+    return bits_to_bfloat_struct();
+  }
+  constexpr METAL_FUNC _MLX_BFloat16(uint16_t bits, bits_to_bfloat_struct)
+      : bits_(bits) {}
+  template <typename T,
+            typename = typename enable_if<can_convert_to_bfloat<T>>::type>
+  constexpr METAL_FUNC _MLX_BFloat16(T x) thread
+      : bits_(float_to_bfloat_bits(static_cast<float>(x))) {}
+  template <typename T,
+            typename = typename enable_if<can_convert_to_bfloat<T>>::type>
+  constexpr METAL_FUNC _MLX_BFloat16(T x) threadgroup
+      : bits_(float_to_bfloat_bits(static_cast<float>(x))) {}
+  template <typename T,
+            typename = typename enable_if<can_convert_to_bfloat<T>>::type>
+  constexpr METAL_FUNC _MLX_BFloat16(T x) device
+      : bits_(float_to_bfloat_bits(static_cast<float>(x))) {}
+  template <typename T,
+            typename = typename enable_if<can_convert_to_bfloat<T>>::type>
+  constexpr METAL_FUNC _MLX_BFloat16(T x) constant
+      : bits_(float_to_bfloat_bits(static_cast<float>(x))) {}
+  template <typename T,
+            typename = typename enable_if<can_convert_from_bfloat<T>>::type>
+  constexpr METAL_FUNC operator T() const thread {
+    return static_cast<T>(bfloat_bits_to_float(bits_));
+  }
+  template <typename T,
+            typename = typename enable_if<can_convert_from_bfloat<T>>::type>
+  constexpr METAL_FUNC operator T() const threadgroup {
+    return static_cast<T>(bfloat_bits_to_float(bits_));
+  }
+  template <typename T,
+            typename = typename enable_if<can_convert_from_bfloat<T>>::type>
+  constexpr METAL_FUNC operator T() const device {
+    return static_cast<T>(bfloat_bits_to_float(bits_));
+  }
+  template <typename T,
+            typename = typename enable_if<can_convert_from_bfloat<T>>::type>
+  constexpr METAL_FUNC operator T() constant {
+    return static_cast<T>(bfloat_bits_to_float(bits_));
+  }
+};
+constexpr METAL_FUNC _MLX_BFloat16 operator-(_MLX_BFloat16 x) {
+  return -static_cast<float>(x);
+}
+#define bfloat_binop_base(__op__, __operator__, otype, atype, btype, ctype)    \
+  constexpr METAL_FUNC otype __operator__(atype lhs, btype rhs) {              \
+    return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs);             \
+  }
+#define bfloat_binop_helper(__op__, __operator__, otype, itype, ctype)         \
+  constexpr METAL_FUNC otype __operator__(_MLX_BFloat16 lhs, itype rhs) {      \
+    return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs);             \
+  }                                                                            \
+  constexpr METAL_FUNC otype __operator__(itype lhs, _MLX_BFloat16 rhs) {      \
+    return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs);             \
+  }
+#define bfloat_binop(_op_, _operator_)                                         \
+  bfloat_binop_base(_op_, _operator_, _MLX_BFloat16, _MLX_BFloat16,            \
+                    _MLX_BFloat16, float);                                     \
+  bfloat_binop_helper(_op_, _operator_, float, float, float);                  \
+  bfloat_binop_helper(_op_, _operator_, float, half, float);                   \
+  bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, int32_t, float);        \
+  bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, uint32_t, float);       \
+  bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, int64_t, float);        \
+  bfloat_binop_helper(_op_, _operator_, _MLX_BFloat16, uint64_t, float);
+bfloat_binop(+, operator+);
+bfloat_binop(-, operator-);
+bfloat_binop(*, operator*);
+bfloat_binop(/, operator/);
+#undef bfloat_binop_base
+#undef bfloat_binop_helper
+#undef bfloat_binop
+typedef struct _MLX_BFloat16 bfloat16_t;
+#endif

tests/test_rotary_embedding.py ADDED Viewed

	@@ -0,0 +1,210 @@

+"""Tests for Metal rotary embedding kernel.
+Validates correctness against a pure-PyTorch reference implementation
+for both NeoX (Llama/Mistral) and GPT-J rotation styles.
+"""
+import pytest
+import torch
+import rotary_embedding as ops
+def _is_mps_available() -> bool:
+    return hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
+if _is_mps_available():
+    DEVICES = ["mps"]
+else:
+    DEVICES = [f"cuda:{i}" for i in range(max(1, torch.cuda.device_count()))]
+DTYPES = [torch.float32, torch.float16, torch.bfloat16]
+HEAD_SIZES = [64, 128, 256]
+NUM_HEADS = [8, 32]
+NUM_KV_HEADS = [1, 8]  # GQA and MHA
+IS_NEOX = [True, False]
+NUM_TOKENS = [1, 7, 32]
+MAX_POSITION = 8192
+ROTARY_DIM_FRACTIONS = [1.0]  # Full rotation; 0.5 for partial
+def _build_cos_sin_cache(
+    max_position: int,
+    rotary_dim: int,
+    dtype: torch.dtype,
+    device: str,
+    base: float = 10000.0,
+) -> torch.Tensor:
+    """Build a cos/sin cache matching vLLM's convention."""
+    inv_freq = 1.0 / (
+        base ** (torch.arange(0, rotary_dim, 2, dtype=torch.float32) / rotary_dim)
+    )
+    t = torch.arange(max_position, dtype=torch.float32)
+    freqs = torch.outer(t, inv_freq)  # [max_position, rotary_dim/2]
+    cos_vals = freqs.cos()
+    sin_vals = freqs.sin()
+    cache = torch.cat([cos_vals, sin_vals], dim=-1)  # [max_position, rotary_dim]
+    return cache.to(dtype=dtype, device=device)
+def _ref_rotary_embedding(
+    positions: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor | None,
+    head_size: int,
+    cos_sin_cache: torch.Tensor,
+    is_neox: bool,
+) -> None:
+    """Pure-PyTorch reference implementation of rotary embedding."""
+    rot_dim = cos_sin_cache.shape[-1]
+    embed_dim = rot_dim // 2
+    num_tokens = positions.numel()
+    positions_flat = positions.reshape(-1)
+    for t in range(num_tokens):
+        pos = positions_flat[t].item()
+        cos_vals = cos_sin_cache[pos, :embed_dim].float()
+        sin_vals = cos_sin_cache[pos, embed_dim:].float()
+        # Apply to query heads.
+        num_heads = query.shape[-2]
+        for h in range(num_heads):
+            for d in range(embed_dim):
+                if is_neox:
+                    x_idx, y_idx = d, embed_dim + d
+                else:
+                    x_idx, y_idx = 2 * d, 2 * d + 1
+                x = query[t, h, x_idx].float()
+                y = query[t, h, y_idx].float()
+                query[t, h, x_idx] = (x * cos_vals[d] - y * sin_vals[d]).to(
+                    query.dtype
+                )
+                query[t, h, y_idx] = (y * cos_vals[d] + x * sin_vals[d]).to(
+                    query.dtype
+                )
+        # Apply to key heads.
+        if key is not None:
+            num_kv_heads = key.shape[-2]
+            for h in range(num_kv_heads):
+                for d in range(embed_dim):
+                    if is_neox:
+                        x_idx, y_idx = d, embed_dim + d
+                    else:
+                        x_idx, y_idx = 2 * d, 2 * d + 1
+                    x = key[t, h, x_idx].float()
+                    y = key[t, h, y_idx].float()
+                    key[t, h, x_idx] = (x * cos_vals[d] - y * sin_vals[d]).to(
+                        key.dtype
+                    )
+                    key[t, h, y_idx] = (y * cos_vals[d] + x * sin_vals[d]).to(
+                        key.dtype
+                    )
+@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("num_kv_heads", NUM_KV_HEADS)
+@pytest.mark.parametrize("is_neox", IS_NEOX)
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+@torch.inference_mode()
+def test_rotary_embedding(
+    device: str,
+    dtype: torch.dtype,
+    head_size: int,
+    num_heads: int,
+    num_kv_heads: int,
+    is_neox: bool,
+    num_tokens: int,
+) -> None:
+    # Skip invalid GQA configs.
+    if num_heads % num_kv_heads != 0:
+        pytest.skip("num_heads must be divisible by num_kv_heads")
+    rotary_dim = head_size  # Full rotation
+    cos_sin_cache = _build_cos_sin_cache(
+        MAX_POSITION, rotary_dim, dtype, device
+    )
+    # Random positions (arbitrary, non-contiguous to test flexibility).
+    positions = torch.randint(0, MAX_POSITION, (num_tokens,), device=device)
+    # Random query and key tensors.
+    query = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
+    key = torch.randn(num_tokens, num_kv_heads, head_size, dtype=dtype, device=device)
+    # Clone for reference.
+    query_ref = query.clone()
+    key_ref = key.clone()
+    # Run kernel.
+    ops.rotary_embedding(positions, query, key, head_size, cos_sin_cache, is_neox)
+    # Run reference.
+    _ref_rotary_embedding(
+        positions.cpu() if device != "cpu" else positions,
+        query_ref.cpu() if device != "cpu" else query_ref,
+        key_ref.cpu() if device != "cpu" else key_ref,
+        head_size,
+        cos_sin_cache.cpu() if device != "cpu" else cos_sin_cache,
+        is_neox,
+    )
+    # Compare. Use relaxed tolerances for fp16/bf16.
+    if dtype == torch.float32:
+        atol, rtol = 1e-5, 1e-5
+    elif dtype == torch.float16:
+        atol, rtol = 1e-3, 1e-3
+    else:  # bfloat16
+        atol, rtol = 2e-2, 2e-2
+    query_ref_dev = query_ref.to(device=device)
+    key_ref_dev = key_ref.to(device=device)
+    torch.testing.assert_close(query, query_ref_dev, atol=atol, rtol=rtol)
+    torch.testing.assert_close(key, key_ref_dev, atol=atol, rtol=rtol)
+@pytest.mark.parametrize("device", DEVICES)
+@pytest.mark.parametrize("dtype", [torch.float32])
+@pytest.mark.parametrize("is_neox", [True])
+@torch.inference_mode()
+def test_rotary_embedding_no_key(
+    device: str,
+    dtype: torch.dtype,
+    is_neox: bool,
+) -> None:
+    """Test that passing key=None works correctly."""
+    head_size = 128
+    num_heads = 8
+    num_tokens = 4
+    rotary_dim = head_size
+    cos_sin_cache = _build_cos_sin_cache(
+        MAX_POSITION, rotary_dim, dtype, device
+    )
+    positions = torch.randint(0, MAX_POSITION, (num_tokens,), device=device)
+    query = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
+    query_ref = query.clone()
+    # Run kernel with key=None.
+    ops.rotary_embedding(positions, query, None, head_size, cos_sin_cache, is_neox)
+    # Run reference with key=None.
+    _ref_rotary_embedding(
+        positions.cpu(),
+        query_ref.cpu(),
+        None,
+        head_size,
+        cos_sin_cache.cpu(),
+        is_neox,
+    )
+    query_ref_dev = query_ref.to(device=device)
+    torch.testing.assert_close(query, query_ref_dev, atol=1e-5, rtol=1e-5)

torch-ext/rotary_embedding/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+from ._custom_ops import rotary_embedding
+from ._ops import ops
+__all__ = [
+    "ops",
+    "rotary_embedding",
+]

torch-ext/rotary_embedding/_custom_ops.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from typing import Optional
+import torch
+from ._ops import ops
+def rotary_embedding(
+    positions: torch.Tensor,
+    query: torch.Tensor,
+    key: Optional[torch.Tensor],
+    head_size: int,
+    cos_sin_cache: torch.Tensor,
+    is_neox: bool,
+) -> None:
+    ops.rotary_embedding(positions, query, key, head_size, cos_sin_cache,
+                         is_neox)

torch-ext/torch_binding.cpp ADDED Viewed

	@@ -0,0 +1,15 @@

+#include <torch/library.h>
+#include "registration.h"
+#include "torch_binding.h"
+TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
+  ops.def("rotary_embedding(Tensor positions, Tensor! query,"
+          "                 Tensor!? key, int head_size,"
+          "                 Tensor cos_sin_cache, bool is_neox) -> ()");
+#if defined(METAL_KERNEL)
+  ops.impl("rotary_embedding", torch::kMPS, rotary_embedding);
+#endif
+}
+REGISTER_EXTENSION(TORCH_EXTENSION_NAME)

torch-ext/torch_binding.h ADDED Viewed

	@@ -0,0 +1,7 @@

+#pragma once
+#include <torch/torch.h>
+void rotary_embedding(torch::Tensor &positions, torch::Tensor &query,
+                      std::optional<torch::Tensor> key, int64_t head_size,
+                      torch::Tensor &cos_sin_cache, bool is_neox);