Eric Buehler committed
Commit · 6c879ab · Parent(s): 3176d7a

Doesnt crash?

Browse files
- paged-attention-metal/cache.mm: +20 -9
- paged-attention-metal/paged_attention.mm: +30 -12
- tests/kernels/test_attention.py: +3 -3
- torch-ext/torch_binding.cpp: +59 -59
paged-attention-metal/cache.mm
CHANGED

@@ -1,4 +1,6 @@
 #include <torch/torch.h>
+#include <ATen/mps/MPSStream.h>
+#include <ATen/mps/MPSDevice.h>
 
 #import <Foundation/Foundation.h>
 #import <Metal/Metal.h>
@@ -34,10 +36,13 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
   const int64_t num_blocks = block_mapping.size(0);
 
   @autoreleasepool {
-
+    at::mps::MPSStream* stream = at::mps::getCurrentMPSStream();
+    TORCH_CHECK(stream, "Failed to get current MPS stream");
+
+    id<MTLCommandBuffer> commandBuffer = stream->commandBuffer();
     TORCH_CHECK(commandBuffer, "Failed to get MPS command buffer");
 
-    dispatch_queue_t serialQueue =
+    dispatch_queue_t serialQueue = stream->queue();
 
     dispatch_sync(serialQueue, ^{
       id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer blitCommandEncoder];
@@ -60,7 +65,7 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
       }
 
       [blitEncoder endEncoding];
-
+      stream->synchronize(at::mps::SyncType::COMMIT);
     });
   }
 }
@@ -145,10 +150,13 @@ void copy_blocks(const std::vector<torch::Tensor>& key_caches,
     TORCH_CHECK(pso, err.localizedDescription.UTF8String);
 
     // --- Encode dispatch ----------------------------------------------
-
+    at::mps::MPSStream* stream = at::mps::getCurrentMPSStream();
+    TORCH_CHECK(stream, "Failed to get current MPS stream");
+
+    id<MTLCommandBuffer> cmdBuf = stream->commandBuffer();
     TORCH_CHECK(cmdBuf, "Failed to get command buffer");
 
-    dispatch_queue_t q =
+    dispatch_queue_t q = stream->queue();
     dispatch_sync(q, ^{
       id<MTLComputeCommandEncoder> enc = [cmdBuf computeCommandEncoder];
       TORCH_CHECK(enc, "Failed to create compute encoder");
@@ -171,7 +179,7 @@ void copy_blocks(const std::vector<torch::Tensor>& key_caches,
       [enc dispatchThreads:grid threadsPerThreadgroup:tg];
      [enc endEncoding];
 
-
+      stream->synchronize(at::mps::SyncType::COMMIT);
     });
   }
 }
@@ -248,10 +256,13 @@ void reshape_and_cache(
     // -----------------------------------------------------------------
     // Encode dispatch
     // -----------------------------------------------------------------
-
+    at::mps::MPSStream* stream = at::mps::getCurrentMPSStream();
+    TORCH_CHECK(stream, "Failed to get current MPS stream");
+
+    id<MTLCommandBuffer> cmdBuf = stream->commandBuffer();
     TORCH_CHECK(cmdBuf, "Failed to get command buffer");
 
-    dispatch_queue_t q =
+    dispatch_queue_t q = stream->queue();
     dispatch_sync(q, ^{
       id<MTLComputeCommandEncoder> enc = [cmdBuf computeCommandEncoder];
       TORCH_CHECK(enc, "Failed to create compute encoder");
@@ -298,7 +309,7 @@ void reshape_and_cache(
       [enc dispatchThreads:grid threadsPerThreadgroup:tg];
       [enc endEncoding];
 
-
+      stream->synchronize(at::mps::SyncType::COMMIT);
     });
   }
 }
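All three cache ops in this file now share one submission pattern: get PyTorch's
current MPS stream, encode onto that stream's command buffer from its serial
dispatch queue, and commit through the stream instead of committing the Metal
command buffer directly. A minimal consolidated sketch of that pattern, using
only the calls visible in the diff; the helper name run_on_mps_stream and the
encodeWork block are illustrative, not part of the commit:

    #include <torch/torch.h>
    #include <ATen/mps/MPSStream.h>

    #import <Metal/Metal.h>

    // Sketch of the shape shared by swap_blocks, copy_blocks, and
    // reshape_and_cache after this commit (hypothetical helper).
    static void run_on_mps_stream(void (^encodeWork)(id<MTLCommandBuffer>)) {
      @autoreleasepool {
        at::mps::MPSStream* stream = at::mps::getCurrentMPSStream();
        TORCH_CHECK(stream, "Failed to get current MPS stream");

        id<MTLCommandBuffer> cmdBuf = stream->commandBuffer();
        TORCH_CHECK(cmdBuf, "Failed to get command buffer");

        // Encode on the stream's serial queue so this work is ordered
        // against PyTorch's own MPS submissions.
        dispatch_sync(stream->queue(), ^{
          encodeWork(cmdBuf);
          // Commit via the stream, not [cmdBuf commit], so PyTorch keeps
          // ownership of the command buffer's lifecycle.
          stream->synchronize(at::mps::SyncType::COMMIT);
        });
      }
    }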
paged-attention-metal/paged_attention.mm
CHANGED

@@ -1,4 +1,6 @@
 #include <torch/torch.h>
+#include <ATen/mps/MPSStream.h>
+#include <ATen/mps/MPSDevice.h>
 
 #import <Foundation/Foundation.h>
 #import <Metal/Metal.h>
@@ -43,7 +45,7 @@ static std::string getKernelName(const std::string& base_name, torch::ScalarType
                  "_nt" + std::to_string(num_threads) +
                  "_nsl" + std::to_string(num_simd_lanes);
 
-  if (partition_size > 0) {
+  if (partition_size >= 0) {
     kernel_name += "_ps" + std::to_string(partition_size);
   }
 
@@ -60,7 +62,6 @@ static size_t calculateSharedMemorySize(int max_seq_len, int head_size, int num_
 
   // Output workspace for cross-warp reduction: head_size * sizeof(float)
   size_t output_size = head_size * sizeof(float);
-
   return std::max(logits_size + reduction_size, output_size);
 }
 
@@ -122,9 +123,9 @@ void paged_attention_v1(
   // Calculate shared memory requirements
   size_t shared_memory_size = calculateSharedMemorySize(max_seq_len, head_size, num_threads, num_simd_lanes);
 
-  // Get kernel name
+  // Get kernel name - v1 kernels have partition_size=0 in their name
   std::string kernel_name = getKernelName("paged_attention", query.scalar_type(),
-                                          head_size, block_size, num_threads, num_simd_lanes);
+                                          head_size, block_size, num_threads, num_simd_lanes, partition_size);
 
   @autoreleasepool {
     id<MTLDevice> device = MTLCreateSystemDefaultDevice();
@@ -156,10 +157,13 @@ void paged_attention_v1(
                 error ? error.localizedDescription.UTF8String : "unknown error");
 
     // Setup command buffer and encoder
-
+    at::mps::MPSStream* stream = at::mps::getCurrentMPSStream();
+    TORCH_CHECK(stream, "Failed to get current MPS stream");
+
+    id<MTLCommandBuffer> cmdBuf = stream->commandBuffer();
     TORCH_CHECK(cmdBuf, "Failed to get MPS command buffer");
 
-    dispatch_queue_t q =
+    dispatch_queue_t q = stream->queue();
     dispatch_sync(q, ^{
       id<MTLComputeCommandEncoder> enc = [cmdBuf computeCommandEncoder];
       TORCH_CHECK(enc, "Failed to create compute command encoder");
@@ -249,7 +253,7 @@ void paged_attention_v1(
       [enc dispatchThreadgroups:grid threadsPerThreadgroup:threadgroup];
       [enc endEncoding];
 
-
+      stream->synchronize(at::mps::SyncType::COMMIT);
     });
   }
 }
@@ -310,8 +314,19 @@ void paged_attention_v2(
   // Get kernel names
   std::string kernel_name = getKernelName("paged_attention", query.scalar_type(),
                                           head_size, block_size, num_threads, num_simd_lanes, partition_size);
-
-
+  // Reduce kernel doesn't have block_size in its name
+  std::string reduce_kernel_name = "paged_attention_v2_reduce";
+  switch (query.scalar_type()) {
+    case torch::kFloat: reduce_kernel_name += "_float"; break;
+    case torch::kHalf: reduce_kernel_name += "_half"; break;
+    case torch::kBFloat16: reduce_kernel_name += "_bfloat16_t"; break;
+    default:
+      TORCH_CHECK(false, "Unsupported dtype for paged attention: ", query.scalar_type());
+  }
+  reduce_kernel_name += "_hs" + std::to_string(head_size) +
+                        "_nt" + std::to_string(num_threads) +
+                        "_nsl" + std::to_string(num_simd_lanes) +
+                        "_ps" + std::to_string(partition_size);
 
   @autoreleasepool {
     id<MTLDevice> device = MTLCreateSystemDefaultDevice();
@@ -327,10 +342,13 @@ void paged_attention_v2(
                 error ? error.localizedDescription.UTF8String : "unknown error");
 
     // Setup command buffer and queue
-
+    at::mps::MPSStream* stream = at::mps::getCurrentMPSStream();
+    TORCH_CHECK(stream, "Failed to get current MPS stream");
+
+    id<MTLCommandBuffer> cmdBuf = stream->commandBuffer();
     TORCH_CHECK(cmdBuf, "Failed to get MPS command buffer");
 
-    dispatch_queue_t q =
+    dispatch_queue_t q = stream->queue();
     dispatch_sync(q, ^{
       // ==================================================================
       // Phase 1: Main paged attention kernel with partitioning
@@ -508,7 +526,7 @@ void paged_attention_v2(
       [reduceEnc dispatchThreadgroups:reduceGrid threadsPerThreadgroup:reduceThreadgroup];
      [reduceEnc endEncoding];
 
-
+      stream->synchronize(at::mps::SyncType::COMMIT);
     });
   }
 }
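The ">" to ">=" change in getKernelName works together with the v1 change
above: paged_attention_v1 now passes its partition_size of 0 through, so v1
kernels resolve to names ending in _ps0 rather than omitting the suffix (the
old "> 0" test would have dropped it). A hedged sketch of the naming scheme
the diff implies, compilable standalone; the _bs suffix of the main kernel and
the placement of suffixes outside the visible lines are assumptions:

    #include <iostream>
    #include <string>

    // Illustration only: mirrors the name composition visible in the diff.
    // The reduce kernel deliberately omits block_size ("_bs").
    static std::string mainKernelName(const std::string& dtype, int hs, int bs,
                                      int nt, int nsl, int ps) {
      return "paged_attention_" + dtype +
             "_hs" + std::to_string(hs) + "_bs" + std::to_string(bs) +
             "_nt" + std::to_string(nt) + "_nsl" + std::to_string(nsl) +
             "_ps" + std::to_string(ps);
    }

    static std::string reduceKernelName(const std::string& dtype, int hs,
                                        int nt, int nsl, int ps) {
      return "paged_attention_v2_reduce_" + dtype +
             "_hs" + std::to_string(hs) +
             "_nt" + std::to_string(nt) + "_nsl" + std::to_string(nsl) +
             "_ps" + std::to_string(ps);
    }

    int main() {
      // v1 path: partition_size == 0, hence the new ">= 0" check.
      std::cout << mainKernelName("float", 128, 16, 256, 32, 0) << "\n";
      // v2 path with 512-token partitions, plus its reduce kernel:
      std::cout << mainKernelName("float", 128, 16, 256, 32, 512) << "\n";
      std::cout << reduceKernelName("float", 128, 256, 32, 512) << "\n";
    }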
tests/kernels/test_attention.py
CHANGED

@@ -228,7 +228,7 @@ def test_paged_attention(
                 64,
                 0,
             ),
-            cond=(head_size == HEAD_SIZES[0] and block_size == BLOCK_SIZES[0]),
+            cond=(head_size == HEAD_SIZES[0] and block_size == BLOCK_SIZES[0] and not device.startswith("mps")),
         )
 
     elif version in ("v2", "rocm"):
@@ -291,7 +291,7 @@ def test_paged_attention(
                 64,
                 0,
             ),
-            cond=(head_size == HEAD_SIZES[0] and block_size == BLOCK_SIZES[0]),
+            cond=(head_size == HEAD_SIZES[0] and block_size == BLOCK_SIZES[0] and not device.startswith("mps")),
         )
 
     else:
@@ -336,7 +336,7 @@ def test_paged_attention(
                 k_scale,
                 v_scale,
             ),
-            cond=(head_size == HEAD_SIZES[0] and block_size == BLOCK_SIZES[0]),
+            cond=(head_size == HEAD_SIZES[0] and block_size == BLOCK_SIZES[0] and not device.startswith("mps")),
         )
 
     else:
torch-ext/torch_binding.cpp
CHANGED

@@ -15,103 +15,103 @@
 // https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/README.md#annotations
 
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
-
-
-
-
-
-
-
-
-
-
-
-
-
+  // Attention ops
+  // Compute the attention between an input query and the cached
+  // keys/values using PagedAttention.
+  ops.def(
+      "paged_attention_v1("
+      "    Tensor! out, Tensor query, Tensor key_cache,"
+      "    Tensor value_cache, int num_kv_heads, float scale,"
+      "    Tensor block_tables, Tensor seq_lens, int block_size,"
+      "    int max_seq_len, Tensor? alibi_slopes,"
+      "    str kv_cache_dtype, Tensor k_scale, Tensor v_scale,"
+      "    int tp_rank, int blocksparse_local_blocks,"
+      "    int blocksparse_vert_stride, int blocksparse_block_size,"
+      "    int blocksparse_head_sliding_step) -> ()");
 #if defined(CUDA_KERNEL) || defined(ROCM_KERNEL)
   ops.impl("paged_attention_v1", torch::kCUDA, &paged_attention_v1);
 #elif defined(METAL_KERNEL)
   ops.impl("paged_attention_v1", torch::kMPS, paged_attention_v1);
 #endif
 
-
-
-
-
-
-
-
-
-
-
-
-
+  // PagedAttention V2.
+  ops.def(
+      "paged_attention_v2("
+      "    Tensor! out, Tensor! exp_sums, Tensor! max_logits,"
+      "    Tensor! tmp_out, Tensor query, Tensor key_cache,"
+      "    Tensor value_cache, int num_kv_heads, float scale,"
+      "    Tensor block_tables, Tensor seq_lens, int block_size,"
+      "    int max_seq_len, Tensor? alibi_slopes,"
+      "    str kv_cache_dtype, Tensor k_scale, Tensor v_scale,"
+      "    int tp_rank, int blocksparse_local_blocks,"
+      "    int blocksparse_vert_stride, int blocksparse_block_size,"
+      "    int blocksparse_head_sliding_step) -> ()");
 #if defined(CUDA_KERNEL) || defined(ROCM_KERNEL)
   ops.impl("paged_attention_v2", torch::kCUDA, &paged_attention_v2);
 #elif defined(METAL_KERNEL)
   ops.impl("paged_attention_v2", torch::kMPS, paged_attention_v2);
 #endif
 
-
-
-
+  // Swap in (out) the cache blocks from src to dst.
+  ops.def(
+      "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
 #if defined(CUDA_KERNEL) || defined(ROCM_KERNEL)
   ops.impl("swap_blocks", torch::kCUDA, &swap_blocks);
 #elif defined(METAL_KERNEL)
   ops.impl("swap_blocks", torch::kMPS, swap_blocks);
 #endif
 
-
-
-
-
+  // Copy the cache blocks from src to dst.
+  ops.def(
+      "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
+      "Tensor block_mapping) -> ()");
 #if defined(CUDA_KERNEL) || defined(ROCM_KERNEL)
   ops.impl("copy_blocks", torch::kCUDA, &copy_blocks);
 #elif defined(METAL_KERNEL)
   ops.impl("copy_blocks", torch::kMPS, copy_blocks);
 #endif
 
-
-
-
-
-
-
-
+  // Reshape the key and value tensors and cache them.
+  ops.def(
+      "reshape_and_cache(Tensor key, Tensor value,"
+      "                  Tensor! key_cache, Tensor! value_cache,"
+      "                  Tensor slot_mapping,"
+      "                  str kv_cache_dtype,"
+      "                  Tensor k_scale, Tensor v_scale) -> ()");
 #if defined(CUDA_KERNEL) || defined(ROCM_KERNEL)
   ops.impl("reshape_and_cache", torch::kCUDA, &reshape_and_cache);
 #elif defined(METAL_KERNEL)
   ops.impl("reshape_and_cache", torch::kMPS, reshape_and_cache);
 #endif
 
-
-
-
-
-
-
-
-
+  // Reshape the key and value tensors and cache them.
+  ops.def(
+      "reshape_and_cache_flash(Tensor key, Tensor value,"
+      "                        Tensor! key_cache,"
+      "                        Tensor! value_cache,"
+      "                        Tensor slot_mapping,"
+      "                        str kv_cache_dtype,"
+      "                        Tensor k_scale, Tensor v_scale) -> ()");
 #if defined(CUDA_KERNEL) || defined(ROCM_KERNEL)
   ops.impl("reshape_and_cache_flash", torch::kCUDA, &reshape_and_cache_flash);
 #elif defined(METAL_KERNEL)
   ops.impl("reshape_and_cache_flash", torch::kMPS, reshape_and_cache_flash);
 #endif
 
-
-
-
+  // Gets the specified device attribute.
+  ops.def("get_device_attribute(int attribute, int device_id) -> int");
+  ops.impl("get_device_attribute", &get_device_attribute);
 
-
-
-
-
-
+  // Gets the maximum shared memory per block device attribute.
+  ops.def(
+      "get_max_shared_memory_per_block_device_attribute(int device_id) -> int");
+  ops.impl("get_max_shared_memory_per_block_device_attribute",
+           &get_max_shared_memory_per_block_device_attribute);
 
-
-
-
-
+  // Convert the key and value cache to fp8 data type.
+  ops.def(
+      "convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, "
+      "str kv_cache_dtype) -> ()");
 #if defined(CUDA_KERNEL) || defined(ROCM_KERNEL)
   ops.impl("convert_fp8", torch::kCUDA, &convert_fp8);
 #elif defined(METAL_KERNEL)