drbh committed on
Commit
dda32b8
·
0 Parent(s):

feat: basic template

Browse files
.gitattributes ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ *.so filter=lfs diff=lfs merge=lfs -text
2
+ *.pth filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ build
README.md ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # __KERNEL_NAME__
2
+
3
+ A custom kernel for PyTorch.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install __REPO_ID__
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```python
14
+ import torch
15
+ from __KERNEL_NAME_NORMALIZED__ import __KERNEL_NAME_NORMALIZED__
16
+
17
+ # Create input tensor
18
+ x = torch.randn(1024, 1024, device="cuda")
19
+
20
+ # Run kernel
21
+ result = __KERNEL_NAME_NORMALIZED__(x)
22
+ ```
23
+
24
+ ## Development
25
+
26
+ ### Building
27
+
28
+ ```bash
29
+ nix develop
30
+ nix run .#build-and-copy
31
+ ```
32
+
33
+ ### Testing
34
+
35
+ ```bash
36
+ nix develop .#test
37
+ pytest tests/
38
+ ```
39
+
40
+ ### Test as a `kernels` user
41
+
42
+ ```bash
43
+ uv run example.py
44
+ ```
45
+
46
+ ## License
47
+
48
+ Apache 2.0
__KERNEL_NAME_NORMALIZED___cpu/__KERNEL_NAME_NORMALIZED___cpu.cpp ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <torch/all.h>
2
+
3
+ void __KERNEL_NAME_NORMALIZED__(torch::Tensor &out, torch::Tensor const &input) {
4
+ TORCH_CHECK(out.dtype() == torch::kFloat32, "Output tensor must be float32");
5
+ TORCH_CHECK(input.dtype() == torch::kFloat32, "Input tensor must be float32");
6
+ TORCH_CHECK(out.numel() == input.numel(), "Tensors must have same size");
7
+
8
+ const float* in_ptr = input.data_ptr<float>();
9
+ float* out_ptr = out.data_ptr<float>();
10
+ int64_t n = input.numel();
11
+
12
+ for (int64_t i = 0; i < n; ++i) {
13
+ out_ptr[i] = in_ptr[i] + 1.0f;
14
+ }
15
+ }
__KERNEL_NAME_NORMALIZED___cuda/__KERNEL_NAME_NORMALIZED__.cu ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <torch/all.h>

// Elementwise "add one" CUDA kernel: one thread per element.
// `n` is 64-bit so tensors with more than INT_MAX elements index correctly
// (the previous `int` count overflowed for such tensors).
__global__ void __KERNEL_NAME_NORMALIZED___kernel(float *__restrict__ out,
                                                  float const *__restrict__ input,
                                                  const int64_t n) {
  // Widen before multiplying: blockIdx.x * blockDim.x alone can overflow int.
  int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (idx < n) {
    out[idx] = input[idx] + 1.0f;
  }
}

// Host launcher: validates the tensors, then dispatches the kernel on the
// current PyTorch CUDA stream for the input's device.
void __KERNEL_NAME_NORMALIZED__(torch::Tensor &out, torch::Tensor const &input) {
  TORCH_CHECK(input.device().is_cuda(), "input must be a CUDA tensor");
  TORCH_CHECK(input.is_contiguous(), "input must be contiguous");
  // The raw-pointer kernel also requires a contiguous destination.
  TORCH_CHECK(out.is_contiguous(), "output must be contiguous");
  TORCH_CHECK(input.scalar_type() == at::ScalarType::Float,
              "__KERNEL_NAME_NORMALIZED__ only supports float32");
  TORCH_CHECK(input.sizes() == out.sizes(),
              "Tensors must have the same shape");
  TORCH_CHECK(input.scalar_type() == out.scalar_type(),
              "Tensors must have the same dtype");
  TORCH_CHECK(input.device() == out.device(),
              "Tensors must be on the same device");

  int64_t n = input.numel();
  if (n == 0) {
    // Launching a grid of 0 blocks is a CUDA configuration error; an empty
    // tensor is a valid no-op input.
    return;
  }

  constexpr int threads = 256;
  int64_t blocks = (n + threads - 1) / threads;

  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  __KERNEL_NAME_NORMALIZED___kernel<<<static_cast<unsigned int>(blocks), threads, 0, stream>>>(
      out.data_ptr<float>(), input.data_ptr<float>(), n);
}
__KERNEL_NAME_NORMALIZED___metal/__KERNEL_NAME_NORMALIZED__.metal ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#include <metal_stdlib>
using namespace metal;

// Elementwise increment for the float32 path: output[i] = input[i] + 1.
// One thread is dispatched per tensor element by the host launcher.
kernel void __KERNEL_NAME_NORMALIZED___forward_kernel_float(
    device const float *input [[buffer(0)]],
    device float *output [[buffer(1)]],
    uint gid [[thread_position_in_grid]]) {
  output[gid] = input[gid] + 1.0f;
}

// Elementwise increment for the float16 path.
kernel void __KERNEL_NAME_NORMALIZED___forward_kernel_half(
    device const half *input [[buffer(0)]],
    device half *output [[buffer(1)]],
    uint gid [[thread_position_in_grid]]) {
  output[gid] = input[gid] + half(1.0);
}
__KERNEL_NAME_NORMALIZED___metal/__KERNEL_NAME_NORMALIZED__.mm ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#include <torch/torch.h>

#import <Foundation/Foundation.h>
#import <Metal/Metal.h>

// The compiled .metallib is embedded into the extension at build time and
// exposed through this generated header; fail loudly if the build system
// did not provide it.
#ifdef EMBEDDED_METALLIB_HEADER
#include EMBEDDED_METALLIB_HEADER
#else
#error "EMBEDDED_METALLIB_HEADER not defined"
#endif

// Reinterprets an MPS tensor's storage pointer as the underlying MTLBuffer.
// NOTE(review): relies on PyTorch backing MPS storage with Metal buffers —
// confirm against the targeted PyTorch version.
static inline id<MTLBuffer> getMTLBufferStorage(const torch::Tensor &tensor) {
  return __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
}

// Elementwise "add one" on MPS: selects the float or half variant of the
// embedded Metal compute kernel and dispatches one thread per element.
void __KERNEL_NAME_NORMALIZED__(torch::Tensor &out, torch::Tensor const &input) {
  TORCH_CHECK(input.device().is_mps(), "input must be a MPS tensor");
  TORCH_CHECK(input.is_contiguous(), "input must be contiguous");
  TORCH_CHECK(input.scalar_type() == torch::kFloat ||
                  input.scalar_type() == torch::kHalf,
              "only float32 and float16 supported");
  TORCH_CHECK(input.sizes() == out.sizes(), "Tensors must have same shape");
  TORCH_CHECK(input.scalar_type() == out.scalar_type(), "Tensors must have same dtype");
  TORCH_CHECK(input.device() == out.device(), "Tensors must be on same device");

  @autoreleasepool {
    // NOTE(review): assumes the system default Metal device is the one
    // backing the MPS tensors — confirm; multi-GPU Macs may differ.
    id<MTLDevice> device = MTLCreateSystemDefaultDevice();
    // NOTE(review): `int` truncates numel for tensors with > INT_MAX elements.
    int numThreads = input.numel();

    NSError *error = nil;
    // NOTE(review): the library and pipeline state are rebuilt on every call;
    // consider caching them (e.g. dispatch_once) if this path becomes hot.
    id<MTLLibrary> library = EMBEDDED_METALLIB_NAMESPACE::createLibrary(device, &error);
    TORCH_CHECK(library, "Failed to create Metal library: ",
                error.localizedDescription.UTF8String);

    // Pick the dtype-specialized kernel compiled from the .metal source.
    std::string kernel_name = std::string("__KERNEL_NAME_NORMALIZED___forward_kernel_") +
        (input.scalar_type() == torch::kFloat ? "float" : "half");
    id<MTLFunction> func = [library newFunctionWithName:
        [NSString stringWithUTF8String:kernel_name.c_str()]];
    TORCH_CHECK(func, "Failed to create function: ", kernel_name.c_str());

    id<MTLComputePipelineState> pso =
        [device newComputePipelineStateWithFunction:func error:&error];
    TORCH_CHECK(pso, error.localizedDescription.UTF8String);

    // Encode onto PyTorch's MPS command buffer from its serial dispatch queue
    // so this kernel is ordered correctly with other queued MPS work.
    id<MTLCommandBuffer> cmdBuf = torch::mps::get_command_buffer();
    dispatch_sync(torch::mps::get_dispatch_queue(), ^() {
      id<MTLComputeCommandEncoder> encoder = [cmdBuf computeCommandEncoder];
      [encoder setComputePipelineState:pso];
      // Byte offsets account for tensors that are views into larger storage.
      [encoder setBuffer:getMTLBufferStorage(input)
                  offset:input.storage_offset() * input.element_size()
                 atIndex:0];
      [encoder setBuffer:getMTLBufferStorage(out)
                  offset:out.storage_offset() * out.element_size()
                 atIndex:1];

      // One thread per element; threadgroup size capped by the pipeline limit.
      NSUInteger tgSize = MIN(pso.maxTotalThreadsPerThreadgroup, (NSUInteger)numThreads);
      [encoder dispatchThreads:MTLSizeMake(numThreads, 1, 1)
         threadsPerThreadgroup:MTLSizeMake(tgSize, 1, 1)];
      [encoder endEncoding];
      torch::mps::commit();
    });
  }
}
__KERNEL_NAME_NORMALIZED___xpu/__KERNEL_NAME_NORMALIZED__.cpp ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#include <sycl/sycl.hpp>
#include <torch/torch.h>

// Elementwise "add one" for the XPU (SYCL) backend: out[i] = input[i] + 1.0f.
void __KERNEL_NAME_NORMALIZED__(torch::Tensor& out, const torch::Tensor& input) {
  TORCH_CHECK(input.device().is_xpu(), "input must be a XPU tensor");
  TORCH_CHECK(input.is_contiguous(), "input must be contiguous");
  TORCH_CHECK(input.scalar_type() == torch::kFloat, "only float32 supported");
  TORCH_CHECK(input.sizes() == out.sizes(), "Tensors must have same shape");
  TORCH_CHECK(input.scalar_type() == out.scalar_type(), "Tensors must have same dtype");
  TORCH_CHECK(input.device() == out.device(), "Tensors must be on same device");

  // NOTE(review): a default-constructed queue targets the default-selected
  // SYCL device/context, which is not guaranteed to match the device and
  // context the tensors were allocated on — verify, and consider obtaining
  // the current XPU stream's queue from PyTorch instead.
  sycl::queue queue;
  auto input_ptr = input.data_ptr<float>();
  auto output_ptr = out.data_ptr<float>();
  auto n = input.numel();

  // One work-item per element; .wait() blocks until the kernel completes,
  // making this launch synchronous.
  queue.parallel_for(sycl::range<1>(n), [=](sycl::id<1> idx) {
    output_ptr[idx[0]] = input_ptr[idx[0]] + 1.0f;
  }).wait();
}
build.toml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [general]
2
+ backends = [
3
+ "cpu",
4
+ "cuda",
5
+ "metal",
6
+ "rocm",
7
+ "xpu",
8
+ ]
9
+ name = "__KERNEL_NAME__"
10
+ version = 1
11
+
12
+ [torch]
13
+ src = [
14
+ "torch-ext/torch_binding.cpp",
15
+ "torch-ext/torch_binding.h",
16
+ ]
17
+
18
+ [kernel.__KERNEL_NAME_NORMALIZED__]
19
+ backend = "cuda"
20
+ depends = ["torch"]
21
+ src = ["__KERNEL_NAME_NORMALIZED___cuda/__KERNEL_NAME_NORMALIZED__.cu"]
22
+
23
+ [kernel.__KERNEL_NAME_NORMALIZED___metal]
24
+ backend = "metal"
25
+ depends = ["torch"]
26
+ src = [
27
+ "__KERNEL_NAME_NORMALIZED___metal/__KERNEL_NAME_NORMALIZED__.mm",
28
+ "__KERNEL_NAME_NORMALIZED___metal/__KERNEL_NAME_NORMALIZED__.metal",
29
+ ]
30
+
31
+ [kernel.__KERNEL_NAME_NORMALIZED___rocm]
32
+ backend = "rocm"
33
+ depends = ["torch"]
34
+ rocm-archs = [
35
+ "gfx906",
36
+ "gfx908",
37
+ "gfx90a",
38
+ "gfx940",
39
+ "gfx941",
40
+ "gfx942",
41
+ "gfx1030",
42
+ "gfx1100",
43
+ "gfx1101",
44
+ ]
45
+ src = ["__KERNEL_NAME_NORMALIZED___cuda/__KERNEL_NAME_NORMALIZED__.cu"]
46
+
47
+ [kernel.__KERNEL_NAME_NORMALIZED___xpu]
48
+ backend = "xpu"
49
+ depends = ["torch"]
50
+ src = ["__KERNEL_NAME_NORMALIZED___xpu/__KERNEL_NAME_NORMALIZED__.cpp"]
51
+
52
+ [kernel.__KERNEL_NAME_NORMALIZED___cpu]
53
+ backend = "cpu"
54
+ depends = ["torch"]
55
+ src = ["__KERNEL_NAME_NORMALIZED___cpu/__KERNEL_NAME_NORMALIZED___cpu.cpp"]
example.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# /// script
# requires-python = ">=3.13"
# dependencies = [
#   "kernels",
#   "torch",
# ]
# ///

import platform
from pathlib import Path

import kernels
import torch

# Load the locally built kernel
kernel = kernels.get_local_kernel(Path("build"), "__KERNEL_NAME_NORMALIZED__")

# Select device. The MPS branch checks availability explicitly: a macOS host
# without a usable MPS backend (older hardware, CPU-only CI) would otherwise
# be handed an "mps" device that fails at first use.
if platform.system() == "Darwin" and torch.backends.mps.is_available():
    device = torch.device("mps")
elif hasattr(torch, "xpu") and torch.xpu.is_available():
    device = torch.device("xpu")
elif torch.version.cuda is not None and torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

# Create input tensor
x = torch.tensor([1.0, 2.0, 3.0], device=device)
print(f"Input: {x}")

# Run kernel (adds 1 to each element)
result = kernel.__KERNEL_NAME_NORMALIZED__(x)
print(f"Output: {result}")

# Verify result against the reference computation
expected = x + 1.0
assert torch.allclose(result, expected), "Kernel output doesn't match expected!"
print("Success!")
flake.nix ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
{
  inputs = {
    # NOTE(review): verify this URL — the Nix kernel builder has historically
    # lived at github:huggingface/kernel-builder; confirm that this flake
    # actually exposes `lib.genKernelFlakeOutputs`.
    kernel-builder.url = "github:huggingface/kernels";
  };
  outputs =
    { self, kernel-builder, ... }:
    # Generate the standard kernel flake outputs (dev shells, build targets)
    # for the project rooted at this directory.
    kernel-builder.lib.genKernelFlakeOutputs {
      inherit self;
      path = ./.;
    };
}
tests/__init__.py ADDED
File without changes
tests/test___KERNEL_NAME_NORMALIZED__.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import platform

import torch

import __KERNEL_NAME_NORMALIZED__


def test___KERNEL_NAME_NORMALIZED__():
    """End-to-end check: the kernel must add exactly 1.0 to every element."""
    # Select the best available device, falling back to CPU. The MPS branch
    # checks availability explicitly: a macOS host without a usable MPS
    # backend (CPU-only CI, older hardware) would otherwise fail at first use.
    if platform.system() == "Darwin" and torch.backends.mps.is_available():
        device = torch.device("mps")
    elif hasattr(torch, "xpu") and torch.xpu.is_available():
        device = torch.device("xpu")
    elif torch.version.cuda is not None and torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    x = torch.randn(1024, 1024, dtype=torch.float32, device=device)
    expected = x + 1.0
    result = __KERNEL_NAME_NORMALIZED__.__KERNEL_NAME_NORMALIZED__(x)
    torch.testing.assert_close(result, expected)
torch-ext/__KERNEL_NAME_NORMALIZED__/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import Optional

import torch

from ._ops import ops


def __KERNEL_NAME_NORMALIZED__(x: torch.Tensor, out: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Compute ``x + 1`` elementwise via the registered native op.

    Args:
        x: Input tensor.
        out: Optional pre-allocated destination tensor; when omitted, a fresh
            tensor shaped like ``x`` is allocated.

    Returns:
        The tensor holding the result (``out`` itself when provided).
    """
    destination = torch.empty_like(x) if out is None else out
    ops.__KERNEL_NAME_NORMALIZED__(destination, x)
    return destination
torch-ext/torch_binding.cpp ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <torch/library.h>
2
+
3
+ #include "registration.h"
4
+ #include "torch_binding.h"
5
+
6
+ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
7
+ ops.def("__KERNEL_NAME_NORMALIZED__(Tensor! out, Tensor input) -> ()");
8
+ #if defined(CPU_KERNEL)
9
+ ops.impl("__KERNEL_NAME_NORMALIZED__", torch::kCPU, &__KERNEL_NAME_NORMALIZED__);
10
+ #elif defined(CUDA_KERNEL) || defined(ROCM_KERNEL)
11
+ ops.impl("__KERNEL_NAME_NORMALIZED__", torch::kCUDA, &__KERNEL_NAME_NORMALIZED__);
12
+ #elif defined(METAL_KERNEL)
13
+ ops.impl("__KERNEL_NAME_NORMALIZED__", torch::kMPS, __KERNEL_NAME_NORMALIZED__);
14
+ #elif defined(XPU_KERNEL)
15
+ ops.impl("__KERNEL_NAME_NORMALIZED__", torch::kXPU, &__KERNEL_NAME_NORMALIZED__);
16
+ #endif
17
+ }
18
+
19
+ REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
torch-ext/torch_binding.h ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
#pragma once

#include <torch/torch.h>

/// Elementwise kernel entry point: fills `out` with `input + 1`.
/// `out` must be pre-allocated; the backend implementations check that it
/// matches `input` in size, dtype, and device.
void __KERNEL_NAME_NORMALIZED__(torch::Tensor &out, torch::Tensor const &input);