Spaces:
Sleeping
A newer version of the Gradio SDK is available: 6.13.0
HuggingFace Kernels Hub Integration Guide
Overview
The HuggingFace Kernels Hub provides a platform for sharing, discovering, and using pre-compiled CUDA kernels. This guide covers the API for loading kernels, publishing your own, and integrating them into transformers and diffusers models.
Core API
get_kernel
The primary function for loading a compiled CUDA kernel from the Hub.
from huggingface_kernels import get_kernel
# Load a kernel by name from a Hub repository
kernel = get_kernel("username/my-cuda-kernels", "rmsnorm")
# The returned object provides the kernel's Python bindings
output = kernel.rmsnorm_forward(input_tensor, weight, eps)
Signature:
def get_kernel(
repo_id: str,
kernel_name: str,
revision: str = "main",
token: Optional[str] = None,
) -> ModuleType:
"""
Load a compiled CUDA kernel from the HuggingFace Hub.
Args:
repo_id: Repository ID on the Hub (e.g., "username/cuda-kernels")
kernel_name: Name of the kernel as defined in build.toml
revision: Git revision (branch, tag, or commit hash)
token: HuggingFace API token for private repos
Returns:
A Python module with the kernel's exported functions
Raises:
KernelNotFoundError: If the kernel doesn't exist in the repo
CompilationError: If the kernel fails to compile for the current GPU
CUDANotAvailableError: If no CUDA device is available
"""
How it works:
- Downloads the kernel source from the Hub repository
- Checks for a cached compiled version matching the current GPU architecture
- If not cached, compiles the kernel for the current GPU
- Loads the compiled module and returns it
has_kernel
Check if a kernel exists without downloading or compiling it.
from huggingface_kernels import has_kernel
# Check if a specific kernel is available
if has_kernel("username/my-cuda-kernels", "rmsnorm"):
kernel = get_kernel("username/my-cuda-kernels", "rmsnorm")
# Use custom kernel
else:
# Fallback to PyTorch implementation
pass
Signature:
def has_kernel(
repo_id: str,
kernel_name: str,
revision: str = "main",
token: Optional[str] = None,
) -> bool:
"""
Check if a kernel exists in a Hub repository.
Args:
repo_id: Repository ID on the Hub
kernel_name: Name of the kernel
revision: Git revision
token: HuggingFace API token
Returns:
True if the kernel exists and is compatible with the current hardware
"""
get_local_kernel
Load a kernel from a local directory instead of the Hub. Useful during development.
from huggingface_kernels import get_local_kernel
# Load from a local path
kernel = get_local_kernel("/path/to/my-kernels", "rmsnorm")
# Use it the same way as a Hub kernel
output = kernel.rmsnorm_forward(input_tensor, weight, eps)
Signature:
def get_local_kernel(
path: str,
kernel_name: str,
force_recompile: bool = False,
) -> ModuleType:
"""
Load a compiled CUDA kernel from a local directory.
Args:
path: Local filesystem path to the kernel repository
kernel_name: Name of the kernel as defined in build.toml
force_recompile: If True, recompile even if a cached version exists
Returns:
A Python module with the kernel's exported functions
"""
Usage Examples
Example 1: Activation Kernel
import torch
from huggingface_kernels import get_kernel
# Load the GELU activation kernel
activation_kernel = get_kernel("huggingface/cuda-kernels", "gelu")
# Create test input
x = torch.randn(32, 2048, dtype=torch.bfloat16, device="cuda")
# Run the kernel
output = activation_kernel.gelu_forward(x)
# Verify against PyTorch
reference = torch.nn.functional.gelu(x)
print(f"Max diff: {(output - reference).abs().max().item():.6f}")
Example 2: Flash Attention Kernel
import torch
from huggingface_kernels import get_kernel
# Load a custom flash attention kernel
attn_kernel = get_kernel("huggingface/cuda-kernels", "flash_attention")
batch_size = 4
num_heads = 32
seq_len = 2048
head_dim = 64
# Create QKV tensors
q = torch.randn(batch_size, num_heads, seq_len, head_dim,
dtype=torch.bfloat16, device="cuda")
k = torch.randn_like(q)
v = torch.randn_like(q)
# Run custom flash attention
output = attn_kernel.flash_attention_forward(q, k, v, scale=head_dim ** -0.5)
print(f"Output shape: {output.shape}")
# Output shape: torch.Size([4, 32, 2048, 64])
Example 3: RMSNorm Kernel
import torch
from huggingface_kernels import get_kernel
# Load RMSNorm kernel
rmsnorm_kernel = get_kernel("huggingface/cuda-kernels", "rmsnorm")
# Setup
hidden_size = 4096
x = torch.randn(2, 128, hidden_size, dtype=torch.bfloat16, device="cuda")
weight = torch.ones(hidden_size, dtype=torch.bfloat16, device="cuda")
eps = 1e-6
# Forward pass
output = rmsnorm_kernel.rmsnorm_forward(x, weight, eps)
print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")
print(f"Output dtype: {output.dtype}")
Example 4: Transformers Integration
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_kernels import get_kernel
# Load model
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
torch_dtype=torch.bfloat16,
device_map="cuda",
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
# Load custom kernels
rmsnorm_kernel = get_kernel("huggingface/cuda-kernels", "rmsnorm")
# Patch RMSNorm layers
from transformers.models.llama.modeling_llama import LlamaRMSNorm
for name, module in model.named_modules():
if isinstance(module, LlamaRMSNorm):
original_forward = module.forward
def make_forward(mod):
def forward(hidden_states):
return rmsnorm_kernel.rmsnorm_forward(
hidden_states,
mod.weight,
mod.variance_epsilon,
)
return forward
module.forward = make_forward(module)
# Run inference
inputs = tokenizer("Hello, world!", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0]))
Example 5: Diffusers Integration
import torch
from diffusers import StableDiffusion3Pipeline
from diffusers.models.normalization import RMSNorm
from huggingface_kernels import get_kernel
# Load pipeline
pipe = StableDiffusion3Pipeline.from_pretrained(
"stabilityai/stable-diffusion-3-medium",
torch_dtype=torch.bfloat16,
)
# Load kernels
rmsnorm_kernel = get_kernel("huggingface/cuda-kernels", "rmsnorm")
geglu_kernel = get_kernel("huggingface/cuda-kernels", "geglu")
# Patch RMSNorm
for name, module in pipe.transformer.named_modules():
if isinstance(module, RMSNorm):
def make_forward(mod):
def forward(hidden_states):
if mod.weight is not None:
return rmsnorm_kernel.rmsnorm_forward(
hidden_states, mod.weight, mod.eps
)
else:
return rmsnorm_kernel.rmsnorm_no_weight_forward(
hidden_states, mod.eps
)
return forward
module.forward = make_forward(module)
# Move to GPU and run
pipe.to("cuda")
image = pipe("A photo of a cat").images[0]
image.save("output.png")
Publishing Kernels
Repository Structure
my-cuda-kernels/
├── build.toml           # Build configuration (required)
├── src/
│   ├── rmsnorm.cu       # CUDA source files
│   ├── gelu.cu
│   └── rope.cu
├── python/
│   └── bindings.py      # Optional Python wrappers
├── tests/
│   ├── test_rmsnorm.py
│   └── test_gelu.py
└── README.md            # Documentation
build.toml Configuration
The build.toml file defines how kernels are compiled:
[build]
# CUDA toolkit version to use
cuda-version = "12.4"
# Target GPU architectures
# sm_75 = T4 (Turing)
# sm_80 = A100 (Ampere)
# sm_86 = A10G (Ampere)
# sm_89 = L4 (Ada Lovelace)
# sm_90 = H100 (Hopper)
cuda-capabilities = ["8.0", "9.0"]
# Optional: Extra compiler flags
extra-cuda-flags = ["--use_fast_math"]
# Optional: C++ standard
cpp-standard = "c++17"
# Define individual kernels
[kernel.rmsnorm]
src = ["src/rmsnorm.cu"]
[kernel.gelu]
src = ["src/gelu.cu"]
[kernel.geglu]
src = ["src/geglu.cu"]
[kernel.rope]
src = ["src/rope.cu"]
# Kernel with multiple source files
[kernel.fused_attention]
src = [
"src/fused_attention.cu",
"src/attention_utils.cu",
]
# Kernel with extra dependencies
[kernel.flash_attention]
src = ["src/flash_attention.cu"]
extra-include-dirs = ["include/"]
extra-cuda-flags = ["--maxrregcount=128"]
Publishing to the Hub
# Install the huggingface_hub CLI
pip install huggingface_hub
# Login
huggingface-cli login
# Create a new repository
huggingface-cli repo create my-cuda-kernels --type model
# Clone and add your files
git clone https://huggingface.co/username/my-cuda-kernels
cd my-cuda-kernels
# Copy your files...
# Push
git add .
git commit -m "Initial kernel release"
git push
Versioning
Use git tags for versioning your kernels:
git tag v1.0.0
git push origin v1.0.0
Then users can pin to a specific version:
kernel = get_kernel("username/my-cuda-kernels", "rmsnorm", revision="v1.0.0")
build.toml Configuration Reference
Top-Level Build Section
[build]
# Required: CUDA toolkit version
cuda-version = "12.4" # Options: "11.8", "12.1", "12.4"
# Required: Target architectures
cuda-capabilities = ["8.0", "9.0"]
# Optional: Extra CUDA compiler flags
extra-cuda-flags = [
"--use_fast_math", # Fast math approximations
"--maxrregcount=128", # Limit registers per thread
"-lineinfo", # Include line info for profiling
"--ptxas-options=-v", # Verbose PTX assembler output
]
# Optional: C++ standard (default: c++17)
cpp-standard = "c++17"
# Optional: Extra include directories (relative to repo root)
extra-include-dirs = ["include/", "third_party/cutlass/include/"]
# Optional: Extra libraries to link
extra-libs = ["cublas"]
Kernel Section
[kernel.NAME]
# Required: Source files (relative to repo root)
src = ["src/kernel.cu"]
# Optional: Override global settings for this kernel
extra-cuda-flags = ["--maxrregcount=64"]
extra-include-dirs = ["include/special/"]
Multi-Architecture Configuration
[build]
cuda-version = "12.4"
# Support multiple GPU architectures
cuda-capabilities = [
"7.5", # T4 (Turing)
"8.0", # A100 (Ampere)
"8.6", # A10G (Ampere)
"8.9", # L4 (Ada Lovelace)
"9.0", # H100 (Hopper)
]
Note: Each additional architecture increases compile time and binary size. Only include architectures you need to support.
Available Community Kernels
The following are examples of kernels available on the Hub (check the Hub for the latest list):
| Repository | Kernels | Target GPUs | Description |
|---|---|---|---|
| huggingface/cuda-kernels | rmsnorm, gelu, geglu, rope | H100, A100 | Official HF kernel collection |
| huggingface/flash-attention | flash_attn_v2 | H100, A100 | Flash Attention 2 implementation |
| community/fused-ops | fused_rmsnorm_linear, fused_geglu | H100 | Fused operation kernels |
| community/quantization | int8_gemm, int4_dequant | A100, H100 | Quantization kernels |
| community/video-kernels | temporal_attention, spatial_rope | H100 | Video model kernels |
Discovering Kernels
from huggingface_hub import HfApi
api = HfApi()
# Search for kernel repositories
results = api.list_models(
search="cuda-kernels",
sort="downloads",
direction=-1,
)
for model in results:
print(f"{model.modelId}: {model.downloads} downloads")
Advanced: Custom Kernel Development Workflow
Step 1: Create Build Configuration
# build.toml
[build]
cuda-version = "12.4"
cuda-capabilities = ["9.0"] # Start with your dev GPU
[kernel.my_kernel]
src = ["src/my_kernel.cu"]
Step 2: Write the CUDA Kernel
// src/my_kernel.cu
#include <torch/extension.h>
#include <cuda_bf16.h>
__global__ void my_kernel_impl(
const __nv_bfloat16* input,
__nv_bfloat16* output,
int n
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
float val = __bfloat162float(input[idx]);
// Your computation here
output[idx] = __float2bfloat16(val);
}
}
torch::Tensor my_kernel_forward(torch::Tensor input) {
auto output = torch::empty_like(input);
int n = input.numel();
int block_size = 256;
int grid_size = (n + block_size - 1) / block_size;
my_kernel_impl<<<grid_size, block_size>>>(
reinterpret_cast<const __nv_bfloat16*>(input.data_ptr()),
reinterpret_cast<__nv_bfloat16*>(output.data_ptr()),
n
);
return output;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("my_kernel_forward", &my_kernel_forward, "My custom kernel");
}
Step 3: Test Locally
from huggingface_kernels import get_local_kernel
import torch
kernel = get_local_kernel("./", "my_kernel")
x = torch.randn(1024, dtype=torch.bfloat16, device="cuda")
output = kernel.my_kernel_forward(x)
print(f"Output shape: {output.shape}, dtype: {output.dtype}")
Step 4: Benchmark
import torch
import time
kernel = get_local_kernel("./", "my_kernel", force_recompile=True)
x = torch.randn(1024 * 1024, dtype=torch.bfloat16, device="cuda")
# Warmup
for _ in range(10):
kernel.my_kernel_forward(x)
torch.cuda.synchronize()
# Benchmark
start = time.perf_counter()
for _ in range(1000):
kernel.my_kernel_forward(x)
torch.cuda.synchronize()
elapsed = time.perf_counter() - start
print(f"Average: {elapsed / 1000 * 1e6:.1f} us")
Step 5: Publish
# Add build.toml and source files to your Hub repo
# Then push as shown in the Publishing section above
Error Handling
Common Errors
from huggingface_kernels import get_kernel, KernelNotFoundError, CompilationError
try:
kernel = get_kernel("username/my-kernels", "rmsnorm")
except KernelNotFoundError:
print("Kernel not found in repository")
# Fallback to PyTorch implementation
except CompilationError as e:
print(f"Compilation failed: {e}")
# Check CUDA version, GPU architecture compatibility
except RuntimeError as e:
if "CUDA" in str(e):
print("CUDA not available")
raise
Debugging Compilation Issues
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU: {torch.cuda.get_device_name()}")
print(f"Compute capability: {torch.cuda.get_device_capability()}")
Summary
- Use `get_kernel()` to load pre-compiled kernels from the Hub
- Use `has_kernel()` to check availability before loading
- Use `get_local_kernel()` for development and testing
- Configure builds with `build.toml`, specifying the CUDA version and target architectures
- Publish kernels to the Hub for community sharing
- Always provide fallbacks for environments where custom kernels are not available