test / skill_example /scripts /huggingface_kernels_example.py
Jack-Khuu
Demo
88a1dd2
#!/usr/bin/env python3
"""
Example: Using HuggingFace Kernels library to load and use optimized CUDA kernels.
This script demonstrates how to:
1. Load kernels from the HuggingFace Hub using get_kernel()
2. Check kernel availability with has_kernel()
3. Integrate a Hub kernel into a PyTorch model by patching its RMSNorm layers (the same pattern one would use for transformers/diffusers models)
Requirements:
pip install kernels torch numpy
Usage:
python huggingface_kernels_example.py
"""
import time
from typing import Optional
import torch
import torch.nn as nn
def check_environment():
    """Print the PyTorch version and CUDA status, followed by a blank line.

    The CUDA version, GPU name and compute capability are reported only when
    ``torch.cuda.is_available()`` is true.
    """
    rule = "=" * 60
    cuda_ready = torch.cuda.is_available()

    report = [
        rule,
        "Environment",
        rule,
        f"PyTorch: {torch.__version__}",
        f"CUDA available: {cuda_ready}",
    ]
    if cuda_ready:
        # Device queries are only attempted once CUDA is reported available.
        report += [
            f"CUDA version: {torch.version.cuda}",
            f"GPU: {torch.cuda.get_device_name()}",
            f"GPU capability: {torch.cuda.get_device_capability()}",
        ]
    report.append("")  # trailing blank line separating this section from the next

    for line in report:
        print(line)
def demo_basic_kernel_loading():
    """Run Part 1 of the demo: load the activation kernel and smoke-test it.

    Returns:
        The object returned by ``get_kernel`` when the availability check,
        the load and the ``gelu_fast`` call all succeed. ``None`` when the
        ``kernels`` package cannot be imported, when ``has_kernel`` reports
        no compatible build, or when any other exception is raised (the
        exception message is printed instead of propagating).
    """
    divider = "=" * 60
    print(divider)
    print("Part 1: Basic Kernel Loading")
    print(divider)

    try:
        from kernels import get_kernel, has_kernel

        repo_id = "kernels-community/activation"
        print(f"\n1. Checking kernel availability: {repo_id}")

        # Guard clause: stop early when no usable build is reported.
        if not has_kernel(repo_id):
            print(" No compatible build available for this environment")
            return None
        print(" Kernel is available for this environment")

        print("\n2. Loading kernel from Hub...")
        activation = get_kernel(repo_id, version=1)

        print("\n3. Available functions:")
        public_names = [name for name in dir(activation) if not name.startswith('_')]
        # Show at most ten names and summarise whatever is left over.
        for name in public_names[:10]:
            print(f" - {name}")
        remaining = len(public_names) - 10
        if remaining > 0:
            print(f" ... and {remaining} more")

        print("\n4. Testing gelu_fast kernel...")
        x = torch.randn((4, 4), dtype=torch.float16, device="cuda")
        y = torch.empty_like(x)
        # The kernel is called with the output buffer first, then the input.
        activation.gelu_fast(y, x)
        print(f" Input shape: {x.shape}")
        print(f" Output shape: {y.shape}")
        print(" Success!")
        return activation
    except ImportError:
        print("\n kernels library not installed. Install with: pip install kernels")
        return None
    except Exception as e:
        # Demo-level boundary: report the failure and let later parts be skipped.
        print(f"\n Error: {e}")
        return None
def _mean_cuda_ms(fn, iterations):
    """Return the mean wall-clock time of one ``fn()`` call, in milliseconds.

    ``fn`` is called ``iterations`` times back to back, then
    ``torch.cuda.synchronize()`` is called once before the clock is read, so
    the measurement covers the whole batch of calls.
    """
    start = time.perf_counter()
    for _ in range(iterations):
        fn()
    torch.cuda.synchronize()
    return (time.perf_counter() - start) / iterations * 1000


def demo_benchmark(activation_kernel):
    """Benchmark the Hub ``gelu_fast`` kernel against ``torch.nn.functional.gelu``.

    For each of three float16 CUDA tensor shapes, both implementations are
    warmed up, timed over 100 iterations, and the mean latency per call plus
    the resulting speedup are printed.

    Args:
        activation_kernel: Object exposing ``gelu_fast(out, x)``, or ``None``
            to skip the benchmark (a notice is printed instead).

    Returns:
        None.
    """
    print("\n" + "=" * 60)
    print("Part 2: Benchmark Hub Kernel vs PyTorch")
    print("=" * 60)
    if activation_kernel is None:
        print(" Skipping (kernel not loaded)")
        return

    sizes = [(1024, 2048), (4096, 4096), (8192, 8192)]
    warmup = 5
    iterations = 100  # loop-invariant, so defined once outside the size loop

    # NOTE(review): gelu_fast is presumably a tanh-style approximation while
    # F.gelu defaults to the exact formulation -- confirm the two are meant to
    # be compared directly (F.gelu accepts approximate="tanh").
    for size in sizes:
        x = torch.randn(size, dtype=torch.float16, device="cuda")
        # Only the Hub kernel writes into a caller-provided buffer; F.gelu
        # returns a fresh tensor, so no output buffer is allocated for it.
        y_hub = torch.empty_like(x)

        for _ in range(warmup):
            activation_kernel.gelu_fast(y_hub, x)
            torch.nn.functional.gelu(x)
        # Make sure warm-up work has finished before any timing starts.
        torch.cuda.synchronize()

        hub_time = _mean_cuda_ms(
            lambda: activation_kernel.gelu_fast(y_hub, x), iterations
        )
        torch_time = _mean_cuda_ms(
            lambda: torch.nn.functional.gelu(x), iterations
        )

        # Guard against a zero denominator on a degenerate measurement.
        speedup = torch_time / hub_time if hub_time > 0 else float("inf")

        print(f"\n Shape {size}:")
        print(f" Hub kernel: {hub_time:.4f} ms")
        print(f" PyTorch: {torch_time:.4f} ms")
        print(f" Speedup: {speedup:.2f}x")
def demo_model_integration():
    """Run Part 3 of the demo: route a model's RMSNorm layers through a Hub kernel.

    Loads ``kernels-community/triton-layer-norm``, lists its public names,
    builds a small RMSNorm + Linear model in bfloat16 on CUDA, replaces the
    ``forward`` of every ``nn.RMSNorm`` module with a wrapper around the
    kernel, and runs one forward pass under ``torch.inference_mode()``.

    Returns:
        None. A missing ``kernels`` package, an unavailable kernel build, or
        any other exception results in a printed message, not a raise.
    """
    print("\n" + "=" * 60)
    print("Part 3: Integration with Models")
    print("=" * 60)
    try:
        from kernels import get_kernel, has_kernel

        repo_id = "kernels-community/triton-layer-norm"
        if not has_kernel(repo_id):
            print(f" {repo_id} not available, skipping")
            return

        print(f"\n1. Loading {repo_id}...")
        layer_norm = get_kernel(repo_id)

        print("\n2. Available functions:")
        functions = [f for f in dir(layer_norm) if not f.startswith('_')]
        for func in functions:
            print(f" - {func}")

        class SimpleModel(nn.Module):
            """Minimal model: an RMSNorm followed by a square Linear layer."""

            def __init__(self, hidden_size=2048):
                super().__init__()
                self.norm = nn.RMSNorm(hidden_size)
                self.linear = nn.Linear(hidden_size, hidden_size)

            def forward(self, x):
                x = self.norm(x)
                x = self.linear(x)
                return x

        print("\n3. Creating model and patching RMSNorm...")
        model = SimpleModel().cuda().to(torch.bfloat16)

        def patch_rmsnorm(model, kernel):
            """Replace ``forward`` on each ``nn.RMSNorm`` in ``model``.

            The previous ``forward`` is kept on the module as
            ``_original_forward`` and is used when ``kernel`` exposes neither
            ``rms_norm`` nor ``rmsnorm``.
            """
            # Resolve the kernel entry point once instead of on every forward
            # call; ``rms_norm`` takes precedence over ``rmsnorm``.
            kernel_fn = next(
                (
                    getattr(kernel, attr)
                    for attr in ('rms_norm', 'rmsnorm')
                    if hasattr(kernel, attr)
                ),
                None,
            )

            def make_forward(mod, epsilon):
                # A factory binds ``mod``/``epsilon`` per module, avoiding the
                # late-binding closure pitfall inside the loop below.
                def forward(x):
                    if kernel_fn is None:
                        return mod._original_forward(x)
                    # nn.RMSNorm leaves ``eps`` as None unless one is given and
                    # then uses torch.finfo(x.dtype).eps; mirror that so the
                    # kernel is never handed eps=None.
                    eps = epsilon if epsilon is not None else torch.finfo(x.dtype).eps
                    return kernel_fn(x, mod.weight, eps=eps)

                return forward

            for name, module in model.named_modules():
                if not isinstance(module, nn.RMSNorm):
                    continue
                module._original_forward = module.forward
                module.forward = make_forward(module, module.eps)
                print(f" Patched: {name}")

        patch_rmsnorm(model, layer_norm)

        print("\n4. Testing forward pass...")
        x = torch.randn(2, 1024, 2048, dtype=torch.bfloat16, device="cuda")
        with torch.inference_mode():
            y = model(x)
        print(f" Input: {x.shape}")
        print(f" Output: {y.shape}")
        print(" Success!")
    except ImportError:
        print(" kernels library not installed")
    except Exception as e:
        # Demo-level boundary: report the failure without aborting the script.
        print(f" Error: {e}")
def main():
    """Print the banner, report the environment, and run the three demo parts.

    When CUDA is unavailable a notice is printed and the demo parts are
    skipped.
    """
    separator = "=" * 60
    for line in (separator, "HuggingFace Kernels Integration Example", separator):
        print(line)
    check_environment()

    if torch.cuda.is_available():
        # Part 1 hands its loaded kernel (or None) to the Part 2 benchmark.
        kernel = demo_basic_kernel_loading()
        demo_benchmark(kernel)
        demo_model_integration()
        print("\n" + separator)
        print("Done!")
        print(separator)
    else:
        print("CUDA not available. This example requires a GPU.")
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()