Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Example: Using HuggingFace Kernels library to load and use optimized CUDA kernels. | |
| This script demonstrates how to: | |
| 1. Load kernels from the HuggingFace Hub using get_kernel() | |
| 2. Check kernel availability with has_kernel() | |
| 3. Integrate Hub kernels with transformers/diffusers models | |
| Requirements: | |
| pip install kernels torch numpy | |
| Usage: | |
| python huggingface_kernels_example.py | |
| """ | |
| import time | |
| from typing import Optional | |
| import torch | |
| import torch.nn as nn | |
def check_environment():
    """Print PyTorch/CUDA environment details to aid debugging."""
    banner = "=" * 60
    print(banner)
    print("Environment")
    print(banner)
    print(f"PyTorch: {torch.__version__}")
    cuda_ok = torch.cuda.is_available()
    print(f"CUDA available: {cuda_ok}")
    if cuda_ok:
        # Only query device properties when a GPU is actually present.
        print(f"CUDA version: {torch.version.cuda}")
        print(f"GPU: {torch.cuda.get_device_name()}")
        print(f"GPU capability: {torch.cuda.get_device_capability()}")
    print()
def demo_basic_kernel_loading():
    """Demonstrate loading and running an activation kernel from the Hub.

    Returns:
        The loaded kernel module on success, or None when the kernels
        library is not installed, no compatible build exists for this
        environment, or loading/execution fails.
    """
    print("=" * 60)
    print("Part 1: Basic Kernel Loading")
    print("=" * 60)
    try:
        from kernels import get_kernel, has_kernel

        repo_id = "kernels-community/activation"
        print(f"\n1. Checking kernel availability: {repo_id}")
        # Guard clause: bail out early when no compatible build exists.
        if not has_kernel(repo_id):
            print(" No compatible build available for this environment")
            return None
        print(" Kernel is available for this environment")

        print("\n2. Loading kernel from Hub...")
        # NOTE(review): get_kernel's `version` parameter takes a version
        # specifier *string* (e.g. ">=0.0.1"), not an int — the previous
        # `version=1` was invalid. Load the latest build instead, which
        # also matches the plain get_kernel() call in demo_model_integration.
        activation = get_kernel(repo_id)

        print("\n3. Available functions:")
        functions = [f for f in dir(activation) if not f.startswith('_')]
        for func in functions[:10]:
            print(f" - {func}")
        if len(functions) > 10:
            print(f" ... and {len(functions) - 10} more")

        print("\n4. Testing gelu_fast kernel...")
        x = torch.randn((4, 4), dtype=torch.float16, device="cuda")
        y = torch.empty_like(x)
        # gelu_fast writes its result into the pre-allocated output `y`.
        activation.gelu_fast(y, x)
        print(f" Input shape: {x.shape}")
        print(f" Output shape: {y.shape}")
        print(" Success!")
        return activation
    except ImportError:
        print("\n kernels library not installed. Install with: pip install kernels")
        return None
    except Exception as e:
        # Demo script: report and continue rather than crash.
        print(f"\n Error: {e}")
        return None
def demo_benchmark(activation_kernel):
    """Benchmark the Hub gelu_fast kernel against torch.nn.functional.gelu.

    Args:
        activation_kernel: Kernel module returned by get_kernel(), or None,
            in which case the benchmark is skipped.
    """
    print("\n" + "=" * 60)
    print("Part 2: Benchmark Hub Kernel vs PyTorch")
    print("=" * 60)
    if activation_kernel is None:
        print(" Skipping (kernel not loaded)")
        return
    sizes = [(1024, 2048), (4096, 4096), (8192, 8192)]
    for size in sizes:
        x = torch.randn(size, dtype=torch.float16, device="cuda")
        # Pre-allocated output for the in-place Hub kernel. (The previous
        # `y_torch = torch.empty_like(x)` was dead code: F.gelu allocates
        # its own output, so that buffer was discarded immediately.)
        y_hub = torch.empty_like(x)
        # Warm up both paths so one-time costs (kernel compilation,
        # allocator growth) don't skew the measurements.
        for _ in range(5):
            activation_kernel.gelu_fast(y_hub, x)
            y_torch = torch.nn.functional.gelu(x)
        torch.cuda.synchronize()
        iterations = 100
        start = time.perf_counter()
        for _ in range(iterations):
            activation_kernel.gelu_fast(y_hub, x)
        # CUDA launches are async — wait for queued work before stopping the clock.
        torch.cuda.synchronize()
        hub_time = (time.perf_counter() - start) / iterations * 1000
        start = time.perf_counter()
        for _ in range(iterations):
            y_torch = torch.nn.functional.gelu(x)
        torch.cuda.synchronize()
        torch_time = (time.perf_counter() - start) / iterations * 1000
        speedup = torch_time / hub_time
        print(f"\n Shape {size}:")
        print(f" Hub kernel: {hub_time:.4f} ms")
        print(f" PyTorch: {torch_time:.4f} ms")
        print(f" Speedup: {speedup:.2f}x")
def demo_model_integration():
    """Demonstrate patching a model's RMSNorm layers with a Hub kernel.

    Loads kernels-community/triton-layer-norm, builds a small RMSNorm+Linear
    model, monkey-patches each nn.RMSNorm's forward to call the Hub kernel,
    and runs one forward pass. Prints progress; returns None in all cases.
    """
    print("\n" + "=" * 60)
    print("Part 3: Integration with Models")
    print("=" * 60)
    try:
        from kernels import get_kernel, has_kernel

        repo_id = "kernels-community/triton-layer-norm"
        if not has_kernel(repo_id):
            print(f" {repo_id} not available, skipping")
            return
        print(f"\n1. Loading {repo_id}...")
        layer_norm = get_kernel(repo_id)
        print("\n2. Available functions:")
        functions = [f for f in dir(layer_norm) if not f.startswith('_')]
        for func in functions:
            print(f" - {func}")

        class SimpleModel(nn.Module):
            """Minimal RMSNorm + Linear model used as the patch target."""

            def __init__(self, hidden_size=2048):
                super().__init__()
                self.norm = nn.RMSNorm(hidden_size)
                self.linear = nn.Linear(hidden_size, hidden_size)

            def forward(self, x):
                x = self.norm(x)
                x = self.linear(x)
                return x

        print("\n3. Creating model and patching RMSNorm...")
        model = SimpleModel().cuda().to(torch.bfloat16)

        def patch_rmsnorm(model, kernel):
            # Swap each nn.RMSNorm's forward for the Hub kernel, keeping the
            # original forward around as a fallback.
            for name, module in model.named_modules():
                if isinstance(module, nn.RMSNorm):
                    eps = module.eps

                    # Factory binds mod/epsilon per iteration, avoiding the
                    # classic late-binding-closure bug in loops.
                    def make_forward(mod, epsilon):
                        def forward(x):
                            # Kernel repos name this op differently; probe both.
                            if hasattr(kernel, 'rms_norm'):
                                return kernel.rms_norm(x, mod.weight, eps=epsilon)
                            elif hasattr(kernel, 'rmsnorm'):
                                return kernel.rmsnorm(x, mod.weight, eps=epsilon)
                            else:
                                return mod._original_forward(x)
                        return forward

                    module._original_forward = module.forward
                    module.forward = make_forward(module, eps)
                    print(f" Patched: {name}")

        patch_rmsnorm(model, layer_norm)
        print("\n4. Testing forward pass...")
        x = torch.randn(2, 1024, 2048, dtype=torch.bfloat16, device="cuda")
        with torch.inference_mode():
            y = model(x)
        print(f" Input: {x.shape}")
        print(f" Output: {y.shape}")
        print(" Success!")
    except ImportError:
        print(" kernels library not installed")
    except Exception as e:
        # Demo script: report and continue rather than crash.
        print(f" Error: {e}")
def main():
    """Entry point: print environment info, then run the three demos."""
    header = "=" * 60
    print(header)
    print("HuggingFace Kernels Integration Example")
    print(header)
    check_environment()
    # All demos exercise CUDA kernels, so a GPU is a hard requirement.
    if not torch.cuda.is_available():
        print("CUDA not available. This example requires a GPU.")
        return
    kernel = demo_basic_kernel_loading()
    demo_benchmark(kernel)
    demo_model_integration()
    print("\n" + header)
    print("Done!")
    print(header)
# Run the demos only when executed as a script, not on import.
if __name__ == "__main__":
    main()