Spaces:

KhookieThief
/

test

Sleeping

test / skill_example /scripts /huggingface_kernels_example.py

Jack-Khuu

Demo

88a1dd2 9 days ago

6.89 kB

	#!/usr/bin/env python3
	"""
	Example: Using HuggingFace Kernels library to load and use optimized CUDA kernels.

	This script demonstrates how to:
	1. Load kernels from the HuggingFace Hub using get_kernel()
	2. Check kernel availability with has_kernel()
	3. Integrate Hub kernels with transformers/diffusers models

	Requirements:
	pip install kernels torch numpy

	Usage:
	python huggingface_kernels_example.py
	"""

	import time
	from typing import Optional

	import torch
	import torch.nn as nn


	def check_environment():
	    """Print PyTorch and CUDA details that help when debugging this example.

	    The PyTorch version and CUDA availability are always reported. The CUDA
	    version, GPU name and GPU capability are reported only when CUDA is
	    available. A blank line closes the section.
	    """
	    rule = "=" * 60
	    print(rule, "Environment", rule, sep="\n")
	    print(f"PyTorch: {torch.__version__}")

	    cuda_ready = torch.cuda.is_available()
	    print(f"CUDA available: {cuda_ready}")
	    if cuda_ready:
	        # Device queries are only attempted once CUDA is known to be usable.
	        print(f"CUDA version: {torch.version.cuda}")
	        print(f"GPU: {torch.cuda.get_device_name()}")
	        print(f"GPU capability: {torch.cuda.get_device_capability()}")
	    print()


	def demo_basic_kernel_loading():
	    """Load the activation kernel from the Hub and smoke-test ``gelu_fast``.

	    Steps: check availability with ``has_kernel``, load with ``get_kernel``,
	    list up to ten public attributes of the loaded module, then run
	    ``gelu_fast`` on a 4x4 float16 CUDA tensor.

	    Returns:
	        The loaded kernel module on success; ``None`` when no compatible
	        build is reported, the ``kernels`` package cannot be imported, or
	        any other exception is raised along the way.
	    """
	    rule = "=" * 60
	    print(rule)
	    print("Part 1: Basic Kernel Loading")
	    print(rule)

	    try:
	        from kernels import get_kernel, has_kernel

	        repo_id = "kernels-community/activation"

	        print(f"\n1. Checking kernel availability: {repo_id}")
	        # NOTE(review): availability is checked without the ``version`` that
	        # get_kernel() is given below - confirm both refer to the same build.
	        if not has_kernel(repo_id):
	            print(" No compatible build available for this environment")
	            return None
	        print(" Kernel is available for this environment")

	        print("\n2. Loading kernel from Hub...")
	        kernel_module = get_kernel(repo_id, version=1)

	        print("\n3. Available functions:")
	        public_names = [
	            name for name in dir(kernel_module) if not name.startswith("_")
	        ]
	        for name in public_names[:10]:
	            print(f" - {name}")
	        # Only the first ten names are listed; summarize the remainder.
	        hidden_count = len(public_names) - 10
	        if hidden_count > 0:
	            print(f" ... and {hidden_count} more")

	        print("\n4. Testing gelu_fast kernel...")
	        inp = torch.randn((4, 4), dtype=torch.float16, device="cuda")
	        out = torch.empty_like(inp)

	        # The kernel is called with the output buffer first, then the input.
	        kernel_module.gelu_fast(out, inp)
	        print(f" Input shape: {inp.shape}")
	        print(f" Output shape: {out.shape}")
	        print(" Success!")

	        return kernel_module

	    except ImportError:
	        print("\n kernels library not installed. Install with: pip install kernels")
	        return None
	    except Exception as e:
	        # Demo boundary: report the failure and let the remaining parts run.
	        print(f"\n Error: {e}")
	        return None


	def _mean_cuda_ms(fn, iterations):
	    """Return the mean wall-clock time of ``fn()`` in milliseconds.

	    ``fn`` is called ``iterations`` times back to back, then
	    ``torch.cuda.synchronize()`` is called once before the clock is read.
	    """
	    start = time.perf_counter()
	    for _ in range(iterations):
	        fn()
	    torch.cuda.synchronize()
	    return (time.perf_counter() - start) / iterations * 1000


	def demo_benchmark(activation_kernel):
	    """Benchmark the Hub ``gelu_fast`` kernel against ``torch.nn.functional.gelu``.

	    For each test shape a float16 CUDA tensor is created, both
	    implementations are warmed up, and the mean latency of 100 calls is
	    printed for each together with the PyTorch/Hub time ratio.

	    Args:
	        activation_kernel: Loaded kernel module providing
	            ``gelu_fast(out, x)``, or ``None`` to skip the benchmark.

	    Returns:
	        None. Results are printed.
	    """
	    print("\n" + "=" * 60)
	    print("Part 2: Benchmark Hub Kernel vs PyTorch")
	    print("=" * 60)

	    if activation_kernel is None:
	        print(" Skipping (kernel not loaded)")
	        return

	    sizes = [(1024, 2048), (4096, 4096), (8192, 8192)]
	    warmup_iterations = 5
	    iterations = 100

	    for size in sizes:
	        x = torch.randn(size, dtype=torch.float16, device="cuda")
	        # Only the Hub kernel needs a preallocated output; F.gelu returns a
	        # fresh tensor, so no buffer is allocated for the PyTorch side.
	        y_hub = torch.empty_like(x)

	        def run_hub():
	            activation_kernel.gelu_fast(y_hub, x)

	        def run_torch():
	            # NOTE(review): F.gelu defaults to the exact formulation; if
	            # gelu_fast is an approximation, approximate="tanh" may be the
	            # like-for-like baseline - confirm against the kernel.
	            torch.nn.functional.gelu(x)

	        # Warm up both paths so one-time setup cost is not timed.
	        for _ in range(warmup_iterations):
	            run_hub()
	            run_torch()
	        torch.cuda.synchronize()

	        hub_time = _mean_cuda_ms(run_hub, iterations)
	        torch_time = _mean_cuda_ms(run_torch, iterations)

	        speedup = torch_time / hub_time
	        print(f"\n Shape {size}:")
	        print(f" Hub kernel: {hub_time:.4f} ms")
	        print(f" PyTorch: {torch_time:.4f} ms")
	        print(f" Speedup: {speedup:.2f}x")


	def _make_kernel_forward(module, kernel_fn):
	    """Build a replacement ``forward`` for ``module`` that calls ``kernel_fn``.

	    ``kernel_fn`` is invoked as ``kernel_fn(x, module.weight, eps=...)``.
	    """
	    configured_eps = module.eps

	    def forward(x):
	        # nn.RMSNorm keeps eps as None unless one is passed in and then uses
	        # the machine epsilon of the input dtype; mirror that here so the
	        # kernel never receives eps=None.
	        if configured_eps is None:
	            epsilon = torch.finfo(x.dtype).eps
	        else:
	            epsilon = configured_eps
	        return kernel_fn(x, module.weight, eps=epsilon)

	    return forward


	def _patch_rmsnorm(model, kernel):
	    """Route every ``nn.RMSNorm`` in ``model`` through the Hub kernel.

	    The kernel function is looked up once (``rms_norm`` first, then
	    ``rmsnorm``). When the kernel offers neither, modules are left untouched
	    and that is reported instead of claiming a patch.
	    """
	    kernel_fn = getattr(kernel, "rms_norm", None)
	    if kernel_fn is None:
	        kernel_fn = getattr(kernel, "rmsnorm", None)

	    for name, module in model.named_modules():
	        if not isinstance(module, nn.RMSNorm):
	            continue
	        if kernel_fn is None:
	            print(f" Not patched (kernel has no rms_norm/rmsnorm): {name}")
	            continue

	        # Keep the original forward reachable so the patch can be undone.
	        module._original_forward = module.forward
	        module.forward = _make_kernel_forward(module, kernel_fn)
	        print(f" Patched: {name}")


	def demo_model_integration():
	    """Demonstrate swapping a model's RMSNorm forward for a Hub kernel.

	    Loads ``kernels-community/triton-layer-norm``, lists its public
	    attributes, builds a small RMSNorm + Linear model in bfloat16 on CUDA,
	    patches its RMSNorm layers and runs one forward pass under
	    ``torch.inference_mode()``.

	    Returns:
	        None. Progress and errors are printed; a missing ``kernels`` package
	        or any other exception ends the demo without raising.
	    """
	    print("\n" + "=" * 60)
	    print("Part 3: Integration with Models")
	    print("=" * 60)

	    try:
	        from kernels import get_kernel, has_kernel

	        repo_id = "kernels-community/triton-layer-norm"

	        if not has_kernel(repo_id):
	            print(f" {repo_id} not available, skipping")
	            return

	        print(f"\n1. Loading {repo_id}...")
	        layer_norm = get_kernel(repo_id)

	        print("\n2. Available functions:")
	        functions = [f for f in dir(layer_norm) if not f.startswith("_")]
	        for func in functions:
	            print(f" - {func}")

	        class SimpleModel(nn.Module):
	            """Minimal model: RMSNorm followed by a square Linear layer."""

	            def __init__(self, hidden_size=2048):
	                super().__init__()
	                self.norm = nn.RMSNorm(hidden_size)
	                self.linear = nn.Linear(hidden_size, hidden_size)

	            def forward(self, x):
	                x = self.norm(x)
	                x = self.linear(x)
	                return x

	        print("\n3. Creating model and patching RMSNorm...")
	        model = SimpleModel().cuda().to(torch.bfloat16)

	        _patch_rmsnorm(model, layer_norm)

	        print("\n4. Testing forward pass...")
	        x = torch.randn(2, 1024, 2048, dtype=torch.bfloat16, device="cuda")
	        with torch.inference_mode():
	            y = model(x)
	        print(f" Input: {x.shape}")
	        print(f" Output: {y.shape}")
	        print(" Success!")

	    except ImportError:
	        print(" kernels library not installed")
	    except Exception as e:
	        # Demo boundary: report the failure rather than abort the script.
	        print(f" Error: {e}")


	def main():
	    """Run the example end to end.

	    Prints a title banner and the environment report. If CUDA is not
	    available it prints a notice and returns; otherwise it runs the three
	    demo parts in order and prints a closing banner.
	    """
	    rule = "=" * 60
	    print(rule, "HuggingFace Kernels Integration Example", rule, sep="\n")

	    check_environment()

	    cuda_ready = torch.cuda.is_available()
	    if not cuda_ready:
	        print("CUDA not available. This example requires a GPU.")
	        return

	    # Part 2 benchmarks whatever Part 1 returned, which may be None.
	    loaded_kernel = demo_basic_kernel_loading()
	    demo_benchmark(loaded_kernel)
	    demo_model_integration()

	    print("\n" + rule, "Done!", rule, sep="\n")


	# Run the demo only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()