File size: 6,892 Bytes
88a1dd2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
#!/usr/bin/env python3
"""
Example: Using HuggingFace Kernels library to load and use optimized CUDA kernels.

This script demonstrates how to:
1. Load kernels from the HuggingFace Hub using get_kernel()
2. Check kernel availability with has_kernel()
3. Integrate Hub kernels with transformers/diffusers models

Requirements:
    pip install kernels torch numpy

Usage:
    python huggingface_kernels_example.py
"""

import time
from typing import Optional

import torch
import torch.nn as nn


def check_environment():
    """Report the PyTorch/CUDA setup on stdout to help diagnose failures.

    The PyTorch version and CUDA availability are always printed. The CUDA
    toolkit version, GPU name and compute capability are printed only when
    a CUDA device is available. Output ends with a blank line.
    """
    rule = "=" * 60
    for header_line in (rule, "Environment", rule):
        print(header_line)

    print(f"PyTorch: {torch.__version__}")
    cuda_ok = torch.cuda.is_available()
    print(f"CUDA available: {cuda_ok}")

    if cuda_ok:
        # Device queries are only attempted once CUDA is known to be usable.
        print(f"CUDA version: {torch.version.cuda}")
        print(f"GPU: {torch.cuda.get_device_name()}")
        print(f"GPU capability: {torch.cuda.get_device_capability()}")

    print()


def demo_basic_kernel_loading():
    """Part 1 of the demo: fetch the activation kernel from the Hub and run it.

    Checks that ``kernels-community/activation`` has a build for this
    environment, loads it with ``get_kernel``, lists up to ten of its public
    attributes, and calls ``gelu_fast`` once on a small float16 CUDA tensor.

    Returns:
        The loaded kernel object on success. ``None`` when no compatible
        build is available, the ``kernels`` package cannot be imported, or
        any step raises.
    """
    divider = "=" * 60
    print(divider)
    print("Part 1: Basic Kernel Loading")
    print(divider)

    max_listed = 10  # cap on how many attribute names are printed

    try:
        # Imported lazily so a missing package is reported instead of
        # aborting the whole script.
        from kernels import get_kernel, has_kernel

        repo_id = "kernels-community/activation"

        print(f"\n1. Checking kernel availability: {repo_id}")
        if not has_kernel(repo_id):
            print("   No compatible build available for this environment")
            return None
        print("   Kernel is available for this environment")

        print(f"\n2. Loading kernel from Hub...")
        activation = get_kernel(repo_id, version=1)

        print(f"\n3. Available functions:")
        public_names = [name for name in dir(activation) if name[:1] != "_"]
        for name in public_names[:max_listed]:
            print(f"   - {name}")
        hidden_count = len(public_names) - max_listed
        if hidden_count > 0:
            print(f"   ... and {hidden_count} more")

        print(f"\n4. Testing gelu_fast kernel...")
        x = torch.randn((4, 4), dtype=torch.float16, device="cuda")
        # gelu_fast is called as (out, input): it fills a caller-provided buffer.
        y = torch.empty_like(x)
        activation.gelu_fast(y, x)

        print(f"   Input shape: {x.shape}")
        print(f"   Output shape: {y.shape}")
        print(f"   Success!")
        return activation

    except ImportError:
        print("\n   kernels library not installed. Install with: pip install kernels")
        return None
    except Exception as e:
        # Demo boundary: report the failure and let later parts skip cleanly.
        print(f"\n   Error: {e}")
        return None


def demo_benchmark(activation_kernel):
    """Time the Hub ``gelu_fast`` kernel against ``torch.nn.functional.gelu``.

    For each of three float16 CUDA tensor shapes, both implementations are
    warmed up, then each is timed over a fixed number of calls with a
    ``torch.cuda.synchronize()`` before the clock is read. The per-call
    latency in milliseconds and the PyTorch/Hub ratio are printed.

    Args:
        activation_kernel: Object exposing ``gelu_fast(out, x)``, or ``None``
            to skip the benchmark (e.g. when kernel loading failed).

    Returns:
        None. Results are printed to stdout.
    """
    print("\n" + "=" * 60)
    print("Part 2: Benchmark Hub Kernel vs PyTorch")
    print("=" * 60)

    if activation_kernel is None:
        print("   Skipping (kernel not loaded)")
        return

    sizes = [(1024, 2048), (4096, 4096), (8192, 8192)]
    warmup_iterations = 5
    iterations = 100

    for size in sizes:
        x = torch.randn(size, dtype=torch.float16, device="cuda")
        # The Hub kernel writes into a caller-supplied buffer, so allocate it
        # once. No buffer is pre-allocated for the PyTorch side: F.gelu
        # returns a fresh tensor, so a pre-allocated one would be discarded
        # unused (wasting up to 128 MB at the largest shape).
        y_hub = torch.empty_like(x)

        # NOTE(review): F.gelu defaults to the exact (erf) formulation. If
        # gelu_fast is a tanh approximation, approximate="tanh" would be the
        # like-for-like baseline -- confirm against the kernel's definition.
        for _ in range(warmup_iterations):
            activation_kernel.gelu_fast(y_hub, x)
            y_torch = torch.nn.functional.gelu(x)
        # Drain queued warm-up work so it is not billed to the timed loops.
        torch.cuda.synchronize()

        start = time.perf_counter()
        for _ in range(iterations):
            activation_kernel.gelu_fast(y_hub, x)
        # Kernel launches are asynchronous; wait for completion before
        # reading the clock.
        torch.cuda.synchronize()
        hub_time = (time.perf_counter() - start) / iterations * 1000

        start = time.perf_counter()
        for _ in range(iterations):
            y_torch = torch.nn.functional.gelu(x)
        torch.cuda.synchronize()
        torch_time = (time.perf_counter() - start) / iterations * 1000

        speedup = torch_time / hub_time
        print(f"\n   Shape {size}:")
        print(f"   Hub kernel: {hub_time:.4f} ms")
        print(f"   PyTorch:    {torch_time:.4f} ms")
        print(f"   Speedup:    {speedup:.2f}x")


def _resolve_rms_norm_fn(kernel):
    """Return the kernel's RMSNorm function (``rms_norm`` or ``rmsnorm``), or None."""
    for candidate in ("rms_norm", "rmsnorm"):
        if hasattr(kernel, candidate):
            return getattr(kernel, candidate)
    return None


def _patch_rmsnorm(model, kernel):
    """Route every ``nn.RMSNorm`` in *model* through *kernel*'s RMSNorm function.

    The kernel function is looked up once here rather than on every forward
    call. If the kernel exposes neither ``rms_norm`` nor ``rmsnorm``, the
    model is left untouched and a note is printed instead of "Patched".
    Each patched module keeps its original forward as ``_original_forward``.
    """
    kernel_fn = _resolve_rms_norm_fn(kernel)
    if kernel_fn is None:
        print("   Kernel exposes no rms_norm/rmsnorm function; keeping PyTorch RMSNorm")
        return

    def make_forward(mod, epsilon):
        # A factory binds mod/epsilon per module, avoiding the late-binding
        # closure pitfall inside the loop below.
        def forward(x):
            # nn.RMSNorm stores eps=None by default, meaning "use
            # torch.finfo(x.dtype).eps"; resolve it so the kernel always
            # receives a concrete float.
            eps = torch.finfo(x.dtype).eps if epsilon is None else epsilon
            return kernel_fn(x, mod.weight, eps=eps)
        return forward

    for name, module in model.named_modules():
        if not isinstance(module, nn.RMSNorm):
            continue
        module._original_forward = module.forward
        module.forward = make_forward(module, module.eps)
        print(f"   Patched: {name}")


def demo_model_integration():
    """Part 3 of the demo: swap a model's RMSNorm for a Hub kernel.

    Loads ``kernels-community/triton-layer-norm`` (skipping if no compatible
    build is available), lists its public attributes, builds a small
    RMSNorm + Linear model in bfloat16 on CUDA, patches its ``nn.RMSNorm``
    modules to call the kernel, and runs one forward pass under
    ``torch.inference_mode()``.

    Returns:
        None. Progress and any error are printed to stdout; a missing
        ``kernels`` package or any other exception is reported, not raised.
    """
    print("\n" + "=" * 60)
    print("Part 3: Integration with Models")
    print("=" * 60)

    try:
        # Imported lazily so a missing package is reported, not fatal.
        from kernels import get_kernel, has_kernel

        repo_id = "kernels-community/triton-layer-norm"

        if not has_kernel(repo_id):
            print(f"   {repo_id} not available, skipping")
            return

        print(f"\n1. Loading {repo_id}...")
        layer_norm = get_kernel(repo_id)

        print("\n2. Available functions:")
        functions = [f for f in dir(layer_norm) if not f.startswith('_')]
        for func in functions:
            print(f"   - {func}")

        class SimpleModel(nn.Module):
            """Minimal RMSNorm -> Linear stack used as the patch target."""

            def __init__(self, hidden_size=2048):
                super().__init__()
                # eps is left at its default (None) here.
                self.norm = nn.RMSNorm(hidden_size)
                self.linear = nn.Linear(hidden_size, hidden_size)

            def forward(self, x):
                x = self.norm(x)
                x = self.linear(x)
                return x

        print("\n3. Creating model and patching RMSNorm...")
        model = SimpleModel().cuda().to(torch.bfloat16)
        _patch_rmsnorm(model, layer_norm)

        print("\n4. Testing forward pass...")
        x = torch.randn(2, 1024, 2048, dtype=torch.bfloat16, device="cuda")
        with torch.inference_mode():
            y = model(x)
        print(f"   Input: {x.shape}")
        print(f"   Output: {y.shape}")
        print("   Success!")

    except ImportError:
        print("   kernels library not installed")
    except Exception as e:
        # Demo boundary: report the failure instead of crashing the script.
        print(f"   Error: {e}")


def main():
    """Run the full demo: environment report, then the three demo parts.

    The environment report is always printed. The kernel demos run only
    when CUDA is available; otherwise a notice is printed and the function
    returns early.
    """
    rule = "=" * 60
    for line in (rule, "HuggingFace Kernels Integration Example", rule):
        print(line)

    check_environment()

    if torch.cuda.is_available():
        # Part 1's result (kernel or None) feeds straight into Part 2.
        loaded_kernel = demo_basic_kernel_loading()
        demo_benchmark(loaded_kernel)
        demo_model_integration()

        for line in ("\n" + rule, "Done!", rule):
            print(line)
    else:
        print("CUDA not available. This example requires a GPU.")


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()