#!/usr/bin/env python3
"""
Example: Using HuggingFace Kernels library to load and use optimized CUDA kernels.

This script demonstrates how to:
1. Load kernels from the HuggingFace Hub using get_kernel()
2. Check kernel availability with has_kernel()
3. Integrate Hub kernels with transformers/diffusers models

Requirements:
    pip install kernels torch numpy

Usage:
    python huggingface_kernels_example.py
"""

import time
from typing import Optional

import torch
import torch.nn as nn


def check_environment():
    """Print environment information (PyTorch/CUDA/GPU) for debugging."""
    print("=" * 60)
    print("Environment")
    print("=" * 60)
    print(f"PyTorch: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"CUDA version: {torch.version.cuda}")
        print(f"GPU: {torch.cuda.get_device_name()}")
        print(f"GPU capability: {torch.cuda.get_device_capability()}")
    print()


def demo_basic_kernel_loading():
    """Demonstrate basic kernel loading from the Hub.

    Returns:
        The loaded activation kernel module on success, or ``None`` when the
        ``kernels`` library is missing, no compatible build exists for this
        environment, or any other error occurs (errors are printed, not raised).
    """
    print("=" * 60)
    print("Part 1: Basic Kernel Loading")
    print("=" * 60)

    try:
        from kernels import get_kernel, has_kernel

        repo_id = "kernels-community/activation"
        print(f"\n1. Checking kernel availability: {repo_id}")
        if has_kernel(repo_id):
            print(" Kernel is available for this environment")

            print("\n2. Loading kernel from Hub...")
            # NOTE(review): `version=1` is forwarded as an int; the kernels
            # API documents version *spec strings* — confirm against the
            # installed `kernels` release.
            activation = get_kernel(repo_id, version=1)

            print("\n3. Available functions:")
            functions = [f for f in dir(activation) if not f.startswith('_')]
            for func in functions[:10]:
                print(f" - {func}")
            if len(functions) > 10:
                print(f" ... and {len(functions) - 10} more")

            print("\n4. Testing gelu_fast kernel...")
            # gelu_fast writes its result into a preallocated output tensor
            # (out-of-place signature: gelu_fast(out, in)).
            x = torch.randn((4, 4), dtype=torch.float16, device="cuda")
            y = torch.empty_like(x)
            activation.gelu_fast(y, x)
            print(f" Input shape: {x.shape}")
            print(f" Output shape: {y.shape}")
            print(" Success!")
            return activation
        else:
            print(" No compatible build available for this environment")
            return None
    except ImportError:
        print("\n kernels library not installed. Install with: pip install kernels")
        return None
    except Exception as e:
        # Best-effort demo: report and continue rather than crash the script.
        print(f"\n Error: {e}")
        return None


def demo_benchmark(activation_kernel):
    """Benchmark the Hub activation kernel against PyTorch's gelu.

    Args:
        activation_kernel: Module returned by ``demo_basic_kernel_loading``,
            or ``None`` to skip the benchmark.
    """
    print("\n" + "=" * 60)
    print("Part 2: Benchmark Hub Kernel vs PyTorch")
    print("=" * 60)

    if activation_kernel is None:
        print(" Skipping (kernel not loaded)")
        return

    sizes = [(1024, 2048), (4096, 4096), (8192, 8192)]
    for size in sizes:
        x = torch.randn(size, dtype=torch.float16, device="cuda")
        y_hub = torch.empty_like(x)

        # Warmup: trigger lazy kernel compilation/caching before timing.
        for _ in range(5):
            activation_kernel.gelu_fast(y_hub, x)
            torch.nn.functional.gelu(x)
        torch.cuda.synchronize()

        iterations = 100

        # CUDA launches are async: synchronize before reading the clock.
        start = time.perf_counter()
        for _ in range(iterations):
            activation_kernel.gelu_fast(y_hub, x)
        torch.cuda.synchronize()
        hub_time = (time.perf_counter() - start) / iterations * 1000

        start = time.perf_counter()
        for _ in range(iterations):
            y_torch = torch.nn.functional.gelu(x)
        torch.cuda.synchronize()
        torch_time = (time.perf_counter() - start) / iterations * 1000

        speedup = torch_time / hub_time
        print(f"\n Shape {size}:")
        print(f" Hub kernel: {hub_time:.4f} ms")
        print(f" PyTorch: {torch_time:.4f} ms")
        print(f" Speedup: {speedup:.2f}x")


def demo_model_integration():
    """Demonstrate integrating a Hub layer-norm kernel into an nn.Module.

    Loads kernels-community/triton-layer-norm and monkey-patches every
    nn.RMSNorm in a small demo model to route through the Hub kernel.
    All failures are printed, not raised.
    """
    print("\n" + "=" * 60)
    print("Part 3: Integration with Models")
    print("=" * 60)

    try:
        from kernels import get_kernel, has_kernel

        repo_id = "kernels-community/triton-layer-norm"
        if not has_kernel(repo_id):
            print(f" {repo_id} not available, skipping")
            return

        print(f"\n1. Loading {repo_id}...")
        layer_norm = get_kernel(repo_id)

        print("\n2. Available functions:")
        functions = [f for f in dir(layer_norm) if not f.startswith('_')]
        for func in functions:
            print(f" - {func}")

        class SimpleModel(nn.Module):
            """Minimal RMSNorm + Linear model used as a patching target."""

            def __init__(self, hidden_size=2048):
                super().__init__()
                # nn.RMSNorm requires torch >= 2.4 — presumably the target
                # environment; any AttributeError is caught below.
                self.norm = nn.RMSNorm(hidden_size)
                self.linear = nn.Linear(hidden_size, hidden_size)

            def forward(self, x):
                x = self.norm(x)
                x = self.linear(x)
                return x

        print("\n3. Creating model and patching RMSNorm...")
        model = SimpleModel().cuda().to(torch.bfloat16)

        def patch_rmsnorm(model, kernel):
            """Replace each nn.RMSNorm forward with a Hub-kernel call."""
            for name, module in model.named_modules():
                if isinstance(module, nn.RMSNorm):
                    eps = module.eps

                    # Factory binds module/eps per iteration, avoiding the
                    # late-binding closure pitfall in the loop.
                    def make_forward(mod, epsilon):
                        def forward(x):
                            # Kernel repos vary in naming; fall back to the
                            # original forward if neither name exists.
                            if hasattr(kernel, 'rms_norm'):
                                return kernel.rms_norm(x, mod.weight, eps=epsilon)
                            elif hasattr(kernel, 'rmsnorm'):
                                return kernel.rmsnorm(x, mod.weight, eps=epsilon)
                            else:
                                return mod._original_forward(x)
                        return forward

                    module._original_forward = module.forward
                    module.forward = make_forward(module, eps)
                    print(f" Patched: {name}")

        patch_rmsnorm(model, layer_norm)

        print("\n4. Testing forward pass...")
        x = torch.randn(2, 1024, 2048, dtype=torch.bfloat16, device="cuda")
        with torch.inference_mode():
            y = model(x)
        print(f" Input: {x.shape}")
        print(f" Output: {y.shape}")
        print(" Success!")
    except ImportError:
        print(" kernels library not installed")
    except Exception as e:
        print(f" Error: {e}")


def main():
    """Run all demo sections; bail out early when no GPU is present."""
    print("=" * 60)
    print("HuggingFace Kernels Integration Example")
    print("=" * 60)

    check_environment()

    if not torch.cuda.is_available():
        print("CUDA not available. This example requires a GPU.")
        return

    activation = demo_basic_kernel_loading()
    demo_benchmark(activation)
    demo_model_integration()

    print("\n" + "=" * 60)
    print("Done!")
    print("=" * 60)


if __name__ == "__main__":
    main()