#!/usr/bin/env python3
"""
Compile CUDA kernels locally for deployment to Hugging Face Spaces.
"""
import sys
import torch
from pathlib import Path

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

print("=" * 60)
print("StyleForge CUDA Kernel Compiler")
print("=" * 60)
print()

# Check CUDA availability
if not torch.cuda.is_available():
    print("ERROR: CUDA is not available on this system.")
    print("This script requires a CUDA-capable GPU.")
    sys.exit(1)

print(f"CUDA Version: {torch.version.cuda}")
print(f"PyTorch Version: {torch.__version__}")
print(f"GPU: {torch.cuda.get_device_name(0)}")

# Get compute capability of the local GPU (informational only; the build
# targets the architectures listed below, not just the local one)
major, minor = torch.cuda.get_device_capability(0)
compute_capability = f"{major}.{minor}"
print(f"Compute Capability: {compute_capability}")
print()

# Create prebuilt directory
prebuilt_dir = Path("kernels/prebuilt")
prebuilt_dir.mkdir(exist_ok=True, parents=True)

print("Compiling CUDA kernels...")
print("-" * 60)

try:
    # Import PyTorch CUDA extension utilities
    from torch.utils.cpp_extension import load_inline, CUDA_HOME

    if CUDA_HOME is None:
        print("ERROR: CUDA_HOME is not set. CUDA toolkit may not be installed.")
        sys.exit(1)

    print(f"CUDA Home: {CUDA_HOME}")

    # Read CUDA source
    kernel_path = Path("kernels/instance_norm.cu")
    if not kernel_path.exists():
        print(f"ERROR: Kernel source not found at {kernel_path}")
        sys.exit(1)

    cuda_source = kernel_path.read_text()
    print(f"Loaded CUDA source: {len(cuda_source)} bytes")

    # Architecture-specific flags covering the GPUs commonly offered on
    # Hugging Face Spaces
    extra_cuda_cflags = ['-O3', '--use_fast_math']
    hf_arch_flags = [
        '-gencode=arch=compute_70,code=sm_70',  # V100
        '-gencode=arch=compute_75,code=sm_75',  # T4
        '-gencode=arch=compute_80,code=sm_80',  # A100
    ]
    extra_cuda_cflags.extend(hf_arch_flags)

    print("Build flags:", ' '.join(extra_cuda_cflags))
    print()
    print("Compiling... (this may take 1-2 minutes)")

    # Compile the kernel.
    # Note: load_inline requires cpp_sources even if empty; the Python
    # bindings live in the .cu file itself.
    module = load_inline(
        name='fused_instance_norm',
        cpp_sources=[],  # Empty since bindings are in the .cu file
        cuda_sources=[cuda_source],
        extra_cuda_cflags=extra_cuda_cflags,
        verbose=False
    )

    print()
    print("-" * 60)
    print("Compilation successful!")
    print()

    # Locate the compiled library. _get_build_directory is a private PyTorch
    # helper (name, verbose); it returns the cache dir load_inline built into.
    import torch.utils.cpp_extension
    build_dir = Path(torch.utils.cpp_extension._get_build_directory('fused_instance_norm', False))
    print(f"Build directory: {build_dir}")

    so_files = list(build_dir.rglob("*.so")) + list(build_dir.rglob("*.pyd"))
    if not so_files:
        print("ERROR: No compiled .so/.pyd file found")
        sys.exit(1)

    # Copy the compiled artifacts to the prebuilt directory
    import shutil
    for src_file in so_files:
        dst_file = prebuilt_dir / src_file.name
        shutil.copy2(src_file, dst_file)
        size_kb = dst_file.stat().st_size / 1024
        print(f"Copied: {dst_file.name} ({size_kb:.1f} KB)")

    print()
    print("=" * 60)
    print("Kernel compilation complete!")
    print(f"Pre-compiled kernels saved to: {prebuilt_dir}")
    print()
    print("Download the .so file and add it to your local repo:")
    prebuilt_so = sorted(prebuilt_dir.glob("*.so"))
    if prebuilt_so:
        print(f"  kernels/prebuilt/{prebuilt_so[0].name}")
    print("=" * 60)

except Exception as e:
    print()
    print("-" * 60)
    print("ERROR: Compilation failed!")
    print(f"Details: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)
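

# --- Loading the prebuilt kernel on the deployment side (a minimal sketch) ---
# This helper is an illustration of the deployment step, not part of the
# compile flow above. It assumes the .cu file defines its Python bindings via
# PYBIND11_MODULE (which is what load_inline with empty cpp_sources expects)
# and that exactly one matching .so has been committed under kernels/prebuilt/.
# torch must already be imported so the extension's symbols can resolve.
import importlib.util


def load_prebuilt_kernel(prebuilt_dir: Path = Path("kernels/prebuilt"),
                         name: str = "fused_instance_norm"):
    """Load a pre-compiled extension module directly from its .so file."""
    so_path = next(prebuilt_dir.glob("*.so"), None)
    if so_path is None:
        raise FileNotFoundError(f"No prebuilt kernel found in {prebuilt_dir}")
    # spec_from_file_location picks ExtensionFileLoader for .so suffixes,
    # so exec_module runs the module's PyInit entry point.
    spec = importlib.util.spec_from_file_location(name, so_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module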