Spaces:
Running on Zero
Running on Zero
Olivia Claude Opus 4.5 committed on
Commit ·
a62d768
1
Parent(s): 1282ba1
Add CUDA kernel compilation script for Colab
Browse files
- Add compile_kernels.py for compiling kernels locally
- Add compile_kernels_colab.ipynb for Google Colab
- Update .gitignore to allow prebuilt/*.so files
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
- compile_kernels.py +44 -53
- compile_kernels_colab.ipynb +21 -21
compile_kernels.py
CHANGED
|
@@ -1,17 +1,16 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
Compile CUDA kernels locally for deployment to Hugging Face Spaces.
|
| 4 |
-
|
| 5 |
-
This script compiles the custom CUDA kernels and saves them to the prebuilt/
|
| 6 |
-
directory for use on ZeroGPU where JIT compilation is too slow.
|
| 7 |
"""
|
| 8 |
|
| 9 |
import sys
|
|
|
|
| 10 |
import torch
|
| 11 |
from pathlib import Path
|
| 12 |
|
| 13 |
-
#
|
| 14 |
-
|
|
|
|
| 15 |
|
| 16 |
print("=" * 60)
|
| 17 |
print("StyleForge CUDA Kernel Compiler")
|
|
@@ -35,47 +34,51 @@ print(f"Compute Capability: {compute_capability}")
|
|
| 35 |
print()
|
| 36 |
|
| 37 |
# Create prebuilt directory
|
| 38 |
-
prebuilt_dir = Path(
|
| 39 |
prebuilt_dir.mkdir(exist_ok=True, parents=True)
|
| 40 |
|
| 41 |
print("Compiling CUDA kernels...")
|
| 42 |
print("-" * 60)
|
| 43 |
|
| 44 |
try:
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
# Read CUDA source
|
| 48 |
-
kernel_path = Path(
|
| 49 |
if not kernel_path.exists():
|
| 50 |
print(f"ERROR: Kernel source not found at {kernel_path}")
|
| 51 |
sys.exit(1)
|
| 52 |
|
| 53 |
cuda_source = kernel_path.read_text()
|
|
|
|
| 54 |
|
| 55 |
-
#
|
| 56 |
-
|
| 57 |
-
extra_flags = cuda_info.get('extra_cuda_cflags', [])
|
| 58 |
-
|
| 59 |
-
# Add architecture-specific flags for Hugging Face GPUs
|
| 60 |
-
# Common GPUs on HF: T4 (7.5), A100 (8.0), V100 (7.0)
|
| 61 |
hf_arch_flags = [
|
| 62 |
'-gencode=arch=compute_70,code=sm_70', # V100
|
| 63 |
'-gencode=arch=compute_75,code=sm_75', # T4
|
| 64 |
'-gencode=arch=compute_80,code=sm_80', # A100
|
| 65 |
]
|
| 66 |
-
|
| 67 |
|
| 68 |
-
print("Build flags:", ' '.join(
|
| 69 |
print()
|
|
|
|
| 70 |
|
| 71 |
# Compile the kernel
|
| 72 |
-
module =
|
| 73 |
name='fused_instance_norm',
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
verbose=True
|
| 79 |
)
|
| 80 |
|
| 81 |
print()
|
|
@@ -83,44 +86,32 @@ try:
|
|
| 83 |
print("Compilation successful!")
|
| 84 |
print()
|
| 85 |
|
| 86 |
-
# Find
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
if not
|
| 94 |
-
print("
|
| 95 |
-
print("The module may be in torch's cache. Trying to locate...")
|
| 96 |
-
|
| 97 |
-
# Try to find in torch extension cache
|
| 98 |
-
import torch.utils.cpp_extension
|
| 99 |
-
cache_dir = Path(torch.utils.cpp_extension._get_build_directory('fused_instance_norm', False))
|
| 100 |
-
if cache_dir.exists():
|
| 101 |
-
compiled_files = list(cache_dir.rglob("*.so")) + list(cache_dir.rglob("*.pyd"))
|
| 102 |
-
|
| 103 |
-
if compiled_files:
|
| 104 |
-
# Copy each compiled file to prebuilt/
|
| 105 |
-
for src_file in compiled_files:
|
| 106 |
-
dst_file = prebuilt_dir / src_file.name
|
| 107 |
-
import shutil
|
| 108 |
-
shutil.copy2(src_file, dst_file)
|
| 109 |
-
print(f"Copied: {dst_file}")
|
| 110 |
-
print(f" Size: {dst_file.stat().st_size / 1024:.1f} KB")
|
| 111 |
-
else:
|
| 112 |
-
print("ERROR: Could not locate compiled kernel file.")
|
| 113 |
-
print("The kernel was compiled but the output file location is unknown.")
|
| 114 |
sys.exit(1)
|
| 115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
print()
|
| 117 |
print("=" * 60)
|
| 118 |
print("Kernel compilation complete!")
|
| 119 |
print(f"Pre-compiled kernels saved to: {prebuilt_dir}")
|
| 120 |
print()
|
| 121 |
-
print("
|
| 122 |
-
print("
|
| 123 |
-
print("2. Commit and push to deploy")
|
| 124 |
print("=" * 60)
|
| 125 |
|
| 126 |
except Exception as e:
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
Compile CUDA kernels locally for deployment to Hugging Face Spaces.
|
|
|
|
|
|
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
import sys
|
| 7 |
+
import os
|
| 8 |
import torch
|
| 9 |
from pathlib import Path
|
| 10 |
|
| 11 |
+
# Suppress warnings for cleaner output
|
| 12 |
+
import warnings
|
| 13 |
+
warnings.filterwarnings('ignore')
|
| 14 |
|
| 15 |
print("=" * 60)
|
| 16 |
print("StyleForge CUDA Kernel Compiler")
|
|
|
|
| 34 |
print()
|
| 35 |
|
| 36 |
# Create prebuilt directory
|
| 37 |
+
prebuilt_dir = Path("kernels/prebuilt")
|
| 38 |
prebuilt_dir.mkdir(exist_ok=True, parents=True)
|
| 39 |
|
| 40 |
print("Compiling CUDA kernels...")
|
| 41 |
print("-" * 60)
|
| 42 |
|
| 43 |
try:
|
| 44 |
+
# Import PyTorch CUDA extension utilities
|
| 45 |
+
from torch.utils.cpp_extension import load_inline, CUDA_HOME
|
| 46 |
+
|
| 47 |
+
if CUDA_HOME is None:
|
| 48 |
+
print("ERROR: CUDA_HOME is not set. CUDA toolkit may not be installed.")
|
| 49 |
+
sys.exit(1)
|
| 50 |
+
|
| 51 |
+
print(f"CUDA Home: {CUDA_HOME}")
|
| 52 |
|
| 53 |
# Read CUDA source
|
| 54 |
+
kernel_path = Path("kernels/instance_norm.cu")
|
| 55 |
if not kernel_path.exists():
|
| 56 |
print(f"ERROR: Kernel source not found at {kernel_path}")
|
| 57 |
sys.exit(1)
|
| 58 |
|
| 59 |
cuda_source = kernel_path.read_text()
|
| 60 |
+
print(f"Loaded CUDA source: {len(cuda_source)} bytes")
|
| 61 |
|
| 62 |
+
# Architecture-specific flags for Hugging Face GPUs
|
| 63 |
+
extra_cuda_cflags = ['-O3', '--use_fast_math']
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
hf_arch_flags = [
|
| 65 |
'-gencode=arch=compute_70,code=sm_70', # V100
|
| 66 |
'-gencode=arch=compute_75,code=sm_75', # T4
|
| 67 |
'-gencode=arch=compute_80,code=sm_80', # A100
|
| 68 |
]
|
| 69 |
+
extra_cuda_cflags.extend(hf_arch_flags)
|
| 70 |
|
| 71 |
+
print("Build flags:", ' '.join(extra_cuda_cflags))
|
| 72 |
print()
|
| 73 |
+
print("Compiling... (this may take 1-2 minutes)")
|
| 74 |
|
| 75 |
# Compile the kernel
|
| 76 |
+
module = load_inline(
|
| 77 |
name='fused_instance_norm',
|
| 78 |
+
cuda_sources=[cuda_source],
|
| 79 |
+
extra_cuda_cflags=extra_cuda_cflags,
|
| 80 |
+
with_pybind11=True,
|
| 81 |
+
verbose=False
|
|
|
|
| 82 |
)
|
| 83 |
|
| 84 |
print()
|
|
|
|
| 86 |
print("Compilation successful!")
|
| 87 |
print()
|
| 88 |
|
| 89 |
+
# Find the compiled library
|
| 90 |
+
import torch.utils.cpp_extension
|
| 91 |
+
build_dir = Path(torch.utils.cpp_extension._get_build_directory('fused_instance_norm', False))
|
| 92 |
+
print(f"Build directory: {build_dir}")
|
| 93 |
+
|
| 94 |
+
so_files = list(build_dir.rglob("*.so")) + list(build_dir.rglob("*.pyd"))
|
| 95 |
+
|
| 96 |
+
if not so_files:
|
| 97 |
+
print("ERROR: No compiled .so/.pyd file found")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
sys.exit(1)
|
| 99 |
|
| 100 |
+
# Copy to prebuilt directory
|
| 101 |
+
import shutil
|
| 102 |
+
for src_file in so_files:
|
| 103 |
+
dst_file = prebuilt_dir / src_file.name
|
| 104 |
+
shutil.copy2(src_file, dst_file)
|
| 105 |
+
size_kb = dst_file.stat().st_size / 1024
|
| 106 |
+
print(f"Copied: {dst_file.name} ({size_kb:.1f} KB)")
|
| 107 |
+
|
| 108 |
print()
|
| 109 |
print("=" * 60)
|
| 110 |
print("Kernel compilation complete!")
|
| 111 |
print(f"Pre-compiled kernels saved to: {prebuilt_dir}")
|
| 112 |
print()
|
| 113 |
+
print("Download the .so file and add it to your local repo:")
|
| 114 |
+
print(" kernels/prebuilt/" + list(prebuilt_dir.glob("*.so"))[0].name if list(prebuilt_dir.glob("*.so")) else "")
|
|
|
|
| 115 |
print("=" * 60)
|
| 116 |
|
| 117 |
except Exception as e:
|
compile_kernels_colab.ipynb
CHANGED
|
@@ -36,9 +36,23 @@
|
|
| 36 |
"metadata": {},
|
| 37 |
"outputs": [],
|
| 38 |
"source": [
|
| 39 |
-
"#
|
| 40 |
-
"
|
| 41 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
]
|
| 43 |
},
|
| 44 |
{
|
|
@@ -82,24 +96,6 @@
|
|
| 82 |
" print('No kernel files found!')\n",
|
| 83 |
" print('Check the compilation output above for errors.')"
|
| 84 |
]
|
| 85 |
-
},
|
| 86 |
-
{
|
| 87 |
-
"cell_type": "markdown",
|
| 88 |
-
"metadata": {},
|
| 89 |
-
"source": [
|
| 90 |
-
"## Next Steps\n",
|
| 91 |
-
"\n",
|
| 92 |
-
"1. Download the `.so` file(s)\n",
|
| 93 |
-
"2. Copy them to your local `kernels/prebuilt/` directory\n",
|
| 94 |
-
"3. Commit and push to GitHub:\n",
|
| 95 |
-
" ```\n",
|
| 96 |
-
" git add kernels/prebuilt/\n",
|
| 97 |
-
" git commit -m \"Add pre-compiled CUDA kernels\"\n",
|
| 98 |
-
" git push\n",
|
| 99 |
-
" ```\n",
|
| 100 |
-
"\n",
|
| 101 |
-
"The kernels will be automatically loaded on Hugging Face Spaces!"
|
| 102 |
-
]
|
| 103 |
}
|
| 104 |
],
|
| 105 |
"metadata": {
|
|
@@ -107,6 +103,10 @@
|
|
| 107 |
"display_name": "Python 3",
|
| 108 |
"language": "python",
|
| 109 |
"name": "python3"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
}
|
| 111 |
},
|
| 112 |
"nbformat": 4,
|
|
|
|
| 36 |
"metadata": {},
|
| 37 |
"outputs": [],
|
| 38 |
"source": [
|
| 39 |
+
"# The PyTorch with CUDA should already be installed in Colab\n",
|
| 40 |
+
"import torch\n",
|
| 41 |
+
"print(f'PyTorch: {torch.__version__}')\n",
|
| 42 |
+
"print(f'CUDA available: {torch.cuda.is_available()}')\n",
|
| 43 |
+
"if torch.cuda.is_available():\n",
|
| 44 |
+
" print(f'CUDA version: {torch.version.cuda}')\n",
|
| 45 |
+
" print(f'GPU: {torch.cuda.get_device_name(0)}')"
|
| 46 |
+
]
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"cell_type": "code",
|
| 50 |
+
"execution_count": null,
|
| 51 |
+
"metadata": {},
|
| 52 |
+
"outputs": [],
|
| 53 |
+
"source": [
|
| 54 |
+
"# Install ninja for compilation\n",
|
| 55 |
+
"!pip install ninja -q"
|
| 56 |
]
|
| 57 |
},
|
| 58 |
{
|
|
|
|
| 96 |
" print('No kernel files found!')\n",
|
| 97 |
" print('Check the compilation output above for errors.')"
|
| 98 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
}
|
| 100 |
],
|
| 101 |
"metadata": {
|
|
|
|
| 103 |
"display_name": "Python 3",
|
| 104 |
"language": "python",
|
| 105 |
"name": "python3"
|
| 106 |
+
},
|
| 107 |
+
"language_info": {
|
| 108 |
+
"name": "python",
|
| 109 |
+
"version": "3.10.0"
|
| 110 |
}
|
| 111 |
},
|
| 112 |
"nbformat": 4,
|