Spaces:
Sleeping
Sleeping
Olivia
committed on
Commit
·
1282ba1
1
Parent(s):
d3d412a
info endpoint
Browse files- .gitignore +4 -0
- app.py +14 -15
- compile_kernels.py +133 -0
- compile_kernels_colab.ipynb +114 -0
- kernels/__init__.py +104 -6
- kernels/prebuilt/README.md +64 -0
.gitignore
CHANGED
|
@@ -33,3 +33,7 @@ Thumbs.db
|
|
| 33 |
# Gradio
|
| 34 |
gradio_cached_examples/
|
| 35 |
flagged/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
# Gradio
|
| 34 |
gradio_cached_examples/
|
| 35 |
flagged/
|
| 36 |
+
|
| 37 |
+
# BUT allow pre-compiled kernels in prebuilt/
|
| 38 |
+
!kernels/prebuilt/*.so
|
| 39 |
+
!kernels/prebuilt/*.pyd
|
app.py
CHANGED
|
@@ -120,10 +120,15 @@ if SPACES_AVAILABLE:
|
|
| 120 |
|
| 121 |
# Check CUDA kernels availability
|
| 122 |
try:
|
| 123 |
-
from kernels import check_cuda_kernels, get_fused_instance_norm,
|
| 124 |
-
# On ZeroGPU
|
|
|
|
| 125 |
CUDA_KERNELS_AVAILABLE = check_cuda_kernels()
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
except Exception:
|
| 128 |
CUDA_KERNELS_AVAILABLE = False
|
| 129 |
print("CUDA Kernels: Not Available (using PyTorch fallback)")
|
|
@@ -579,7 +584,12 @@ if _SPACES_ZERO_GPU:
|
|
| 579 |
print("Device: CUDA (ZeroGPU mode - lazy initialization)")
|
| 580 |
else:
|
| 581 |
print(f"Device: {get_device().type.upper()}")
|
| 582 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 583 |
|
| 584 |
# Skip model preloading on ZeroGPU to avoid CUDA init in main process
|
| 585 |
if not _SPACES_ZERO_GPU:
|
|
@@ -1364,22 +1374,11 @@ def stylize_image_impl(
|
|
| 1364 |
add_watermark: bool
|
| 1365 |
) -> Tuple[Optional[Image.Image], str, Optional[str]]:
|
| 1366 |
"""Main stylization function for Gradio."""
|
| 1367 |
-
global CUDA_KERNELS_AVAILABLE
|
| 1368 |
|
| 1369 |
if input_image is None:
|
| 1370 |
return None, "Please upload an image first.", None
|
| 1371 |
|
| 1372 |
try:
|
| 1373 |
-
# On ZeroGPU, compile CUDA kernels within the GPU task on first use
|
| 1374 |
-
if SPACES_AVAILABLE and not CUDA_KERNELS_AVAILABLE:
|
| 1375 |
-
try:
|
| 1376 |
-
from kernels import compile_kernels
|
| 1377 |
-
CUDA_KERNELS_AVAILABLE = compile_kernels()
|
| 1378 |
-
if CUDA_KERNELS_AVAILABLE:
|
| 1379 |
-
print("CUDA kernels compiled successfully within GPU task!")
|
| 1380 |
-
except Exception as e:
|
| 1381 |
-
print(f"Failed to compile CUDA kernels: {e}")
|
| 1382 |
-
|
| 1383 |
# Convert to RGB if needed
|
| 1384 |
if input_image.mode != 'RGB':
|
| 1385 |
input_image = input_image.convert('RGB')
|
|
|
|
| 120 |
|
| 121 |
# Check CUDA kernels availability
|
| 122 |
try:
|
| 123 |
+
from kernels import check_cuda_kernels, get_fused_instance_norm, load_prebuilt_kernels
|
| 124 |
+
# On ZeroGPU: Uses pre-compiled kernels from prebuilt/ if available
|
| 125 |
+
# On local: JIT compiles kernels if prebuilt not found
|
| 126 |
CUDA_KERNELS_AVAILABLE = check_cuda_kernels()
|
| 127 |
+
if SPACES_AVAILABLE:
|
| 128 |
+
status = "Pre-compiled" if CUDA_KERNELS_AVAILABLE else "PyTorch GPU fallback (no prebuilt kernels)"
|
| 129 |
+
print(f"CUDA Kernels: {status}")
|
| 130 |
+
else:
|
| 131 |
+
print(f"CUDA Kernels: {'Available' if CUDA_KERNELS_AVAILABLE else 'Not Available (using PyTorch fallback)'}")
|
| 132 |
except Exception:
|
| 133 |
CUDA_KERNELS_AVAILABLE = False
|
| 134 |
print("CUDA Kernels: Not Available (using PyTorch fallback)")
|
|
|
|
| 584 |
print("Device: CUDA (ZeroGPU mode - lazy initialization)")
|
| 585 |
else:
|
| 586 |
print(f"Device: {get_device().type.upper()}")
|
| 587 |
+
|
| 588 |
+
if SPACES_AVAILABLE:
|
| 589 |
+
status = "Pre-compiled" if CUDA_KERNELS_AVAILABLE else "PyTorch GPU fallback"
|
| 590 |
+
print(f"CUDA Kernels: {status}")
|
| 591 |
+
else:
|
| 592 |
+
print(f"CUDA Kernels: {'Available' if CUDA_KERNELS_AVAILABLE else 'Not Available (using PyTorch fallback)'}")
|
| 593 |
|
| 594 |
# Skip model preloading on ZeroGPU to avoid CUDA init in main process
|
| 595 |
if not _SPACES_ZERO_GPU:
|
|
|
|
| 1374 |
add_watermark: bool
|
| 1375 |
) -> Tuple[Optional[Image.Image], str, Optional[str]]:
|
| 1376 |
"""Main stylization function for Gradio."""
|
|
|
|
| 1377 |
|
| 1378 |
if input_image is None:
|
| 1379 |
return None, "Please upload an image first.", None
|
| 1380 |
|
| 1381 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1382 |
# Convert to RGB if needed
|
| 1383 |
if input_image.mode != 'RGB':
|
| 1384 |
input_image = input_image.convert('RGB')
|
compile_kernels.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Compile CUDA kernels locally for deployment to Hugging Face Spaces.
|
| 4 |
+
|
| 5 |
+
This script compiles the custom CUDA kernels and saves them to the prebuilt/
|
| 6 |
+
directory for use on ZeroGPU where JIT compilation is too slow.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import sys
|
| 10 |
+
import torch
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
# Add this script's directory to sys.path so the `kernels` package can be imported
|
| 14 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 15 |
+
|
| 16 |
+
print("=" * 60)
|
| 17 |
+
print("StyleForge CUDA Kernel Compiler")
|
| 18 |
+
print("=" * 60)
|
| 19 |
+
print()
|
| 20 |
+
|
| 21 |
+
# Check CUDA availability
|
| 22 |
+
if not torch.cuda.is_available():
|
| 23 |
+
print("ERROR: CUDA is not available on this system.")
|
| 24 |
+
print("This script requires a CUDA-capable GPU.")
|
| 25 |
+
sys.exit(1)
|
| 26 |
+
|
| 27 |
+
print(f"CUDA Version: {torch.version.cuda}")
|
| 28 |
+
print(f"PyTorch Version: {torch.__version__}")
|
| 29 |
+
print(f"GPU: {torch.cuda.get_device_name(0)}")
|
| 30 |
+
|
| 31 |
+
# Get compute capability
|
| 32 |
+
major, minor = torch.cuda.get_device_capability(0)
|
| 33 |
+
compute_capability = f"{major}.{minor}"
|
| 34 |
+
print(f"Compute Capability: {compute_capability}")
|
| 35 |
+
print()
|
| 36 |
+
|
| 37 |
+
# Create prebuilt directory
|
| 38 |
+
prebuilt_dir = Path(__file__).parent / "kernels" / "prebuilt"
|
| 39 |
+
prebuilt_dir.mkdir(exist_ok=True, parents=True)
|
| 40 |
+
|
| 41 |
+
print("Compiling CUDA kernels...")
|
| 42 |
+
print("-" * 60)
|
| 43 |
+
|
| 44 |
+
try:
|
| 45 |
+
from kernels.cuda_build import compile_inline, get_cuda_info
|
| 46 |
+
|
| 47 |
+
# Read CUDA source
|
| 48 |
+
kernel_path = Path(__file__).parent / "kernels" / "instance_norm.cu"
|
| 49 |
+
if not kernel_path.exists():
|
| 50 |
+
print(f"ERROR: Kernel source not found at {kernel_path}")
|
| 51 |
+
sys.exit(1)
|
| 52 |
+
|
| 53 |
+
cuda_source = kernel_path.read_text()
|
| 54 |
+
|
| 55 |
+
# Get CUDA build info
|
| 56 |
+
cuda_info = get_cuda_info()
|
| 57 |
+
extra_flags = cuda_info.get('extra_cuda_cflags', [])
|
| 58 |
+
|
| 59 |
+
# Add architecture-specific flags for Hugging Face GPUs
|
| 60 |
+
# Common GPUs on HF: T4 (7.5), A100 (8.0), V100 (7.0)
|
| 61 |
+
hf_arch_flags = [
|
| 62 |
+
'-gencode=arch=compute_70,code=sm_70', # V100
|
| 63 |
+
'-gencode=arch=compute_75,code=sm_75', # T4
|
| 64 |
+
'-gencode=arch=compute_80,code=sm_80', # A100
|
| 65 |
+
]
|
| 66 |
+
extra_flags.extend(hf_arch_flags)
|
| 67 |
+
|
| 68 |
+
print("Build flags:", ' '.join(extra_flags))
|
| 69 |
+
print()
|
| 70 |
+
|
| 71 |
+
# Compile the kernel
|
| 72 |
+
module = compile_inline(
|
| 73 |
+
name='fused_instance_norm',
|
| 74 |
+
cuda_source=cuda_source,
|
| 75 |
+
functions=['forward'],
|
| 76 |
+
build_directory=Path(__file__).parent / "build",
|
| 77 |
+
extra_cuda_cflags=extra_flags,
|
| 78 |
+
verbose=True
|
| 79 |
+
)
|
| 80 |
+
|
| 81 |
+
print()
|
| 82 |
+
print("-" * 60)
|
| 83 |
+
print("Compilation successful!")
|
| 84 |
+
print()
|
| 85 |
+
|
| 86 |
+
# Find and copy the compiled library
|
| 87 |
+
build_dir = Path(__file__).parent / "build"
|
| 88 |
+
so_files = list(build_dir.rglob("fused_instance_norm*.so"))
|
| 89 |
+
pyd_files = list(build_dir.rglob("fused_instance_norm*.pyd"))
|
| 90 |
+
|
| 91 |
+
compiled_files = so_files + pyd_files
|
| 92 |
+
|
| 93 |
+
if not compiled_files:
|
| 94 |
+
print("WARNING: No compiled .so/.pyd file found in build/")
|
| 95 |
+
print("The module may be in torch's cache. Trying to locate...")
|
| 96 |
+
|
| 97 |
+
# Try to find in torch extension cache
|
| 98 |
+
import torch.utils.cpp_extension
|
| 99 |
+
cache_dir = Path(torch.utils.cpp_extension._get_build_directory('fused_instance_norm', False))
|
| 100 |
+
if cache_dir.exists():
|
| 101 |
+
compiled_files = list(cache_dir.rglob("*.so")) + list(cache_dir.rglob("*.pyd"))
|
| 102 |
+
|
| 103 |
+
if compiled_files:
|
| 104 |
+
# Copy each compiled file to prebuilt/
|
| 105 |
+
for src_file in compiled_files:
|
| 106 |
+
dst_file = prebuilt_dir / src_file.name
|
| 107 |
+
import shutil
|
| 108 |
+
shutil.copy2(src_file, dst_file)
|
| 109 |
+
print(f"Copied: {dst_file}")
|
| 110 |
+
print(f" Size: {dst_file.stat().st_size / 1024:.1f} KB")
|
| 111 |
+
else:
|
| 112 |
+
print("ERROR: Could not locate compiled kernel file.")
|
| 113 |
+
print("The kernel was compiled but the output file location is unknown.")
|
| 114 |
+
sys.exit(1)
|
| 115 |
+
|
| 116 |
+
print()
|
| 117 |
+
print("=" * 60)
|
| 118 |
+
print("Kernel compilation complete!")
|
| 119 |
+
print(f"Pre-compiled kernels saved to: {prebuilt_dir}")
|
| 120 |
+
print()
|
| 121 |
+
print("Next steps:")
|
| 122 |
+
print("1. Test the kernels locally: python -c 'from kernels import compile_kernels; print(compile_kernels())'")
|
| 123 |
+
print("2. Commit and push to deploy")
|
| 124 |
+
print("=" * 60)
|
| 125 |
+
|
| 126 |
+
except Exception as e:
|
| 127 |
+
print()
|
| 128 |
+
print("-" * 60)
|
| 129 |
+
print("ERROR: Compilation failed!")
|
| 130 |
+
print(f"Details: {e}")
|
| 131 |
+
import traceback
|
| 132 |
+
traceback.print_exc()
|
| 133 |
+
sys.exit(1)
|
compile_kernels_colab.ipynb
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# Compile StyleForge CUDA Kernels\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"Run this notebook in Google Colab to compile the CUDA kernels and download them for deployment to Hugging Face Spaces."
|
| 10 |
+
]
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"cell_type": "code",
|
| 14 |
+
"execution_count": null,
|
| 15 |
+
"metadata": {},
|
| 16 |
+
"outputs": [],
|
| 17 |
+
"source": [
|
| 18 |
+
"# Check if GPU is available\n",
|
| 19 |
+
"!nvidia-smi"
|
| 20 |
+
]
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"cell_type": "code",
|
| 24 |
+
"execution_count": null,
|
| 25 |
+
"metadata": {},
|
| 26 |
+
"outputs": [],
|
| 27 |
+
"source": [
|
| 28 |
+
"# Clone the repository\n",
|
| 29 |
+
"!git clone https://github.com/olivialiau/StyleForge\n",
|
| 30 |
+
"%cd StyleForge/huggingface-space"
|
| 31 |
+
]
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"cell_type": "code",
|
| 35 |
+
"execution_count": null,
|
| 36 |
+
"metadata": {},
|
| 37 |
+
"outputs": [],
|
| 38 |
+
"source": [
|
| 39 |
+
"# Install dependencies\n",
|
| 40 |
+
"!pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121\n",
|
| 41 |
+
"!pip install ninja"
|
| 42 |
+
]
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"cell_type": "code",
|
| 46 |
+
"execution_count": null,
|
| 47 |
+
"metadata": {},
|
| 48 |
+
"outputs": [],
|
| 49 |
+
"source": [
|
| 50 |
+
"# Run the compilation script\n",
|
| 51 |
+
"!python compile_kernels.py"
|
| 52 |
+
]
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"cell_type": "code",
|
| 56 |
+
"execution_count": null,
|
| 57 |
+
"metadata": {},
|
| 58 |
+
"outputs": [],
|
| 59 |
+
"source": [
|
| 60 |
+
"# List the compiled kernels\n",
|
| 61 |
+
"!ls -lh kernels/prebuilt/"
|
| 62 |
+
]
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"cell_type": "code",
|
| 66 |
+
"execution_count": null,
|
| 67 |
+
"metadata": {},
|
| 68 |
+
"outputs": [],
|
| 69 |
+
"source": [
|
| 70 |
+
"# Download the compiled kernels\n",
|
| 71 |
+
"from google.colab import files\n",
|
| 72 |
+
"import pathlib\n",
|
| 73 |
+
"\n",
|
| 74 |
+
"prebuilt_dir = pathlib.Path('kernels/prebuilt')\n",
|
| 75 |
+
"kernel_files = list(prebuilt_dir.glob('*.so')) + list(prebuilt_dir.glob('*.pyd'))\n",
|
| 76 |
+
"\n",
|
| 77 |
+
"if kernel_files:\n",
|
| 78 |
+
" for f in kernel_files:\n",
|
| 79 |
+
" print(f'Downloading: {f.name}')\n",
|
| 80 |
+
" files.download(str(f))\n",
|
| 81 |
+
"else:\n",
|
| 82 |
+
" print('No kernel files found!')\n",
|
| 83 |
+
" print('Check the compilation output above for errors.')"
|
| 84 |
+
]
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"cell_type": "markdown",
|
| 88 |
+
"metadata": {},
|
| 89 |
+
"source": [
|
| 90 |
+
"## Next Steps\n",
|
| 91 |
+
"\n",
|
| 92 |
+
"1. Download the `.so` file(s)\n",
|
| 93 |
+
"2. Copy them to your local `kernels/prebuilt/` directory\n",
|
| 94 |
+
"3. Commit and push to GitHub:\n",
|
| 95 |
+
" ```\n",
|
| 96 |
+
" git add kernels/prebuilt/\n",
|
| 97 |
+
" git commit -m \"Add pre-compiled CUDA kernels\"\n",
|
| 98 |
+
" git push\n",
|
| 99 |
+
" ```\n",
|
| 100 |
+
"\n",
|
| 101 |
+
"The kernels will be automatically loaded on Hugging Face Spaces!"
|
| 102 |
+
]
|
| 103 |
+
}
|
| 104 |
+
],
|
| 105 |
+
"metadata": {
|
| 106 |
+
"kernelspec": {
|
| 107 |
+
"display_name": "Python 3",
|
| 108 |
+
"language": "python",
|
| 109 |
+
"name": "python3"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
"nbformat": 4,
|
| 113 |
+
"nbformat_minor": 4
|
| 114 |
+
}
|
kernels/__init__.py
CHANGED
|
@@ -2,11 +2,13 @@
|
|
| 2 |
StyleForge CUDA Kernels Package
|
| 3 |
Custom CUDA kernels for accelerated neural style transfer.
|
| 4 |
|
| 5 |
-
For ZeroGPU:
|
|
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
import torch
|
| 9 |
import os
|
|
|
|
| 10 |
|
| 11 |
# Try to import CUDA kernels, fall back gracefully
|
| 12 |
_CUDA_KERNELS_AVAILABLE = False
|
|
@@ -16,6 +18,9 @@ _KERNELS_COMPILED = False
|
|
| 16 |
# Check if running on ZeroGPU
|
| 17 |
_ZERO_GPU = os.environ.get('SPACE_ID', '').startswith('hf.co') or os.environ.get('ZERO_GPU') == '1'
|
| 18 |
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
def check_cuda_kernels():
|
| 21 |
"""Check if CUDA kernels are available."""
|
|
@@ -23,28 +28,114 @@ def check_cuda_kernels():
|
|
| 23 |
|
| 24 |
|
| 25 |
def get_fused_instance_norm(num_features, **kwargs):
|
| 26 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
if _FusedInstanceNorm2d is not None:
|
| 28 |
try:
|
| 29 |
return _FusedInstanceNorm2d(num_features, **kwargs)
|
| 30 |
except Exception:
|
| 31 |
pass
|
| 32 |
-
# Fallback to PyTorch
|
| 33 |
return torch.nn.InstanceNorm2d(num_features, affine=kwargs.get('affine', True))
|
| 34 |
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
def compile_kernels():
|
| 37 |
"""
|
| 38 |
Compile CUDA kernels on-demand.
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
"""
|
| 43 |
global _CUDA_KERNELS_AVAILABLE, _FusedInstanceNorm2d, _KERNELS_COMPILED
|
| 44 |
|
| 45 |
if _KERNELS_COMPILED:
|
| 46 |
return _CUDA_KERNELS_AVAILABLE
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
if not torch.cuda.is_available():
|
| 49 |
_KERNELS_COMPILED = True
|
| 50 |
return False
|
|
@@ -63,9 +154,15 @@ def compile_kernels():
|
|
| 63 |
return False
|
| 64 |
|
| 65 |
|
| 66 |
-
# Auto-compile on import for non-ZeroGPU environments
|
| 67 |
if torch.cuda.is_available() and not _ZERO_GPU:
|
| 68 |
compile_kernels()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
|
| 71 |
__all__ = [
|
|
@@ -73,4 +170,5 @@ __all__ = [
|
|
| 73 |
'get_fused_instance_norm',
|
| 74 |
'FusedInstanceNorm2d',
|
| 75 |
'compile_kernels',
|
|
|
|
| 76 |
]
|
|
|
|
| 2 |
StyleForge CUDA Kernels Package
|
| 3 |
Custom CUDA kernels for accelerated neural style transfer.
|
| 4 |
|
| 5 |
+
For ZeroGPU: Pre-compiled kernels are loaded from prebuilt/.
|
| 6 |
+
For local: Kernels are JIT-compiled if prebuilt kernels are not available.
|
| 7 |
"""
|
| 8 |
|
| 9 |
import torch
|
| 10 |
import os
|
| 11 |
+
from pathlib import Path
|
| 12 |
|
| 13 |
# Try to import CUDA kernels, fall back gracefully
|
| 14 |
_CUDA_KERNELS_AVAILABLE = False
|
|
|
|
| 18 |
# Check if running on ZeroGPU
|
| 19 |
_ZERO_GPU = os.environ.get('SPACE_ID', '').startswith('hf.co') or os.environ.get('ZERO_GPU') == '1'
|
| 20 |
|
| 21 |
+
# Path to pre-compiled kernels
|
| 22 |
+
_PREBUILT_PATH = Path(__file__).parent / "prebuilt"
|
| 23 |
+
|
| 24 |
|
| 25 |
def check_cuda_kernels():
|
| 26 |
"""Check if CUDA kernels are available."""
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
def get_fused_instance_norm(num_features, **kwargs):
|
| 31 |
+
"""
|
| 32 |
+
Get FusedInstanceNorm2d module or PyTorch fallback.
|
| 33 |
+
|
| 34 |
+
On ZeroGPU: Uses pre-compiled kernels if available.
|
| 35 |
+
On local: May use custom fused kernels (prebuilt or JIT).
|
| 36 |
+
"""
|
| 37 |
if _FusedInstanceNorm2d is not None:
|
| 38 |
try:
|
| 39 |
return _FusedInstanceNorm2d(num_features, **kwargs)
|
| 40 |
except Exception:
|
| 41 |
pass
|
| 42 |
+
# Fallback to PyTorch (still GPU-accelerated, just not custom fused)
|
| 43 |
return torch.nn.InstanceNorm2d(num_features, affine=kwargs.get('affine', True))
|
| 44 |
|
| 45 |
|
| 46 |
+
def load_prebuilt_kernels():
|
| 47 |
+
"""
|
| 48 |
+
Try to load pre-compiled CUDA kernels from prebuilt/ directory.
|
| 49 |
+
|
| 50 |
+
Returns True if successful, False otherwise.
|
| 51 |
+
"""
|
| 52 |
+
global _FusedInstanceNorm2d, _CUDA_KERNELS_AVAILABLE, _KERNELS_COMPILED
|
| 53 |
+
|
| 54 |
+
if _KERNELS_COMPILED:
|
| 55 |
+
return _CUDA_KERNELS_AVAILABLE
|
| 56 |
+
|
| 57 |
+
# Check if prebuilt kernels exist
|
| 58 |
+
prebuilt_files = list(_PREBUILT_PATH.glob("*.so")) + list(_PREBUILT_PATH.glob("*.pyd"))
|
| 59 |
+
if not prebuilt_files:
|
| 60 |
+
print("No pre-compiled kernels found in prebuilt/")
|
| 61 |
+
return False
|
| 62 |
+
|
| 63 |
+
try:
|
| 64 |
+
# Try to import from prebuilt directory
|
| 65 |
+
import sys
|
| 66 |
+
if str(_PREBUILT_PATH) not in sys.path:
|
| 67 |
+
sys.path.insert(0, str(_PREBUILT_PATH))
|
| 68 |
+
|
| 69 |
+
# Try to load the prebuilt module
|
| 70 |
+
for kernel_file in prebuilt_files:
|
| 71 |
+
try:
|
| 72 |
+
# Import the compiled module
|
| 73 |
+
module_name = kernel_file.stem
|
| 74 |
+
spec = __import__('importlib.util').util.spec_from_file_location(module_name, kernel_file)
|
| 75 |
+
if spec and spec.loader:
|
| 76 |
+
mod = __import__('importlib.util').util.module_from_spec(spec)
|
| 77 |
+
spec.loader.exec_module(mod)
|
| 78 |
+
print(f"Loaded pre-compiled kernel: {kernel_file.name}")
|
| 79 |
+
|
| 80 |
+
# Create FusedInstanceNorm2d class
|
| 81 |
+
class PrebuiltFusedInstanceNorm2d(torch.nn.Module):
|
| 82 |
+
def __init__(self, num_features, **kwargs):
|
| 83 |
+
super().__init__()
|
| 84 |
+
self.num_features = num_features
|
| 85 |
+
self.eps = kwargs.get('eps', 1e-5)
|
| 86 |
+
if kwargs.get('affine', True):
|
| 87 |
+
self.gamma = torch.nn.Parameter(torch.ones(num_features))
|
| 88 |
+
self.beta = torch.nn.Parameter(torch.zeros(num_features))
|
| 89 |
+
else:
|
| 90 |
+
self.register_buffer('gamma', torch.ones(num_features))
|
| 91 |
+
self.register_buffer('beta', torch.zeros(num_features))
|
| 92 |
+
self._pytorch_norm = torch.nn.InstanceNorm2d(num_features, **kwargs)
|
| 93 |
+
|
| 94 |
+
def forward(self, x):
|
| 95 |
+
try:
|
| 96 |
+
return mod.fused_instance_norm_forward(
|
| 97 |
+
x.contiguous(), self.gamma, self.beta, self.eps
|
| 98 |
+
)
|
| 99 |
+
except Exception:
|
| 100 |
+
return self._pytorch_norm(x)
|
| 101 |
+
|
| 102 |
+
_FusedInstanceNorm2d = PrebuiltFusedInstanceNorm2d
|
| 103 |
+
_CUDA_KERNELS_AVAILABLE = True
|
| 104 |
+
_KERNELS_COMPILED = True
|
| 105 |
+
return True
|
| 106 |
+
except Exception as e:
|
| 107 |
+
print(f"Failed to load {kernel_file.name}: {e}")
|
| 108 |
+
continue
|
| 109 |
+
|
| 110 |
+
except Exception as e:
|
| 111 |
+
print(f"Failed to load prebuilt kernels: {e}")
|
| 112 |
+
|
| 113 |
+
return False
|
| 114 |
+
|
| 115 |
+
|
| 116 |
def compile_kernels():
|
| 117 |
"""
|
| 118 |
Compile CUDA kernels on-demand.
|
| 119 |
|
| 120 |
+
On ZeroGPU: Uses pre-compiled kernels if present; otherwise falls back to PyTorch (no JIT compilation).
|
| 121 |
+
On local: Tries pre-compiled kernels first, then JIT-compiles the custom CUDA kernels.
|
| 122 |
"""
|
| 123 |
global _CUDA_KERNELS_AVAILABLE, _FusedInstanceNorm2d, _KERNELS_COMPILED
|
| 124 |
|
| 125 |
if _KERNELS_COMPILED:
|
| 126 |
return _CUDA_KERNELS_AVAILABLE
|
| 127 |
|
| 128 |
+
# First, try pre-compiled kernels
|
| 129 |
+
if load_prebuilt_kernels():
|
| 130 |
+
print("Using pre-compiled CUDA kernels!")
|
| 131 |
+
return True
|
| 132 |
+
|
| 133 |
+
# Fall back to JIT compilation (only on local, not ZeroGPU)
|
| 134 |
+
if _ZERO_GPU:
|
| 135 |
+
print("ZeroGPU mode: No pre-compiled kernels found, using PyTorch fallback")
|
| 136 |
+
_KERNELS_COMPILED = True
|
| 137 |
+
return False
|
| 138 |
+
|
| 139 |
if not torch.cuda.is_available():
|
| 140 |
_KERNELS_COMPILED = True
|
| 141 |
return False
|
|
|
|
| 154 |
return False
|
| 155 |
|
| 156 |
|
| 157 |
+
# Auto-compile on import for non-ZeroGPU environments with CUDA
|
| 158 |
if torch.cuda.is_available() and not _ZERO_GPU:
|
| 159 |
compile_kernels()
|
| 160 |
+
elif _ZERO_GPU:
|
| 161 |
+
# On ZeroGPU, try prebuilt kernels
|
| 162 |
+
if load_prebuilt_kernels():
|
| 163 |
+
print("ZeroGPU: Using pre-compiled CUDA kernels!")
|
| 164 |
+
else:
|
| 165 |
+
print("ZeroGPU: No pre-compiled kernels, using PyTorch GPU fallback")
|
| 166 |
|
| 167 |
|
| 168 |
__all__ = [
|
|
|
|
| 170 |
'get_fused_instance_norm',
|
| 171 |
'FusedInstanceNorm2d',
|
| 172 |
'compile_kernels',
|
| 173 |
+
'load_prebuilt_kernels',
|
| 174 |
]
|
kernels/prebuilt/README.md
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Pre-Compiled CUDA Kernels
|
| 2 |
+
|
| 3 |
+
This directory contains pre-compiled CUDA kernels for use on Hugging Face Spaces.
|
| 4 |
+
|
| 5 |
+
## How to Compile Kernels Locally
|
| 6 |
+
|
| 7 |
+
To compile the CUDA kernels locally and upload them here:
|
| 8 |
+
|
| 9 |
+
### 1. Compile Locally
|
| 10 |
+
|
| 11 |
+
Run this script from the `huggingface-space` directory:
|
| 12 |
+
|
| 13 |
+
```bash
|
| 14 |
+
python compile_kernels.py
|
| 15 |
+
```
|
| 16 |
+
|
| 17 |
+
Or compile manually:
|
| 18 |
+
|
| 19 |
+
```bash
|
| 20 |
+
cd huggingface-space
|
| 21 |
+
python -c "
|
| 22 |
+
from kernels.cuda_build import compile_inline
|
| 23 |
+
from pathlib import Path
|
| 24 |
+
|
| 25 |
+
cuda_source = (Path('kernels') / 'instance_norm.cu').read_text()
|
| 26 |
+
module = compile_inline(
|
| 27 |
+
name='fused_instance_norm',
|
| 28 |
+
cuda_source=cuda_source,
|
| 29 |
+
functions=['forward'],
|
| 30 |
+
build_directory=Path('build'),
|
| 31 |
+
verbose=True
|
| 32 |
+
)
|
| 33 |
+
print('Compiled successfully!')
|
| 34 |
+
print(f'Module location: {module.__file__}')
|
| 35 |
+
"
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
### 2. Copy Compiled File
|
| 39 |
+
|
| 40 |
+
If you compiled manually, copy the compiled `.so` file (`.pyd` on Windows) to this directory; `compile_kernels.py` already does this copy for you:
|
| 41 |
+
|
| 42 |
+
```bash
|
| 43 |
+
# Find the compiled file (usually in build/)
|
| 44 |
+
find build/ -name "*.so" -exec cp {} kernels/prebuilt/ \;
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
### 3. Commit and Push
|
| 48 |
+
|
| 49 |
+
```bash
|
| 50 |
+
git add kernels/prebuilt/
|
| 51 |
+
git commit -m "Add pre-compiled CUDA kernels"
|
| 52 |
+
git push
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
## Notes
|
| 56 |
+
|
| 57 |
+
- The compiled kernels are architecture-specific (e.g., `sm_70`, `sm_75`, `sm_86`)
|
| 58 |
+
- Hugging Face Spaces typically use Tesla T4 (sm_75) or A100 (sm_80)
|
| 59 |
+
- For maximum compatibility, compile with multiple compute capabilities (`compile_kernels.py` targets `sm_70`, `sm_75`, and `sm_80`)
|
| 60 |
+
|
| 61 |
+
## Current Status
|
| 62 |
+
|
| 63 |
+
No pre-compiled kernels have been added to this directory yet. Until they are, the app will use PyTorch's InstanceNorm2d fallback,
|
| 64 |
+
which is still GPU-accelerated but not as fast as custom fused kernels.
|