Spaces:

oliau
/

StyleForge

Sleeping

App Files Files Community

Olivia committed 15 days ago

Commit

1282ba1

1 Parent(s): d3d412a

info endpoint

Browse files

Files changed (6) hide show

.gitignore +4 -0
app.py +14 -15
compile_kernels.py +133 -0
compile_kernels_colab.ipynb +114 -0
kernels/__init__.py +104 -6
kernels/prebuilt/README.md +64 -0

.gitignore CHANGED Viewed

@@ -33,3 +33,7 @@ Thumbs.db
 # Gradio
 gradio_cached_examples/
 flagged/

 # Gradio
 gradio_cached_examples/
 flagged/
+# BUT allow pre-compiled kernels in prebuilt/
+!kernels/prebuilt/*.so
+!kernels/prebuilt/*.pyd

app.py CHANGED Viewed

@@ -120,10 +120,15 @@ if SPACES_AVAILABLE:
 # Check CUDA kernels availability
 try:
-    from kernels import check_cuda_kernels, get_fused_instance_norm, compile_kernels
-    # On ZeroGPU, kernels will be compiled on-demand within GPU tasks
     CUDA_KERNELS_AVAILABLE = check_cuda_kernels()
-    print(f"CUDA Kernels: {'Available (lazy-loaded)' if not CUDA_KERNELS_AVAILABLE and SPACES_AVAILABLE else 'Available' if CUDA_KERNELS_AVAILABLE else 'Not Available'}")
 except Exception:
     CUDA_KERNELS_AVAILABLE = False
     print("CUDA Kernels: Not Available (using PyTorch fallback)")
@@ -579,7 +584,12 @@ if _SPACES_ZERO_GPU:
     print("Device: CUDA (ZeroGPU mode - lazy initialization)")
 else:
     print(f"Device: {get_device().type.upper()}")
-print(f"CUDA Kernels: {'Available' if CUDA_KERNELS_AVAILABLE else 'Not Available (will compile on first GPU task)'}")
 # Skip model preloading on ZeroGPU to avoid CUDA init in main process
 if not _SPACES_ZERO_GPU:
@@ -1364,22 +1374,11 @@ def stylize_image_impl(
     add_watermark: bool
 ) -> Tuple[Optional[Image.Image], str, Optional[str]]:
     """Main stylization function for Gradio."""
-    global CUDA_KERNELS_AVAILABLE
     if input_image is None:
         return None, "Please upload an image first.", None
     try:
-        # On ZeroGPU, compile CUDA kernels within the GPU task on first use
-        if SPACES_AVAILABLE and not CUDA_KERNELS_AVAILABLE:
-            try:
-                from kernels import compile_kernels
-                CUDA_KERNELS_AVAILABLE = compile_kernels()
-                if CUDA_KERNELS_AVAILABLE:
-                    print("CUDA kernels compiled successfully within GPU task!")
-            except Exception as e:
-                print(f"Failed to compile CUDA kernels: {e}")
         # Convert to RGB if needed
         if input_image.mode != 'RGB':
             input_image = input_image.convert('RGB')

 # Check CUDA kernels availability
 try:
+    from kernels import check_cuda_kernels, get_fused_instance_norm, load_prebuilt_kernels
+    # On ZeroGPU: Uses pre-compiled kernels from prebuilt/ if available
+    # On local: JIT compiles kernels if prebuilt not found
     CUDA_KERNELS_AVAILABLE = check_cuda_kernels()
+    if SPACES_AVAILABLE:
+        status = "Pre-compiled" if CUDA_KERNELS_AVAILABLE else "PyTorch GPU fallback (no prebuilt kernels)"
+        print(f"CUDA Kernels: {status}")
+    else:
+        print(f"CUDA Kernels: {'Available' if CUDA_KERNELS_AVAILABLE else 'Not Available (using PyTorch fallback)'}")
 except Exception:
     CUDA_KERNELS_AVAILABLE = False
     print("CUDA Kernels: Not Available (using PyTorch fallback)")
     print("Device: CUDA (ZeroGPU mode - lazy initialization)")
 else:
     print(f"Device: {get_device().type.upper()}")
+if SPACES_AVAILABLE:
+    status = "Pre-compiled" if CUDA_KERNELS_AVAILABLE else "PyTorch GPU fallback"
+    print(f"CUDA Kernels: {status}")
+else:
+    print(f"CUDA Kernels: {'Available' if CUDA_KERNELS_AVAILABLE else 'Not Available (using PyTorch fallback)'}")
 # Skip model preloading on ZeroGPU to avoid CUDA init in main process
 if not _SPACES_ZERO_GPU:
     add_watermark: bool
 ) -> Tuple[Optional[Image.Image], str, Optional[str]]:
     """Main stylization function for Gradio."""
     if input_image is None:
         return None, "Please upload an image first.", None
     try:
         # Convert to RGB if needed
         if input_image.mode != 'RGB':
             input_image = input_image.convert('RGB')

compile_kernels.py ADDED Viewed

	@@ -0,0 +1,133 @@

+#!/usr/bin/env python3
+"""
+Compile CUDA kernels locally for deployment to Hugging Face Spaces.
+This script compiles the custom CUDA kernels and saves them to the prebuilt/
+directory for use on ZeroGPU where JIT compilation is too slow.
+"""
+import sys
+import torch
+from pathlib import Path
+# Add kernels directory to path
+sys.path.insert(0, str(Path(__file__).parent))
+print("=" * 60)
+print("StyleForge CUDA Kernel Compiler")
+print("=" * 60)
+print()
+# Check CUDA availability
+if not torch.cuda.is_available():
+    print("ERROR: CUDA is not available on this system.")
+    print("This script requires a CUDA-capable GPU.")
+    sys.exit(1)
+print(f"CUDA Version: {torch.version.cuda}")
+print(f"PyTorch Version: {torch.__version__}")
+print(f"GPU: {torch.cuda.get_device_name(0)}")
+# Get compute capability
+major, minor = torch.cuda.get_device_capability(0)
+compute_capability = f"{major}.{minor}"
+print(f"Compute Capability: {compute_capability}")
+print()
+# Create prebuilt directory
+prebuilt_dir = Path(__file__).parent / "kernels" / "prebuilt"
+prebuilt_dir.mkdir(exist_ok=True, parents=True)
+print("Compiling CUDA kernels...")
+print("-" * 60)
+try:
+    from kernels.cuda_build import compile_inline, get_cuda_info
+    # Read CUDA source
+    kernel_path = Path(__file__).parent / "kernels" / "instance_norm.cu"
+    if not kernel_path.exists():
+        print(f"ERROR: Kernel source not found at {kernel_path}")
+        sys.exit(1)
+    cuda_source = kernel_path.read_text()
+    # Get CUDA build info
+    cuda_info = get_cuda_info()
+    extra_flags = cuda_info.get('extra_cuda_cflags', [])
+    # Add architecture-specific flags for Hugging Face GPUs
+    # Common GPUs on HF: T4 (7.5), A100 (8.0), V100 (7.0)
+    hf_arch_flags = [
+        '-gencode=arch=compute_70,code=sm_70',  # V100
+        '-gencode=arch=compute_75,code=sm_75',  # T4
+        '-gencode=arch=compute_80,code=sm_80',  # A100
+    ]
+    extra_flags.extend(hf_arch_flags)
+    print("Build flags:", ' '.join(extra_flags))
+    print()
+    # Compile the kernel
+    module = compile_inline(
+        name='fused_instance_norm',
+        cuda_source=cuda_source,
+        functions=['forward'],
+        build_directory=Path(__file__).parent / "build",
+        extra_cuda_cflags=extra_flags,
+        verbose=True
+    )
+    print()
+    print("-" * 60)
+    print("Compilation successful!")
+    print()
+    # Find and copy the compiled library
+    build_dir = Path(__file__).parent / "build"
+    so_files = list(build_dir.rglob("fused_instance_norm*.so"))
+    pyd_files = list(build_dir.rglob("fused_instance_norm*.pyd"))
+    compiled_files = so_files + pyd_files
+    if not compiled_files:
+        print("WARNING: No compiled .so/.pyd file found in build/")
+        print("The module may be in torch's cache. Trying to locate...")
+        # Try to find in torch extension cache
+        import torch.utils.cpp_extension
+        cache_dir = Path(torch.utils.cpp_extension._get_build_directory('fused_instance_norm', False))
+        if cache_dir.exists():
+            compiled_files = list(cache_dir.rglob("*.so")) + list(cache_dir.rglob("*.pyd"))
+    if compiled_files:
+        # Copy each compiled file to prebuilt/
+        for src_file in compiled_files:
+            dst_file = prebuilt_dir / src_file.name
+            import shutil
+            shutil.copy2(src_file, dst_file)
+            print(f"Copied: {dst_file}")
+            print(f"  Size: {dst_file.stat().st_size / 1024:.1f} KB")
+    else:
+        print("ERROR: Could not locate compiled kernel file.")
+        print("The kernel was compiled but the output file location is unknown.")
+        sys.exit(1)
+    print()
+    print("=" * 60)
+    print("Kernel compilation complete!")
+    print(f"Pre-compiled kernels saved to: {prebuilt_dir}")
+    print()
+    print("Next steps:")
+    print("1. Test the kernels locally: python -c 'from kernels import compile_kernels; print(compile_kernels())'")
+    print("2. Commit and push to deploy")
+    print("=" * 60)
+except Exception as e:
+    print()
+    print("-" * 60)
+    print("ERROR: Compilation failed!")
+    print(f"Details: {e}")
+    import traceback
+    traceback.print_exc()
+    sys.exit(1)

compile_kernels_colab.ipynb ADDED Viewed

	@@ -0,0 +1,114 @@

+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Compile StyleForge CUDA Kernels\n",
+        "\n",
+        "Run this notebook in Google Colab to compile the CUDA kernels and download them for deployment to Hugging Face Spaces."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Check if GPU is available\n",
+        "!nvidia-smi"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Clone the repository\n",
+        "!git clone https://github.com/olivialiau/StyleForge\n",
+        "%cd StyleForge/huggingface-space"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Install dependencies\n",
+        "!pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121\n",
+        "!pip install ninja"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Run the compilation script\n",
+        "!python compile_kernels.py"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# List the compiled kernels\n",
+        "!ls -lh kernels/prebuilt/"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Download the compiled kernels\n",
+        "from google.colab import files\n",
+        "import pathlib\n",
+        "\n",
+        "prebuilt_dir = pathlib.Path('kernels/prebuilt')\n",
+        "kernel_files = list(prebuilt_dir.glob('*.so')) + list(prebuilt_dir.glob('*.pyd'))\n",
+        "\n",
+        "if kernel_files:\n",
+        "    for f in kernel_files:\n",
+        "        print(f'Downloading: {f.name}')\n",
+        "        files.download(str(f))\n",
+        "else:\n",
+        "    print('No kernel files found!')\n",
+        "    print('Check the compilation output above for errors.')"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Next Steps\n",
+        "\n",
+        "1. Download the `.so` file(s)\n",
+        "2. Copy them to your local `kernels/prebuilt/` directory\n",
+        "3. Commit and push to GitHub:\n",
+        "   ```\n",
+        "   git add kernels/prebuilt/\n",
+        "   git commit -m \"Add pre-compiled CUDA kernels\"\n",
+        "   git push\n",
+        "   ```\n",
+        "\n",
+        "The kernels will be automatically loaded on Hugging Face Spaces!"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 4
+}

kernels/__init__.py CHANGED Viewed

@@ -2,11 +2,13 @@
 StyleForge CUDA Kernels Package
 Custom CUDA kernels for accelerated neural style transfer.
-For ZeroGPU: Kernels are compiled on-demand within GPU task context.
 """
 import torch
 import os
 # Try to import CUDA kernels, fall back gracefully
 _CUDA_KERNELS_AVAILABLE = False
@@ -16,6 +18,9 @@ _KERNELS_COMPILED = False
 # Check if running on ZeroGPU
 _ZERO_GPU = os.environ.get('SPACE_ID', '').startswith('hf.co') or os.environ.get('ZERO_GPU') == '1'
 def check_cuda_kernels():
     """Check if CUDA kernels are available."""
@@ -23,28 +28,114 @@ def check_cuda_kernels():
 def get_fused_instance_norm(num_features, **kwargs):
-    """Get FusedInstanceNorm2d module or PyTorch fallback."""
     if _FusedInstanceNorm2d is not None:
         try:
             return _FusedInstanceNorm2d(num_features, **kwargs)
         except Exception:
             pass
-    # Fallback to PyTorch
     return torch.nn.InstanceNorm2d(num_features, affine=kwargs.get('affine', True))
 def compile_kernels():
     """
     Compile CUDA kernels on-demand.
-    This function is called within a GPU task on ZeroGPU to ensure
-    compilation happens within the task's timeout budget.
     """
     global _CUDA_KERNELS_AVAILABLE, _FusedInstanceNorm2d, _KERNELS_COMPILED
     if _KERNELS_COMPILED:
         return _CUDA_KERNELS_AVAILABLE
     if not torch.cuda.is_available():
         _KERNELS_COMPILED = True
         return False
@@ -63,9 +154,15 @@ def compile_kernels():
         return False
-# Auto-compile on import for non-ZeroGPU environments
 if torch.cuda.is_available() and not _ZERO_GPU:
     compile_kernels()
 __all__ = [
@@ -73,4 +170,5 @@ __all__ = [
     'get_fused_instance_norm',
     'FusedInstanceNorm2d',
     'compile_kernels',
 ]

 StyleForge CUDA Kernels Package
 Custom CUDA kernels for accelerated neural style transfer.
+For ZeroGPU: Pre-compiled kernels are loaded from prebuilt/.
+For local: Kernels are JIT-compiled if prebuilt not available.
 """
 import torch
 import os
+from pathlib import Path
 # Try to import CUDA kernels, fall back gracefully
 _CUDA_KERNELS_AVAILABLE = False
 # Check if running on ZeroGPU
 _ZERO_GPU = os.environ.get('SPACE_ID', '').startswith('hf.co') or os.environ.get('ZERO_GPU') == '1'
+# Path to pre-compiled kernels
+_PREBUILT_PATH = Path(__file__).parent / "prebuilt"
 def check_cuda_kernels():
     """Check if CUDA kernels are available."""
 def get_fused_instance_norm(num_features, **kwargs):
+    """
+    Get FusedInstanceNorm2d module or PyTorch fallback.
+    On ZeroGPU: Uses pre-compiled kernels if available.
+    On local: May use custom fused kernels (prebuilt or JIT).
+    """
     if _FusedInstanceNorm2d is not None:
         try:
             return _FusedInstanceNorm2d(num_features, **kwargs)
         except Exception:
             pass
+    # Fallback to PyTorch (still GPU-accelerated, just not custom fused)
     return torch.nn.InstanceNorm2d(num_features, affine=kwargs.get('affine', True))
+def load_prebuilt_kernels():
+    """
+    Try to load pre-compiled CUDA kernels from prebuilt/ directory.
+    Returns True if successful, False otherwise.
+    """
+    global _FusedInstanceNorm2d, _CUDA_KERNELS_AVAILABLE, _KERNELS_COMPILED
+    if _KERNELS_COMPILED:
+        return _CUDA_KERNELS_AVAILABLE
+    # Check if prebuilt kernels exist
+    prebuilt_files = list(_PREBUILT_PATH.glob("*.so")) + list(_PREBUILT_PATH.glob("*.pyd"))
+    if not prebuilt_files:
+        print("No pre-compiled kernels found in prebuilt/")
+        return False
+    try:
+        # Try to import from prebuilt directory
+        import sys
+        if str(_PREBUILT_PATH) not in sys.path:
+            sys.path.insert(0, str(_PREBUILT_PATH))
+        # Try to load the prebuilt module
+        for kernel_file in prebuilt_files:
+            try:
+                # Import the compiled module
+                module_name = kernel_file.stem
+                spec = __import__('importlib.util').util.spec_from_file_location(module_name, kernel_file)
+                if spec and spec.loader:
+                    mod = __import__('importlib.util').util.module_from_spec(spec)
+                    spec.loader.exec_module(mod)
+                    print(f"Loaded pre-compiled kernel: {kernel_file.name}")
+                    # Create FusedInstanceNorm2d class
+                    class PrebuiltFusedInstanceNorm2d(torch.nn.Module):
+                        def __init__(self, num_features, **kwargs):
+                            super().__init__()
+                            self.num_features = num_features
+                            self.eps = kwargs.get('eps', 1e-5)
+                            if kwargs.get('affine', True):
+                                self.gamma = torch.nn.Parameter(torch.ones(num_features))
+                                self.beta = torch.nn.Parameter(torch.zeros(num_features))
+                            else:
+                                self.register_buffer('gamma', torch.ones(num_features))
+                                self.register_buffer('beta', torch.zeros(num_features))
+                            self._pytorch_norm = torch.nn.InstanceNorm2d(num_features, **kwargs)
+                        def forward(self, x):
+                            try:
+                                return mod.fused_instance_norm_forward(
+                                    x.contiguous(), self.gamma, self.beta, self.eps
+                                )
+                            except Exception:
+                                return self._pytorch_norm(x)
+                    _FusedInstanceNorm2d = PrebuiltFusedInstanceNorm2d
+                    _CUDA_KERNELS_AVAILABLE = True
+                    _KERNELS_COMPILED = True
+                    return True
+            except Exception as e:
+                print(f"Failed to load {kernel_file.name}: {e}")
+                continue
+    except Exception as e:
+        print(f"Failed to load prebuilt kernels: {e}")
+    return False
 def compile_kernels():
     """
     Compile CUDA kernels on-demand.
+    On ZeroGPU: Tries pre-compiled kernels first.
+    On local: Compiles custom CUDA kernels.
     """
     global _CUDA_KERNELS_AVAILABLE, _FusedInstanceNorm2d, _KERNELS_COMPILED
     if _KERNELS_COMPILED:
         return _CUDA_KERNELS_AVAILABLE
+    # First, try pre-compiled kernels
+    if load_prebuilt_kernels():
+        print("Using pre-compiled CUDA kernels!")
+        return True
+    # Fall back to JIT compilation (only on local, not ZeroGPU)
+    if _ZERO_GPU:
+        print("ZeroGPU mode: No pre-compiled kernels found, using PyTorch fallback")
+        _KERNELS_COMPILED = True
+        return False
     if not torch.cuda.is_available():
         _KERNELS_COMPILED = True
         return False
         return False
+# Auto-compile on import for non-ZeroGPU environments with CUDA
 if torch.cuda.is_available() and not _ZERO_GPU:
     compile_kernels()
+elif _ZERO_GPU:
+    # On ZeroGPU, try prebuilt kernels
+    if load_prebuilt_kernels():
+        print("ZeroGPU: Using pre-compiled CUDA kernels!")
+    else:
+        print("ZeroGPU: No pre-compiled kernels, using PyTorch GPU fallback")
 __all__ = [
     'get_fused_instance_norm',
     'FusedInstanceNorm2d',
     'compile_kernels',
+    'load_prebuilt_kernels',
 ]

kernels/prebuilt/README.md ADDED Viewed

	@@ -0,0 +1,64 @@

+# Pre-Compiled CUDA Kernels
+This directory contains pre-compiled CUDA kernels for use on Hugging Face Spaces.
+## How to Compile Kernels Locally
+To compile the CUDA kernels locally and upload them here:
+### 1. Compile Locally
+Run this script from the `huggingface-space` directory:
+```bash
+python compile_kernels.py
+```
+Or compile manually:
+```bash
+cd huggingface-space
+python -c "
+from kernels.cuda_build import compile_inline
+from pathlib import Path
+cuda_source = (Path('kernels') / 'instance_norm.cu').read_text()
+module = compile_inline(
+    name='fused_instance_norm',
+    cuda_source=cuda_source,
+    functions=['forward'],
+    build_directory=Path('build'),
+    verbose=True
+)
+print('Compiled successfully!')
+print(f'Module location: {module.__file__}')
+"
+```
+### 2. Copy Compiled File
+After compilation, copy the compiled `.so` file to this directory:
+```bash
+# Find the compiled file (usually in build/)
+find build/ -name "*.so" -exec cp {} kernels/prebuilt/ \;
+```
+### 3. Commit and Push
+```bash
+git add kernels/prebuilt/
+git commit -m "Add pre-compiled CUDA kernels"
+git push
+```
+## Notes
+- The compiled kernels are architecture-specific (e.g., `sm_70`, `sm_75`, `sm_80`); `compile_kernels.py` builds for these three targets
+- Hugging Face Spaces typically use Tesla T4 (sm_75) or A100 (sm_80)
+- For maximum compatibility, compile with multiple compute capabilities
+## Current Status
+No pre-compiled kernels found. The app will use PyTorch's InstanceNorm2d fallback,
+which is still GPU-accelerated but not as fast as custom fused kernels.