Olivia committed on
Commit
1282ba1
·
1 Parent(s): d3d412a

info endpoint

Browse files
.gitignore CHANGED
@@ -33,3 +33,7 @@ Thumbs.db
33
  # Gradio
34
  gradio_cached_examples/
35
  flagged/
 
 
 
 
 
33
  # Gradio
34
  gradio_cached_examples/
35
  flagged/
36
+
37
+ # BUT allow pre-compiled kernels in prebuilt/
38
+ !kernels/prebuilt/*.so
39
+ !kernels/prebuilt/*.pyd
app.py CHANGED
@@ -120,10 +120,15 @@ if SPACES_AVAILABLE:
120
 
121
  # Check CUDA kernels availability
122
  try:
123
- from kernels import check_cuda_kernels, get_fused_instance_norm, compile_kernels
124
- # On ZeroGPU, kernels will be compiled on-demand within GPU tasks
 
125
  CUDA_KERNELS_AVAILABLE = check_cuda_kernels()
126
- print(f"CUDA Kernels: {'Available (lazy-loaded)' if not CUDA_KERNELS_AVAILABLE and SPACES_AVAILABLE else 'Available' if CUDA_KERNELS_AVAILABLE else 'Not Available'}")
 
 
 
 
127
  except Exception:
128
  CUDA_KERNELS_AVAILABLE = False
129
  print("CUDA Kernels: Not Available (using PyTorch fallback)")
@@ -579,7 +584,12 @@ if _SPACES_ZERO_GPU:
579
  print("Device: CUDA (ZeroGPU mode - lazy initialization)")
580
  else:
581
  print(f"Device: {get_device().type.upper()}")
582
- print(f"CUDA Kernels: {'Available' if CUDA_KERNELS_AVAILABLE else 'Not Available (will compile on first GPU task)'}")
 
 
 
 
 
583
 
584
  # Skip model preloading on ZeroGPU to avoid CUDA init in main process
585
  if not _SPACES_ZERO_GPU:
@@ -1364,22 +1374,11 @@ def stylize_image_impl(
1364
  add_watermark: bool
1365
  ) -> Tuple[Optional[Image.Image], str, Optional[str]]:
1366
  """Main stylization function for Gradio."""
1367
- global CUDA_KERNELS_AVAILABLE
1368
 
1369
  if input_image is None:
1370
  return None, "Please upload an image first.", None
1371
 
1372
  try:
1373
- # On ZeroGPU, compile CUDA kernels within the GPU task on first use
1374
- if SPACES_AVAILABLE and not CUDA_KERNELS_AVAILABLE:
1375
- try:
1376
- from kernels import compile_kernels
1377
- CUDA_KERNELS_AVAILABLE = compile_kernels()
1378
- if CUDA_KERNELS_AVAILABLE:
1379
- print("CUDA kernels compiled successfully within GPU task!")
1380
- except Exception as e:
1381
- print(f"Failed to compile CUDA kernels: {e}")
1382
-
1383
  # Convert to RGB if needed
1384
  if input_image.mode != 'RGB':
1385
  input_image = input_image.convert('RGB')
 
120
 
121
  # Check CUDA kernels availability
122
  try:
123
+ from kernels import check_cuda_kernels, get_fused_instance_norm, load_prebuilt_kernels
124
+ # On ZeroGPU: Uses pre-compiled kernels from prebuilt/ if available
125
+ # On local: JIT compiles kernels if prebuilt not found
126
  CUDA_KERNELS_AVAILABLE = check_cuda_kernels()
127
+ if SPACES_AVAILABLE:
128
+ status = "Pre-compiled" if CUDA_KERNELS_AVAILABLE else "PyTorch GPU fallback (no prebuilt kernels)"
129
+ print(f"CUDA Kernels: {status}")
130
+ else:
131
+ print(f"CUDA Kernels: {'Available' if CUDA_KERNELS_AVAILABLE else 'Not Available (using PyTorch fallback)'}")
132
  except Exception:
133
  CUDA_KERNELS_AVAILABLE = False
134
  print("CUDA Kernels: Not Available (using PyTorch fallback)")
 
584
  print("Device: CUDA (ZeroGPU mode - lazy initialization)")
585
  else:
586
  print(f"Device: {get_device().type.upper()}")
587
+
588
+ if SPACES_AVAILABLE:
589
+ status = "Pre-compiled" if CUDA_KERNELS_AVAILABLE else "PyTorch GPU fallback"
590
+ print(f"CUDA Kernels: {status}")
591
+ else:
592
+ print(f"CUDA Kernels: {'Available' if CUDA_KERNELS_AVAILABLE else 'Not Available (using PyTorch fallback)'}")
593
 
594
  # Skip model preloading on ZeroGPU to avoid CUDA init in main process
595
  if not _SPACES_ZERO_GPU:
 
1374
  add_watermark: bool
1375
  ) -> Tuple[Optional[Image.Image], str, Optional[str]]:
1376
  """Main stylization function for Gradio."""
 
1377
 
1378
  if input_image is None:
1379
  return None, "Please upload an image first.", None
1380
 
1381
  try:
 
 
 
 
 
 
 
 
 
 
1382
  # Convert to RGB if needed
1383
  if input_image.mode != 'RGB':
1384
  input_image = input_image.convert('RGB')
compile_kernels.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Compile CUDA kernels locally for deployment to Hugging Face Spaces.
4
+
5
+ This script compiles the custom CUDA kernels and saves them to the prebuilt/
6
+ directory for use on ZeroGPU where JIT compilation is too slow.
7
+ """
8
+
9
+ import sys
10
+ import torch
11
+ from pathlib import Path
12
+
13
+ # Add kernels directory to path
14
+ sys.path.insert(0, str(Path(__file__).parent))
15
+
16
+ print("=" * 60)
17
+ print("StyleForge CUDA Kernel Compiler")
18
+ print("=" * 60)
19
+ print()
20
+
21
+ # Check CUDA availability
22
+ if not torch.cuda.is_available():
23
+ print("ERROR: CUDA is not available on this system.")
24
+ print("This script requires a CUDA-capable GPU.")
25
+ sys.exit(1)
26
+
27
+ print(f"CUDA Version: {torch.version.cuda}")
28
+ print(f"PyTorch Version: {torch.__version__}")
29
+ print(f"GPU: {torch.cuda.get_device_name(0)}")
30
+
31
+ # Get compute capability
32
+ major, minor = torch.cuda.get_device_capability(0)
33
+ compute_capability = f"{major}.{minor}"
34
+ print(f"Compute Capability: {compute_capability}")
35
+ print()
36
+
37
+ # Create prebuilt directory
38
+ prebuilt_dir = Path(__file__).parent / "kernels" / "prebuilt"
39
+ prebuilt_dir.mkdir(exist_ok=True, parents=True)
40
+
41
+ print("Compiling CUDA kernels...")
42
+ print("-" * 60)
43
+
44
+ try:
45
+ from kernels.cuda_build import compile_inline, get_cuda_info
46
+
47
+ # Read CUDA source
48
+ kernel_path = Path(__file__).parent / "kernels" / "instance_norm.cu"
49
+ if not kernel_path.exists():
50
+ print(f"ERROR: Kernel source not found at {kernel_path}")
51
+ sys.exit(1)
52
+
53
+ cuda_source = kernel_path.read_text()
54
+
55
+ # Get CUDA build info
56
+ cuda_info = get_cuda_info()
57
+ extra_flags = cuda_info.get('extra_cuda_cflags', [])
58
+
59
+ # Add architecture-specific flags for Hugging Face GPUs
60
+ # Common GPUs on HF: T4 (7.5), A100 (8.0), V100 (7.0)
61
+ hf_arch_flags = [
62
+ '-gencode=arch=compute_70,code=sm_70', # V100
63
+ '-gencode=arch=compute_75,code=sm_75', # T4
64
+ '-gencode=arch=compute_80,code=sm_80', # A100
65
+ ]
66
+ extra_flags.extend(hf_arch_flags)
67
+
68
+ print("Build flags:", ' '.join(extra_flags))
69
+ print()
70
+
71
+ # Compile the kernel
72
+ module = compile_inline(
73
+ name='fused_instance_norm',
74
+ cuda_source=cuda_source,
75
+ functions=['forward'],
76
+ build_directory=Path(__file__).parent / "build",
77
+ extra_cuda_cflags=extra_flags,
78
+ verbose=True
79
+ )
80
+
81
+ print()
82
+ print("-" * 60)
83
+ print("Compilation successful!")
84
+ print()
85
+
86
+ # Find and copy the compiled library
87
+ build_dir = Path(__file__).parent / "build"
88
+ so_files = list(build_dir.rglob("fused_instance_norm*.so"))
89
+ pyd_files = list(build_dir.rglob("fused_instance_norm*.pyd"))
90
+
91
+ compiled_files = so_files + pyd_files
92
+
93
+ if not compiled_files:
94
+ print("WARNING: No compiled .so/.pyd file found in build/")
95
+ print("The module may be in torch's cache. Trying to locate...")
96
+
97
+ # Try to find in torch extension cache
98
+ import torch.utils.cpp_extension
99
+ cache_dir = Path(torch.utils.cpp_extension._get_build_directory('fused_instance_norm', False))
100
+ if cache_dir.exists():
101
+ compiled_files = list(cache_dir.rglob("*.so")) + list(cache_dir.rglob("*.pyd"))
102
+
103
+ if compiled_files:
104
+ # Copy each compiled file to prebuilt/
105
+ for src_file in compiled_files:
106
+ dst_file = prebuilt_dir / src_file.name
107
+ import shutil
108
+ shutil.copy2(src_file, dst_file)
109
+ print(f"Copied: {dst_file}")
110
+ print(f" Size: {dst_file.stat().st_size / 1024:.1f} KB")
111
+ else:
112
+ print("ERROR: Could not locate compiled kernel file.")
113
+ print("The kernel was compiled but the output file location is unknown.")
114
+ sys.exit(1)
115
+
116
+ print()
117
+ print("=" * 60)
118
+ print("Kernel compilation complete!")
119
+ print(f"Pre-compiled kernels saved to: {prebuilt_dir}")
120
+ print()
121
+ print("Next steps:")
122
+ print("1. Test the kernels locally: python -c 'from kernels import compile_kernels; print(compile_kernels())'")
123
+ print("2. Commit and push to deploy")
124
+ print("=" * 60)
125
+
126
+ except Exception as e:
127
+ print()
128
+ print("-" * 60)
129
+ print("ERROR: Compilation failed!")
130
+ print(f"Details: {e}")
131
+ import traceback
132
+ traceback.print_exc()
133
+ sys.exit(1)
compile_kernels_colab.ipynb ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Compile StyleForge CUDA Kernels\n",
8
+ "\n",
9
+ "Run this notebook in Google Colab to compile the CUDA kernels and download them for deployment to Hugging Face Spaces."
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": null,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "# Check if GPU is available\n",
19
+ "!nvidia-smi"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": null,
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "# Clone the repository\n",
29
+ "!git clone https://github.com/olivialiau/StyleForge\n",
30
+ "%cd StyleForge/huggingface-space"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": null,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "# Install dependencies\n",
40
+ "!pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121\n",
41
+ "!pip install ninja"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": null,
47
+ "metadata": {},
48
+ "outputs": [],
49
+ "source": [
50
+ "# Run the compilation script\n",
51
+ "!python compile_kernels.py"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": null,
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": [
60
+ "# List the compiled kernels\n",
61
+ "!ls -lh kernels/prebuilt/"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": null,
67
+ "metadata": {},
68
+ "outputs": [],
69
+ "source": [
70
+ "# Download the compiled kernels\n",
71
+ "from google.colab import files\n",
72
+ "import pathlib\n",
73
+ "\n",
74
+ "prebuilt_dir = pathlib.Path('kernels/prebuilt')\n",
75
+ "kernel_files = list(prebuilt_dir.glob('*.so')) + list(prebuilt_dir.glob('*.pyd'))\n",
76
+ "\n",
77
+ "if kernel_files:\n",
78
+ " for f in kernel_files:\n",
79
+ " print(f'Downloading: {f.name}')\n",
80
+ " files.download(str(f))\n",
81
+ "else:\n",
82
+ " print('No kernel files found!')\n",
83
+ " print('Check the compilation output above for errors.')"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "markdown",
88
+ "metadata": {},
89
+ "source": [
90
+ "## Next Steps\n",
91
+ "\n",
92
+ "1. Download the `.so` file(s)\n",
93
+ "2. Copy them to your local `kernels/prebuilt/` directory\n",
94
+ "3. Commit and push to GitHub:\n",
95
+ " ```\n",
96
+ " git add kernels/prebuilt/\n",
97
+ " git commit -m \"Add pre-compiled CUDA kernels\"\n",
98
+ " git push\n",
99
+ " ```\n",
100
+ "\n",
101
+ "The kernels will be automatically loaded on Hugging Face Spaces!"
102
+ ]
103
+ }
104
+ ],
105
+ "metadata": {
106
+ "kernelspec": {
107
+ "display_name": "Python 3",
108
+ "language": "python",
109
+ "name": "python3"
110
+ }
111
+ },
112
+ "nbformat": 4,
113
+ "nbformat_minor": 4
114
+ }
kernels/__init__.py CHANGED
@@ -2,11 +2,13 @@
2
  StyleForge CUDA Kernels Package
3
  Custom CUDA kernels for accelerated neural style transfer.
4
 
5
- For ZeroGPU: Kernels are compiled on-demand within GPU task context.
 
6
  """
7
 
8
  import torch
9
  import os
 
10
 
11
  # Try to import CUDA kernels, fall back gracefully
12
  _CUDA_KERNELS_AVAILABLE = False
@@ -16,6 +18,9 @@ _KERNELS_COMPILED = False
16
  # Check if running on ZeroGPU
17
  _ZERO_GPU = os.environ.get('SPACE_ID', '').startswith('hf.co') or os.environ.get('ZERO_GPU') == '1'
18
 
 
 
 
19
 
20
  def check_cuda_kernels():
21
  """Check if CUDA kernels are available."""
@@ -23,28 +28,114 @@ def check_cuda_kernels():
23
 
24
 
25
  def get_fused_instance_norm(num_features, **kwargs):
26
- """Get FusedInstanceNorm2d module or PyTorch fallback."""
 
 
 
 
 
27
  if _FusedInstanceNorm2d is not None:
28
  try:
29
  return _FusedInstanceNorm2d(num_features, **kwargs)
30
  except Exception:
31
  pass
32
- # Fallback to PyTorch
33
  return torch.nn.InstanceNorm2d(num_features, affine=kwargs.get('affine', True))
34
 
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  def compile_kernels():
37
  """
38
  Compile CUDA kernels on-demand.
39
 
40
- This function is called within a GPU task on ZeroGPU to ensure
41
- compilation happens within the task's timeout budget.
42
  """
43
  global _CUDA_KERNELS_AVAILABLE, _FusedInstanceNorm2d, _KERNELS_COMPILED
44
 
45
  if _KERNELS_COMPILED:
46
  return _CUDA_KERNELS_AVAILABLE
47
 
 
 
 
 
 
 
 
 
 
 
 
48
  if not torch.cuda.is_available():
49
  _KERNELS_COMPILED = True
50
  return False
@@ -63,9 +154,15 @@ def compile_kernels():
63
  return False
64
 
65
 
66
- # Auto-compile on import for non-ZeroGPU environments
67
  if torch.cuda.is_available() and not _ZERO_GPU:
68
  compile_kernels()
 
 
 
 
 
 
69
 
70
 
71
  __all__ = [
@@ -73,4 +170,5 @@ __all__ = [
73
  'get_fused_instance_norm',
74
  'FusedInstanceNorm2d',
75
  'compile_kernels',
 
76
  ]
 
2
  StyleForge CUDA Kernels Package
3
  Custom CUDA kernels for accelerated neural style transfer.
4
 
5
+ For ZeroGPU: Pre-compiled kernels are loaded from prebuilt/.
6
+ For local: Kernels are JIT-compiled if prebuilt not available.
7
  """
8
 
9
  import torch
10
  import os
11
+ from pathlib import Path
12
 
13
  # Try to import CUDA kernels, fall back gracefully
14
  _CUDA_KERNELS_AVAILABLE = False
 
18
  # Check if running on ZeroGPU
19
  _ZERO_GPU = os.environ.get('SPACE_ID', '').startswith('hf.co') or os.environ.get('ZERO_GPU') == '1'
20
 
21
+ # Path to pre-compiled kernels
22
+ _PREBUILT_PATH = Path(__file__).parent / "prebuilt"
23
+
24
 
25
  def check_cuda_kernels():
26
  """Check if CUDA kernels are available."""
 
28
 
29
 
30
  def get_fused_instance_norm(num_features, **kwargs):
31
+ """
32
+ Get FusedInstanceNorm2d module or PyTorch fallback.
33
+
34
+ On ZeroGPU: Uses pre-compiled kernels if available.
35
+ On local: May use custom fused kernels (prebuilt or JIT).
36
+ """
37
  if _FusedInstanceNorm2d is not None:
38
  try:
39
  return _FusedInstanceNorm2d(num_features, **kwargs)
40
  except Exception:
41
  pass
42
+ # Fallback to PyTorch (still GPU-accelerated, just not custom fused)
43
  return torch.nn.InstanceNorm2d(num_features, affine=kwargs.get('affine', True))
44
 
45
 
46
+ def load_prebuilt_kernels():
47
+ """
48
+ Try to load pre-compiled CUDA kernels from prebuilt/ directory.
49
+
50
+ Returns True if successful, False otherwise.
51
+ """
52
+ global _FusedInstanceNorm2d, _CUDA_KERNELS_AVAILABLE, _KERNELS_COMPILED
53
+
54
+ if _KERNELS_COMPILED:
55
+ return _CUDA_KERNELS_AVAILABLE
56
+
57
+ # Check if prebuilt kernels exist
58
+ prebuilt_files = list(_PREBUILT_PATH.glob("*.so")) + list(_PREBUILT_PATH.glob("*.pyd"))
59
+ if not prebuilt_files:
60
+ print("No pre-compiled kernels found in prebuilt/")
61
+ return False
62
+
63
+ try:
64
+ # Try to import from prebuilt directory
65
+ import sys
66
+ if str(_PREBUILT_PATH) not in sys.path:
67
+ sys.path.insert(0, str(_PREBUILT_PATH))
68
+
69
+ # Try to load the prebuilt module
70
+ for kernel_file in prebuilt_files:
71
+ try:
72
+ # Import the compiled module
73
+ module_name = kernel_file.stem
74
+ spec = __import__('importlib.util').util.spec_from_file_location(module_name, kernel_file)
75
+ if spec and spec.loader:
76
+ mod = __import__('importlib.util').util.module_from_spec(spec)
77
+ spec.loader.exec_module(mod)
78
+ print(f"Loaded pre-compiled kernel: {kernel_file.name}")
79
+
80
+ # Create FusedInstanceNorm2d class
81
+ class PrebuiltFusedInstanceNorm2d(torch.nn.Module):
82
+ def __init__(self, num_features, **kwargs):
83
+ super().__init__()
84
+ self.num_features = num_features
85
+ self.eps = kwargs.get('eps', 1e-5)
86
+ if kwargs.get('affine', True):
87
+ self.gamma = torch.nn.Parameter(torch.ones(num_features))
88
+ self.beta = torch.nn.Parameter(torch.zeros(num_features))
89
+ else:
90
+ self.register_buffer('gamma', torch.ones(num_features))
91
+ self.register_buffer('beta', torch.zeros(num_features))
92
+ self._pytorch_norm = torch.nn.InstanceNorm2d(num_features, **kwargs)
93
+
94
+ def forward(self, x):
95
+ try:
96
+ return mod.fused_instance_norm_forward(
97
+ x.contiguous(), self.gamma, self.beta, self.eps
98
+ )
99
+ except Exception:
100
+ return self._pytorch_norm(x)
101
+
102
+ _FusedInstanceNorm2d = PrebuiltFusedInstanceNorm2d
103
+ _CUDA_KERNELS_AVAILABLE = True
104
+ _KERNELS_COMPILED = True
105
+ return True
106
+ except Exception as e:
107
+ print(f"Failed to load {kernel_file.name}: {e}")
108
+ continue
109
+
110
+ except Exception as e:
111
+ print(f"Failed to load prebuilt kernels: {e}")
112
+
113
+ return False
114
+
115
+
116
  def compile_kernels():
117
  """
118
  Compile CUDA kernels on-demand.
119
 
120
+ On ZeroGPU: Tries pre-compiled kernels first.
121
+ On local: Compiles custom CUDA kernels.
122
  """
123
  global _CUDA_KERNELS_AVAILABLE, _FusedInstanceNorm2d, _KERNELS_COMPILED
124
 
125
  if _KERNELS_COMPILED:
126
  return _CUDA_KERNELS_AVAILABLE
127
 
128
+ # First, try pre-compiled kernels
129
+ if load_prebuilt_kernels():
130
+ print("Using pre-compiled CUDA kernels!")
131
+ return True
132
+
133
+ # Fall back to JIT compilation (only on local, not ZeroGPU)
134
+ if _ZERO_GPU:
135
+ print("ZeroGPU mode: No pre-compiled kernels found, using PyTorch fallback")
136
+ _KERNELS_COMPILED = True
137
+ return False
138
+
139
  if not torch.cuda.is_available():
140
  _KERNELS_COMPILED = True
141
  return False
 
154
  return False
155
 
156
 
157
+ # Auto-compile on import for non-ZeroGPU environments with CUDA
158
  if torch.cuda.is_available() and not _ZERO_GPU:
159
  compile_kernels()
160
+ elif _ZERO_GPU:
161
+ # On ZeroGPU, try prebuilt kernels
162
+ if load_prebuilt_kernels():
163
+ print("ZeroGPU: Using pre-compiled CUDA kernels!")
164
+ else:
165
+ print("ZeroGPU: No pre-compiled kernels, using PyTorch GPU fallback")
166
 
167
 
168
  __all__ = [
 
170
  'get_fused_instance_norm',
171
  'FusedInstanceNorm2d',
172
  'compile_kernels',
173
+ 'load_prebuilt_kernels',
174
  ]
kernels/prebuilt/README.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Pre-Compiled CUDA Kernels
2
+
3
+ This directory contains pre-compiled CUDA kernels for use on Hugging Face Spaces.
4
+
5
+ ## How to Compile Kernels Locally
6
+
7
+ To compile the CUDA kernels locally and upload them here:
8
+
9
+ ### 1. Compile Locally
10
+
11
+ Run this script from the `huggingface-space` directory:
12
+
13
+ ```bash
14
+ python compile_kernels.py
15
+ ```
16
+
17
+ Or compile manually:
18
+
19
+ ```bash
20
+ cd huggingface-space
21
+ python -c "
22
+ from kernels.cuda_build import compile_inline
23
+ from pathlib import Path
24
+
25
+ cuda_source = (Path('kernels') / 'instance_norm.cu').read_text()
26
+ module = compile_inline(
27
+ name='fused_instance_norm',
28
+ cuda_source=cuda_source,
29
+ functions=['forward'],
30
+ build_directory=Path('build'),
31
+ verbose=True
32
+ )
33
+ print('Compiled successfully!')
34
+ print(f'Module location: {module.__file__}')
35
+ "
36
+ ```
37
+
38
+ ### 2. Copy Compiled File
39
+
40
+ After compilation, copy the compiled `.so` file to this directory:
41
+
42
+ ```bash
43
+ # Find the compiled file (usually in build/)
44
+ find build/ -name "*.so" -exec cp {} kernels/prebuilt/ \;
45
+ ```
46
+
47
+ ### 3. Commit and Push
48
+
49
+ ```bash
50
+ git add kernels/prebuilt/
51
+ git commit -m "Add pre-compiled CUDA kernels"
52
+ git push
53
+ ```
54
+
55
+ ## Notes
56
+
57
+ - The compiled kernels are architecture-specific (e.g., `sm_70`, `sm_75`, `sm_80`)
58
+ - Hugging Face Spaces typically use Tesla T4 (sm_75) or A100 (sm_80)
59
+ - For maximum compatibility, compile with multiple compute capabilities
60
+
61
+ ## Current Status
62
+
63
+ No pre-compiled kernels found. The app will use PyTorch's InstanceNorm2d fallback,
64
+ which is still GPU-accelerated but not as fast as custom fused kernels.