TSXu committed on
Commit
a1f5b88
·
1 Parent(s): afa3a21

Refactor to use Float8 + torch.compile from FLUX-Kontext-fp8

Browse files

- Apply Float8DynamicActivationFloat8WeightConfig via torchao
- Use torch.compile with max-autotune mode and inductor configs:
- conv_1x1_as_mm, coordinate_descent_tuning, max_autotune, triton.cudagraphs
- Optimizations applied in app.py init_generator() for ZeroGPU
- CLI mode in inference.py also supports --float8 and --compile flags

Files changed (3) hide show
  1. app.py +84 -44
  2. inference.py +30 -24
  3. optimization.py +162 -203
app.py CHANGED
@@ -1,7 +1,7 @@
1
  # -*- coding: utf-8 -*-
2
  """
3
  Gradio Demo for Chinese Calligraphy Generation - HuggingFace Space Version
4
- With interactive session mode to avoid model reloading
5
  """
6
 
7
  # IMPORTANT: import spaces first before any CUDA-related packages
@@ -11,6 +11,8 @@ import gradio as gr
11
  import json
12
  import csv
13
  import time
 
 
14
 
15
  # Load author and font mappings from CSV
16
  def load_author_fonts_from_csv(csv_path):
@@ -57,20 +59,22 @@ try:
57
  except:
58
  author_styles = {}
59
 
60
- # Initialize generator (will be done lazily on first generation)
61
  generator = None
 
 
62
 
63
- # Pre-download model files at startup (before user clicks)
 
 
64
  def preload_model_files():
65
  """Pre-download model files to cache at startup (no GPU needed)"""
66
- import os
67
  from huggingface_hub import snapshot_download
68
 
69
  hf_token = os.environ.get("HF_TOKEN", None)
70
  print("Pre-downloading model files to cache...")
71
 
72
  try:
73
- # Only download safetensors, embedding, and font files (not the .bin files)
74
  local_dir = snapshot_download(
75
  repo_id="TSXu/Unicalli_Pro",
76
  allow_patterns=[
@@ -79,9 +83,7 @@ def preload_model_files():
79
  "internvl_embedding/*",
80
  "*.ttf",
81
  ],
82
- ignore_patterns=[
83
- "*.bin", # Skip large .bin files
84
- ],
85
  token=hf_token
86
  )
87
  print(f"✓ Model files cached at: {local_dir}")
@@ -90,67 +92,105 @@ def preload_model_files():
90
  print(f"Warning: Could not pre-download model files: {e}")
91
  return None
92
 
93
- # Pre-download at startup
94
  print("="*50)
95
  print("Starting model pre-download...")
96
  _cached_model_dir = preload_model_files()
97
  print("="*50)
98
 
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  def init_generator():
101
- """Initialize the generator (lazy loading)"""
102
- global generator, _cached_model_dir
103
 
104
  if generator is None:
105
- # Enable optimized SDPA attention backends for faster inference
106
- import torch
107
- import os
108
-
109
  try:
 
 
 
110
  torch.backends.cuda.enable_flash_sdp(True)
111
  torch.backends.cuda.enable_mem_efficient_sdp(True)
112
- torch.backends.cuda.enable_math_sdp(False) # Disable slow math backend
113
- print("✓ Enabled Flash Attention / Memory-Efficient SDPA backends")
114
  except Exception as e:
115
- print(f"Note: Could not configure SDPA backends: {e}")
116
-
117
- # Use pre-cached model directory
118
- if _cached_model_dir:
119
- intern_vlm_path = os.path.join(_cached_model_dir, "internvl_embedding")
120
- checkpoint_path = _cached_model_dir
121
- print(f"Using pre-cached model from: {_cached_model_dir}")
122
- else:
123
- # Fallback to HF Hub download if pre-download failed
124
- from huggingface_hub import snapshot_download
125
- hf_token = os.environ.get("HF_TOKEN", None)
126
- print("Downloading model from TSXu/Unicalli_Pro...")
127
- _cached_model_dir = snapshot_download(
128
- repo_id="TSXu/Unicalli_Pro",
129
- token=hf_token
130
- )
131
- intern_vlm_path = os.path.join(_cached_model_dir, "internvl_embedding")
132
- checkpoint_path = _cached_model_dir
133
 
 
 
 
134
  print(f"Using lightweight embedding from: {intern_vlm_path}")
135
 
136
- # Lazy import to avoid CUDA initialization at module load time
137
  from inference import CalligraphyGenerator
138
 
139
  generator = CalligraphyGenerator(
140
  model_name="flux-dev",
141
  device="cuda",
142
- offload=True, # Enable offload to save GPU memory
143
- intern_vlm_path=intern_vlm_path, # Lightweight embedding (~500MB vs ~2GB)
144
- checkpoint_path=checkpoint_path, # Use pre-cached model
145
  font_descriptions_path='dataset/chirography.json',
146
  author_descriptions_path='dataset/calligraphy_styles_en.json',
147
  use_deepspeed=False,
148
- use_4bit_quantization=False, # Full precision model
149
- use_float8_quantization=True, # Enable Float8 quantization for faster inference
150
- use_torch_compile=True, # Enable torch.compile with inductor optimizations
151
- compile_mode="reduce-overhead", # Best for inference speed
152
- dtype="fp32", # Use fp32 to avoid CUBLAS errors on ZeroGPU
153
  )
 
 
 
 
 
 
 
 
154
  return generator
155
 
156
 
 
1
  # -*- coding: utf-8 -*-
2
  """
3
  Gradio Demo for Chinese Calligraphy Generation - HuggingFace Space Version
4
+ With Float8 quantization and AOT compilation for faster inference
5
  """
6
 
7
  # IMPORTANT: import spaces first before any CUDA-related packages
 
11
  import json
12
  import csv
13
  import time
14
+ import torch
15
+ import os
16
 
17
  # Load author and font mappings from CSV
18
  def load_author_fonts_from_csv(csv_path):
 
59
  except:
60
  author_styles = {}
61
 
62
+ # Global generator instance
63
  generator = None
64
+ _cached_model_dir = None
65
+ _is_optimized = False
66
 
67
+ # ============================================================
68
+ # Pre-download model files at startup (no GPU needed)
69
+ # ============================================================
70
  def preload_model_files():
71
  """Pre-download model files to cache at startup (no GPU needed)"""
 
72
  from huggingface_hub import snapshot_download
73
 
74
  hf_token = os.environ.get("HF_TOKEN", None)
75
  print("Pre-downloading model files to cache...")
76
 
77
  try:
 
78
  local_dir = snapshot_download(
79
  repo_id="TSXu/Unicalli_Pro",
80
  allow_patterns=[
 
83
  "internvl_embedding/*",
84
  "*.ttf",
85
  ],
86
+ ignore_patterns=["*.bin"],
 
 
87
  token=hf_token
88
  )
89
  print(f"✓ Model files cached at: {local_dir}")
 
92
  print(f"Warning: Could not pre-download model files: {e}")
93
  return None
94
 
 
95
  print("="*50)
96
  print("Starting model pre-download...")
97
  _cached_model_dir = preload_model_files()
98
  print("="*50)
99
 
100
 
101
+ # ============================================================
102
+ # AOT Optimization Configuration (from FLUX-Kontext-fp8)
103
+ # ============================================================
104
+ from torch.utils._pytree import tree_map_only
105
+ from torchao.quantization import quantize_, Float8DynamicActivationFloat8WeightConfig
106
+
107
+ # Inductor configuration for optimal performance
108
+ INDUCTOR_CONFIGS = {
109
+ 'conv_1x1_as_mm': True,
110
+ 'epilogue_fusion': False,
111
+ 'coordinate_descent_tuning': True,
112
+ 'coordinate_descent_check_all_directions': True,
113
+ 'max_autotune': True,
114
+ 'triton.cudagraphs': True,
115
+ }
116
+
117
+
118
+ def apply_optimizations(model):
119
+ """
120
+ Apply Float8 quantization and torch.compile with inductor optimizations.
121
+ Based on FLUX-Kontext-fp8 optimization techniques.
122
+ """
123
+ import torch._inductor.config as inductor_config
124
+
125
+ # Apply inductor configurations
126
+ for key, value in INDUCTOR_CONFIGS.items():
127
+ if hasattr(inductor_config, key):
128
+ setattr(inductor_config, key, value)
129
+
130
+ print("="*50)
131
+ print("Applying Float8 quantization...")
132
+ quantize_(model, Float8DynamicActivationFloat8WeightConfig())
133
+ print("✓ Float8 quantization complete!")
134
+
135
+ print("Applying torch.compile with inductor optimizations...")
136
+ compiled_model = torch.compile(
137
+ model,
138
+ mode="max-autotune",
139
+ backend="inductor",
140
+ dynamic=True,
141
+ )
142
+ print("✓ torch.compile applied!")
143
+ print("="*50)
144
+
145
+ return compiled_model
146
+
147
+
148
  def init_generator():
149
+ """Initialize the generator with Float8 + torch.compile optimization"""
150
+ global generator, _cached_model_dir, _is_optimized
151
 
152
  if generator is None:
153
+ # Enable CUDA optimizations
 
 
 
154
  try:
155
+ torch.backends.cuda.matmul.allow_tf32 = True
156
+ torch.backends.cudnn.allow_tf32 = True
157
+ torch.backends.cudnn.benchmark = True
158
  torch.backends.cuda.enable_flash_sdp(True)
159
  torch.backends.cuda.enable_mem_efficient_sdp(True)
160
+ torch.backends.cuda.enable_math_sdp(False)
161
+ print("✓ CUDA optimizations enabled (TF32, cuDNN benchmark, Flash SDPA)")
162
  except Exception as e:
163
+ print(f"Note: Some CUDA optimizations failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
+ intern_vlm_path = os.path.join(_cached_model_dir, "internvl_embedding")
166
+ checkpoint_path = _cached_model_dir
167
+ print(f"Using pre-cached model from: {_cached_model_dir}")
168
  print(f"Using lightweight embedding from: {intern_vlm_path}")
169
 
 
170
  from inference import CalligraphyGenerator
171
 
172
  generator = CalligraphyGenerator(
173
  model_name="flux-dev",
174
  device="cuda",
175
+ offload=True,
176
+ intern_vlm_path=intern_vlm_path,
177
+ checkpoint_path=checkpoint_path,
178
  font_descriptions_path='dataset/chirography.json',
179
  author_descriptions_path='dataset/calligraphy_styles_en.json',
180
  use_deepspeed=False,
181
+ use_4bit_quantization=False,
182
+ use_float8_quantization=False, # Apply manually below
183
+ use_torch_compile=False, # Apply manually below
184
+ dtype="fp32",
 
185
  )
186
+
187
+ # Apply Float8 quantization + torch.compile
188
+ if not _is_optimized:
189
+ print("Applying optimizations to transformer...")
190
+ generator.model = apply_optimizations(generator.model)
191
+ _is_optimized = True
192
+ print("✓ Transformer optimized with Float8 + torch.compile!")
193
+
194
  return generator
195
 
196
 
inference.py CHANGED
@@ -18,7 +18,6 @@ from huggingface_hub import hf_hub_download, snapshot_download
18
  from src.flux.util import configs, load_ae, load_clip, load_t5
19
  from src.flux.model import Flux
20
  from src.flux.xflux_pipeline import XFluxSampler
21
- from optimization import optimize_model, enable_cuda_optimizations, check_optimization_support
22
 
23
 
24
  # HuggingFace Hub model IDs
@@ -188,9 +187,6 @@ class CalligraphyGenerator:
188
  self.use_torch_compile = use_torch_compile
189
  self.compile_mode = compile_mode
190
  self.forced_dtype = dtype # "fp16", "bf16", "fp32", or None for auto
191
-
192
- # Enable CUDA optimizations early
193
- enable_cuda_optimizations()
194
 
195
  # Load font and author style descriptions
196
  if os.path.exists(font_descriptions_path):
@@ -247,16 +243,9 @@ class CalligraphyGenerator:
247
  if self.use_deepspeed:
248
  self.model = self._init_deepspeed(self.model)
249
 
250
- # Apply Float8 quantization and torch.compile optimizations
251
- if not self.use_deepspeed and not self.use_4bit_quantization:
252
- if self.use_float8_quantization or self.use_torch_compile:
253
- self.model = optimize_model(
254
- self.model,
255
- device=str(self.device),
256
- use_float8=self.use_float8_quantization,
257
- use_compile=self.use_torch_compile,
258
- compile_mode=self.compile_mode
259
- )
260
 
261
  # Load VAE
262
  if self.use_deepspeed or offload:
@@ -1115,27 +1104,44 @@ if __name__ == "__main__":
1115
  parser.add_argument("--list-fonts", action="store_true", help="List available font styles")
1116
  parser.add_argument("--float8", action="store_true", help="Use Float8 quantization (torchao) for faster inference")
1117
  parser.add_argument("--compile", action="store_true", help="Use torch.compile for optimized inference")
1118
- parser.add_argument("--compile-mode", type=str, default="reduce-overhead",
1119
  choices=["reduce-overhead", "max-autotune", "default"],
1120
  help="torch.compile mode")
1121
- parser.add_argument("--check-optimizations", action="store_true", help="Check available optimization support")
1122
 
1123
  args = parser.parse_args()
1124
-
1125
- # Check optimization support if requested
1126
- if args.check_optimizations:
1127
- check_optimization_support()
1128
- exit(0)
1129
 
1130
  # Initialize generator
1131
  generator = CalligraphyGenerator(
1132
  model_name="flux-dev",
1133
  device=args.device,
1134
  checkpoint_path=args.checkpoint,
1135
- use_float8_quantization=args.float8,
1136
- use_torch_compile=args.compile,
1137
- compile_mode=args.compile_mode
1138
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1139
 
1140
  # List available options
1141
  if args.list_authors:
 
18
  from src.flux.util import configs, load_ae, load_clip, load_t5
19
  from src.flux.model import Flux
20
  from src.flux.xflux_pipeline import XFluxSampler
 
21
 
22
 
23
  # HuggingFace Hub model IDs
 
187
  self.use_torch_compile = use_torch_compile
188
  self.compile_mode = compile_mode
189
  self.forced_dtype = dtype # "fp16", "bf16", "fp32", or None for auto
 
 
 
190
 
191
  # Load font and author style descriptions
192
  if os.path.exists(font_descriptions_path):
 
243
  if self.use_deepspeed:
244
  self.model = self._init_deepspeed(self.model)
245
 
246
+ # Note: Float8 quantization and torch.compile optimizations
247
+ # are applied externally (e.g., in app.py) for better control
248
+ # over the optimization process with ZeroGPU AOT compilation.
 
 
 
 
 
 
 
249
 
250
  # Load VAE
251
  if self.use_deepspeed or offload:
 
1104
  parser.add_argument("--list-fonts", action="store_true", help="List available font styles")
1105
  parser.add_argument("--float8", action="store_true", help="Use Float8 quantization (torchao) for faster inference")
1106
  parser.add_argument("--compile", action="store_true", help="Use torch.compile for optimized inference")
1107
+ parser.add_argument("--compile-mode", type=str, default="max-autotune",
1108
  choices=["reduce-overhead", "max-autotune", "default"],
1109
  help="torch.compile mode")
 
1110
 
1111
  args = parser.parse_args()
 
 
 
 
 
1112
 
1113
  # Initialize generator
1114
  generator = CalligraphyGenerator(
1115
  model_name="flux-dev",
1116
  device=args.device,
1117
  checkpoint_path=args.checkpoint,
 
 
 
1118
  )
1119
+
1120
+ # Apply optimizations if requested (CLI mode)
1121
+ if args.float8 or args.compile:
1122
+ from torchao.quantization import quantize_, Float8DynamicActivationFloat8WeightConfig
1123
+ import torch._inductor.config as inductor_config
1124
+
1125
+ # Inductor configs from FLUX-Kontext-fp8
1126
+ inductor_config.conv_1x1_as_mm = True
1127
+ inductor_config.coordinate_descent_tuning = True
1128
+ inductor_config.coordinate_descent_check_all_directions = True
1129
+ inductor_config.max_autotune = True
1130
+
1131
+ if args.float8:
1132
+ print("Applying Float8 quantization...")
1133
+ quantize_(generator.model, Float8DynamicActivationFloat8WeightConfig())
1134
+ print("✓ Float8 quantization complete!")
1135
+
1136
+ if args.compile:
1137
+ print(f"Applying torch.compile (mode={args.compile_mode})...")
1138
+ generator.model = torch.compile(
1139
+ generator.model,
1140
+ mode=args.compile_mode,
1141
+ backend="inductor",
1142
+ dynamic=True,
1143
+ )
1144
+ print("✓ torch.compile applied!")
1145
 
1146
  # List available options
1147
  if args.list_authors:
optimization.py CHANGED
@@ -1,16 +1,34 @@
1
  """
2
- Model optimization utilities for faster inference using:
3
- - Float8 quantization via torchao
4
- - torch.compile with inductor optimizations
5
- - CUDA graph capture for reduced kernel launch overhead
6
 
7
- Inspired by FLUX-Kontext-fp8 optimization techniques.
 
 
 
8
  """
9
 
10
- from typing import Optional, Callable, Any
 
 
 
11
  import torch
12
  import torch.nn as nn
 
 
 
 
 
 
13
 
 
 
 
 
 
 
 
 
14
 
15
  # Inductor configuration for optimal performance
16
  INDUCTOR_CONFIGS = {
@@ -19,94 +37,114 @@ INDUCTOR_CONFIGS = {
19
  'coordinate_descent_tuning': True,
20
  'coordinate_descent_check_all_directions': True,
21
  'max_autotune': True,
 
22
  }
23
 
24
 
25
- def apply_float8_quantization(model: nn.Module, device: str = "cuda") -> nn.Module:
 
 
 
 
26
  """
27
- Apply Float8 dynamic activation and weight quantization using torchao.
28
 
29
- This provides significant speedup on GPUs with native FP8 support (H100, etc.)
30
- and reasonable speedup on other GPUs through reduced memory bandwidth.
 
 
 
 
31
 
32
  Args:
33
- model: PyTorch model to quantize
34
- device: Target device for the model
 
35
 
36
  Returns:
37
- Quantized model
38
  """
39
- try:
40
- from torchao.quantization import quantize_, Float8DynamicActivationFloat8WeightConfig
41
-
42
- print("Applying Float8 dynamic activation + Float8 weight quantization...")
 
 
43
 
44
- # Move model to device first if not already there
45
- if next(model.parameters()).device.type != device:
46
- model = model.to(device)
47
 
48
- # Apply float8 quantization
 
49
  quantize_(model, Float8DynamicActivationFloat8WeightConfig())
 
50
 
51
- print("Float8 quantization applied successfully!")
52
- return model
 
 
 
 
 
 
 
53
 
54
- except ImportError as e:
55
- print(f"torchao not available for Float8 quantization: {e}")
56
- print("Install with: pip install torchao")
57
- return model
58
- except Exception as e:
59
- print(f"Float8 quantization failed: {e}")
60
- print("Falling back to unquantized model")
61
- return model
 
 
 
 
 
 
62
 
63
 
64
- def apply_torch_compile(
65
- model: nn.Module,
66
- mode: str = "reduce-overhead",
67
- fullgraph: bool = False,
68
- dynamic: bool = True,
69
- backend: str = "inductor"
70
- ) -> nn.Module:
71
  """
72
- Apply torch.compile with optimized settings for inference.
 
73
 
74
  Args:
75
- model: PyTorch model to compile
76
- mode: Compilation mode - "reduce-overhead" (best for inference),
77
- "max-autotune" (slower compile, faster runtime), or "default"
78
- fullgraph: If True, requires entire forward to be capturable (faster but stricter)
79
- dynamic: If True, allows dynamic shapes (recommended for variable input sizes)
80
- backend: Compiler backend - "inductor" is recommended
81
-
82
- Returns:
83
- Compiled model
84
  """
85
- try:
86
- import torch._inductor.config as inductor_config
 
 
 
87
 
88
- # Apply inductor configurations
89
- for key, value in INDUCTOR_CONFIGS.items():
90
- if hasattr(inductor_config, key):
91
- setattr(inductor_config, key, value)
92
 
93
- print(f"Applying torch.compile with mode='{mode}', backend='{backend}'...")
 
 
94
 
95
- compiled_model = torch.compile(
96
- model,
97
- mode=mode,
98
- fullgraph=fullgraph,
99
- dynamic=dynamic,
100
- backend=backend
101
- )
102
 
103
- print("torch.compile applied successfully!")
104
- return compiled_model
 
 
 
 
105
 
106
- except Exception as e:
107
- print(f"torch.compile failed: {e}")
108
- print("Falling back to uncompiled model")
109
- return model
110
 
111
 
112
  def enable_cuda_optimizations():
@@ -136,6 +174,48 @@ def enable_cuda_optimizations():
136
  print(f"Some CUDA optimizations failed: {e}")
137
 
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  def optimize_model(
140
  model: nn.Module,
141
  device: str = "cuda",
@@ -144,42 +224,24 @@ def optimize_model(
144
  compile_mode: str = "reduce-overhead"
145
  ) -> nn.Module:
146
  """
147
- Apply all optimizations to the model for maximum inference speed.
148
-
149
- Optimizations applied:
150
- 1. CUDA backend optimizations (TF32, Flash SDPA, etc.)
151
- 2. Float8 quantization via torchao (if available)
152
- 3. torch.compile with inductor optimizations
153
-
154
- Args:
155
- model: PyTorch model to optimize
156
- device: Target device
157
- use_float8: Whether to apply Float8 quantization
158
- use_compile: Whether to apply torch.compile
159
- compile_mode: Mode for torch.compile
160
-
161
- Returns:
162
- Optimized model
163
  """
164
  print("=" * 50)
165
  print("Applying model optimizations...")
166
  print("=" * 50)
167
 
168
- # 1. Enable CUDA optimizations
169
  enable_cuda_optimizations()
170
 
171
- # 2. Move model to device
172
  if next(model.parameters()).device.type != device:
173
  print(f"Moving model to {device}...")
174
  model = model.to(device)
175
 
176
- # 3. Apply Float8 quantization
177
  if use_float8:
178
- model = apply_float8_quantization(model, device)
179
 
180
- # 4. Apply torch.compile
181
  if use_compile:
182
- model = apply_torch_compile(model, mode=compile_mode)
183
 
184
  print("=" * 50)
185
  print("Model optimization complete!")
@@ -188,132 +250,29 @@ def optimize_model(
188
  return model
189
 
190
 
191
- def warmup_model(
192
- model: nn.Module,
193
- warmup_fn: Callable[[], Any],
194
- num_warmup: int = 3
195
- ):
196
- """
197
- Warmup the compiled model to trigger JIT compilation.
198
-
199
- Args:
200
- model: The model (should already be compiled)
201
- warmup_fn: Function that runs a forward pass
202
- num_warmup: Number of warmup iterations
203
- """
204
- print(f"Warming up model with {num_warmup} iterations...")
205
-
206
- with torch.no_grad():
207
- for i in range(num_warmup):
208
- try:
209
- warmup_fn()
210
- print(f" Warmup {i+1}/{num_warmup} complete")
211
- except Exception as e:
212
- print(f" Warmup {i+1}/{num_warmup} failed: {e}")
213
-
214
- # Synchronize CUDA
215
- if torch.cuda.is_available():
216
- torch.cuda.synchronize()
217
-
218
- print("Model warmup complete!")
219
-
220
-
221
- class CUDAGraphWrapper(nn.Module):
222
- """
223
- Wrapper that captures and replays CUDA graphs for reduced kernel launch overhead.
224
-
225
- Note: CUDA graphs require static input shapes. Use this only if your input
226
- dimensions are fixed.
227
- """
228
-
229
- def __init__(self, model: nn.Module, warmup_fn: Callable[[], tuple]):
230
- super().__init__()
231
- self.model = model
232
- self.graph = None
233
- self.static_inputs = None
234
- self.static_outputs = None
235
- self._captured = False
236
-
237
- def capture(self, *sample_inputs):
238
- """
239
- Capture the CUDA graph with sample inputs.
240
-
241
- Args:
242
- *sample_inputs: Sample inputs with the exact shapes that will be used
243
- """
244
- if not torch.cuda.is_available():
245
- print("CUDA not available, skipping graph capture")
246
- return
247
-
248
- print("Capturing CUDA graph...")
249
-
250
- # Warmup
251
- with torch.no_grad():
252
- for _ in range(3):
253
- _ = self.model(*sample_inputs)
254
-
255
- torch.cuda.synchronize()
256
-
257
- # Capture
258
- self.graph = torch.cuda.CUDAGraph()
259
-
260
- # Create static tensors
261
- self.static_inputs = tuple(inp.clone() for inp in sample_inputs)
262
-
263
- with torch.cuda.graph(self.graph):
264
- self.static_outputs = self.model(*self.static_inputs)
265
-
266
- self._captured = True
267
- print("CUDA graph captured successfully!")
268
-
269
- def forward(self, *inputs):
270
- if not self._captured:
271
- return self.model(*inputs)
272
-
273
- # Copy inputs to static buffers
274
- for static_inp, inp in zip(self.static_inputs, inputs):
275
- static_inp.copy_(inp)
276
-
277
- # Replay graph
278
- self.graph.replay()
279
-
280
- return self.static_outputs
281
-
282
-
283
- # Utility function to check available optimizations
284
  def check_optimization_support():
285
- """
286
- Check which optimizations are available on the current system.
287
- """
288
  print("Checking optimization support...")
289
  print("-" * 40)
290
 
291
- # CUDA
292
  print(f"CUDA available: {torch.cuda.is_available()}")
293
  if torch.cuda.is_available():
294
  print(f" Device: {torch.cuda.get_device_name()}")
295
- print(f" Capability: {torch.cuda.get_device_capability()}")
296
 
297
- # torch.compile
298
- try:
299
- import torch._dynamo
300
- print(f"torch.compile available: True")
301
- except ImportError:
302
- print(f"torch.compile available: False")
303
-
304
- # torchao Float8
305
  try:
306
  from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
307
- print(f"torchao Float8 available: True")
308
  except ImportError:
309
- print(f"torchao Float8 available: False")
310
 
311
- # Flash Attention
312
  try:
313
- flash_available = torch.backends.cuda.flash_sdp_enabled() if torch.cuda.is_available() else False
314
- print(f"Flash SDPA available: {flash_available}")
315
- except:
316
- print(f"Flash SDPA available: Unknown")
 
 
 
317
 
318
  print("-" * 40)
319
 
 
1
  """
2
+ Model optimization utilities using AOT compilation and Float8 quantization.
3
+ Based on FLUX-Kontext-fp8 optimization techniques.
 
 
4
 
5
+ Key optimizations:
6
+ 1. Float8 dynamic activation + Float8 weight quantization via torchao
7
+ 2. AOT (Ahead-of-Time) compilation via torch.export + spaces.aoti_compile
8
+ 3. CUDA graph capture for reduced kernel launch overhead
9
  """
10
 
11
+ from typing import Any, Callable, Optional
12
+ from typing import ParamSpec
13
+
14
+ import spaces
15
  import torch
16
  import torch.nn as nn
17
+ from torch.utils._pytree import tree_map_only
18
+ from torchao.quantization import quantize_, Float8DynamicActivationFloat8WeightConfig
19
+
20
+
21
+ P = ParamSpec('P')
22
+
23
 
24
+ # Dynamic shape specifications for the Flux transformer
25
+ # These allow variable sequence lengths during inference
26
+ TRANSFORMER_HIDDEN_DIM = torch.export.Dim('hidden', min=512, max=16384)
27
+
28
+ TRANSFORMER_DYNAMIC_SHAPES = {
29
+ 'img': {1: TRANSFORMER_HIDDEN_DIM},
30
+ 'img_ids': {1: TRANSFORMER_HIDDEN_DIM},
31
+ }
32
 
33
  # Inductor configuration for optimal performance
34
  INDUCTOR_CONFIGS = {
 
37
  'coordinate_descent_tuning': True,
38
  'coordinate_descent_check_all_directions': True,
39
  'max_autotune': True,
40
+ 'triton.cudagraphs': True,
41
  }
42
 
43
 
44
+ def optimize_flux_model_(
45
+ model: nn.Module,
46
+ sample_forward_fn: Callable[[], Any],
47
+ device: str = "cuda"
48
+ ):
49
  """
50
+ Optimize the Flux model using Float8 quantization and AOT compilation.
51
 
52
+ This function:
53
+ 1. Captures a sample forward pass to determine input shapes
54
+ 2. Applies Float8 quantization to the model
55
+ 3. Exports the model with dynamic shapes
56
+ 4. Compiles using AOT inductor with optimized configs
57
+ 5. Applies the compiled model back
58
 
59
  Args:
60
+ model: The Flux transformer model to optimize
61
+ sample_forward_fn: A function that runs a sample forward pass
62
+ device: Target device
63
 
64
  Returns:
65
+ Optimized model
66
  """
67
+
68
+ @spaces.GPU(duration=1500)
69
+ def compile_transformer():
70
+ # Step 1: Capture the forward pass to get input shapes
71
+ with spaces.aoti_capture(model) as call:
72
+ sample_forward_fn()
73
 
74
+ # Step 2: Build dynamic shapes from captured call
75
+ dynamic_shapes = tree_map_only((torch.Tensor, bool), lambda t: None, call.kwargs)
76
+ dynamic_shapes |= TRANSFORMER_DYNAMIC_SHAPES
77
 
78
+ # Step 3: Apply Float8 quantization
79
+ print("Applying Float8 dynamic activation + Float8 weight quantization...")
80
  quantize_(model, Float8DynamicActivationFloat8WeightConfig())
81
+ print("Float8 quantization complete!")
82
 
83
+ # Step 4: Export the model with dynamic shapes
84
+ print("Exporting model with torch.export...")
85
+ exported = torch.export.export(
86
+ mod=model,
87
+ args=call.args,
88
+ kwargs=call.kwargs,
89
+ dynamic_shapes=dynamic_shapes,
90
+ )
91
+ print("Model exported successfully!")
92
 
93
+ # Step 5: AOT compile with inductor configs
94
+ print("AOT compiling with inductor optimizations...")
95
+ return spaces.aoti_compile(exported, INDUCTOR_CONFIGS)
96
+
97
+ # Apply the compiled model
98
+ print("=" * 50)
99
+ print("Starting AOT optimization pipeline...")
100
+ print("=" * 50)
101
+
102
+ spaces.aoti_apply(compile_transformer(), model)
103
+
104
+ print("=" * 50)
105
+ print("AOT optimization complete!")
106
+ print("=" * 50)
107
 
108
 
109
+ def optimize_pipeline_(
110
+ pipeline: Any,
111
+ sample_image: Any,
112
+ sample_prompt: str = "sample prompt"
113
+ ):
 
 
114
  """
115
+ Optimize a diffusers-style pipeline.
116
+ Compatible with FluxPipeline or similar pipelines.
117
 
118
  Args:
119
+ pipeline: The pipeline with a .transformer attribute
120
+ sample_image: Sample image for capturing input shapes
121
+ sample_prompt: Sample prompt text
 
 
 
 
 
 
122
  """
123
+
124
+ @spaces.GPU(duration=1500)
125
+ def compile_transformer():
126
+ with spaces.aoti_capture(pipeline.transformer) as call:
127
+ pipeline(image=sample_image, prompt=sample_prompt)
128
 
129
+ dynamic_shapes = tree_map_only((torch.Tensor, bool), lambda t: None, call.kwargs)
130
+ dynamic_shapes |= TRANSFORMER_DYNAMIC_SHAPES
 
 
131
 
132
+ # Fuse QKV projections if available (diffusers pipelines)
133
+ if hasattr(pipeline.transformer, 'fuse_qkv_projections'):
134
+ pipeline.transformer.fuse_qkv_projections()
135
 
136
+ quantize_(pipeline.transformer, Float8DynamicActivationFloat8WeightConfig())
 
 
 
 
 
 
137
 
138
+ exported = torch.export.export(
139
+ mod=pipeline.transformer,
140
+ args=call.args,
141
+ kwargs=call.kwargs,
142
+ dynamic_shapes=dynamic_shapes,
143
+ )
144
 
145
+ return spaces.aoti_compile(exported, INDUCTOR_CONFIGS)
146
+
147
+ spaces.aoti_apply(compile_transformer(), pipeline.transformer)
 
148
 
149
 
150
  def enable_cuda_optimizations():
 
174
  print(f"Some CUDA optimizations failed: {e}")
175
 
176
 
177
+ # Simpler optimization for non-ZeroGPU environments
178
+ def apply_float8_quantization_simple(model: nn.Module) -> nn.Module:
179
+ """
180
+ Apply Float8 quantization without AOT compilation.
181
+ Use this for non-ZeroGPU environments or when AOT is not available.
182
+ """
183
+ try:
184
+ print("Applying Float8 quantization...")
185
+ quantize_(model, Float8DynamicActivationFloat8WeightConfig())
186
+ print("Float8 quantization applied!")
187
+ return model
188
+ except Exception as e:
189
+ print(f"Float8 quantization failed: {e}")
190
+ return model
191
+
192
+
193
+ def apply_torch_compile_simple(
194
+ model: nn.Module,
195
+ mode: str = "reduce-overhead",
196
+ backend: str = "inductor"
197
+ ) -> nn.Module:
198
+ """
199
+ Apply torch.compile without AOT (JIT compilation).
200
+ Use this as a fallback when AOT compilation is not available.
201
+ """
202
+ try:
203
+ import torch._inductor.config as inductor_config
204
+
205
+ # Apply inductor configurations
206
+ for key, value in INDUCTOR_CONFIGS.items():
207
+ if hasattr(inductor_config, key):
208
+ setattr(inductor_config, key, value)
209
+
210
+ print(f"Applying torch.compile with mode='{mode}'...")
211
+ compiled_model = torch.compile(model, mode=mode, backend=backend)
212
+ print("torch.compile applied!")
213
+ return compiled_model
214
+ except Exception as e:
215
+ print(f"torch.compile failed: {e}")
216
+ return model
217
+
218
+
219
  def optimize_model(
220
  model: nn.Module,
221
  device: str = "cuda",
 
224
  compile_mode: str = "reduce-overhead"
225
  ) -> nn.Module:
226
  """
227
+ Simple optimization wrapper for non-ZeroGPU environments.
228
+ For ZeroGPU, use optimize_flux_model_ instead.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  """
230
  print("=" * 50)
231
  print("Applying model optimizations...")
232
  print("=" * 50)
233
 
 
234
  enable_cuda_optimizations()
235
 
 
236
  if next(model.parameters()).device.type != device:
237
  print(f"Moving model to {device}...")
238
  model = model.to(device)
239
 
 
240
  if use_float8:
241
+ model = apply_float8_quantization_simple(model)
242
 
 
243
  if use_compile:
244
+ model = apply_torch_compile_simple(model, mode=compile_mode)
245
 
246
  print("=" * 50)
247
  print("Model optimization complete!")
 
250
  return model
251
 
252
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  def check_optimization_support():
254
+ """Check which optimizations are available."""
 
 
255
  print("Checking optimization support...")
256
  print("-" * 40)
257
 
 
258
  print(f"CUDA available: {torch.cuda.is_available()}")
259
  if torch.cuda.is_available():
260
  print(f" Device: {torch.cuda.get_device_name()}")
 
261
 
 
 
 
 
 
 
 
 
262
  try:
263
  from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
264
+ print(f"torchao Float8: Available")
265
  except ImportError:
266
+ print(f"torchao Float8: Not available")
267
 
 
268
  try:
269
+ import spaces
270
+ print(f"spaces (ZeroGPU AOT): Available")
271
+ print(f" aoti_capture: {hasattr(spaces, 'aoti_capture')}")
272
+ print(f" aoti_compile: {hasattr(spaces, 'aoti_compile')}")
273
+ print(f" aoti_apply: {hasattr(spaces, 'aoti_apply')}")
274
+ except ImportError:
275
+ print(f"spaces (ZeroGPU AOT): Not available")
276
 
277
  print("-" * 40)
278