TSXu committed on
Commit
a53108a
·
1 Parent(s): 3974489

Add Float8 quantization and torch.compile optimizations

Browse files

- Add optimization.py module with torchao Float8 quantization support
- Add torch.compile with inductor optimizations (max_autotune, cudagraphs, etc.)
- Enable CUDA optimizations (TF32, Flash SDPA, cuDNN benchmark)
- Add --float8, --compile, --compile-mode CLI arguments
- Update requirements.txt with torchao>=0.4.0 and torch>=2.4.0

Files changed (3) hide show
  1. inference.py +40 -1
  2. optimization.py +322 -0
  3. requirements.txt +2 -1
inference.py CHANGED
@@ -18,6 +18,7 @@ from huggingface_hub import hf_hub_download, snapshot_download
18
  from src.flux.util import configs, load_ae, load_clip, load_t5
19
  from src.flux.model import Flux
20
  from src.flux.xflux_pipeline import XFluxSampler
 
21
 
22
 
23
  # HuggingFace Hub model IDs
@@ -150,6 +151,9 @@ class CalligraphyGenerator:
150
  author_descriptions_path: str = "calligraphy_styles_en.json",
151
  use_deepspeed: bool = False,
152
  use_4bit_quantization: bool = False,
 
 
 
153
  deepspeed_config: Optional[str] = None,
154
  dtype: Optional[str] = None
155
  ):
@@ -166,6 +170,10 @@ class CalligraphyGenerator:
166
  font_descriptions_path: path to font style descriptions JSON
167
  author_descriptions_path: path to author style descriptions JSON
168
  use_deepspeed: whether to use DeepSpeed ZeRO for memory optimization
 
 
 
 
169
  deepspeed_config: path to DeepSpeed config JSON file
170
  dtype: force specific dtype for inference: "fp16", "bf16", "fp32", or None for auto
171
  """
@@ -176,7 +184,13 @@ class CalligraphyGenerator:
176
  self.use_deepspeed = use_deepspeed
177
  self.deepspeed_config = deepspeed_config
178
  self.use_4bit_quantization = use_4bit_quantization
 
 
 
179
  self.forced_dtype = dtype # "fp16", "bf16", "fp32", or None for auto
 
 
 
180
 
181
  # Load font and author style descriptions
182
  if os.path.exists(font_descriptions_path):
@@ -232,6 +246,17 @@ class CalligraphyGenerator:
232
  )
233
  if self.use_deepspeed:
234
  self.model = self._init_deepspeed(self.model)
 
 
 
 
 
 
 
 
 
 
 
235
 
236
  # Load VAE
237
  if self.use_deepspeed or offload:
@@ -1088,14 +1113,28 @@ if __name__ == "__main__":
1088
  parser.add_argument("--checkpoint", type=str, default=None, help="Checkpoint path")
1089
  parser.add_argument("--list-authors", action="store_true", help="List available authors")
1090
  parser.add_argument("--list-fonts", action="store_true", help="List available font styles")
 
 
 
 
 
 
1091
 
1092
  args = parser.parse_args()
 
 
 
 
 
1093
 
1094
  # Initialize generator
1095
  generator = CalligraphyGenerator(
1096
  model_name="flux-dev",
1097
  device=args.device,
1098
- checkpoint_path=args.checkpoint
 
 
 
1099
  )
1100
 
1101
  # List available options
 
18
  from src.flux.util import configs, load_ae, load_clip, load_t5
19
  from src.flux.model import Flux
20
  from src.flux.xflux_pipeline import XFluxSampler
21
+ from optimization import optimize_model, enable_cuda_optimizations, check_optimization_support
22
 
23
 
24
  # HuggingFace Hub model IDs
 
151
  author_descriptions_path: str = "calligraphy_styles_en.json",
152
  use_deepspeed: bool = False,
153
  use_4bit_quantization: bool = False,
154
+ use_float8_quantization: bool = False,
155
+ use_torch_compile: bool = False,
156
+ compile_mode: str = "reduce-overhead",
157
  deepspeed_config: Optional[str] = None,
158
  dtype: Optional[str] = None
159
  ):
 
170
  font_descriptions_path: path to font style descriptions JSON
171
  author_descriptions_path: path to author style descriptions JSON
172
  use_deepspeed: whether to use DeepSpeed ZeRO for memory optimization
173
+ use_4bit_quantization: whether to use 4-bit quantization (quanto/bitsandbytes)
174
+ use_float8_quantization: whether to use Float8 quantization (torchao) for faster inference
175
+ use_torch_compile: whether to use torch.compile for optimized inference
176
+ compile_mode: torch.compile mode - "reduce-overhead", "max-autotune", or "default"
177
  deepspeed_config: path to DeepSpeed config JSON file
178
  dtype: force specific dtype for inference: "fp16", "bf16", "fp32", or None for auto
179
  """
 
184
  self.use_deepspeed = use_deepspeed
185
  self.deepspeed_config = deepspeed_config
186
  self.use_4bit_quantization = use_4bit_quantization
187
+ self.use_float8_quantization = use_float8_quantization
188
+ self.use_torch_compile = use_torch_compile
189
+ self.compile_mode = compile_mode
190
  self.forced_dtype = dtype # "fp16", "bf16", "fp32", or None for auto
191
+
192
+ # Enable CUDA optimizations early
193
+ enable_cuda_optimizations()
194
 
195
  # Load font and author style descriptions
196
  if os.path.exists(font_descriptions_path):
 
246
  )
247
  if self.use_deepspeed:
248
  self.model = self._init_deepspeed(self.model)
249
+
250
+ # Apply Float8 quantization and torch.compile optimizations
251
+ if not self.use_deepspeed and not self.use_4bit_quantization:
252
+ if self.use_float8_quantization or self.use_torch_compile:
253
+ self.model = optimize_model(
254
+ self.model,
255
+ device=str(self.device),
256
+ use_float8=self.use_float8_quantization,
257
+ use_compile=self.use_torch_compile,
258
+ compile_mode=self.compile_mode
259
+ )
260
 
261
  # Load VAE
262
  if self.use_deepspeed or offload:
 
1113
  parser.add_argument("--checkpoint", type=str, default=None, help="Checkpoint path")
1114
  parser.add_argument("--list-authors", action="store_true", help="List available authors")
1115
  parser.add_argument("--list-fonts", action="store_true", help="List available font styles")
1116
+ parser.add_argument("--float8", action="store_true", help="Use Float8 quantization (torchao) for faster inference")
1117
+ parser.add_argument("--compile", action="store_true", help="Use torch.compile for optimized inference")
1118
+ parser.add_argument("--compile-mode", type=str, default="reduce-overhead",
1119
+ choices=["reduce-overhead", "max-autotune", "default"],
1120
+ help="torch.compile mode")
1121
+ parser.add_argument("--check-optimizations", action="store_true", help="Check available optimization support")
1122
 
1123
  args = parser.parse_args()
1124
+
1125
+ # Check optimization support if requested
1126
+ if args.check_optimizations:
1127
+ check_optimization_support()
1128
+ exit(0)
1129
 
1130
  # Initialize generator
1131
  generator = CalligraphyGenerator(
1132
  model_name="flux-dev",
1133
  device=args.device,
1134
+ checkpoint_path=args.checkpoint,
1135
+ use_float8_quantization=args.float8,
1136
+ use_torch_compile=args.compile,
1137
+ compile_mode=args.compile_mode
1138
  )
1139
 
1140
  # List available options
optimization.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Model optimization utilities for faster inference using:
3
+ - Float8 quantization via torchao
4
+ - torch.compile with inductor optimizations
5
+ - CUDA graph capture for reduced kernel launch overhead
6
+
7
+ Inspired by FLUX-Kontext-fp8 optimization techniques.
8
+ """
9
+
10
+ from typing import Optional, Callable, Any
11
+ import torch
12
+ import torch.nn as nn
13
+
14
+
15
# Inductor (torch.compile backend) tuning flags, applied in apply_torch_compile().
# Each key is only set if torch._inductor.config actually has that attribute,
# so unknown keys are skipped on older torch versions.
INDUCTOR_CONFIGS = {
    'conv_1x1_as_mm': True,                          # treat 1x1 convs as matmuls
    'epilogue_fusion': False,                        # deliberately disabled here
    'coordinate_descent_tuning': True,               # fine-grained kernel-config search
    'coordinate_descent_check_all_directions': True,
    'max_autotune': True,                            # exhaustive autotune: slower compile, faster run
}
23
+
24
+
25
def apply_float8_quantization(model: nn.Module, device: str = "cuda") -> nn.Module:
    """
    Apply Float8 dynamic activation and weight quantization using torchao.

    This provides significant speedup on GPUs with native FP8 support (H100, etc.)
    and reasonable speedup on other GPUs through reduced memory bandwidth.

    Never raises: if torchao is missing or quantization fails for any reason,
    the original model is returned unchanged.

    Args:
        model: PyTorch model to quantize
        device: Target device for the model (e.g. "cuda" or "cuda:0")

    Returns:
        Quantized model, or the original model on failure.
    """
    try:
        from torchao.quantization import quantize_, Float8DynamicActivationFloat8WeightConfig

        print("Applying Float8 dynamic activation + Float8 weight quantization...")

        # Move model to device first if not already there. next() with a sentinel
        # avoids StopIteration on a parameterless model, and comparing device
        # *types* avoids a spurious move when device is "cuda:0" and the model
        # already sits on CUDA.
        first_param = next(model.parameters(), None)
        if first_param is not None and first_param.device.type != torch.device(device).type:
            model = model.to(device)

        # quantize_ mutates the module in place (swaps eligible layers to Float8)
        quantize_(model, Float8DynamicActivationFloat8WeightConfig())

        print("Float8 quantization applied successfully!")
        return model

    except ImportError as e:
        print(f"torchao not available for Float8 quantization: {e}")
        print("Install with: pip install torchao")
        return model
    except Exception as e:
        print(f"Float8 quantization failed: {e}")
        print("Falling back to unquantized model")
        return model
63
+
64
def apply_torch_compile(
    model: nn.Module,
    mode: str = "reduce-overhead",
    fullgraph: bool = False,
    dynamic: bool = True,
    backend: str = "inductor"
) -> nn.Module:
    """
    Compile *model* with torch.compile using the tuned inductor settings.

    Args:
        model: PyTorch model to compile
        mode: Compilation mode - "reduce-overhead" (best for inference),
            "max-autotune" (slower compile, faster runtime), or "default"
        fullgraph: If True, requires entire forward to be capturable (faster but stricter)
        dynamic: If True, allows dynamic shapes (recommended for variable input sizes)
        backend: Compiler backend - "inductor" is recommended

    Returns:
        Compiled model, or the original model unchanged if compilation setup fails.
    """
    try:
        import torch._inductor.config as inductor_config

        # Push our tuning flags into the inductor config; silently skip any
        # flag this torch version does not expose.
        for flag in INDUCTOR_CONFIGS:
            if hasattr(inductor_config, flag):
                setattr(inductor_config, flag, INDUCTOR_CONFIGS[flag])

        print(f"Applying torch.compile with mode='{mode}', backend='{backend}'...")

        optimized = torch.compile(
            model,
            mode=mode,
            fullgraph=fullgraph,
            dynamic=dynamic,
            backend=backend,
        )

        print("torch.compile applied successfully!")
        return optimized

    except Exception as e:
        print(f"torch.compile failed: {e}")
        print("Falling back to uncompiled model")
        return model
110
+
111
+
112
def enable_cuda_optimizations():
    """
    Turn on global CUDA backend flags that speed up inference.

    No-op (with a message) when CUDA is unavailable; any failure while
    setting individual flags is reported, never raised.
    """
    if not torch.cuda.is_available():
        print("CUDA not available, skipping CUDA optimizations")
        return

    try:
        # TF32 matmuls / convolutions on Ampere-and-newer GPUs
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True

        # Let cuDNN benchmark and cache the fastest conv algorithm per shape
        torch.backends.cudnn.benchmark = True

        # Prefer the fast SDPA kernels; turn off the slow math fallback
        torch.backends.cuda.enable_flash_sdp(True)
        torch.backends.cuda.enable_mem_efficient_sdp(True)
        torch.backends.cuda.enable_math_sdp(False)

        print("CUDA optimizations enabled (TF32, cuDNN benchmark, Flash SDPA)")
    except Exception as e:
        print(f"Some CUDA optimizations failed: {e}")
137
+
138
+
139
def optimize_model(
    model: nn.Module,
    device: str = "cuda",
    use_float8: bool = True,
    use_compile: bool = True,
    compile_mode: str = "reduce-overhead"
) -> nn.Module:
    """
    Apply all optimizations to the model for maximum inference speed.

    Optimizations applied, in order:
        1. CUDA backend optimizations (TF32, Flash SDPA, etc.)
        2. Float8 quantization via torchao (if available)
        3. torch.compile with inductor optimizations

    Args:
        model: PyTorch model to optimize
        device: Target device (e.g. "cuda" or "cuda:0")
        use_float8: Whether to apply Float8 quantization
        use_compile: Whether to apply torch.compile
        compile_mode: Mode for torch.compile

    Returns:
        Optimized model
    """
    print("=" * 50)
    print("Applying model optimizations...")
    print("=" * 50)

    # 1. Enable CUDA optimizations
    enable_cuda_optimizations()

    # 2. Move model to device. next() with a sentinel avoids an uncaught
    # StopIteration on a parameterless model, and comparing device *types*
    # avoids a spurious move when device is "cuda:0" and the model is
    # already on CUDA.
    first_param = next(model.parameters(), None)
    if first_param is not None and first_param.device.type != torch.device(device).type:
        print(f"Moving model to {device}...")
        model = model.to(device)

    # 3. Apply Float8 quantization (no-op fallback if torchao is missing)
    if use_float8:
        model = apply_float8_quantization(model, device)

    # 4. Apply torch.compile (no-op fallback on failure)
    if use_compile:
        model = apply_torch_compile(model, mode=compile_mode)

    print("=" * 50)
    print("Model optimization complete!")
    print("=" * 50)

    return model
189
+
190
+
191
def warmup_model(
    model: nn.Module,
    warmup_fn: Callable[[], Any],
    num_warmup: int = 3
):
    """
    Run a few forward passes so a compiled model finishes JIT compilation.

    Args:
        model: The model (should already be compiled)
        warmup_fn: Function that runs a forward pass
        num_warmup: Number of warmup iterations

    Failures of individual iterations are reported and skipped, never raised.
    """
    print(f"Warming up model with {num_warmup} iterations...")

    with torch.no_grad():
        for iteration in range(1, num_warmup + 1):
            try:
                warmup_fn()
            except Exception as e:
                print(f" Warmup {iteration}/{num_warmup} failed: {e}")
            else:
                print(f" Warmup {iteration}/{num_warmup} complete")

    # Drain any queued CUDA kernels before declaring the warmup done
    if torch.cuda.is_available():
        torch.cuda.synchronize()

    print("Model warmup complete!")
219
+
220
+
221
class CUDAGraphWrapper(nn.Module):
    """
    Wrapper that captures and replays CUDA graphs for reduced kernel launch overhead.

    Note: CUDA graphs require static input shapes. Use this only if your input
    dimensions are fixed.
    """

    def __init__(self, model: nn.Module, warmup_fn: Callable[[], tuple]):
        # NOTE(review): warmup_fn is accepted but never stored or called here;
        # capture() does its own inline warmup instead. Confirm whether this
        # parameter should be used or removed.
        super().__init__()
        self.model = model
        self.graph = None           # torch.cuda.CUDAGraph, set by capture()
        self.static_inputs = None   # cloned input tensors reused on every replay
        self.static_outputs = None  # outputs produced inside the captured graph
        self._captured = False      # forward() falls back to eager until True

    def capture(self, *sample_inputs):
        """
        Capture the CUDA graph with sample inputs.

        Args:
            *sample_inputs: Sample inputs with the exact shapes that will be used
        """
        if not torch.cuda.is_available():
            print("CUDA not available, skipping graph capture")
            return

        print("Capturing CUDA graph...")

        # Warmup: run eager passes first so one-time lazy initialization
        # happens outside the capture.
        with torch.no_grad():
            for _ in range(3):
                _ = self.model(*sample_inputs)

        torch.cuda.synchronize()

        # Capture
        self.graph = torch.cuda.CUDAGraph()

        # Create static tensors: clones kept alive so the captured graph can
        # read the same memory addresses on every replay.
        self.static_inputs = tuple(inp.clone() for inp in sample_inputs)

        with torch.cuda.graph(self.graph):
            self.static_outputs = self.model(*self.static_inputs)

        self._captured = True
        print("CUDA graph captured successfully!")

    def forward(self, *inputs):
        # Before capture(), behave exactly like the wrapped model.
        if not self._captured:
            return self.model(*inputs)

        # Copy inputs to static buffers
        for static_inp, inp in zip(self.static_inputs, inputs):
            static_inp.copy_(inp)

        # Replay graph
        self.graph.replay()

        # NOTE: the same output tensors are returned on every call and are
        # rewritten by the next replay — copy them if they must persist.
        return self.static_outputs
281
+
282
+
283
+ # Utility function to check available optimizations
284
+ def check_optimization_support():
285
+ """
286
+ Check which optimizations are available on the current system.
287
+ """
288
+ print("Checking optimization support...")
289
+ print("-" * 40)
290
+
291
+ # CUDA
292
+ print(f"CUDA available: {torch.cuda.is_available()}")
293
+ if torch.cuda.is_available():
294
+ print(f" Device: {torch.cuda.get_device_name()}")
295
+ print(f" Capability: {torch.cuda.get_device_capability()}")
296
+
297
+ # torch.compile
298
+ try:
299
+ import torch._dynamo
300
+ print(f"torch.compile available: True")
301
+ except ImportError:
302
+ print(f"torch.compile available: False")
303
+
304
+ # torchao Float8
305
+ try:
306
+ from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
307
+ print(f"torchao Float8 available: True")
308
+ except ImportError:
309
+ print(f"torchao Float8 available: False")
310
+
311
+ # Flash Attention
312
+ try:
313
+ flash_available = torch.backends.cuda.flash_sdp_enabled() if torch.cuda.is_available() else False
314
+ print(f"Flash SDPA available: {flash_available}")
315
+ except:
316
+ print(f"Flash SDPA available: Unknown")
317
+
318
+ print("-" * 40)
319
+
320
+
321
+ if __name__ == "__main__":
322
+ check_optimization_support()
requirements.txt CHANGED
@@ -8,8 +8,9 @@ safetensors>=0.4.0
8
  # Model and inference
9
  optimum-quanto
10
  bitsandbytes>=0.41.0
11
- torch
12
  torchvision
 
13
  timm
14
  sentencepiece
15
  diffusers
 
8
  # Model and inference
9
  optimum-quanto
10
  bitsandbytes>=0.41.0
11
+ torch>=2.4.0
12
  torchvision
13
+ torchao>=0.4.0
14
  timm
15
  sentencepiece
16
  diffusers