TSXu committed on
Commit
6e4c09b
·
1 Parent(s): 96d51ed

Use proper spaces.aoti_capture + aoti_compile + aoti_apply for AOT compilation

Browse files

- Use spaces.aoti_capture to capture real forward pass inputs
- Use spaces.aoti_compile with INDUCTOR_CONFIGS (including triton.cudagraphs)
- Use spaces.aoti_apply to apply compiled model
- Separate compilation (5 min one-time) and generation (2 min)
- First run: AOT compilation takes 3-5 minutes
- Subsequent runs: only 2 minutes for generation

This follows the FLUX-Kontext-fp8 pattern exactly.

Files changed (1) hide show
  1. app.py +79 -151
app.py CHANGED
@@ -101,14 +101,9 @@ print("="*50)
101
  # ============================================================
102
  # AOT Optimization Configuration (from FLUX-Kontext-fp8)
103
  # ============================================================
 
104
  from torchao.quantization import quantize_, Float8DynamicActivationFloat8WeightConfig
105
 
106
- # Fixed input dimensions for this model
107
- # width=128, height=896 (7 chars × 128)
108
- # After VAE packing: h=448, w=64, img_seq_len = 448*64 = 28672
109
- IMG_SEQ_LEN = 28672 # Fixed: (896/2) * (128/2)
110
- TXT_SEQ_LEN = 512 # Fixed: T5 max_length
111
-
112
  # Inductor configuration for optimal performance
113
  INDUCTOR_CONFIGS = {
114
  'conv_1x1_as_mm': True,
@@ -120,102 +115,9 @@ INDUCTOR_CONFIGS = {
120
  }
121
 
122
 
123
- def create_sample_inputs(device="cuda", dtype=torch.float32):
124
- """
125
- Create sample inputs with fixed dimensions for torch.export.
126
- """
127
- batch_size = 1
128
- hidden_size = 3072 # Flux hidden size
129
- vec_dim = 768 # CLIP vec dim
130
- cond_txt_dim = 896 # Condition text embedding dim
131
-
132
- sample_inputs = {
133
- 'img': torch.randn(batch_size, IMG_SEQ_LEN, 64, device=device, dtype=dtype), # 64 = in_channels
134
- 'img_ids': torch.zeros(batch_size, IMG_SEQ_LEN, 3, device=device, dtype=dtype),
135
- 'txt': torch.randn(batch_size, TXT_SEQ_LEN, 4096, device=device, dtype=dtype), # 4096 = T5 dim
136
- 'txt_ids': torch.zeros(batch_size, TXT_SEQ_LEN, 3, device=device, dtype=dtype),
137
- 'y': torch.randn(batch_size, vec_dim, device=device, dtype=dtype),
138
- 'timesteps': torch.tensor([0.5], device=device, dtype=dtype),
139
- 'timesteps2': torch.tensor([0.5], device=device, dtype=dtype),
140
- 'cond_txt_latent': torch.randn(batch_size, 5, cond_txt_dim, device=device, dtype=dtype), # 5 cond tokens
141
- 'guidance': torch.tensor([3.5], device=device, dtype=dtype),
142
- }
143
- return sample_inputs
144
-
145
-
146
- def apply_aot_optimization(model, device="cuda"):
147
- """
148
- Apply Float8 quantization and AOT compilation with torch.export.
149
- Based on FLUX-Kontext-fp8 optimization pattern.
150
- """
151
- import torch._inductor.config as inductor_config
152
-
153
- # Apply inductor configurations
154
- for key, value in INDUCTOR_CONFIGS.items():
155
- if hasattr(inductor_config, key):
156
- setattr(inductor_config, key, value)
157
-
158
- print("="*50)
159
- print("Starting AOT optimization with fixed input shapes...")
160
- print(f" img_seq_len: {IMG_SEQ_LEN}")
161
- print(f" txt_seq_len: {TXT_SEQ_LEN}")
162
- print("="*50)
163
-
164
- # Step 1: Apply Float8 quantization
165
- print("Applying Float8 quantization...")
166
- quantize_(model, Float8DynamicActivationFloat8WeightConfig())
167
- print("✓ Float8 quantization complete!")
168
-
169
- # Step 2: Create sample inputs for export
170
- print("Creating sample inputs for torch.export...")
171
- sample_inputs = create_sample_inputs(device=device, dtype=torch.float32)
172
-
173
- # Step 3: Export model with fixed shapes (no dynamic dims needed)
174
- print("Exporting model with torch.export (fixed shapes)...")
175
- try:
176
- exported = torch.export.export(
177
- model,
178
- args=(),
179
- kwargs=sample_inputs,
180
- strict=False, # Allow some graph breaks if needed
181
- )
182
- print("✓ Model exported!")
183
-
184
- # Step 4: AOT compile with inductor
185
- print("AOT compiling with torch._inductor.aot_compile...")
186
- compiled_path = torch._inductor.aot_compile(
187
- exported.module(),
188
- args=(),
189
- kwargs=sample_inputs,
190
- options=INDUCTOR_CONFIGS,
191
- )
192
- print(f"✓ AOT compiled to: {compiled_path}")
193
-
194
- # Step 5: Load the compiled model
195
- print("Loading AOT compiled model...")
196
- compiled_model = torch._export.aot_load(compiled_path, device=device)
197
- print("✓ AOT model loaded!")
198
-
199
- return compiled_model
200
-
201
- except Exception as e:
202
- print(f"AOT compilation failed: {e}")
203
- print("Falling back to torch.compile (JIT)...")
204
-
205
- # Fallback to JIT compilation
206
- compiled_model = torch.compile(
207
- model,
208
- mode="max-autotune",
209
- backend="inductor",
210
- fullgraph=False,
211
- )
212
- print("✓ torch.compile (JIT) applied!")
213
- return compiled_model
214
-
215
-
216
  def init_generator():
217
- """Initialize the generator with Float8 + AOT compilation"""
218
- global generator, _cached_model_dir, _is_optimized
219
 
220
  if generator is None:
221
  # Enable CUDA optimizations
@@ -247,21 +149,71 @@ def init_generator():
247
  author_descriptions_path='dataset/calligraphy_styles_en.json',
248
  use_deepspeed=False,
249
  use_4bit_quantization=False,
250
- use_float8_quantization=False, # Apply via AOT below
251
- use_torch_compile=False, # Apply via AOT below
252
  dtype="fp32",
253
  )
254
-
255
- # Apply Float8 quantization + AOT compilation (fixed input shapes)
256
- if not _is_optimized:
257
- print("Applying Float8 + AOT optimizations to transformer...")
258
- generator.model = apply_aot_optimization(generator.model, device="cuda")
259
- _is_optimized = True
260
- print("✓ Transformer optimized with Float8 + AOT compilation!")
261
 
262
  return generator
263
 
264
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
  def update_font_choices(author: str):
266
  """
267
  Update available font choices based on selected author
@@ -283,31 +235,13 @@ def parse_font_style(font_style: str) -> str:
283
  return None
284
 
285
 
286
- @spaces.GPU(duration=300) # 5 minutes for first-time compilation
287
- def compile_and_warmup():
288
- """
289
- Compile the model with Float8 + AOT optimization (first time only).
290
- """
291
- print("="*50)
292
- print("First-time compilation starting...")
293
- print("="*50)
294
- gen = init_generator()
295
- # Warmup run to trigger JIT compilation
296
- print("Running warmup generation...")
297
- gen.generate(text="测", font_style="楷", author=None, num_steps=1, seed=42)
298
- print("="*50)
299
- print("Compilation and warmup complete!")
300
- print("="*50)
301
- return gen
302
-
303
-
304
- @spaces.GPU(duration=120) # 2 minutes for normal generation (20s + 25steps * 4s = ~120s)
305
  def run_generation(text, font, author, num_steps, start_seed, num_images):
306
  """
307
- Run generation after model is already compiled.
308
- Duration: ~20s base + 4s per step, for up to 8 images.
309
  """
310
- gen = init_generator() # Returns cached generator
311
 
312
  results = []
313
  seeds_used = []
@@ -337,18 +271,10 @@ def interactive_session(
337
  progress=gr.Progress()
338
  ):
339
  """
340
- Interactive session: compile model once (5 min), then generate images (2 min each).
341
-
342
- Args:
343
- text: Input text (1-7 characters)
344
- author_dropdown: Selected author
345
- font_style: Font style
346
- num_steps: Inference steps
347
- start_seed: Starting seed
348
- num_images: Number of images to generate (each with different seed)
349
 
350
- Yields:
351
- Progress status, gallery of results
352
  """
353
  global _is_optimized
354
 
@@ -366,14 +292,16 @@ def interactive_session(
366
  # Determine author
367
  author = author_dropdown if author_dropdown != "None (Synthetic / 合成风格)" else None
368
 
369
- # Step 1: Compile model if needed (first time only, 5 min budget)
370
  if not _is_optimized:
371
- yield "⏳ 首次运行,正在编译优化模型(约3-5分钟)... / First run, compiling...", []
372
- compile_and_warmup()
373
- yield "✅ 模型编译完成!/ Model compiled!", []
374
-
375
- # Step 2: Run generation (2 min budget)
376
- yield f"🎨 开始生成 {num_images} 张图片... / Generating {num_images} images...", []
 
 
377
  progress(0.1, desc="生成中...")
378
 
379
  results, seeds_used = run_generation(
@@ -382,7 +310,7 @@ def interactive_session(
382
 
383
  progress(1.0, desc="完成!")
384
 
385
- # Final yield
386
  if num_images > 1:
387
  final_status = f"✅ 全部完成!共 {num_images} 张 (Seeds: {seeds_used[0]}-{seeds_used[-1]})"
388
  else:
 
101
  # ============================================================
102
  # AOT Optimization Configuration (from FLUX-Kontext-fp8)
103
  # ============================================================
104
+ from torch.utils._pytree import tree_map_only
105
  from torchao.quantization import quantize_, Float8DynamicActivationFloat8WeightConfig
106
 
 
 
 
 
 
 
107
  # Inductor configuration for optimal performance
108
  INDUCTOR_CONFIGS = {
109
  'conv_1x1_as_mm': True,
 
115
  }
116
 
117
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  def init_generator():
119
+ """Initialize the generator (without optimization - that's done separately)"""
120
+ global generator, _cached_model_dir
121
 
122
  if generator is None:
123
  # Enable CUDA optimizations
 
149
  author_descriptions_path='dataset/calligraphy_styles_en.json',
150
  use_deepspeed=False,
151
  use_4bit_quantization=False,
152
+ use_float8_quantization=False,
153
+ use_torch_compile=False,
154
  dtype="fp32",
155
  )
 
 
 
 
 
 
 
156
 
157
  return generator
158
 
159
 
160
+ def optimize_transformer_(gen):
161
+ """
162
+ Apply Float8 quantization + AOT compilation using spaces.aoti_capture.
163
+ Based on FLUX-Kontext-fp8 pattern.
164
+ """
165
+ model = gen.model
166
+
167
+ @spaces.GPU(duration=300) # 5 minutes for compilation
168
+ def compile_transformer():
169
+ print("="*50)
170
+ print("Starting AOT compilation with spaces.aoti_capture...")
171
+ print("="*50)
172
+
173
+ # Step 1: Capture model forward during a real generation
174
+ print("Capturing model forward pass...")
175
+ with spaces.aoti_capture(model) as call:
176
+ gen.generate(
177
+ text="测试",
178
+ font_style="楷",
179
+ author=None,
180
+ num_steps=1,
181
+ seed=42,
182
+ )
183
+
184
+ # Step 2: Build dynamic shapes (we use fixed shapes, so set to None)
185
+ dynamic_shapes = tree_map_only((torch.Tensor, bool), lambda t: None, call.kwargs)
186
+
187
+ # Step 3: Apply Float8 quantization
188
+ print("Applying Float8 quantization...")
189
+ quantize_(model, Float8DynamicActivationFloat8WeightConfig())
190
+ print("✓ Float8 quantization complete!")
191
+
192
+ # Step 4: Export model
193
+ print("Exporting model with torch.export...")
194
+ exported = torch.export.export(
195
+ mod=model,
196
+ args=call.args,
197
+ kwargs=call.kwargs,
198
+ dynamic_shapes=dynamic_shapes,
199
+ )
200
+ print("✓ Model exported!")
201
+
202
+ # Step 5: AOT compile
203
+ print("AOT compiling with spaces.aoti_compile...")
204
+ print(f" Inductor configs: {INDUCTOR_CONFIGS}")
205
+ return spaces.aoti_compile(exported, INDUCTOR_CONFIGS)
206
+
207
+ # Run compilation and apply the result
208
+ print("="*50)
209
+ print("Running AOT compilation (this takes 3-5 minutes)...")
210
+ print("="*50)
211
+ spaces.aoti_apply(compile_transformer(), model)
212
+ print("="*50)
213
+ print("✓ AOT compilation complete! Model is now optimized.")
214
+ print("="*50)
215
+
216
+
217
  def update_font_choices(author: str):
218
  """
219
  Update available font choices based on selected author
 
235
  return None
236
 
237
 
238
+ @spaces.GPU(duration=120) # 2 minutes for normal generation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  def run_generation(text, font, author, num_steps, start_seed, num_images):
240
  """
241
+ Run generation with the optimized model.
242
+ Duration: 20s base + ~4s per step per image.
243
  """
244
+ gen = init_generator()
245
 
246
  results = []
247
  seeds_used = []
 
271
  progress=gr.Progress()
272
  ):
273
  """
274
+ Interactive session with separate compilation and generation phases.
 
 
 
 
 
 
 
 
275
 
276
+ - First time: 5 min for AOT compilation (one-time)
277
+ - After that: 2 min for generation
278
  """
279
  global _is_optimized
280
 
 
292
  # Determine author
293
  author = author_dropdown if author_dropdown != "None (Synthetic / 合成风格)" else None
294
 
295
+ # Step 1: AOT Compile if not done yet (5 min, one-time only)
296
  if not _is_optimized:
297
+ yield "⏳ 首次运行,需要编译优化模型(约3-5分钟,仅此一次)...", []
298
+ gen = init_generator()
299
+ optimize_transformer_(gen) # This uses @spaces.GPU(duration=300) internally
300
+ _is_optimized = True
301
+ yield "✅ 模型编译完成!后续生成将会很快。", []
302
+
303
+ # Step 2: Run generation (2 min)
304
+ yield f"🎨 开始生成 {num_images} 张图片...", []
305
  progress(0.1, desc="生成中...")
306
 
307
  results, seeds_used = run_generation(
 
310
 
311
  progress(1.0, desc="完成!")
312
 
313
+ # Final status
314
  if num_images > 1:
315
  final_status = f"✅ 全部完成!共 {num_images} 张 (Seeds: {seeds_used[0]}-{seeds_used[-1]})"
316
  else: