TSXu committed on
Commit
7c94f94
·
1 Parent(s): 6e4c09b

Implement proper AOT compilation with spaces.aoti_capture/compile/apply

Browse files

Exactly following FLUX-Kontext-fp8 pattern:
1. spaces.aoti_capture(model) - capture forward pass during real inference
2. tree_map_only to build dynamic shapes
3. quantize_(model, Float8DynamicActivationFloat8WeightConfig())
4. torch.export.export(model, args, kwargs, dynamic_shapes)
5. spaces.aoti_compile(exported, INDUCTOR_CONFIGS)
6. spaces.aoti_apply(compiled, model)

Separate functions:
- compile_model_first_time(): 300s for AOT compilation (one-time)
- run_generation(): 120s for normal generation

Files changed (1) hide show
  1. app.py +76 -52
app.py CHANGED
@@ -157,22 +157,44 @@ def init_generator():
157
  return generator
158
 
159
 
160
- def optimize_transformer_(gen):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  """
162
- Apply Float8 quantization + AOT compilation using spaces.aoti_capture.
163
- Based on FLUX-Kontext-fp8 pattern.
164
  """
165
  model = gen.model
166
 
167
- @spaces.GPU(duration=300) # 5 minutes for compilation
168
  def compile_transformer():
169
  print("="*50)
170
- print("Starting AOT compilation with spaces.aoti_capture...")
171
  print("="*50)
172
 
173
- # Step 1: Capture model forward during a real generation
174
- print("Capturing model forward pass...")
175
  with spaces.aoti_capture(model) as call:
 
176
  gen.generate(
177
  text="测试",
178
  font_style="楷",
@@ -180,17 +202,20 @@ def optimize_transformer_(gen):
180
  num_steps=1,
181
  seed=42,
182
  )
 
183
 
184
- # Step 2: Build dynamic shapes (we use fixed shapes, so set to None)
 
185
  dynamic_shapes = tree_map_only((torch.Tensor, bool), lambda t: None, call.kwargs)
 
186
 
187
  # Step 3: Apply Float8 quantization
188
- print("Applying Float8 quantization...")
189
  quantize_(model, Float8DynamicActivationFloat8WeightConfig())
190
  print("✓ Float8 quantization complete!")
191
 
192
- # Step 4: Export model
193
- print("Exporting model with torch.export...")
194
  exported = torch.export.export(
195
  mod=model,
196
  args=call.args,
@@ -199,61 +224,62 @@ def optimize_transformer_(gen):
199
  )
200
  print("✓ Model exported!")
201
 
202
- # Step 5: AOT compile
203
- print("AOT compiling with spaces.aoti_compile...")
204
  print(f" Inductor configs: {INDUCTOR_CONFIGS}")
205
- return spaces.aoti_compile(exported, INDUCTOR_CONFIGS)
 
 
 
206
 
207
  # Run compilation and apply the result
208
- print("="*50)
209
- print("Running AOT compilation (this takes 3-5 minutes)...")
210
- print("="*50)
211
  spaces.aoti_apply(compile_transformer(), model)
212
  print("="*50)
213
- print("✓ AOT compilation complete! Model is now optimized.")
214
  print("="*50)
215
 
216
 
217
- def update_font_choices(author: str):
 
218
  """
219
- Update available font choices based on selected author
 
220
  """
221
- if author == "None (Synthetic / 合成风格)" or author not in AUTHOR_FONTS:
222
- choices = list(FONT_STYLE_NAMES.values())
223
- else:
224
- available_fonts = AUTHOR_FONTS[author]
225
- choices = [FONT_STYLE_NAMES[font] for font in available_fonts if font in FONT_STYLE_NAMES]
226
 
227
- return gr.Dropdown(choices=choices, value=choices[0] if choices else None)
228
-
229
-
230
- def parse_font_style(font_style: str) -> str:
231
- """Extract font key from display name"""
232
- for font_key, font_display in FONT_STYLE_NAMES.items():
233
- if font_display == font_style:
234
- return font_key
235
- return None
 
 
 
 
 
 
 
236
 
237
 
238
  @spaces.GPU(duration=120) # 2 minutes for normal generation
239
  def run_generation(text, font, author, num_steps, start_seed, num_images):
240
  """
241
- Run generation with the optimized model.
242
- Duration: 20s base + ~4s per step per image.
243
  """
244
- gen = init_generator()
245
 
246
  results = []
247
  seeds_used = []
248
-
249
  for i in range(num_images):
250
  current_seed = start_seed + i
251
  result_img, cond_img = gen.generate(
252
- text=text,
253
- font_style=font,
254
- author=author,
255
- num_steps=num_steps,
256
- seed=current_seed,
257
  )
258
  results.append((result_img, f"Seed: {current_seed}"))
259
  seeds_used.append(current_seed)
@@ -271,8 +297,7 @@ def interactive_session(
271
  progress=gr.Progress()
272
  ):
273
  """
274
- Interactive session with separate compilation and generation phases.
275
-
276
  - First time: 5 min for AOT compilation (one-time)
277
  - After that: 2 min for generation
278
  """
@@ -292,17 +317,16 @@ def interactive_session(
292
  # Determine author
293
  author = author_dropdown if author_dropdown != "None (Synthetic / 合成风格)" else None
294
 
295
- # Step 1: AOT Compile if not done yet (5 min, one-time only)
296
  if not _is_optimized:
297
- yield "⏳ 首次运行,需要编译优化模型(约3-5分钟,仅此一次)...", []
298
- gen = init_generator()
299
- optimize_transformer_(gen) # This uses @spaces.GPU(duration=300) internally
300
- _is_optimized = True
301
- yield "✅ 模型编译完成!后续生成将会很快。", []
302
 
303
  # Step 2: Run generation (2 min)
304
  yield f"🎨 开始生成 {num_images} 张图片...", []
305
- progress(0.1, desc="生成中...")
306
 
307
  results, seeds_used = run_generation(
308
  text, font, author, num_steps, start_seed, num_images
 
157
  return generator
158
 
159
 
160
+ def update_font_choices(author: str):
161
+ """
162
+ Update available font choices based on selected author
163
+ """
164
+ if author == "None (Synthetic / 合成风格)" or author not in AUTHOR_FONTS:
165
+ choices = list(FONT_STYLE_NAMES.values())
166
+ else:
167
+ available_fonts = AUTHOR_FONTS[author]
168
+ choices = [FONT_STYLE_NAMES[font] for font in available_fonts if font in FONT_STYLE_NAMES]
169
+
170
+ return gr.Dropdown(choices=choices, value=choices[0] if choices else None)
171
+
172
+
173
+ def parse_font_style(font_style: str) -> str:
174
+ """Extract font key from display name"""
175
+ for font_key, font_display in FONT_STYLE_NAMES.items():
176
+ if font_display == font_style:
177
+ return font_key
178
+ return None
179
+
180
+
181
+ def aot_compile_transformer(gen):
182
  """
183
+ AOT compile the transformer using spaces.aoti_capture/compile/apply.
184
+ Exactly following FLUX-Kontext-fp8 pattern.
185
  """
186
  model = gen.model
187
 
188
+ @spaces.GPU(duration=300) # 5 minutes for AOT compilation
189
  def compile_transformer():
190
  print("="*50)
191
+ print("Starting AOT compilation (FLUX-Kontext-fp8 pattern)...")
192
  print("="*50)
193
 
194
+ # Step 1: Capture model forward during a real inference
195
+ print("Step 1: Capturing model forward pass with spaces.aoti_capture...")
196
  with spaces.aoti_capture(model) as call:
197
+ # Run a sample generation to capture the forward call
198
  gen.generate(
199
  text="测试",
200
  font_style="楷",
 
202
  num_steps=1,
203
  seed=42,
204
  )
205
+ print("✓ Forward pass captured!")
206
 
207
+ # Step 2: Build dynamic shapes (None = fixed shapes)
208
+ print("Step 2: Building dynamic shapes...")
209
  dynamic_shapes = tree_map_only((torch.Tensor, bool), lambda t: None, call.kwargs)
210
+ print("✓ Dynamic shapes built!")
211
 
212
  # Step 3: Apply Float8 quantization
213
+ print("Step 3: Applying Float8 quantization...")
214
  quantize_(model, Float8DynamicActivationFloat8WeightConfig())
215
  print("✓ Float8 quantization complete!")
216
 
217
+ # Step 4: Export model with torch.export
218
+ print("Step 4: Exporting model with torch.export...")
219
  exported = torch.export.export(
220
  mod=model,
221
  args=call.args,
 
224
  )
225
  print("✓ Model exported!")
226
 
227
+ # Step 5: AOT compile with spaces.aoti_compile
228
+ print("Step 5: AOT compiling with spaces.aoti_compile...")
229
  print(f" Inductor configs: {INDUCTOR_CONFIGS}")
230
+ compiled = spaces.aoti_compile(exported, INDUCTOR_CONFIGS)
231
+ print("✓ AOT compilation complete!")
232
+
233
+ return compiled
234
 
235
  # Run compilation and apply the result
236
+ print("Running AOT compilation...")
 
 
237
  spaces.aoti_apply(compile_transformer(), model)
238
  print("="*50)
239
+ print("✓ AOT compiled model applied!")
240
  print("="*50)
241
 
242
 
243
+ @spaces.GPU(duration=300) # 5 minutes for first-time AOT compilation
244
+ def compile_model_first_time():
245
  """
246
+ First-time: Load model and run AOT compilation.
247
+ Returns the optimized generator.
248
  """
249
+ global _is_optimized, generator
 
 
 
 
250
 
251
+ print("="*50)
252
+ print("First-time run: Loading model and running AOT compilation...")
253
+ print("="*50)
254
+
255
+ # Load model
256
+ gen = init_generator()
257
+
258
+ # AOT compile the transformer
259
+ aot_compile_transformer(gen)
260
+
261
+ _is_optimized = True
262
+ print("="*50)
263
+ print("✓ Model loaded and AOT compiled!")
264
+ print("="*50)
265
+
266
+ return gen
267
 
268
 
269
  @spaces.GPU(duration=120) # 2 minutes for normal generation
270
  def run_generation(text, font, author, num_steps, start_seed, num_images):
271
  """
272
+ Run generation with the AOT-compiled model.
 
273
  """
274
+ gen = init_generator() # Returns the already-compiled generator
275
 
276
  results = []
277
  seeds_used = []
 
278
  for i in range(num_images):
279
  current_seed = start_seed + i
280
  result_img, cond_img = gen.generate(
281
+ text=text, font_style=font, author=author,
282
+ num_steps=num_steps, seed=current_seed,
 
 
 
283
  )
284
  results.append((result_img, f"Seed: {current_seed}"))
285
  seeds_used.append(current_seed)
 
297
  progress=gr.Progress()
298
  ):
299
  """
300
+ Interactive session:
 
301
  - First time: 5 min for AOT compilation (one-time)
302
  - After that: 2 min for generation
303
  """
 
317
  # Determine author
318
  author = author_dropdown if author_dropdown != "None (Synthetic / 合成风格)" else None
319
 
320
+ # Step 1: AOT compile if not done yet (5 min, one-time)
321
  if not _is_optimized:
322
+ yield "⏳ 首次运行,AOT编译优化模型(约3-5分钟,仅此一次)...", []
323
+ progress(0.1, desc="AOT编译中...")
324
+ compile_model_first_time()
325
+ yield "✅ AOT编译完成!", []
 
326
 
327
  # Step 2: Run generation (2 min)
328
  yield f"🎨 开始生成 {num_images} 张图片...", []
329
+ progress(0.5, desc="生成中...")
330
 
331
  results, seeds_used = run_generation(
332
  text, font, author, num_steps, start_seed, num_images