Spaces:

TSXu
/

UniCalli_Dev

Running on Zero

TSXu committed on Jan 30

Commit

8c4267f

1 Parent(s): b21384d

Add compiled graph caching to HF Hub

- Check Hub for cached compiled_graph.pt2 on startup
- If exists, load using ZeroGPUCompiledModel (fast ~30s)
- If not, compile and upload to Hub for future use
- Based on zerogpu-aoti/Flux-Compiled-Graph pattern

Files changed (1) hide show

app.py +116 -34

app.py CHANGED Viewed

@@ -152,6 +152,79 @@ INDUCTOR_CONFIGS = {
     'triton.cudagraphs': True,
 }
 def init_generator():
     """Initialize the generator (without optimization - that's done separately)"""
@@ -216,16 +289,16 @@ def parse_font_style(font_style: str) -> str:
     return None
-@spaces.GPU(duration=900) # don't change this duration!!!
 def compile_model_first_time():
     """
-    First-time: Load model and run AOT compilation.
-    With Float8 quantization for faster inference.
     """
     global _is_optimized, generator
     logger.info("="*50)
-    logger.info("First-time run: Loading model and AOT compiling...")
     logger.info("="*50)
     try:
@@ -233,10 +306,25 @@ def compile_model_first_time():
         gen = init_generator()
         model = gen.model
-        # ========== AOT Compilation (fp32 only - testing export without quantization) ==========
         # Step 1: Capture model forward during a real inference
-        logger.info("Step 1: Capturing model forward pass with spaces.aoti_capture...")
         with spaces.aoti_capture(model) as call:
             gen.generate(
                 text="测试长度等于七",
@@ -256,33 +344,22 @@ def compile_model_first_time():
             else:
                 logger.info(f"    {k}: {type(v).__name__} = {v}")
-        # Step 2: Build dynamic_shapes (FLUX-Kontext-fp8 pattern: all static)
-        # tree_map_only maps all tensors/bools to None = static shape
         # Step 2: Build dynamic_shapes (all static)
-        # For non-tensor types (like float ip_scale), we must use None
         logger.info("Step 2: Building static shapes...")
         dynamic_shapes = {}
         for k, v in call.kwargs.items():
-            if isinstance(v, torch.Tensor):
-                dynamic_shapes[k] = None  # Static shape for tensors
-            else:
-                dynamic_shapes[k] = None  # Must be None for non-tensor types
         logger.info(f"  dynamic_shapes: {dynamic_shapes}")
         logger.info("✓ Static shapes configured!")
-        # Step 3: Disable gradients on model (required for AOT export)
         logger.info("Step 3: Disabling gradients on model...")
         model.eval()
         model.requires_grad_(False)
         logger.info("✓ Model in eval mode with gradients disabled!")
-        # Step 4: Float8 quantization DISABLED (causes issues on some GPU types)
-        # logger.info("Step 4: Applying Float8 quantization...")
-        # quantize_(model, Float8DynamicActivationFloat8WeightConfig())
-        # logger.info("✓ Float8 quantization complete!")
-        # Step 4 (was Step 5): Detach inputs (requires_grad causes issues with AOT export)
-        logger.info("Step 5: Detaching inputs to avoid gradient issues...")
         detached_args = tuple(
             a.detach() if isinstance(a, torch.Tensor) else a for a in call.args
         )
@@ -292,8 +369,8 @@ def compile_model_first_time():
         }
         logger.info("✓ Inputs detached!")
-        # Step 6: Export model with torch.export.export (not draft_export)
-        logger.info("Step 6: Exporting model with torch.export.export...")
         exported = torch.export.export(
             mod=model,
             args=detached_args,
@@ -302,12 +379,16 @@ def compile_model_first_time():
         )
         logger.info("✓ Model exported!")
-        # Step 7: AOT compile with spaces.aoti_compile
-        logger.info("Step 7: AOT compiling with spaces.aoti_compile...")
         logger.info(f"  Inductor configs: {INDUCTOR_CONFIGS}")
         compiled = spaces.aoti_compile(exported, INDUCTOR_CONFIGS)
         logger.info("✓ AOT compilation complete!")
         # Step 8: Apply compiled model
         logger.info("Step 8: Applying compiled model...")
         spaces.aoti_apply(compiled, model)
@@ -315,6 +396,9 @@ def compile_model_first_time():
         _is_optimized = True
         logger.info("="*50)
     except Exception as e:
         logger.error("="*50)
         logger.error("AOT COMPILATION FAILED!")
@@ -322,17 +406,12 @@ def compile_model_first_time():
         logger.error(f"Exception: {e}")
         logger.error("Full traceback:")
         logger.error(traceback.format_exc())
-        # Save full error to file
         with open("aot_error.log", "w") as f:
             f.write(f"Exception: {e}\n\n")
             f.write(traceback.format_exc())
         raise
-    logger.info("✓ Model loaded and AOT compiled!")
-    logger.info("="*50)
     # NOTE: Don't return gen - causes pickle error in ZeroGPU multiprocessing
-    # Generator is stored in global variable and accessed via init_generator()
     return None
@@ -387,12 +466,15 @@ def interactive_session(
     # Determine author
     author = author_dropdown if author_dropdown != "None (Synthetic / 合成风格)" else None
-    # Step 1: AOT compile if not done yet (5 min, one-time)
     if not _is_optimized:
-        yield "⏳ 首次运行，AOT编译优化模型（约3-5分钟，仅此一次）...", []
-        progress(0.1, desc="AOT编译中...")
         compile_model_first_time()
-        yield "✅ AOT编译完成！", []
     # Step 2: Run generation (2 min)
     yield f"🎨 开始生成 {num_images} 张图片...", []

     'triton.cudagraphs': True,
 }
+# ============================================================
+# AOT Compiled Graph Caching (save to / load from HF Hub)
+# ============================================================
+HF_CACHE_REPO = "TSXu/Unicalli_Pro"
+HF_CACHE_FILENAME = "compiled_graph.pt2"
+def _check_compiled_graph_exists():
+    """Return True if the cached compiled graph is listed in the Hub repo.
+
+    Lists the files of ``HF_CACHE_REPO`` and checks whether
+    ``HF_CACHE_FILENAME`` is among them. Any error raised while querying the
+    Hub is logged and treated as a cache miss.
+
+    Returns:
+        True if the file is listed in the repo, False if it is absent or the
+        Hub could not be queried.
+    """
+    # Only HfApi is needed for the listing; nothing is downloaded here.
+    from huggingface_hub import HfApi
+    try:
+        repo_files = HfApi().list_repo_files(HF_CACHE_REPO)
+    except Exception as e:
+        # Best effort: a failed lookup is reported as "not cached".
+        logger.info(f"Could not check Hub for compiled graph: {e}")
+        return False
+    return HF_CACHE_FILENAME in repo_files
+def _load_compiled_graph(model):
+    """Fetch the cached compiled graph from the Hub and install it on ``model``.
+
+    Downloads ``HF_CACHE_FILENAME`` from ``HF_CACHE_REPO``, wraps it together
+    with the model's current state dict in a ``ZeroGPUCompiledModel``, and
+    assigns that object as ``model.forward``.
+
+    Args:
+        model: Module exposing ``state_dict()``; its ``forward`` attribute is
+            overwritten in place.
+
+    Returns:
+        True once the compiled graph has been applied. Download and loading
+        errors are not caught here and propagate to the caller.
+    """
+    from huggingface_hub import hf_hub_download
+    from spaces.zero.torch.aoti import (
+        ZeroGPUCompiledModel,
+        ZeroGPUWeights,
+        drain_module_parameters,
+    )
+    logger.info(f"Downloading compiled graph from {HF_CACHE_REPO}/{HF_CACHE_FILENAME}...")
+    graph_path = hf_hub_download(HF_CACHE_REPO, HF_CACHE_FILENAME)
+    logger.info(f"✓ Downloaded to: {graph_path}")
+    logger.info("Loading compiled graph into model...")
+    # Collect the weights before drain_module_parameters() touches the module.
+    weights = ZeroGPUWeights(dict(model.state_dict()))
+    # Route every forward call through the precompiled graph.
+    model.forward = ZeroGPUCompiledModel(graph_path, weights)
+    # NOTE(review): presumably releases the module's own parameter storage now
+    # that the compiled model holds the weights — confirm against `spaces`.
+    drain_module_parameters(model)
+    logger.info("✓ Compiled graph loaded and applied!")
+    return True
+def _upload_compiled_graph(compiled):
+    """Upload the AOT-compiled graph archive to the HF Hub cache repo.
+
+    Best effort: every failure (missing token, archive serialization, disk
+    write, upload) is logged and reported through the return value instead of
+    being raised, so a failed upload cannot abort the caller after a
+    successful compilation.
+
+    Args:
+        compiled: Result of the AOT compile step; its ``archive_file``
+            attribute must provide ``getvalue()`` returning the archive bytes.
+
+    Returns:
+        True if the archive was uploaded, False otherwise.
+    """
+    import tempfile
+    from huggingface_hub import upload_file
+    hf_token = os.environ.get("HF_TOKEN")
+    if not hf_token:
+        logger.warning("HF_TOKEN not set, cannot upload compiled graph")
+        return False
+    logger.info(f"Uploading compiled graph to {HF_CACHE_REPO}/{HF_CACHE_FILENAME}...")
+    temp_path = None
+    try:
+        # delete=False: the file must outlive the `with` block so that
+        # upload_file can reopen it by path; it is removed in `finally`.
+        with tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) as f:
+            # Record the path first so cleanup still runs if the write fails.
+            temp_path = f.name
+            f.write(compiled.archive_file.getvalue())
+        upload_file(
+            path_or_fileobj=temp_path,
+            path_in_repo=HF_CACHE_FILENAME,
+            repo_id=HF_CACHE_REPO,
+            token=hf_token,
+            commit_message="Upload AOT compiled graph",
+        )
+        logger.info("✓ Compiled graph uploaded to Hub!")
+        return True
+    except Exception as e:
+        logger.error(f"Failed to upload compiled graph: {e}")
+        return False
+    finally:
+        if temp_path is not None:
+            try:
+                os.unlink(temp_path)
+            except OSError as e:
+                # A cleanup failure must not override the upload result.
+                logger.warning(f"Could not remove temporary file {temp_path}: {e}")
 def init_generator():
     """Initialize the generator (without optimization - that's done separately)"""
     return None
+@spaces.GPU(duration=900)  # 15 min for compilation (if needed)
 def compile_model_first_time():
     """
+    First-time: Load model and either load cached compiled graph or compile from scratch.
+    Compiled graph is cached on HF Hub for fast subsequent cold starts.
     """
     global _is_optimized, generator
     logger.info("="*50)
+    logger.info("First-time run: Loading model...")
     logger.info("="*50)
     try:
         gen = init_generator()
         model = gen.model
+        # Check if compiled graph exists on Hub
+        logger.info("Checking for cached compiled graph on HF Hub...")
+        if _check_compiled_graph_exists():
+            logger.info("="*50)
+            logger.info("Found cached compiled graph! Loading from Hub...")
+            logger.info("="*50)
+            _load_compiled_graph(model)
+            _is_optimized = True
+            logger.info("✓ Model loaded with cached compiled graph!")
+            logger.info("="*50)
+            return None
+        # No cached graph, compile from scratch
+        logger.info("="*50)
+        logger.info("No cached graph found. Compiling from scratch...")
+        logger.info("="*50)
         # Step 1: Capture model forward during a real inference
+        logger.info("Step 1: Capturing model forward pass...")
         with spaces.aoti_capture(model) as call:
             gen.generate(
                 text="测试长度等于七",
             else:
                 logger.info(f"    {k}: {type(v).__name__} = {v}")
         # Step 2: Build dynamic_shapes (all static)
         logger.info("Step 2: Building static shapes...")
         dynamic_shapes = {}
         for k, v in call.kwargs.items():
+            dynamic_shapes[k] = None  # Static shape for all
         logger.info(f"  dynamic_shapes: {dynamic_shapes}")
         logger.info("✓ Static shapes configured!")
+        # Step 3: Disable gradients on model
         logger.info("Step 3: Disabling gradients on model...")
         model.eval()
         model.requires_grad_(False)
         logger.info("✓ Model in eval mode with gradients disabled!")
+        # Step 4: Detach inputs
+        logger.info("Step 4: Detaching inputs...")
         detached_args = tuple(
             a.detach() if isinstance(a, torch.Tensor) else a for a in call.args
         )
         }
         logger.info("✓ Inputs detached!")
+        # Step 5: Export model
+        logger.info("Step 5: Exporting model with torch.export.export...")
         exported = torch.export.export(
             mod=model,
             args=detached_args,
         )
         logger.info("✓ Model exported!")
+        # Step 6: AOT compile
+        logger.info("Step 6: AOT compiling with spaces.aoti_compile...")
         logger.info(f"  Inductor configs: {INDUCTOR_CONFIGS}")
         compiled = spaces.aoti_compile(exported, INDUCTOR_CONFIGS)
         logger.info("✓ AOT compilation complete!")
+        # Step 7: Upload compiled graph to Hub
+        logger.info("Step 7: Uploading compiled graph to Hub...")
+        _upload_compiled_graph(compiled)
         # Step 8: Apply compiled model
         logger.info("Step 8: Applying compiled model...")
         spaces.aoti_apply(compiled, model)
         _is_optimized = True
         logger.info("="*50)
+        logger.info("✓ Model compiled and cached to Hub!")
+        logger.info("="*50)
     except Exception as e:
         logger.error("="*50)
         logger.error("AOT COMPILATION FAILED!")
         logger.error(f"Exception: {e}")
         logger.error("Full traceback:")
         logger.error(traceback.format_exc())
         with open("aot_error.log", "w") as f:
             f.write(f"Exception: {e}\n\n")
             f.write(traceback.format_exc())
         raise
     # NOTE: Don't return gen - causes pickle error in ZeroGPU multiprocessing
     return None
     # Determine author
     author = author_dropdown if author_dropdown != "None (Synthetic / 合成风格)" else None
+    # Step 1: Load compiled graph (cached) or compile (first time)
     if not _is_optimized:
+        if _check_compiled_graph_exists():
+            yield "⏳ 加载已缓存的编译模型...", []
+        else:
+            yield "⏳ 首次运行，编译优化模型（约5-10分钟，仅此一次）...", []
+        progress(0.1, desc="加载/编译中...")
         compile_model_first_time()
+        yield "✅ 模型加载完成！", []
     # Step 2: Run generation (2 min)
     yield f"🎨 开始生成 {num_images} 张图片...", []