Spaces: Running on Zero
Remove AOT compilation code, keep FA3 + FP8 only

- Removed compile_model_first_time()
- Removed AOT cache functions (_check_compiled_graph_exists, _load_compiled_graph, _upload_compiled_graph)
- Removed INDUCTOR_CONFIGS
- Simplified logging setup
- FA3 + FP8 quantization is fast enough without pre-compilation (see the sketch after the file list)
- FLUX-Kontext-fp8 +0 -1
- app.py +6 -239
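
The FP8 path this commit keeps reduces to a single torchao call. A minimal sketch, assuming a generic nn.Module (the Space's actual generator.model wiring and FA3 attention setup are outside this diff); apply_fp8 is a hypothetical helper name:

import torch
from torchao.quantization import quantize_, Float8DynamicActivationFloat8WeightConfig

def apply_fp8(model: torch.nn.Module) -> torch.nn.Module:
    # Hypothetical helper for illustration. quantize_ mutates the module in
    # place, swapping eligible linear layers to FP8 weights with dynamic
    # FP8 activation quantization.
    quantize_(model, Float8DynamicActivationFloat8WeightConfig())
    return model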
FLUX-Kontext-fp8
DELETED
@@ -1 +0,0 @@
-Subproject commit 1588a5618e83f18d291920de2b399d530edf8dbc
app.py
CHANGED
@@ -1,45 +1,23 @@
 # -*- coding: utf-8 -*-
 """
 Gradio Demo for Chinese Calligraphy Generation - HuggingFace Space Version
-With
+With FA3 + FP8 quantization for faster inference
 """

-# Install compatible torch 2.8 + torchvision 0.23 + torchao + spaces (for AOT compilation)
-# spaces.aoti_capture requires PyTorch 2.8+
 import os
 import sys
 import logging
-import traceback
 from datetime import datetime

-# Setup logging
-LOG_FILE = "aot_compile.log"
+# Setup logging
 logging.basicConfig(
-    level=logging.
+    level=logging.INFO,
     format='%(asctime)s [%(levelname)s] %(message)s',
-    handlers=[
-        logging.FileHandler(LOG_FILE, mode='w', encoding='utf-8'),
-        logging.StreamHandler(sys.stdout)
-    ]
+    handlers=[logging.StreamHandler(sys.stdout)]
 )
 logger = logging.getLogger(__name__)

-#
-class LoggingPrinter:
-    def __init__(self, logger, original_stdout):
-        self.logger = logger
-        self.original_stdout = original_stdout
-    def write(self, message):
-        if message.strip():
-            self.logger.info(message.strip())
-        self.original_stdout.write(message)
-    def flush(self):
-        self.original_stdout.flush()
-
-# Keep original stdout for gradio
-_original_stdout = sys.stdout
-
-# Install compatible nightly versions - let pip resolve the exact matching versions
+# Install nightly versions for FA3 + FP8 support
 os.system('pip install --upgrade --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu126 torch torchvision torchao spaces')
 logger.info("torch + torchvision + torchao + spaces (nightly) installation complete!")

@@ -100,7 +78,6 @@ except:
 # Global generator instance
 generator = None
 _cached_model_dir = None
-_is_optimized = False

 # ============================================================
 # Pre-download model files at startup (no GPU needed)
@@ -160,94 +137,10 @@ print("="*50)


 # ============================================================
-#
+# FP8 Quantization (works with FA3)
 # ============================================================
-from torch.utils._pytree import tree_map_only
-# FP8 quantization for faster inference (works with FA3)
 from torchao.quantization import quantize_, Float8DynamicActivationFloat8WeightConfig

-# Inductor configuration for optimal performance
-INDUCTOR_CONFIGS = {
-    'conv_1x1_as_mm': True,
-    'epilogue_fusion': False,
-    'coordinate_descent_tuning': True,
-    'coordinate_descent_check_all_directions': True,
-    'max_autotune': True,
-    'triton.cudagraphs': True,
-}
-
-# ============================================================
-# AOT Compiled Graph Caching (save to / load from HF Hub)
-# ============================================================
-HF_CACHE_REPO = "TSXu/Unicalli_Pro"
-HF_CACHE_FILENAME = "compiled_graph.pt2"
-
-
-def _check_compiled_graph_exists():
-    """Check if compiled graph exists on HF Hub (fast check)"""
-    from huggingface_hub import hf_hub_url, get_hf_file_metadata
-    try:
-        url = hf_hub_url(HF_CACHE_REPO, HF_CACHE_FILENAME)
-        get_hf_file_metadata(url)  # Raises if file doesn't exist
-        return True
-    except Exception:
-        return False
-
-
-def _load_compiled_graph(model):
-    """Load compiled graph from HF Hub using ZeroGPU internals"""
-    from huggingface_hub import hf_hub_download
-    from spaces.zero.torch.aoti import ZeroGPUCompiledModel, ZeroGPUWeights, drain_module_parameters
-
-    logger.info(f"Downloading compiled graph from {HF_CACHE_REPO}/{HF_CACHE_FILENAME}...")
-    compiled_graph_file = hf_hub_download(HF_CACHE_REPO, HF_CACHE_FILENAME)
-    logger.info(f"✓ Downloaded to: {compiled_graph_file}")
-
-    logger.info("Loading compiled graph into model...")
-    state_dict = model.state_dict()
-    zerogpu_weights = ZeroGPUWeights({name: weight for name, weight in state_dict.items()})
-    compiled = ZeroGPUCompiledModel(compiled_graph_file, zerogpu_weights)
-
-    # Replace forward method
-    setattr(model, "forward", compiled)
-    drain_module_parameters(model)
-    logger.info("✓ Compiled graph loaded and applied!")
-    return True
-
-
-def _upload_compiled_graph(compiled):
-    """Upload compiled graph to HF Hub"""
-    from huggingface_hub import upload_file
-    import tempfile
-
-    hf_token = os.environ.get("HF_TOKEN")
-    if not hf_token:
-        logger.warning("HF_TOKEN not set, cannot upload compiled graph")
-        return False
-
-    logger.info(f"Uploading compiled graph to {HF_CACHE_REPO}/{HF_CACHE_FILENAME}...")
-
-    # Save archive to temp file
-    with tempfile.NamedTemporaryFile(suffix=".pt2", delete=False) as f:
-        f.write(compiled.archive_file.getvalue())
-        temp_path = f.name
-
-    try:
-        upload_file(
-            path_or_fileobj=temp_path,
-            path_in_repo=HF_CACHE_FILENAME,
-            repo_id=HF_CACHE_REPO,
-            token=hf_token,
-            commit_message="Upload AOT compiled graph",
-        )
-        logger.info("✓ Compiled graph uploaded to Hub!")
-        return True
-    except Exception as e:
-        logger.error(f"Failed to upload compiled graph: {e}")
-        return False
-    finally:
-        os.unlink(temp_path)
-

 def init_generator():
     """Initialize the generator (without optimization - that's done separately)"""
@@ -312,132 +205,6 @@ def parse_font_style(font_style: str) -> str:
     return None


-@spaces.GPU(duration=900)  # 15 min for compilation (if needed)
-def compile_model_first_time():
-    """
-    First-time: Load model and either load cached compiled graph or compile from scratch.
-    Compiled graph is cached on HF Hub for fast subsequent cold starts.
-    """
-    global _is_optimized, generator
-
-    logger.info("="*50)
-    logger.info("First-time run: Loading model...")
-    logger.info("="*50)
-
-    try:
-        # Load model
-        gen = init_generator()
-        model = gen.model
-
-        # Check if compiled graph exists on Hub
-        logger.info("Checking for cached compiled graph on HF Hub...")
-        if _check_compiled_graph_exists():
-            logger.info("="*50)
-            logger.info("Found cached compiled graph! Loading from Hub...")
-            logger.info("="*50)
-            _load_compiled_graph(model)
-            _is_optimized = True
-            logger.info("✓ Model loaded with cached compiled graph!")
-            logger.info("="*50)
-            return None
-
-        # No cached graph, compile from scratch
-        logger.info("="*50)
-        logger.info("No cached graph found. Compiling from scratch...")
-        logger.info("="*50)
-
-        # Step 1: Capture model forward during a real inference
-        logger.info("Step 1: Capturing model forward pass...")
-        with spaces.aoti_capture(model) as call:
-            gen.generate(
-                text="测试长度等于七",
-                font_style="楷",
-                author=None,
-                num_steps=1,
-                seed=42,
-            )
-        logger.info("✓ Forward pass captured!")
-
-        # Log call info
-        logger.info(f"  call.args types: {[type(a).__name__ for a in call.args]}")
-        logger.info(f"  call.kwargs keys: {list(call.kwargs.keys())}")
-        for k, v in call.kwargs.items():
-            if hasattr(v, 'shape'):
-                logger.info(f"  {k}: tensor shape={v.shape}, dtype={v.dtype}")
-            else:
-                logger.info(f"  {k}: {type(v).__name__} = {v}")
-
-        # Step 2: Build dynamic_shapes (all static)
-        logger.info("Step 2: Building static shapes...")
-        dynamic_shapes = {}
-        for k, v in call.kwargs.items():
-            dynamic_shapes[k] = None  # Static shape for all
-        logger.info(f"  dynamic_shapes: {dynamic_shapes}")
-        logger.info("✓ Static shapes configured!")
-
-        # Step 3: Disable gradients on model
-        logger.info("Step 3: Disabling gradients on model...")
-        model.eval()
-        model.requires_grad_(False)
-        logger.info("✓ Model in eval mode with gradients disabled!")
-
-        # Step 4: Detach inputs
-        logger.info("Step 4: Detaching inputs...")
-        detached_args = tuple(
-            a.detach() if isinstance(a, torch.Tensor) else a for a in call.args
-        )
-        detached_kwargs = {
-            k: v.detach() if isinstance(v, torch.Tensor) else v
-            for k, v in call.kwargs.items()
-        }
-        logger.info("✓ Inputs detached!")
-
-        # Step 5: Export model
-        logger.info("Step 5: Exporting model with torch.export.export...")
-        exported = torch.export.export(
-            mod=model,
-            args=detached_args,
-            kwargs=detached_kwargs,
-            dynamic_shapes=dynamic_shapes,
-        )
-        logger.info("✓ Model exported!")
-
-        # Step 6: AOT compile
-        logger.info("Step 6: AOT compiling with spaces.aoti_compile...")
-        logger.info(f"  Inductor configs: {INDUCTOR_CONFIGS}")
-        compiled = spaces.aoti_compile(exported, INDUCTOR_CONFIGS)
-        logger.info("✓ AOT compilation complete!")
-
-        # Step 7: Upload compiled graph to Hub
-        logger.info("Step 7: Uploading compiled graph to Hub...")
-        _upload_compiled_graph(compiled)
-
-        # Step 8: Apply compiled model
-        logger.info("Step 8: Applying compiled model...")
-        spaces.aoti_apply(compiled, model)
-        logger.info("✓ AOT compiled model applied!")
-
-        _is_optimized = True
-        logger.info("="*50)
-        logger.info("✓ Model compiled and cached to Hub!")
-        logger.info("="*50)
-
-    except Exception as e:
-        logger.error("="*50)
-        logger.error("AOT COMPILATION FAILED!")
-        logger.error("="*50)
-        logger.error(f"Exception: {e}")
-        logger.error("Full traceback:")
-        logger.error(traceback.format_exc())
-        with open("aot_error.log", "w") as f:
-            f.write(f"Exception: {e}\n\n")
-            f.write(traceback.format_exc())
-        raise
-
-    # NOTE: Don't return gen - causes pickle error in ZeroGPU multiprocessing
-    return None
-
-
 def _get_generation_duration(text, font, author, num_steps, start_seed, num_images):
     """Calculate dynamic GPU duration: 20s loading + 1.5s per step per image"""
     return 20 + int(1.5 * num_steps * num_images)
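
The duration helper kept above pairs with ZeroGPU's dynamic-duration support, where a callable passed as duration receives the same arguments as the decorated function. A sketch under that assumption; generate_images is a hypothetical entry point, not shown in this diff:

import spaces

@spaces.GPU(duration=_get_generation_duration)  # hypothetical wiring
def generate_images(text, font, author, num_steps, start_seed, num_images):
    # e.g. num_steps=40, num_images=2 -> 20 + int(1.5 * 40 * 2) = 140 s of GPU time
    ...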
|