Spaces:

ideogram-ai
/

ideogram4

Running on Zero

App Files Files Community

multimodalart HF Staff committed on Jun 3

Commit

e99a766

verified ·

1 Parent(s): 6ed28ce

Apply precompiled AOTI block (dynamic L=16·k) to both transformers; ~1.28x on 1024 turbo

Browse files

Files changed (1) hide show

app.py +32 -0

app.py CHANGED Viewed

@@ -27,6 +27,9 @@ MODEL_ID = "diffusers-internal-dev/ideogram-4-nf4-v2"
 # Just the LM head, grafted onto the pipeline's own Qwen3-VL encoder to make it generative.
 LM_HEAD_REPO = "multimodalart/qwen3-vl-8b-instruct-lm-head"
 TOKENIZER_ID = "Qwen/Qwen3-VL-8B-Instruct"  # processor/tokenizer only (no weights)
 MAX_SEED = 2**31 - 1
@@ -56,6 +59,32 @@ try:
 except Exception:
     OUTLINES_AVAILABLE = False
 # --- Caption schema (matches Ideogram's native caption / caption_verifier) ---
 class ObjElement(BaseModel):
@@ -179,6 +208,8 @@ def generate(
     if randomize_seed or seed < 0:
         seed = random.randint(0, MAX_SEED)
     final_prompt = prompt
     if enhance:
         if not OUTLINES_AVAILABLE:
@@ -208,6 +239,7 @@ def _warmup():
     NOTE: runtime nf4->bf16 dequant is intentionally NOT done here — it does not persist across ZeroGPU
     forks (each request re-forks from the nf4 parent process), so bf16+speed will come from a precompiled
     AOTI artifact instead."""
     if ENHANCER is not None:
         upsample_prompt("a red apple on a wooden table", 1024, 1024)
         print("[warmup] upsampler ready on GPU", flush=True)

 # Just the LM head, grafted onto the pipeline's own Qwen3-VL encoder to make it generative.
 LM_HEAD_REPO = "multimodalart/qwen3-vl-8b-instruct-lm-head"
 TOKENIZER_ID = "Qwen/Qwen3-VL-8B-Instruct"  # processor/tokenizer only (no weights)
+# Precompiled (weight-less, dynamic L=16·k) AOTI package for one Ideogram4TransformerBlock.
+# Applied to all 68 block instances across both transformers; ~1.28x on 1024 turbo.
+AOTI_REPO = "multimodalart/i4-block-aoti"  # repo id passed to snapshot_download(..., repo_type="model") at startup
 MAX_SEED = 2**31 - 1  # inclusive upper bound for random.randint(0, MAX_SEED) when the seed is randomized
 except Exception:
     OUTLINES_AVAILABLE = False
+# Pre-fetch the AOTI package at startup (CPU/parent); the actual .so bind happens on-GPU (per worker).
+try:
+    from huggingface_hub import snapshot_download
+    AOTI_DIR = snapshot_download(AOTI_REPO, repo_type="model")  # later handed to spaces.aoti_load_from_package_dir in _apply_aoti()
+except Exception as e:  # any import/fetch failure is logged and downgraded to eager mode instead of aborting startup
+    print(f"[aoti] package fetch failed, running eager: {e!r}", flush=True)
+    AOTI_DIR = None  # None makes _apply_aoti() return early without binding anything
+# Each ZeroGPU worker re-forks from the parent and must bind the compiled .so itself; guard so a
+# successful bind happens at most once per process (a failed bind leaves the flag False and is retried).
+_AOTI_APPLIED = False
+def _apply_aoti():
+    """Bind the precompiled block to both transformers (once per GPU worker). Must run inside @spaces.GPU."""
+    global _AOTI_APPLIED
+    if _AOTI_APPLIED or AOTI_DIR is None:  # already bound in this process, or the startup package fetch failed
+        return
+    try:
+        spaces.aoti_load_from_package_dir(pipe.transformer, AOTI_DIR)
+        spaces.aoti_load_from_package_dir(pipe.unconditional_transformer, AOTI_DIR)  # same package dir for both transformers
+        _AOTI_APPLIED = True  # reached only when both loads above returned without raising
+        print("[aoti] compiled block applied to both transformers", flush=True)
+    except Exception as e:  # never let a compile-bind hiccup block generation — fall back to eager
+        print(f"[aoti] apply failed, running eager: {e!r}", flush=True)
 # --- Caption schema (matches Ideogram's native caption / caption_verifier) ---
 class ObjElement(BaseModel):
     if randomize_seed or seed < 0:
         seed = random.randint(0, MAX_SEED)
+    _apply_aoti()  # bind compiled blocks on this worker (no-op after first call)
     final_prompt = prompt
     if enhance:
         if not OUTLINES_AVAILABLE:
     NOTE: runtime nf4->bf16 dequant is intentionally NOT done here — it does not persist across ZeroGPU
     forks (each request re-forks from the nf4 parent process), so bf16+speed will come from a precompiled
     AOTI artifact instead."""
+    _apply_aoti()  # bind compiled blocks + trigger the lazy per-worker .so load
     if ENHANCER is not None:
         upsample_prompt("a red apple on a wooden table", 1024, 1024)
         print("[warmup] upsampler ready on GPU", flush=True)