Spaces:
Running on Zero
Running on Zero
Apply precompiled AOTI block (dynamic L=16·k) to both transformers; ~1.28x speedup on 1024 turbo
Browse files
app.py
CHANGED
|
@@ -27,6 +27,9 @@ MODEL_ID = "diffusers-internal-dev/ideogram-4-nf4-v2"
|
|
| 27 |
# Just the LM head, grafted onto the pipeline's own Qwen3-VL encoder to make it generative.
|
| 28 |
LM_HEAD_REPO = "multimodalart/qwen3-vl-8b-instruct-lm-head"
|
| 29 |
TOKENIZER_ID = "Qwen/Qwen3-VL-8B-Instruct" # processor/tokenizer only (no weights)
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
MAX_SEED = 2**31 - 1
|
| 32 |
|
|
@@ -56,6 +59,32 @@ try:
|
|
| 56 |
except Exception:
|
| 57 |
OUTLINES_AVAILABLE = False
|
| 58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
# --- Caption schema (matches Ideogram's native caption / caption_verifier) ---
|
| 61 |
class ObjElement(BaseModel):
|
|
@@ -179,6 +208,8 @@ def generate(
|
|
| 179 |
if randomize_seed or seed < 0:
|
| 180 |
seed = random.randint(0, MAX_SEED)
|
| 181 |
|
|
|
|
|
|
|
| 182 |
final_prompt = prompt
|
| 183 |
if enhance:
|
| 184 |
if not OUTLINES_AVAILABLE:
|
|
@@ -208,6 +239,7 @@ def _warmup():
|
|
| 208 |
NOTE: runtime nf4->bf16 dequant is intentionally NOT done here — it does not persist across ZeroGPU
|
| 209 |
forks (each request re-forks from the nf4 parent process), so bf16+speed will come from a precompiled
|
| 210 |
AOTI artifact instead."""
|
|
|
|
| 211 |
if ENHANCER is not None:
|
| 212 |
upsample_prompt("a red apple on a wooden table", 1024, 1024)
|
| 213 |
print("[warmup] upsampler ready on GPU", flush=True)
|
|
|
|
| 27 |
# Just the LM head, grafted onto the pipeline's own Qwen3-VL encoder to make it generative.
|
| 28 |
LM_HEAD_REPO = "multimodalart/qwen3-vl-8b-instruct-lm-head"
|
| 29 |
TOKENIZER_ID = "Qwen/Qwen3-VL-8B-Instruct" # processor/tokenizer only (no weights)
|
| 30 |
+
# Precompiled (weight-less, dynamic L=16·k) AOTI package for one Ideogram4TransformerBlock.
|
| 31 |
+
# Applied to all 68 block instances across both transformers; ~1.28x on 1024 turbo.
|
| 32 |
+
AOTI_REPO = "multimodalart/i4-block-aoti"
|
| 33 |
|
| 34 |
MAX_SEED = 2**31 - 1
|
| 35 |
|
|
|
|
| 59 |
except Exception:
|
| 60 |
OUTLINES_AVAILABLE = False
|
| 61 |
|
| 62 |
+
# Download the AOTI package once at startup, in the CPU/parent process; binding the .so is deferred to
# the GPU workers. AOTI_DIR stays None when the download fails, which keeps the app on the eager path.
AOTI_DIR = None
try:
    from huggingface_hub import snapshot_download

    AOTI_DIR = snapshot_download(AOTI_REPO, repo_type="model")
except Exception as err:
    print(f"[aoti] package fetch failed, running eager: {err!r}", flush=True)
|
| 69 |
+
|
| 70 |
+
# Each ZeroGPU worker re-forks from the parent and must bind the compiled .so itself; guard so it
|
| 71 |
+
# succeeds at most once per process (a failed bind leaves the flag unset, so the next call retries).
|
| 72 |
+
_AOTI_APPLIED = False
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def _apply_aoti():
    """Bind the precompiled AOTI block to both pipeline transformers.

    Meant to run once per GPU worker process and must be called from inside ``@spaces.GPU``. The call
    does nothing when a previous call in this process already succeeded, or when the package was not
    fetched at startup (``AOTI_DIR is None``). Any failure is logged and swallowed so generation falls
    back to eager execution; the guard flag is left unset in that case, so a later call tries again.
    """
    global _AOTI_APPLIED
    if _AOTI_APPLIED:
        return
    if AOTI_DIR is None:
        return
    try:
        # Same order as always: pipe.transformer first, then pipe.unconditional_transformer.
        for attr_name in ("transformer", "unconditional_transformer"):
            spaces.aoti_load_from_package_dir(getattr(pipe, attr_name), AOTI_DIR)
        # Only flag success after both loads went through.
        _AOTI_APPLIED = True
        print("[aoti] compiled block applied to both transformers", flush=True)
    except Exception as err:  # never let a compile-bind hiccup block generation — fall back to eager
        print(f"[aoti] apply failed, running eager: {err!r}", flush=True)
|
| 87 |
+
|
| 88 |
|
| 89 |
# --- Caption schema (matches Ideogram's native caption / caption_verifier) ---
|
| 90 |
class ObjElement(BaseModel):
|
|
|
|
| 208 |
if randomize_seed or seed < 0:
|
| 209 |
seed = random.randint(0, MAX_SEED)
|
| 210 |
|
| 211 |
+
_apply_aoti() # bind compiled blocks on this worker (no-op after first call)
|
| 212 |
+
|
| 213 |
final_prompt = prompt
|
| 214 |
if enhance:
|
| 215 |
if not OUTLINES_AVAILABLE:
|
|
|
|
| 239 |
NOTE: runtime nf4->bf16 dequant is intentionally NOT done here — it does not persist across ZeroGPU
|
| 240 |
forks (each request re-forks from the nf4 parent process), so bf16+speed will come from a precompiled
|
| 241 |
AOTI artifact instead."""
|
| 242 |
+
_apply_aoti() # bind compiled blocks + trigger the lazy per-worker .so load
|
| 243 |
if ENHANCER is not None:
|
| 244 |
upsample_prompt("a red apple on a wooden table", 1024, 1024)
|
| 245 |
print("[warmup] upsampler ready on GPU", flush=True)
|