multimodalart HF Staff committed on
Commit
e99a766
·
verified ·
1 Parent(s): 6ed28ce

Apply precompiled AOTI block (dynamic L=16k) to both transformers; ~1.28x on 1024 turbo

Browse files
Files changed (1) hide show
  1. app.py +32 -0
app.py CHANGED
@@ -27,6 +27,9 @@ MODEL_ID = "diffusers-internal-dev/ideogram-4-nf4-v2"
27
  # Just the LM head, grafted onto the pipeline's own Qwen3-VL encoder to make it generative.
28
  LM_HEAD_REPO = "multimodalart/qwen3-vl-8b-instruct-lm-head"
29
  TOKENIZER_ID = "Qwen/Qwen3-VL-8B-Instruct" # processor/tokenizer only (no weights)
 
 
 
30
 
31
  MAX_SEED = 2**31 - 1
32
 
@@ -56,6 +59,32 @@ try:
56
  except Exception:
57
  OUTLINES_AVAILABLE = False
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  # --- Caption schema (matches Ideogram's native caption / caption_verifier) ---
61
  class ObjElement(BaseModel):
@@ -179,6 +208,8 @@ def generate(
179
  if randomize_seed or seed < 0:
180
  seed = random.randint(0, MAX_SEED)
181
 
 
 
182
  final_prompt = prompt
183
  if enhance:
184
  if not OUTLINES_AVAILABLE:
@@ -208,6 +239,7 @@ def _warmup():
208
  NOTE: runtime nf4->bf16 dequant is intentionally NOT done here — it does not persist across ZeroGPU
209
  forks (each request re-forks from the nf4 parent process), so bf16+speed will come from a precompiled
210
  AOTI artifact instead."""
 
211
  if ENHANCER is not None:
212
  upsample_prompt("a red apple on a wooden table", 1024, 1024)
213
  print("[warmup] upsampler ready on GPU", flush=True)
 
27
  # Just the LM head, grafted onto the pipeline's own Qwen3-VL encoder to make it generative.
28
  LM_HEAD_REPO = "multimodalart/qwen3-vl-8b-instruct-lm-head"
29
  TOKENIZER_ID = "Qwen/Qwen3-VL-8B-Instruct" # processor/tokenizer only (no weights)
30
+ # Precompiled (weight-less, dynamic L=16·k) AOTI package for one Ideogram4TransformerBlock.
31
+ # Applied to all 68 block instances across both transformers; ~1.28x on 1024 turbo.
32
+ AOTI_REPO = "multimodalart/i4-block-aoti"
33
 
34
  MAX_SEED = 2**31 - 1
35
 
 
59
  except Exception:
60
  OUTLINES_AVAILABLE = False
61
 
62
# Download the AOTI package once at startup (CPU/parent process); binding the .so itself happens later,
# on-GPU, separately in each worker.
AOTI_DIR = None
try:
    from huggingface_hub import snapshot_download

    AOTI_DIR = snapshot_download(AOTI_REPO, repo_type="model")
except Exception as e:
    # Best-effort: without the package the app simply keeps running the eager transformers.
    print(f"[aoti] package fetch failed, running eager: {e!r}", flush=True)

# Per-process guard. Every ZeroGPU worker re-forks from the parent and has to bind the compiled .so
# itself, so this flag keeps the bind from happening more than once in any one process.
_AOTI_APPLIED = False
73
+
74
+
75
def _apply_aoti():
    """Bind the precompiled block to both transformers (once per GPU worker).

    Must run inside @spaces.GPU. Returns immediately when the package was not
    fetched (``AOTI_DIR is None``) or when a bind has already been attempted in
    this process. Any error raised while binding is logged and swallowed, so
    the caller carries on with the eager transformers.
    """
    global _AOTI_APPLIED
    if _AOTI_APPLIED or AOTI_DIR is None:
        return
    # Set the guard *before* binding. If the first transformer binds and the second one fails, a later
    # call in this process must not bind the first transformer a second time, and a failing load must not
    # be repeated on every request. From here on the flag means "attempted", not "succeeded".
    _AOTI_APPLIED = True
    try:
        for module in (pipe.transformer, pipe.unconditional_transformer):
            spaces.aoti_load_from_package_dir(module, AOTI_DIR)
    except Exception as e:  # never let a compile-bind hiccup block generation — fall back to eager
        print(f"[aoti] apply failed, running eager: {e!r}", flush=True)
    else:
        print("[aoti] compiled block applied to both transformers", flush=True)
87
+
88
 
89
  # --- Caption schema (matches Ideogram's native caption / caption_verifier) ---
90
  class ObjElement(BaseModel):
 
208
  if randomize_seed or seed < 0:
209
  seed = random.randint(0, MAX_SEED)
210
 
211
+ _apply_aoti() # bind compiled blocks on this worker (no-op after first call)
212
+
213
  final_prompt = prompt
214
  if enhance:
215
  if not OUTLINES_AVAILABLE:
 
239
  NOTE: runtime nf4->bf16 dequant is intentionally NOT done here — it does not persist across ZeroGPU
240
  forks (each request re-forks from the nf4 parent process), so bf16+speed will come from a precompiled
241
  AOTI artifact instead."""
242
+ _apply_aoti() # bind compiled blocks + trigger the lazy per-worker .so load
243
  if ENHANCER is not None:
244
  upsample_prompt("a red apple on a wooden table", 1024, 1024)
245
  print("[warmup] upsampler ready on GPU", flush=True)