multimodalart HF Staff committed on
Commit
d3ed6b5
·
verified ·
1 Parent(s): 31d862b

Re-enable AOTI (PR#6 artifact) + model-card blurb + title

Browse files
Files changed (1) hide show
  1. app.py +26 -10
app.py CHANGED
@@ -19,6 +19,7 @@ import gradio as gr
19
  import requests
20
  import spaces
21
  import torch
 
22
 
23
  from diffusers import Ideogram4Pipeline
24
 
@@ -58,19 +59,33 @@ MODES = {
58
  "Quality · 48 steps": dict(num_inference_steps=48, guidance_schedule=(7.0,) * 45 + (3.0,) * 3, mu=0.0, std=1.5),
59
  }
60
 
61
- # --- Pipeline (nf4). No dequantize: that was only to give AOTI bf16 weights to bind; with AOTI off we run
62
- # the nf4 transformers directly (less VRAM, faster startup). Re-add dequantize when re-enabling AOTI. ---
63
  t = time.perf_counter()
64
  pipe = Ideogram4Pipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
 
 
65
  pipe.to("cuda")
66
- print(f"[timing] pipeline load: {time.perf_counter() - t:.1f}s", flush=True)
67
 
68
  # The local prompt-enhancer LM head is grafted lazily by `pipe.upsample_prompt` on first use (onto the worker's
69
  # GPU), so no explicit load is needed here. Local is only the fallback; Ideogram's remote API is the default.
70
 
71
- # AOTI off: PR #5 changed the block forward (5 flat args -> 4 with a rope tuple), so the compiled .so is
72
- # stale. Recompiling against the new block; re-enable (prefetch + vec-isa prewarm) once the artifact is rebuilt.
73
- AOTI_OK = False
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  _AOTI_APPLIED = False
76
 
@@ -181,12 +196,13 @@ except Exception as e: # a flaky ZeroGPU worker must not take down the Space
181
  print(f"[warmup] failed (will warm lazily on first request): {e!r}", flush=True)
182
 
183
 
184
- with gr.Blocks(theme=gr.themes.Citrus(), title="Ideogram 4 (NF4) — diffusers preview") as demo:
185
  gr.Markdown(
186
  "# Ideogram 4\n"
187
- "The first open-weights Ideogram model — a 9.3B-parameter text-to-image diffusion model with strong "
188
- "prompt adherence and text rendering.\n\n"
189
- "[Model](https://huggingface.co/ideogram-ai/ideogram-4-nf4) · Blog (soon)"
 
190
  )
191
 
192
  with gr.Row():
 
19
  import requests
20
  import spaces
21
  import torch
22
+ from huggingface_hub import hf_hub_download
23
 
24
  from diffusers import Ideogram4Pipeline
25
 
 
59
  "Quality · 48 steps": dict(num_inference_steps=48, guidance_schedule=(7.0,) * 45 + (3.0,) * 3, mu=0.0, std=1.5),
60
  }
61
 
62
+ # --- Pipeline: dequantize both transformers nf4 -> bf16 in the parent (CPU) so AOTI can bind its weight-less
63
+ # graph to real bf16 weights (this is repo cold start, which is fine; function cold start stays fast). ---
64
  t = time.perf_counter()
65
  pipe = Ideogram4Pipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
66
+ pipe.transformer.dequantize()
67
+ pipe.unconditional_transformer.dequantize()
68
  pipe.to("cuda")
69
+ print(f"[timing] pipeline load + dequant: {time.perf_counter() - t:.1f}s", flush=True)
70
 
71
  # The local prompt-enhancer LM head is grafted lazily by `pipe.upsample_prompt` on first use (onto the worker's
72
  # GPU), so no explicit load is needed here. Local is only the fallback; Ideogram's remote API is the default.
73
 
74
+ # Pre-fetch the AOTI package AND pre-warm torch-inductor's CPU-ISA probe in the PARENT (repo cold start). The
75
+ # probe (valid_vec_isa_list) compiles test programs (~20s) the first time aoti_blocks_load builds a LazyAOTIModel;
76
+ # doing it once here means every ZeroGPU fork inherits the functools.cache, so per-worker (function cold start)
77
+ # aoti_blocks_load is just the ~instant block patch instead of a ~20s compile.
78
+ try:
79
+ hf_hub_download(AOTI_REPO, "package.pt2", subfolder="Ideogram4TransformerBlock")
80
+ from torch._inductor.cpu_vec_isa import valid_vec_isa_list
81
+
82
+ t = time.perf_counter()
83
+ valid_vec_isa_list()
84
+ print(f"[timing] vec-isa prewarm (parent): {time.perf_counter() - t:.1f}s", flush=True)
85
+ AOTI_OK = True
86
+ except Exception as e:
87
+ AOTI_OK = False
88
+ print(f"[aoti] prefetch/prewarm failed, running eager: {e!r}", flush=True)
89
 
90
  _AOTI_APPLIED = False
91
 
 
196
  print(f"[warmup] failed (will warm lazily on first request): {e!r}", flush=True)
197
 
198
 
199
+ with gr.Blocks(theme=gr.themes.Citrus(), title="Ideogram 4") as demo:
200
  gr.Markdown(
201
  "# Ideogram 4\n"
202
+ "Ideogram's first open-weights model — a 9.3B-parameter text-to-image foundation model at the "
203
+ "forefront of design, with best-in-class text rendering.\n\n"
204
+ "[Model](https://huggingface.co/ideogram-ai/ideogram-4-nf4) · "
205
+ "[Blog](https://ideogram.ai/blog/ideogram-4.0/)"
206
  )
207
 
208
  with gr.Row():