multimodalart HF Staff committed on
Commit
332e802
verified
1 Parent(s): a1f24dc

Drop dequantize (AOTI off -> run nf4 directly)

Browse files
Files changed (1) hide show
  1. app.py +3 -5
app.py CHANGED
@@ -55,14 +55,12 @@ MODES = {
55
  "Quality · 48 steps": dict(num_inference_steps=48, guidance_schedule=(7.0,) * 45 + (3.0,) * 3, mu=0.0, std=1.5),
56
  }
57
 
58
- # --- Pipeline: dequantize both transformers nf4 -> bf16 in the parent (CPU) so every ZeroGPU fork inherits
59
- # bf16 and AOTI can bind its weight-less graph to real weights. ---
60
  t = time.perf_counter()
61
  pipe = Ideogram4Pipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
62
- pipe.transformer.dequantize()
63
- pipe.unconditional_transformer.dequantize()
64
  pipe.to("cuda")
65
- print(f"[timing] pipeline load + dequant: {time.perf_counter() - t:.1f}s", flush=True)
66
 
67
  # The local prompt-enhancer LM head is grafted lazily by `pipe.upsample_prompt` on first use (onto the worker's
68
  # GPU), so no explicit load is needed here. Local is only the fallback; Ideogram's remote API is the default.
 
55
  "Quality · 48 steps": dict(num_inference_steps=48, guidance_schedule=(7.0,) * 45 + (3.0,) * 3, mu=0.0, std=1.5),
56
  }
57
 
58
+ # --- Pipeline (nf4). No dequantize: that was only to give AOTI bf16 weights to bind; with AOTI off we run
59
+ # the nf4 transformers directly (less VRAM, faster startup). Re-add dequantize when re-enabling AOTI. ---
60
  t = time.perf_counter()
61
  pipe = Ideogram4Pipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
 
 
62
  pipe.to("cuda")
63
+ print(f"[timing] pipeline load: {time.perf_counter() - t:.1f}s", flush=True)
64
 
65
  # The local prompt-enhancer LM head is grafted lazily by `pipe.upsample_prompt` on first use (onto the worker's
66
  # GPU), so no explicit load is needed here. Local is only the fallback; Ideogram's remote API is the default.