Spaces:
Running on Zero
Running on Zero
Drop dequantize (AOTI off -> run nf4 directly)
Browse files
app.py
CHANGED
|
@@ -55,14 +55,12 @@ MODES = {
|
|
| 55 |
"Quality · 48 steps": dict(num_inference_steps=48, guidance_schedule=(7.0,) * 45 + (3.0,) * 3, mu=0.0, std=1.5),
|
| 56 |
}
|
| 57 |
|
| 58 |
-
# --- Pipeline
|
| 59 |
-
#
|
| 60 |
t = time.perf_counter()
|
| 61 |
pipe = Ideogram4Pipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
|
| 62 |
-
pipe.transformer.dequantize()
|
| 63 |
-
pipe.unconditional_transformer.dequantize()
|
| 64 |
pipe.to("cuda")
|
| 65 |
-
print(f"[timing] pipeline load
|
| 66 |
|
| 67 |
# The local prompt-enhancer LM head is grafted lazily by `pipe.upsample_prompt` on first use (onto the worker's
|
| 68 |
# GPU), so no explicit load is needed here. Local is only the fallback; Ideogram's remote API is the default.
|
|
|
|
| 55 |
"Quality · 48 steps": dict(num_inference_steps=48, guidance_schedule=(7.0,) * 45 + (3.0,) * 3, mu=0.0, std=1.5),
|
| 56 |
}
|
| 57 |
|
| 58 |
+
# --- Pipeline (nf4). No dequantize: that was only to give AOTI bf16 weights to bind; with AOTI off we run
|
| 59 |
+
# the nf4 transformers directly (less VRAM, faster startup). Re-add dequantize when re-enabling AOTI. ---
|
| 60 |
t = time.perf_counter()
|
| 61 |
pipe = Ideogram4Pipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
|
|
|
|
|
|
|
| 62 |
pipe.to("cuda")
|
| 63 |
+
print(f"[timing] pipeline load: {time.perf_counter() - t:.1f}s", flush=True)
|
| 64 |
|
| 65 |
# The local prompt-enhancer LM head is grafted lazily by `pipe.upsample_prompt` on first use (onto the worker's
|
| 66 |
# GPU), so no explicit load is needed here. Local is only the fallback; Ideogram's remote API is the default.
|