Spaces:
Running on Zero
Running on Zero
Demo: native pipe.upsample_prompt + spaces.aoti_blocks_load (overlapped) + timing
Browse files- app.py +67 -195
- requirements.txt +2 -2
- v6.txt +0 -55
app.py
CHANGED
|
@@ -3,229 +3,111 @@ import sys
|
|
| 3 |
|
| 4 |
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
|
| 5 |
|
| 6 |
-
#
|
| 7 |
_HERE = os.path.dirname(os.path.abspath(__file__))
|
| 8 |
sys.path.insert(0, os.path.join(_HERE, "diffusers_src", "src"))
|
| 9 |
|
| 10 |
import json
|
| 11 |
import random
|
| 12 |
-
|
|
|
|
| 13 |
|
|
|
|
| 14 |
import spaces
|
| 15 |
import torch
|
| 16 |
-
import torch.nn as nn
|
| 17 |
-
import gradio as gr
|
| 18 |
-
from pydantic import BaseModel, Field
|
| 19 |
-
from accelerate import init_empty_weights
|
| 20 |
from huggingface_hub import hf_hub_download
|
| 21 |
-
|
| 22 |
from diffusers import Ideogram4Pipeline
|
| 23 |
-
from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
|
| 24 |
|
| 25 |
-
# --- New (safety-fixed) checkpoint ---
|
| 26 |
MODEL_ID = "diffusers-internal-dev/ideogram-4-nf4-v2"
|
| 27 |
-
# Just the LM head, grafted onto the pipeline's own Qwen3-VL encoder to make it generative.
|
| 28 |
LM_HEAD_REPO = "multimodalart/qwen3-vl-8b-instruct-lm-head"
|
| 29 |
-
TOKENIZER_ID = "Qwen/Qwen3-VL-8B-Instruct" # processor/tokenizer only (no weights)
|
| 30 |
-
# Precompiled (weight-less, dynamic L=16·k) AOTI package for one Ideogram4TransformerBlock.
|
| 31 |
-
# Applied to all 68 block instances across both transformers; ~1.28x on 1024 turbo.
|
| 32 |
AOTI_REPO = "multimodalart/i4-block-aoti"
|
| 33 |
-
|
| 34 |
MAX_SEED = 2**31 - 1
|
| 35 |
|
| 36 |
-
#
|
| 37 |
MODES = {
|
| 38 |
"Turbo · 12 steps": dict(num_inference_steps=12, guidance_schedule=(7.0,) * 11 + (3.0,) * 1, mu=0.5, std=1.75),
|
| 39 |
"Default · 20 steps": dict(num_inference_steps=20, guidance_schedule=(7.0,) * 18 + (3.0,) * 2, mu=0.0, std=1.75),
|
| 40 |
"Quality · 48 steps": dict(num_inference_steps=48, guidance_schedule=(7.0,) * 45 + (3.0,) * 3, mu=0.0, std=1.5),
|
| 41 |
}
|
| 42 |
|
| 43 |
-
# --- Pipeline -
|
|
|
|
|
|
|
| 44 |
pipe = Ideogram4Pipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
|
| 45 |
-
# Dequantize nf4 -> bf16 in the PARENT (CPU) context, BEFORE ZeroGPU forks/packs the model, so every
|
| 46 |
-
# fork inherits bf16 (a fork-local dequant doesn't persist). bitsandbytes supports CPU 4-bit dequant.
|
| 47 |
-
# This also gives AOTI real bf16 weights to bind to its (weight-less) compiled graph.
|
| 48 |
pipe.transformer.dequantize()
|
| 49 |
pipe.unconditional_transformer.dequantize()
|
| 50 |
pipe.to("cuda")
|
|
|
|
| 51 |
|
| 52 |
-
# ---
|
| 53 |
-
upsampler_proc = AutoProcessor.from_pretrained(TOKENIZER_ID)
|
| 54 |
-
LM_HEAD_PATH = hf_hub_download(LM_HEAD_REPO, "lm_head.safetensors") # cached at startup
|
| 55 |
-
|
| 56 |
try:
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
-
# Pre-fetch the AOTI package at startup
|
| 63 |
try:
|
| 64 |
-
|
| 65 |
-
|
| 66 |
except Exception as e:
|
| 67 |
-
|
| 68 |
-
|
| 69 |
|
| 70 |
-
# Each ZeroGPU worker re-forks from the parent and must bind the compiled .so itself; guard so it
|
| 71 |
-
# happens exactly once per process.
|
| 72 |
_AOTI_APPLIED = False
|
| 73 |
|
| 74 |
|
| 75 |
def _apply_aoti():
|
| 76 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 77 |
global _AOTI_APPLIED
|
| 78 |
-
if _AOTI_APPLIED or
|
| 79 |
return
|
| 80 |
try:
|
| 81 |
-
|
| 82 |
-
spaces.
|
|
|
|
| 83 |
_AOTI_APPLIED = True
|
| 84 |
-
print("[
|
| 85 |
-
except Exception as e: # never let a
|
| 86 |
print(f"[aoti] apply failed, running eager: {e!r}", flush=True)
|
| 87 |
|
| 88 |
|
| 89 |
-
# --- Caption schema (matches Ideogram's native caption / caption_verifier) ---
|
| 90 |
-
class ObjElement(BaseModel):
|
| 91 |
-
type: Literal["obj"]
|
| 92 |
-
desc: str
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
class TextElement(BaseModel):
|
| 96 |
-
type: Literal["text"]
|
| 97 |
-
text: str
|
| 98 |
-
desc: str
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
class Composition(BaseModel):
|
| 102 |
-
background: str
|
| 103 |
-
elements: List[Union[ObjElement, TextElement]] = Field(min_length=1)
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
class Caption(BaseModel):
|
| 107 |
-
high_level_description: str
|
| 108 |
-
compositional_deconstruction: Composition
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
def _load_sections(path):
|
| 112 |
-
sections, cur, buf = {}, None, []
|
| 113 |
-
for line in open(path, encoding="utf-8").read().splitlines():
|
| 114 |
-
s = line.strip()
|
| 115 |
-
if s.startswith("[") and s.endswith("]") and " " not in s:
|
| 116 |
-
if cur is not None:
|
| 117 |
-
sections[cur] = "\n".join(buf).strip()
|
| 118 |
-
cur, buf = s[1:-1].lower(), []
|
| 119 |
-
else:
|
| 120 |
-
buf.append(line)
|
| 121 |
-
if cur is not None:
|
| 122 |
-
sections[cur] = "\n".join(buf).strip()
|
| 123 |
-
return sections
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
_SEC = _load_sections(os.path.join(_HERE, "v6.txt"))
|
| 127 |
-
SYSTEM_PROMPT = _SEC["system"]
|
| 128 |
-
USER_TEMPLATE = _SEC.get("user", "User idea: {{original_prompt}}")
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
def _build_enhancer():
|
| 132 |
-
"""Graft the hosted lm_head onto pipe.text_encoder -> a generative model (no second body).
|
| 133 |
-
Done ONCE at import time so nothing heavy happens on the first GPU request. Only the new
|
| 134 |
-
bf16 lm_head is `.to('cuda')` (ZeroGPU defers it); the shared nf4 body is already moved by `pipe`."""
|
| 135 |
-
head = load_file(LM_HEAD_PATH)["lm_head.weight"] # [vocab, hidden] bf16
|
| 136 |
-
with init_empty_weights():
|
| 137 |
-
gen = Qwen3VLForConditionalGeneration(pipe.text_encoder.config)
|
| 138 |
-
gen.model = pipe.text_encoder # reuse the loaded (nf4) encoder body — no extra body in VRAM
|
| 139 |
-
lm = nn.Linear(head.shape[1], head.shape[0], bias=False)
|
| 140 |
-
with torch.no_grad():
|
| 141 |
-
lm.weight.copy_(head.to(torch.bfloat16))
|
| 142 |
-
gen.lm_head = lm.to(torch.bfloat16)
|
| 143 |
-
gen.lm_head.to("cuda") # ZeroGPU-deferred move of just the head
|
| 144 |
-
gen.eval()
|
| 145 |
-
lp = None
|
| 146 |
-
if OUTLINES_AVAILABLE:
|
| 147 |
-
ol_model = outlines.from_transformers(gen, upsampler_proc.tokenizer)
|
| 148 |
-
lp = outlines.Generator(ol_model, Caption).logits_processor # compiles schema->FSM now
|
| 149 |
-
return gen, lp
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
# Assemble the generative enhancer + structural constraint at STARTUP (not on first request).
|
| 153 |
-
try:
|
| 154 |
-
ENHANCER, LOGITS_PROCESSOR = _build_enhancer()
|
| 155 |
-
except Exception as e: # don't let a graft hiccup block the demo / the bf16 OOM test
|
| 156 |
-
print(f"[enhancer] graft failed, prompt upsampling disabled: {e!r}")
|
| 157 |
-
ENHANCER, LOGITS_PROCESSOR = None, None
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
def upsample_prompt(prompt: str, width: int, height: int, progress=None) -> str:
|
| 161 |
-
from math import gcd
|
| 162 |
-
from threading import Thread
|
| 163 |
-
from transformers import TextIteratorStreamer
|
| 164 |
-
|
| 165 |
-
gen = ENHANCER
|
| 166 |
-
d = gcd(width, height) or 1
|
| 167 |
-
aspect_ratio = f"{width // d}:{height // d}"
|
| 168 |
-
user = USER_TEMPLATE.replace("{{aspect_ratio}}", aspect_ratio).replace("{{original_prompt}}", prompt)
|
| 169 |
-
messages = [{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": user}]
|
| 170 |
-
inputs = upsampler_proc.apply_chat_template(
|
| 171 |
-
messages, add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True
|
| 172 |
-
).to(gen.device)
|
| 173 |
-
max_new = 1024
|
| 174 |
-
gen_kwargs = dict(max_new_tokens=max_new, do_sample=True, temperature=1.0, use_cache=True)
|
| 175 |
-
if LOGITS_PROCESSOR is not None:
|
| 176 |
-
LOGITS_PROCESSOR.reset()
|
| 177 |
-
gen_kwargs["logits_processor"] = [LOGITS_PROCESSOR]
|
| 178 |
-
|
| 179 |
-
if progress is None: # warmup path, no UI
|
| 180 |
-
out = gen.generate(**inputs, **gen_kwargs)
|
| 181 |
-
return upsampler_proc.batch_decode(out[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0].strip()
|
| 182 |
-
|
| 183 |
-
# stream tokens so the UI shows the upsampler working
|
| 184 |
-
streamer = TextIteratorStreamer(upsampler_proc.tokenizer, skip_prompt=True, skip_special_tokens=True)
|
| 185 |
-
gen_kwargs["streamer"] = streamer
|
| 186 |
-
thread = Thread(target=gen.generate, kwargs={**inputs, **gen_kwargs})
|
| 187 |
-
thread.start()
|
| 188 |
-
text, n = "", 0
|
| 189 |
-
for chunk in streamer:
|
| 190 |
-
text += chunk
|
| 191 |
-
n += 1
|
| 192 |
-
progress(min(n / max_new, 0.99), desc="✍️ Upsampling prompt…")
|
| 193 |
-
thread.join()
|
| 194 |
-
return text.strip()
|
| 195 |
-
|
| 196 |
-
|
| 197 |
@spaces.GPU(duration=240, size="xlarge")
|
| 198 |
-
def generate(
|
| 199 |
-
|
| 200 |
-
mode: str,
|
| 201 |
-
enhance: bool,
|
| 202 |
-
width: int,
|
| 203 |
-
height: int,
|
| 204 |
-
seed: int,
|
| 205 |
-
randomize_seed: bool,
|
| 206 |
-
progress=gr.Progress(track_tqdm=True),
|
| 207 |
-
):
|
| 208 |
if randomize_seed or seed < 0:
|
| 209 |
seed = random.randint(0, MAX_SEED)
|
| 210 |
|
| 211 |
-
|
|
|
|
|
|
|
| 212 |
|
| 213 |
final_prompt = prompt
|
| 214 |
-
if enhance:
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
final_prompt = upsample_prompt(prompt, int(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
|
| 219 |
progress(0.0, desc="🎨 Generating image…")
|
| 220 |
generator = torch.Generator(device="cuda").manual_seed(int(seed))
|
| 221 |
preset = MODES.get(mode, MODES["Default · 20 steps"])
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
generator=generator,
|
| 227 |
-
**preset,
|
| 228 |
-
).images[0]
|
| 229 |
try:
|
| 230 |
caption = json.loads(final_prompt)
|
| 231 |
except Exception:
|
|
@@ -235,50 +117,40 @@ def generate(
|
|
| 235 |
|
| 236 |
@spaces.GPU(size="xlarge")
|
| 237 |
def _warmup():
|
| 238 |
-
"""
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
upsample_prompt("a red apple on a wooden table", 1024, 1024)
|
| 245 |
-
print("[warmup] upsampler ready on GPU", flush=True)
|
| 246 |
|
| 247 |
|
| 248 |
try:
|
| 249 |
_warmup()
|
| 250 |
-
except Exception as e: # a flaky ZeroGPU worker
|
| 251 |
print(f"[warmup] failed (will warm lazily on first request): {e!r}", flush=True)
|
| 252 |
|
| 253 |
|
| 254 |
with gr.Blocks(theme=gr.themes.Citrus(), title="Ideogram 4 (NF4) — diffusers preview") as demo:
|
| 255 |
gr.Markdown(
|
| 256 |
"## Ideogram 4 (NF4) — diffusers preview\n"
|
| 257 |
-
f"Private demo of [`{MODEL_ID}`](https://huggingface.co/{MODEL_ID})
|
| 258 |
"[diffusers PR](https://github.com/huggingface/diffusers-new-model-addition-ideogram) branch, on ZeroGPU.\n"
|
| 259 |
-
"
|
| 260 |
-
"(the pipeline's own Qwen3-VL encoder + a grafted LM head + Outlines)
|
|
|
|
| 261 |
)
|
| 262 |
|
| 263 |
with gr.Row():
|
| 264 |
with gr.Column():
|
| 265 |
-
prompt = gr.Textbox(
|
| 266 |
-
|
| 267 |
-
value="A photo of a cat holding a sign that says hello world",
|
| 268 |
-
lines=3,
|
| 269 |
-
)
|
| 270 |
-
mode = gr.Radio(
|
| 271 |
-
choices=list(MODES.keys()),
|
| 272 |
-
value="Default · 20 steps",
|
| 273 |
-
label="Mode (speed ↔ quality)",
|
| 274 |
-
)
|
| 275 |
run = gr.Button("Generate", variant="primary")
|
| 276 |
with gr.Accordion("Advanced", open=False):
|
| 277 |
enhance = gr.Checkbox(
|
| 278 |
-
label="Prompt upsampling
|
| 279 |
value=True,
|
| 280 |
-
info="Rewrite the prompt into Ideogram's native JSON caption before generating."
|
| 281 |
-
+ ("" if OUTLINES_AVAILABLE else " ⚠ outlines not installed — runs unconstrained."),
|
| 282 |
)
|
| 283 |
with gr.Row():
|
| 284 |
width = gr.Slider(512, 2048, value=1024, step=64, label="Width")
|
|
|
|
| 3 |
|
| 4 |
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
|
| 5 |
|
| 6 |
+
# Bundled diffusers source: PR branch `ideogram4-prompt-enhancement` (YiYi's refactor + native upsampling).
|
| 7 |
_HERE = os.path.dirname(os.path.abspath(__file__))
|
| 8 |
sys.path.insert(0, os.path.join(_HERE, "diffusers_src", "src"))
|
| 9 |
|
| 10 |
import json
|
| 11 |
import random
|
| 12 |
+
import time
|
| 13 |
+
from threading import Thread
|
| 14 |
|
| 15 |
+
import gradio as gr
|
| 16 |
import spaces
|
| 17 |
import torch
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
from huggingface_hub import hf_hub_download
|
| 19 |
+
|
| 20 |
from diffusers import Ideogram4Pipeline
|
|
|
|
| 21 |
|
|
|
|
| 22 |
MODEL_ID = "diffusers-internal-dev/ideogram-4-nf4-v2"
|
|
|
|
| 23 |
LM_HEAD_REPO = "multimodalart/qwen3-vl-8b-instruct-lm-head"
|
|
|
|
|
|
|
|
|
|
| 24 |
AOTI_REPO = "multimodalart/i4-block-aoti"
|
| 25 |
+
AOTI_BLOCK_FILE = "Ideogram4TransformerBlock/package.pt2"
|
| 26 |
MAX_SEED = 2**31 - 1
|
| 27 |
|
| 28 |
+
# V4 presets (forward step-order: main CFG 7.0 -> polish 3.0).
|
| 29 |
MODES = {
|
| 30 |
"Turbo · 12 steps": dict(num_inference_steps=12, guidance_schedule=(7.0,) * 11 + (3.0,) * 1, mu=0.5, std=1.75),
|
| 31 |
"Default · 20 steps": dict(num_inference_steps=20, guidance_schedule=(7.0,) * 18 + (3.0,) * 2, mu=0.0, std=1.75),
|
| 32 |
"Quality · 48 steps": dict(num_inference_steps=48, guidance_schedule=(7.0,) * 45 + (3.0,) * 3, mu=0.0, std=1.5),
|
| 33 |
}
|
| 34 |
|
| 35 |
+
# --- Pipeline: dequantize both transformers nf4 -> bf16 in the parent (CPU) so every ZeroGPU fork inherits
|
| 36 |
+
# bf16 and AOTI can bind its weight-less graph to real weights. ---
|
| 37 |
+
t = time.perf_counter()
|
| 38 |
pipe = Ideogram4Pipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
|
|
|
|
|
|
|
|
|
|
| 39 |
pipe.transformer.dequantize()
|
| 40 |
pipe.unconditional_transformer.dequantize()
|
| 41 |
pipe.to("cuda")
|
| 42 |
+
print(f"[timing] pipeline load + dequant: {time.perf_counter() - t:.1f}s", flush=True)
|
| 43 |
|
| 44 |
+
# --- Native prompt enhancer (grafts the hosted LM head + builds the Outlines processor) at startup. ---
|
|
|
|
|
|
|
|
|
|
| 45 |
try:
|
| 46 |
+
t = time.perf_counter()
|
| 47 |
+
pipe.load_prompt_enhancer(lm_head_repo_id=LM_HEAD_REPO)
|
| 48 |
+
pipe._caption_model.lm_head.to("cuda") # ZeroGPU-deferred move of just the grafted head
|
| 49 |
+
ENHANCER_OK = True
|
| 50 |
+
print(f"[timing] load_prompt_enhancer: {time.perf_counter() - t:.1f}s", flush=True)
|
| 51 |
+
except Exception as e:
|
| 52 |
+
ENHANCER_OK = False
|
| 53 |
+
print(f"[enhancer] disabled: {e!r}", flush=True)
|
| 54 |
|
| 55 |
+
# Pre-fetch the AOTI package at startup so the in-worker patch is cache-only.
|
| 56 |
try:
|
| 57 |
+
hf_hub_download(AOTI_REPO, AOTI_BLOCK_FILE)
|
| 58 |
+
AOTI_OK = True
|
| 59 |
except Exception as e:
|
| 60 |
+
AOTI_OK = False
|
| 61 |
+
print(f"[aoti] prefetch failed, running eager: {e!r}", flush=True)
|
| 62 |
|
|
|
|
|
|
|
| 63 |
_AOTI_APPLIED = False
|
| 64 |
|
| 65 |
|
| 66 |
def _apply_aoti():
|
| 67 |
+
"""Patch the compiled block onto every Ideogram4TransformerBlock of both transformers (once per worker).
|
| 68 |
+
|
| 69 |
+
`aoti_blocks_load` is lazy (binds forward, defers the .so to first diffusion step) and CPU-only, so this is
|
| 70 |
+
safe to run in a background thread overlapping the (transformer-idle) upsampling step."""
|
| 71 |
global _AOTI_APPLIED
|
| 72 |
+
if _AOTI_APPLIED or not AOTI_OK:
|
| 73 |
return
|
| 74 |
try:
|
| 75 |
+
t = time.perf_counter()
|
| 76 |
+
spaces.aoti_blocks_load(pipe.transformer, AOTI_REPO)
|
| 77 |
+
spaces.aoti_blocks_load(pipe.unconditional_transformer, AOTI_REPO)
|
| 78 |
_AOTI_APPLIED = True
|
| 79 |
+
print(f"[timing] aoti_blocks_load (both transformers): {time.perf_counter() - t:.2f}s", flush=True)
|
| 80 |
+
except Exception as e: # never let a bind hiccup block generation
|
| 81 |
print(f"[aoti] apply failed, running eager: {e!r}", flush=True)
|
| 82 |
|
| 83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
@spaces.GPU(duration=240, size="xlarge")
|
| 85 |
+
def generate(prompt, mode, enhance, width, height, seed, randomize_seed, progress=gr.Progress(track_tqdm=True)):
|
| 86 |
+
t_enter = time.perf_counter()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
if randomize_seed or seed < 0:
|
| 88 |
seed = random.randint(0, MAX_SEED)
|
| 89 |
|
| 90 |
+
# Overlap the AOTI block-patch with upsampling: the transformer is idle while the text encoder runs.
|
| 91 |
+
aoti_thread = Thread(target=_apply_aoti, daemon=True)
|
| 92 |
+
aoti_thread.start()
|
| 93 |
|
| 94 |
final_prompt = prompt
|
| 95 |
+
if enhance and ENHANCER_OK:
|
| 96 |
+
progress(0.0, desc="✍️ Upsampling prompt…")
|
| 97 |
+
t = time.perf_counter()
|
| 98 |
+
final_prompt = pipe.upsample_prompt(prompt, height=int(height), width=int(width))[0]
|
| 99 |
+
print(f"[timing] upsample: {time.perf_counter() - t:.2f}s", flush=True)
|
| 100 |
+
|
| 101 |
+
aoti_thread.join() # ensure blocks are patched before the diffusion loop
|
| 102 |
+
print(f"[timing] pre-diffusion (enter -> ready): {time.perf_counter() - t_enter:.2f}s", flush=True)
|
| 103 |
|
| 104 |
progress(0.0, desc="🎨 Generating image…")
|
| 105 |
generator = torch.Generator(device="cuda").manual_seed(int(seed))
|
| 106 |
preset = MODES.get(mode, MODES["Default · 20 steps"])
|
| 107 |
+
t = time.perf_counter()
|
| 108 |
+
image = pipe(prompt=final_prompt, width=int(width), height=int(height), generator=generator, **preset).images[0]
|
| 109 |
+
print(f"[timing] diffusion ({mode}): {time.perf_counter() - t:.2f}s", flush=True)
|
| 110 |
+
|
|
|
|
|
|
|
|
|
|
| 111 |
try:
|
| 112 |
caption = json.loads(final_prompt)
|
| 113 |
except Exception:
|
|
|
|
| 117 |
|
| 118 |
@spaces.GPU(size="xlarge")
|
| 119 |
def _warmup():
|
| 120 |
+
"""Pay the AOTI patch + warm the upsampler on the startup worker (upsample only, no diffusion)."""
|
| 121 |
+
_apply_aoti()
|
| 122 |
+
if ENHANCER_OK:
|
| 123 |
+
t = time.perf_counter()
|
| 124 |
+
pipe.upsample_prompt("a red apple on a wooden table", height=1024, width=1024)
|
| 125 |
+
print(f"[timing] warmup upsample: {time.perf_counter() - t:.2f}s", flush=True)
|
|
|
|
|
|
|
| 126 |
|
| 127 |
|
| 128 |
try:
|
| 129 |
_warmup()
|
| 130 |
+
except Exception as e: # a flaky ZeroGPU worker must not take down the Space
|
| 131 |
print(f"[warmup] failed (will warm lazily on first request): {e!r}", flush=True)
|
| 132 |
|
| 133 |
|
| 134 |
with gr.Blocks(theme=gr.themes.Citrus(), title="Ideogram 4 (NF4) — diffusers preview") as demo:
|
| 135 |
gr.Markdown(
|
| 136 |
"## Ideogram 4 (NF4) — diffusers preview\n"
|
| 137 |
+
f"Private demo of [`{MODEL_ID}`](https://huggingface.co/{MODEL_ID}) on the "
|
| 138 |
"[diffusers PR](https://github.com/huggingface/diffusers-new-model-addition-ideogram) branch, on ZeroGPU.\n"
|
| 139 |
+
"**Prompt upsampling** rewrites your idea into Ideogram's native structured JSON caption "
|
| 140 |
+
"(the pipeline's own Qwen3-VL encoder + a grafted LM head + Outlines) via the native "
|
| 141 |
+
"`pipe.upsample_prompt`."
|
| 142 |
)
|
| 143 |
|
| 144 |
with gr.Row():
|
| 145 |
with gr.Column():
|
| 146 |
+
prompt = gr.Textbox(label="Prompt", value="A photo of a cat holding a sign that says hello world", lines=3)
|
| 147 |
+
mode = gr.Radio(choices=list(MODES.keys()), value="Default · 20 steps", label="Mode (speed ↔ quality)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
run = gr.Button("Generate", variant="primary")
|
| 149 |
with gr.Accordion("Advanced", open=False):
|
| 150 |
enhance = gr.Checkbox(
|
| 151 |
+
label="Prompt upsampling",
|
| 152 |
value=True,
|
| 153 |
+
info="Rewrite the prompt into Ideogram's native JSON caption before generating.",
|
|
|
|
| 154 |
)
|
| 155 |
with gr.Row():
|
| 156 |
width = gr.Slider(512, 2048, value=1024, step=64, label="Width")
|
requirements.txt
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
-
transformers
|
|
|
|
| 2 |
accelerate
|
| 3 |
bitsandbytes
|
| 4 |
sentencepiece
|
| 5 |
outlines
|
| 6 |
pydantic>=2
|
| 7 |
-
torchvision
|
|
|
|
| 1 |
+
transformers>=5.8
|
| 2 |
+
peft>=0.19
|
| 3 |
accelerate
|
| 4 |
bitsandbytes
|
| 5 |
sentencepiece
|
| 6 |
outlines
|
| 7 |
pydantic>=2
|
|
|
v6.txt
DELETED
|
@@ -1,55 +0,0 @@
|
|
| 1 |
-
[META]
|
| 2 |
-
frozen: false
|
| 3 |
-
description: v6 — v1's medium-flexible spirit, retuned for Qwen3-VL-8B with v2-v5 learnings (anti-transparent, anti subject-drop, density, valid JSON). Example-driven, open to any subject/medium.
|
| 4 |
-
|
| 5 |
-
[SYSTEM]
|
| 6 |
-
You convert a short user idea into a structured JSON caption for an image renderer. Output ONE minified single-line JSON object and NOTHING else (no markdown, no commentary).
|
| 7 |
-
|
| 8 |
-
SCHEMA — keys in this exact order:
|
| 9 |
-
{"high_level_description":"...","compositional_deconstruction":{"background":"...","elements":[ ... ]}}
|
| 10 |
-
- object element: {"type":"obj","desc":"..."}
|
| 11 |
-
- text element: {"type":"text","text":"VERBATIM CHARS","desc":"..."}
|
| 12 |
-
|
| 13 |
-
STEP 1 — PICK THE MEDIUM. It decides what `background` and `elements` mean. Honor any medium or style the user implies; default to photograph only when nothing else fits. Render ANY subject faithfully — real, fantastical, sci-fi, surreal, abstract — in the chosen medium.
|
| 14 |
-
|
| 15 |
-
A) DESIGNED ARTIFACT — poster, logo, album/book cover, flyer, banner, sticker, packaging, app icon, infographic, menu, card, wordmark. THE FRAME IS THE ARTIFACT — never a photo of it hanging in a room.
|
| 16 |
-
- high_level_description: name it as graphic design (e.g. "a minimalist jazz poster, flat graphic design...").
|
| 17 |
-
- background: the design's OWN backdrop only — a flat color, gradient, or simple texture filling the frame. No room, wall, floor, easel, depth, or camera/photo language.
|
| 18 |
-
- elements: the design's parts as a flat 2D layout — a `text` element for every headline/label (verbatim), `obj` elements for the central graphic/illustration/shapes/badges. Place by region (top / center / bottom).
|
| 19 |
-
|
| 20 |
-
B) SCENE — a photograph, illustration, painting, 3D render, anime frame, etc. of a real or imagined place or subject.
|
| 21 |
-
- high_level_description: one sentence naming the subject and the medium/style.
|
| 22 |
-
- background: the scene SHELL — surroundings, ground/sky/walls, atmosphere, ambient light; concrete and specific. The ground/floor/water surface lives here, never as an element.
|
| 23 |
-
- elements: the main subject FIRST as an `obj`, then supporting `obj` elements (props, secondary subjects) that plausibly belong. Add `text` elements only where the scene would really carry text (signs, labels, brands).
|
| 24 |
-
|
| 25 |
-
C) ABSTRACT / CONCEPTUAL — "nostalgia", "chaos and order", "sound waves", pure pattern. Concretize the idea into a deliberate visual composition.
|
| 26 |
-
- background: the dominant color field, gradient, or texture of the composition.
|
| 27 |
-
- elements: the shapes, forms, motifs, or symbolic objects that carry the concept, as `obj` elements. Add `text` only if the idea calls for words.
|
| 28 |
-
|
| 29 |
-
UNIVERSAL RULES (every medium):
|
| 30 |
-
1. The user's core subject/concept MUST appear among the elements (as an `obj`, normally first). Naming it only in high_level_description or background is NOT enough.
|
| 31 |
-
2. Commit to ONE concrete value each (one color, one style, one count). No hedging: ban "various", "such as", "e.g.", "or similar", "maybe", "X or Y" for one property.
|
| 32 |
-
3. NEVER use a transparent, empty, or plain white background UNLESS the user explicitly says "transparent", "isolated", "sticker", or "cutout".
|
| 33 |
-
4. A coherent subject (one animal, person, vehicle, object) is exactly ONE element; its parts go inside its `desc`. Use separate elements for genuinely separate subjects.
|
| 34 |
-
5. Each `desc` is 25-55 words, identity-first, standalone. Do not mention shadows, depth of field, bokeh, lens, focus, or grain.
|
| 35 |
-
6. high_level_description: one sentence, at most 40 words, starts with the subject, names the medium. Preserve non-ASCII characters as-is.
|
| 36 |
-
7. Output STRICTLY VALID JSON: double quotes around every key and string, NO trailing commas, each element object closes with "}" right after its last value.
|
| 37 |
-
8. Catch the "warm" impulse. Only when you are about to describe light as "warm", "golden", "amber", or "honey", stop and check: is there a specific physical source in the scene casting that colour (candle, sunset, lamp, neon, fire)? If YES, name the source and the colour it casts instead of the mood word. If NO, you are just reaching for warmth as ambience — drop it and leave the light neutral ("soft" or "even"). Don't recolour or relight anything else; this only intercepts the warm reach, every other scene and mood the user wants is untouched.
|
| 38 |
-
9. Describe physical reality, not impressions. Avoid mood-words — "luminous", "radiant", "vibrant", "lush", "dynamic", "gorgeous", "stunning", "breathtaking", "mesmerizing", and metaphorical "glowing" — they produce a generic AI look (the same trap as "warm"). Use observable properties: "the cheekbone catches a small highlight", not "luminous complexion".
|
| 39 |
-
10. Every named thing must appear as its own element. Each subject, object, sign, and quoted phrase the user names gets its own element — quoted text (single or double quotes) becomes its own verbatim `text` element. Count the named units in the prompt; the element list must hold at least that many. Don't drop or merge them.
|
| 40 |
-
11. Don't add what wasn't asked for. No glitch art, wireframe overlay, body fragmentation, double-exposure, "dissolving", or extra stylization unless the prompt requests it. Asked for a cinematic photo of a journalist → render that, not a glitch-art composite.
|
| 41 |
-
12. Name attributes concretely, anchored to landmarks. People: skin tone, hair (colour + style), each visible garment with colour, expression, pose, one distinguishing feature. Objects: shape, material, colour, a distinctive part. Place things against named references — "resting on the lower-right corner of the table", not "on the surface".
|
| 42 |
-
13. Name real references by name. If the user names a brand, product, character, place, or person (Nike Dunk Low, Spider-Man, the Eiffel Tower), keep that exact name in the `desc`; don't swap it for a generic look-alike unless they ask for an anonymous one.
|
| 43 |
-
14. "Professional photo/headshot" of a person means professional CONTEXT — neutral attire, soft even daylight, neutral backdrop, friendly expression — not dramatic studio gear; no heavy rim-light or creamy bokeh unless asked.
|
| 44 |
-
|
| 45 |
-
EXAMPLES
|
| 46 |
-
|
| 47 |
-
User idea: a cup of coffee on a table
|
| 48 |
-
Output: {"high_level_description":"A white ceramic cup of black coffee on a worn wooden cafe table, a casual overcast-daylight phone photograph with an off-center composition.","compositional_deconstruction":{"background":"Scratched oak cafe table filling the lower frame, a pale grey mortar-lined brick wall a few feet behind slightly out of focus, a tall window on the left spilling soft overcast daylight across the table, neutral white balance, muted brown and green tones.","elements":[{"type":"obj","desc":"White ceramic cup of black coffee with a thin curved handle turned to the right and a faint crema ring at the rim, resting on a matching round saucer near the center of the table, a thin wisp of steam at the surface."},{"type":"obj","desc":"Brushed-steel teaspoon lying on the saucer to the right of the cup, handle angled toward the lower-right corner, a single small water droplet on the bowl of the spoon."}]}}
|
| 49 |
-
|
| 50 |
-
User idea: a minimalist poster for a jazz festival
|
| 51 |
-
Output: {"high_level_description":"A minimalist jazz festival poster, flat graphic design with bold typography and a single abstract saxophone motif on a deep teal background.","compositional_deconstruction":{"background":"Solid deep teal background filling the entire frame with a subtle fine paper-grain texture and a thin mustard-yellow keyline border just inside the edges, no scene and no depth.","elements":[{"type":"obj","desc":"A large flat geometric saxophone in mustard yellow and cream, centered in the upper two-thirds, built from simple bold shapes with no shading, angled diagonally from lower-left to upper-right."},{"type":"text","text":"JAZZ\nFESTIVAL","desc":"Large bold condensed sans-serif headline in cream, stacked on two lines across the center of the poster, slightly overlapping the saxophone motif."},{"type":"text","text":"NOV 15 · CITY HALL","desc":"Small uppercase mustard-yellow caption centered near the bottom edge with wide letter spacing."}]}}
|
| 52 |
-
|
| 53 |
-
[USER]
|
| 54 |
-
TARGET IMAGE ASPECT RATIO: {{aspect_ratio}} (width:height).
|
| 55 |
-
User idea: {{original_prompt}}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|