multimodalart HF Staff committed on
Commit
87fa87f
·
verified ·
1 Parent(s): 168566a

Demo: native pipe.upsample_prompt + spaces.aoti_blocks_load (overlapped) + timing

Browse files
Files changed (3) hide show
  1. app.py +67 -195
  2. requirements.txt +2 -2
  3. v6.txt +0 -55
app.py CHANGED
@@ -3,229 +3,111 @@ import sys
3
 
4
  os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
5
 
6
- # Use the bundled diffusers source (PR #2: huggingface/diffusers-new-model-addition-ideogram).
7
  _HERE = os.path.dirname(os.path.abspath(__file__))
8
  sys.path.insert(0, os.path.join(_HERE, "diffusers_src", "src"))
9
 
10
  import json
11
  import random
12
- from typing import List, Literal, Union
 
13
 
 
14
  import spaces
15
  import torch
16
- import torch.nn as nn
17
- import gradio as gr
18
- from pydantic import BaseModel, Field
19
- from accelerate import init_empty_weights
20
  from huggingface_hub import hf_hub_download
21
- from safetensors.torch import load_file
22
  from diffusers import Ideogram4Pipeline
23
- from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
24
 
25
- # --- New (safety-fixed) checkpoint ---
26
  MODEL_ID = "diffusers-internal-dev/ideogram-4-nf4-v2"
27
- # Just the LM head, grafted onto the pipeline's own Qwen3-VL encoder to make it generative.
28
  LM_HEAD_REPO = "multimodalart/qwen3-vl-8b-instruct-lm-head"
29
- TOKENIZER_ID = "Qwen/Qwen3-VL-8B-Instruct" # processor/tokenizer only (no weights)
30
- # Precompiled (weight-less, dynamic L=16·k) AOTI package for one Ideogram4TransformerBlock.
31
- # Applied to all 68 block instances across both transformers; ~1.28x on 1024 turbo.
32
  AOTI_REPO = "multimodalart/i4-block-aoti"
33
-
34
  MAX_SEED = 2**31 - 1
35
 
36
- # --- Sampler modes (V4 presets, forward step-order: main CFG 7.0 -> polish 3.0) ---
37
  MODES = {
38
  "Turbo · 12 steps": dict(num_inference_steps=12, guidance_schedule=(7.0,) * 11 + (3.0,) * 1, mu=0.5, std=1.75),
39
  "Default · 20 steps": dict(num_inference_steps=20, guidance_schedule=(7.0,) * 18 + (3.0,) * 2, mu=0.0, std=1.75),
40
  "Quality · 48 steps": dict(num_inference_steps=48, guidance_schedule=(7.0,) * 45 + (3.0,) * 3, mu=0.0, std=1.5),
41
  }
42
 
43
- # --- Pipeline ---
 
 
44
  pipe = Ideogram4Pipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
45
- # Dequantize nf4 -> bf16 in the PARENT (CPU) context, BEFORE ZeroGPU forks/packs the model, so every
46
- # fork inherits bf16 (a fork-local dequant doesn't persist). bitsandbytes supports CPU 4-bit dequant.
47
- # This also gives AOTI real bf16 weights to bind to its (weight-less) compiled graph.
48
  pipe.transformer.dequantize()
49
  pipe.unconditional_transformer.dequantize()
50
  pipe.to("cuda")
 
51
 
52
- # --- Upsampler tokenizer + pre-fetched LM head (graft done lazily on GPU) ---
53
- upsampler_proc = AutoProcessor.from_pretrained(TOKENIZER_ID)
54
- LM_HEAD_PATH = hf_hub_download(LM_HEAD_REPO, "lm_head.safetensors") # cached at startup
55
-
56
  try:
57
- import outlines
58
- OUTLINES_AVAILABLE = True
59
- except Exception:
60
- OUTLINES_AVAILABLE = False
 
 
 
 
61
 
62
- # Pre-fetch the AOTI package at startup (CPU/parent); the actual .so bind happens on-GPU (per worker).
63
  try:
64
- from huggingface_hub import snapshot_download
65
- AOTI_DIR = snapshot_download(AOTI_REPO, repo_type="model")
66
  except Exception as e:
67
- print(f"[aoti] package fetch failed, running eager: {e!r}", flush=True)
68
- AOTI_DIR = None
69
 
70
- # Each ZeroGPU worker re-forks from the parent and must bind the compiled .so itself; guard so it
71
- # happens exactly once per process.
72
  _AOTI_APPLIED = False
73
 
74
 
75
  def _apply_aoti():
76
- """Bind the precompiled block to both transformers (once per GPU worker). Must run inside @spaces.GPU."""
 
 
 
77
  global _AOTI_APPLIED
78
- if _AOTI_APPLIED or AOTI_DIR is None:
79
  return
80
  try:
81
- spaces.aoti_load_from_package_dir(pipe.transformer, AOTI_DIR)
82
- spaces.aoti_load_from_package_dir(pipe.unconditional_transformer, AOTI_DIR)
 
83
  _AOTI_APPLIED = True
84
- print("[aoti] compiled block applied to both transformers", flush=True)
85
- except Exception as e: # never let a compile-bind hiccup block generation — fall back to eager
86
  print(f"[aoti] apply failed, running eager: {e!r}", flush=True)
87
 
88
 
89
- # --- Caption schema (matches Ideogram's native caption / caption_verifier) ---
90
- class ObjElement(BaseModel):
91
- type: Literal["obj"]
92
- desc: str
93
-
94
-
95
- class TextElement(BaseModel):
96
- type: Literal["text"]
97
- text: str
98
- desc: str
99
-
100
-
101
- class Composition(BaseModel):
102
- background: str
103
- elements: List[Union[ObjElement, TextElement]] = Field(min_length=1)
104
-
105
-
106
- class Caption(BaseModel):
107
- high_level_description: str
108
- compositional_deconstruction: Composition
109
-
110
-
111
- def _load_sections(path):
112
- sections, cur, buf = {}, None, []
113
- for line in open(path, encoding="utf-8").read().splitlines():
114
- s = line.strip()
115
- if s.startswith("[") and s.endswith("]") and " " not in s:
116
- if cur is not None:
117
- sections[cur] = "\n".join(buf).strip()
118
- cur, buf = s[1:-1].lower(), []
119
- else:
120
- buf.append(line)
121
- if cur is not None:
122
- sections[cur] = "\n".join(buf).strip()
123
- return sections
124
-
125
-
126
- _SEC = _load_sections(os.path.join(_HERE, "v6.txt"))
127
- SYSTEM_PROMPT = _SEC["system"]
128
- USER_TEMPLATE = _SEC.get("user", "User idea: {{original_prompt}}")
129
-
130
-
131
- def _build_enhancer():
132
- """Graft the hosted lm_head onto pipe.text_encoder -> a generative model (no second body).
133
- Done ONCE at import time so nothing heavy happens on the first GPU request. Only the new
134
- bf16 lm_head is `.to('cuda')` (ZeroGPU defers it); the shared nf4 body is already moved by `pipe`."""
135
- head = load_file(LM_HEAD_PATH)["lm_head.weight"] # [vocab, hidden] bf16
136
- with init_empty_weights():
137
- gen = Qwen3VLForConditionalGeneration(pipe.text_encoder.config)
138
- gen.model = pipe.text_encoder # reuse the loaded (nf4) encoder body — no extra body in VRAM
139
- lm = nn.Linear(head.shape[1], head.shape[0], bias=False)
140
- with torch.no_grad():
141
- lm.weight.copy_(head.to(torch.bfloat16))
142
- gen.lm_head = lm.to(torch.bfloat16)
143
- gen.lm_head.to("cuda") # ZeroGPU-deferred move of just the head
144
- gen.eval()
145
- lp = None
146
- if OUTLINES_AVAILABLE:
147
- ol_model = outlines.from_transformers(gen, upsampler_proc.tokenizer)
148
- lp = outlines.Generator(ol_model, Caption).logits_processor # compiles schema->FSM now
149
- return gen, lp
150
-
151
-
152
- # Assemble the generative enhancer + structural constraint at STARTUP (not on first request).
153
- try:
154
- ENHANCER, LOGITS_PROCESSOR = _build_enhancer()
155
- except Exception as e: # don't let a graft hiccup block the demo / the bf16 OOM test
156
- print(f"[enhancer] graft failed, prompt upsampling disabled: {e!r}")
157
- ENHANCER, LOGITS_PROCESSOR = None, None
158
-
159
-
160
- def upsample_prompt(prompt: str, width: int, height: int, progress=None) -> str:
161
- from math import gcd
162
- from threading import Thread
163
- from transformers import TextIteratorStreamer
164
-
165
- gen = ENHANCER
166
- d = gcd(width, height) or 1
167
- aspect_ratio = f"{width // d}:{height // d}"
168
- user = USER_TEMPLATE.replace("{{aspect_ratio}}", aspect_ratio).replace("{{original_prompt}}", prompt)
169
- messages = [{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": user}]
170
- inputs = upsampler_proc.apply_chat_template(
171
- messages, add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True
172
- ).to(gen.device)
173
- max_new = 1024
174
- gen_kwargs = dict(max_new_tokens=max_new, do_sample=True, temperature=1.0, use_cache=True)
175
- if LOGITS_PROCESSOR is not None:
176
- LOGITS_PROCESSOR.reset()
177
- gen_kwargs["logits_processor"] = [LOGITS_PROCESSOR]
178
-
179
- if progress is None: # warmup path, no UI
180
- out = gen.generate(**inputs, **gen_kwargs)
181
- return upsampler_proc.batch_decode(out[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0].strip()
182
-
183
- # stream tokens so the UI shows the upsampler working
184
- streamer = TextIteratorStreamer(upsampler_proc.tokenizer, skip_prompt=True, skip_special_tokens=True)
185
- gen_kwargs["streamer"] = streamer
186
- thread = Thread(target=gen.generate, kwargs={**inputs, **gen_kwargs})
187
- thread.start()
188
- text, n = "", 0
189
- for chunk in streamer:
190
- text += chunk
191
- n += 1
192
- progress(min(n / max_new, 0.99), desc="✍️ Upsampling prompt…")
193
- thread.join()
194
- return text.strip()
195
-
196
-
197
  @spaces.GPU(duration=240, size="xlarge")
198
- def generate(
199
- prompt: str,
200
- mode: str,
201
- enhance: bool,
202
- width: int,
203
- height: int,
204
- seed: int,
205
- randomize_seed: bool,
206
- progress=gr.Progress(track_tqdm=True),
207
- ):
208
  if randomize_seed or seed < 0:
209
  seed = random.randint(0, MAX_SEED)
210
 
211
- _apply_aoti() # bind compiled blocks on this worker (no-op after first call)
 
 
212
 
213
  final_prompt = prompt
214
- if enhance:
215
- if not OUTLINES_AVAILABLE:
216
- gr.Warning("`outlines` is not installed — upsampling without structural constraints.")
217
- final_prompt = upsample_prompt(prompt, int(width), int(height), progress=progress)
 
 
 
 
218
 
219
  progress(0.0, desc="🎨 Generating image…")
220
  generator = torch.Generator(device="cuda").manual_seed(int(seed))
221
  preset = MODES.get(mode, MODES["Default · 20 steps"])
222
- image = pipe(
223
- prompt=final_prompt,
224
- width=int(width),
225
- height=int(height),
226
- generator=generator,
227
- **preset,
228
- ).images[0]
229
  try:
230
  caption = json.loads(final_prompt)
231
  except Exception:
@@ -235,50 +117,40 @@ def generate(
235
 
236
  @spaces.GPU(size="xlarge")
237
  def _warmup():
238
- """Preload the upsampler onto GPU and warm it at STARTUP (graft move + Outlines FSM + first-token JIT).
239
- NOTE: runtime nf4->bf16 dequant is intentionally NOT done here — it does not persist across ZeroGPU
240
- forks (each request re-forks from the nf4 parent process), so bf16+speed will come from a precompiled
241
- AOTI artifact instead."""
242
- _apply_aoti() # bind compiled blocks + trigger the lazy per-worker .so load
243
- if ENHANCER is not None:
244
- upsample_prompt("a red apple on a wooden table", 1024, 1024)
245
- print("[warmup] upsampler ready on GPU", flush=True)
246
 
247
 
248
  try:
249
  _warmup()
250
- except Exception as e: # a flaky ZeroGPU worker (e.g. ECC) must not take down the Space
251
  print(f"[warmup] failed (will warm lazily on first request): {e!r}", flush=True)
252
 
253
 
254
  with gr.Blocks(theme=gr.themes.Citrus(), title="Ideogram 4 (NF4) — diffusers preview") as demo:
255
  gr.Markdown(
256
  "## Ideogram 4 (NF4) — diffusers preview\n"
257
- f"Private demo of [`{MODEL_ID}`](https://huggingface.co/{MODEL_ID}) using the "
258
  "[diffusers PR](https://github.com/huggingface/diffusers-new-model-addition-ideogram) branch, on ZeroGPU.\n"
259
- "Toggle **Prompt upsampling** in Advanced to rewrite your idea into Ideogram's native structured caption "
260
- "(the pipeline's own Qwen3-VL encoder + a grafted LM head + Outlines)."
 
261
  )
262
 
263
  with gr.Row():
264
  with gr.Column():
265
- prompt = gr.Textbox(
266
- label="Prompt",
267
- value="A photo of a cat holding a sign that says hello world",
268
- lines=3,
269
- )
270
- mode = gr.Radio(
271
- choices=list(MODES.keys()),
272
- value="Default · 20 steps",
273
- label="Mode (speed ↔ quality)",
274
- )
275
  run = gr.Button("Generate", variant="primary")
276
  with gr.Accordion("Advanced", open=False):
277
  enhance = gr.Checkbox(
278
- label="Prompt upsampling (Outlines)",
279
  value=True,
280
- info="Rewrite the prompt into Ideogram's native JSON caption before generating."
281
- + ("" if OUTLINES_AVAILABLE else " ⚠ outlines not installed — runs unconstrained."),
282
  )
283
  with gr.Row():
284
  width = gr.Slider(512, 2048, value=1024, step=64, label="Width")
 
3
 
4
  os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
5
 
6
+ # Bundled diffusers source: PR branch `ideogram4-prompt-enhancement` (YiYi's refactor + native upsampling).
7
  _HERE = os.path.dirname(os.path.abspath(__file__))
8
  sys.path.insert(0, os.path.join(_HERE, "diffusers_src", "src"))
9
 
10
  import json
11
  import random
12
+ import time
13
+ from threading import Thread
14
 
15
+ import gradio as gr
16
  import spaces
17
  import torch
 
 
 
 
18
  from huggingface_hub import hf_hub_download
19
+
20
  from diffusers import Ideogram4Pipeline
 
21
 
 
22
  MODEL_ID = "diffusers-internal-dev/ideogram-4-nf4-v2"
 
23
  LM_HEAD_REPO = "multimodalart/qwen3-vl-8b-instruct-lm-head"
 
 
 
24
  AOTI_REPO = "multimodalart/i4-block-aoti"
25
+ AOTI_BLOCK_FILE = "Ideogram4TransformerBlock/package.pt2"
26
  MAX_SEED = 2**31 - 1
27
 
28
+ # V4 presets (forward step-order: main CFG 7.0 -> polish 3.0).
29
  MODES = {
30
  "Turbo · 12 steps": dict(num_inference_steps=12, guidance_schedule=(7.0,) * 11 + (3.0,) * 1, mu=0.5, std=1.75),
31
  "Default · 20 steps": dict(num_inference_steps=20, guidance_schedule=(7.0,) * 18 + (3.0,) * 2, mu=0.0, std=1.75),
32
  "Quality · 48 steps": dict(num_inference_steps=48, guidance_schedule=(7.0,) * 45 + (3.0,) * 3, mu=0.0, std=1.5),
33
  }
34
 
35
+ # --- Pipeline: dequantize both transformers nf4 -> bf16 in the parent (CPU) so every ZeroGPU fork inherits
36
+ # bf16 and AOTI can bind its weight-less graph to real weights. ---
37
+ t = time.perf_counter()
38
  pipe = Ideogram4Pipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
 
 
 
39
  pipe.transformer.dequantize()
40
  pipe.unconditional_transformer.dequantize()
41
  pipe.to("cuda")
42
+ print(f"[timing] pipeline load + dequant: {time.perf_counter() - t:.1f}s", flush=True)
43
 
44
+ # --- Native prompt enhancer (grafts the hosted LM head + builds the Outlines processor) at startup. ---
 
 
 
45
  try:
46
+ t = time.perf_counter()
47
+ pipe.load_prompt_enhancer(lm_head_repo_id=LM_HEAD_REPO)
48
+ pipe._caption_model.lm_head.to("cuda") # ZeroGPU-deferred move of just the grafted head
49
+ ENHANCER_OK = True
50
+ print(f"[timing] load_prompt_enhancer: {time.perf_counter() - t:.1f}s", flush=True)
51
+ except Exception as e:
52
+ ENHANCER_OK = False
53
+ print(f"[enhancer] disabled: {e!r}", flush=True)
54
 
55
+ # Pre-fetch the AOTI package at startup so the in-worker patch is cache-only.
56
  try:
57
+ hf_hub_download(AOTI_REPO, AOTI_BLOCK_FILE)
58
+ AOTI_OK = True
59
  except Exception as e:
60
+ AOTI_OK = False
61
+ print(f"[aoti] prefetch failed, running eager: {e!r}", flush=True)
62
 
 
 
63
  _AOTI_APPLIED = False
64
 
65
 
66
  def _apply_aoti():
67
+ """Patch the compiled block onto every Ideogram4TransformerBlock of both transformers (once per worker).
68
+
69
+ `aoti_blocks_load` is lazy (binds forward, defers the .so to first diffusion step) and CPU-only, so this is
70
+ safe to run in a background thread overlapping the (transformer-idle) upsampling step."""
71
  global _AOTI_APPLIED
72
+ if _AOTI_APPLIED or not AOTI_OK:
73
  return
74
  try:
75
+ t = time.perf_counter()
76
+ spaces.aoti_blocks_load(pipe.transformer, AOTI_REPO)
77
+ spaces.aoti_blocks_load(pipe.unconditional_transformer, AOTI_REPO)
78
  _AOTI_APPLIED = True
79
+ print(f"[timing] aoti_blocks_load (both transformers): {time.perf_counter() - t:.2f}s", flush=True)
80
+ except Exception as e: # never let a bind hiccup block generation
81
  print(f"[aoti] apply failed, running eager: {e!r}", flush=True)
82
 
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  @spaces.GPU(duration=240, size="xlarge")
85
+ def generate(prompt, mode, enhance, width, height, seed, randomize_seed, progress=gr.Progress(track_tqdm=True)):
86
+ t_enter = time.perf_counter()
 
 
 
 
 
 
 
 
87
  if randomize_seed or seed < 0:
88
  seed = random.randint(0, MAX_SEED)
89
 
90
+ # Overlap the AOTI block-patch with upsampling: the transformer is idle while the text encoder runs.
91
+ aoti_thread = Thread(target=_apply_aoti, daemon=True)
92
+ aoti_thread.start()
93
 
94
  final_prompt = prompt
95
+ if enhance and ENHANCER_OK:
96
+ progress(0.0, desc="✍️ Upsampling prompt…")
97
+ t = time.perf_counter()
98
+ final_prompt = pipe.upsample_prompt(prompt, height=int(height), width=int(width))[0]
99
+ print(f"[timing] upsample: {time.perf_counter() - t:.2f}s", flush=True)
100
+
101
+ aoti_thread.join() # ensure blocks are patched before the diffusion loop
102
+ print(f"[timing] pre-diffusion (enter -> ready): {time.perf_counter() - t_enter:.2f}s", flush=True)
103
 
104
  progress(0.0, desc="🎨 Generating image…")
105
  generator = torch.Generator(device="cuda").manual_seed(int(seed))
106
  preset = MODES.get(mode, MODES["Default · 20 steps"])
107
+ t = time.perf_counter()
108
+ image = pipe(prompt=final_prompt, width=int(width), height=int(height), generator=generator, **preset).images[0]
109
+ print(f"[timing] diffusion ({mode}): {time.perf_counter() - t:.2f}s", flush=True)
110
+
 
 
 
111
  try:
112
  caption = json.loads(final_prompt)
113
  except Exception:
 
117
 
118
  @spaces.GPU(size="xlarge")
119
  def _warmup():
120
+ """Pay the AOTI patch + warm the upsampler on the startup worker (upsample only, no diffusion)."""
121
+ _apply_aoti()
122
+ if ENHANCER_OK:
123
+ t = time.perf_counter()
124
+ pipe.upsample_prompt("a red apple on a wooden table", height=1024, width=1024)
125
+ print(f"[timing] warmup upsample: {time.perf_counter() - t:.2f}s", flush=True)
 
 
126
 
127
 
128
  try:
129
  _warmup()
130
+ except Exception as e: # a flaky ZeroGPU worker must not take down the Space
131
  print(f"[warmup] failed (will warm lazily on first request): {e!r}", flush=True)
132
 
133
 
134
  with gr.Blocks(theme=gr.themes.Citrus(), title="Ideogram 4 (NF4) — diffusers preview") as demo:
135
  gr.Markdown(
136
  "## Ideogram 4 (NF4) — diffusers preview\n"
137
+ f"Private demo of [`{MODEL_ID}`](https://huggingface.co/{MODEL_ID}) on the "
138
  "[diffusers PR](https://github.com/huggingface/diffusers-new-model-addition-ideogram) branch, on ZeroGPU.\n"
139
+ "**Prompt upsampling** rewrites your idea into Ideogram's native structured JSON caption "
140
+ "(the pipeline's own Qwen3-VL encoder + a grafted LM head + Outlines) via the native "
141
+ "`pipe.upsample_prompt`."
142
  )
143
 
144
  with gr.Row():
145
  with gr.Column():
146
+ prompt = gr.Textbox(label="Prompt", value="A photo of a cat holding a sign that says hello world", lines=3)
147
+ mode = gr.Radio(choices=list(MODES.keys()), value="Default · 20 steps", label="Mode (speed ↔ quality)")
 
 
 
 
 
 
 
 
148
  run = gr.Button("Generate", variant="primary")
149
  with gr.Accordion("Advanced", open=False):
150
  enhance = gr.Checkbox(
151
+ label="Prompt upsampling",
152
  value=True,
153
+ info="Rewrite the prompt into Ideogram's native JSON caption before generating.",
 
154
  )
155
  with gr.Row():
156
  width = gr.Slider(512, 2048, value=1024, step=64, label="Width")
requirements.txt CHANGED
@@ -1,7 +1,7 @@
1
- transformers
 
2
  accelerate
3
  bitsandbytes
4
  sentencepiece
5
  outlines
6
  pydantic>=2
7
- torchvision
 
1
+ transformers>=5.8
2
+ peft>=0.19
3
  accelerate
4
  bitsandbytes
5
  sentencepiece
6
  outlines
7
  pydantic>=2
 
v6.txt DELETED
@@ -1,55 +0,0 @@
1
- [META]
2
- frozen: false
3
- description: v6 — v1's medium-flexible spirit, retuned for Qwen3-VL-8B with v2-v5 learnings (anti-transparent, anti-subject-drop, density, valid JSON). Example-driven, open to any subject/medium.
4
-
5
- [SYSTEM]
6
- You convert a short user idea into a structured JSON caption for an image renderer. Output ONE minified single-line JSON object and NOTHING else (no markdown, no commentary).
7
-
8
- SCHEMA — keys in this exact order:
9
- {"high_level_description":"...","compositional_deconstruction":{"background":"...","elements":[ ... ]}}
10
- - object element: {"type":"obj","desc":"..."}
11
- - text element: {"type":"text","text":"VERBATIM CHARS","desc":"..."}
12
-
13
- STEP 1 — PICK THE MEDIUM. It decides what `background` and `elements` mean. Honor any medium or style the user implies; default to photograph only when nothing else fits. Render ANY subject faithfully — real, fantastical, sci-fi, surreal, abstract — in the chosen medium.
14
-
15
- A) DESIGNED ARTIFACT — poster, logo, album/book cover, flyer, banner, sticker, packaging, app icon, infographic, menu, card, wordmark. THE FRAME IS THE ARTIFACT — never a photo of it hanging in a room.
16
- - high_level_description: name it as graphic design (e.g. "a minimalist jazz poster, flat graphic design...").
17
- - background: the design's OWN backdrop only — a flat color, gradient, or simple texture filling the frame. No room, wall, floor, easel, depth, or camera/photo language.
18
- - elements: the design's parts as a flat 2D layout — a `text` element for every headline/label (verbatim), `obj` elements for the central graphic/illustration/shapes/badges. Place by region (top / center / bottom).
19
-
20
- B) SCENE — a photograph, illustration, painting, 3D render, anime frame, etc. of a real or imagined place or subject.
21
- - high_level_description: one sentence naming the subject and the medium/style.
22
- - background: the scene SHELL — surroundings, ground/sky/walls, atmosphere, ambient light; concrete and specific. The ground/floor/water surface lives here, never as an element.
23
- - elements: the main subject FIRST as an `obj`, then supporting `obj` elements (props, secondary subjects) that plausibly belong. Add `text` elements only where the scene would really carry text (signs, labels, brands).
24
-
25
- C) ABSTRACT / CONCEPTUAL — "nostalgia", "chaos and order", "sound waves", pure pattern. Concretize the idea into a deliberate visual composition.
26
- - background: the dominant color field, gradient, or texture of the composition.
27
- - elements: the shapes, forms, motifs, or symbolic objects that carry the concept, as `obj` elements. Add `text` only if the idea calls for words.
28
-
29
- UNIVERSAL RULES (every medium):
30
- 1. The user's core subject/concept MUST appear among the elements (as an `obj`, normally first). Naming it only in high_level_description or background is NOT enough.
31
- 2. Commit to ONE concrete value each (one color, one style, one count). No hedging: ban "various", "such as", "e.g.", "or similar", "maybe", "X or Y" for one property.
32
- 3. NEVER use a transparent, empty, or plain white background UNLESS the user explicitly says "transparent", "isolated", "sticker", or "cutout".
33
- 4. A coherent subject (one animal, person, vehicle, object) is exactly ONE element; its parts go inside its `desc`. Use separate elements for genuinely separate subjects.
34
- 5. Each `desc` is 25-55 words, identity-first, standalone. Do not mention shadows, depth of field, bokeh, lens, focus, or grain.
35
- 6. high_level_description: one sentence, at most 40 words, starts with the subject, names the medium. Preserve non-ASCII characters as-is.
36
- 7. Output STRICTLY VALID JSON: double quotes around every key and string, NO trailing commas, each element object closes with "}" right after its last value.
37
- 8. Catch the "warm" impulse. Whenever you are about to describe light as "warm", "golden", "amber", or "honey", stop and check: is there a specific physical source in the scene casting that colour (candle, sunset, lamp, neon, fire)? If YES, name the source and the colour it casts instead of the mood word. If NO, you are just reaching for warmth as ambience — drop it and leave the light neutral ("soft" or "even"). Don't recolour or relight anything else: this rule only intercepts the warm reach, and every other scene and mood the user wants stays untouched.
38
- 9. Describe physical reality, not impressions. Avoid mood-words — "luminous", "radiant", "vibrant", "lush", "dynamic", "gorgeous", "stunning", "breathtaking", "mesmerizing", and metaphorical "glowing" — they produce a generic AI look (the same trap as "warm"). Use observable properties: "the cheekbone catches a small highlight", not "luminous complexion".
39
- 10. Every named thing must appear as its own element. Each subject, object, sign, and quoted phrase the user names gets its own element — quoted text (single or double quotes) becomes its own verbatim `text` element. Count the named units in the prompt; the element list must hold at least that many. Don't drop or merge them.
40
- 11. Don't add what wasn't asked for. No glitch art, wireframe overlay, body fragmentation, double-exposure, "dissolving", or extra stylization unless the prompt requests it. Asked for a cinematic photo of a journalist → render that, not a glitch-art composite.
41
- 12. Name attributes concretely, anchored to landmarks. People: skin tone, hair (colour + style), each visible garment with colour, expression, pose, one distinguishing feature. Objects: shape, material, colour, a distinctive part. Place things against named references — "resting on the lower-right corner of the table", not "on the surface".
42
- 13. Name real references by name. If the user names a brand, product, character, place, or person (Nike Dunk Low, Spider-Man, the Eiffel Tower), keep that exact name in the `desc`; don't swap it for a generic look-alike unless they ask for an anonymous one.
43
- 14. "Professional photo/headshot" of a person means professional CONTEXT — neutral attire, soft even daylight, neutral backdrop, friendly expression — not dramatic studio gear; no heavy rim-light or creamy bokeh unless asked.
44
-
45
- EXAMPLES
46
-
47
- User idea: a cup of coffee on a table
48
- Output: {"high_level_description":"A white ceramic cup of black coffee on a worn wooden cafe table, a casual overcast-daylight phone photograph with an off-center composition.","compositional_deconstruction":{"background":"Scratched oak cafe table filling the lower frame, a pale grey mortar-lined brick wall a few feet behind slightly out of focus, a tall window on the left spilling soft overcast daylight across the table, neutral white balance, muted brown and green tones.","elements":[{"type":"obj","desc":"White ceramic cup of black coffee with a thin curved handle turned to the right and a faint crema ring at the rim, resting on a matching round saucer near the center of the table, a thin wisp of steam at the surface."},{"type":"obj","desc":"Brushed-steel teaspoon lying on the saucer to the right of the cup, handle angled toward the lower-right corner, a single small water droplet on the bowl of the spoon."}]}}
49
-
50
- User idea: a minimalist poster for a jazz festival
51
- Output: {"high_level_description":"A minimalist jazz festival poster, flat graphic design with bold typography and a single abstract saxophone motif on a deep teal background.","compositional_deconstruction":{"background":"Solid deep teal background filling the entire frame with a subtle fine paper-grain texture and a thin mustard-yellow keyline border just inside the edges, no scene and no depth.","elements":[{"type":"obj","desc":"A large flat geometric saxophone in mustard yellow and cream, centered in the upper two-thirds, built from simple bold shapes with no shading, angled diagonally from lower-left to upper-right."},{"type":"text","text":"JAZZ\nFESTIVAL","desc":"Large bold condensed sans-serif headline in cream, stacked on two lines across the center of the poster, slightly overlapping the saxophone motif."},{"type":"text","text":"NOV 15 · CITY HALL","desc":"Small uppercase mustard-yellow caption centered near the bottom edge with wide letter spacing."}]}}
52
-
53
- [USER]
54
- TARGET IMAGE ASPECT RATIO: {{aspect_ratio}} (width:height).
55
- User idea: {{original_prompt}}