Spaces:

ideogram-ai
/

ideogram4

Running on Zero

App Files Files Community

multimodalart HF Staff committed on Jun 3

Commit

49e5d58

verified ·

1 Parent(s): 60f16ec

Add prompt upsampling (Qwen3-VL+Outlines), 3 modes, v2 checkpoint, Citrus theme: app.py

Browse files

Files changed (1) hide show

app.py +138 -31

app.py CHANGED Viewed

@@ -8,61 +8,157 @@ _HERE = os.path.dirname(os.path.abspath(__file__))
 sys.path.insert(0, os.path.join(_HERE, "diffusers_src", "src"))
 import random
 import spaces
 import torch
 import gradio as gr
 from diffusers import Ideogram4Pipeline
-MODEL_ID = "diffusers-internal-dev/ideogram-4-nf4"
 pipe = Ideogram4Pipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
 pipe.to("cuda")
-MAX_SEED = 2**31 - 1
-@spaces.GPU(duration=180)
 def generate(
     prompt: str,
     width: int,
     height: int,
-    num_inference_steps: int,
-    guidance_scale: float,
     seed: int,
     randomize_seed: bool,
     progress=gr.Progress(track_tqdm=True),
 ):
     if randomize_seed or seed < 0:
         seed = random.randint(0, MAX_SEED)
-    generator = torch.Generator(device="cuda").manual_seed(int(seed))
-    steps = int(num_inference_steps)
-    kwargs = dict(
-        prompt=prompt,
         width=int(width),
         height=int(height),
-        num_inference_steps=steps,
         generator=generator,
-    )
-    if guidance_scale > 0:
-        kwargs["guidance_scale"] = float(guidance_scale)
-        kwargs["guidance_schedule"] = None
-    else:
-        # PR default is len 48 (7.0 x45 + 3.0 x3); rebuild it for any step count.
-        tail = min(3, max(0, steps - 1))
-        kwargs["guidance_schedule"] = (7.0,) * (steps - tail) + (3.0,) * tail
-    image = pipe(**kwargs).images[0]
-    return image, seed
-with gr.Blocks(title="Ideogram 4 (NF4) — diffusers preview") as demo:
     gr.Markdown(
         "## Ideogram 4 (NF4) — diffusers preview\n"
         f"Private demo of [`{MODEL_ID}`](https://huggingface.co/{MODEL_ID}) using the "
-        "[diffusers PR #2](https://github.com/huggingface/diffusers-new-model-addition-ideogram/pull/2) "
-        "branch, running on ZeroGPU."
     )
     with gr.Row():
@@ -72,27 +168,38 @@ with gr.Blocks(title="Ideogram 4 (NF4) — diffusers preview") as demo:
                 value="A photo of a cat holding a sign that says hello world",
                 lines=3,
             )
             run = gr.Button("Generate", variant="primary")
             with gr.Accordion("Advanced", open=False):
                 with gr.Row():
                     width = gr.Slider(512, 2048, value=1024, step=64, label="Width")
                     height = gr.Slider(512, 2048, value=1024, step=64, label="Height")
-                steps = gr.Slider(8, 64, value=48, step=1, label="Inference steps")
-                guidance = gr.Slider(
-                    0.0, 15.0, value=0.0, step=0.1,
-                    label="Guidance scale (0 = recommended schedule: 7.0 → 3.0)",
-                )
                 with gr.Row():
                     seed = gr.Number(label="Seed", value=0, precision=0)
                     randomize = gr.Checkbox(label="Randomize seed", value=True)
         with gr.Column():
             out_image = gr.Image(label="Output", type="pil")
             out_seed = gr.Number(label="Seed used", interactive=False, precision=0)
     run.click(
         generate,
-        inputs=[prompt, width, height, steps, guidance, seed, randomize],
-        outputs=[out_image, out_seed],
     )
 demo.queue().launch()

 sys.path.insert(0, os.path.join(_HERE, "diffusers_src", "src"))
 import random
+from typing import List, Literal, Union
 import spaces
 import torch
 import gradio as gr
+from pydantic import BaseModel, Field
 from diffusers import Ideogram4Pipeline
+from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
+# --- New (safety-fixed) checkpoint ---
+MODEL_ID = "diffusers-internal-dev/ideogram-4-nf4-v2"
+# Generative sibling of the text encoder, used for prompt upsampling.
+UPSAMPLER_ID = "Qwen/Qwen3-VL-8B-Instruct"
+MAX_SEED = 2**31 - 1
+# --- Sampler modes (V4 presets, forward step-order: main CFG 7.0 -> polish 3.0) ---
+# Maps the UI radio label to the keyword arguments that `generate` splats into
+# the pipeline call. In every preset the `guidance_schedule` tuple has exactly
+# `num_inference_steps` entries (11 + 1 = 12, 18 + 2 = 20, 45 + 3 = 48); keep
+# the two in sync when adding a preset.
+# NOTE(review): `mu` / `std` are passed through to the pipeline untouched;
+# their exact meaning is defined by Ideogram4Pipeline — confirm there.
+MODES = {
+    "Turbo · 12 steps": dict(num_inference_steps=12, guidance_schedule=(7.0,) * 11 + (3.0,) * 1, mu=0.5, std=1.75),
+    "Default · 20 steps": dict(num_inference_steps=20, guidance_schedule=(7.0,) * 18 + (3.0,) * 2, mu=0.0, std=1.75),
+    "Quality · 48 steps": dict(num_inference_steps=48, guidance_schedule=(7.0,) * 45 + (3.0,) * 3, mu=0.0, std=1.5),
+}
+# --- Pipeline ---
 pipe = Ideogram4Pipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
 pipe.to("cuda")
+# --- Prompt upsampler (Qwen3-VL-8B-Instruct, generative) ---
+upsampler = Qwen3VLForConditionalGeneration.from_pretrained(UPSAMPLER_ID, torch_dtype=torch.bfloat16)
+upsampler.to("cuda")
+upsampler_proc = AutoProcessor.from_pretrained(UPSAMPLER_ID)
+# `outlines` is an optional dependency: if importing it fails for any reason
+# (not only ImportError) the app still starts and OUTLINES_AVAILABLE is False.
+# The flag is read when building the logits processor, when warning the user
+# in `generate`, and when composing the checkbox help text in the UI.
+try:
+    import outlines
+    OUTLINES_AVAILABLE = True
+except Exception:
+    OUTLINES_AVAILABLE = False
+# --- Caption schema (matches Ideogram's native caption / caption_verifier) ---
+# Caption element tagged with the literal type "obj"; `desc` is its free-form
+# textual description. (Comment instead of a docstring so the generated JSON
+# schema is not given an extra `description` entry.)
+class ObjElement(BaseModel):
+    type: Literal["obj"]
+    desc: str
+# Caption element tagged with the literal type "text"; carries both a `text`
+# string and a `desc` string.
+# NOTE(review): presumably `text` is the literal wording to render and `desc`
+# describes its appearance — confirm against the prompt in v6.txt.
+class TextElement(BaseModel):
+    type: Literal["text"]
+    text: str
+    desc: str
+# Scene breakdown: a `background` string plus a list of elements, each either
+# an ObjElement or a TextElement. `min_length=1` requires at least one element.
+class Composition(BaseModel):
+    background: str
+    elements: List[Union[ObjElement, TextElement]] = Field(min_length=1)
+# Top-level caption schema: a one-field summary plus the nested Composition.
+# This is the model handed to `outlines.Generator` to constrain decoding.
+class Caption(BaseModel):
+    high_level_description: str
+    compositional_deconstruction: Composition
+def _load_sections(path):
+    sections, cur, buf = {}, None, []
+    for line in open(path, encoding="utf-8").read().splitlines():
+        s = line.strip()
+        if s.startswith("[") and s.endswith("]") and " " not in s:
+            if cur is not None:
+                sections[cur] = "\n".join(buf).strip()
+            cur, buf = s[1:-1].lower(), []
+        else:
+            buf.append(line)
+    if cur is not None:
+        sections[cur] = "\n".join(buf).strip()
+    return sections
+# Prompt templates are read once at import time from v6.txt next to this file.
+_SEC = _load_sections(os.path.join(_HERE, "v6.txt"))
+# The [system] section is mandatory: a missing section raises KeyError here.
+SYSTEM_PROMPT = _SEC["system"]
+# The [user] section is optional. The fallback template has no
+# {{aspect_ratio}} placeholder, so that substitution is a no-op when it is used.
+USER_TEMPLATE = _SEC.get("user", "User idea: {{original_prompt}}")
+_logits_processor = None  # built lazily (compiles the schema -> FSM once)
+def _get_logits_processor():
+    """Return the cached Outlines logits processor for the Caption schema.
+
+    The processor is built on first use from the already-loaded upsampler model
+    and its tokenizer, then stored in the module-level ``_logits_processor``.
+
+    Returns:
+        The logits processor, or ``None`` when ``outlines`` could not be
+        imported (callers then generate without structural constraints).
+    """
+    global _logits_processor
+    # Build only once, and only if the optional dependency is present.
+    if _logits_processor is None and OUTLINES_AVAILABLE:
+        ol_model = outlines.from_transformers(upsampler, upsampler_proc.tokenizer)
+        _logits_processor = outlines.Generator(ol_model, Caption).logits_processor
+    return _logits_processor
+def upsample_prompt(prompt: str, width: int, height: int) -> str:
+    """Rewrite a user prompt with the upsampler model and return the generated text.
+
+    Args:
+        prompt: Raw user idea, substituted for ``{{original_prompt}}`` in
+            USER_TEMPLATE.
+        width: Target image width; used only to derive the aspect-ratio string.
+        height: Target image height; used only to derive the aspect-ratio string.
+
+    Returns:
+        The decoded continuation from ``upsampler.generate``, with the prompt
+        tokens sliced off, special tokens skipped and whitespace stripped.
+        The output is only schema-constrained when a logits processor exists.
+    """
+    from math import gcd
+    # Reduce width:height to lowest terms; `or 1` avoids dividing by zero when
+    # both dimensions are 0 (gcd(0, 0) == 0).
+    d = gcd(width, height) or 1
+    aspect_ratio = f"{width // d}:{height // d}"
+    # The aspect ratio is substituted first, so a user prompt that itself
+    # contains "{{aspect_ratio}}" is inserted verbatim and not expanded.
+    user = USER_TEMPLATE.replace("{{aspect_ratio}}", aspect_ratio).replace("{{original_prompt}}", prompt)
+    messages = [{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": user}]
+    inputs = upsampler_proc.apply_chat_template(
+        messages, add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True
+    ).to(upsampler.device)
+    # Sampling (not greedy) at temperature 1.0, capped at 1024 new tokens.
+    # NOTE(review): no seed is set for this sampling step, so the caption is
+    # not reproducible from the image seed shown in the UI.
+    gen_kwargs = dict(max_new_tokens=1024, do_sample=True, temperature=1.0, use_cache=True)
+    lp = _get_logits_processor()
+    if lp is not None:
+        # The processor object is shared across requests, so it is reset
+        # before each generation.
+        # NOTE(review): presumably this clears per-sequence state — confirm
+        # against the Outlines documentation.
+        lp.reset()
+        gen_kwargs["logits_processor"] = [lp]
+    out = upsampler.generate(**inputs, **gen_kwargs)
+    # `out` holds prompt + continuation; drop the prompt by its token length.
+    return upsampler_proc.batch_decode(
+        out[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
+    )[0].strip()
+@spaces.GPU(duration=240)
 def generate(
     prompt: str,
+    mode: str,
+    enhance: bool,
     width: int,
     height: int,
     seed: int,
     randomize_seed: bool,
     progress=gr.Progress(track_tqdm=True),
 ):
+    """Gradio click handler: optionally upsample the prompt, then render one image.
+
+    Args:
+        prompt: Text entered by the user.
+        mode: Key into MODES; an unknown value falls back to "Default · 20 steps".
+        enhance: When true, the prompt is rewritten by ``upsample_prompt`` first.
+        width: Output width, cast to int before use.
+        height: Output height, cast to int before use.
+        seed: Seed for the image generator; ignored if ``randomize_seed`` is set
+            or the value is negative.
+        randomize_seed: Draw a fresh seed in [0, MAX_SEED] instead of using ``seed``.
+        progress: Gradio progress tracker (mirrors tqdm bars); not read directly.
+
+    Returns:
+        Tuple ``(image, seed, final_prompt)``: the first generated image, the
+        seed actually used, and the prompt text that was fed to the pipeline.
+    """
     if randomize_seed or seed < 0:
         seed = random.randint(0, MAX_SEED)
+    final_prompt = prompt
+    if enhance:
+        # Upsampling still runs without outlines; the user is only warned that
+        # the output is not constrained to the caption schema.
+        if not OUTLINES_AVAILABLE:
+            gr.Warning("`outlines` is not installed — upsampling without structural constraints.")
+        final_prompt = upsample_prompt(prompt, int(width), int(height))
+    # The seeded generator is passed only to the image pipeline below.
+    generator = torch.Generator(device="cuda").manual_seed(int(seed))
+    preset = MODES.get(mode, MODES["Default · 20 steps"])
+    image = pipe(
+        prompt=final_prompt,
         width=int(width),
         height=int(height),
         generator=generator,
+        # Expands to num_inference_steps, guidance_schedule, mu and std.
+        **preset,
+    ).images[0]
+    return image, seed, final_prompt
+with gr.Blocks(theme=gr.themes.Citrus(), title="Ideogram 4 (NF4) — diffusers preview") as demo:
     gr.Markdown(
         "## Ideogram 4 (NF4) — diffusers preview\n"
         f"Private demo of [`{MODEL_ID}`](https://huggingface.co/{MODEL_ID}) using the "
+        "[diffusers PR](https://github.com/huggingface/diffusers-new-model-addition-ideogram) branch, on ZeroGPU.\n"
+        "Toggle **Prompt upsampling** in Advanced to rewrite your idea into Ideogram's native structured caption "
+        "(Qwen3-VL-8B + Outlines)."
     )
     with gr.Row():
                 value="A photo of a cat holding a sign that says hello world",
                 lines=3,
             )
+            mode = gr.Radio(
+                choices=list(MODES.keys()),
+                value="Default · 20 steps",
+                label="Mode (speed ↔ quality)",
+            )
             run = gr.Button("Generate", variant="primary")
             with gr.Accordion("Advanced", open=False):
+                enhance = gr.Checkbox(
+                    label="Prompt upsampling (Outlines)",
+                    value=False,
+                    info="Rewrite the prompt into Ideogram's native JSON caption before generating."
+                    + ("" if OUTLINES_AVAILABLE else "  ⚠ outlines not installed — runs unconstrained."),
+                )
                 with gr.Row():
                     width = gr.Slider(512, 2048, value=1024, step=64, label="Width")
                     height = gr.Slider(512, 2048, value=1024, step=64, label="Height")
                 with gr.Row():
                     seed = gr.Number(label="Seed", value=0, precision=0)
                     randomize = gr.Checkbox(label="Randomize seed", value=True)
         with gr.Column():
             out_image = gr.Image(label="Output", type="pil")
             out_seed = gr.Number(label="Seed used", interactive=False, precision=0)
+            out_caption = gr.Textbox(
+                label="Caption fed to the model (upsampled when enabled)",
+                lines=4,
+                show_copy_button=True,
+            )
     run.click(
         generate,
+        inputs=[prompt, mode, enhance, width, height, seed, randomize],
+        outputs=[out_image, out_seed, out_caption],
     )
 demo.queue().launch()