Spaces:

ideogram-ai
/

ideogram4

Running on Zero

App Files Files Community

multimodalart HF Staff committed 4 days ago

Commit

66efa02

verified ·

1 Parent(s): bc8eac2

Preload upsampler + pipeline on GPU via @spaces.GPU startup warmup

Browse files

Files changed (1) hide show

app.py +67 -22

app.py CHANGED Viewed

@@ -92,36 +92,56 @@ _SEC = _load_sections(os.path.join(_HERE, "v6.txt"))
 SYSTEM_PROMPT = _SEC["system"]
 USER_TEMPLATE = _SEC.get("user", "User idea: {{original_prompt}}")
-_enhancer = None          # Qwen3VLForConditionalGeneration sharing the pipe's encoder body
-_logits_processor = None  # Outlines structural constraint (built once)
 def _build_enhancer():
-    """Graft the hosted lm_head onto pipe.text_encoder -> a generative model (no second body)."""
-    global _enhancer, _logits_processor
-    if _enhancer is not None:
-        return _enhancer
-    device = pipe.text_encoder.device
     head = load_file(LM_HEAD_PATH)["lm_head.weight"]  # [vocab, hidden] bf16
     with init_empty_weights():
         gen = Qwen3VLForConditionalGeneration(pipe.text_encoder.config)
     gen.model = pipe.text_encoder  # reuse the loaded (nf4) encoder body — no extra body in VRAM
-    lm = nn.Linear(head.shape[1], head.shape[0], bias=False).to(device=device, dtype=torch.bfloat16)
     with torch.no_grad():
-        lm.weight.copy_(head.to(device=device, dtype=torch.bfloat16))
-    gen.lm_head = lm
     gen.eval()
-    _enhancer = gen
     if OUTLINES_AVAILABLE:
         ol_model = outlines.from_transformers(gen, upsampler_proc.tokenizer)
-        _logits_processor = outlines.Generator(ol_model, Caption).logits_processor
-    return gen
 def upsample_prompt(prompt: str, width: int, height: int) -> str:
     from math import gcd
-    gen = _build_enhancer()
     d = gcd(width, height) or 1
     aspect_ratio = f"{width // d}:{height // d}"
     user = USER_TEMPLATE.replace("{{aspect_ratio}}", aspect_ratio).replace("{{original_prompt}}", prompt)
@@ -130,16 +150,16 @@ def upsample_prompt(prompt: str, width: int, height: int) -> str:
         messages, add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True
     ).to(gen.device)
     gen_kwargs = dict(max_new_tokens=1024, do_sample=True, temperature=1.0, use_cache=True)
-    if _logits_processor is not None:
-        _logits_processor.reset()
-        gen_kwargs["logits_processor"] = [_logits_processor]
     out = gen.generate(**inputs, **gen_kwargs)
     return upsampler_proc.batch_decode(
         out[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
     )[0].strip()
-@spaces.GPU(duration=180, size="xlarge")
 def generate(
     prompt: str,
     mode: str,
@@ -171,6 +191,28 @@ def generate(
     return image, seed, final_prompt
 with gr.Blocks(theme=gr.themes.Citrus(), title="Ideogram 4 (NF4) — diffusers preview") as demo:
     gr.Markdown(
         "## Ideogram 4 (NF4) — diffusers preview\n"
@@ -195,8 +237,8 @@ with gr.Blocks(theme=gr.themes.Citrus(), title="Ideogram 4 (NF4) — diffusers p
             run = gr.Button("Generate", variant="primary")
             with gr.Accordion("Advanced", open=False):
                 enhance = gr.Checkbox(
-                    label="Prompt upsampling",
-                    value=True,
                     info="Rewrite the prompt into Ideogram's native JSON caption before generating."
                     + ("" if OUTLINES_AVAILABLE else "  ⚠ outlines not installed — runs unconstrained."),
                 )
@@ -208,14 +250,17 @@ with gr.Blocks(theme=gr.themes.Citrus(), title="Ideogram 4 (NF4) — diffusers p
                     randomize = gr.Checkbox(label="Randomize seed", value=True)
         with gr.Column():
             out_image = gr.Image(label="Output", type="pil")
             out_caption = gr.Textbox(
                 label="Caption fed to the model (upsampled when enabled)",
             )
     run.click(
         generate,
         inputs=[prompt, mode, enhance, width, height, seed, randomize],
-        outputs=[out_image, seed, out_caption],
     )
 demo.queue().launch()

 SYSTEM_PROMPT = _SEC["system"]
 USER_TEMPLATE = _SEC.get("user", "User idea: {{original_prompt}}")
 def _build_enhancer():
+    """Graft the hosted lm_head onto pipe.text_encoder -> a generative model (no second body).
+    Done ONCE at import time so nothing heavy happens on the first GPU request. Only the new
+    bf16 lm_head is `.to('cuda')` (ZeroGPU defers it); the shared nf4 body is already moved by `pipe`."""
     head = load_file(LM_HEAD_PATH)["lm_head.weight"]  # [vocab, hidden] bf16
     with init_empty_weights():
         gen = Qwen3VLForConditionalGeneration(pipe.text_encoder.config)
     gen.model = pipe.text_encoder  # reuse the loaded (nf4) encoder body — no extra body in VRAM
+    lm = nn.Linear(head.shape[1], head.shape[0], bias=False)
     with torch.no_grad():
+        lm.weight.copy_(head.to(torch.bfloat16))
+    gen.lm_head = lm.to(torch.bfloat16)
+    gen.lm_head.to("cuda")  # ZeroGPU-deferred move of just the head
     gen.eval()
+    lp = None
     if OUTLINES_AVAILABLE:
         ol_model = outlines.from_transformers(gen, upsampler_proc.tokenizer)
+        lp = outlines.Generator(ol_model, Caption).logits_processor  # compiles schema->FSM now
+    return gen, lp
+# Assemble the generative enhancer + structural constraint at STARTUP (not on first request).
+try:
+    ENHANCER, LOGITS_PROCESSOR = _build_enhancer()
+except Exception as e:  # don't let a graft hiccup block the demo / the bf16 OOM test
+    print(f"[enhancer] graft failed, prompt upsampling disabled: {e!r}")
+    ENHANCER, LOGITS_PROCESSOR = None, None
+# --- bf16 path: dequantize both transformers nf4 -> bf16 (kept resident; YOLO OOM test) ---
+# diffusers ModelMixin.dequantize() replaces the bnb 4-bit layers with bf16 and drops the quantizer.
+# Done lazily on first GPU request (ZeroGPU: no CUDA at import, and bnb dequant needs the weights on GPU).
+_BF16_DONE = False
+def _ensure_bf16_transformers():
+    global _BF16_DONE
+    if _BF16_DONE:
+        return
+    pipe.transformer.dequantize()
+    pipe.unconditional_transformer.dequantize()
+    torch.cuda.empty_cache()
+    _BF16_DONE = True
 def upsample_prompt(prompt: str, width: int, height: int) -> str:
     from math import gcd
+    gen = ENHANCER
     d = gcd(width, height) or 1
     aspect_ratio = f"{width // d}:{height // d}"
     user = USER_TEMPLATE.replace("{{aspect_ratio}}", aspect_ratio).replace("{{original_prompt}}", prompt)
         messages, add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True
     ).to(gen.device)
     gen_kwargs = dict(max_new_tokens=1024, do_sample=True, temperature=1.0, use_cache=True)
+    if LOGITS_PROCESSOR is not None:
+        LOGITS_PROCESSOR.reset()
+        gen_kwargs["logits_processor"] = [LOGITS_PROCESSOR]
     out = gen.generate(**inputs, **gen_kwargs)
     return upsampler_proc.batch_decode(
         out[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
     )[0].strip()
+@spaces.GPU(duration=240)
 def generate(
     prompt: str,
     mode: str,
     return image, seed, final_prompt
+@spaces.GPU
+def _warmup():
+    """Force the upsampler + pipeline onto GPU and warm their kernels at STARTUP, so request #1
+    isn't slow. On ZeroGPU, module-level loading is CPU-only; GPU placement + JIT warmup otherwise
+    happen on the first request."""
+    try:
+        if ENHANCER is not None:
+            upsample_prompt("a red apple on a wooden table", 1024, 1024)
+            print("[warmup] upsampler ready on GPU", flush=True)
+    except Exception as e:
+        print(f"[warmup] upsampler warmup skipped: {e!r}", flush=True)
+    try:
+        g = torch.Generator(device="cuda").manual_seed(0)
+        pipe(prompt="a red apple", width=1024, height=1024, generator=g, **MODES["Turbo · 12 steps"])
+        print("[warmup] pipeline ready on GPU", flush=True)
+    except Exception as e:
+        print(f"[warmup] pipeline warmup skipped: {e!r}", flush=True)
+_warmup()
 with gr.Blocks(theme=gr.themes.Citrus(), title="Ideogram 4 (NF4) — diffusers preview") as demo:
     gr.Markdown(
         "## Ideogram 4 (NF4) — diffusers preview\n"
             run = gr.Button("Generate", variant="primary")
             with gr.Accordion("Advanced", open=False):
                 enhance = gr.Checkbox(
+                    label="Prompt upsampling (Outlines)",
+                    value=False,
                     info="Rewrite the prompt into Ideogram's native JSON caption before generating."
                     + ("" if OUTLINES_AVAILABLE else "  ⚠ outlines not installed — runs unconstrained."),
                 )
                     randomize = gr.Checkbox(label="Randomize seed", value=True)
         with gr.Column():
             out_image = gr.Image(label="Output", type="pil")
+            out_seed = gr.Number(label="Seed used", interactive=False, precision=0)
             out_caption = gr.Textbox(
                 label="Caption fed to the model (upsampled when enabled)",
+                lines=4,
+                show_copy_button=True,
             )
     run.click(
         generate,
         inputs=[prompt, mode, enhance, width, height, seed, randomize],
+        outputs=[out_image, out_seed, out_caption],
     )
 demo.queue().launch()