Spaces:

ideogram-ai
/

ideogram4

Running on Zero

App Files Files Community

multimodalart HF Staff committed on Jun 3

Commit

a1f5c28

verified ·

1 Parent(s): 2436f41

Graft hosted lm_head onto pipe encoder for upsampling (no second model)

Browse files

Files changed (1) hide show

app.py +40 -21

app.py CHANGED Viewed

@@ -12,15 +12,20 @@ from typing import List, Literal, Union
 import spaces
 import torch
 import gradio as gr
 from pydantic import BaseModel, Field
 from diffusers import Ideogram4Pipeline
 from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
 # --- New (safety-fixed) checkpoint ---
 MODEL_ID = "diffusers-internal-dev/ideogram-4-nf4-v2"
-# Generative sibling of the text encoder, used for prompt upsampling.
-UPSAMPLER_ID = "Qwen/Qwen3-VL-8B-Instruct"
 MAX_SEED = 2**31 - 1
@@ -35,10 +40,9 @@ MODES = {
 pipe = Ideogram4Pipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
 pipe.to("cuda")
-# --- Prompt upsampler (Qwen3-VL-8B-Instruct, generative) ---
-upsampler = Qwen3VLForConditionalGeneration.from_pretrained(UPSAMPLER_ID, torch_dtype=torch.bfloat16)
-upsampler.to("cuda")
-upsampler_proc = AutoProcessor.from_pretrained(UPSAMPLER_ID)
 try:
     import outlines
@@ -88,33 +92,48 @@ _SEC = _load_sections(os.path.join(_HERE, "v6.txt"))
 SYSTEM_PROMPT = _SEC["system"]
 USER_TEMPLATE = _SEC.get("user", "User idea: {{original_prompt}}")
-_logits_processor = None  # built lazily (compiles the schema -> FSM once)
-def _get_logits_processor():
-    global _logits_processor
-    if _logits_processor is None and OUTLINES_AVAILABLE:
-        ol_model = outlines.from_transformers(upsampler, upsampler_proc.tokenizer)
         _logits_processor = outlines.Generator(ol_model, Caption).logits_processor
-    return _logits_processor
 def upsample_prompt(prompt: str, width: int, height: int) -> str:
     from math import gcd
     d = gcd(width, height) or 1
     aspect_ratio = f"{width // d}:{height // d}"
     user = USER_TEMPLATE.replace("{{aspect_ratio}}", aspect_ratio).replace("{{original_prompt}}", prompt)
     messages = [{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": user}]
     inputs = upsampler_proc.apply_chat_template(
         messages, add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True
-    ).to(upsampler.device)
     gen_kwargs = dict(max_new_tokens=1024, do_sample=True, temperature=1.0, use_cache=True)
-    lp = _get_logits_processor()
-    if lp is not None:
-        lp.reset()
-        gen_kwargs["logits_processor"] = [lp]
-    out = upsampler.generate(**inputs, **gen_kwargs)
     return upsampler_proc.batch_decode(
         out[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
     )[0].strip()
@@ -158,7 +177,7 @@ with gr.Blocks(theme=gr.themes.Citrus(), title="Ideogram 4 (NF4) — diffusers p
         f"Private demo of [`{MODEL_ID}`](https://huggingface.co/{MODEL_ID}) using the "
         "[diffusers PR](https://github.com/huggingface/diffusers-new-model-addition-ideogram) branch, on ZeroGPU.\n"
         "Toggle **Prompt upsampling** in Advanced to rewrite your idea into Ideogram's native structured caption "
-        "(Qwen3-VL-8B + Outlines)."
     )
     with gr.Row():

 import spaces
 import torch
+import torch.nn as nn
 import gradio as gr
 from pydantic import BaseModel, Field
+from accelerate import init_empty_weights
+from huggingface_hub import hf_hub_download
+from safetensors.torch import load_file
 from diffusers import Ideogram4Pipeline
 from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
 # --- New (safety-fixed) checkpoint ---
 MODEL_ID = "diffusers-internal-dev/ideogram-4-nf4-v2"
+# Just the LM head, grafted onto the pipeline's own Qwen3-VL encoder to make it generative.
+LM_HEAD_REPO = "multimodalart/qwen3-vl-8b-instruct-lm-head"
+TOKENIZER_ID = "Qwen/Qwen3-VL-8B-Instruct"  # processor/tokenizer only (no weights)
 MAX_SEED = 2**31 - 1
 pipe = Ideogram4Pipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
 pipe.to("cuda")
+# --- Upsampler tokenizer + pre-fetched LM head (graft done lazily on GPU) ---
+upsampler_proc = AutoProcessor.from_pretrained(TOKENIZER_ID)
+LM_HEAD_PATH = hf_hub_download(LM_HEAD_REPO, "lm_head.safetensors")  # cached at startup
 try:
     import outlines
 SYSTEM_PROMPT = _SEC["system"]
 USER_TEMPLATE = _SEC.get("user", "User idea: {{original_prompt}}")
+_enhancer = None          # Qwen3VLForConditionalGeneration sharing the pipe's encoder body
+_logits_processor = None  # Outlines structural constraint (built once)
+def _build_enhancer():
+    """Graft the hosted lm_head onto pipe.text_encoder -> a generative model (no second body)."""
+    global _enhancer, _logits_processor
+    if _enhancer is not None:
+        return _enhancer
+    device = pipe.text_encoder.device
+    head = load_file(LM_HEAD_PATH)["lm_head.weight"]  # [vocab, hidden] bf16
+    with init_empty_weights():
+        gen = Qwen3VLForConditionalGeneration(pipe.text_encoder.config)
+    gen.model = pipe.text_encoder  # reuse the loaded (nf4) encoder body — no extra body in VRAM
+    lm = nn.Linear(head.shape[1], head.shape[0], bias=False).to(device=device, dtype=torch.bfloat16)
+    with torch.no_grad():
+        lm.weight.copy_(head.to(device=device, dtype=torch.bfloat16))
+    gen.lm_head = lm
+    gen.eval()
+    _enhancer = gen
+    if OUTLINES_AVAILABLE:
+        ol_model = outlines.from_transformers(gen, upsampler_proc.tokenizer)
         _logits_processor = outlines.Generator(ol_model, Caption).logits_processor
+    return gen
 def upsample_prompt(prompt: str, width: int, height: int) -> str:
     from math import gcd
+    gen = _build_enhancer()
     d = gcd(width, height) or 1
     aspect_ratio = f"{width // d}:{height // d}"
     user = USER_TEMPLATE.replace("{{aspect_ratio}}", aspect_ratio).replace("{{original_prompt}}", prompt)
     messages = [{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": user}]
     inputs = upsampler_proc.apply_chat_template(
         messages, add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True
+    ).to(gen.device)
     gen_kwargs = dict(max_new_tokens=1024, do_sample=True, temperature=1.0, use_cache=True)
+    if _logits_processor is not None:
+        _logits_processor.reset()
+        gen_kwargs["logits_processor"] = [_logits_processor]
+    out = gen.generate(**inputs, **gen_kwargs)
     return upsampler_proc.batch_decode(
         out[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
     )[0].strip()
         f"Private demo of [`{MODEL_ID}`](https://huggingface.co/{MODEL_ID}) using the "
         "[diffusers PR](https://github.com/huggingface/diffusers-new-model-addition-ideogram) branch, on ZeroGPU.\n"
         "Toggle **Prompt upsampling** in Advanced to rewrite your idea into Ideogram's native structured caption "
+        "(the pipeline's own Qwen3-VL encoder + a grafted LM head + Outlines)."
     )
     with gr.Row():