Spaces:

ideogram-ai
/

ideogram4

Running on Zero

App Files Files Community

multimodalart HF Staff committed 5 days ago

Commit

6ed28ce

verified ·

1 Parent(s): 013c185

Dequant nf4->bf16 in parent/CPU context (persists across ZeroGPU forks) + streamed upsampler progress

Browse files

Files changed (1) hide show

app.py +36 -39

app.py CHANGED Viewed

@@ -39,6 +39,11 @@ MODES = {
 # --- Pipeline ---
 pipe = Ideogram4Pipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
 pipe.to("cuda")
 # --- Upsampler tokenizer + pre-fetched LM head (graft done lazily on GPU) ---
@@ -123,24 +128,10 @@ except Exception as e:  # don't let a graft hiccup block the demo / the bf16 OOM
     ENHANCER, LOGITS_PROCESSOR = None, None
-# --- bf16 path: dequantize both transformers nf4 -> bf16 (kept resident; YOLO OOM test) ---
-# diffusers ModelMixin.dequantize() replaces the bnb 4-bit layers with bf16 and drops the quantizer.
-# Done lazily on first GPU request (ZeroGPU: no CUDA at import, and bnb dequant needs the weights on GPU).
-_BF16_DONE = False
-def _ensure_bf16_transformers():
-    global _BF16_DONE
-    if _BF16_DONE:
-        return
-    pipe.transformer.dequantize()
-    pipe.unconditional_transformer.dequantize()
-    torch.cuda.empty_cache()
-    _BF16_DONE = True
-def upsample_prompt(prompt: str, width: int, height: int) -> str:
     from math import gcd
     gen = ENHANCER
     d = gcd(width, height) or 1
@@ -150,14 +141,28 @@ def upsample_prompt(prompt: str, width: int, height: int) -> str:
     inputs = upsampler_proc.apply_chat_template(
         messages, add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True
     ).to(gen.device)
-    gen_kwargs = dict(max_new_tokens=1024, do_sample=True, temperature=1.0, use_cache=True)
     if LOGITS_PROCESSOR is not None:
         LOGITS_PROCESSOR.reset()
         gen_kwargs["logits_processor"] = [LOGITS_PROCESSOR]
-    out = gen.generate(**inputs, **gen_kwargs)
-    return upsampler_proc.batch_decode(
-        out[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
-    )[0].strip()
 @spaces.GPU(duration=240, size="xlarge")
@@ -178,8 +183,9 @@ def generate(
     if enhance:
         if not OUTLINES_AVAILABLE:
             gr.Warning("`outlines` is not installed — upsampling without structural constraints.")
-        final_prompt = upsample_prompt(prompt, int(width), int(height))
     generator = torch.Generator(device="cuda").manual_seed(int(seed))
     preset = MODES.get(mode, MODES["Default · 20 steps"])
     image = pipe(
@@ -198,22 +204,13 @@ def generate(
 @spaces.GPU(size="xlarge")
 def _warmup():
-    """Force the upsampler + pipeline onto GPU and warm their kernels at STARTUP, so request #1
-    isn't slow. On ZeroGPU, module-level loading is CPU-only; GPU placement + JIT warmup otherwise
-    happen on the first request."""
-    _ensure_bf16_transformers()
-    try:
-        if ENHANCER is not None:
-            upsample_prompt("a red apple on a wooden table", 1024, 1024)
-            print("[warmup] upsampler ready on GPU", flush=True)
-    except Exception as e:
-        print(f"[warmup] upsampler warmup skipped: {e!r}", flush=True)
-    try:
-        g = torch.Generator(device="cuda").manual_seed(0)
-        pipe(prompt="a red apple", width=1024, height=1024, generator=g, **MODES["Turbo · 12 steps"])
-        print("[warmup] pipeline ready on GPU", flush=True)
-    except Exception as e:
-        print(f"[warmup] pipeline warmup skipped: {e!r}", flush=True)
 try:

 # --- Pipeline ---
 pipe = Ideogram4Pipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
+# Dequantize nf4 -> bf16 in the PARENT (CPU) context, BEFORE ZeroGPU forks/packs the model, so every
+# fork inherits bf16 (a fork-local dequant doesn't persist). bitsandbytes supports CPU 4-bit dequant.
+# This also gives AOTI real bf16 weights to bind to its (weight-less) compiled graph.
+pipe.transformer.dequantize()
+pipe.unconditional_transformer.dequantize()
 pipe.to("cuda")
 # --- Upsampler tokenizer + pre-fetched LM head (graft done lazily on GPU) ---
     ENHANCER, LOGITS_PROCESSOR = None, None
+def upsample_prompt(prompt: str, width: int, height: int, progress=None) -> str:
     from math import gcd
+    from threading import Thread
+    from transformers import TextIteratorStreamer
     gen = ENHANCER
     d = gcd(width, height) or 1
     inputs = upsampler_proc.apply_chat_template(
         messages, add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True
     ).to(gen.device)
+    max_new = 1024
+    gen_kwargs = dict(max_new_tokens=max_new, do_sample=True, temperature=1.0, use_cache=True)
     if LOGITS_PROCESSOR is not None:
         LOGITS_PROCESSOR.reset()
         gen_kwargs["logits_processor"] = [LOGITS_PROCESSOR]
+    if progress is None:  # warmup path, no UI
+        out = gen.generate(**inputs, **gen_kwargs)
+        return upsampler_proc.batch_decode(out[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0].strip()
+    # stream tokens so the UI shows the upsampler working
+    streamer = TextIteratorStreamer(upsampler_proc.tokenizer, skip_prompt=True, skip_special_tokens=True)
+    gen_kwargs["streamer"] = streamer
+    thread = Thread(target=gen.generate, kwargs={**inputs, **gen_kwargs})
+    thread.start()
+    text, n = "", 0
+    for chunk in streamer:
+        text += chunk
+        n += 1
+        progress(min(n / max_new, 0.99), desc="✍️ Upsampling prompt…")
+    thread.join()
+    return text.strip()
 @spaces.GPU(duration=240, size="xlarge")
     if enhance:
         if not OUTLINES_AVAILABLE:
             gr.Warning("`outlines` is not installed — upsampling without structural constraints.")
+        final_prompt = upsample_prompt(prompt, int(width), int(height), progress=progress)
+    progress(0.0, desc="🎨 Generating image…")
     generator = torch.Generator(device="cuda").manual_seed(int(seed))
     preset = MODES.get(mode, MODES["Default · 20 steps"])
     image = pipe(
 @spaces.GPU(size="xlarge")
 def _warmup():
+    """Preload the upsampler onto GPU and warm it at STARTUP (graft move + Outlines FSM + first-token JIT).
+    NOTE: runtime nf4->bf16 dequant is intentionally NOT done here — it does not persist across ZeroGPU
+    forks (each request re-forks from the nf4 parent process), so bf16+speed will come from a precompiled
+    AOTI artifact instead."""
+    if ENHANCER is not None:
+        upsample_prompt("a red apple on a wooden table", 1024, 1024)
+        print("[warmup] upsampler ready on GPU", flush=True)
 try: