Spaces:

multimodalart
/

pid

Running on Zero

App Files Files Community

apolinario committed 1 day ago

Commit

e2f50b1

1 Parent(s): afb0b5a

Drop intermediate captures + gallery; use gr.ImageSlider for Z-Image vs PiD A/B; dynamic 'Generating Z-Image step X/N' / 'Upscaling' labels

Browse files

Files changed (1) hide show

app.py +26 -36

app.py CHANGED Viewed

@@ -176,7 +176,6 @@ import queue as _queue
 def generate(
     prompt: str,
     num_inference_steps: int = 28,
-    num_captures: int = 4,
     guidance_scale: float = 5.0,
     seed: int = 0,
     resolution: int = 512,
@@ -185,22 +184,16 @@ def generate(
         raise gr.Error("Please enter a prompt.")
     num_inference_steps = int(num_inference_steps)
-    num_captures = int(num_captures)
     H = W = int(resolution)
-    # initial: show the live-preview image, hide the final gallery
-    yield gr.update(visible=True, value=None), gr.update(visible=False, value=None)
-    capture_ks = set(_evenly_spaced_capture_steps(num_inference_steps, num_captures))
-    xt_cb = XtCaptureCallback(capture_ks) if capture_ks else None
     # ---- Run Z-Image in a thread; stream taef1 previews via a queue ----
     preview_q: "_queue.Queue" = _queue.Queue()
     _DONE = object()
     def streaming_cb(pipe, step_index, timestep, callback_kwargs):
-        if xt_cb is not None:
-            xt_cb(pipe, step_index, timestep, callback_kwargs)
         try:
             preview = _taef1_preview(callback_kwargs["latents"], H, W)
             preview_q.put((step_index, preview))
@@ -241,33 +234,31 @@ def generate(
                 raise payload
             raw_output = payload
             break
-        yield gr.update(visible=True, value=payload), gr.update(visible=False)
     thread.join()
     final_latent = extract_latent(pipeline, raw_output, pipe_cfg, H, W)
-    # ---- PiD per-step decode (sequentially) ----
-    steps_iter = []
-    if xt_cb is not None:
-        for K in sorted(xt_cb.captured.keys()):
-            xt_packed_cpu, sigma = xt_cb.captured[K]
-            xt_packed = xt_packed_cpu.to(device="cuda", dtype=DTYPE)
-            xt_latent = extract_latent(pipeline, SimpleNamespace(images=xt_packed), pipe_cfg, H, W)
-            steps_iter.append((f"step {K:02d}/{num_inference_steps}", xt_latent, sigma))
-    final_sigma = float(pipeline.scheduler.sigmas[-1].item())
-    steps_iter.append(("final x₀", final_latent, final_sigma))
-    outputs: list[tuple[Image.Image, str]] = []
-    for label, latent, sigma in steps_iter:
-        with torch.no_grad():
-            baseline_01 = decode_with_pipeline_vae(pipeline, latent, pipe_cfg)
-            pid_img = _pid_decode(latent, baseline_01, sigma, prompt)
-        outputs.append((pid_img, f"{label}  (σ={sigma:.3f})"))
-        # Flash the latest PiD output in the live-preview image during PiD decoding too
-        yield gr.update(visible=True, value=pid_img), gr.update(visible=False)
-    # ---- Done: hide live preview, show the final gallery ----
-    yield gr.update(visible=False, value=None), gr.update(visible=True, value=outputs)
 DESCRIPTION = """
@@ -297,18 +288,17 @@ with gr.Blocks(theme=gr.themes.Citrus(), css=CSS) as demo:
                 resolution = gr.Slider(label="Z-Image resolution", minimum=256, maximum=1024, step=128, value=512)
                 num_inference_steps = gr.Slider(label="Z-Image steps", minimum=8, maximum=50, step=1, value=28)
             with gr.Row():
-                num_captures = gr.Slider(label="Intermediate captures", minimum=1, maximum=8, step=1, value=4)
                 guidance_scale = gr.Slider(label="Guidance", minimum=1.0, maximum=10.0, step=0.5, value=5.0)
-            seed = gr.Number(label="Seed", value=0, precision=0)
             run = gr.Button("Run", variant="primary")
         with gr.Column(scale=2):
-            live_preview = gr.Image(label="Live preview", visible=True, show_label=True, type="pil")
-            gallery = gr.Gallery(label="PiD-decoded denoising trajectory", visible=False, columns=2, object_fit="contain")
     run.click(
         fn=generate,
-        inputs=[prompt, num_inference_steps, num_captures, guidance_scale, seed, resolution],
-        outputs=[live_preview, gallery],
     )
 if __name__ == "__main__":

 def generate(
     prompt: str,
     num_inference_steps: int = 28,
     guidance_scale: float = 5.0,
     seed: int = 0,
     resolution: int = 512,
         raise gr.Error("Please enter a prompt.")
     num_inference_steps = int(num_inference_steps)
     H = W = int(resolution)
+    # initial: show the live preview, hide the final slider
+    yield gr.update(visible=True, value=None, label="Generating Z-Image…"), gr.update(visible=False, value=None)
     # ---- Run Z-Image in a thread; stream taef1 previews via a queue ----
     preview_q: "_queue.Queue" = _queue.Queue()
     _DONE = object()
     def streaming_cb(pipe, step_index, timestep, callback_kwargs):
         try:
             preview = _taef1_preview(callback_kwargs["latents"], H, W)
             preview_q.put((step_index, preview))
                 raise payload
             raw_output = payload
             break
+        label = f"Generating Z-Image — step {step_index + 1}/{num_inference_steps}"
+        yield gr.update(visible=True, value=payload, label=label), gr.update(visible=False)
     thread.join()
     final_latent = extract_latent(pipeline, raw_output, pipe_cfg, H, W)
+    # ---- VAE decode of the final clean latent (Z-Image baseline) ----
+    yield gr.update(visible=True, label="Decoding final Z-Image…"), gr.update(visible=False)
+    with torch.no_grad():
+        baseline_01 = decode_with_pipeline_vae(pipeline, final_latent, pipe_cfg)
+    zimage_img = Image.fromarray(
+        (baseline_01[0].clamp(0, 1).permute(1, 2, 0).float().cpu().numpy() * 255).astype(np.uint8)
+    )
+    # ---- PiD upscaling on the final latent ----
+    yield gr.update(visible=True, value=zimage_img, label="Upscaling with PiD (4× super-resolution, 4 steps)…"), gr.update(visible=False)
+    final_sigma = float(pipeline.scheduler.sigmas[-1].item())
+    with torch.no_grad():
+        pid_img = _pid_decode(final_latent, baseline_01, final_sigma, prompt)
+    # ---- Done: hide live preview, show the A/B slider ----
+    yield (
+        gr.update(visible=False, value=None),
+        gr.update(visible=True, value=(zimage_img, pid_img)),
+    )
 DESCRIPTION = """
                 resolution = gr.Slider(label="Z-Image resolution", minimum=256, maximum=1024, step=128, value=512)
                 num_inference_steps = gr.Slider(label="Z-Image steps", minimum=8, maximum=50, step=1, value=28)
             with gr.Row():
                 guidance_scale = gr.Slider(label="Guidance", minimum=1.0, maximum=10.0, step=0.5, value=5.0)
+                seed = gr.Number(label="Seed", value=0, precision=0)
             run = gr.Button("Run", variant="primary")
         with gr.Column(scale=2):
+            live_preview = gr.Image(label="Generating Z-Image…", visible=True, show_label=True, type="pil")
+            slider = gr.ImageSlider(label="Z-Image (left)  ↔  PiD 4× upscale (right)", visible=False, type="pil")
     run.click(
         fn=generate,
+        inputs=[prompt, num_inference_steps, guidance_scale, seed, resolution],
+        outputs=[live_preview, slider],
     )
 if __name__ == "__main__":