Spaces:

multimodalart
/

pid

Running on Zero

App Files Files Community

apolinario committed 1 day ago

Commit

18982d7

1 Parent(s): 5655c6f

Preload both PiD checkpoints (2k + 2kto4k); pick at request time based on resolution (>512 → 2kto4k for 4K target)

Browse files

Files changed (1) hide show

app.py +27 -13

app.py CHANGED Viewed

@@ -32,6 +32,7 @@ snapshot_download(
     local_dir=PID_REPO_DIR,
     allow_patterns=[
         "checkpoints/PiD_res2k_sr4x_official_flux_distill_4step/*",
         "checkpoints/ae.safetensors",
     ],
 )
@@ -48,7 +49,6 @@ from pid._src.utils.model_loader import load_model_from_checkpoint
 DTYPE = torch.bfloat16
 BACKBONE = "zimage"
-CKPT_TYPE = "2k"
 SR_SCALE = 4
 PID_INFERENCE_STEPS = 4
@@ -105,19 +105,32 @@ from diffusers import AutoencoderTiny
 taef1 = AutoencoderTiny.from_pretrained("madebyollin/taef1", torch_dtype=DTYPE).to("cuda")
 taef1.eval()
-print("[pid] loading PiD decoder...", flush=True)
-pid_meta = get_pid_checkpoint(BACKBONE, CKPT_TYPE)
-pid_model, _pid_cfg = load_model_from_checkpoint(
-    experiment_name=pid_meta.experiment,
-    checkpoint_path=pid_meta.checkpoint_path,
-    config_file="pid/_src/configs/pid/config.py",
-    enable_fsdp=False,
-    strict=False,
-)
-pid_model.eval()
 print("[pid] ready", flush=True)
 def _latent_to_pil(tensor: torch.Tensor) -> Image.Image:
     """PiD output is (C, T, H, W) with T=1 for image -> PIL.Image."""
     if tensor.dim() == 4:
@@ -145,7 +158,7 @@ def _pid_pixel_to_pil(x: torch.Tensor) -> Image.Image:
     return Image.fromarray(arr)
-def _pid_stream(latent: torch.Tensor, baseline_01: torch.Tensor, sigma: float, caption: str, num_steps: int = PID_INFERENCE_STEPS):
     """Reimplementation of PiDDistillModel.generate_samples_from_batch that yields
     the current pixel-space tensor after each of the `num_steps` student-sampler
     iterations. Final yield is the clean output."""
@@ -310,7 +323,8 @@ def generate(
     # ---- PiD upscaling on the final latent, streaming the 4 internal steps ----
     final_sigma = float(pipeline.scheduler.sigmas[-1].item())
     pid_img = None
-    for k, total, x in _pid_stream(final_latent, baseline_01, final_sigma, prompt):
         pid_img = _pid_pixel_to_pil(x)
         yield (
             gr.update(visible=True, value=pid_img, label=f"Upscaling with PiD — step {k}/{total}"),

     local_dir=PID_REPO_DIR,
     allow_patterns=[
         "checkpoints/PiD_res2k_sr4x_official_flux_distill_4step/*",
+        "checkpoints/PiD_res2kto4k_sr4x_official_flux_distill_4step/*",
         "checkpoints/ae.safetensors",
     ],
 )
 DTYPE = torch.bfloat16
 BACKBONE = "zimage"
 SR_SCALE = 4
 PID_INFERENCE_STEPS = 4
 taef1 = AutoencoderTiny.from_pretrained("madebyollin/taef1", torch_dtype=DTYPE).to("cuda")
 taef1.eval()
+def _load_pid(ckpt_type: str):
+    meta = get_pid_checkpoint(BACKBONE, ckpt_type)
+    print(f"[pid] loading PiD decoder ({ckpt_type})...", flush=True)
+    model, _ = load_model_from_checkpoint(
+        experiment_name=meta.experiment,
+        checkpoint_path=meta.checkpoint_path,
+        config_file="pid/_src/configs/pid/config.py",
+        enable_fsdp=False,
+        strict=False,
+    )
+    model.eval()
+    return model
+pid_models = {
+    "2k": _load_pid("2k"),
+    "2kto4k": _load_pid("2kto4k"),
+}
 print("[pid] ready", flush=True)
+def _pick_pid_model(resolution: int):
+    """2k decoder is trained at 2048px (sweet spot 512 → 2048); 2kto4k handles 1024 → 4K."""
+    return pid_models["2kto4k"] if resolution > 512 else pid_models["2k"]
 def _latent_to_pil(tensor: torch.Tensor) -> Image.Image:
     """PiD output is (C, T, H, W) with T=1 for image -> PIL.Image."""
     if tensor.dim() == 4:
     return Image.fromarray(arr)
+def _pid_stream(pid_model, latent: torch.Tensor, baseline_01: torch.Tensor, sigma: float, caption: str, num_steps: int = PID_INFERENCE_STEPS):
     """Reimplementation of PiDDistillModel.generate_samples_from_batch that yields
     the current pixel-space tensor after each of the `num_steps` student-sampler
     iterations. Final yield is the clean output."""
     # ---- PiD upscaling on the final latent, streaming the 4 internal steps ----
     final_sigma = float(pipeline.scheduler.sigmas[-1].item())
     pid_img = None
+    pid_model = _pick_pid_model(H)
+    for k, total, x in _pid_stream(pid_model, final_latent, baseline_01, final_sigma, prompt):
         pid_img = _pid_pixel_to_pil(x)
         yield (
             gr.update(visible=True, value=pid_img, label=f"Upscaling with PiD — step {k}/{total}"),