xiangfan00 committed
Commit 3a3a124 · 1 Parent(s): 5a9bcbe

Try using multiple stages

Files changed (1): app.py +160 -24
app.py CHANGED
@@ -561,32 +561,159 @@ def decode_with_refdecoder(latents, reference_frame, vae, transformer):
     return video
 
 
-@spaces.GPU(duration=160)
-def generate_latents_on_gpu(image, prompt, seed):
-    log_cuda_mem("start generate_latents_on_gpu")
+CHUNK_BOUNDARIES = (8, 16, 23, NUM_INFERENCE_STEPS)
+assert CHUNK_BOUNDARIES[-1] == NUM_INFERENCE_STEPS
+
+
+def _run_diffusion_steps(
+    latents,
+    condition,
+    prompt_embeds,
+    negative_prompt_embeds,
+    image_embeds,
+    timesteps,
+    start_step,
+    end_step,
+    transformer_dtype,
+):
+    """Inlined Wan 2.1 I2V denoising loop. Runs steps [start_step, end_step)."""
+    transformer = GENERATION_PIPE.transformer
+    scheduler = GENERATION_PIPE.scheduler
+    with torch.no_grad():
+        for i in range(start_step, end_step):
+            t = timesteps[i]
+            latent_model_input = torch.cat([latents, condition], dim=1).to(transformer_dtype)
+            timestep = t.expand(latents.shape[0])
+
+            with transformer.cache_context("cond"):
+                noise_pred = transformer(
+                    hidden_states=latent_model_input,
+                    timestep=timestep,
+                    encoder_hidden_states=prompt_embeds,
+                    encoder_hidden_states_image=image_embeds,
+                    return_dict=False,
+                )[0]
+            with transformer.cache_context("uncond"):
+                noise_uncond = transformer(
+                    hidden_states=latent_model_input,
+                    timestep=timestep,
+                    encoder_hidden_states=negative_prompt_embeds,
+                    encoder_hidden_states_image=image_embeds,
+                    return_dict=False,
+                )[0]
+            noise_pred = noise_uncond + GUIDANCE_SCALE * (noise_pred - noise_uncond)
+            latents = scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+    return latents
+
+
+@spaces.GPU(duration=50)
+def generate_latents_setup_on_gpu(resized_image, prompt, seed, height, width):
+    """Encode prompt+image, prepare latents, run the first chunk of denoising steps.
+
+    Returns a CPU-resident state dict consumable by generate_latents_chunk_on_gpu.
+    """
+    log_cuda_mem("start generate_latents_setup_on_gpu")
     GENERATION_PIPE.to(DEVICE)
-    log_cuda_mem("after pipe -> cuda")
-    resized_image, height, width = resize_image_for_wan(image, GENERATION_PIPE)
-    generator = torch.Generator(device=DEVICE).manual_seed(seed)
     try:
-        with torch.no_grad():
-            output = GENERATION_PIPE(
-                image=resized_image,
-                prompt=prompt,
-                negative_prompt=NEGATIVE_PROMPT,
-                height=height,
-                width=width,
-                num_frames=NUM_FRAMES,
-                num_inference_steps=NUM_INFERENCE_STEPS,
-                guidance_scale=GUIDANCE_SCALE,
-                generator=generator,
-                output_type="latent",
-            )
-        latents = normalize_latent_shape(output.frames).detach().cpu()
+        transformer_dtype = GENERATION_PIPE.transformer.dtype
+
+        prompt_embeds, negative_prompt_embeds = GENERATION_PIPE.encode_prompt(
+            prompt=prompt,
+            negative_prompt=NEGATIVE_PROMPT,
+            do_classifier_free_guidance=True,
+            num_videos_per_prompt=1,
+            max_sequence_length=512,
+            device=DEVICE,
+        )
+        prompt_embeds = prompt_embeds.to(transformer_dtype)
+        negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)
+
+        image_embeds = GENERATION_PIPE.encode_image(resized_image, DEVICE)
+        image_embeds = image_embeds.repeat(1, 1, 1).to(transformer_dtype)
+
+        GENERATION_PIPE.scheduler.set_timesteps(NUM_INFERENCE_STEPS, device=DEVICE)
+        timesteps = GENERATION_PIPE.scheduler.timesteps
+
+        image_tensor = GENERATION_PIPE.video_processor.preprocess(
+            resized_image, height=height, width=width
+        ).to(DEVICE, dtype=torch.float32)
+        generator = torch.Generator(device=DEVICE).manual_seed(seed)
+        latents, condition = GENERATION_PIPE.prepare_latents(
+            image_tensor,
+            1,
+            GENERATION_PIPE.vae.config.z_dim,
+            height,
+            width,
+            NUM_FRAMES,
+            torch.float32,
+            DEVICE,
+            generator,
+            None,
+            None,
+        )
+
+        end_step = CHUNK_BOUNDARIES[0]
+        latents = _run_diffusion_steps(
+            latents,
+            condition,
+            prompt_embeds,
+            negative_prompt_embeds,
+            image_embeds,
+            timesteps,
+            0,
+            end_step,
+            transformer_dtype,
+        )
+
+        state = {
+            "prompt_embeds": prompt_embeds.detach().cpu(),
+            "negative_prompt_embeds": negative_prompt_embeds.detach().cpu(),
+            "image_embeds": image_embeds.detach().cpu(),
+            "condition": condition.detach().cpu(),
+            "latents": latents.detach().cpu(),
+            "step_idx": end_step,
+        }
     finally:
         GENERATION_PIPE.to("cpu")
-        log_cuda_mem("after latent generation")
-    return latents, resized_image, height, width
+        log_cuda_mem("end generate_latents_setup_on_gpu")
+    return state
+
+
+@spaces.GPU(duration=50)
+def generate_latents_chunk_on_gpu(state, end_step):
+    """Run denoising steps from state['step_idx'] to end_step. Only transformer is moved to GPU."""
+    log_cuda_mem(f"start latents chunk -> step {end_step}")
+    transformer = GENERATION_PIPE.transformer
+    transformer.to(DEVICE)
+    try:
+        GENERATION_PIPE.scheduler.set_timesteps(NUM_INFERENCE_STEPS, device=DEVICE)
+        timesteps = GENERATION_PIPE.scheduler.timesteps
+        transformer_dtype = transformer.dtype
+
+        latents = state["latents"].to(DEVICE)
+        condition = state["condition"].to(DEVICE)
+        prompt_embeds = state["prompt_embeds"].to(DEVICE)
+        negative_prompt_embeds = state["negative_prompt_embeds"].to(DEVICE)
+        image_embeds = state["image_embeds"].to(DEVICE)
+
+        latents = _run_diffusion_steps(
+            latents,
+            condition,
+            prompt_embeds,
+            negative_prompt_embeds,
+            image_embeds,
+            timesteps,
+            state["step_idx"],
+            end_step,
+            transformer_dtype,
+        )
+
+        state["latents"] = latents.detach().cpu()
+        state["step_idx"] = end_step
+    finally:
+        transformer.to("cpu")
+        log_cuda_mem(f"end latents chunk -> step {end_step}")
+    return state
 
 
 @spaces.GPU(duration=20)
@@ -635,10 +762,19 @@ def generate_and_decode(image, prompt, seed, progress=gr.Progress()):
     run_dir = OUTPUT_ROOT / f"refdecoder_demo_{uuid.uuid4().hex}"
     run_dir.mkdir(parents=True, exist_ok=True)
 
-    progress(0.0, desc="Generating latents")
+    num_chunks = len(CHUNK_BOUNDARIES)
+    progress(0.0, desc=f"Generating latents (1/{num_chunks})")
 
     t0 = time.perf_counter()
-    latents, resized_image, height, width = generate_latents_on_gpu(image, prompt, seed)
+    resized_image, height, width = resize_image_for_wan(image, GENERATION_PIPE)
+    state = generate_latents_setup_on_gpu(resized_image, prompt, seed, height, width)
+    for chunk_idx, end_step in enumerate(CHUNK_BOUNDARIES[1:], start=2):
+        progress(
+            0.8 * (chunk_idx - 1) / num_chunks,
+            desc=f"Generating latents ({chunk_idx}/{num_chunks})",
+        )
+        state = generate_latents_chunk_on_gpu(state, end_step)
+    latents = normalize_latent_shape(state["latents"])
    latent_secs = time.perf_counter() - t0
    print(f"[timing] latent generation: {latent_secs:.2f}s")
    reference_frame = build_reference_frame(resized_image, "cpu")
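
For context on the pattern this commit adopts: on ZeroGPU Spaces each `@spaces.GPU` call must finish within its `duration` budget, so the single 160-second denoising call is split into several ~50-second calls, with all state parked on CPU between leases. Below is a minimal, hypothetical sketch of that round-trip pattern, not code from app.py; `DummyDenoiser`-style globals, `run_steps`, and `BOUNDARIES` are stand-ins for the real transformer, `_run_diffusion_steps`, and `CHUNK_BOUNDARIES`.

import torch
import spaces  # provided by the Hugging Face `spaces` package on ZeroGPU

DEVICE = "cuda"
BOUNDARIES = (8, 16, 23, 30)  # cumulative step boundaries; last entry == total steps

model = torch.nn.Linear(16, 16)  # hypothetical stand-in for the transformer


def run_steps(latents, start, end):
    # Stand-in for the real denoising loop over steps [start, end).
    with torch.no_grad():
        for _ in range(start, end):
            latents = model(latents)
    return latents


@spaces.GPU(duration=50)
def run_chunk(state, end_step):
    # Each GPU lease: move module + tensors up, advance a few steps, ship back.
    model.to(DEVICE)
    try:
        latents = state["latents"].to(DEVICE)
        latents = run_steps(latents, state["step_idx"], end_step)
        state["latents"] = latents.detach().cpu()  # back to CPU before the lease ends
        state["step_idx"] = end_step
    finally:
        model.to("cpu")  # release GPU memory so the next lease starts clean
    return state


state = {"latents": torch.randn(1, 16), "step_idx": 0}
for end_step in BOUNDARIES:
    state = run_chunk(state, end_step)

The serialize-to-CPU step is what makes the chunks independent: since nothing CUDA-resident survives between calls, each `@spaces.GPU` invocation can land on a fresh device without stale references.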