xiangfan00 committed on
Commit
6f279fd
·
1 Parent(s): 3a3a124
Files changed (1) hide show
  1. app.py +24 -33
app.py CHANGED
@@ -113,7 +113,6 @@ def get_module_dtype(module):
113
 
114
 
115
  def load_generation_pipe():
116
- log_cuda_mem("before load_generation_pipe")
117
  image_encoder = CLIPVisionModel.from_pretrained(
118
  MODEL_ID,
119
  subfolder="image_encoder",
@@ -130,24 +129,20 @@ def load_generation_pipe():
130
  image_encoder=image_encoder,
131
  torch_dtype=PIPE_DTYPE,
132
  )
133
- log_cuda_mem("after load_generation_pipe")
134
  return pipe
135
 
136
 
137
  def load_wan_vae():
138
- log_cuda_mem("before load_wan_vae")
139
  vae = DiffusersWanVAE.from_pretrained(
140
  MODEL_ID,
141
  subfolder="vae",
142
  torch_dtype=PIPE_DTYPE,
143
  )
144
  vae.eval()
145
- log_cuda_mem("after load_wan_vae")
146
  return vae
147
 
148
 
149
  def load_refdecoder_module():
150
- log_cuda_mem("before load_refdecoder_module")
151
  vae = AutoencoderKLWan(
152
  dropout_p=0.0,
153
  use_reference=True,
@@ -175,7 +170,6 @@ def load_refdecoder_module():
175
  vae.load_state_dict(vae_sd, strict=False)
176
  transformer.load_state_dict(transformer_sd, strict=False)
177
 
178
- log_cuda_mem("after load_refdecoder_module")
179
  return vae, transformer
180
 
181
 
@@ -561,7 +555,11 @@ def decode_with_refdecoder(latents, reference_frame, vae, transformer):
561
  return video
562
 
563
 
564
- CHUNK_BOUNDARIES = (8, 16, 23, NUM_INFERENCE_STEPS)
 
 
 
 
565
  assert CHUNK_BOUNDARIES[-1] == NUM_INFERENCE_STEPS
566
 
567
 
@@ -608,12 +606,18 @@ def _run_diffusion_steps(
608
 
609
  @spaces.GPU(duration=50)
610
  def generate_latents_setup_on_gpu(resized_image, prompt, seed, height, width):
611
- """Encode prompt+image, prepare latents, run the first chunk of denoising steps.
612
 
613
- Returns a CPU-resident state dict consumable by generate_latents_chunk_on_gpu.
 
614
  """
615
  log_cuda_mem("start generate_latents_setup_on_gpu")
616
- GENERATION_PIPE.to(DEVICE)
 
 
 
 
 
617
  try:
618
  transformer_dtype = GENERATION_PIPE.transformer.dtype
619
 
@@ -631,9 +635,6 @@ def generate_latents_setup_on_gpu(resized_image, prompt, seed, height, width):
631
  image_embeds = GENERATION_PIPE.encode_image(resized_image, DEVICE)
632
  image_embeds = image_embeds.repeat(1, 1, 1).to(transformer_dtype)
633
 
634
- GENERATION_PIPE.scheduler.set_timesteps(NUM_INFERENCE_STEPS, device=DEVICE)
635
- timesteps = GENERATION_PIPE.scheduler.timesteps
636
-
637
  image_tensor = GENERATION_PIPE.video_processor.preprocess(
638
  resized_image, height=height, width=width
639
  ).to(DEVICE, dtype=torch.float32)
@@ -652,29 +653,18 @@ def generate_latents_setup_on_gpu(resized_image, prompt, seed, height, width):
652
  None,
653
  )
654
 
655
- end_step = CHUNK_BOUNDARIES[0]
656
- latents = _run_diffusion_steps(
657
- latents,
658
- condition,
659
- prompt_embeds,
660
- negative_prompt_embeds,
661
- image_embeds,
662
- timesteps,
663
- 0,
664
- end_step,
665
- transformer_dtype,
666
- )
667
-
668
  state = {
669
  "prompt_embeds": prompt_embeds.detach().cpu(),
670
  "negative_prompt_embeds": negative_prompt_embeds.detach().cpu(),
671
  "image_embeds": image_embeds.detach().cpu(),
672
  "condition": condition.detach().cpu(),
673
  "latents": latents.detach().cpu(),
674
- "step_idx": end_step,
675
  }
676
  finally:
677
- GENERATION_PIPE.to("cpu")
 
 
678
  log_cuda_mem("end generate_latents_setup_on_gpu")
679
  return state
680
 
@@ -762,16 +752,17 @@ def generate_and_decode(image, prompt, seed, progress=gr.Progress()):
762
  run_dir = OUTPUT_ROOT / f"refdecoder_demo_{uuid.uuid4().hex}"
763
  run_dir.mkdir(parents=True, exist_ok=True)
764
 
765
- num_chunks = len(CHUNK_BOUNDARIES)
766
- progress(0.0, desc=f"Generating latents (1/{num_chunks})")
 
767
 
768
  t0 = time.perf_counter()
769
  resized_image, height, width = resize_image_for_wan(image, GENERATION_PIPE)
770
  state = generate_latents_setup_on_gpu(resized_image, prompt, seed, height, width)
771
- for chunk_idx, end_step in enumerate(CHUNK_BOUNDARIES[1:], start=2):
772
  progress(
773
- 0.8 * (chunk_idx - 1) / num_chunks,
774
- desc=f"Generating latents ({chunk_idx}/{num_chunks})",
775
  )
776
  state = generate_latents_chunk_on_gpu(state, end_step)
777
  latents = normalize_latent_shape(state["latents"])
 
113
 
114
 
115
  def load_generation_pipe():
 
116
  image_encoder = CLIPVisionModel.from_pretrained(
117
  MODEL_ID,
118
  subfolder="image_encoder",
 
129
  image_encoder=image_encoder,
130
  torch_dtype=PIPE_DTYPE,
131
  )
 
132
  return pipe
133
 
134
 
135
def load_wan_vae():
    """Load the pretrained Wan VAE (``vae`` subfolder of MODEL_ID) in eval mode.

    Returns:
        The ``DiffusersWanVAE`` instance, cast to ``PIPE_DTYPE`` and switched
        to inference mode. Device placement is left to the caller.
    """
    model = DiffusersWanVAE.from_pretrained(
        MODEL_ID, subfolder="vae", torch_dtype=PIPE_DTYPE
    )
    # eval() disables dropout/batch-norm training behavior for inference.
    model.eval()
    return model
143
 
144
 
145
  def load_refdecoder_module():
 
146
  vae = AutoencoderKLWan(
147
  dropout_p=0.0,
148
  use_reference=True,
 
170
  vae.load_state_dict(vae_sd, strict=False)
171
  transformer.load_state_dict(transformer_sd, strict=False)
172
 
 
173
  return vae, transformer
174
 
175
 
 
555
  return video
556
 
557
 
558
# Number of pieces the denoising loop is split into — presumably so each piece
# fits within the @spaces.GPU duration budget (TODO confirm with the GPU decorators).
_NUM_DENOISING_CHUNKS = 4
# Cumulative end-step index for each chunk; floor division spreads the steps
# as evenly as possible across the chunks.
CHUNK_BOUNDARIES = tuple(
    NUM_INFERENCE_STEPS * chunk_end // _NUM_DENOISING_CHUNKS
    for chunk_end in range(1, _NUM_DENOISING_CHUNKS + 1)
)
# Sanity check: the last boundary must land exactly on the final step.
assert CHUNK_BOUNDARIES[-1] == NUM_INFERENCE_STEPS
564
 
565
 
 
606
 
607
  @spaces.GPU(duration=50)
608
  def generate_latents_setup_on_gpu(resized_image, prompt, seed, height, width):
609
+ """Encode prompt+image, prepare initial latents and condition. NO denoising.
610
 
611
+ Loads only the encoders + VAE to GPU (not the 14B transformer). Returns a
612
+ CPU-resident state dict consumable by generate_latents_chunk_on_gpu.
613
  """
614
  log_cuda_mem("start generate_latents_setup_on_gpu")
615
+ text_encoder = GENERATION_PIPE.text_encoder
616
+ image_encoder = GENERATION_PIPE.image_encoder
617
+ vae = GENERATION_PIPE.vae
618
+ text_encoder.to(DEVICE)
619
+ image_encoder.to(DEVICE)
620
+ vae.to(DEVICE)
621
  try:
622
  transformer_dtype = GENERATION_PIPE.transformer.dtype
623
 
 
635
  image_embeds = GENERATION_PIPE.encode_image(resized_image, DEVICE)
636
  image_embeds = image_embeds.repeat(1, 1, 1).to(transformer_dtype)
637
 
 
 
 
638
  image_tensor = GENERATION_PIPE.video_processor.preprocess(
639
  resized_image, height=height, width=width
640
  ).to(DEVICE, dtype=torch.float32)
 
653
  None,
654
  )
655
 
 
 
 
 
 
 
 
 
 
 
 
 
 
656
  state = {
657
  "prompt_embeds": prompt_embeds.detach().cpu(),
658
  "negative_prompt_embeds": negative_prompt_embeds.detach().cpu(),
659
  "image_embeds": image_embeds.detach().cpu(),
660
  "condition": condition.detach().cpu(),
661
  "latents": latents.detach().cpu(),
662
+ "step_idx": 0,
663
  }
664
  finally:
665
+ text_encoder.to("cpu")
666
+ image_encoder.to("cpu")
667
+ vae.to("cpu")
668
  log_cuda_mem("end generate_latents_setup_on_gpu")
669
  return state
670
 
 
752
  run_dir = OUTPUT_ROOT / f"refdecoder_demo_{uuid.uuid4().hex}"
753
  run_dir.mkdir(parents=True, exist_ok=True)
754
 
755
+ # 1 setup chunk (encoders + VAE) + len(CHUNK_BOUNDARIES) denoising chunks.
756
+ total_chunks = 1 + len(CHUNK_BOUNDARIES)
757
+ progress(0.0, desc=f"Generating latents (1/{total_chunks})")
758
 
759
  t0 = time.perf_counter()
760
  resized_image, height, width = resize_image_for_wan(image, GENERATION_PIPE)
761
  state = generate_latents_setup_on_gpu(resized_image, prompt, seed, height, width)
762
+ for chunk_idx, end_step in enumerate(CHUNK_BOUNDARIES, start=2):
763
  progress(
764
+ 0.8 * (chunk_idx - 1) / total_chunks,
765
+ desc=f"Generating latents ({chunk_idx}/{total_chunks})",
766
  )
767
  state = generate_latents_chunk_on_gpu(state, end_step)
768
  latents = normalize_latent_shape(state["latents"])