Spaces:

allenai
/

RefDecoder

Configuration error

App Files Files Community

Arrokothwhi committed 5 days ago

Commit

8db3b31

1 Parent(s): 05c70dd

add memory print

Browse files

Files changed (1) hide show

app.py +66 -45

app.py CHANGED Viewed

@@ -56,7 +56,25 @@ PIPE_DTYPE = torch.bfloat16 if DEVICE == "cuda" else torch.float32
 pipeline_wan_i2v.ftfy = ftfy
 def load_generation_pipe():
     image_encoder = CLIPVisionModel.from_pretrained(
         MODEL_ID,
         subfolder="image_encoder",
@@ -74,10 +92,12 @@ def load_generation_pipe():
         torch_dtype=PIPE_DTYPE,
     )
     pipe = pipe.to(DEVICE)
     return pipe
 def load_wan_vae():
     vae = DiffusersWanVAE.from_pretrained(
         MODEL_ID,
         subfolder="vae",
@@ -85,10 +105,12 @@ def load_wan_vae():
     )
     vae = vae.to(DEVICE)
     vae.eval()
     return vae
 def load_refdecoder_module():
     vae = AutoencoderKLWan(
         dropout_p=0.0,
         use_reference=True,
@@ -122,6 +144,7 @@ def load_refdecoder_module():
     vae = vae.to(DEVICE).eval()
     transformer = transformer.to(DEVICE).eval()
     return vae, transformer
@@ -207,6 +230,7 @@ def generate_and_decode(image, prompt, seed, progress=gr.Progress(track_tqdm=Fal
     prompt = prompt.strip() if prompt else ""
     seed = int(seed) if seed is not None else random.randint(0, 2**32 - 1)
     run_dir = Path(tempfile.mkdtemp(prefix="refdecoder_demo_"))
     progress(0.05, desc="Loading Wan I2V pipeline")
     pipe = load_generation_pipe()
@@ -231,11 +255,13 @@ def generate_and_decode(image, prompt, seed, progress=gr.Progress(track_tqdm=Fal
             output_type="latent",
         )
     latents = normalize_latent_shape(output.frames).detach().cpu()
     del output
     del pipe
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
     gc.collect()
     latent_path = run_dir / "wan_latents.pt"
     torch.save(
@@ -253,16 +279,19 @@ def generate_and_decode(image, prompt, seed, progress=gr.Progress(track_tqdm=Fal
     wan_vae = load_wan_vae()
     wan_video = decode_with_wan_vae(latents, wan_vae)
     wan_video_path = save_video_tensor(wan_video, run_dir / "wan_vae.mp4")
     del wan_video
     del wan_vae
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
     gc.collect()
     progress(0.82, desc="Decoding with RefDecoder")
     ref_vae, ref_transformer = load_refdecoder_module()
     ref_video = decode_with_refdecoder(latents, reference_frame, ref_vae, ref_transformer)
     ref_video_path = save_video_tensor(ref_video, run_dir / "refdecoder.mp4")
     del ref_video
     del ref_vae
     del ref_transformer
@@ -270,16 +299,10 @@ def generate_and_decode(image, prompt, seed, progress=gr.Progress(track_tqdm=Fal
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
     gc.collect()
-    status = (
-        f"Seed: {seed}\n"
-        f"Prompt: {prompt if prompt else '(empty)'}\n"
-        f"Resolution: {width}x{height}\n"
-        f"Frames: {NUM_FRAMES}\n"
-        f"Latents: {tuple(latents.shape)}"
-    )
     progress(1.0, desc="Done")
-    return wan_video_path, ref_video_path, status
 CUSTOM_CSS = """
@@ -292,7 +315,7 @@ CUSTOM_CSS = """
     --accent: #1f6a52;
     --accent-2: #c96f42;
     --text-main: #201a14;
-    --text-soft: #5c5348;
     --copy-font: "Fraunces", "Iowan Old Style", "Palatino Linotype", serif;
 }
@@ -392,6 +415,10 @@ CUSTOM_CSS = """
 #generate-btn:hover {
     filter: brightness(1.04);
 }
 """
@@ -422,11 +449,11 @@ with gr.Blocks(title="RefDecoder I2V Demo", theme=gr.themes.Soft(), css=CUSTOM_C
                 image_input = gr.Image(
                     label="Reference Image",
                     type="pil",
-                    height=420,
                 )
                 prompt_input = gr.Textbox(
                     label="Motion Prompt",
-                    lines=5,
                     placeholder="A woman turns toward the camera as her hair moves in the wind...",
                 )
                 seed_input = gr.Number(
@@ -441,44 +468,38 @@ with gr.Blocks(title="RefDecoder I2V Demo", theme=gr.themes.Soft(), css=CUSTOM_C
                     elem_id="generate-btn",
                 )
-            with gr.Column(scale=6):
-                with gr.Column(elem_classes="panel-card"):
-                    gr.HTML(
-                        """
-                        <div class="section-title">Run Info</div>
-                        <div class="section-copy">
-                            Generation details for the current comparison run.
-                        </div>
-                        """
-                    )
-                    status_output = gr.Textbox(
-                        label="Run Info",
-                        lines=7,
-                        interactive=False,
-                    )
-                with gr.Column(elem_classes="output-card"):
-                    gr.HTML(
-                        """
-                        <div class="section-title">Wan Baseline</div>
-                        <div class="section-copy">Decoded with Wan2.1's original VAE.</div>
-                        """
-                    )
-                    wan_video_output = gr.Video(label="Wan VAE Decode", height=260)
-                with gr.Column(elem_classes="output-card"):
-                    gr.HTML(
-                        """
-                        <div class="section-title">RefDecoder Result</div>
-                        <div class="section-copy">Decoded with the custom RefDecoder checkpoint.</div>
-                        """
-                    )
-                    ref_video_output = gr.Video(label="RefDecoder Decode", height=260)
         run_button.click(
             fn=generate_and_decode,
             inputs=[image_input, prompt_input, seed_input],
-            outputs=[wan_video_output, ref_video_output, status_output],
         )

 pipeline_wan_i2v.ftfy = ftfy
+def log_cuda_mem(tag):
+    if not torch.cuda.is_available():
+        print(f"[mem] {tag}: CUDA not available")
+        return
+    free_bytes, total_bytes = torch.cuda.mem_get_info()
+    allocated_bytes = torch.cuda.memory_allocated()
+    reserved_bytes = torch.cuda.memory_reserved()
+    print(
+        f"[mem] {tag}: "
+        f"free={free_bytes / 1024**3:.2f} GB, "
+        f"total={total_bytes / 1024**3:.2f} GB, "
+        f"allocated={allocated_bytes / 1024**3:.2f} GB, "
+        f"reserved={reserved_bytes / 1024**3:.2f} GB"
+    )
 def load_generation_pipe():
+    log_cuda_mem("before load_generation_pipe")
     image_encoder = CLIPVisionModel.from_pretrained(
         MODEL_ID,
         subfolder="image_encoder",
         torch_dtype=PIPE_DTYPE,
     )
     pipe = pipe.to(DEVICE)
+    log_cuda_mem("after load_generation_pipe")
     return pipe
 def load_wan_vae():
+    log_cuda_mem("before load_wan_vae")
     vae = DiffusersWanVAE.from_pretrained(
         MODEL_ID,
         subfolder="vae",
     )
     vae = vae.to(DEVICE)
     vae.eval()
+    log_cuda_mem("after load_wan_vae")
     return vae
 def load_refdecoder_module():
+    log_cuda_mem("before load_refdecoder_module")
     vae = AutoencoderKLWan(
         dropout_p=0.0,
         use_reference=True,
     vae = vae.to(DEVICE).eval()
     transformer = transformer.to(DEVICE).eval()
+    log_cuda_mem("after load_refdecoder_module")
     return vae, transformer
     prompt = prompt.strip() if prompt else ""
     seed = int(seed) if seed is not None else random.randint(0, 2**32 - 1)
     run_dir = Path(tempfile.mkdtemp(prefix="refdecoder_demo_"))
+    log_cuda_mem("start generate_and_decode")
     progress(0.05, desc="Loading Wan I2V pipeline")
     pipe = load_generation_pipe()
             output_type="latent",
         )
     latents = normalize_latent_shape(output.frames).detach().cpu()
+    log_cuda_mem("after latent generation")
     del output
     del pipe
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
     gc.collect()
+    log_cuda_mem("after freeing generation pipe")
     latent_path = run_dir / "wan_latents.pt"
     torch.save(
     wan_vae = load_wan_vae()
     wan_video = decode_with_wan_vae(latents, wan_vae)
     wan_video_path = save_video_tensor(wan_video, run_dir / "wan_vae.mp4")
+    log_cuda_mem("after wan decode")
     del wan_video
     del wan_vae
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
     gc.collect()
+    log_cuda_mem("after freeing wan vae")
     progress(0.82, desc="Decoding with RefDecoder")
     ref_vae, ref_transformer = load_refdecoder_module()
     ref_video = decode_with_refdecoder(latents, reference_frame, ref_vae, ref_transformer)
     ref_video_path = save_video_tensor(ref_video, run_dir / "refdecoder.mp4")
+    log_cuda_mem("after refdecoder decode")
     del ref_video
     del ref_vae
     del ref_transformer
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
     gc.collect()
+    log_cuda_mem("after freeing refdecoder")
     progress(1.0, desc="Done")
+    return wan_video_path, ref_video_path
 CUSTOM_CSS = """
     --accent: #1f6a52;
     --accent-2: #c96f42;
     --text-main: #201a14;
+    --text-soft: #201a14;
     --copy-font: "Fraunces", "Iowan Old Style", "Palatino Linotype", serif;
 }
 #generate-btn:hover {
     filter: brightness(1.04);
 }
+.output-grid {
+    gap: 14px;
+}
 """
                 image_input = gr.Image(
                     label="Reference Image",
                     type="pil",
+                    height=320,
                 )
                 prompt_input = gr.Textbox(
                     label="Motion Prompt",
+                    lines=4,
                     placeholder="A woman turns toward the camera as her hair moves in the wind...",
                 )
                 seed_input = gr.Number(
                     elem_id="generate-btn",
                 )
+            with gr.Column(scale=6, elem_classes="panel-card"):
+                gr.HTML(
+                    """
+                    <div class="section-title">Decoder Comparison</div>
+                    <div class="section-copy">
+                        Same Wan latent video, rendered with two different decoders.
+                    </div>
+                    """
+                )
+                with gr.Row(equal_height=True, elem_classes="output-grid"):
+                    with gr.Column(elem_classes="output-card"):
+                        gr.HTML(
+                            """
+                            <div class="section-title">Wan Baseline</div>
+                            <div class="section-copy">Decoded with Wan2.1's original VAE.</div>
+                            """
+                        )
+                        wan_video_output = gr.Video(label="Wan VAE Decode", height=250)
+                    with gr.Column(elem_classes="output-card"):
+                        gr.HTML(
+                            """
+                            <div class="section-title">RefDecoder Result</div>
+                            <div class="section-copy">Decoded with the custom RefDecoder checkpoint.</div>
+                            """
+                        )
+                        ref_video_output = gr.Video(label="RefDecoder Decode", height=250)
         run_button.click(
             fn=generate_and_decode,
             inputs=[image_input, prompt_input, seed_input],
+            outputs=[wan_video_output, ref_video_output],
         )