Spaces:

tedlasai
/

learn2refocus

Sleeping

App Files Files Community

tedlasai committed on Dec 25, 2025

Commit

cc63be8

1 Parent(s): c8862b3

updating app

Browse files

Files changed (3) hide show

app.py +23 -26
simplified_inference.py +72 -8
simplified_validation.py +0 -108

app.py CHANGED Viewed

@@ -5,7 +5,6 @@ import argparse
 import gradio as gr
 from PIL import Image
-import skvideo
 from diffusers.utils import export_to_video
 from inference import load_model, inference_on_image
@@ -14,20 +13,17 @@ from inference import load_model, inference_on_image
 # 1. Load model
 # -----------------------
 args = argparse.Namespace()
-args.blur2vid_hf_repo_path = "tedlasai/blur2vid"
-args.pretrained_model_path = "THUDM/CogVideoX-2b"
-args.model_config_path = "training/configs/outsidephotos.yaml"
-args.video_width = 1280
-args.video_height = 720
-args.seed = None
 pipe, model_config = load_model(args)
-OUTPUT_DIR = Path("/tmp/generated_videos")
 OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
-def generate_video_from_image(image: Image.Image, interval_key: str, num_inference_steps: int) -> str:
     """
     Wrapper for Gradio. Takes an image and returns a video path.
     """
@@ -60,16 +56,15 @@ def generate_video_from_image(image: Image.Image, interval_key: str, num_inferen
 with gr.Blocks(css="footer {visibility: hidden}") as demo:
     gr.Markdown(
         """
-        # 🖼️ ➜ 🎬 Recover Motion from a Blurry Image
-        This demo accompanies the paper **“Generating the Past, Present, and Future from a Motion-Blurred Image”**
-        by Tedla *et al.*, ACM Transactions on Graphics (SIGGRAPH Asia 2025).
-        - 🌐 **Project page:** <https://blur2vid.github.io/>
-        - 💻 **Code:** <https://github.com/tedlasai/blur2vid/>
-        Upload a blurry image and the model will generate a short video showing the recovered motion based on your selection.
-        Note: The image will be resized to 1280×720. We recommend uploading landscape-oriented images.
         """
     )
@@ -82,35 +77,37 @@ with gr.Blocks(css="footer {visibility: hidden}") as demo:
             )
             with gr.Row():
-                tense_choice = gr.Radio(
-                    label="Select the interval to be generated:",
-                    choices=["present", "past, present and future"],
-                    value="past, present and future",
                     interactive=True,
                 )
             num_inference_steps = gr.Slider(
                 label="Number of inference steps",
                 minimum=4,
-                maximum=50,
                 step=1,
-                value=20,
                 info="More steps = better quality but slower",
             )
-            generate_btn = gr.Button("Generate video", variant="primary")
         with gr.Column():
             video_out = gr.Video(
-                label="Generated video",
                 format="mp4",
                 autoplay=True,
                 loop=True,
             )
     generate_btn.click(
-        fn=generate_video_from_image,
-        inputs=[image_in, tense_choice, num_inference_steps],
         outputs=video_out,
         api_name="predict",
     )

 import gradio as gr
 from PIL import Image
 from diffusers.utils import export_to_video
 from inference import load_model, inference_on_image
 # 1. Load model
 # -----------------------
 args = argparse.Namespace()
+args.blur2vid_hf_repo_path = "tedlasai/learn2refocus"
+args.pretrained_model_path = "stabilityai/stable-video-diffusion-img2vid"
+args.seed = 0
 pipe, model_config = load_model(args)
+OUTPUT_DIR = Path("/tmp/output_stacks")
 OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+def generate_vstack_from_image(image: Image.Image, input_focal_position: int, num_inference_steps: int) -> str:
     """
     Wrapper for Gradio. Takes an image and returns a video path.
     """
 with gr.Blocks(css="footer {visibility: hidden}") as demo:
     gr.Markdown(
         """
+        # 🖼️ ➜ 🎬 Generate Focal Stacks from a Single Image
+        This demo accompanies the paper **“Learning to Refocus with Video Diffusion Models”**
+        by Tedla *et al.*, SIGGRAPH Asia 2025.
+        - 🌐 **Project page:** <https://learn2refocus.github.io/>
+        - 💻 **Code:** <https://github.com/tedlasai/learn2refocus/>
+        Upload an image and specify the input focal position (Near - 5cm, Far - Infinity). Then click "Generate stack" to generate a focal stack.
         """
     )
             )
             with gr.Row():
+                input_focal_position = gr.Slider(
+                    label="Input focal position (Near - 5cm, Far - Infinity):",
+                    minimum=0,
+                    maximum=8,
+                    step=1,
+                    value=4,
                     interactive=True,
                 )
             num_inference_steps = gr.Slider(
                 label="Number of inference steps",
                 minimum=4,
+                maximum=25,
                 step=1,
+                value=25,
                 info="More steps = better quality but slower",
             )
+            generate_btn = gr.Button("Generate stack", variant="primary")
         with gr.Column():
             video_out = gr.Video(
+                label="Generated stack",
                 format="mp4",
                 autoplay=True,
                 loop=True,
             )
     generate_btn.click(
+        fn=generate_vstack_from_image,
+        inputs=[image_in, input_focal_position, num_inference_steps],
         outputs=video_out,
         api_name="predict",
     )

simplified_inference.py CHANGED Viewed

@@ -18,20 +18,20 @@
 import math
 import os
-from torch.utils.data import Dataset
-import accelerate
 import numpy as np
 import torch
-import torch.nn.functional as F
 import torch.utils.checkpoint
 from accelerate.logging import get_logger
 from accelerate.utils import set_seed
-from packaging import version
 from tqdm.auto import tqdm
 from transformers import CLIPVisionModelWithProjection
-from simplified_validation import valid_net
 from diffusers import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel
 from diffusers.utils import check_min_version
 import argparse
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
 check_min_version("0.24.0.dev0")
@@ -40,8 +40,6 @@ logger = get_logger(__name__, log_level="INFO")
 import numpy as np
 import torch
 import os
-import glob
 def parse_args():
@@ -150,6 +148,68 @@ def convert_to_batch(image, input_focal_position, sample_frames=9):
         name = os.path.splitext(os.path.basename(scene))[0]
         return {"pixel_values": pixels, "focal_stack_num": focal_stack_num, "original_pixel_values": original_pixels, 'icc_profile': icc_profile, "name": name}
 def main():
     args = parse_args()
@@ -182,7 +242,11 @@ def main():
     unet.eval(); image_encoder.eval(); vae.eval()
     with torch.no_grad():
-        valid_net(args, batch, unet, image_encoder, vae, 0, weight_dtype, device, num_inference_steps=args.num_inference_steps)
 if __name__ == "__main__":
     main()

 import math
 import os
 import numpy as np
 import torch
 import torch.utils.checkpoint
 from accelerate.logging import get_logger
 from accelerate.utils import set_seed
 from tqdm.auto import tqdm
 from transformers import CLIPVisionModelWithProjection
 from diffusers import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel
 from diffusers.utils import check_min_version
+from simplified_pipeline import StableVideoDiffusionPipeline
+import videoio
+from PIL import Image
 import argparse
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
 check_min_version("0.24.0.dev0")
 import numpy as np
 import torch
 import os
 def parse_args():
         name = os.path.splitext(os.path.basename(scene))[0]
         return {"pixel_values": pixels, "focal_stack_num": focal_stack_num, "original_pixel_values": original_pixels, 'icc_profile': icc_profile, "name": name}
+def inference_on_image(args, batch, unet, image_encoder, vae, global_step, weight_dtype, device):
+    """Generate a focal stack for one batch and return it as normalized frames.
+
+    Args:
+        args: Namespace; ``pretrained_model_path`` and ``num_inference_steps`` are read.
+        batch: Mapping with ``pixel_values`` (indexed as 5-D, height and width in the
+            last two axes) and ``focal_stack_num``. ``original_pixel_values`` sets the
+            output size when present.
+        unet: UNet handed to the pipeline; switched to eval mode here.
+        image_encoder: Image encoder handed to the pipeline.
+        vae: VAE handed to the pipeline.
+        global_step: Unused; kept so the signature stays compatible with callers.
+        weight_dtype: Forwarded to the pipeline as ``torch_dtype``.
+        device: Device that ``pixel_values`` is moved to.
+
+    Returns:
+        ``(frames, focal_stack_num)``. ``frames`` is clamped to [0, 1] and resized to an
+        even height and width; presumably laid out as (frame, channel, height, width)
+        -- TODO confirm against the pipeline's output layout.
+    """
+    pipeline = StableVideoDiffusionPipeline.from_pretrained(
+        args.pretrained_model_path,
+        unet=unet,
+        image_encoder=image_encoder,
+        vae=vae,
+        torch_dtype=weight_dtype,
+    )
+    pipeline.set_progress_bar_config(disable=True)
+    num_frames = 9
+    unet.eval()
+    pixel_values = batch["pixel_values"].to(device)
+    focal_stack_num = batch["focal_stack_num"]
+    # run inference
+    svd_output, _ = pipeline(
+        pixel_values,
+        height=pixel_values.shape[3],
+        width=pixel_values.shape[4],
+        num_frames=num_frames,
+        decode_chunk_size=8,
+        motion_bucket_id=0,
+        min_guidance_scale=1.5,
+        max_guidance_scale=1.5,
+        fps=7,
+        noise_aug_strength=0,
+        focal_stack_num=focal_stack_num,
+        num_inference_steps=args.num_inference_steps,
+    )
+    video_frames = svd_output.frames[0]
+    # Rescale with x * 0.5 + 0.5 and clamp to [0, 1] (assumes the pipeline emits [-1, 1]).
+    video_frames_normalized = torch.clamp(video_frames * 0.5 + 0.5, 0, 1)
+    # Swap the first two axes so that frames lead, as the interpolate call and writer expect.
+    video_frames_normalized = video_frames_normalized.permute(1, 0, 2, 3)
+    # The resize target has to come from the spatial (last two) axes. Indexing shape[2] and
+    # shape[3] on the 5-D pixel_values picks a non-spatial axis as the height, so take the
+    # size from the original-resolution image when the batch carries one, otherwise from
+    # the model input.
+    if "original_pixel_values" in batch:
+        size_reference = batch["original_pixel_values"]
+    else:
+        size_reference = pixel_values
+    # Round both sides down to an even number of pixels before resizing.
+    target_size = ((size_reference.shape[-2] // 2) * 2, (size_reference.shape[-1] // 2) * 2)
+    video_frames_normalized = torch.nn.functional.interpolate(video_frames_normalized, target_size, mode='bilinear')
+    return video_frames_normalized, focal_stack_num
+def write_output(output_dir, frames, focal_stack_num, icc_profile):
+    """Save a generated stack as ``stack.mp4`` plus one PNG per frame.
+
+    Args:
+        output_dir: Directory to write into; created if it does not exist.
+        frames: Tensor indexed as (frame, channel, height, width); values are scaled
+            by 255 and cast to uint8, so they are presumably in [0, 1].
+        focal_stack_num: Unused; kept so the signature stays compatible with callers.
+        icc_profile: ICC profile attached to every PNG, or the string ``"none"`` to skip it.
+    """
+    print("Validation images will be saved to ", output_dir)
+    os.makedirs(output_dir, exist_ok=True)
+    # Move the stack to the host once, channels-last, and reuse it for video and stills.
+    frames_hwc = frames.permute(0, 2, 3, 1).cpu().numpy()
+    videoio.videosave(os.path.join(output_dir, "stack.mp4"), frames_hwc, fps=5)
+    #save images
+    # Iterate over the frames actually present instead of assuming exactly nine.
+    for i, frame in enumerate(frames_hwc):
+        #use Pillow to save images
+        img = Image.fromarray((frame * 255).astype(np.uint8))
+        if icc_profile != "none":
+            img.info['icc_profile'] = icc_profile
+        img.save(os.path.join(output_dir, f"frame_{i}.png"))
 def main():
     args = parse_args()
     unet.eval(); image_encoder.eval(); vae.eval()
     with torch.no_grad():
+        output_frames, focal_stack_num = inference_on_image(args, batch, unet, image_encoder, vae, 0, weight_dtype, device)
+        val_save_dir = os.path.join(args.output_dir, "validation_images", batch['name'])
+        write_output(val_save_dir, output_frames, focal_stack_num, batch['icc_profile'])
 if __name__ == "__main__":
     main()

simplified_validation.py DELETED Viewed

@@ -1,108 +0,0 @@
-from simplified_pipeline import StableVideoDiffusionPipeline
-import os
-import torch
-import numpy as np
-import videoio
-import matplotlib.image
-from PIL import Image
-def valid_net(args, batch, unet, image_encoder, vae, global_step, weight_dtype, device):
-    # The models need unwrapping because for compatibility in distributed training mode.
-    pipeline = StableVideoDiffusionPipeline.from_pretrained(
-        args.pretrained_model_path,
-        unet=unet,
-        image_encoder=image_encoder,
-        vae=vae,
-        torch_dtype=weight_dtype,
-    )
-    pipeline.set_progress_bar_config(disable=True)
-    # run inference
-    val_save_dir = os.path.join(
-        args.output_dir, "validation_images")
-    print("Validation images will be saved to ", val_save_dir)
-    os.makedirs(val_save_dir, exist_ok=True)
-    num_frames = 9
-    unet.eval()
-    #clear gradients (the torch no grad is the magic that makes this work)
-    with torch.no_grad():
-        torch.cuda.empty_cache()
-    pixel_values = batch["pixel_values"].to(device)
-    original_pixel_values = batch['original_pixel_values'].to(device)
-    focal_stack_num = batch["focal_stack_num"]
-    svd_output, gt_frames = pipeline(
-        pixel_values,
-        height=pixel_values.shape[3],
-        width=pixel_values.shape[4],
-        num_frames=num_frames,
-        decode_chunk_size=8,
-        motion_bucket_id=0,
-        min_guidance_scale=1.5,
-        max_guidance_scale=1.5,
-        fps=7,
-        noise_aug_strength=0,
-        focal_stack_num = focal_stack_num,
-        num_inference_steps=args.num_inference_steps,
-    )
-    video_frames = svd_output.frames[0]
-    gt_frames = gt_frames[0]
-    with torch.no_grad():
-        if len(original_pixel_values.shape) == 5:
-            pixel_values = original_pixel_values[0] #assuming batch size is 1
-        else:
-            pixel_values = original_pixel_values.repeat(num_frames, 1, 1, 1)
-        pixel_values_normalized = pixel_values*0.5 + 0.5
-        pixel_values_normalized = torch.clamp(pixel_values_normalized,0,1)
-        video_frames_normalized = video_frames*0.5 + 0.5
-        video_frames_normalized = torch.clamp(video_frames_normalized,0,1)
-        video_frames_normalized = video_frames_normalized.permute(1,0,2,3)
-        gt_frames = torch.clamp(gt_frames,0,1)
-        gt_frames = gt_frames.permute(1,0,2,3)
-        #RESIZE images
-        video_frames_normalized = torch.nn.functional.interpolate(video_frames_normalized, ((pixel_values.shape[2]//2)*2, (pixel_values.shape[3]//2)*2), mode='bilinear')
-        gt_frames = torch.nn.functional.interpolate(gt_frames, ((pixel_values.shape[2]//2)*2, (pixel_values.shape[3]//2)*2), mode='bilinear')
-        pixel_values_normalized = torch.nn.functional.interpolate(pixel_values_normalized, ((pixel_values.shape[2]//2)*2, (pixel_values.shape[3]//2)*2), mode='bilinear')
-        os.makedirs(os.path.join(val_save_dir, f"position_{focal_stack_num}/videos"), exist_ok=True)
-        videoio.videosave(os.path.join(
-            val_save_dir,
-            f"position_{focal_stack_num}/videos/{batch['name']}.mp4",
-        ), video_frames_normalized.permute(0,2,3,1).cpu().numpy(), fps=5)
-        #save images
-        os.makedirs(os.path.join(val_save_dir, f"position_{focal_stack_num}/images"), exist_ok=True)
-        for i in range(num_frames):
-            #use Pillow to save images
-            img = Image.fromarray((video_frames_normalized[i].permute(1,2,0).cpu().numpy()*255).astype(np.uint8))
-            #use index to assign icc profile to img
-            if batch['icc_profile'] != "none":
-                img.info['icc_profile'] = batch['icc_profile']
-            path = os.path.join(val_save_dir, f"position_{focal_stack_num}/images/{batch['name']}_frame_{i}.png")
-            print("Saving image to ", path)
-            img.save(os.path.join(val_save_dir, f"position_{focal_stack_num}/images/{batch['name']}_frame_{i}.png"))
-    del video_frames