ConsistI2V

Paused

App Files Files Community

wren93 committed on Mar 26, 2024

Commit

414f9e8

1 Parent(s): defcb9b

update demo

Browse files

Files changed (1) hide show

app.py +180 -158

app.py CHANGED Viewed

@@ -33,171 +33,194 @@ css = """
 }
 """
-class AnimateController:
-    def __init__(self):
-        # config dirs
-        self.basedir        = os.getcwd()
-        self.savedir        = os.path.join(self.basedir, "samples/Gradio", datetime.now().strftime("%Y-%m-%dT%H-%M-%S"))
-        self.savedir_sample = os.path.join(self.savedir, "sample")
-        os.makedirs(self.savedir, exist_ok=True)
-        self.image_resolution = (256, 256)
-        # config models
-        self.pipeline = ConditionalAnimationPipeline.from_pretrained("TIGER-Lab/ConsistI2V")
-        self.pipeline.to("cuda")
-    def update_textbox_and_save_image(self, input_image, height_slider, width_slider, center_crop):
-        pil_image = Image.fromarray(input_image.astype(np.uint8)).convert("RGB")
-        img_path = os.path.join(self.savedir, "input_image.png")
-        pil_image.save(img_path)
-        self.image_resolution = pil_image.size
-        original_width, original_height = pil_image.size
-        if center_crop:
-            crop_aspect_ratio = width_slider / height_slider
-            aspect_ratio = original_width / original_height
-            if aspect_ratio > crop_aspect_ratio:
-                new_width = int(crop_aspect_ratio * original_height)
-                left = (original_width - new_width) / 2
-                top = 0
-                right = left + new_width
-                bottom = original_height
-                pil_image = pil_image.crop((left, top, right, bottom))
-            elif aspect_ratio < crop_aspect_ratio:
-                new_height = int(original_width / crop_aspect_ratio)
-                top = (original_height - new_height) / 2
-                left = 0
-                right = original_width
-                bottom = top + new_height
-                pil_image = pil_image.crop((left, top, right, bottom))
-        pil_image = pil_image.resize((width_slider, height_slider))
-        return gr.Textbox(value=img_path), gr.Image(value=np.array(pil_image))
-    @spaces.GPU
-    def animate(
-        self,
-        prompt_textbox,
         negative_prompt_textbox,
-        input_image_path,
-        sampler_dropdown,
-        sample_step_slider,
-        width_slider,
-        height_slider,
         txt_cfg_scale_slider,
         img_cfg_scale_slider,
-        center_crop,
         frame_stride,
         use_frameinit,
         frame_init_noise_level,
-        seed_textbox
-    ):
-        if self.pipeline is None:
-            raise gr.Error(f"Please select a pretrained pipeline path.")
-        if input_image_path == "":
-            raise gr.Error(f"Please upload an input image.")
-        if (not center_crop) and (width_slider % 8 != 0 or height_slider % 8 != 0):
-            raise gr.Error(f"`height` and `width` have to be divisible by 8 but are {height_slider} and {width_slider}.")
-        if center_crop and (width_slider % 8 != 0 or height_slider % 8 != 0):
-            raise gr.Error(f"`height` and `width` (after cropping) have to be divisible by 8 but are {height_slider} and {width_slider}.")
-        if is_xformers_available() and int(torch.__version__.split(".")[0]) < 2: self.pipeline.unet.enable_xformers_memory_efficient_attention()
-        if seed_textbox != -1 and seed_textbox != "": torch.manual_seed(int(seed_textbox))
-        else: torch.seed()
-        seed = torch.initial_seed()
-        if input_image_path.startswith("http://") or input_image_path.startswith("https://"):
-            first_frame = Image.open(requests.get(input_image_path, stream=True).raw).convert('RGB')
-        else:
-            first_frame = Image.open(input_image_path).convert('RGB')
-        original_width, original_height = first_frame.size
-        if not center_crop:
-            img_transform = T.Compose([
-                T.ToTensor(),
-                T.Resize((height_slider, width_slider), antialias=None),
-                T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
-            ])
-        else:
-            aspect_ratio = original_width / original_height
-            crop_aspect_ratio = width_slider / height_slider
-            if aspect_ratio > crop_aspect_ratio:
-                center_crop_width = int(crop_aspect_ratio * original_height)
-                center_crop_height = original_height
-            elif aspect_ratio < crop_aspect_ratio:
-                center_crop_width = original_width
-                center_crop_height = int(original_width / crop_aspect_ratio)
-            else:
-                center_crop_width = original_width
-                center_crop_height = original_height
-            img_transform = T.Compose([
-                T.ToTensor(),
-                T.CenterCrop((center_crop_height, center_crop_width)),
-                T.Resize((height_slider, width_slider), antialias=None),
-                T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
-            ])
-        first_frame = img_transform(first_frame).unsqueeze(0)
-        first_frame = first_frame.to("cuda")
-        print("first_frame", first_frame.device)
-        if use_frameinit:
-            self.pipeline.init_filter(
-                width         = width_slider,
-                height        = height_slider,
-                video_length  = 16,
-                filter_params = OmegaConf.create({'method': 'gaussian', 'd_s': 0.25, 'd_t': 0.25,})
-            )
-        sample = self.pipeline(
-            prompt_textbox,
-            negative_prompt       = negative_prompt_textbox,
-            first_frames          = first_frame,
-            num_inference_steps   = sample_step_slider,
-            guidance_scale_txt    = txt_cfg_scale_slider,
-            guidance_scale_img    = img_cfg_scale_slider,
-            width                 = width_slider,
-            height                = height_slider,
-            video_length          = 16,
-            noise_sampling_method = "pyoco_mixed",
-            noise_alpha           = 1.0,
-            frame_stride          = frame_stride,
-            use_frameinit         = use_frameinit,
-            frameinit_noise_level = frame_init_noise_level,
-            camera_motion         = None,
-        ).videos
-        global sample_idx
-        sample_idx += 1
-        save_sample_path = os.path.join(self.savedir_sample, f"{sample_idx}.mp4")
-        save_videos_grid(sample, save_sample_path, format="mp4")
-        sample_config = {
-            "prompt": prompt_textbox,
-            "n_prompt": negative_prompt_textbox,
-            "first_frame_path": input_image_path,
-            "sampler": sampler_dropdown,
-            "num_inference_steps": sample_step_slider,
-            "guidance_scale_text": txt_cfg_scale_slider,
-            "guidance_scale_image": img_cfg_scale_slider,
-            "width": width_slider,
-            "height": height_slider,
-            "video_length": 8,
-            "seed": seed
-        }
-        json_str = json.dumps(sample_config, indent=4)
-        with open(os.path.join(self.savedir, "logs.json"), "a") as f:
-            f.write(json_str)
-            f.write("\n\n")
-        return gr.Video(value=save_sample_path)
-controller = AnimateController()
 def ui():
@@ -257,7 +280,7 @@ def ui():
                     with gr.Row():
                         input_image = gr.Image(label="Input Image", interactive=True)
-                        input_image.upload(fn=controller.update_textbox_and_save_image, inputs=[input_image, height_slider, width_slider, center_crop], outputs=[input_image_path, input_image])
                         result_video = gr.Video(label="Generated Animation", interactive=False, autoplay=True)
             def update_and_resize_image(input_image_path, height_slider, width_slider, center_crop):
@@ -265,7 +288,6 @@ def ui():
                     pil_image = Image.open(requests.get(input_image_path, stream=True).raw).convert('RGB')
                 else:
                     pil_image = Image.open(input_image_path).convert('RGB')
-                controller.image_resolution = pil_image.size
                 original_width, original_height = pil_image.size
                 if center_crop:
@@ -293,7 +315,7 @@ def ui():
             input_image_path.submit(fn=update_and_resize_image, inputs=[input_image_path, height_slider, width_slider, center_crop], outputs=[input_image])
             generate_button.click(
-                fn=controller.animate,
                 inputs=[
                     prompt_textbox,
                     negative_prompt_textbox,

 }
 """
+# Output locations. Each app launch gets its own timestamped folder under
+# <cwd>/samples/Gradio; generated videos are written to its "sample" subfolder.
+# Only `savedir` itself is created here.
+basedir        = os.getcwd()
+savedir        = os.path.join(basedir, "samples/Gradio", datetime.now().strftime("%Y-%m-%dT%H-%M-%S"))
+savedir_sample = os.path.join(savedir, "sample")
+os.makedirs(savedir, exist_ok=True)
+# Load the ConsistI2V pipeline once at import time, requesting float16 weights via
+# `torch_dtype`, and move it to the GPU. The handlers below use this module-level instance.
+pipeline = ConditionalAnimationPipeline.from_pretrained("TIGER-Lab/ConsistI2V", torch_dtype=torch.float16,)
+pipeline.to("cuda")
+def update_textbox_and_save_image(input_image, height_slider, width_slider, center_crop):
+    pil_image = Image.fromarray(input_image.astype(np.uint8)).convert("RGB")
+    img_path = os.path.join(savedir, "input_image.png")
+    pil_image.save(img_path)
+    original_width, original_height = pil_image.size
+    if center_crop:
+        crop_aspect_ratio = width_slider / height_slider
+        aspect_ratio = original_width / original_height
+        if aspect_ratio > crop_aspect_ratio:
+            new_width = int(crop_aspect_ratio * original_height)
+            left = (original_width - new_width) / 2
+            top = 0
+            right = left + new_width
+            bottom = original_height
+            pil_image = pil_image.crop((left, top, right, bottom))
+        elif aspect_ratio < crop_aspect_ratio:
+            new_height = int(original_width / crop_aspect_ratio)
+            top = (original_height - new_height) / 2
+            left = 0
+            right = original_width
+            bottom = top + new_height
+            pil_image = pil_image.crop((left, top, right, bottom))
+    pil_image = pil_image.resize((width_slider, height_slider))
+    return gr.Textbox(value=img_path), gr.Image(value=np.array(pil_image))
+def animate(
+    prompt_textbox,
+    negative_prompt_textbox,
+    input_image_path,
+    sampler_dropdown,
+    sample_step_slider,
+    width_slider,
+    height_slider,
+    txt_cfg_scale_slider,
+    img_cfg_scale_slider,
+    center_crop,
+    frame_stride,
+    use_frameinit,
+    frame_init_noise_level,
+    seed_textbox
+):
+    if pipeline is None:
+        raise gr.Error(f"Please select a pretrained pipeline path.")
+    if input_image_path == "":
+        raise gr.Error(f"Please upload an input image.")
+    if (not center_crop) and (width_slider % 8 != 0 or height_slider % 8 != 0):
+        raise gr.Error(f"`height` and `width` have to be divisible by 8 but are {height_slider} and {width_slider}.")
+    if center_crop and (width_slider % 8 != 0 or height_slider % 8 != 0):
+        raise gr.Error(f"`height` and `width` (after cropping) have to be divisible by 8 but are {height_slider} and {width_slider}.")
+    if is_xformers_available() and int(torch.__version__.split(".")[0]) < 2: pipeline.unet.enable_xformers_memory_efficient_attention()
+    if seed_textbox != -1 and seed_textbox != "": torch.manual_seed(int(seed_textbox))
+    else: torch.seed()
+    seed = torch.initial_seed()
+    if input_image_path.startswith("http://") or input_image_path.startswith("https://"):
+        first_frame = Image.open(requests.get(input_image_path, stream=True).raw).convert('RGB')
+    else:
+        first_frame = Image.open(input_image_path).convert('RGB')
+    original_width, original_height = first_frame.size
+    if not center_crop:
+        img_transform = T.Compose([
+            T.ToTensor(),
+            T.Resize((height_slider, width_slider), antialias=None),
+            T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
+        ])
+    else:
+        aspect_ratio = original_width / original_height
+        crop_aspect_ratio = width_slider / height_slider
+        if aspect_ratio > crop_aspect_ratio:
+            center_crop_width = int(crop_aspect_ratio * original_height)
+            center_crop_height = original_height
+        elif aspect_ratio < crop_aspect_ratio:
+            center_crop_width = original_width
+            center_crop_height = int(original_width / crop_aspect_ratio)
+        else:
+            center_crop_width = original_width
+            center_crop_height = original_height
+        img_transform = T.Compose([
+            T.ToTensor(),
+            T.CenterCrop((center_crop_height, center_crop_width)),
+            T.Resize((height_slider, width_slider), antialias=None),
+            T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
+        ])
+    first_frame = img_transform(first_frame).unsqueeze(0)
+    if use_frameinit:
+        pipeline.init_filter(
+            width         = width_slider,
+            height        = height_slider,
+            video_length  = 16,
+            filter_params = OmegaConf.create({'method': 'gaussian', 'd_s': 0.25, 'd_t': 0.25,})
+        )
+    sample = run_pipeline(
+        pipeline,
+        prompt_textbox,
         negative_prompt_textbox,
+        first_frame,
+        sample_step_slider,
+        width_slider,
+        height_slider,
         txt_cfg_scale_slider,
         img_cfg_scale_slider,
         frame_stride,
         use_frameinit,
         frame_init_noise_level,
+    )
+    global sample_idx
+    sample_idx += 1
+    save_sample_path = os.path.join(savedir_sample, f"{sample_idx}.mp4")
+    save_videos_grid(sample, save_sample_path, format="mp4")
+    sample_config = {
+        "prompt": prompt_textbox,
+        "n_prompt": negative_prompt_textbox,
+        "first_frame_path": input_image_path,
+        "sampler": sampler_dropdown,
+        "num_inference_steps": sample_step_slider,
+        "guidance_scale_text": txt_cfg_scale_slider,
+        "guidance_scale_image": img_cfg_scale_slider,
+        "width": width_slider,
+        "height": height_slider,
+        "video_length": 8,
+        "seed": seed
+    }
+    json_str = json.dumps(sample_config, indent=4)
+    with open(os.path.join(savedir, "logs.json"), "a") as f:
+        f.write(json_str)
+        f.write("\n\n")
+    return gr.Video(value=save_sample_path)
+@spaces.GPU
+def run_pipeline(
+        pipeline,
+        prompt_textbox,
+        negative_prompt_textbox,
+        first_frame,
+        sample_step_slider,
+        width_slider,
+        height_slider,
+        txt_cfg_scale_slider,
+        img_cfg_scale_slider,
+        frame_stride,
+        use_frameinit,
+        frame_init_noise_level,
+    ):
+    first_frame = first_frame.to("cuda")
+    sample = pipeline(
+        prompt_textbox,
+        negative_prompt       = negative_prompt_textbox,
+        first_frames          = first_frame,
+        num_inference_steps   = sample_step_slider,
+        guidance_scale_txt    = txt_cfg_scale_slider,
+        guidance_scale_img    = img_cfg_scale_slider,
+        width                 = width_slider,
+        height                = height_slider,
+        video_length          = 16,
+        noise_sampling_method = "pyoco_mixed",
+        noise_alpha           = 1.0,
+        frame_stride          = frame_stride,
+        use_frameinit         = use_frameinit,
+        frameinit_noise_level = frame_init_noise_level,
+        camera_motion         = None,
+    ).videos
+    return sample
 def ui():
                     with gr.Row():
                         input_image = gr.Image(label="Input Image", interactive=True)
+                        input_image.upload(fn=update_textbox_and_save_image, inputs=[input_image, height_slider, width_slider, center_crop], outputs=[input_image_path, input_image])
                         result_video = gr.Video(label="Generated Animation", interactive=False, autoplay=True)
             def update_and_resize_image(input_image_path, height_slider, width_slider, center_crop):
                     pil_image = Image.open(requests.get(input_image_path, stream=True).raw).convert('RGB')
                 else:
                     pil_image = Image.open(input_image_path).convert('RGB')
                 original_width, original_height = pil_image.size
                 if center_crop:
             input_image_path.submit(fn=update_and_resize_image, inputs=[input_image_path, height_slider, width_slider, center_crop], outputs=[input_image])
             generate_button.click(
+                fn=animate,
                 inputs=[
                     prompt_textbox,
                     negative_prompt_textbox,