Spaces:

naicoi
/

lipsync-docker

Runtime error

App Files Community

naicoi committed on Feb 4

Commit

a353e2d

1 Parent(s): f5651ba

Squash merge opencode/gentle-cactus into pr/3

Browse files

Files changed (2) hide show

app.py +5 -1
latentsync/pipelines/lipsync_pipeline.py +199 -44

app.py CHANGED Viewed

@@ -3,9 +3,13 @@ OutofLipSync - Lipsync Only Application
 Main Gradio UI module
 """
 import logging
 import sys
-import os
 import shutil
 import gradio as gr

 Main Gradio UI module
 """
+import os
+# Optimize PyTorch memory allocation to reduce fragmentation
+os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
 import logging
 import sys
 import shutil
 import gradio as gr

latentsync/pipelines/lipsync_pipeline.py CHANGED Viewed

@@ -59,7 +59,10 @@ class LipsyncPipeline(DiffusionPipeline):
     ):
         super().__init__()
-        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
             deprecation_message = (
                 f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
                 f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
@@ -68,12 +71,17 @@ class LipsyncPipeline(DiffusionPipeline):
                 " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
                 " file"
             )
-            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
             new_config = dict(scheduler.config)
             new_config["steps_offset"] = 1
             scheduler._internal_dict = FrozenDict(new_config)
-        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
             deprecation_message = (
                 f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
                 " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
@@ -81,15 +89,21 @@ class LipsyncPipeline(DiffusionPipeline):
                 " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
                 " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
             )
-            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
             new_config = dict(scheduler.config)
             new_config["clip_sample"] = False
             scheduler._internal_dict = FrozenDict(new_config)
-        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
             version.parse(unet.config._diffusers_version).base_version
         ) < version.parse("0.9.0.dev0")
-        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
         if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
             deprecation_message = (
                 "The configuration file of the unet has set the default `sample_size` to smaller than"
@@ -97,12 +111,14 @@ class LipsyncPipeline(DiffusionPipeline):
                 " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
                 " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
                 " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
-                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
                 " in the config might lead to incorrect results in future versions. If you have downloaded this"
                 " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
                 " the `unet/config.json` file"
             )
-            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
             new_config = dict(unet.config)
             new_config["sample_size"] = 64
             unet._internal_dict = FrozenDict(new_config)
@@ -138,7 +154,9 @@ class LipsyncPipeline(DiffusionPipeline):
         return self.device
     def decode_latents(self, latents):
-        latents = latents / self.vae.config.scaling_factor + self.vae.config.shift_factor
         latents = rearrange(latents, "b c f h w -> (b f) c h w")
         decoded_latents = self.vae.decode(latents).sample
         return decoded_latents
@@ -149,13 +167,17 @@ class LipsyncPipeline(DiffusionPipeline):
         # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
         # and should be between [0, 1]
-        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
         extra_step_kwargs = {}
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
         # check if the scheduler accepts generator
-        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
         if accepts_generator:
             extra_step_kwargs["generator"] = generator
         return extra_step_kwargs
@@ -164,17 +186,22 @@ class LipsyncPipeline(DiffusionPipeline):
         assert height == width, "Height and width must be equal"
         if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
         if (callback_steps is None) or (
-            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
         ):
             raise ValueError(
                 f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                 f" {type(callback_steps)}."
             )
-    def prepare_latents(self, num_frames, num_channels_latents, height, width, dtype, device, generator):
         shape = (
             1,
             num_channels_latents,
@@ -183,7 +210,9 @@ class LipsyncPipeline(DiffusionPipeline):
             width // self.vae_scale_factor,
         )  # (b, c, f, h, w)
         rand_device = "cpu" if device.type == "mps" else device
-        latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device)
         latents = latents.repeat(1, 1, num_frames, 1, 1)
         # scale the initial noise by the standard deviation required by the scheduler
@@ -191,7 +220,15 @@ class LipsyncPipeline(DiffusionPipeline):
         return latents
     def prepare_mask_latents(
-        self, mask, masked_image, height, width, dtype, device, generator, do_classifier_free_guidance
     ):
         # resize the mask to latents shape as we concatenate the mask to the latents
         # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
@@ -202,8 +239,12 @@ class LipsyncPipeline(DiffusionPipeline):
         masked_image = masked_image.to(device=device, dtype=dtype)
         # encode the mask image into latents space so we can concatenate it to the latents
-        masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator)
-        masked_image_latents = (masked_image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
         # aligning device to prevent device errors when concating it with the latent model input
         masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
@@ -215,16 +256,26 @@ class LipsyncPipeline(DiffusionPipeline):
         mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
         masked_image_latents = (
-            torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents
         )
         return mask, masked_image_latents
-    def prepare_image_latents(self, images, device, dtype, generator, do_classifier_free_guidance):
         images = images.to(device=device, dtype=dtype)
         image_latents = self.vae.encode(images).latent_dist.sample(generator=generator)
-        image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
         image_latents = rearrange(image_latents, "f c h w -> 1 c f h w")
-        image_latents = torch.cat([image_latents] * 2) if do_classifier_free_guidance else image_latents
         return image_latents
@@ -234,7 +285,9 @@ class LipsyncPipeline(DiffusionPipeline):
         self._progress_bar_config.update(kwargs)
     @staticmethod
-    def paste_surrounding_pixels_back(decoded_latents, pixel_values, masks, device, weight_dtype):
         # Paste the surrounding pixels back, because we only want to change the mouth region
         pixel_values = pixel_values.to(device=device, dtype=weight_dtype)
         masks = masks.to(device=device, dtype=weight_dtype)
@@ -263,7 +316,13 @@ class LipsyncPipeline(DiffusionPipeline):
         faces = torch.stack(faces)
         return faces, boxes, affine_matrices
-    def restore_video(self, faces: torch.Tensor, video_frames: np.ndarray, boxes: list, affine_matrices: list):
         video_frames = video_frames[: len(faces)]
         out_frames = []
         print(f"Restoring {len(faces)} faces...")
@@ -272,12 +331,54 @@ class LipsyncPipeline(DiffusionPipeline):
             height = int(y2 - y1)
             width = int(x2 - x1)
             face = torchvision.transforms.functional.resize(
-                face, size=(height, width), interpolation=transforms.InterpolationMode.BICUBIC, antialias=True
             )
-            out_frame = self.image_processor.restorer.restore_img(video_frames[index], face, affine_matrices[index])
             out_frames.append(out_frame)
         return np.stack(out_frames, axis=0)
     def loop_video(self, whisper_chunks: list, video_frames: np.ndarray):
         # If the audio is longer than the video, we need to loop the video
         if len(whisper_chunks) > len(video_frames):
@@ -299,7 +400,9 @@ class LipsyncPipeline(DiffusionPipeline):
                     loop_boxes += boxes[::-1]
                     loop_affine_matrices += affine_matrices[::-1]
-            video_frames = np.concatenate(loop_video_frames, axis=0)[: len(whisper_chunks)]
             faces = torch.cat(loop_faces, dim=0)[: len(whisper_chunks)]
             boxes = loop_boxes[: len(whisper_chunks)]
             affine_matrices = loop_affine_matrices[: len(whisper_chunks)]
@@ -339,7 +442,9 @@ class LipsyncPipeline(DiffusionPipeline):
         # 0. Define call parameters
         device = self._execution_device
         mask_image = load_fixed_mask(height, mask_image_path)
-        self.image_processor = ImageProcessor(height, device="cuda", mask_image=mask_image)
         self.set_progress_bar_config(desc=f"Sample frames: {num_frames}")
         # 1. Default height and width to unet
@@ -362,12 +467,16 @@ class LipsyncPipeline(DiffusionPipeline):
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
         whisper_feature = self.audio_encoder.audio2feat(audio_path)
-        whisper_chunks = self.audio_encoder.feature2chunks(feature_array=whisper_feature, fps=video_fps)
         audio_samples = read_audio(audio_path)
         video_frames = read_video(video_path, use_decord=False)
-        video_frames, faces, boxes, affine_matrices = self.loop_video(whisper_chunks, video_frames)
         synced_video_frames = []
@@ -387,7 +496,9 @@ class LipsyncPipeline(DiffusionPipeline):
         num_inferences = math.ceil(len(whisper_chunks) / num_frames)
         for i in tqdm.tqdm(range(num_inferences), desc="Doing inference..."):
             if self.unet.add_audio_layer:
-                audio_embeds = torch.stack(whisper_chunks[i * num_frames : (i + 1) * num_frames])
                 audio_embeds = audio_embeds.to(device, dtype=weight_dtype)
                 if do_classifier_free_guidance:
                     null_audio_embeds = torch.zeros_like(audio_embeds)
@@ -396,8 +507,10 @@ class LipsyncPipeline(DiffusionPipeline):
                 audio_embeds = None
             inference_faces = faces[i * num_frames : (i + 1) * num_frames]
             latents = all_latents[:, :, i * num_frames : (i + 1) * num_frames]
-            ref_pixel_values, masked_pixel_values, masks = self.image_processor.prepare_masks_and_masked_images(
-                inference_faces, affine_transform=False
             )
             # 7. Prepare mask latent variables
@@ -422,30 +535,48 @@ class LipsyncPipeline(DiffusionPipeline):
             )
             # 9. Denoising loop
-            num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
             with self.progress_bar(total=num_inference_steps) as progress_bar:
                 for j, t in enumerate(timesteps):
                     # expand the latents if we are doing classifier free guidance
-                    unet_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                     unet_input = self.scheduler.scale_model_input(unet_input, t)
                     # concat latents, mask, masked_image_latents in the channel dimension
-                    unet_input = torch.cat([unet_input, mask_latents, masked_image_latents, ref_latents], dim=1)
                     # predict the noise residual
-                    noise_pred = self.unet(unet_input, t, encoder_hidden_states=audio_embeds).sample
                     # perform guidance
                     if do_classifier_free_guidance:
                         noise_pred_uncond, noise_pred_audio = noise_pred.chunk(2)
-                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_audio - noise_pred_uncond)
                     # compute the previous noisy sample x_t -> x_t-1
-                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
                     # call the callback, if provided
-                    if j == len(timesteps) - 1 or ((j + 1) > num_warmup_steps and (j + 1) % self.scheduler.order == 0):
                         progress_bar.update()
                         if callback is not None and j % callback_steps == 0:
                             callback(j, t, latents)
@@ -455,11 +586,33 @@ class LipsyncPipeline(DiffusionPipeline):
             decoded_latents = self.paste_surrounding_pixels_back(
                 decoded_latents, ref_pixel_values, 1 - masks, device, weight_dtype
             )
-            synced_video_frames.append(decoded_latents)
-        synced_video_frames = self.restore_video(torch.cat(synced_video_frames), video_frames, boxes, affine_matrices)
-        audio_samples_remain_length = int(synced_video_frames.shape[0] / video_fps * audio_sample_rate)
         audio_samples = audio_samples[:audio_samples_remain_length].cpu().numpy()
         if is_train:
@@ -469,7 +622,9 @@ class LipsyncPipeline(DiffusionPipeline):
             shutil.rmtree(temp_dir)
         os.makedirs(temp_dir, exist_ok=True)
-        write_video(os.path.join(temp_dir, "video.mp4"), synced_video_frames, fps=video_fps)
         sf.write(os.path.join(temp_dir, "audio.wav"), audio_samples, audio_sample_rate)

     ):
         super().__init__()
+        if (
+            hasattr(scheduler.config, "steps_offset")
+            and scheduler.config.steps_offset != 1
+        ):
             deprecation_message = (
                 f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
                 f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
                 " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
                 " file"
             )
+            deprecate(
+                "steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False
+            )
             new_config = dict(scheduler.config)
             new_config["steps_offset"] = 1
             scheduler._internal_dict = FrozenDict(new_config)
+        if (
+            hasattr(scheduler.config, "clip_sample")
+            and scheduler.config.clip_sample is True
+        ):
             deprecation_message = (
                 f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
                 " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
                 " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
                 " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
             )
+            deprecate(
+                "clip_sample not set", "1.0.0", deprecation_message, standard_warn=False
+            )
             new_config = dict(scheduler.config)
             new_config["clip_sample"] = False
             scheduler._internal_dict = FrozenDict(new_config)
+        is_unet_version_less_0_9_0 = hasattr(
+            unet.config, "_diffusers_version"
+        ) and version.parse(
             version.parse(unet.config._diffusers_version).base_version
         ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = (
+            hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        )
         if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
             deprecation_message = (
                 "The configuration file of the unet has set the default `sample_size` to smaller than"
                 " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
                 " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
                 " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving 'sample_size=32'"
                 " in the config might lead to incorrect results in future versions. If you have downloaded this"
                 " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
                 " the `unet/config.json` file"
             )
+            deprecate(
+                "sample_size<64", "1.0.0", deprecation_message, standard_warn=False
+            )
             new_config = dict(unet.config)
             new_config["sample_size"] = 64
             unet._internal_dict = FrozenDict(new_config)
         return self.device
     def decode_latents(self, latents):
+        latents = (
+            latents / self.vae.config.scaling_factor + self.vae.config.shift_factor
+        )
         latents = rearrange(latents, "b c f h w -> (b f) c h w")
         decoded_latents = self.vae.decode(latents).sample
         return decoded_latents
         # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
         # and should be between [0, 1]
+        accepts_eta = "eta" in set(
+            inspect.signature(self.scheduler.step).parameters.keys()
+        )
         extra_step_kwargs = {}
         if accepts_eta:
             extra_step_kwargs["eta"] = eta
         # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(
+            inspect.signature(self.scheduler.step).parameters.keys()
+        )
         if accepts_generator:
             extra_step_kwargs["generator"] = generator
         return extra_step_kwargs
         assert height == width, "Height and width must be equal"
         if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(
+                f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
+            )
         if (callback_steps is None) or (
+            callback_steps is not None
+            and (not isinstance(callback_steps, int) or callback_steps <= 0)
         ):
             raise ValueError(
                 f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                 f" {type(callback_steps)}."
             )
+    def prepare_latents(
+        self, num_frames, num_channels_latents, height, width, dtype, device, generator
+    ):
         shape = (
             1,
             num_channels_latents,
             width // self.vae_scale_factor,
         )  # (b, c, f, h, w)
         rand_device = "cpu" if device.type == "mps" else device
+        latents = torch.randn(
+            shape, generator=generator, device=rand_device, dtype=dtype
+        ).to(device)
         latents = latents.repeat(1, 1, num_frames, 1, 1)
         # scale the initial noise by the standard deviation required by the scheduler
         return latents
     def prepare_mask_latents(
+        self,
+        mask,
+        masked_image,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        do_classifier_free_guidance,
     ):
         # resize the mask to latents shape as we concatenate the mask to the latents
         # we do that before converting to dtype to avoid breaking in case we're using cpu_offload
         masked_image = masked_image.to(device=device, dtype=dtype)
         # encode the mask image into latents space so we can concatenate it to the latents
+        masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(
+            generator=generator
+        )
+        masked_image_latents = (
+            masked_image_latents - self.vae.config.shift_factor
+        ) * self.vae.config.scaling_factor
         # aligning device to prevent device errors when concating it with the latent model input
         masked_image_latents = masked_image_latents.to(device=device, dtype=dtype)
         mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask
         masked_image_latents = (
+            torch.cat([masked_image_latents] * 2)
+            if do_classifier_free_guidance
+            else masked_image_latents
         )
         return mask, masked_image_latents
+    def prepare_image_latents(
+        self, images, device, dtype, generator, do_classifier_free_guidance
+    ):
         images = images.to(device=device, dtype=dtype)
         image_latents = self.vae.encode(images).latent_dist.sample(generator=generator)
+        image_latents = (
+            image_latents - self.vae.config.shift_factor
+        ) * self.vae.config.scaling_factor
         image_latents = rearrange(image_latents, "f c h w -> 1 c f h w")
+        image_latents = (
+            torch.cat([image_latents] * 2)
+            if do_classifier_free_guidance
+            else image_latents
+        )
         return image_latents
         self._progress_bar_config.update(kwargs)
     @staticmethod
+    def paste_surrounding_pixels_back(
+        decoded_latents, pixel_values, masks, device, weight_dtype
+    ):
         # Paste the surrounding pixels back, because we only want to change the mouth region
         pixel_values = pixel_values.to(device=device, dtype=weight_dtype)
         masks = masks.to(device=device, dtype=weight_dtype)
         faces = torch.stack(faces)
         return faces, boxes, affine_matrices
+    def restore_video(
+        self,
+        faces: torch.Tensor,
+        video_frames: np.ndarray,
+        boxes: list,
+        affine_matrices: list,
+    ):
         video_frames = video_frames[: len(faces)]
         out_frames = []
         print(f"Restoring {len(faces)} faces...")
             height = int(y2 - y1)
             width = int(x2 - x1)
             face = torchvision.transforms.functional.resize(
+                face,
+                size=(height, width),
+                interpolation=transforms.InterpolationMode.BICUBIC,
+                antialias=True,
+            )
+            out_frame = self.image_processor.restorer.restore_img(
+                video_frames[index], face, affine_matrices[index]
             )
             out_frames.append(out_frame)
         return np.stack(out_frames, axis=0)
+    def restore_video_from_cpu(
+        self,
+        faces_list: List[torch.Tensor],
+        video_frames: np.ndarray,
+        boxes: list,
+        affine_matrices: list,
+    ):
+        """Restore video when faces are stored on CPU to save GPU memory"""
+        video_frames = video_frames[: len(faces_list)]
+        out_frames = []
+        device = self._execution_device
+        print(f"Restoring {len(faces_list)} faces from CPU to GPU {device}...")
+        for index, face_cpu in enumerate(tqdm.tqdm(faces_list)):
+            # Move frame to GPU only when needed for restoration
+            face = face_cpu.to(device=device, dtype=torch.float16)
+            x1, y1, x2, y2 = boxes[index]
+            height = int(y2 - y1)
+            width = int(x2 - x1)
+            face = torchvision.transforms.functional.resize(
+                face,
+                size=(height, width),
+                interpolation=transforms.InterpolationMode.BICUBIC,
+                antialias=True,
+            )
+            out_frame = self.image_processor.restorer.restore_img(
+                video_frames[index], face, affine_matrices[index]
+            )
+            out_frames.append(out_frame)
+            # Explicitly free GPU memory for this frame
+            del face
+            torch.cuda.empty_cache()
+        return np.stack(out_frames, axis=0)
     def loop_video(self, whisper_chunks: list, video_frames: np.ndarray):
         # If the audio is longer than the video, we need to loop the video
         if len(whisper_chunks) > len(video_frames):
                     loop_boxes += boxes[::-1]
                     loop_affine_matrices += affine_matrices[::-1]
+            video_frames = np.concatenate(loop_video_frames, axis=0)[
+                : len(whisper_chunks)
+            ]
             faces = torch.cat(loop_faces, dim=0)[: len(whisper_chunks)]
             boxes = loop_boxes[: len(whisper_chunks)]
             affine_matrices = loop_affine_matrices[: len(whisper_chunks)]
         # 0. Define call parameters
         device = self._execution_device
         mask_image = load_fixed_mask(height, mask_image_path)
+        self.image_processor = ImageProcessor(
+            height, device="cuda", mask_image=mask_image
+        )
         self.set_progress_bar_config(desc=f"Sample frames: {num_frames}")
         # 1. Default height and width to unet
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
         whisper_feature = self.audio_encoder.audio2feat(audio_path)
+        whisper_chunks = self.audio_encoder.feature2chunks(
+            feature_array=whisper_feature, fps=video_fps
+        )
         audio_samples = read_audio(audio_path)
         video_frames = read_video(video_path, use_decord=False)
+        video_frames, faces, boxes, affine_matrices = self.loop_video(
+            whisper_chunks, video_frames
+        )
         synced_video_frames = []
         num_inferences = math.ceil(len(whisper_chunks) / num_frames)
         for i in tqdm.tqdm(range(num_inferences), desc="Doing inference..."):
             if self.unet.add_audio_layer:
+                audio_embeds = torch.stack(
+                    whisper_chunks[i * num_frames : (i + 1) * num_frames]
+                )
                 audio_embeds = audio_embeds.to(device, dtype=weight_dtype)
                 if do_classifier_free_guidance:
                     null_audio_embeds = torch.zeros_like(audio_embeds)
                 audio_embeds = None
             inference_faces = faces[i * num_frames : (i + 1) * num_frames]
             latents = all_latents[:, :, i * num_frames : (i + 1) * num_frames]
+            ref_pixel_values, masked_pixel_values, masks = (
+                self.image_processor.prepare_masks_and_masked_images(
+                    inference_faces, affine_transform=False
+                )
             )
             # 7. Prepare mask latent variables
             )
             # 9. Denoising loop
+            num_warmup_steps = (
+                len(timesteps) - num_inference_steps * self.scheduler.order
+            )
             with self.progress_bar(total=num_inference_steps) as progress_bar:
                 for j, t in enumerate(timesteps):
                     # expand the latents if we are doing classifier free guidance
+                    unet_input = (
+                        torch.cat([latents] * 2)
+                        if do_classifier_free_guidance
+                        else latents
+                    )
                     unet_input = self.scheduler.scale_model_input(unet_input, t)
                     # concat latents, mask, masked_image_latents in the channel dimension
+                    unet_input = torch.cat(
+                        [unet_input, mask_latents, masked_image_latents, ref_latents],
+                        dim=1,
+                    )
                     # predict the noise residual
+                    noise_pred = self.unet(
+                        unet_input, t, encoder_hidden_states=audio_embeds
+                    ).sample
                     # perform guidance
                     if do_classifier_free_guidance:
                         noise_pred_uncond, noise_pred_audio = noise_pred.chunk(2)
+                        noise_pred = noise_pred_uncond + guidance_scale * (
+                            noise_pred_audio - noise_pred_uncond
+                        )
                     # compute the previous noisy sample x_t -> x_t-1
+                    latents = self.scheduler.step(
+                        noise_pred, t, latents, **extra_step_kwargs
+                    ).prev_sample
                     # call the callback, if provided
+                    if j == len(timesteps) - 1 or (
+                        (j + 1) > num_warmup_steps
+                        and (j + 1) % self.scheduler.order == 0
+                    ):
                         progress_bar.update()
                         if callback is not None and j % callback_steps == 0:
                             callback(j, t, latents)
             decoded_latents = self.paste_surrounding_pixels_back(
                 decoded_latents, ref_pixel_values, 1 - masks, device, weight_dtype
             )
+            # Move decoded latents to CPU to save GPU memory
+            decoded_latents_cpu = decoded_latents.cpu()
+            synced_video_frames.append(decoded_latents_cpu)
+            # Explicitly clear GPU memory
+            del decoded_latents
+            del ref_pixel_values
+            del masked_pixel_values
+            del masks
+            del mask_latents
+            del masked_image_latents
+            del ref_latents
+            del latents
+            if do_classifier_free_guidance:
+                del noise_pred_uncond, noise_pred_audio
+            del noise_pred
+            torch.cuda.empty_cache()
+        # Restore video from CPU tensors to save GPU memory
+        synced_video_frames = self.restore_video_from_cpu(
+            synced_video_frames, video_frames, boxes, affine_matrices
+        )
+        audio_samples_remain_length = int(
+            synced_video_frames.shape[0] / video_fps * audio_sample_rate
+        )
         audio_samples = audio_samples[:audio_samples_remain_length].cpu().numpy()
         if is_train:
             shutil.rmtree(temp_dir)
         os.makedirs(temp_dir, exist_ok=True)
+        write_video(
+            os.path.join(temp_dir, "video.mp4"), synced_video_frames, fps=video_fps
+        )
         sf.write(os.path.join(temp_dir, "audio.wav"), audio_samples, audio_sample_rate)