adding rife

Files changed (14) hide show

Practical-RIFE +1 -0
__pycache__/handler.cpython-310.pyc +0 -0
download_weights.py +0 -1
handler.py +136 -15
input.jpg +0 -0
memory_stats.log +72 -0
output.mp4 +0 -0
output/gradio/animation_output.mp4 +0 -0
output/gradio/completed_result.mp4 +0 -0
output/gradio/cropped_face.jpg +0 -0
output/gradio/output_video.mp4 +0 -0
requirements.txt +5 -0
sampler.py +7 -7
sped_up_pose_video.mp4 +0 -0

Practical-RIFE ADDED Viewed

	@@ -0,0 +1 @@


1	+ Subproject commit f3e48ceb02e4c21bc8868b03994b98f3402ffb3d

__pycache__/handler.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/handler.cpython-310.pyc and b/__pycache__/handler.cpython-310.pyc differ

download_weights.py CHANGED Viewed

@@ -3,7 +3,6 @@ from pathlib import Path, PurePosixPath
 from huggingface_hub import hf_hub_download
 def prepare_base_model():
     print(f'Preparing base stable-diffusion-v1-5 weights...')
     local_dir = "./pretrained_weights/stable-diffusion-v1-5"

 from huggingface_hub import hf_hub_download
 def prepare_base_model():
     print(f'Preparing base stable-diffusion-v1-5 weights...')
     local_dir = "./pretrained_weights/stable-diffusion-v1-5"

handler.py CHANGED Viewed

@@ -10,6 +10,8 @@ from omegaconf import OmegaConf
 from transformers import CLIPVisionModelWithProjection
 import cv2
 import os
 from src.models.pose_guider import PoseGuider
 from src.models.unet_2d_condition import UNet2DConditionModel
 from src.models.unet_3d import UNet3DConditionModel
@@ -20,6 +22,10 @@ from roop.core import start, decode_execution_providers, suggest_max_memory, sug
 from roop.utilities import normalize_output_path
 from roop.processors.frame.core import get_frame_processors_modules
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 if device.type != 'cuda':
@@ -35,6 +41,7 @@ class EndpointHandler():
         self.config = OmegaConf.load(config_path)
         self.weight_dtype = torch.float16
         self.pipeline = None
         self._initialize_pipeline()
@@ -45,13 +52,13 @@ class EndpointHandler():
         if not os.path.exists(config_path):
             raise FileNotFoundError(f"The sd-vae-ft-mse folder was not found at: {config_path}")
-        vae = AutoencoderKL.from_pretrained(config_path).to(device, dtype=self.weight_dtype)
         pretrained_base_model_path_unet = os.path.join(base_dir, 'pretrained_weights', 'stable-diffusion-v1-5', 'unet')
         print("model path is " + pretrained_base_model_path_unet)
         reference_unet = UNet2DConditionModel.from_pretrained(
             pretrained_base_model_path_unet
-        ).to(dtype=self.weight_dtype, device="cuda")
         inference_config_path = os.path.join(base_dir, 'configs', 'inference', 'inference_v2.yaml')
         motion_module_path = os.path.join(base_dir, 'pretrained_weights', 'motion_module.pth')
@@ -65,10 +72,10 @@ class EndpointHandler():
             pretrained_base_model_path_unet,
             motion_module_path,
             unet_additional_kwargs=infer_config.unet_additional_kwargs,
-        ).to(device, dtype=self.weight_dtype)
-        pose_guider = PoseGuider(320, block_out_channels=(16, 32, 96, 256)).to(device, dtype=self.weight_dtype)
-        image_enc = CLIPVisionModelWithProjection.from_pretrained(image_encoder_path).to(device, dtype=self.weight_dtype)
         sched_kwargs = OmegaConf.to_container(infer_config.noise_scheduler_kwargs)
         scheduler = DDIMScheduler(**sched_kwargs)
@@ -83,7 +90,7 @@ class EndpointHandler():
             denoising_unet=denoising_unet,
             pose_guider=pose_guider,
             scheduler=scheduler
-        ).to(device, dtype=self.weight_dtype)
     def _crop_face(self, image, save_path="cropped_face.jpg", margin=0.5):
         # Convert image to OpenCV format
@@ -137,17 +144,112 @@ class EndpointHandler():
         roop.globals.video_encoder = "libx264"
         roop.globals.video_quality = 50
         roop.globals.max_memory = suggest_max_memory()
-        roop.globals.execution_providers = decode_execution_providers(["cpu"])
         roop.globals.execution_threads = suggest_execution_threads()
         for frame_processor in get_frame_processors_modules(roop.globals.frame_processors):
-            if not frame_processor.pre_check():
-                raise ValueError("Frame processor pre-check failed.")
         start()
         return os.path.join(os.getcwd(), output_path)
     def __call__(self, data: Any) -> Dict[str, str]:
         inputs = data.get("inputs", {})
         ref_image_base64 = inputs.get("ref_image", "")
@@ -169,11 +271,15 @@ class EndpointHandler():
         if not os.path.exists(pose_video_path):
             raise FileNotFoundError(f"The pose video was not found at: {pose_video_path}")
-        torch.manual_seed(seed)
-        pose_images = read_frames(pose_video_path)
-        src_fps = get_fps(pose_video_path)
         pose_list = []
         total_length = min(length, len(pose_images))
         for pose_image_pil in pose_images[:total_length]:
@@ -199,11 +305,26 @@ class EndpointHandler():
         cropped_face_path = os.path.join(save_dir, "cropped_face.jpg")
         cropped_face = self._crop_face(ref_image, save_path=cropped_face_path)
         # Perform face swapping
-        final_video_path = self._swap_face(cropped_face, animation_path)
         # Encode the final video in base64
-        with open(final_video_path, "rb") as video_file:
             video_base64 = base64.b64encode(video_file.read()).decode("utf-8")
         torch.cuda.empty_cache()

 from transformers import CLIPVisionModelWithProjection
 import cv2
 import os
+import sys
+import skvideo.io
 from src.models.pose_guider import PoseGuider
 from src.models.unet_2d_condition import UNet2DConditionModel
 from src.models.unet_3d import UNet3DConditionModel
 from roop.utilities import normalize_output_path
 from roop.processors.frame.core import get_frame_processors_modules
+import onnxruntime as ort
+import gc
+import subprocess
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 if device.type != 'cuda':
         self.config = OmegaConf.load(config_path)
         self.weight_dtype = torch.float16
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.pipeline = None
         self._initialize_pipeline()
         if not os.path.exists(config_path):
             raise FileNotFoundError(f"The sd-vae-ft-mse folder was not found at: {config_path}")
+        vae = AutoencoderKL.from_pretrained(config_path).to(self.device, dtype=self.weight_dtype)
         pretrained_base_model_path_unet = os.path.join(base_dir, 'pretrained_weights', 'stable-diffusion-v1-5', 'unet')
         print("model path is " + pretrained_base_model_path_unet)
         reference_unet = UNet2DConditionModel.from_pretrained(
             pretrained_base_model_path_unet
+        ).to(dtype=self.weight_dtype, device=self.device)
         inference_config_path = os.path.join(base_dir, 'configs', 'inference', 'inference_v2.yaml')
         motion_module_path = os.path.join(base_dir, 'pretrained_weights', 'motion_module.pth')
             pretrained_base_model_path_unet,
             motion_module_path,
             unet_additional_kwargs=infer_config.unet_additional_kwargs,
+        ).to(self.device, dtype=self.weight_dtype)
+        pose_guider = PoseGuider(320, block_out_channels=(16, 32, 96, 256)).to(self.device, dtype=self.weight_dtype)
+        image_enc = CLIPVisionModelWithProjection.from_pretrained(image_encoder_path).to(self.device, dtype=self.weight_dtype)
         sched_kwargs = OmegaConf.to_container(infer_config.noise_scheduler_kwargs)
         scheduler = DDIMScheduler(**sched_kwargs)
             denoising_unet=denoising_unet,
             pose_guider=pose_guider,
             scheduler=scheduler
+        ).to(self.device, dtype=self.weight_dtype)
     def _crop_face(self, image, save_path="cropped_face.jpg", margin=0.5):
         # Convert image to OpenCV format
         roop.globals.video_encoder = "libx264"
         roop.globals.video_quality = 50
         roop.globals.max_memory = suggest_max_memory()
+        # Set GPU execution provider
+        roop.globals.execution_providers = decode_execution_providers(["CUDAExecutionProvider"])
         roop.globals.execution_threads = suggest_execution_threads()
+        # Ensure onnxruntime is using the GPU
+        ort.set_default_logger_severity(3)  # Suppress verbose logging
+        providers = ['CUDAExecutionProvider']
+        options = ort.SessionOptions()
+        options.intra_op_num_threads = 1
         for frame_processor in get_frame_processors_modules(roop.globals.frame_processors):
+            if hasattr(frame_processor, 'onnx_session'):
+                frame_processor.onnx_session.set_providers(providers, options)
+        # Clear CUDA cache before starting the face swapping process
+        torch.cuda.empty_cache()
         start()
+        # Clear CUDA cache after the face swapping process
+        for frame_processor in roop.globals.frame_processors:
+            del frame_processor
+        torch.cuda.empty_cache()
         return os.path.join(os.getcwd(), output_path)
+    def print_memory_stat_for_stuff(self, phase, log_file="memory_stats.log"):
+        """Append a snapshot of the torch CUDA memory counters to a log file.
+
+        Args:
+            phase: Label written into the header line of the snapshot.
+            log_file: Path of the text file, opened in append mode.
+        """
+        bytes_per_mb = 1024**2
+        with open(log_file, "a") as log:
+            log.write(f"Memory Stats - {phase}:\n")
+            allocated = torch.cuda.memory_allocated() / bytes_per_mb
+            log.write(f"Allocated memory: {allocated:.2f} MB\n")
+            reserved = torch.cuda.memory_reserved() / bytes_per_mb
+            log.write(f"Reserved memory: {reserved:.2f} MB\n")
+            peak_allocated = torch.cuda.max_memory_allocated() / bytes_per_mb
+            log.write(f"Max allocated memory: {peak_allocated:.2f} MB\n")
+            peak_reserved = torch.cuda.max_memory_reserved() / bytes_per_mb
+            log.write(f"Max reserved memory: {peak_reserved:.2f} MB\n")
+            # Separator line between consecutive snapshots.
+            log.write("="*30 + "\n")
+    def convert_to_playable_format(self, input_path, output_path):
+        """Re-encode a video with ffmpeg using the libx264 encoder.
+
+        Args:
+            input_path: Path of the video passed to ffmpeg as input.
+            output_path: Path of the re-encoded file; overwritten if present.
+
+        Raises:
+            RuntimeError: If ffmpeg exits with a non-zero return code.
+        """
+        encoder_args = ["-c:v", "libx264", "-preset", "fast", "-crf", "18"]
+        # "-y" makes ffmpeg overwrite the output file if it exists.
+        ffmpeg_cmd = ["ffmpeg", "-i", input_path, *encoder_args, "-y", output_path]
+        proc = subprocess.run(ffmpeg_cmd, capture_output=True, text=True)
+        print("Conversion STDOUT:", proc.stdout)
+        print("Conversion STDERR:", proc.stderr)
+        if proc.returncode == 0:
+            return
+        raise RuntimeError(f"FFmpeg conversion failed with exit code {proc.returncode}")
+    def run_rife_interpolation(self, video_path, output_path, multi=2, scale=1.0):
+        """Run the Practical-RIFE ``inference_video.py`` script on a video.
+
+        The script is looked up in the ``Practical-RIFE`` directory next to
+        this file and is executed in a child process. After it succeeds, the
+        result is additionally re-encoded to ``completed_playable.mp4``
+        (relative to the current working directory).
+
+        Args:
+            video_path: Passed to the script as ``--video``.
+            output_path: Passed to the script as ``--output``.
+            multi: Passed to the script as ``--multi``.
+            scale: Passed to the script as ``--scale``.
+
+        Raises:
+            RuntimeError: If the script (or the follow-up ffmpeg conversion)
+                exits with a non-zero return code.
+        """
+        base_dir = os.path.dirname(os.path.abspath(__file__))
+        script_path = os.path.join(base_dir, "Practical-RIFE", "inference_video.py")
+        model_directory = os.path.join(base_dir, "Practical-RIFE", "train_log")
+        command = [
+            # Use the interpreter running this handler rather than whatever
+            # "python" resolves to on PATH, so the child process sees the same
+            # environment and installed packages.
+            sys.executable or "python",
+            script_path,
+            f"--video={video_path}",
+            f"--output={output_path}",
+            f"--multi={multi}",
+            f"--scale={scale}",
+            f"--model={model_directory}",
+        ]
+        result = subprocess.run(command, capture_output=True, text=True)
+        print(result)
+        print(result.stdout)
+        print(result.stderr)
+        if result.returncode != 0:
+            raise RuntimeError(f"RIFE interpolation failed with exit code {result.returncode}")
+        self.convert_to_playable_format(output_path, "completed_playable.mp4")
+    def speed_up_video(self, input_path, output_path, factor=4):
+        """Write a sped-up, audio-less copy of a video using ffmpeg.
+
+        Args:
+            input_path: Path of the video passed to ffmpeg as input.
+            output_path: Path of the resulting file; overwritten if present.
+            factor: Divisor applied to the presentation timestamps
+                (``setpts=PTS/factor``).
+
+        Raises:
+            RuntimeError: If ffmpeg exits with a non-zero return code.
+        """
+        command = [
+            "ffmpeg",
+            "-i", input_path,
+            "-filter:v", f"setpts=PTS/{factor}",
+            "-an",  # Remove audio
+            # The caller reuses a fixed output path on every request; without
+            # "-y" ffmpeg refuses to overwrite (or waits on a prompt) once the
+            # file exists.
+            "-y",
+            output_path
+        ]
+        result = subprocess.run(command, capture_output=True, text=True)
+        print("Speed Up Video STDOUT:", result.stdout)
+        print("Speed Up Video STDERR:", result.stderr)
+        if result.returncode != 0:
+            raise RuntimeError(f"FFmpeg speed up failed with exit code {result.returncode}")
+    def slow_down_video(self, input_path, output_path, factor=4):
+        """Write a slowed-down, audio-less copy of a video using ffmpeg.
+
+        Args:
+            input_path: Path of the video passed to ffmpeg as input.
+            output_path: Path of the resulting file; overwritten if present.
+            factor: Multiplier applied to the presentation timestamps
+                (``setpts=factor*PTS``).
+
+        Raises:
+            RuntimeError: If ffmpeg exits with a non-zero return code.
+        """
+        command = [
+            "ffmpeg",
+            "-i", input_path,
+            "-filter:v", f"setpts={factor}*PTS",
+            "-an",  # Remove audio
+            # The caller reuses a fixed output path on every request; without
+            # "-y" ffmpeg refuses to overwrite (or waits on a prompt) once the
+            # file exists.
+            "-y",
+            output_path
+        ]
+        result = subprocess.run(command, capture_output=True, text=True)
+        print("Slow Down Video STDOUT:", result.stdout)
+        print("Slow Down Video STDERR:", result.stderr)
+        if result.returncode != 0:
+            raise RuntimeError(f"FFmpeg slow down failed with exit code {result.returncode}")
     def __call__(self, data: Any) -> Dict[str, str]:
         inputs = data.get("inputs", {})
         ref_image_base64 = inputs.get("ref_image", "")
         if not os.path.exists(pose_video_path):
             raise FileNotFoundError(f"The pose video was not found at: {pose_video_path}")
+        # Speed up the pose video by 4x
+        sped_up_pose_video_path = os.path.join(base_dir, "sped_up_pose_video.mp4")
+        self.speed_up_video(pose_video_path, sped_up_pose_video_path, factor=4)
+        torch.manual_seed(seed)
+        pose_images = read_frames(sped_up_pose_video_path)
+        src_fps = get_fps(sped_up_pose_video_path)
         pose_list = []
         total_length = min(length, len(pose_images))
         for pose_image_pil in pose_images[:total_length]:
         cropped_face_path = os.path.join(save_dir, "cropped_face.jpg")
         cropped_face = self._crop_face(ref_image, save_path=cropped_face_path)
+        # Delete the pipeline and clear CUDA cache to free up memory
+        del self.pipeline
+        torch.cuda.empty_cache()
         # Perform face swapping
+        swapped_face_video_path = self._swap_face(cropped_face, animation_path)
+        # Slow down the produced video by 4x
+        slowed_down_animation_path = os.path.join(save_dir, "slowed_down_animation_output.mp4")
+        self.slow_down_video(swapped_face_video_path, slowed_down_animation_path, factor=4)
+        # Clear CUDA cache before RIFE interpolation
+        torch.cuda.empty_cache()
+        # Perform RIFE interpolation
+        rife_output_path = os.path.join(save_dir, "completed_result.mp4")
+        self.run_rife_interpolation(slowed_down_animation_path, rife_output_path, multi=2, scale=0.5)
         # Encode the final video in base64
+        with open(rife_output_path, "rb") as video_file:
             video_base64 = base64.b64encode(video_file.read()).decode("utf-8")
         torch.cuda.empty_cache()

input.jpg CHANGED Viewed

memory_stats.log ADDED Viewed

	@@ -0,0 +1,72 @@

+Memory Stats - Preloading model:
+Allocated memory: 20.48 MB
+Reserved memory: 32.00 MB
+Max allocated memory: 20.48 MB
+Max reserved memory: 32.00 MB
+==============================
+Memory Stats - post loading model model:
+Allocated memory: 20.48 MB
+Reserved memory: 62.00 MB
+Max allocated memory: 40.96 MB
+Max reserved memory: 62.00 MB
+==============================
+Memory Stats - Before video release:
+Allocated memory: 20.48 MB
+Reserved memory: 62.00 MB
+Max allocated memory: 40.96 MB
+Max reserved memory: 62.00 MB
+==============================
+Memory Stats - After video release:
+Allocated memory: 20.48 MB
+Reserved memory: 62.00 MB
+Max allocated memory: 40.96 MB
+Max reserved memory: 62.00 MB
+==============================
+Memory Stats - Before videowriter vid_out:
+Allocated memory: 20.48 MB
+Reserved memory: 62.00 MB
+Max allocated memory: 40.96 MB
+Max reserved memory: 62.00 MB
+==============================
+Memory Stats - After videowriter vid_out:
+Allocated memory: 20.48 MB
+Reserved memory: 62.00 MB
+Max allocated memory: 40.96 MB
+Max reserved memory: 62.00 MB
+==============================
+Memory Stats - Preloading model:
+Allocated memory: 20.48 MB
+Reserved memory: 32.00 MB
+Max allocated memory: 20.48 MB
+Max reserved memory: 32.00 MB
+==============================
+Memory Stats - post loading model model:
+Allocated memory: 20.48 MB
+Reserved memory: 62.00 MB
+Max allocated memory: 40.96 MB
+Max reserved memory: 62.00 MB
+==============================
+Memory Stats - Before video release:
+Allocated memory: 20.48 MB
+Reserved memory: 62.00 MB
+Max allocated memory: 40.96 MB
+Max reserved memory: 62.00 MB
+==============================
+Memory Stats - After video release:
+Allocated memory: 20.48 MB
+Reserved memory: 62.00 MB
+Max allocated memory: 40.96 MB
+Max reserved memory: 62.00 MB
+==============================
+Memory Stats - Before videowriter vid_out:
+Allocated memory: 20.48 MB
+Reserved memory: 62.00 MB
+Max allocated memory: 40.96 MB
+Max reserved memory: 62.00 MB
+==============================
+Memory Stats - After videowriter vid_out:
+Allocated memory: 20.48 MB
+Reserved memory: 62.00 MB
+Max allocated memory: 40.96 MB
+Max reserved memory: 62.00 MB
+==============================

output.mp4 CHANGED Viewed

Binary files a/output.mp4 and b/output.mp4 differ

output/gradio/animation_output.mp4 CHANGED Viewed

Binary files a/output/gradio/animation_output.mp4 and b/output/gradio/animation_output.mp4 differ

output/gradio/completed_result.mp4 ADDED Viewed

Binary file (44 Bytes). View file

output/gradio/cropped_face.jpg CHANGED Viewed

output/gradio/output_video.mp4 DELETED Viewed

Binary file (840 kB)

requirements.txt CHANGED Viewed

@@ -49,3 +49,8 @@ scipy==1.11.4
 torchdiffeq==0.2.3
 torchmetrics==1.2.1
 torchsde==0.2.5

 torchdiffeq==0.2.3
 torchmetrics==1.2.1
 torchsde==0.2.5
+# Additional dependencies for RIFE
+sk-video==1.1.10
+moviepy==1.0.3

sampler.py CHANGED Viewed

@@ -18,7 +18,7 @@ inputs = {
         "pose_video_path": "pose_video.mp4",
         "width": 512,
         "height": 768,
-        "length": 12,
         "num_inference_steps": 25,
         "cfg": 3.5,
         "seed": 123
@@ -28,12 +28,12 @@ inputs = {
 # Simulate an inference call
 output = handler(inputs)
-# Decode the base64 video output
-video_base64 = output.get("video", "")
-video_bytes = base64.b64decode(video_base64)
-# Save the video to a file
-with open("output_video.mp4", "wb") as video_file:
-    video_file.write(video_bytes)
 print("Inference completed. Output video saved as output_video.mp4")

         "pose_video_path": "pose_video.mp4",
         "width": 512,
         "height": 768,
+        "length": 24,
         "num_inference_steps": 25,
         "cfg": 3.5,
         "seed": 123
 # Simulate an inference call
 output = handler(inputs)
+# # Decode the base64 video output
+# video_base64 = output.get("video", "")
+# video_bytes = base64.b64decode(video_base64)
+# # Save the video to a file
+# with open("output_video.mp4", "wb") as video_file:
+#     video_file.write(video_bytes)
 print("Inference completed. Output video saved as output_video.mp4")

sped_up_pose_video.mp4 ADDED Viewed

Binary file (131 kB). View file