root committed
Commit dd31ccf · 1 Parent(s): 1bfd414

setting up model

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. __pycache__/handler.cpython-310.pyc +0 -0
  2. configs/inference/inference_v1.yaml +23 -0
  3. configs/inference/inference_v2.yaml +35 -0
  4. configs/prompts/animation.yaml +26 -0
  5. gfpgan/weights/detection_Resnet50_Final.pth +3 -0
  6. gfpgan/weights/parsing_parsenet.pth +3 -0
  7. good_face.jpeg +0 -0
  8. handler.py +247 -0
  9. input.jpg +0 -0
  10. models/GFPGANv1.4.pth +3 -0
  11. models/inswapper_128.onnx +3 -0
  12. output.mp4 +0 -0
  13. output/gradio/animation_output.mp4 +0 -0
  14. output/gradio/cropped_face.jpg +0 -0
  15. output/gradio/output_video.mp4 +0 -0
  16. pose_video.mp4 +0 -0
  17. pretrained_weights/DWPose/dw-ll_ucoco_384.onnx +3 -0
  18. pretrained_weights/DWPose/yolox_l.onnx +3 -0
  19. pretrained_weights/denoising_unet.pth +3 -0
  20. pretrained_weights/image_encoder/config.json +23 -0
  21. pretrained_weights/image_encoder/pytorch_model.bin +3 -0
  22. pretrained_weights/motion_module.pth +3 -0
  23. pretrained_weights/pose_guider.pth +3 -0
  24. pretrained_weights/reference_unet.pth +3 -0
  25. pretrained_weights/sd-vae-ft-mse/config.json +29 -0
  26. pretrained_weights/sd-vae-ft-mse/diffusion_pytorch_model.bin +3 -0
  27. pretrained_weights/sd-vae-ft-mse/diffusion_pytorch_model.safetensors +3 -0
  28. pretrained_weights/stable-diffusion-v1-5/feature_extractor/preprocessor_config.json +20 -0
  29. pretrained_weights/stable-diffusion-v1-5/model_index.json +32 -0
  30. pretrained_weights/stable-diffusion-v1-5/unet/config.json +36 -0
  31. pretrained_weights/stable-diffusion-v1-5/unet/diffusion_pytorch_model.bin +3 -0
  32. pretrained_weights/stable-diffusion-v1-5/v1-inference.yaml +70 -0
  33. requirements.txt +39 -0
  34. roop/__init__.py +0 -0
  35. roop/__pycache__/__init__.cpython-310.pyc +0 -0
  36. roop/__pycache__/capturer.cpython-310.pyc +0 -0
  37. roop/__pycache__/core.cpython-310.pyc +0 -0
  38. roop/__pycache__/face_analyser.cpython-310.pyc +0 -0
  39. roop/__pycache__/globals.cpython-310.pyc +0 -0
  40. roop/__pycache__/metadata.cpython-310.pyc +0 -0
  41. roop/__pycache__/predicter.cpython-310.pyc +0 -0
  42. roop/__pycache__/typing.cpython-310.pyc +0 -0
  43. roop/__pycache__/ui.cpython-310.pyc +0 -0
  44. roop/__pycache__/utilities.cpython-310.pyc +0 -0
  45. roop/capturer.py +20 -0
  46. roop/core.py +215 -0
  47. roop/face_analyser.py +34 -0
  48. roop/globals.py +17 -0
  49. roop/metadata.py +2 -0
  50. roop/predicter.py +43 -0
__pycache__/handler.cpython-310.pyc ADDED
Binary file (8.09 kB).
 
configs/inference/inference_v1.yaml ADDED
@@ -0,0 +1,23 @@
+ unet_additional_kwargs:
+   unet_use_cross_frame_attention: false
+   unet_use_temporal_attention: false
+   use_motion_module: true
+   motion_module_resolutions: [1,2,4,8]
+   motion_module_mid_block: false
+   motion_module_decoder_only: false
+   motion_module_type: "Vanilla"
+
+   motion_module_kwargs:
+     num_attention_heads: 8
+     num_transformer_block: 1
+     attention_block_types: [ "Temporal_Self", "Temporal_Self" ]
+     temporal_position_encoding: true
+     temporal_position_encoding_max_len: 24
+     temporal_attention_dim_div: 1
+
+ noise_scheduler_kwargs:
+   beta_start: 0.00085
+   beta_end: 0.012
+   beta_schedule: "linear"
+   steps_offset: 1
+   clip_sample: False
configs/inference/inference_v2.yaml ADDED
@@ -0,0 +1,35 @@
+ unet_additional_kwargs:
+   use_inflated_groupnorm: true
+   unet_use_cross_frame_attention: false
+   unet_use_temporal_attention: false
+   use_motion_module: true
+   motion_module_resolutions:
+   - 1
+   - 2
+   - 4
+   - 8
+   motion_module_mid_block: true
+   motion_module_decoder_only: false
+   motion_module_type: Vanilla
+   motion_module_kwargs:
+     num_attention_heads: 8
+     num_transformer_block: 1
+     attention_block_types:
+     - Temporal_Self
+     - Temporal_Self
+     temporal_position_encoding: true
+     temporal_position_encoding_max_len: 32
+     temporal_attention_dim_div: 1
+
+ noise_scheduler_kwargs:
+   beta_start: 0.00085
+   beta_end: 0.012
+   beta_schedule: "linear"
+   clip_sample: false
+   steps_offset: 1
+   ### Zero-SNR params
+   prediction_type: "v_prediction"
+   rescale_betas_zero_snr: True
+   timestep_spacing: "trailing"
+
+ sampler: DDIM
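
handler.py (added later in this commit) builds its DDIM scheduler directly from the noise_scheduler_kwargs block of this config. A minimal sketch of that wiring, assuming only diffusers and omegaconf as pinned in requirements.txt:

    from omegaconf import OmegaConf
    from diffusers import DDIMScheduler

    # Load the inference config added in this commit and build the scheduler
    # the same way handler.py does (OmegaConf -> plain dict -> DDIMScheduler).
    infer_config = OmegaConf.load("./configs/inference/inference_v2.yaml")
    sched_kwargs = OmegaConf.to_container(infer_config.noise_scheduler_kwargs)
    scheduler = DDIMScheduler(**sched_kwargs)  # v-prediction, zero-SNR rescale, trailing spacing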
configs/prompts/animation.yaml ADDED
@@ -0,0 +1,26 @@
+ pretrained_base_model_path: "./pretrained_weights/stable-diffusion-v1-5/"
+ pretrained_vae_path: "./pretrained_weights/sd-vae-ft-mse"
+ image_encoder_path: "./pretrained_weights/image_encoder"
+ denoising_unet_path: "./pretrained_weights/denoising_unet.pth"
+ reference_unet_path: "./pretrained_weights/reference_unet.pth"
+ pose_guider_path: "./pretrained_weights/pose_guider.pth"
+ motion_module_path: "./pretrained_weights/motion_module.pth"
+
+ inference_config: "./configs/inference/inference_v2.yaml"
+ weight_dtype: 'fp16'
+
+ test_cases:
+   "./configs/inference/ref_images/anyone-2.png":
+     - "./configs/inference/pose_videos/anyone-video-2_kps.mp4"
+     - "./configs/inference/pose_videos/anyone-video-5_kps.mp4"
+   "./configs/inference/ref_images/anyone-10.png":
+     - "./configs/inference/pose_videos/anyone-video-1_kps.mp4"
+     - "./configs/inference/pose_videos/anyone-video-2_kps.mp4"
+   "./configs/inference/ref_images/anyone-11.png":
+     - "./configs/inference/pose_videos/anyone-video-1_kps.mp4"
+     - "./configs/inference/pose_videos/anyone-video-2_kps.mp4"
+   "./configs/inference/ref_images/anyone-3.png":
+     - "./configs/inference/pose_videos/anyone-video-2_kps.mp4"
+     - "./configs/inference/pose_videos/anyone-video-5_kps.mp4"
+   "./configs/inference/ref_images/anyone-5.png":
+     - "./configs/inference/pose_videos/anyone-video-2_kps.mp4"
gfpgan/weights/detection_Resnet50_Final.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6d1de9c2944f2ccddca5f5e010ea5ae64a39845a86311af6fdf30841b0a5a16d
+ size 109497761
gfpgan/weights/parsing_parsenet.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3d558d8d0e42c20224f13cf5a29c79eba2d59913419f945545d8cf7b72920de2
+ size 85331193
good_face.jpeg ADDED
handler.py ADDED
@@ -0,0 +1,247 @@
+ from typing import Dict, Any
+ import torch
+ from PIL import Image
+ import base64
+ from io import BytesIO
+ import numpy as np
+ from diffusers import AutoencoderKL, DDIMScheduler
+ from einops import repeat
+ from omegaconf import OmegaConf
+ from transformers import CLIPVisionModelWithProjection
+ import cv2
+ import os
+ from backgroundremover.bg import remove as remove_bg
+ from src.models.pose_guider import PoseGuider
+ from src.models.unet_2d_condition import UNet2DConditionModel
+ from src.models.unet_3d import UNet3DConditionModel
+ from src.pipelines.pipeline_pose2vid_long import Pose2VideoPipeline
+ from src.utils.util import read_frames, get_fps, save_videos_grid
+ import roop.globals
+ from roop.core import start, decode_execution_providers, suggest_max_memory, suggest_execution_threads
+ from roop.utilities import normalize_output_path
+ from roop.processors.frame.core import get_frame_processors_modules
+
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+ if device.type != 'cuda':
+     raise ValueError("The model requires a GPU for inference.")
+
+ class EndpointHandler():
+     def __init__(self, path=""):
+         self.config = OmegaConf.load("./configs/prompts/animation.yaml")
+         self.weight_dtype = torch.float16
+         self.pipeline = None
+         self._initialize_pipeline()
+
+     def _initialize_pipeline(self):
+         vae = AutoencoderKL.from_pretrained('./pretrained_weights/sd-vae-ft-mse').to(device, dtype=self.weight_dtype)
+
+         reference_unet = UNet2DConditionModel.from_pretrained(
+             self.config.pretrained_base_model_path,
+             subfolder="unet"
+         ).to(device, dtype=self.weight_dtype)
+
+         inference_config_path = self.config.inference_config
+         infer_config = OmegaConf.load(inference_config_path)
+         denoising_unet = UNet3DConditionModel.from_pretrained_2d(
+             self.config.pretrained_base_model_path,
+             self.config.motion_module_path,
+             subfolder="unet",
+             unet_additional_kwargs=infer_config.unet_additional_kwargs,
+         ).to(device, dtype=self.weight_dtype)
+
+         pose_guider = PoseGuider(320, block_out_channels=(16, 32, 96, 256)).to(device, dtype=self.weight_dtype)
+         image_enc = CLIPVisionModelWithProjection.from_pretrained(self.config.image_encoder_path).to(device, dtype=self.weight_dtype)
+         sched_kwargs = OmegaConf.to_container(infer_config.noise_scheduler_kwargs)
+         scheduler = DDIMScheduler(**sched_kwargs)
+
+         denoising_unet.load_state_dict(torch.load(self.config.denoising_unet_path, map_location="cpu"), strict=False)
+         reference_unet.load_state_dict(torch.load(self.config.reference_unet_path, map_location="cpu"))
+         pose_guider.load_state_dict(torch.load(self.config.pose_guider_path, map_location="cpu"))
+
+         self.pipeline = Pose2VideoPipeline(
+             vae=vae,
+             image_encoder=image_enc,
+             reference_unet=reference_unet,
+             denoising_unet=denoising_unet,
+             pose_guider=pose_guider,
+             scheduler=scheduler
+         ).to(device, dtype=self.weight_dtype)
+
+     def _crop_face(self, image, save_path="cropped_face.jpg", margin=0.3):
+         # Convert image to OpenCV format
+         cv_image = np.array(image)
+         cv_image = cv_image[:, :, ::-1].copy()
+
+         # Load OpenCV face detector
+         face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
+
+         # Detect faces
+         gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY)
+         faces = face_cascade.detectMultiScale(gray, 1.1, 4)
+
+         if len(faces) == 0:
+             raise ValueError("No faces detected in the reference image.")
+
+         # Crop the first face found with a margin
+         x, y, w, h = faces[0]
+         x_margin = int(margin * w)
+         y_margin = int(margin * h)
+
+         x1 = max(0, x - x_margin)
+         y1 = max(0, y - y_margin)
+         x2 = min(cv_image.shape[1], x + w + x_margin)
+         y2 = min(cv_image.shape[0], y + h + y_margin)
+
+         cropped_face = cv_image[y1:y2, x1:x2]
+
+         # Convert back to PIL format
+         cropped_face = Image.fromarray(cropped_face[:, :, ::-1]).convert("RGB")
+
+         # Save the cropped face
+         cropped_face.save(save_path, format="JPEG", quality=95)
+
+         return cropped_face
+
+     def _swap_face(self, source_image, target_video_path):
+         # Use a predefined face image instead of the provided source_image
+         source_path = "/root/AnimateAnyone/good_face.jpeg"  # Change this to your known good face image path
+         output_path = "output.mp4"
+
+         roop.globals.source_path = source_path
+         roop.globals.target_path = target_video_path
+         roop.globals.output_path = normalize_output_path(roop.globals.source_path, roop.globals.target_path, output_path)
+         roop.globals.frame_processors = ["face_swapper", "face_enhancer"]
+         roop.globals.headless = True
+         roop.globals.keep_fps = True
+         roop.globals.keep_audio = True
+         roop.globals.keep_frames = False
+         roop.globals.many_faces = False
+         roop.globals.video_encoder = "libx264"
+         roop.globals.video_quality = 50
+         roop.globals.max_memory = suggest_max_memory()
+         roop.globals.execution_providers = decode_execution_providers(["cpu"])
+         roop.globals.execution_threads = suggest_execution_threads()
+
+         for frame_processor in get_frame_processors_modules(roop.globals.frame_processors):
+             if not frame_processor.pre_check():
+                 raise ValueError("Frame processor pre-check failed.")
+
+         print(f"Starting face swap with source: {source_path} and target: {target_video_path}")
+         start()
+         print(f"Face swap completed. Output saved to: {output_path}")
+
+         return os.path.join(os.getcwd(), output_path)
+
+
+     def remove_bg_from_image(self, image_data):
+         model_name = "u2net"  # Choose your preferred model: "u2net", "u2net_human_seg", "u2netp"
+         processed_image_data = remove_bg(
+             image_data,
+             model_name=model_name,
+             alpha_matting=True,
+             alpha_matting_foreground_threshold=240,
+             alpha_matting_background_threshold=10,
+             alpha_matting_erode_structure_size=10,
+             alpha_matting_base_size=1000
+         )
+         return processed_image_data
+
+     def _remove_background(self, input_path, output_path):
+         cap = cv2.VideoCapture(input_path)
+         if not cap.isOpened():
+             raise IOError(f"Error opening video file {input_path}")
+
+         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+         fps = int(cap.get(cv2.CAP_PROP_FPS))
+
+         fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+         out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
+
+         frame_count = 0
+         while cap.isOpened():
+             ret, frame = cap.read()
+             if not ret:
+                 break
+
+             frame_count += 1
+             pil_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+             frame_data = BytesIO()
+             pil_frame.save(frame_data, format="PNG")
+             frame_data = frame_data.getvalue()
+             processed_frame_data = self.remove_bg_from_image(frame_data)
+             processed_pil_frame = Image.open(BytesIO(processed_frame_data))
+             processed_frame = cv2.cvtColor(np.array(processed_pil_frame), cv2.COLOR_RGB2BGR)
+
+             out.write(processed_frame)
+
+         cap.release()
+         out.release()
+
+         if frame_count == 0:
+             raise IOError(f"No frames processed. Error with video file {input_path}")
+
+     def __call__(self, data: Any) -> Dict[str, str]:
+         inputs = data.get("inputs", {})
+         ref_image_base64 = inputs.get("ref_image", "")
+         pose_video_path = inputs.get("pose_video_path", "")
+         width = inputs.get("width", 512)
+         height = inputs.get("height", 768)
+         length = inputs.get("length", 24)
+         num_inference_steps = inputs.get("num_inference_steps", 25)
+         cfg = inputs.get("cfg", 3.5)
+         seed = inputs.get("seed", 123)
+
+         ref_image = Image.open(BytesIO(base64.b64decode(ref_image_base64)))
+
+         torch.manual_seed(seed)
+         pose_images = read_frames(pose_video_path)
+         src_fps = get_fps(pose_video_path)
+
+         pose_list = []
+         total_length = min(length, len(pose_images))
+         for pose_image_pil in pose_images[:total_length]:
+             pose_list.append(pose_image_pil)
+
+         video = self.pipeline(
+             ref_image,
+             pose_list,
+             width=width,
+             height=height,
+             video_length=total_length,
+             num_inference_steps=num_inference_steps,
+             guidance_scale=cfg
+         ).videos
+
+         save_dir = f"./output/gradio"
+         if not os.path.exists(save_dir):
+             os.makedirs(save_dir, exist_ok=True)
+         animation_path = os.path.join(save_dir, "animation_output.mp4")
+         save_videos_grid(video, animation_path, n_rows=1, fps=src_fps)
+
+         # Crop the face from the reference image and save it
+         cropped_face_path = os.path.join(save_dir, "cropped_face.jpg")
+         cropped_face = self._crop_face(ref_image, save_path=cropped_face_path)
+
+         # Perform face swapping
+         print(f"Starting face swap with cropped face: {cropped_face_path} and animation: {animation_path}")
+         final_video_path = self._swap_face(cropped_face, animation_path)
+         print(f"Face swap completed. Final video path: {final_video_path}")
+
+         # Ensure the output file exists before trying to open it
+         if not os.path.exists(final_video_path):
+             raise FileNotFoundError(f"Expected output file not found: {final_video_path}")
+
+         # Remove the background from the final video
+         bg_removed_video_path = os.path.join(save_dir, "bg_removed_output.mp4")
+         self._remove_background(final_video_path, bg_removed_video_path)
+         print(f"Background removal completed. Output saved to: {bg_removed_video_path}")
+
+         # Encode the final video in base64
+         with open(bg_removed_video_path, "rb") as video_file:
+             video_base64 = base64.b64encode(video_file.read()).decode("utf-8")
+
+         torch.cuda.empty_cache()
+
+         return {"video": video_base64}
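
The handler expects a JSON-style payload with a base64-encoded reference image and a pose video path, and returns the final video as base64. A hedged sketch of invoking it locally ("face.jpg" and "result.mp4" are placeholder file names; pose_video.mp4 ships with this commit):

    import base64
    from handler import EndpointHandler

    handler = EndpointHandler()
    with open("face.jpg", "rb") as f:
        ref_image_b64 = base64.b64encode(f.read()).decode("utf-8")

    result = handler({
        "inputs": {
            "ref_image": ref_image_b64,
            "pose_video_path": "pose_video.mp4",
            "width": 512, "height": 768, "length": 24,
            "num_inference_steps": 25, "cfg": 3.5, "seed": 123,
        }
    })
    with open("result.mp4", "wb") as f:
        f.write(base64.b64decode(result["video"]))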
input.jpg ADDED
models/GFPGANv1.4.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e2cd4703ab14f4d01fd1383a8a8b266f9a5833dacee8e6a79d3bf21a1b6be5ad
+ size 348632874
models/inswapper_128.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e4a3f08c753cb72d04e10aa0f7dbe3deebbf39567d4ead6dce08e98aa49e16af
+ size 554253681
output.mp4 ADDED
Binary file (96.8 kB).
 
output/gradio/animation_output.mp4 ADDED
Binary file (79.9 kB).
 
output/gradio/cropped_face.jpg ADDED
output/gradio/output_video.mp4 ADDED
Binary file (840 kB).
 
pose_video.mp4 ADDED
Binary file (755 kB).
 
pretrained_weights/DWPose/dw-ll_ucoco_384.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:724f4ff2439ed61afb86fb8a1951ec39c6220682803b4a8bd4f598cd913b1843
+ size 134399116
pretrained_weights/DWPose/yolox_l.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7860ae79de6c89a3c1eb72ae9a2756c0ccfbe04b7791bb5880afabd97855a411
+ size 216746733
pretrained_weights/denoising_unet.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b9e5a2c34fac369e8a922972ca2210916c6af175a0dad907deccf6235816ad52
+ size 3438374293
pretrained_weights/image_encoder/config.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "_name_or_path": "/home/jpinkney/.cache/huggingface/diffusers/models--lambdalabs--sd-image-variations-diffusers/snapshots/ca6f97f838ae1b5bf764f31363a21f388f4d8f3e/image_encoder",
+   "architectures": [
+     "CLIPVisionModelWithProjection"
+   ],
+   "attention_dropout": 0.0,
+   "dropout": 0.0,
+   "hidden_act": "quick_gelu",
+   "hidden_size": 1024,
+   "image_size": 224,
+   "initializer_factor": 1.0,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "model_type": "clip_vision_model",
+   "num_attention_heads": 16,
+   "num_channels": 3,
+   "num_hidden_layers": 24,
+   "patch_size": 14,
+   "projection_dim": 768,
+   "torch_dtype": "float32",
+   "transformers_version": "4.25.1"
+ }
pretrained_weights/image_encoder/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89d2aa29b5fdf64f3ad4f45fb4227ea98bc45156bbae673b85be1af7783dbabb
+ size 1215993967
pretrained_weights/motion_module.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d11e01a281b39880da2efeea892215c1313e5713fca3d100a7fbb72ee312ef9
+ size 1817900227
pretrained_weights/pose_guider.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1a8b7c1b4db92980fd977b4fd003c1396bbae9a9cdea00c35d452136d5e4f488
+ size 4351337
pretrained_weights/reference_unet.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:beddccb08d49a8b29b0f4d6d456c6521d4382a8d8d48884fa60ba8802509c214
+ size 3438323817
pretrained_weights/sd-vae-ft-mse/config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "_class_name": "AutoencoderKL",
+   "_diffusers_version": "0.4.2",
+   "act_fn": "silu",
+   "block_out_channels": [
+     128,
+     256,
+     512,
+     512
+   ],
+   "down_block_types": [
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D"
+   ],
+   "in_channels": 3,
+   "latent_channels": 4,
+   "layers_per_block": 2,
+   "norm_num_groups": 32,
+   "out_channels": 3,
+   "sample_size": 256,
+   "up_block_types": [
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D"
+   ]
+ }
pretrained_weights/sd-vae-ft-mse/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b4889b6b1d4ce7ae320a02dedaeff1780ad77d415ea0d744b476155c6377ddc
+ size 334707217
pretrained_weights/sd-vae-ft-mse/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a1d993488569e928462932c8c38a0760b874d166399b14414135bd9c42df5815
+ size 334643276
pretrained_weights/stable-diffusion-v1-5/feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "crop_size": 224,
+   "do_center_crop": true,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_resize": true,
+   "feature_extractor_type": "CLIPFeatureExtractor",
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "resample": 3,
+   "size": 224
+ }
pretrained_weights/stable-diffusion-v1-5/model_index.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "_class_name": "StableDiffusionPipeline",
+   "_diffusers_version": "0.6.0",
+   "feature_extractor": [
+     "transformers",
+     "CLIPImageProcessor"
+   ],
+   "safety_checker": [
+     "stable_diffusion",
+     "StableDiffusionSafetyChecker"
+   ],
+   "scheduler": [
+     "diffusers",
+     "PNDMScheduler"
+   ],
+   "text_encoder": [
+     "transformers",
+     "CLIPTextModel"
+   ],
+   "tokenizer": [
+     "transformers",
+     "CLIPTokenizer"
+   ],
+   "unet": [
+     "diffusers",
+     "UNet2DConditionModel"
+   ],
+   "vae": [
+     "diffusers",
+     "AutoencoderKL"
+   ]
+ }
pretrained_weights/stable-diffusion-v1-5/unet/config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "_class_name": "UNet2DConditionModel",
+   "_diffusers_version": "0.6.0",
+   "act_fn": "silu",
+   "attention_head_dim": 8,
+   "block_out_channels": [
+     320,
+     640,
+     1280,
+     1280
+   ],
+   "center_input_sample": false,
+   "cross_attention_dim": 768,
+   "down_block_types": [
+     "CrossAttnDownBlock2D",
+     "CrossAttnDownBlock2D",
+     "CrossAttnDownBlock2D",
+     "DownBlock2D"
+   ],
+   "downsample_padding": 1,
+   "flip_sin_to_cos": true,
+   "freq_shift": 0,
+   "in_channels": 4,
+   "layers_per_block": 2,
+   "mid_block_scale_factor": 1,
+   "norm_eps": 1e-05,
+   "norm_num_groups": 32,
+   "out_channels": 4,
+   "sample_size": 64,
+   "up_block_types": [
+     "UpBlock2D",
+     "CrossAttnUpBlock2D",
+     "CrossAttnUpBlock2D",
+     "CrossAttnUpBlock2D"
+   ]
+ }
pretrained_weights/stable-diffusion-v1-5/unet/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c7da0e21ba7ea50637bee26e81c220844defdf01aafca02b2c42ecdadb813de4
+ size 3438354725
pretrained_weights/stable-diffusion-v1-5/v1-inference.yaml ADDED
@@ -0,0 +1,70 @@
+ model:
+   base_learning_rate: 1.0e-04
+   target: ldm.models.diffusion.ddpm.LatentDiffusion
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false   # Note: different from the one we trained before
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+
+     scheduler_config: # 10000 warmup steps
+       target: ldm.lr_scheduler.LambdaLinearScheduler
+       params:
+         warm_up_steps: [ 10000 ]
+         cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+         f_start: [ 1.e-6 ]
+         f_max: [ 1. ]
+         f_min: [ 1. ]
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
requirements.txt ADDED
@@ -0,0 +1,39 @@
+ --extra-index-url https://download.pytorch.org/whl/cu118
+
+ numpy==1.23.5
+ opencv-python==4.7.0.72
+ onnx==1.14.0
+ insightface==0.7.3
+ psutil==5.9.5
+ tk==0.1.0
+ customtkinter==5.1.3
+ pillow==9.5.0
+ torch==2.0.1+cu118; sys_platform != 'darwin'
+ torch==2.0.1; sys_platform == 'darwin'
+ torchvision==0.15.2+cu118; sys_platform != 'darwin'
+ torchvision==0.15.2; sys_platform == 'darwin'
+ onnxruntime==1.15.0; sys_platform == 'darwin' and platform_machine != 'arm64'
+ onnxruntime-silicon==1.13.1; sys_platform == 'darwin' and platform_machine == 'arm64'
+ onnxruntime-gpu==1.15.0; sys_platform != 'darwin'
+ tensorflow==2.13.0rc1; sys_platform == 'darwin'
+ tensorflow==2.12.0; sys_platform != 'darwin'
+ opennsfw2==0.10.2
+ protobuf==4.23.2
+ tqdm==4.65.0
+ gfpgan==1.3.8
+ gradio==3.40.1
+ tkinterdnd2==0.3.0; sys_platform != 'darwin' and platform_machine != 'arm64'
+ tkinterdnd2-universal==1.7.3; sys_platform == 'darwin' and platform_machine == 'arm64'
+ onnxruntime-coreml==1.13.1; python_version == '3.9' and sys_platform == 'darwin' and platform_machine != 'arm64'
+
+ # Add additional dependencies
+ diffusers==0.24.0
+ omegaconf==2.2.3
+
+ # Face swap related dependencies
+ facenet-pytorch==2.5.2
+ dlib==19.22.0
+
+
+ # Background removal
+ backgroundremover
roop/__init__.py ADDED
File without changes
roop/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (129 Bytes).
 
roop/__pycache__/capturer.cpython-310.pyc ADDED
Binary file (803 Bytes).
 
roop/__pycache__/core.cpython-310.pyc ADDED
Binary file (8.33 kB).
 
roop/__pycache__/face_analyser.cpython-310.pyc ADDED
Binary file (1.25 kB).
 
roop/__pycache__/globals.cpython-310.pyc ADDED
Binary file (525 Bytes).
 
roop/__pycache__/metadata.cpython-310.pyc ADDED
Binary file (164 Bytes).
 
roop/__pycache__/predicter.cpython-310.pyc ADDED
Binary file (1.65 kB).
 
roop/__pycache__/typing.cpython-310.pyc ADDED
Binary file (267 Bytes).
 
roop/__pycache__/ui.cpython-310.pyc ADDED
Binary file (8.39 kB).
 
roop/__pycache__/utilities.cpython-310.pyc ADDED
Binary file (5.58 kB).
 
roop/capturer.py ADDED
@@ -0,0 +1,20 @@
+ from typing import Any
+ import cv2
+
+
+ def get_video_frame(video_path: str, frame_number: int = 0) -> Any:
+     capture = cv2.VideoCapture(video_path)
+     frame_total = capture.get(cv2.CAP_PROP_FRAME_COUNT)
+     capture.set(cv2.CAP_PROP_POS_FRAMES, min(frame_total, frame_number - 1))
+     has_frame, frame = capture.read()
+     capture.release()
+     if has_frame:
+         return frame
+     return None
+
+
+ def get_video_frame_total(video_path: str) -> int:
+     capture = cv2.VideoCapture(video_path)
+     video_frame_total = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
+     capture.release()
+     return video_frame_total
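
A short usage sketch of these helpers against the pose_video.mp4 bundled in this commit (get_video_frame returns a BGR numpy array, or None if the read fails):

    from roop.capturer import get_video_frame, get_video_frame_total

    total = get_video_frame_total("pose_video.mp4")
    middle_frame = get_video_frame("pose_video.mp4", total // 2)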
roop/core.py ADDED
@@ -0,0 +1,215 @@
+ #!/usr/bin/env python3
+
+ import os
+ import sys
+ # single thread doubles cuda performance - needs to be set before torch import
+ if any(arg.startswith('--execution-provider') for arg in sys.argv):
+     os.environ['OMP_NUM_THREADS'] = '1'
+ # reduce tensorflow log level
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+ import warnings
+ from typing import List
+ import platform
+ import signal
+ import shutil
+ import argparse
+ import torch
+ import onnxruntime
+ import tensorflow
+
+ import roop.globals
+ import roop.metadata
+ import roop.ui as ui
+ from roop.predicter import predict_image, predict_video
+ from roop.processors.frame.core import get_frame_processors_modules
+ from roop.utilities import has_image_extension, is_image, is_video, detect_fps, create_video, extract_frames, get_temp_frame_paths, restore_audio, create_temp, move_temp, clean_temp, normalize_output_path
+
+ if 'ROCMExecutionProvider' in roop.globals.execution_providers:
+     del torch
+
+ warnings.filterwarnings('ignore', category=FutureWarning, module='insightface')
+ warnings.filterwarnings('ignore', category=UserWarning, module='torchvision')
+
+
+ def parse_args() -> None:
+     signal.signal(signal.SIGINT, lambda signal_number, frame: destroy())
+     program = argparse.ArgumentParser(formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=100))
+     program.add_argument('-s', '--source', help='select an source image', dest='source_path')
+     program.add_argument('-t', '--target', help='select an target image or video', dest='target_path')
+     program.add_argument('-o', '--output', help='select output file or directory', dest='output_path')
+     program.add_argument('--frame-processor', help='frame processors (choices: face_swapper, face_enhancer, ...)', dest='frame_processor', default=['face_swapper'], nargs='+')
+     program.add_argument('--keep-fps', help='keep original fps', dest='keep_fps', action='store_true', default=False)
+     program.add_argument('--keep-audio', help='keep original audio', dest='keep_audio', action='store_true', default=True)
+     program.add_argument('--keep-frames', help='keep temporary frames', dest='keep_frames', action='store_true', default=False)
+     program.add_argument('--many-faces', help='process every face', dest='many_faces', action='store_true', default=False)
+     program.add_argument('--video-encoder', help='adjust output video encoder', dest='video_encoder', default='libx264', choices=['libx264', 'libx265', 'libvpx-vp9'])
+     program.add_argument('--video-quality', help='adjust output video quality', dest='video_quality', type=int, default=18, choices=range(52), metavar='[0-51]')
+     program.add_argument('--max-memory', help='maximum amount of RAM in GB', dest='max_memory', type=int, default=suggest_max_memory())
+     program.add_argument('--execution-provider', help='available execution provider (choices: cpu, ...)', dest='execution_provider', default=['cpu'], choices=suggest_execution_providers(), nargs='+')
+     program.add_argument('--execution-threads', help='number of execution threads', dest='execution_threads', type=int, default=suggest_execution_threads())
+     program.add_argument('-v', '--version', action='version', version=f'{roop.metadata.name} {roop.metadata.version}')
+
+     args = program.parse_args()
+
+     roop.globals.source_path = args.source_path
+     roop.globals.target_path = args.target_path
+     roop.globals.output_path = normalize_output_path(roop.globals.source_path, roop.globals.target_path, args.output_path)
+     roop.globals.frame_processors = args.frame_processor
+     roop.globals.headless = args.source_path or args.target_path or args.output_path
+     roop.globals.keep_fps = args.keep_fps
+     roop.globals.keep_audio = args.keep_audio
+     roop.globals.keep_frames = args.keep_frames
+     roop.globals.many_faces = args.many_faces
+     roop.globals.video_encoder = args.video_encoder
+     roop.globals.video_quality = args.video_quality
+     roop.globals.max_memory = args.max_memory
+     roop.globals.execution_providers = decode_execution_providers(args.execution_provider)
+     roop.globals.execution_threads = args.execution_threads
+
+
+ def encode_execution_providers(execution_providers: List[str]) -> List[str]:
+     return [execution_provider.replace('ExecutionProvider', '').lower() for execution_provider in execution_providers]
+
+
+ def decode_execution_providers(execution_providers: List[str]) -> List[str]:
+     return [provider for provider, encoded_execution_provider in zip(onnxruntime.get_available_providers(), encode_execution_providers(onnxruntime.get_available_providers()))
+             if any(execution_provider in encoded_execution_provider for execution_provider in execution_providers)]
+
+
+ def suggest_max_memory() -> int:
+     if platform.system().lower() == 'darwin':
+         return 10
+     return 14
+
+
+ def suggest_execution_providers() -> List[str]:
+     return encode_execution_providers(onnxruntime.get_available_providers())
+
+
+ def suggest_execution_threads() -> int:
+     if 'DmlExecutionProvider' in roop.globals.execution_providers:
+         return 1
+     if 'ROCMExecutionProvider' in roop.globals.execution_providers:
+         return 1
+     return 8
+
+
+ def limit_resources() -> None:
+     # prevent tensorflow memory leak
+     gpus = tensorflow.config.experimental.list_physical_devices('GPU')
+     for gpu in gpus:
+         tensorflow.config.experimental.set_virtual_device_configuration(gpu, [
+             tensorflow.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)
+         ])
+     # limit memory usage
+     if roop.globals.max_memory:
+         memory = roop.globals.max_memory * 1024 ** 3
+         if platform.system().lower() == 'darwin':
+             memory = roop.globals.max_memory * 1024 ** 6
+         if platform.system().lower() == 'windows':
+             import ctypes
+             kernel32 = ctypes.windll.kernel32
+             kernel32.SetProcessWorkingSetSize(-1, ctypes.c_size_t(memory), ctypes.c_size_t(memory))
+         else:
+             import resource
+             resource.setrlimit(resource.RLIMIT_DATA, (memory, memory))
+
+
+ def release_resources() -> None:
+     if 'CUDAExecutionProvider' in roop.globals.execution_providers:
+         torch.cuda.empty_cache()
+
+
+ def pre_check() -> bool:
+     if sys.version_info < (3, 9):
+         update_status('Python version is not supported - please upgrade to 3.9 or higher.')
+         return False
+     if not shutil.which('ffmpeg'):
+         update_status('ffmpeg is not installed.')
+         return False
+     return True
+
+
+ def update_status(message: str, scope: str = 'ROOP.CORE') -> None:
+     print(f'[{scope}] {message}')
+     if not roop.globals.headless:
+         ui.update_status(message)
+
+
+ def start() -> None:
+     for frame_processor in get_frame_processors_modules(roop.globals.frame_processors):
+         if not frame_processor.pre_start():
+             return
+     # process image to image
+     if has_image_extension(roop.globals.target_path):
+         if predict_image(roop.globals.target_path):
+             destroy()
+         shutil.copy2(roop.globals.target_path, roop.globals.output_path)
+         for frame_processor in get_frame_processors_modules(roop.globals.frame_processors):
+             update_status('Progressing...', frame_processor.NAME)
+             frame_processor.process_image(roop.globals.source_path, roop.globals.output_path, roop.globals.output_path)
+             frame_processor.post_process()
+             release_resources()
+         if is_image(roop.globals.target_path):
+             update_status('Processing to image succeed!')
+         else:
+             update_status('Processing to image failed!')
+         return
+     # process image to videos
+     if predict_video(roop.globals.target_path):
+         destroy()
+     update_status('Creating temp resources...')
+     create_temp(roop.globals.target_path)
+     update_status('Extracting frames...')
+     extract_frames(roop.globals.target_path)
+     temp_frame_paths = get_temp_frame_paths(roop.globals.target_path)
+     for frame_processor in get_frame_processors_modules(roop.globals.frame_processors):
+         update_status('Progressing...', frame_processor.NAME)
+         frame_processor.process_video(roop.globals.source_path, temp_frame_paths)
+         frame_processor.post_process()
+         release_resources()
+     # handles fps
+     if roop.globals.keep_fps:
+         update_status('Detecting fps...')
+         fps = detect_fps(roop.globals.target_path)
+         update_status(f'Creating video with {fps} fps...')
+         create_video(roop.globals.target_path, fps)
+     else:
+         update_status('Creating video with 30.0 fps...')
+         create_video(roop.globals.target_path)
+     # handle audio
+     if roop.globals.keep_audio:
+         if roop.globals.keep_fps:
+             update_status('Restoring audio...')
+         else:
+             update_status('Restoring audio might cause issues as fps are not kept...')
+         restore_audio(roop.globals.target_path, roop.globals.output_path)
+     else:
+         move_temp(roop.globals.target_path, roop.globals.output_path)
+     # clean and validate
+     clean_temp(roop.globals.target_path)
+     if is_video(roop.globals.target_path):
+         update_status('Processing to video succeed!')
+     else:
+         update_status('Processing to video failed!')
+
+
+ def destroy() -> None:
+     if roop.globals.target_path:
+         clean_temp(roop.globals.target_path)
+     quit()
+
+
+ def run() -> None:
+     parse_args()
+     if not pre_check():
+         return
+     for frame_processor in get_frame_processors_modules(roop.globals.frame_processors):
+         if not frame_processor.pre_check():
+             return
+     limit_resources()
+     if roop.globals.headless:
+         start()
+     else:
+         window = ui.init(start, destroy)
+         window.mainloop()
+
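
start() reads all of its settings from roop.globals, which is how handler.py drives the face swap headlessly without the Tk UI. A minimal sketch of that programmatic setup, mirroring handler.py._swap_face (the source/target paths below are placeholders):

    import roop.globals
    from roop.core import start, decode_execution_providers, suggest_max_memory, suggest_execution_threads
    from roop.utilities import normalize_output_path

    roop.globals.source_path = "good_face.jpeg"                       # placeholder source face
    roop.globals.target_path = "output/gradio/animation_output.mp4"   # placeholder target video
    roop.globals.output_path = normalize_output_path(roop.globals.source_path, roop.globals.target_path, "output.mp4")
    roop.globals.frame_processors = ["face_swapper"]
    roop.globals.headless = True
    roop.globals.keep_fps = True
    roop.globals.keep_audio = True
    roop.globals.keep_frames = False
    roop.globals.many_faces = False
    roop.globals.video_encoder = "libx264"
    roop.globals.video_quality = 18
    roop.globals.max_memory = suggest_max_memory()
    roop.globals.execution_providers = decode_execution_providers(["cpu"])
    roop.globals.execution_threads = suggest_execution_threads()
    start()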
roop/face_analyser.py ADDED
@@ -0,0 +1,34 @@
+ import threading
+ from typing import Any
+ import insightface
+
+ import roop.globals
+ from roop.typing import Frame
+
+ FACE_ANALYSER = None
+ THREAD_LOCK = threading.Lock()
+
+
+ def get_face_analyser() -> Any:
+     global FACE_ANALYSER
+
+     with THREAD_LOCK:
+         if FACE_ANALYSER is None:
+             FACE_ANALYSER = insightface.app.FaceAnalysis(name='buffalo_l', providers=roop.globals.execution_providers)
+             FACE_ANALYSER.prepare(ctx_id=0, det_size=(640, 640))
+     return FACE_ANALYSER
+
+
+ def get_one_face(frame: Frame) -> Any:
+     face = get_face_analyser().get(frame)
+     try:
+         return min(face, key=lambda x: x.bbox[0])
+     except ValueError:
+         return None
+
+
+ def get_many_faces(frame: Frame) -> Any:
+     try:
+         return get_face_analyser().get(frame)
+     except IndexError:
+         return None
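
get_one_face returns the left-most detected face (smallest bounding-box x), or None when nothing is found. A brief sketch against the input.jpg bundled in this commit:

    import cv2
    from roop.face_analyser import get_one_face

    # Detect the left-most face in the bundled input.jpg (None if no face is found).
    frame = cv2.imread("input.jpg")
    face = get_one_face(frame)
    if face is not None:
        print("Face bbox:", face.bbox)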
roop/globals.py ADDED
@@ -0,0 +1,17 @@
+ from typing import List
+
+ source_path = None
+ target_path = None
+ output_path = None
+ frame_processors: List[str] = []
+ keep_fps = None
+ keep_audio = None
+ keep_frames = None
+ many_faces = None
+ video_encoder = None
+ video_quality = None
+ max_memory = None
+ execution_providers: List[str] = []
+ execution_threads = None
+ headless = None
+ log_level = 'error'
roop/metadata.py ADDED
@@ -0,0 +1,2 @@
+ name = 'roop'
+ version = '1.1.0'
roop/predicter.py ADDED
@@ -0,0 +1,43 @@
+ import threading
+ import numpy
+ import opennsfw2
+ from PIL import Image
+ from keras import Model
+
+ from roop.typing import Frame
+
+ PREDICTOR = None
+ THREAD_LOCK = threading.Lock()
+ MAX_PROBABILITY = 0.85
+
+
+ def get_predictor() -> Model:
+     global PREDICTOR
+
+     with THREAD_LOCK:
+         if PREDICTOR is None:
+             PREDICTOR = opennsfw2.make_open_nsfw_model()
+     return PREDICTOR
+
+
+ def clear_predictor() -> None:
+     global PREDICTOR
+
+     PREDICTOR = None
+
+
+ def predict_frame(target_frame: Frame) -> bool:
+     image = Image.fromarray(target_frame)
+     image = opennsfw2.preprocess_image(image, opennsfw2.Preprocessing.YAHOO)
+     views = numpy.expand_dims(image, axis=0)
+     _, probability = get_predictor().predict(views)[0]
+     return probability > MAX_PROBABILITY
+
+
+ def predict_image(target_path: str) -> bool:
+     return opennsfw2.predict_image(target_path) > MAX_PROBABILITY
+
+
+ def predict_video(target_path: str) -> bool:
+     _, probabilities = opennsfw2.predict_video_frames(video_path=target_path, frame_interval=100)
+     return any(probability > MAX_PROBABILITY for probability in probabilities)
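
roop.core.start() calls predict_image/predict_video as an NSFW gate and aborts (destroy()) when they return True. A hedged sketch of running the same check standalone on the animation output produced by handler.py:

    from roop.predicter import predict_video

    # Mirrors the gate in roop.core.start(): reject the target before face swapping.
    if predict_video("output/gradio/animation_output.mp4"):
        raise SystemExit("Target video rejected by the NSFW check.")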