diff --git a/__pycache__/handler.cpython-310.pyc b/__pycache__/handler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d9dd10a3326afaf825aa214d6a5d44fb68888664 Binary files /dev/null and b/__pycache__/handler.cpython-310.pyc differ diff --git a/configs/inference/inference_v1.yaml b/configs/inference/inference_v1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e888888b547bf0316e7963a957fa905cb6fe9d65 --- /dev/null +++ b/configs/inference/inference_v1.yaml @@ -0,0 +1,23 @@ +unet_additional_kwargs: + unet_use_cross_frame_attention: false + unet_use_temporal_attention: false + use_motion_module: true + motion_module_resolutions: [1,2,4,8] + motion_module_mid_block: false + motion_module_decoder_only: false + motion_module_type: "Vanilla" + + motion_module_kwargs: + num_attention_heads: 8 + num_transformer_block: 1 + attention_block_types: [ "Temporal_Self", "Temporal_Self" ] + temporal_position_encoding: true + temporal_position_encoding_max_len: 24 + temporal_attention_dim_div: 1 + +noise_scheduler_kwargs: + beta_start: 0.00085 + beta_end: 0.012 + beta_schedule: "linear" + steps_offset: 1 + clip_sample: False \ No newline at end of file diff --git a/configs/inference/inference_v2.yaml b/configs/inference/inference_v2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d613dca2d2e48a41295a89f47b5a82fd7032dba5 --- /dev/null +++ b/configs/inference/inference_v2.yaml @@ -0,0 +1,35 @@ +unet_additional_kwargs: + use_inflated_groupnorm: true + unet_use_cross_frame_attention: false + unet_use_temporal_attention: false + use_motion_module: true + motion_module_resolutions: + - 1 + - 2 + - 4 + - 8 + motion_module_mid_block: true + motion_module_decoder_only: false + motion_module_type: Vanilla + motion_module_kwargs: + num_attention_heads: 8 + num_transformer_block: 1 + attention_block_types: + - Temporal_Self + - Temporal_Self + temporal_position_encoding: true + temporal_position_encoding_max_len: 32 + temporal_attention_dim_div: 1 + +noise_scheduler_kwargs: + beta_start: 0.00085 + beta_end: 0.012 + beta_schedule: "linear" + clip_sample: false + steps_offset: 1 + ### Zero-SNR params + prediction_type: "v_prediction" + rescale_betas_zero_snr: True + timestep_spacing: "trailing" + +sampler: DDIM \ No newline at end of file diff --git a/configs/prompts/animation.yaml b/configs/prompts/animation.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7895caf6e82ebf699916e07aed6193f1deb72e5 --- /dev/null +++ b/configs/prompts/animation.yaml @@ -0,0 +1,26 @@ +pretrained_base_model_path: "./pretrained_weights/stable-diffusion-v1-5/" +pretrained_vae_path: "./pretrained_weights/sd-vae-ft-mse" +image_encoder_path: "./pretrained_weights/image_encoder" +denoising_unet_path: "./pretrained_weights/denoising_unet.pth" +reference_unet_path: "./pretrained_weights/reference_unet.pth" +pose_guider_path: "./pretrained_weights/pose_guider.pth" +motion_module_path: "./pretrained_weights/motion_module.pth" + +inference_config: "./configs/inference/inference_v2.yaml" +weight_dtype: 'fp16' + +test_cases: + "./configs/inference/ref_images/anyone-2.png": + - "./configs/inference/pose_videos/anyone-video-2_kps.mp4" + - "./configs/inference/pose_videos/anyone-video-5_kps.mp4" + "./configs/inference/ref_images/anyone-10.png": + - "./configs/inference/pose_videos/anyone-video-1_kps.mp4" + - "./configs/inference/pose_videos/anyone-video-2_kps.mp4" + "./configs/inference/ref_images/anyone-11.png": + - 
"./configs/inference/pose_videos/anyone-video-1_kps.mp4" + - "./configs/inference/pose_videos/anyone-video-2_kps.mp4" + "./configs/inference/ref_images/anyone-3.png": + - "./configs/inference/pose_videos/anyone-video-2_kps.mp4" + - "./configs/inference/pose_videos/anyone-video-5_kps.mp4" + "./configs/inference/ref_images/anyone-5.png": + - "./configs/inference/pose_videos/anyone-video-2_kps.mp4" diff --git a/gfpgan/weights/detection_Resnet50_Final.pth b/gfpgan/weights/detection_Resnet50_Final.pth new file mode 100644 index 0000000000000000000000000000000000000000..16546738ce0a00a9fd47585e0fc52744d31cc117 --- /dev/null +++ b/gfpgan/weights/detection_Resnet50_Final.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d1de9c2944f2ccddca5f5e010ea5ae64a39845a86311af6fdf30841b0a5a16d +size 109497761 diff --git a/gfpgan/weights/parsing_parsenet.pth b/gfpgan/weights/parsing_parsenet.pth new file mode 100644 index 0000000000000000000000000000000000000000..1ac2efc50360a79c9905dbac57d9d99cbfbe863c --- /dev/null +++ b/gfpgan/weights/parsing_parsenet.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d558d8d0e42c20224f13cf5a29c79eba2d59913419f945545d8cf7b72920de2 +size 85331193 diff --git a/good_face.jpeg b/good_face.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..9b67f69e599876ba95cda537d980a6236f8feab5 Binary files /dev/null and b/good_face.jpeg differ diff --git a/handler.py b/handler.py new file mode 100644 index 0000000000000000000000000000000000000000..944106aa90ac4830735d2a6829e091118a3ca44b --- /dev/null +++ b/handler.py @@ -0,0 +1,247 @@ +from typing import Dict, Any +import torch +from PIL import Image +import base64 +from io import BytesIO +import numpy as np +from diffusers import AutoencoderKL, DDIMScheduler +from einops import repeat +from omegaconf import OmegaConf +from transformers import CLIPVisionModelWithProjection +import cv2 +import os +from backgroundremover.bg import remove as remove_bg +from src.models.pose_guider import PoseGuider +from src.models.unet_2d_condition import UNet2DConditionModel +from src.models.unet_3d import UNet3DConditionModel +from src.pipelines.pipeline_pose2vid_long import Pose2VideoPipeline +from src.utils.util import read_frames, get_fps, save_videos_grid +import roop.globals +from roop.core import start, decode_execution_providers, suggest_max_memory, suggest_execution_threads +from roop.utilities import normalize_output_path +from roop.processors.frame.core import get_frame_processors_modules + +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + +if device.type != 'cuda': + raise ValueError("The model requires a GPU for inference.") + +class EndpointHandler(): + def __init__(self, path=""): + self.config = OmegaConf.load("./configs/prompts/animation.yaml") + self.weight_dtype = torch.float16 + self.pipeline = None + self._initialize_pipeline() + + def _initialize_pipeline(self): + vae = AutoencoderKL.from_pretrained('./pretrained_weights/sd-vae-ft-mse').to(device, dtype=self.weight_dtype) + + reference_unet = UNet2DConditionModel.from_pretrained( + self.config.pretrained_base_model_path, + subfolder="unet" + ).to(device, dtype=self.weight_dtype) + + inference_config_path = self.config.inference_config + infer_config = OmegaConf.load(inference_config_path) + denoising_unet = UNet3DConditionModel.from_pretrained_2d( + self.config.pretrained_base_model_path, + self.config.motion_module_path, + subfolder="unet", + 
unet_additional_kwargs=infer_config.unet_additional_kwargs, + ).to(device, dtype=self.weight_dtype) + + pose_guider = PoseGuider(320, block_out_channels=(16, 32, 96, 256)).to(device, dtype=self.weight_dtype) + image_enc = CLIPVisionModelWithProjection.from_pretrained(self.config.image_encoder_path).to(device, dtype=self.weight_dtype) + sched_kwargs = OmegaConf.to_container(infer_config.noise_scheduler_kwargs) + scheduler = DDIMScheduler(**sched_kwargs) + + denoising_unet.load_state_dict(torch.load(self.config.denoising_unet_path, map_location="cpu"), strict=False) + reference_unet.load_state_dict(torch.load(self.config.reference_unet_path, map_location="cpu")) + pose_guider.load_state_dict(torch.load(self.config.pose_guider_path, map_location="cpu")) + + self.pipeline = Pose2VideoPipeline( + vae=vae, + image_encoder=image_enc, + reference_unet=reference_unet, + denoising_unet=denoising_unet, + pose_guider=pose_guider, + scheduler=scheduler + ).to(device, dtype=self.weight_dtype) + + def _crop_face(self, image, save_path="cropped_face.jpg", margin=0.3): + # Convert image to OpenCV format + cv_image = np.array(image) + cv_image = cv_image[:, :, ::-1].copy() + + # Load OpenCV face detector + face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml') + + # Detect faces + gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY) + faces = face_cascade.detectMultiScale(gray, 1.1, 4) + + if len(faces) == 0: + raise ValueError("No faces detected in the reference image.") + + # Crop the first face found with a margin + x, y, w, h = faces[0] + x_margin = int(margin * w) + y_margin = int(margin * h) + + x1 = max(0, x - x_margin) + y1 = max(0, y - y_margin) + x2 = min(cv_image.shape[1], x + w + x_margin) + y2 = min(cv_image.shape[0], y + h + y_margin) + + cropped_face = cv_image[y1:y2, x1:x2] + + # Convert back to PIL format + cropped_face = Image.fromarray(cropped_face[:, :, ::-1]).convert("RGB") + + # Save the cropped face + cropped_face.save(save_path, format="JPEG", quality=95) + + return cropped_face + + def _swap_face(self, source_image, target_video_path): + # Use a predefined face image instead of the provided source_image + source_path = "/root/AnimateAnyone/good_face.jpeg" # Change this to your known good face image path + output_path = "output.mp4" + + roop.globals.source_path = source_path + roop.globals.target_path = target_video_path + roop.globals.output_path = normalize_output_path(roop.globals.source_path, roop.globals.target_path, output_path) + roop.globals.frame_processors = ["face_swapper", "face_enhancer"] + roop.globals.headless = True + roop.globals.keep_fps = True + roop.globals.keep_audio = True + roop.globals.keep_frames = False + roop.globals.many_faces = False + roop.globals.video_encoder = "libx264" + roop.globals.video_quality = 50 + roop.globals.max_memory = suggest_max_memory() + roop.globals.execution_providers = decode_execution_providers(["cpu"]) + roop.globals.execution_threads = suggest_execution_threads() + + for frame_processor in get_frame_processors_modules(roop.globals.frame_processors): + if not frame_processor.pre_check(): + raise ValueError("Frame processor pre-check failed.") + + print(f"Starting face swap with source: {source_path} and target: {target_video_path}") + start() + print(f"Face swap completed. 
Output saved to: {output_path}") + + return os.path.join(os.getcwd(), output_path) + + + def remove_bg_from_image(self, image_data): + model_name = "u2net" # Choose your preferred model: "u2net", "u2net_human_seg", "u2netp" + processed_image_data = remove_bg( + image_data, + model_name=model_name, + alpha_matting=True, + alpha_matting_foreground_threshold=240, + alpha_matting_background_threshold=10, + alpha_matting_erode_structure_size=10, + alpha_matting_base_size=1000 + ) + return processed_image_data + + def _remove_background(self, input_path, output_path): + cap = cv2.VideoCapture(input_path) + if not cap.isOpened(): + raise IOError(f"Error opening video file {input_path}") + + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fps = int(cap.get(cv2.CAP_PROP_FPS)) + + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + out = cv2.VideoWriter(output_path, fourcc, fps, (width, height)) + + frame_count = 0 + while cap.isOpened(): + ret, frame = cap.read() + if not ret: + break + + frame_count += 1 + pil_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) + frame_data = BytesIO() + pil_frame.save(frame_data, format="PNG") + frame_data = frame_data.getvalue() + processed_frame_data = self.remove_bg_from_image(frame_data) + processed_pil_frame = Image.open(BytesIO(processed_frame_data)) + processed_frame = cv2.cvtColor(np.array(processed_pil_frame), cv2.COLOR_RGB2BGR) + + out.write(processed_frame) + + cap.release() + out.release() + + if frame_count == 0: + raise IOError(f"No frames processed. Error with video file {input_path}") + + def __call__(self, data: Any) -> Dict[str, str]: + inputs = data.get("inputs", {}) + ref_image_base64 = inputs.get("ref_image", "") + pose_video_path = inputs.get("pose_video_path", "") + width = inputs.get("width", 512) + height = inputs.get("height", 768) + length = inputs.get("length", 24) + num_inference_steps = inputs.get("num_inference_steps", 25) + cfg = inputs.get("cfg", 3.5) + seed = inputs.get("seed", 123) + + ref_image = Image.open(BytesIO(base64.b64decode(ref_image_base64))) + + torch.manual_seed(seed) + pose_images = read_frames(pose_video_path) + src_fps = get_fps(pose_video_path) + + pose_list = [] + total_length = min(length, len(pose_images)) + for pose_image_pil in pose_images[:total_length]: + pose_list.append(pose_image_pil) + + video = self.pipeline( + ref_image, + pose_list, + width=width, + height=height, + video_length=total_length, + num_inference_steps=num_inference_steps, + guidance_scale=cfg + ).videos + + save_dir = f"./output/gradio" + if not os.path.exists(save_dir): + os.makedirs(save_dir, exist_ok=True) + animation_path = os.path.join(save_dir, "animation_output.mp4") + save_videos_grid(video, animation_path, n_rows=1, fps=src_fps) + + # Crop the face from the reference image and save it + cropped_face_path = os.path.join(save_dir, "cropped_face.jpg") + cropped_face = self._crop_face(ref_image, save_path=cropped_face_path) + + # Perform face swapping + print(f"Starting face swap with cropped face: {cropped_face_path} and animation: {animation_path}") + final_video_path = self._swap_face(cropped_face, animation_path) + print(f"Face swap completed. 
Final video path: {final_video_path}") + + # Ensure the output file exists before trying to open it + if not os.path.exists(final_video_path): + raise FileNotFoundError(f"Expected output file not found: {final_video_path}") + + # Remove the background from the final video + bg_removed_video_path = os.path.join(save_dir, "bg_removed_output.mp4") + self._remove_background(final_video_path, bg_removed_video_path) + print(f"Background removal completed. Output saved to: {bg_removed_video_path}") + + # Encode the final video in base64 + with open(bg_removed_video_path, "rb") as video_file: + video_base64 = base64.b64encode(video_file.read()).decode("utf-8") + + torch.cuda.empty_cache() + + return {"video": video_base64} diff --git a/input.jpg b/input.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e4422c060c4c8506c348945873728e6178e24aaa Binary files /dev/null and b/input.jpg differ diff --git a/models/GFPGANv1.4.pth b/models/GFPGANv1.4.pth new file mode 100644 index 0000000000000000000000000000000000000000..afedb5c7e826056840c9cc183f2c6f0186fd17ba --- /dev/null +++ b/models/GFPGANv1.4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2cd4703ab14f4d01fd1383a8a8b266f9a5833dacee8e6a79d3bf21a1b6be5ad +size 348632874 diff --git a/models/inswapper_128.onnx b/models/inswapper_128.onnx new file mode 100644 index 0000000000000000000000000000000000000000..cb672b799d74fdf7ab8b172a1b1d78411f6400f5 --- /dev/null +++ b/models/inswapper_128.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4a3f08c753cb72d04e10aa0f7dbe3deebbf39567d4ead6dce08e98aa49e16af +size 554253681 diff --git a/output.mp4 b/output.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..bc6e261f4140e69e68212959a0b132640d22f662 Binary files /dev/null and b/output.mp4 differ diff --git a/output/gradio/animation_output.mp4 b/output/gradio/animation_output.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..bb54c962342f5b67b90b89c5724346a73b6afa75 Binary files /dev/null and b/output/gradio/animation_output.mp4 differ diff --git a/output/gradio/cropped_face.jpg b/output/gradio/cropped_face.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d75957f22b5b1f8d172e0a7fc1b63930a37cbb7e Binary files /dev/null and b/output/gradio/cropped_face.jpg differ diff --git a/output/gradio/output_video.mp4 b/output/gradio/output_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..a67d169cf0ddea98e4acf9ed94e7fd068ec20610 Binary files /dev/null and b/output/gradio/output_video.mp4 differ diff --git a/pose_video.mp4 b/pose_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..d9c82e7a4e02326114a9f6855ad8c45ea3ee8dac Binary files /dev/null and b/pose_video.mp4 differ diff --git a/pretrained_weights/DWPose/dw-ll_ucoco_384.onnx b/pretrained_weights/DWPose/dw-ll_ucoco_384.onnx new file mode 100644 index 0000000000000000000000000000000000000000..df84ce34881c5701a29e09badd8c96f5c17bd214 --- /dev/null +++ b/pretrained_weights/DWPose/dw-ll_ucoco_384.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:724f4ff2439ed61afb86fb8a1951ec39c6220682803b4a8bd4f598cd913b1843 +size 134399116 diff --git a/pretrained_weights/DWPose/yolox_l.onnx b/pretrained_weights/DWPose/yolox_l.onnx new file mode 100644 index 0000000000000000000000000000000000000000..d6ff7914feb199e342967b877f8b2ea3179db915 --- /dev/null +++ b/pretrained_weights/DWPose/yolox_l.onnx @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:7860ae79de6c89a3c1eb72ae9a2756c0ccfbe04b7791bb5880afabd97855a411 +size 216746733 diff --git a/pretrained_weights/denoising_unet.pth b/pretrained_weights/denoising_unet.pth new file mode 100644 index 0000000000000000000000000000000000000000..46ddca6219170a22849cb99effa96240369b6887 --- /dev/null +++ b/pretrained_weights/denoising_unet.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9e5a2c34fac369e8a922972ca2210916c6af175a0dad907deccf6235816ad52 +size 3438374293 diff --git a/pretrained_weights/image_encoder/config.json b/pretrained_weights/image_encoder/config.json new file mode 100644 index 0000000000000000000000000000000000000000..251e37d8a59724357a8887da1716fad7b791b9c0 --- /dev/null +++ b/pretrained_weights/image_encoder/config.json @@ -0,0 +1,23 @@ +{ + "_name_or_path": "/home/jpinkney/.cache/huggingface/diffusers/models--lambdalabs--sd-image-variations-diffusers/snapshots/ca6f97f838ae1b5bf764f31363a21f388f4d8f3e/image_encoder", + "architectures": [ + "CLIPVisionModelWithProjection" + ], + "attention_dropout": 0.0, + "dropout": 0.0, + "hidden_act": "quick_gelu", + "hidden_size": 1024, + "image_size": 224, + "initializer_factor": 1.0, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-05, + "model_type": "clip_vision_model", + "num_attention_heads": 16, + "num_channels": 3, + "num_hidden_layers": 24, + "patch_size": 14, + "projection_dim": 768, + "torch_dtype": "float32", + "transformers_version": "4.25.1" +} diff --git a/pretrained_weights/image_encoder/pytorch_model.bin b/pretrained_weights/image_encoder/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..167893f2790c143ffda7de008d70cf000136ceed --- /dev/null +++ b/pretrained_weights/image_encoder/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89d2aa29b5fdf64f3ad4f45fb4227ea98bc45156bbae673b85be1af7783dbabb +size 1215993967 diff --git a/pretrained_weights/motion_module.pth b/pretrained_weights/motion_module.pth new file mode 100644 index 0000000000000000000000000000000000000000..90e7f21beebba1cf3db21e15996ceffa5bd80f3d --- /dev/null +++ b/pretrained_weights/motion_module.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d11e01a281b39880da2efeea892215c1313e5713fca3d100a7fbb72ee312ef9 +size 1817900227 diff --git a/pretrained_weights/pose_guider.pth b/pretrained_weights/pose_guider.pth new file mode 100644 index 0000000000000000000000000000000000000000..f71b567653179a98be41ced378805f7c1cc48025 --- /dev/null +++ b/pretrained_weights/pose_guider.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a8b7c1b4db92980fd977b4fd003c1396bbae9a9cdea00c35d452136d5e4f488 +size 4351337 diff --git a/pretrained_weights/reference_unet.pth b/pretrained_weights/reference_unet.pth new file mode 100644 index 0000000000000000000000000000000000000000..8cc325831535fda0b47fc60b68daa247adf29278 --- /dev/null +++ b/pretrained_weights/reference_unet.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:beddccb08d49a8b29b0f4d6d456c6521d4382a8d8d48884fa60ba8802509c214 +size 3438323817 diff --git a/pretrained_weights/sd-vae-ft-mse/config.json b/pretrained_weights/sd-vae-ft-mse/config.json new file mode 100644 index 0000000000000000000000000000000000000000..0db26717579be63eb0ddbf15b43faa43700dfe5a --- /dev/null +++ b/pretrained_weights/sd-vae-ft-mse/config.json @@ -0,0 +1,29 @@ +{ + "_class_name": 
"AutoencoderKL", + "_diffusers_version": "0.4.2", + "act_fn": "silu", + "block_out_channels": [ + 128, + 256, + 512, + 512 + ], + "down_block_types": [ + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "DownEncoderBlock2D" + ], + "in_channels": 3, + "latent_channels": 4, + "layers_per_block": 2, + "norm_num_groups": 32, + "out_channels": 3, + "sample_size": 256, + "up_block_types": [ + "UpDecoderBlock2D", + "UpDecoderBlock2D", + "UpDecoderBlock2D", + "UpDecoderBlock2D" + ] +} diff --git a/pretrained_weights/sd-vae-ft-mse/diffusion_pytorch_model.bin b/pretrained_weights/sd-vae-ft-mse/diffusion_pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..ba36f34d64ad3be997b7cab94b0b9acd61272851 --- /dev/null +++ b/pretrained_weights/sd-vae-ft-mse/diffusion_pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b4889b6b1d4ce7ae320a02dedaeff1780ad77d415ea0d744b476155c6377ddc +size 334707217 diff --git a/pretrained_weights/sd-vae-ft-mse/diffusion_pytorch_model.safetensors b/pretrained_weights/sd-vae-ft-mse/diffusion_pytorch_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..90464d67ac7303d0ee4696334df13da130a948ea --- /dev/null +++ b/pretrained_weights/sd-vae-ft-mse/diffusion_pytorch_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1d993488569e928462932c8c38a0760b874d166399b14414135bd9c42df5815 +size 334643276 diff --git a/pretrained_weights/stable-diffusion-v1-5/feature_extractor/preprocessor_config.json b/pretrained_weights/stable-diffusion-v1-5/feature_extractor/preprocessor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5294955ff7801083f720b34b55d0f1f51313c5c5 --- /dev/null +++ b/pretrained_weights/stable-diffusion-v1-5/feature_extractor/preprocessor_config.json @@ -0,0 +1,20 @@ +{ + "crop_size": 224, + "do_center_crop": true, + "do_convert_rgb": true, + "do_normalize": true, + "do_resize": true, + "feature_extractor_type": "CLIPFeatureExtractor", + "image_mean": [ + 0.48145466, + 0.4578275, + 0.40821073 + ], + "image_std": [ + 0.26862954, + 0.26130258, + 0.27577711 + ], + "resample": 3, + "size": 224 +} diff --git a/pretrained_weights/stable-diffusion-v1-5/model_index.json b/pretrained_weights/stable-diffusion-v1-5/model_index.json new file mode 100644 index 0000000000000000000000000000000000000000..daf7e2e2dfc64fb437a2b44525667111b00cb9fc --- /dev/null +++ b/pretrained_weights/stable-diffusion-v1-5/model_index.json @@ -0,0 +1,32 @@ +{ + "_class_name": "StableDiffusionPipeline", + "_diffusers_version": "0.6.0", + "feature_extractor": [ + "transformers", + "CLIPImageProcessor" + ], + "safety_checker": [ + "stable_diffusion", + "StableDiffusionSafetyChecker" + ], + "scheduler": [ + "diffusers", + "PNDMScheduler" + ], + "text_encoder": [ + "transformers", + "CLIPTextModel" + ], + "tokenizer": [ + "transformers", + "CLIPTokenizer" + ], + "unet": [ + "diffusers", + "UNet2DConditionModel" + ], + "vae": [ + "diffusers", + "AutoencoderKL" + ] +} diff --git a/pretrained_weights/stable-diffusion-v1-5/unet/config.json b/pretrained_weights/stable-diffusion-v1-5/unet/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a02ee8abc93e840ffbcb2d68b66ccbcb74b3ab3 --- /dev/null +++ b/pretrained_weights/stable-diffusion-v1-5/unet/config.json @@ -0,0 +1,36 @@ +{ + "_class_name": "UNet2DConditionModel", + "_diffusers_version": "0.6.0", + "act_fn": "silu", + "attention_head_dim": 8, + 
"block_out_channels": [ + 320, + 640, + 1280, + 1280 + ], + "center_input_sample": false, + "cross_attention_dim": 768, + "down_block_types": [ + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "DownBlock2D" + ], + "downsample_padding": 1, + "flip_sin_to_cos": true, + "freq_shift": 0, + "in_channels": 4, + "layers_per_block": 2, + "mid_block_scale_factor": 1, + "norm_eps": 1e-05, + "norm_num_groups": 32, + "out_channels": 4, + "sample_size": 64, + "up_block_types": [ + "UpBlock2D", + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D" + ] +} diff --git a/pretrained_weights/stable-diffusion-v1-5/unet/diffusion_pytorch_model.bin b/pretrained_weights/stable-diffusion-v1-5/unet/diffusion_pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..f1ffb48de7efbabc851a260efde560d49621a9bc --- /dev/null +++ b/pretrained_weights/stable-diffusion-v1-5/unet/diffusion_pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7da0e21ba7ea50637bee26e81c220844defdf01aafca02b2c42ecdadb813de4 +size 3438354725 diff --git a/pretrained_weights/stable-diffusion-v1-5/v1-inference.yaml b/pretrained_weights/stable-diffusion-v1-5/v1-inference.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d4effe569e897369918625f9d8be5603a0e6a0d6 --- /dev/null +++ b/pretrained_weights/stable-diffusion-v1-5/v1-inference.yaml @@ -0,0 +1,70 @@ +model: + base_learning_rate: 1.0e-04 + target: ldm.models.diffusion.ddpm.LatentDiffusion + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + image_size: 64 + channels: 4 + cond_stage_trainable: false # Note: different from the one we trained before + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + + scheduler_config: # 10000 warmup steps + target: ldm.lr_scheduler.LambdaLinearScheduler + params: + warm_up_steps: [ 10000 ] + cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases + f_start: [ 1.e-6 ] + f_max: [ 1. ] + f_min: [ 1. 
] + + unet_config: + target: ldm.modules.diffusionmodules.openaimodel.UNetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..a45426d7f0010755d6c4d2716958caa22f4da3ba --- /dev/null +++ b/requirements.txt @@ -0,0 +1,39 @@ +--extra-index-url https://download.pytorch.org/whl/cu118 + +numpy==1.23.5 +opencv-python==4.7.0.72 +onnx==1.14.0 +insightface==0.7.3 +psutil==5.9.5 +tk==0.1.0 +customtkinter==5.1.3 +pillow==9.5.0 +torch==2.0.1+cu118; sys_platform != 'darwin' +torch==2.0.1; sys_platform == 'darwin' +torchvision==0.15.2+cu118; sys_platform != 'darwin' +torchvision==0.15.2; sys_platform == 'darwin' +onnxruntime==1.15.0; sys_platform == 'darwin' and platform_machine != 'arm64' +onnxruntime-silicon==1.13.1; sys_platform == 'darwin' and platform_machine == 'arm64' +onnxruntime-gpu==1.15.0; sys_platform != 'darwin' +tensorflow==2.13.0rc1; sys_platform == 'darwin' +tensorflow==2.12.0; sys_platform != 'darwin' +opennsfw2==0.10.2 +protobuf==4.23.2 +tqdm==4.65.0 +gfpgan==1.3.8 +gradio==3.40.1 +tkinterdnd2==0.3.0; sys_platform != 'darwin' and platform_machine != 'arm64' +tkinterdnd2-universal==1.7.3; sys_platform == 'darwin' and platform_machine == 'arm64' +onnxruntime-coreml==1.13.1; python_version == '3.9' and sys_platform == 'darwin' and platform_machine != 'arm64' + +# Add additional dependencies +diffusers==0.24.0 +omegaconf==2.2.3 + +# Face swap related dependencies +facenet-pytorch==2.5.2 +dlib==19.22.0 + + +# Background removal +backgroundremover \ No newline at end of file diff --git a/roop/__init__.py b/roop/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/roop/__pycache__/__init__.cpython-310.pyc b/roop/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f008d4528c589db44822d308729d41948f137ea Binary files /dev/null and b/roop/__pycache__/__init__.cpython-310.pyc differ diff --git a/roop/__pycache__/capturer.cpython-310.pyc b/roop/__pycache__/capturer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..926c8f693b18f6ad0c5870377b1238f073aa3d6d Binary files /dev/null and b/roop/__pycache__/capturer.cpython-310.pyc differ diff --git a/roop/__pycache__/core.cpython-310.pyc b/roop/__pycache__/core.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..843e0ad36fcb3ec6045b31868cf0fa6b68a0c6af Binary files /dev/null and b/roop/__pycache__/core.cpython-310.pyc differ diff --git a/roop/__pycache__/face_analyser.cpython-310.pyc b/roop/__pycache__/face_analyser.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed05f44be6e6d611e37ab788ca2436d12432e9f3 Binary files 
/dev/null and b/roop/__pycache__/face_analyser.cpython-310.pyc differ diff --git a/roop/__pycache__/globals.cpython-310.pyc b/roop/__pycache__/globals.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c9e968afd248ca325d79901862b1a49052bc8493 Binary files /dev/null and b/roop/__pycache__/globals.cpython-310.pyc differ diff --git a/roop/__pycache__/metadata.cpython-310.pyc b/roop/__pycache__/metadata.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..223ccb04ed516a0fe3dba64c72319476bd783d27 Binary files /dev/null and b/roop/__pycache__/metadata.cpython-310.pyc differ diff --git a/roop/__pycache__/predicter.cpython-310.pyc b/roop/__pycache__/predicter.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..11b68e75d2744ac5d61764631c55577b62307cea Binary files /dev/null and b/roop/__pycache__/predicter.cpython-310.pyc differ diff --git a/roop/__pycache__/typing.cpython-310.pyc b/roop/__pycache__/typing.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e1ac7fd5961589deb35063edc015e1f1134e2fb Binary files /dev/null and b/roop/__pycache__/typing.cpython-310.pyc differ diff --git a/roop/__pycache__/ui.cpython-310.pyc b/roop/__pycache__/ui.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6c642d631b18d081d4362d160622bffe7e658bc1 Binary files /dev/null and b/roop/__pycache__/ui.cpython-310.pyc differ diff --git a/roop/__pycache__/utilities.cpython-310.pyc b/roop/__pycache__/utilities.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da78b773dd7024aa067c530879e8379fda9d2436 Binary files /dev/null and b/roop/__pycache__/utilities.cpython-310.pyc differ diff --git a/roop/capturer.py b/roop/capturer.py new file mode 100644 index 0000000000000000000000000000000000000000..fd49d468dd4cd45832ab9612205968207a6f45cf --- /dev/null +++ b/roop/capturer.py @@ -0,0 +1,20 @@ +from typing import Any +import cv2 + + +def get_video_frame(video_path: str, frame_number: int = 0) -> Any: + capture = cv2.VideoCapture(video_path) + frame_total = capture.get(cv2.CAP_PROP_FRAME_COUNT) + capture.set(cv2.CAP_PROP_POS_FRAMES, min(frame_total, frame_number - 1)) + has_frame, frame = capture.read() + capture.release() + if has_frame: + return frame + return None + + +def get_video_frame_total(video_path: str) -> int: + capture = cv2.VideoCapture(video_path) + video_frame_total = int(capture.get(cv2.CAP_PROP_FRAME_COUNT)) + capture.release() + return video_frame_total diff --git a/roop/core.py b/roop/core.py new file mode 100644 index 0000000000000000000000000000000000000000..cd7ffaf084e72691ea96f3942717329d4ae5f69a --- /dev/null +++ b/roop/core.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python3 + +import os +import sys +# single thread doubles cuda performance - needs to be set before torch import +if any(arg.startswith('--execution-provider') for arg in sys.argv): + os.environ['OMP_NUM_THREADS'] = '1' +# reduce tensorflow log level +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' +import warnings +from typing import List +import platform +import signal +import shutil +import argparse +import torch +import onnxruntime +import tensorflow + +import roop.globals +import roop.metadata +import roop.ui as ui +from roop.predicter import predict_image, predict_video +from roop.processors.frame.core import get_frame_processors_modules +from roop.utilities import has_image_extension, is_image, is_video, detect_fps, create_video, extract_frames, 
get_temp_frame_paths, restore_audio, create_temp, move_temp, clean_temp, normalize_output_path + +if 'ROCMExecutionProvider' in roop.globals.execution_providers: + del torch + +warnings.filterwarnings('ignore', category=FutureWarning, module='insightface') +warnings.filterwarnings('ignore', category=UserWarning, module='torchvision') + + +def parse_args() -> None: + signal.signal(signal.SIGINT, lambda signal_number, frame: destroy()) + program = argparse.ArgumentParser(formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=100)) + program.add_argument('-s', '--source', help='select an source image', dest='source_path') + program.add_argument('-t', '--target', help='select an target image or video', dest='target_path') + program.add_argument('-o', '--output', help='select output file or directory', dest='output_path') + program.add_argument('--frame-processor', help='frame processors (choices: face_swapper, face_enhancer, ...)', dest='frame_processor', default=['face_swapper'], nargs='+') + program.add_argument('--keep-fps', help='keep original fps', dest='keep_fps', action='store_true', default=False) + program.add_argument('--keep-audio', help='keep original audio', dest='keep_audio', action='store_true', default=True) + program.add_argument('--keep-frames', help='keep temporary frames', dest='keep_frames', action='store_true', default=False) + program.add_argument('--many-faces', help='process every face', dest='many_faces', action='store_true', default=False) + program.add_argument('--video-encoder', help='adjust output video encoder', dest='video_encoder', default='libx264', choices=['libx264', 'libx265', 'libvpx-vp9']) + program.add_argument('--video-quality', help='adjust output video quality', dest='video_quality', type=int, default=18, choices=range(52), metavar='[0-51]') + program.add_argument('--max-memory', help='maximum amount of RAM in GB', dest='max_memory', type=int, default=suggest_max_memory()) + program.add_argument('--execution-provider', help='available execution provider (choices: cpu, ...)', dest='execution_provider', default=['cpu'], choices=suggest_execution_providers(), nargs='+') + program.add_argument('--execution-threads', help='number of execution threads', dest='execution_threads', type=int, default=suggest_execution_threads()) + program.add_argument('-v', '--version', action='version', version=f'{roop.metadata.name} {roop.metadata.version}') + + args = program.parse_args() + + roop.globals.source_path = args.source_path + roop.globals.target_path = args.target_path + roop.globals.output_path = normalize_output_path(roop.globals.source_path, roop.globals.target_path, args.output_path) + roop.globals.frame_processors = args.frame_processor + roop.globals.headless = args.source_path or args.target_path or args.output_path + roop.globals.keep_fps = args.keep_fps + roop.globals.keep_audio = args.keep_audio + roop.globals.keep_frames = args.keep_frames + roop.globals.many_faces = args.many_faces + roop.globals.video_encoder = args.video_encoder + roop.globals.video_quality = args.video_quality + roop.globals.max_memory = args.max_memory + roop.globals.execution_providers = decode_execution_providers(args.execution_provider) + roop.globals.execution_threads = args.execution_threads + + +def encode_execution_providers(execution_providers: List[str]) -> List[str]: + return [execution_provider.replace('ExecutionProvider', '').lower() for execution_provider in execution_providers] + + +def decode_execution_providers(execution_providers: List[str]) 
-> List[str]: + return [provider for provider, encoded_execution_provider in zip(onnxruntime.get_available_providers(), encode_execution_providers(onnxruntime.get_available_providers())) + if any(execution_provider in encoded_execution_provider for execution_provider in execution_providers)] + + +def suggest_max_memory() -> int: + if platform.system().lower() == 'darwin': + return 10 + return 14 + + +def suggest_execution_providers() -> List[str]: + return encode_execution_providers(onnxruntime.get_available_providers()) + + +def suggest_execution_threads() -> int: + if 'DmlExecutionProvider' in roop.globals.execution_providers: + return 1 + if 'ROCMExecutionProvider' in roop.globals.execution_providers: + return 1 + return 8 + + +def limit_resources() -> None: + # prevent tensorflow memory leak + gpus = tensorflow.config.experimental.list_physical_devices('GPU') + for gpu in gpus: + tensorflow.config.experimental.set_virtual_device_configuration(gpu, [ + tensorflow.config.experimental.VirtualDeviceConfiguration(memory_limit=1024) + ]) + # limit memory usage + if roop.globals.max_memory: + memory = roop.globals.max_memory * 1024 ** 3 + if platform.system().lower() == 'darwin': + memory = roop.globals.max_memory * 1024 ** 6 + if platform.system().lower() == 'windows': + import ctypes + kernel32 = ctypes.windll.kernel32 + kernel32.SetProcessWorkingSetSize(-1, ctypes.c_size_t(memory), ctypes.c_size_t(memory)) + else: + import resource + resource.setrlimit(resource.RLIMIT_DATA, (memory, memory)) + + +def release_resources() -> None: + if 'CUDAExecutionProvider' in roop.globals.execution_providers: + torch.cuda.empty_cache() + + +def pre_check() -> bool: + if sys.version_info < (3, 9): + update_status('Python version is not supported - please upgrade to 3.9 or higher.') + return False + if not shutil.which('ffmpeg'): + update_status('ffmpeg is not installed.') + return False + return True + + +def update_status(message: str, scope: str = 'ROOP.CORE') -> None: + print(f'[{scope}] {message}') + if not roop.globals.headless: + ui.update_status(message) + + +def start() -> None: + for frame_processor in get_frame_processors_modules(roop.globals.frame_processors): + if not frame_processor.pre_start(): + return + # process image to image + if has_image_extension(roop.globals.target_path): + if predict_image(roop.globals.target_path): + destroy() + shutil.copy2(roop.globals.target_path, roop.globals.output_path) + for frame_processor in get_frame_processors_modules(roop.globals.frame_processors): + update_status('Progressing...', frame_processor.NAME) + frame_processor.process_image(roop.globals.source_path, roop.globals.output_path, roop.globals.output_path) + frame_processor.post_process() + release_resources() + if is_image(roop.globals.target_path): + update_status('Processing to image succeed!') + else: + update_status('Processing to image failed!') + return + # process image to videos + if predict_video(roop.globals.target_path): + destroy() + update_status('Creating temp resources...') + create_temp(roop.globals.target_path) + update_status('Extracting frames...') + extract_frames(roop.globals.target_path) + temp_frame_paths = get_temp_frame_paths(roop.globals.target_path) + for frame_processor in get_frame_processors_modules(roop.globals.frame_processors): + update_status('Progressing...', frame_processor.NAME) + frame_processor.process_video(roop.globals.source_path, temp_frame_paths) + frame_processor.post_process() + release_resources() + # handles fps + if roop.globals.keep_fps: + 
update_status('Detecting fps...') + fps = detect_fps(roop.globals.target_path) + update_status(f'Creating video with {fps} fps...') + create_video(roop.globals.target_path, fps) + else: + update_status('Creating video with 30.0 fps...') + create_video(roop.globals.target_path) + # handle audio + if roop.globals.keep_audio: + if roop.globals.keep_fps: + update_status('Restoring audio...') + else: + update_status('Restoring audio might cause issues as fps are not kept...') + restore_audio(roop.globals.target_path, roop.globals.output_path) + else: + move_temp(roop.globals.target_path, roop.globals.output_path) + # clean and validate + clean_temp(roop.globals.target_path) + if is_video(roop.globals.target_path): + update_status('Processing to video succeed!') + else: + update_status('Processing to video failed!') + + +def destroy() -> None: + if roop.globals.target_path: + clean_temp(roop.globals.target_path) + quit() + + +def run() -> None: + parse_args() + if not pre_check(): + return + for frame_processor in get_frame_processors_modules(roop.globals.frame_processors): + if not frame_processor.pre_check(): + return + limit_resources() + if roop.globals.headless: + start() + else: + window = ui.init(start, destroy) + window.mainloop() diff --git a/roop/face_analyser.py b/roop/face_analyser.py new file mode 100644 index 0000000000000000000000000000000000000000..9c0afe458763edb22dc2332f527dfdba48575b1d --- /dev/null +++ b/roop/face_analyser.py @@ -0,0 +1,34 @@ +import threading +from typing import Any +import insightface + +import roop.globals +from roop.typing import Frame + +FACE_ANALYSER = None +THREAD_LOCK = threading.Lock() + + +def get_face_analyser() -> Any: + global FACE_ANALYSER + + with THREAD_LOCK: + if FACE_ANALYSER is None: + FACE_ANALYSER = insightface.app.FaceAnalysis(name='buffalo_l', providers=roop.globals.execution_providers) + FACE_ANALYSER.prepare(ctx_id=0, det_size=(640, 640)) + return FACE_ANALYSER + + +def get_one_face(frame: Frame) -> Any: + face = get_face_analyser().get(frame) + try: + return min(face, key=lambda x: x.bbox[0]) + except ValueError: + return None + + +def get_many_faces(frame: Frame) -> Any: + try: + return get_face_analyser().get(frame) + except IndexError: + return None diff --git a/roop/globals.py b/roop/globals.py new file mode 100644 index 0000000000000000000000000000000000000000..77fd391db235b878ce1f91765596bd76adb06697 --- /dev/null +++ b/roop/globals.py @@ -0,0 +1,17 @@ +from typing import List + +source_path = None +target_path = None +output_path = None +frame_processors: List[str] = [] +keep_fps = None +keep_audio = None +keep_frames = None +many_faces = None +video_encoder = None +video_quality = None +max_memory = None +execution_providers: List[str] = [] +execution_threads = None +headless = None +log_level = 'error' diff --git a/roop/metadata.py b/roop/metadata.py new file mode 100644 index 0000000000000000000000000000000000000000..35b0f0245a38eb9ec024f2ed2c829044f6051c29 --- /dev/null +++ b/roop/metadata.py @@ -0,0 +1,2 @@ +name = 'roop' +version = '1.1.0' diff --git a/roop/predicter.py b/roop/predicter.py new file mode 100644 index 0000000000000000000000000000000000000000..6641cbf3d89afaeb56a0b93c306e86b5953cf74b --- /dev/null +++ b/roop/predicter.py @@ -0,0 +1,43 @@ +import threading +import numpy +import opennsfw2 +from PIL import Image +from keras import Model + +from roop.typing import Frame + +PREDICTOR = None +THREAD_LOCK = threading.Lock() +MAX_PROBABILITY = 0.85 + + +def get_predictor() -> Model: + global PREDICTOR + + with 
THREAD_LOCK: + if PREDICTOR is None: + PREDICTOR = opennsfw2.make_open_nsfw_model() + return PREDICTOR + + +def clear_predictor() -> None: + global PREDICTOR + + PREDICTOR = None + + +def predict_frame(target_frame: Frame) -> bool: + image = Image.fromarray(target_frame) + image = opennsfw2.preprocess_image(image, opennsfw2.Preprocessing.YAHOO) + views = numpy.expand_dims(image, axis=0) + _, probability = get_predictor().predict(views)[0] + return probability > MAX_PROBABILITY + + +def predict_image(target_path: str) -> bool: + return opennsfw2.predict_image(target_path) > MAX_PROBABILITY + + +def predict_video(target_path: str) -> bool: + _, probabilities = opennsfw2.predict_video_frames(video_path=target_path, frame_interval=100) + return any(probability > MAX_PROBABILITY for probability in probabilities) \ No newline at end of file diff --git a/roop/processors/__init__.py b/roop/processors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/roop/processors/__pycache__/__init__.cpython-310.pyc b/roop/processors/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..63d8eb4aa92bc0bdd8ba9fdeb374697a9c663b1a Binary files /dev/null and b/roop/processors/__pycache__/__init__.cpython-310.pyc differ diff --git a/roop/processors/frame/__init__.py b/roop/processors/frame/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/roop/processors/frame/__pycache__/__init__.cpython-310.pyc b/roop/processors/frame/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb4fa5e29c05695eec72bdb1e22208fe204da973 Binary files /dev/null and b/roop/processors/frame/__pycache__/__init__.cpython-310.pyc differ diff --git a/roop/processors/frame/__pycache__/core.cpython-310.pyc b/roop/processors/frame/__pycache__/core.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ac3b4a736409e1d631adc7ba408ecb2e5138b962 Binary files /dev/null and b/roop/processors/frame/__pycache__/core.cpython-310.pyc differ diff --git a/roop/processors/frame/__pycache__/face_enhancer.cpython-310.pyc b/roop/processors/frame/__pycache__/face_enhancer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bce3fed54178f8c21247aee4558568aecab146a7 Binary files /dev/null and b/roop/processors/frame/__pycache__/face_enhancer.cpython-310.pyc differ diff --git a/roop/processors/frame/__pycache__/face_swapper.cpython-310.pyc b/roop/processors/frame/__pycache__/face_swapper.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..050a218ed4a202d7b645bc73d757d7903ddfda37 Binary files /dev/null and b/roop/processors/frame/__pycache__/face_swapper.cpython-310.pyc differ diff --git a/roop/processors/frame/core.py b/roop/processors/frame/core.py new file mode 100644 index 0000000000000000000000000000000000000000..c225f9de483a2914a98392ce9de5bd03f2013a2d --- /dev/null +++ b/roop/processors/frame/core.py @@ -0,0 +1,88 @@ +import os +import importlib +import psutil +from concurrent.futures import ThreadPoolExecutor, as_completed +from queue import Queue +from types import ModuleType +from typing import Any, List, Callable +from tqdm import tqdm + +import roop + +FRAME_PROCESSORS_MODULES: List[ModuleType] = [] +FRAME_PROCESSORS_INTERFACE = [ + 'pre_check', + 'pre_start', + 'process_frame', + 'process_frames', 
+ 'process_image', + 'process_video', + 'post_process' +] + + +def load_frame_processor_module(frame_processor: str) -> Any: + try: + frame_processor_module = importlib.import_module(f'roop.processors.frame.{frame_processor}') + for method_name in FRAME_PROCESSORS_INTERFACE: + if not hasattr(frame_processor_module, method_name): + raise NotImplementedError + except (ImportError, NotImplementedError): + quit(f'Frame processor {frame_processor} crashed.') + return frame_processor_module + + +def get_frame_processors_modules(frame_processors: List[str]) -> List[ModuleType]: + global FRAME_PROCESSORS_MODULES + + if not FRAME_PROCESSORS_MODULES: + for frame_processor in frame_processors: + frame_processor_module = load_frame_processor_module(frame_processor) + FRAME_PROCESSORS_MODULES.append(frame_processor_module) + return FRAME_PROCESSORS_MODULES + + +def multi_process_frame(source_path: str, temp_frame_paths: List[str], process_frames: Callable[[str, List[str], Any], None], update: Callable[[], None]) -> None: + with ThreadPoolExecutor(max_workers=roop.globals.execution_threads) as executor: + futures = [] + queue = create_queue(temp_frame_paths) + queue_per_future = len(temp_frame_paths) // roop.globals.execution_threads + while not queue.empty(): + future = executor.submit(process_frames, source_path, pick_queue(queue, queue_per_future), update) + futures.append(future) + for future in as_completed(futures): + future.result() + + +def create_queue(temp_frame_paths: List[str]) -> Queue[str]: + queue: Queue[str] = Queue() + for frame_path in temp_frame_paths: + queue.put(frame_path) + return queue + + +def pick_queue(queue: Queue[str], queue_per_future: int) -> List[str]: + queues = [] + for _ in range(queue_per_future): + if not queue.empty(): + queues.append(queue.get()) + return queues + + +def process_video(source_path: str, frame_paths: list[str], process_frames: Callable[[str, List[str], Any], None]) -> None: + progress_bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]' + total = len(frame_paths) + with tqdm(total=total, desc='Processing', unit='frame', dynamic_ncols=True, bar_format=progress_bar_format) as progress: + multi_process_frame(source_path, frame_paths, process_frames, lambda: update_progress(progress)) + + +def update_progress(progress: Any = None) -> None: + process = psutil.Process(os.getpid()) + memory_usage = process.memory_info().rss / 1024 / 1024 / 1024 + progress.set_postfix({ + 'memory_usage': '{:.2f}'.format(memory_usage).zfill(5) + 'GB', + 'execution_providers': roop.globals.execution_providers, + 'execution_threads': roop.globals.execution_threads + }) + progress.refresh() + progress.update(1) diff --git a/roop/processors/frame/face_enhancer.py b/roop/processors/frame/face_enhancer.py new file mode 100644 index 0000000000000000000000000000000000000000..cadb65ffc26552de1ea9c6ffe5750c0aa363e981 --- /dev/null +++ b/roop/processors/frame/face_enhancer.py @@ -0,0 +1,81 @@ +from typing import Any, List, Callable +import cv2 +import threading +import gfpgan + +import roop.globals +import roop.processors.frame.core +from roop.core import update_status +from roop.face_analyser import get_one_face +from roop.typing import Frame, Face +from roop.utilities import conditional_download, resolve_relative_path, is_image, is_video + +FACE_ENHANCER = None +THREAD_SEMAPHORE = threading.Semaphore() +THREAD_LOCK = threading.Lock() +NAME = 'ROOP.FACE-ENHANCER' + + +def get_face_enhancer() -> Any: + global FACE_ENHANCER + + with THREAD_LOCK: + 
if FACE_ENHANCER is None: + model_path = resolve_relative_path('../models/GFPGANv1.4.pth') + # todo: set models path https://github.com/TencentARC/GFPGAN/issues/399 + FACE_ENHANCER = gfpgan.GFPGANer(model_path=model_path, upscale=5) # type: ignore[attr-defined] + return FACE_ENHANCER + + +def pre_check() -> bool: + download_directory_path = resolve_relative_path('../models') + conditional_download(download_directory_path, ['https://github.com/TencentARC/GFPGAN/releases/download/v1.3.4/GFPGANv1.4.pth']) + return True + + +def pre_start() -> bool: + if not is_image(roop.globals.target_path) and not is_video(roop.globals.target_path): + update_status('Select an image or video for target path.', NAME) + return False + return True + + +def post_process() -> None: + global FACE_ENHANCER + + FACE_ENHANCER = None + + +def enhance_face(temp_frame: Frame) -> Frame: + with THREAD_SEMAPHORE: + _, _, temp_frame = get_face_enhancer().enhance( + temp_frame, + paste_back=True + ) + return temp_frame + + +def process_frame(source_face: Face, temp_frame: Frame) -> Frame: + target_face = get_one_face(temp_frame) + if target_face: + temp_frame = enhance_face(temp_frame) + return temp_frame + + +def process_frames(source_path: str, temp_frame_paths: List[str], update: Callable[[], None]) -> None: + for temp_frame_path in temp_frame_paths: + temp_frame = cv2.imread(temp_frame_path) + result = process_frame(None, temp_frame) + cv2.imwrite(temp_frame_path, result) + if update: + update() + + +def process_image(source_path: str, target_path: str, output_path: str) -> None: + target_frame = cv2.imread(target_path) + result = process_frame(None, target_frame) + cv2.imwrite(output_path, result) + + +def process_video(source_path: str, temp_frame_paths: List[str]) -> None: + roop.processors.frame.core.process_video(None, temp_frame_paths, process_frames) diff --git a/roop/processors/frame/face_swapper.py b/roop/processors/frame/face_swapper.py new file mode 100644 index 0000000000000000000000000000000000000000..7eccaa097d064a38d7948c59feb72e52d9ecba77 --- /dev/null +++ b/roop/processors/frame/face_swapper.py @@ -0,0 +1,88 @@ +from typing import Any, List, Callable +import cv2 +import insightface +import threading + +import roop.globals +import roop.processors.frame.core +from roop.core import update_status +from roop.face_analyser import get_one_face, get_many_faces +from roop.typing import Face, Frame +from roop.utilities import conditional_download, resolve_relative_path, is_image, is_video + +FACE_SWAPPER = None +THREAD_LOCK = threading.Lock() +NAME = 'ROOP.FACE-SWAPPER' + + +def get_face_swapper() -> Any: + global FACE_SWAPPER + + with THREAD_LOCK: + if FACE_SWAPPER is None: + model_path = resolve_relative_path('../models/inswapper_128.onnx') + FACE_SWAPPER = insightface.model_zoo.get_model(model_path, providers=roop.globals.execution_providers) + return FACE_SWAPPER + + +def pre_check() -> bool: + download_directory_path = resolve_relative_path('../models') + conditional_download(download_directory_path, ['https://huggingface.co/countfloyd/deepfake/resolve/main/inswapper_128.onnx']) + return True + + +def pre_start() -> bool: + if not is_image(roop.globals.source_path): + update_status('Select an image for source path.', NAME) + return False + elif not get_one_face(cv2.imread(roop.globals.source_path)): + update_status('No face in source path detected.', NAME) + return False + if not is_image(roop.globals.target_path) and not is_video(roop.globals.target_path): + update_status('Select an image or video for target 
path.', NAME) + return False + return True + + +def post_process() -> None: + global FACE_SWAPPER + + FACE_SWAPPER = None + + +def swap_face(source_face: Face, target_face: Face, temp_frame: Frame) -> Frame: + return get_face_swapper().get(temp_frame, target_face, source_face, paste_back=True) + + +def process_frame(source_face: Face, temp_frame: Frame) -> Frame: + if roop.globals.many_faces: + many_faces = get_many_faces(temp_frame) + if many_faces: + for target_face in many_faces: + temp_frame = swap_face(source_face, target_face, temp_frame) + else: + target_face = get_one_face(temp_frame) + if target_face: + temp_frame = swap_face(source_face, target_face, temp_frame) + return temp_frame + + +def process_frames(source_path: str, temp_frame_paths: List[str], update: Callable[[], None]) -> None: + source_face = get_one_face(cv2.imread(source_path)) + for temp_frame_path in temp_frame_paths: + temp_frame = cv2.imread(temp_frame_path) + result = process_frame(source_face, temp_frame) + cv2.imwrite(temp_frame_path, result) + if update: + update() + + +def process_image(source_path: str, target_path: str, output_path: str) -> None: + source_face = get_one_face(cv2.imread(source_path)) + target_frame = cv2.imread(target_path) + result = process_frame(source_face, target_frame) + cv2.imwrite(output_path, result) + + +def process_video(source_path: str, temp_frame_paths: List[str]) -> None: + roop.processors.frame.core.process_video(source_path, temp_frame_paths, process_frames) diff --git a/roop/typing.py b/roop/typing.py new file mode 100644 index 0000000000000000000000000000000000000000..1cff7440616e20bfe7b8bc287f86d11bf1b0f083 --- /dev/null +++ b/roop/typing.py @@ -0,0 +1,7 @@ +from typing import Any + +from insightface.app.common import Face +import numpy + +Face = Face +Frame = numpy.ndarray[Any, Any] diff --git a/roop/ui.json b/roop/ui.json new file mode 100644 index 0000000000000000000000000000000000000000..49309919763256cc84ea70b02965c2e2bc96de2b --- /dev/null +++ b/roop/ui.json @@ -0,0 +1,158 @@ +{ + "CTk": { + "fg_color": ["gray95", "gray10"] + }, + "CTkToplevel": { + "fg_color": ["gray95", "gray10"] + }, + "CTkFrame": { + "corner_radius": 6, + "border_width": 0, + "fg_color": ["gray90", "gray13"], + "top_fg_color": ["gray85", "gray16"], + "border_color": ["gray65", "gray28"] + }, + "CTkButton": { + "corner_radius": 6, + "border_width": 0, + "fg_color": ["#3a7ebf", "#1f538d"], + "hover_color": ["#325882", "#14375e"], + "border_color": ["#3E454A", "#949A9F"], + "text_color": ["#DCE4EE", "#DCE4EE"], + "text_color_disabled": ["gray74", "gray60"] + }, + "CTkLabel": { + "corner_radius": 0, + "fg_color": "transparent", + "text_color": ["gray14", "gray84"] + }, + "CTkEntry": { + "corner_radius": 6, + "border_width": 2, + "fg_color": ["#F9F9FA", "#343638"], + "border_color": ["#979DA2", "#565B5E"], + "text_color": ["gray14", "gray84"], + "placeholder_text_color": ["gray52", "gray62"] + }, + "CTkCheckbox": { + "corner_radius": 6, + "border_width": 3, + "fg_color": ["#3a7ebf", "#1f538d"], + "border_color": ["#3E454A", "#949A9F"], + "hover_color": ["#325882", "#14375e"], + "checkmark_color": ["#DCE4EE", "gray90"], + "text_color": ["gray14", "gray84"], + "text_color_disabled": ["gray60", "gray45"] + }, + "CTkSwitch": { + "corner_radius": 1000, + "border_width": 3, + "button_length": 0, + "fg_color": ["#939BA2", "#4A4D50"], + "progress_color": ["#3a7ebf", "#1f538d"], + "button_color": ["gray36", "#D5D9DE"], + "button_hover_color": ["gray20", "gray100"], + "text_color": ["gray14", "gray84"], + 
"text_color_disabled": ["gray60", "gray45"] + }, + "CTkRadiobutton": { + "corner_radius": 1000, + "border_width_checked": 6, + "border_width_unchecked": 3, + "fg_color": ["#3a7ebf", "#1f538d"], + "border_color": ["#3E454A", "#949A9F"], + "hover_color": ["#325882", "#14375e"], + "text_color": ["gray14", "gray84"], + "text_color_disabled": ["gray60", "gray45"] + }, + "CTkProgressBar": { + "corner_radius": 1000, + "border_width": 0, + "fg_color": ["#939BA2", "#4A4D50"], + "progress_color": ["#3a7ebf", "#1f538d"], + "border_color": ["gray", "gray"] + }, + "CTkSlider": { + "corner_radius": 1000, + "button_corner_radius": 1000, + "border_width": 6, + "button_length": 0, + "fg_color": ["#939BA2", "#4A4D50"], + "progress_color": ["gray40", "#AAB0B5"], + "button_color": ["#3a7ebf", "#1f538d"], + "button_hover_color": ["#325882", "#14375e"] + }, + "CTkOptionMenu": { + "corner_radius": 6, + "fg_color": ["#3a7ebf", "#1f538d"], + "button_color": ["#325882", "#14375e"], + "button_hover_color": ["#234567", "#1e2c40"], + "text_color": ["#DCE4EE", "#DCE4EE"], + "text_color_disabled": ["gray74", "gray60"] + }, + "CTkComboBox": { + "corner_radius": 6, + "border_width": 2, + "fg_color": ["#F9F9FA", "#343638"], + "border_color": ["#979DA2", "#565B5E"], + "button_color": ["#979DA2", "#565B5E"], + "button_hover_color": ["#6E7174", "#7A848D"], + "text_color": ["gray14", "gray84"], + "text_color_disabled": ["gray50", "gray45"] + }, + "CTkScrollbar": { + "corner_radius": 1000, + "border_spacing": 4, + "fg_color": "transparent", + "button_color": ["gray55", "gray41"], + "button_hover_color": ["gray40", "gray53"] + }, + "CTkSegmentedButton": { + "corner_radius": 6, + "border_width": 2, + "fg_color": ["#979DA2", "gray29"], + "selected_color": ["#3a7ebf", "#1f538d"], + "selected_hover_color": ["#325882", "#14375e"], + "unselected_color": ["#979DA2", "gray29"], + "unselected_hover_color": ["gray70", "gray41"], + "text_color": ["#DCE4EE", "#DCE4EE"], + "text_color_disabled": ["gray74", "gray60"] + }, + "CTkTextbox": { + "corner_radius": 6, + "border_width": 0, + "fg_color": ["gray100", "gray20"], + "border_color": ["#979DA2", "#565B5E"], + "text_color": ["gray14", "gray84"], + "scrollbar_button_color": ["gray55", "gray41"], + "scrollbar_button_hover_color": ["gray40", "gray53"] + }, + "CTkScrollableFrame": { + "label_fg_color": ["gray80", "gray21"] + }, + "DropdownMenu": { + "fg_color": ["gray90", "gray20"], + "hover_color": ["gray75", "gray28"], + "text_color": ["gray14", "gray84"] + }, + "CTkFont": { + "macOS": { + "family": "Avenir", + "size": 12, + "weight": "normal" + }, + "Windows": { + "family": "Corbel", + "size": 12, + "weight": "normal" + }, + "Linux": { + "family": "Montserrat", + "size": 12, + "weight": "normal" + } + }, + "RoopDonate": { + "text_color": ["#3a7ebf", "gray60"] + } +} diff --git a/roop/ui.py b/roop/ui.py new file mode 100644 index 0000000000000000000000000000000000000000..ba693dac116bd416b91518734fa550e9dfb95c7b --- /dev/null +++ b/roop/ui.py @@ -0,0 +1,231 @@ +import os +import webbrowser +import customtkinter as ctk +from typing import Callable, Tuple +import cv2 +from PIL import Image, ImageOps + +import roop.globals +import roop.metadata +from roop.face_analyser import get_one_face +from roop.capturer import get_video_frame, get_video_frame_total +from roop.predicter import predict_frame +from roop.processors.frame.core import get_frame_processors_modules +from roop.utilities import is_image, is_video, resolve_relative_path + +ROOT = None +ROOT_HEIGHT = 700 +ROOT_WIDTH = 600 + +PREVIEW = 
+
+
+def init(start: Callable[[], None], destroy: Callable[[], None]) -> ctk.CTk:
+    global ROOT, PREVIEW
+
+    ROOT = create_root(start, destroy)
+    PREVIEW = create_preview(ROOT)
+
+    return ROOT
+
+
+def create_root(start: Callable[[], None], destroy: Callable[[], None]) -> ctk.CTk:
+    global source_label, target_label, status_label
+
+    ctk.deactivate_automatic_dpi_awareness()
+    ctk.set_appearance_mode('system')
+    ctk.set_default_color_theme(resolve_relative_path('ui.json'))
+
+    root = ctk.CTk()
+    root.minsize(ROOT_WIDTH, ROOT_HEIGHT)
+    root.title(f'{roop.metadata.name} {roop.metadata.version}')
+    root.configure()
+    root.protocol('WM_DELETE_WINDOW', lambda: destroy())
+
+    source_label = ctk.CTkLabel(root, text=None)
+    source_label.place(relx=0.1, rely=0.1, relwidth=0.3, relheight=0.25)
+
+    target_label = ctk.CTkLabel(root, text=None)
+    target_label.place(relx=0.6, rely=0.1, relwidth=0.3, relheight=0.25)
+
+    source_button = ctk.CTkButton(root, text='Select a face', cursor='hand2', command=lambda: select_source_path())
+    source_button.place(relx=0.1, rely=0.4, relwidth=0.3, relheight=0.1)
+
+    target_button = ctk.CTkButton(root, text='Select a target', cursor='hand2', command=lambda: select_target_path())
+    target_button.place(relx=0.6, rely=0.4, relwidth=0.3, relheight=0.1)
+
+    keep_fps_value = ctk.BooleanVar(value=roop.globals.keep_fps)
+    keep_fps_switch = ctk.CTkSwitch(root, text='Keep fps', variable=keep_fps_value, cursor='hand2', command=lambda: setattr(roop.globals, 'keep_fps', keep_fps_value.get()))
+    keep_fps_switch.place(relx=0.1, rely=0.6)
+
+    keep_frames_value = ctk.BooleanVar(value=roop.globals.keep_frames)
+    keep_frames_switch = ctk.CTkSwitch(root, text='Keep frames', variable=keep_frames_value, cursor='hand2', command=lambda: setattr(roop.globals, 'keep_frames', keep_frames_value.get()))
+    keep_frames_switch.place(relx=0.1, rely=0.65)
+
+    keep_audio_value = ctk.BooleanVar(value=roop.globals.keep_audio)
+    keep_audio_switch = ctk.CTkSwitch(root, text='Keep audio', variable=keep_audio_value, cursor='hand2', command=lambda: setattr(roop.globals, 'keep_audio', keep_audio_value.get()))
+    keep_audio_switch.place(relx=0.6, rely=0.6)
+
+    many_faces_value = ctk.BooleanVar(value=roop.globals.many_faces)
+    many_faces_switch = ctk.CTkSwitch(root, text='Many faces', variable=many_faces_value, cursor='hand2', command=lambda: setattr(roop.globals, 'many_faces', many_faces_value.get()))
+    many_faces_switch.place(relx=0.6, rely=0.65)
+
+    start_button = ctk.CTkButton(root, text='Start', cursor='hand2', command=lambda: select_output_path(start))
+    start_button.place(relx=0.15, rely=0.75, relwidth=0.2, relheight=0.05)
+
+    stop_button = ctk.CTkButton(root, text='Destroy', cursor='hand2', command=lambda: destroy())
+    stop_button.place(relx=0.4, rely=0.75, relwidth=0.2, relheight=0.05)
+
+    preview_button = ctk.CTkButton(root, text='Preview', cursor='hand2', command=lambda: toggle_preview())
+    preview_button.place(relx=0.65, rely=0.75, relwidth=0.2, relheight=0.05)
+
+    status_label = ctk.CTkLabel(root, text=None, justify='center')
+    status_label.place(relx=0.1, rely=0.9, relwidth=0.8)
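+
+    # 'RoopDonate' is a custom key defined in ui.json (loaded through
+    # set_default_color_theme above), so the donate label's color is resolved
+    # via ThemeManager instead of being hard-coded here.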
+    donate_label = ctk.CTkLabel(root, text='^_^ Donate to project ^_^', justify='center', cursor='hand2')
+    donate_label.place(relx=0.1, rely=0.95, relwidth=0.8)
+    donate_label.configure(text_color=ctk.ThemeManager.theme.get('RoopDonate').get('text_color'))
+    donate_label.bind('