Commit dd31ccf · setting up model
Author: root
Parent(s): 1bfd414

Note: this view is limited to 50 files because the commit contains too many changes.
- __pycache__/handler.cpython-310.pyc +0 -0
- configs/inference/inference_v1.yaml +23 -0
- configs/inference/inference_v2.yaml +35 -0
- configs/prompts/animation.yaml +26 -0
- gfpgan/weights/detection_Resnet50_Final.pth +3 -0
- gfpgan/weights/parsing_parsenet.pth +3 -0
- good_face.jpeg +0 -0
- handler.py +247 -0
- input.jpg +0 -0
- models/GFPGANv1.4.pth +3 -0
- models/inswapper_128.onnx +3 -0
- output.mp4 +0 -0
- output/gradio/animation_output.mp4 +0 -0
- output/gradio/cropped_face.jpg +0 -0
- output/gradio/output_video.mp4 +0 -0
- pose_video.mp4 +0 -0
- pretrained_weights/DWPose/dw-ll_ucoco_384.onnx +3 -0
- pretrained_weights/DWPose/yolox_l.onnx +3 -0
- pretrained_weights/denoising_unet.pth +3 -0
- pretrained_weights/image_encoder/config.json +23 -0
- pretrained_weights/image_encoder/pytorch_model.bin +3 -0
- pretrained_weights/motion_module.pth +3 -0
- pretrained_weights/pose_guider.pth +3 -0
- pretrained_weights/reference_unet.pth +3 -0
- pretrained_weights/sd-vae-ft-mse/config.json +29 -0
- pretrained_weights/sd-vae-ft-mse/diffusion_pytorch_model.bin +3 -0
- pretrained_weights/sd-vae-ft-mse/diffusion_pytorch_model.safetensors +3 -0
- pretrained_weights/stable-diffusion-v1-5/feature_extractor/preprocessor_config.json +20 -0
- pretrained_weights/stable-diffusion-v1-5/model_index.json +32 -0
- pretrained_weights/stable-diffusion-v1-5/unet/config.json +36 -0
- pretrained_weights/stable-diffusion-v1-5/unet/diffusion_pytorch_model.bin +3 -0
- pretrained_weights/stable-diffusion-v1-5/v1-inference.yaml +70 -0
- requirements.txt +39 -0
- roop/__init__.py +0 -0
- roop/__pycache__/__init__.cpython-310.pyc +0 -0
- roop/__pycache__/capturer.cpython-310.pyc +0 -0
- roop/__pycache__/core.cpython-310.pyc +0 -0
- roop/__pycache__/face_analyser.cpython-310.pyc +0 -0
- roop/__pycache__/globals.cpython-310.pyc +0 -0
- roop/__pycache__/metadata.cpython-310.pyc +0 -0
- roop/__pycache__/predicter.cpython-310.pyc +0 -0
- roop/__pycache__/typing.cpython-310.pyc +0 -0
- roop/__pycache__/ui.cpython-310.pyc +0 -0
- roop/__pycache__/utilities.cpython-310.pyc +0 -0
- roop/capturer.py +20 -0
- roop/core.py +215 -0
- roop/face_analyser.py +34 -0
- roop/globals.py +17 -0
- roop/metadata.py +2 -0
- roop/predicter.py +43 -0
__pycache__/handler.cpython-310.pyc
ADDED
Binary file (8.09 kB)

configs/inference/inference_v1.yaml
ADDED
@@ -0,0 +1,23 @@
+unet_additional_kwargs:
+  unet_use_cross_frame_attention: false
+  unet_use_temporal_attention: false
+  use_motion_module: true
+  motion_module_resolutions: [1,2,4,8]
+  motion_module_mid_block: false
+  motion_module_decoder_only: false
+  motion_module_type: "Vanilla"
+
+  motion_module_kwargs:
+    num_attention_heads: 8
+    num_transformer_block: 1
+    attention_block_types: [ "Temporal_Self", "Temporal_Self" ]
+    temporal_position_encoding: true
+    temporal_position_encoding_max_len: 24
+    temporal_attention_dim_div: 1
+
+noise_scheduler_kwargs:
+  beta_start: 0.00085
+  beta_end: 0.012
+  beta_schedule: "linear"
+  steps_offset: 1
+  clip_sample: False

configs/inference/inference_v2.yaml
ADDED
@@ -0,0 +1,35 @@
+unet_additional_kwargs:
+  use_inflated_groupnorm: true
+  unet_use_cross_frame_attention: false
+  unet_use_temporal_attention: false
+  use_motion_module: true
+  motion_module_resolutions:
+  - 1
+  - 2
+  - 4
+  - 8
+  motion_module_mid_block: true
+  motion_module_decoder_only: false
+  motion_module_type: Vanilla
+  motion_module_kwargs:
+    num_attention_heads: 8
+    num_transformer_block: 1
+    attention_block_types:
+    - Temporal_Self
+    - Temporal_Self
+    temporal_position_encoding: true
+    temporal_position_encoding_max_len: 32
+    temporal_attention_dim_div: 1
+
+noise_scheduler_kwargs:
+  beta_start: 0.00085
+  beta_end: 0.012
+  beta_schedule: "linear"
+  clip_sample: false
+  steps_offset: 1
+  ### Zero-SNR params
+  prediction_type: "v_prediction"
+  rescale_betas_zero_snr: True
+  timestep_spacing: "trailing"
+
+sampler: DDIM

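For reference, a minimal sketch (not part of this commit) of how the noise_scheduler_kwargs block above gets turned into a sampler — handler.py further down in this diff does exactly this; it assumes the diffusers and omegaconf versions pinned in requirements.txt.

    # Build the DDIM sampler from the config above, as handler.py does.
    from omegaconf import OmegaConf
    from diffusers import DDIMScheduler

    infer_config = OmegaConf.load("./configs/inference/inference_v2.yaml")
    sched_kwargs = OmegaConf.to_container(infer_config.noise_scheduler_kwargs)
    scheduler = DDIMScheduler(**sched_kwargs)
    print(scheduler.config.prediction_type)  # "v_prediction" (zero-SNR setup)
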
configs/prompts/animation.yaml
ADDED
@@ -0,0 +1,26 @@
+pretrained_base_model_path: "./pretrained_weights/stable-diffusion-v1-5/"
+pretrained_vae_path: "./pretrained_weights/sd-vae-ft-mse"
+image_encoder_path: "./pretrained_weights/image_encoder"
+denoising_unet_path: "./pretrained_weights/denoising_unet.pth"
+reference_unet_path: "./pretrained_weights/reference_unet.pth"
+pose_guider_path: "./pretrained_weights/pose_guider.pth"
+motion_module_path: "./pretrained_weights/motion_module.pth"
+
+inference_config: "./configs/inference/inference_v2.yaml"
+weight_dtype: 'fp16'
+
+test_cases:
+  "./configs/inference/ref_images/anyone-2.png":
+    - "./configs/inference/pose_videos/anyone-video-2_kps.mp4"
+    - "./configs/inference/pose_videos/anyone-video-5_kps.mp4"
+  "./configs/inference/ref_images/anyone-10.png":
+    - "./configs/inference/pose_videos/anyone-video-1_kps.mp4"
+    - "./configs/inference/pose_videos/anyone-video-2_kps.mp4"
+  "./configs/inference/ref_images/anyone-11.png":
+    - "./configs/inference/pose_videos/anyone-video-1_kps.mp4"
+    - "./configs/inference/pose_videos/anyone-video-2_kps.mp4"
+  "./configs/inference/ref_images/anyone-3.png":
+    - "./configs/inference/pose_videos/anyone-video-2_kps.mp4"
+    - "./configs/inference/pose_videos/anyone-video-5_kps.mp4"
+  "./configs/inference/ref_images/anyone-5.png":
+    - "./configs/inference/pose_videos/anyone-video-2_kps.mp4"

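For reference, a minimal sketch (not part of this commit) of how the test_cases mapping above is typically consumed — each reference-image key maps to a list of pose videos; it assumes omegaconf is installed and the referenced files exist.

    from omegaconf import OmegaConf

    config = OmegaConf.load("./configs/prompts/animation.yaml")
    for ref_image_path, pose_video_paths in config.test_cases.items():
        for pose_video_path in pose_video_paths:
            print(f"animate {ref_image_path} with pose video {pose_video_path}")
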
gfpgan/weights/detection_Resnet50_Final.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d1de9c2944f2ccddca5f5e010ea5ae64a39845a86311af6fdf30841b0a5a16d
+size 109497761

gfpgan/weights/parsing_parsenet.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3d558d8d0e42c20224f13cf5a29c79eba2d59913419f945545d8cf7b72920de2
+size 85331193

good_face.jpeg
ADDED

handler.py
ADDED
@@ -0,0 +1,247 @@
+from typing import Dict, Any
+import torch
+from PIL import Image
+import base64
+from io import BytesIO
+import numpy as np
+from diffusers import AutoencoderKL, DDIMScheduler
+from einops import repeat
+from omegaconf import OmegaConf
+from transformers import CLIPVisionModelWithProjection
+import cv2
+import os
+from backgroundremover.bg import remove as remove_bg
+from src.models.pose_guider import PoseGuider
+from src.models.unet_2d_condition import UNet2DConditionModel
+from src.models.unet_3d import UNet3DConditionModel
+from src.pipelines.pipeline_pose2vid_long import Pose2VideoPipeline
+from src.utils.util import read_frames, get_fps, save_videos_grid
+import roop.globals
+from roop.core import start, decode_execution_providers, suggest_max_memory, suggest_execution_threads
+from roop.utilities import normalize_output_path
+from roop.processors.frame.core import get_frame_processors_modules
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+if device.type != 'cuda':
+    raise ValueError("The model requires a GPU for inference.")
+
+class EndpointHandler():
+    def __init__(self, path=""):
+        self.config = OmegaConf.load("./configs/prompts/animation.yaml")
+        self.weight_dtype = torch.float16
+        self.pipeline = None
+        self._initialize_pipeline()
+
+    def _initialize_pipeline(self):
+        vae = AutoencoderKL.from_pretrained('./pretrained_weights/sd-vae-ft-mse').to(device, dtype=self.weight_dtype)
+
+        reference_unet = UNet2DConditionModel.from_pretrained(
+            self.config.pretrained_base_model_path,
+            subfolder="unet"
+        ).to(device, dtype=self.weight_dtype)
+
+        inference_config_path = self.config.inference_config
+        infer_config = OmegaConf.load(inference_config_path)
+        denoising_unet = UNet3DConditionModel.from_pretrained_2d(
+            self.config.pretrained_base_model_path,
+            self.config.motion_module_path,
+            subfolder="unet",
+            unet_additional_kwargs=infer_config.unet_additional_kwargs,
+        ).to(device, dtype=self.weight_dtype)
+
+        pose_guider = PoseGuider(320, block_out_channels=(16, 32, 96, 256)).to(device, dtype=self.weight_dtype)
+        image_enc = CLIPVisionModelWithProjection.from_pretrained(self.config.image_encoder_path).to(device, dtype=self.weight_dtype)
+        sched_kwargs = OmegaConf.to_container(infer_config.noise_scheduler_kwargs)
+        scheduler = DDIMScheduler(**sched_kwargs)
+
+        denoising_unet.load_state_dict(torch.load(self.config.denoising_unet_path, map_location="cpu"), strict=False)
+        reference_unet.load_state_dict(torch.load(self.config.reference_unet_path, map_location="cpu"))
+        pose_guider.load_state_dict(torch.load(self.config.pose_guider_path, map_location="cpu"))
+
+        self.pipeline = Pose2VideoPipeline(
+            vae=vae,
+            image_encoder=image_enc,
+            reference_unet=reference_unet,
+            denoising_unet=denoising_unet,
+            pose_guider=pose_guider,
+            scheduler=scheduler
+        ).to(device, dtype=self.weight_dtype)
+
+    def _crop_face(self, image, save_path="cropped_face.jpg", margin=0.3):
+        # Convert image to OpenCV format
+        cv_image = np.array(image)
+        cv_image = cv_image[:, :, ::-1].copy()
+
+        # Load OpenCV face detector
+        face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
+
+        # Detect faces
+        gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY)
+        faces = face_cascade.detectMultiScale(gray, 1.1, 4)
+
+        if len(faces) == 0:
+            raise ValueError("No faces detected in the reference image.")
+
+        # Crop the first face found with a margin
+        x, y, w, h = faces[0]
+        x_margin = int(margin * w)
+        y_margin = int(margin * h)
+
+        x1 = max(0, x - x_margin)
+        y1 = max(0, y - y_margin)
+        x2 = min(cv_image.shape[1], x + w + x_margin)
+        y2 = min(cv_image.shape[0], y + h + y_margin)
+
+        cropped_face = cv_image[y1:y2, x1:x2]
+
+        # Convert back to PIL format
+        cropped_face = Image.fromarray(cropped_face[:, :, ::-1]).convert("RGB")
+
+        # Save the cropped face
+        cropped_face.save(save_path, format="JPEG", quality=95)
+
+        return cropped_face
+
+    def _swap_face(self, source_image, target_video_path):
+        # Use a predefined face image instead of the provided source_image
+        source_path = "/root/AnimateAnyone/good_face.jpeg"  # Change this to your known good face image path
+        output_path = "output.mp4"
+
+        roop.globals.source_path = source_path
+        roop.globals.target_path = target_video_path
+        roop.globals.output_path = normalize_output_path(roop.globals.source_path, roop.globals.target_path, output_path)
+        roop.globals.frame_processors = ["face_swapper", "face_enhancer"]
+        roop.globals.headless = True
+        roop.globals.keep_fps = True
+        roop.globals.keep_audio = True
+        roop.globals.keep_frames = False
+        roop.globals.many_faces = False
+        roop.globals.video_encoder = "libx264"
+        roop.globals.video_quality = 50
+        roop.globals.max_memory = suggest_max_memory()
+        roop.globals.execution_providers = decode_execution_providers(["cpu"])
+        roop.globals.execution_threads = suggest_execution_threads()
+
+        for frame_processor in get_frame_processors_modules(roop.globals.frame_processors):
+            if not frame_processor.pre_check():
+                raise ValueError("Frame processor pre-check failed.")
+
+        print(f"Starting face swap with source: {source_path} and target: {target_video_path}")
+        start()
+        print(f"Face swap completed. Output saved to: {output_path}")
+
+        return os.path.join(os.getcwd(), output_path)
+
+    def remove_bg_from_image(self, image_data):
+        model_name = "u2net"  # Choose your preferred model: "u2net", "u2net_human_seg", "u2netp"
+        processed_image_data = remove_bg(
+            image_data,
+            model_name=model_name,
+            alpha_matting=True,
+            alpha_matting_foreground_threshold=240,
+            alpha_matting_background_threshold=10,
+            alpha_matting_erode_structure_size=10,
+            alpha_matting_base_size=1000
+        )
+        return processed_image_data
+
+    def _remove_background(self, input_path, output_path):
+        cap = cv2.VideoCapture(input_path)
+        if not cap.isOpened():
+            raise IOError(f"Error opening video file {input_path}")
+
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        fps = int(cap.get(cv2.CAP_PROP_FPS))
+
+        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
+
+        frame_count = 0
+        while cap.isOpened():
+            ret, frame = cap.read()
+            if not ret:
+                break
+
+            frame_count += 1
+            pil_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+            frame_data = BytesIO()
+            pil_frame.save(frame_data, format="PNG")
+            frame_data = frame_data.getvalue()
+            processed_frame_data = self.remove_bg_from_image(frame_data)
+            processed_pil_frame = Image.open(BytesIO(processed_frame_data))
+            processed_frame = cv2.cvtColor(np.array(processed_pil_frame), cv2.COLOR_RGB2BGR)
+
+            out.write(processed_frame)
+
+        cap.release()
+        out.release()
+
+        if frame_count == 0:
+            raise IOError(f"No frames processed. Error with video file {input_path}")
+
+    def __call__(self, data: Any) -> Dict[str, str]:
+        inputs = data.get("inputs", {})
+        ref_image_base64 = inputs.get("ref_image", "")
+        pose_video_path = inputs.get("pose_video_path", "")
+        width = inputs.get("width", 512)
+        height = inputs.get("height", 768)
+        length = inputs.get("length", 24)
+        num_inference_steps = inputs.get("num_inference_steps", 25)
+        cfg = inputs.get("cfg", 3.5)
+        seed = inputs.get("seed", 123)
+
+        ref_image = Image.open(BytesIO(base64.b64decode(ref_image_base64)))
+
+        torch.manual_seed(seed)
+        pose_images = read_frames(pose_video_path)
+        src_fps = get_fps(pose_video_path)
+
+        pose_list = []
+        total_length = min(length, len(pose_images))
+        for pose_image_pil in pose_images[:total_length]:
+            pose_list.append(pose_image_pil)
+
+        video = self.pipeline(
+            ref_image,
+            pose_list,
+            width=width,
+            height=height,
+            video_length=total_length,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=cfg
+        ).videos
+
+        save_dir = "./output/gradio"
+        if not os.path.exists(save_dir):
+            os.makedirs(save_dir, exist_ok=True)
+        animation_path = os.path.join(save_dir, "animation_output.mp4")
+        save_videos_grid(video, animation_path, n_rows=1, fps=src_fps)
+
+        # Crop the face from the reference image and save it
+        cropped_face_path = os.path.join(save_dir, "cropped_face.jpg")
+        cropped_face = self._crop_face(ref_image, save_path=cropped_face_path)
+
+        # Perform face swapping
+        print(f"Starting face swap with cropped face: {cropped_face_path} and animation: {animation_path}")
+        final_video_path = self._swap_face(cropped_face, animation_path)
+        print(f"Face swap completed. Final video path: {final_video_path}")
+
+        # Ensure the output file exists before trying to open it
+        if not os.path.exists(final_video_path):
+            raise FileNotFoundError(f"Expected output file not found: {final_video_path}")
+
+        # Remove the background from the final video
+        bg_removed_video_path = os.path.join(save_dir, "bg_removed_output.mp4")
+        self._remove_background(final_video_path, bg_removed_video_path)
+        print(f"Background removal completed. Output saved to: {bg_removed_video_path}")
+
+        # Encode the final video in base64
+        with open(bg_removed_video_path, "rb") as video_file:
+            video_base64 = base64.b64encode(video_file.read()).decode("utf-8")
+
+        torch.cuda.empty_cache()
+
+        return {"video": video_base64}

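For reference, a minimal local smoke test for the handler above (not part of this commit; the output filename "result.mp4" is hypothetical). Note the module raises at import time unless a CUDA GPU is available; input.jpg and pose_video.mp4 are the sample files added in this commit.

    import base64
    from handler import EndpointHandler

    handler = EndpointHandler()
    with open("input.jpg", "rb") as f:
        ref_image_base64 = base64.b64encode(f.read()).decode("utf-8")

    result = handler({
        "inputs": {
            "ref_image": ref_image_base64,
            "pose_video_path": "pose_video.mp4",
            "width": 512,
            "height": 768,
            "length": 24,
        }
    })
    with open("result.mp4", "wb") as f:
        f.write(base64.b64decode(result["video"]))
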
input.jpg
ADDED

models/GFPGANv1.4.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2cd4703ab14f4d01fd1383a8a8b266f9a5833dacee8e6a79d3bf21a1b6be5ad
+size 348632874

models/inswapper_128.onnx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e4a3f08c753cb72d04e10aa0f7dbe3deebbf39567d4ead6dce08e98aa49e16af
+size 554253681

output.mp4
ADDED
Binary file (96.8 kB)

output/gradio/animation_output.mp4
ADDED
Binary file (79.9 kB)

output/gradio/cropped_face.jpg
ADDED

output/gradio/output_video.mp4
ADDED
Binary file (840 kB)

pose_video.mp4
ADDED
Binary file (755 kB)

pretrained_weights/DWPose/dw-ll_ucoco_384.onnx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:724f4ff2439ed61afb86fb8a1951ec39c6220682803b4a8bd4f598cd913b1843
+size 134399116

pretrained_weights/DWPose/yolox_l.onnx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7860ae79de6c89a3c1eb72ae9a2756c0ccfbe04b7791bb5880afabd97855a411
+size 216746733

pretrained_weights/denoising_unet.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9e5a2c34fac369e8a922972ca2210916c6af175a0dad907deccf6235816ad52
+size 3438374293

pretrained_weights/image_encoder/config.json
ADDED
@@ -0,0 +1,23 @@
+{
+  "_name_or_path": "/home/jpinkney/.cache/huggingface/diffusers/models--lambdalabs--sd-image-variations-diffusers/snapshots/ca6f97f838ae1b5bf764f31363a21f388f4d8f3e/image_encoder",
+  "architectures": [
+    "CLIPVisionModelWithProjection"
+  ],
+  "attention_dropout": 0.0,
+  "dropout": 0.0,
+  "hidden_act": "quick_gelu",
+  "hidden_size": 1024,
+  "image_size": 224,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "model_type": "clip_vision_model",
+  "num_attention_heads": 16,
+  "num_channels": 3,
+  "num_hidden_layers": 24,
+  "patch_size": 14,
+  "projection_dim": 768,
+  "torch_dtype": "float32",
+  "transformers_version": "4.25.1"
+}

pretrained_weights/image_encoder/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89d2aa29b5fdf64f3ad4f45fb4227ea98bc45156bbae673b85be1af7783dbabb
+size 1215993967

pretrained_weights/motion_module.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d11e01a281b39880da2efeea892215c1313e5713fca3d100a7fbb72ee312ef9
+size 1817900227

pretrained_weights/pose_guider.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a8b7c1b4db92980fd977b4fd003c1396bbae9a9cdea00c35d452136d5e4f488
+size 4351337

pretrained_weights/reference_unet.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:beddccb08d49a8b29b0f4d6d456c6521d4382a8d8d48884fa60ba8802509c214
+size 3438323817

pretrained_weights/sd-vae-ft-mse/config.json
ADDED
@@ -0,0 +1,29 @@
+{
+  "_class_name": "AutoencoderKL",
+  "_diffusers_version": "0.4.2",
+  "act_fn": "silu",
+  "block_out_channels": [
+    128,
+    256,
+    512,
+    512
+  ],
+  "down_block_types": [
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D"
+  ],
+  "in_channels": 3,
+  "latent_channels": 4,
+  "layers_per_block": 2,
+  "norm_num_groups": 32,
+  "out_channels": 3,
+  "sample_size": 256,
+  "up_block_types": [
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D"
+  ]
+}

pretrained_weights/sd-vae-ft-mse/diffusion_pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b4889b6b1d4ce7ae320a02dedaeff1780ad77d415ea0d744b476155c6377ddc
+size 334707217

pretrained_weights/sd-vae-ft-mse/diffusion_pytorch_model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1d993488569e928462932c8c38a0760b874d166399b14414135bd9c42df5815
+size 334643276

pretrained_weights/stable-diffusion-v1-5/feature_extractor/preprocessor_config.json
ADDED
@@ -0,0 +1,20 @@
+{
+  "crop_size": 224,
+  "do_center_crop": true,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_resize": true,
+  "feature_extractor_type": "CLIPFeatureExtractor",
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "resample": 3,
+  "size": 224
+}

pretrained_weights/stable-diffusion-v1-5/model_index.json
ADDED
@@ -0,0 +1,32 @@
+{
+  "_class_name": "StableDiffusionPipeline",
+  "_diffusers_version": "0.6.0",
+  "feature_extractor": [
+    "transformers",
+    "CLIPImageProcessor"
+  ],
+  "safety_checker": [
+    "stable_diffusion",
+    "StableDiffusionSafetyChecker"
+  ],
+  "scheduler": [
+    "diffusers",
+    "PNDMScheduler"
+  ],
+  "text_encoder": [
+    "transformers",
+    "CLIPTextModel"
+  ],
+  "tokenizer": [
+    "transformers",
+    "CLIPTokenizer"
+  ],
+  "unet": [
+    "diffusers",
+    "UNet2DConditionModel"
+  ],
+  "vae": [
+    "diffusers",
+    "AutoencoderKL"
+  ]
+}

pretrained_weights/stable-diffusion-v1-5/unet/config.json
ADDED
@@ -0,0 +1,36 @@
+{
+  "_class_name": "UNet2DConditionModel",
+  "_diffusers_version": "0.6.0",
+  "act_fn": "silu",
+  "attention_head_dim": 8,
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "center_input_sample": false,
+  "cross_attention_dim": 768,
+  "down_block_types": [
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "DownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_scale_factor": 1,
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "out_channels": 4,
+  "sample_size": 64,
+  "up_block_types": [
+    "UpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D"
+  ]
+}

pretrained_weights/stable-diffusion-v1-5/unet/diffusion_pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c7da0e21ba7ea50637bee26e81c220844defdf01aafca02b2c42ecdadb813de4
+size 3438354725

pretrained_weights/stable-diffusion-v1-5/v1-inference.yaml
ADDED
@@ -0,0 +1,70 @@
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder

requirements.txt
ADDED
@@ -0,0 +1,39 @@
+--extra-index-url https://download.pytorch.org/whl/cu118
+
+numpy==1.23.5
+opencv-python==4.7.0.72
+onnx==1.14.0
+insightface==0.7.3
+psutil==5.9.5
+tk==0.1.0
+customtkinter==5.1.3
+pillow==9.5.0
+torch==2.0.1+cu118; sys_platform != 'darwin'
+torch==2.0.1; sys_platform == 'darwin'
+torchvision==0.15.2+cu118; sys_platform != 'darwin'
+torchvision==0.15.2; sys_platform == 'darwin'
+onnxruntime==1.15.0; sys_platform == 'darwin' and platform_machine != 'arm64'
+onnxruntime-silicon==1.13.1; sys_platform == 'darwin' and platform_machine == 'arm64'
+onnxruntime-gpu==1.15.0; sys_platform != 'darwin'
+tensorflow==2.13.0rc1; sys_platform == 'darwin'
+tensorflow==2.12.0; sys_platform != 'darwin'
+opennsfw2==0.10.2
+protobuf==4.23.2
+tqdm==4.65.0
+gfpgan==1.3.8
+gradio==3.40.1
+tkinterdnd2==0.3.0; sys_platform != 'darwin' and platform_machine != 'arm64'
+tkinterdnd2-universal==1.7.3; sys_platform == 'darwin' and platform_machine == 'arm64'
+onnxruntime-coreml==1.13.1; python_version == '3.9' and sys_platform == 'darwin' and platform_machine != 'arm64'
+
+# Add additional dependencies
+diffusers==0.24.0
+omegaconf==2.2.3
+
+# Face swap related dependencies
+facenet-pytorch==2.5.2
+dlib==19.22.0
+
+# Background removal
+backgroundremover

roop/__init__.py
ADDED
File without changes

roop/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (129 Bytes)

roop/__pycache__/capturer.cpython-310.pyc
ADDED
Binary file (803 Bytes)

roop/__pycache__/core.cpython-310.pyc
ADDED
Binary file (8.33 kB)

roop/__pycache__/face_analyser.cpython-310.pyc
ADDED
Binary file (1.25 kB)

roop/__pycache__/globals.cpython-310.pyc
ADDED
Binary file (525 Bytes)

roop/__pycache__/metadata.cpython-310.pyc
ADDED
Binary file (164 Bytes)

roop/__pycache__/predicter.cpython-310.pyc
ADDED
Binary file (1.65 kB)

roop/__pycache__/typing.cpython-310.pyc
ADDED
Binary file (267 Bytes)

roop/__pycache__/ui.cpython-310.pyc
ADDED
Binary file (8.39 kB)

roop/__pycache__/utilities.cpython-310.pyc
ADDED
Binary file (5.58 kB)

roop/capturer.py
ADDED
@@ -0,0 +1,20 @@
+from typing import Any
+import cv2
+
+
+def get_video_frame(video_path: str, frame_number: int = 0) -> Any:
+    capture = cv2.VideoCapture(video_path)
+    frame_total = capture.get(cv2.CAP_PROP_FRAME_COUNT)
+    capture.set(cv2.CAP_PROP_POS_FRAMES, min(frame_total, frame_number - 1))
+    has_frame, frame = capture.read()
+    capture.release()
+    if has_frame:
+        return frame
+    return None
+
+
+def get_video_frame_total(video_path: str) -> int:
+    capture = cv2.VideoCapture(video_path)
+    video_frame_total = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
+    capture.release()
+    return video_frame_total

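For reference, a minimal usage sketch for the two helpers above (not part of this commit), using the pose_video.mp4 added in this change:

    from roop.capturer import get_video_frame, get_video_frame_total

    total = get_video_frame_total("pose_video.mp4")
    frame = get_video_frame("pose_video.mp4", total // 2)  # BGR ndarray, or None
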
roop/core.py
ADDED
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+# single thread doubles cuda performance - needs to be set before torch import
+if any(arg.startswith('--execution-provider') for arg in sys.argv):
+    os.environ['OMP_NUM_THREADS'] = '1'
+# reduce tensorflow log level
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+import warnings
+from typing import List
+import platform
+import signal
+import shutil
+import argparse
+import torch
+import onnxruntime
+import tensorflow
+
+import roop.globals
+import roop.metadata
+import roop.ui as ui
+from roop.predicter import predict_image, predict_video
+from roop.processors.frame.core import get_frame_processors_modules
+from roop.utilities import has_image_extension, is_image, is_video, detect_fps, create_video, extract_frames, get_temp_frame_paths, restore_audio, create_temp, move_temp, clean_temp, normalize_output_path
+
+if 'ROCMExecutionProvider' in roop.globals.execution_providers:
+    del torch
+
+warnings.filterwarnings('ignore', category=FutureWarning, module='insightface')
+warnings.filterwarnings('ignore', category=UserWarning, module='torchvision')
+
+
+def parse_args() -> None:
+    signal.signal(signal.SIGINT, lambda signal_number, frame: destroy())
+    program = argparse.ArgumentParser(formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=100))
+    program.add_argument('-s', '--source', help='select an source image', dest='source_path')
+    program.add_argument('-t', '--target', help='select an target image or video', dest='target_path')
+    program.add_argument('-o', '--output', help='select output file or directory', dest='output_path')
+    program.add_argument('--frame-processor', help='frame processors (choices: face_swapper, face_enhancer, ...)', dest='frame_processor', default=['face_swapper'], nargs='+')
+    program.add_argument('--keep-fps', help='keep original fps', dest='keep_fps', action='store_true', default=False)
+    program.add_argument('--keep-audio', help='keep original audio', dest='keep_audio', action='store_true', default=True)
+    program.add_argument('--keep-frames', help='keep temporary frames', dest='keep_frames', action='store_true', default=False)
+    program.add_argument('--many-faces', help='process every face', dest='many_faces', action='store_true', default=False)
+    program.add_argument('--video-encoder', help='adjust output video encoder', dest='video_encoder', default='libx264', choices=['libx264', 'libx265', 'libvpx-vp9'])
+    program.add_argument('--video-quality', help='adjust output video quality', dest='video_quality', type=int, default=18, choices=range(52), metavar='[0-51]')
+    program.add_argument('--max-memory', help='maximum amount of RAM in GB', dest='max_memory', type=int, default=suggest_max_memory())
+    program.add_argument('--execution-provider', help='available execution provider (choices: cpu, ...)', dest='execution_provider', default=['cpu'], choices=suggest_execution_providers(), nargs='+')
+    program.add_argument('--execution-threads', help='number of execution threads', dest='execution_threads', type=int, default=suggest_execution_threads())
+    program.add_argument('-v', '--version', action='version', version=f'{roop.metadata.name} {roop.metadata.version}')
+
+    args = program.parse_args()
+
+    roop.globals.source_path = args.source_path
+    roop.globals.target_path = args.target_path
+    roop.globals.output_path = normalize_output_path(roop.globals.source_path, roop.globals.target_path, args.output_path)
+    roop.globals.frame_processors = args.frame_processor
+    roop.globals.headless = args.source_path or args.target_path or args.output_path
+    roop.globals.keep_fps = args.keep_fps
+    roop.globals.keep_audio = args.keep_audio
+    roop.globals.keep_frames = args.keep_frames
+    roop.globals.many_faces = args.many_faces
+    roop.globals.video_encoder = args.video_encoder
+    roop.globals.video_quality = args.video_quality
+    roop.globals.max_memory = args.max_memory
+    roop.globals.execution_providers = decode_execution_providers(args.execution_provider)
+    roop.globals.execution_threads = args.execution_threads
+
+
+def encode_execution_providers(execution_providers: List[str]) -> List[str]:
+    return [execution_provider.replace('ExecutionProvider', '').lower() for execution_provider in execution_providers]
+
+
+def decode_execution_providers(execution_providers: List[str]) -> List[str]:
+    return [provider for provider, encoded_execution_provider in zip(onnxruntime.get_available_providers(), encode_execution_providers(onnxruntime.get_available_providers()))
+            if any(execution_provider in encoded_execution_provider for execution_provider in execution_providers)]
+
+
+def suggest_max_memory() -> int:
+    if platform.system().lower() == 'darwin':
+        return 10
+    return 14
+
+
+def suggest_execution_providers() -> List[str]:
+    return encode_execution_providers(onnxruntime.get_available_providers())
+
+
+def suggest_execution_threads() -> int:
+    if 'DmlExecutionProvider' in roop.globals.execution_providers:
+        return 1
+    if 'ROCMExecutionProvider' in roop.globals.execution_providers:
+        return 1
+    return 8
+
+
+def limit_resources() -> None:
+    # prevent tensorflow memory leak
+    gpus = tensorflow.config.experimental.list_physical_devices('GPU')
+    for gpu in gpus:
+        tensorflow.config.experimental.set_virtual_device_configuration(gpu, [
+            tensorflow.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)
+        ])
+    # limit memory usage
+    if roop.globals.max_memory:
+        memory = roop.globals.max_memory * 1024 ** 3
+        if platform.system().lower() == 'darwin':
+            memory = roop.globals.max_memory * 1024 ** 6
+        if platform.system().lower() == 'windows':
+            import ctypes
+            kernel32 = ctypes.windll.kernel32
+            kernel32.SetProcessWorkingSetSize(-1, ctypes.c_size_t(memory), ctypes.c_size_t(memory))
+        else:
+            import resource
+            resource.setrlimit(resource.RLIMIT_DATA, (memory, memory))
+
+
+def release_resources() -> None:
+    if 'CUDAExecutionProvider' in roop.globals.execution_providers:
+        torch.cuda.empty_cache()
+
+
+def pre_check() -> bool:
+    if sys.version_info < (3, 9):
+        update_status('Python version is not supported - please upgrade to 3.9 or higher.')
+        return False
+    if not shutil.which('ffmpeg'):
+        update_status('ffmpeg is not installed.')
+        return False
+    return True
+
+
+def update_status(message: str, scope: str = 'ROOP.CORE') -> None:
+    print(f'[{scope}] {message}')
+    if not roop.globals.headless:
+        ui.update_status(message)
+
+
+def start() -> None:
+    for frame_processor in get_frame_processors_modules(roop.globals.frame_processors):
+        if not frame_processor.pre_start():
+            return
+    # process image to image
+    if has_image_extension(roop.globals.target_path):
+        if predict_image(roop.globals.target_path):
+            destroy()
+        shutil.copy2(roop.globals.target_path, roop.globals.output_path)
+        for frame_processor in get_frame_processors_modules(roop.globals.frame_processors):
+            update_status('Progressing...', frame_processor.NAME)
+            frame_processor.process_image(roop.globals.source_path, roop.globals.output_path, roop.globals.output_path)
+            frame_processor.post_process()
+            release_resources()
+        if is_image(roop.globals.target_path):
+            update_status('Processing to image succeed!')
+        else:
+            update_status('Processing to image failed!')
+        return
+    # process image to videos
+    if predict_video(roop.globals.target_path):
+        destroy()
+    update_status('Creating temp resources...')
+    create_temp(roop.globals.target_path)
+    update_status('Extracting frames...')
+    extract_frames(roop.globals.target_path)
+    temp_frame_paths = get_temp_frame_paths(roop.globals.target_path)
+    for frame_processor in get_frame_processors_modules(roop.globals.frame_processors):
+        update_status('Progressing...', frame_processor.NAME)
+        frame_processor.process_video(roop.globals.source_path, temp_frame_paths)
+        frame_processor.post_process()
+        release_resources()
+    # handles fps
+    if roop.globals.keep_fps:
+        update_status('Detecting fps...')
+        fps = detect_fps(roop.globals.target_path)
+        update_status(f'Creating video with {fps} fps...')
+        create_video(roop.globals.target_path, fps)
+    else:
+        update_status('Creating video with 30.0 fps...')
+        create_video(roop.globals.target_path)
+    # handle audio
+    if roop.globals.keep_audio:
+        if roop.globals.keep_fps:
+            update_status('Restoring audio...')
+        else:
+            update_status('Restoring audio might cause issues as fps are not kept...')
+        restore_audio(roop.globals.target_path, roop.globals.output_path)
+    else:
+        move_temp(roop.globals.target_path, roop.globals.output_path)
+    # clean and validate
+    clean_temp(roop.globals.target_path)
+    if is_video(roop.globals.target_path):
+        update_status('Processing to video succeed!')
+    else:
+        update_status('Processing to video failed!')
+
+
+def destroy() -> None:
+    if roop.globals.target_path:
+        clean_temp(roop.globals.target_path)
+    quit()
+
+
+def run() -> None:
+    parse_args()
+    if not pre_check():
+        return
+    for frame_processor in get_frame_processors_modules(roop.globals.frame_processors):
+        if not frame_processor.pre_check():
+            return
+    limit_resources()
+    if roop.globals.headless:
+        start()
+    else:
+        window = ui.init(start, destroy)
+        window.mainloop()

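For reference, a sketch (not part of this commit) of the provider round-trip defined above: encode_execution_providers shortens ONNX Runtime provider names (e.g. "CUDAExecutionProvider" -> "cuda"), and decode_execution_providers maps short names back to whatever providers are actually available.

    import onnxruntime
    from roop.core import encode_execution_providers, decode_execution_providers

    print(encode_execution_providers(onnxruntime.get_available_providers()))
    print(decode_execution_providers(["cpu"]))  # ['CPUExecutionProvider']
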
roop/face_analyser.py
ADDED
@@ -0,0 +1,34 @@
+import threading
+from typing import Any
+import insightface
+
+import roop.globals
+from roop.typing import Frame
+
+FACE_ANALYSER = None
+THREAD_LOCK = threading.Lock()
+
+
+def get_face_analyser() -> Any:
+    global FACE_ANALYSER
+
+    with THREAD_LOCK:
+        if FACE_ANALYSER is None:
+            FACE_ANALYSER = insightface.app.FaceAnalysis(name='buffalo_l', providers=roop.globals.execution_providers)
+            FACE_ANALYSER.prepare(ctx_id=0, det_size=(640, 640))
+    return FACE_ANALYSER
+
+
+def get_one_face(frame: Frame) -> Any:
+    face = get_face_analyser().get(frame)
+    try:
+        return min(face, key=lambda x: x.bbox[0])
+    except ValueError:
+        return None
+
+
+def get_many_faces(frame: Frame) -> Any:
+    try:
+        return get_face_analyser().get(frame)
+    except IndexError:
+        return None

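For reference, a minimal usage sketch (not part of this commit) for the analyser above. roop.globals.execution_providers must be set before the first call, since the cached FaceAnalysis is built from it.

    import cv2
    import roop.globals
    from roop.face_analyser import get_one_face

    roop.globals.execution_providers = ["CPUExecutionProvider"]
    frame = cv2.imread("good_face.jpeg")  # BGR ndarray
    face = get_one_face(frame)            # left-most detected face, or None
    if face is not None:
        print(face.bbox)
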
roop/globals.py
ADDED
@@ -0,0 +1,17 @@
+from typing import List
+
+source_path = None
+target_path = None
+output_path = None
+frame_processors: List[str] = []
+keep_fps = None
+keep_audio = None
+keep_frames = None
+many_faces = None
+video_encoder = None
+video_quality = None
+max_memory = None
+execution_providers: List[str] = []
+execution_threads = None
+headless = None
+log_level = 'error'

roop/metadata.py
ADDED
@@ -0,0 +1,2 @@
+name = 'roop'
+version = '1.1.0'

roop/predicter.py
ADDED
@@ -0,0 +1,43 @@
+import threading
+import numpy
+import opennsfw2
+from PIL import Image
+from keras import Model
+
+from roop.typing import Frame
+
+PREDICTOR = None
+THREAD_LOCK = threading.Lock()
+MAX_PROBABILITY = 0.85
+
+
+def get_predictor() -> Model:
+    global PREDICTOR
+
+    with THREAD_LOCK:
+        if PREDICTOR is None:
+            PREDICTOR = opennsfw2.make_open_nsfw_model()
+    return PREDICTOR
+
+
+def clear_predictor() -> None:
+    global PREDICTOR
+
+    PREDICTOR = None
+
+
+def predict_frame(target_frame: Frame) -> bool:
+    image = Image.fromarray(target_frame)
+    image = opennsfw2.preprocess_image(image, opennsfw2.Preprocessing.YAHOO)
+    views = numpy.expand_dims(image, axis=0)
+    _, probability = get_predictor().predict(views)[0]
+    return probability > MAX_PROBABILITY
+
+
+def predict_image(target_path: str) -> bool:
+    return opennsfw2.predict_image(target_path) > MAX_PROBABILITY
+
+
+def predict_video(target_path: str) -> bool:
+    _, probabilities = opennsfw2.predict_video_frames(video_path=target_path, frame_interval=100)
+    return any(probability > MAX_PROBABILITY for probability in probabilities)

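For reference, a minimal sketch (not part of this commit) of the NSFW gate these helpers implement — roop.core.start() above calls destroy() when either predicate fires:

    from roop.predicter import predict_image, predict_video

    if predict_image("input.jpg"):
        print("image target flagged")
    if predict_video("pose_video.mp4"):
        print("video target flagged")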