root committed
Commit dd31ccf · 1 Parent(s): 1bfd414

setting up model

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. __pycache__/handler.cpython-310.pyc +0 -0
  2. configs/inference/inference_v1.yaml +23 -0
  3. configs/inference/inference_v2.yaml +35 -0
  4. configs/prompts/animation.yaml +26 -0
  5. gfpgan/weights/detection_Resnet50_Final.pth +3 -0
  6. gfpgan/weights/parsing_parsenet.pth +3 -0
  7. good_face.jpeg +0 -0
  8. handler.py +247 -0
  9. input.jpg +0 -0
  10. models/GFPGANv1.4.pth +3 -0
  11. models/inswapper_128.onnx +3 -0
  12. output.mp4 +0 -0
  13. output/gradio/animation_output.mp4 +0 -0
  14. output/gradio/cropped_face.jpg +0 -0
  15. output/gradio/output_video.mp4 +0 -0
  16. pose_video.mp4 +0 -0
  17. pretrained_weights/DWPose/dw-ll_ucoco_384.onnx +3 -0
  18. pretrained_weights/DWPose/yolox_l.onnx +3 -0
  19. pretrained_weights/denoising_unet.pth +3 -0
  20. pretrained_weights/image_encoder/config.json +23 -0
  21. pretrained_weights/image_encoder/pytorch_model.bin +3 -0
  22. pretrained_weights/motion_module.pth +3 -0
  23. pretrained_weights/pose_guider.pth +3 -0
  24. pretrained_weights/reference_unet.pth +3 -0
  25. pretrained_weights/sd-vae-ft-mse/config.json +29 -0
  26. pretrained_weights/sd-vae-ft-mse/diffusion_pytorch_model.bin +3 -0
  27. pretrained_weights/sd-vae-ft-mse/diffusion_pytorch_model.safetensors +3 -0
  28. pretrained_weights/stable-diffusion-v1-5/feature_extractor/preprocessor_config.json +20 -0
  29. pretrained_weights/stable-diffusion-v1-5/model_index.json +32 -0
  30. pretrained_weights/stable-diffusion-v1-5/unet/config.json +36 -0
  31. pretrained_weights/stable-diffusion-v1-5/unet/diffusion_pytorch_model.bin +3 -0
  32. pretrained_weights/stable-diffusion-v1-5/v1-inference.yaml +70 -0
  33. requirements.txt +39 -0
  34. roop/__init__.py +0 -0
  35. roop/__pycache__/__init__.cpython-310.pyc +0 -0
  36. roop/__pycache__/capturer.cpython-310.pyc +0 -0
  37. roop/__pycache__/core.cpython-310.pyc +0 -0
  38. roop/__pycache__/face_analyser.cpython-310.pyc +0 -0
  39. roop/__pycache__/globals.cpython-310.pyc +0 -0
  40. roop/__pycache__/metadata.cpython-310.pyc +0 -0
  41. roop/__pycache__/predicter.cpython-310.pyc +0 -0
  42. roop/__pycache__/typing.cpython-310.pyc +0 -0
  43. roop/__pycache__/ui.cpython-310.pyc +0 -0
  44. roop/__pycache__/utilities.cpython-310.pyc +0 -0
  45. roop/capturer.py +20 -0
  46. roop/core.py +215 -0
  47. roop/face_analyser.py +34 -0
  48. roop/globals.py +17 -0
  49. roop/metadata.py +2 -0
  50. roop/predicter.py +43 -0
__pycache__/handler.cpython-310.pyc ADDED
Binary file (8.09 kB).
 
configs/inference/inference_v1.yaml ADDED
@@ -0,0 +1,23 @@
+ unet_additional_kwargs:
+   unet_use_cross_frame_attention: false
+   unet_use_temporal_attention: false
+   use_motion_module: true
+   motion_module_resolutions: [1,2,4,8]
+   motion_module_mid_block: false
+   motion_module_decoder_only: false
+   motion_module_type: "Vanilla"
+
+   motion_module_kwargs:
+     num_attention_heads: 8
+     num_transformer_block: 1
+     attention_block_types: [ "Temporal_Self", "Temporal_Self" ]
+     temporal_position_encoding: true
+     temporal_position_encoding_max_len: 24
+     temporal_attention_dim_div: 1
+
+ noise_scheduler_kwargs:
+   beta_start: 0.00085
+   beta_end: 0.012
+   beta_schedule: "linear"
+   steps_offset: 1
+   clip_sample: False
configs/inference/inference_v2.yaml ADDED
@@ -0,0 +1,35 @@
+ unet_additional_kwargs:
+   use_inflated_groupnorm: true
+   unet_use_cross_frame_attention: false
+   unet_use_temporal_attention: false
+   use_motion_module: true
+   motion_module_resolutions:
+   - 1
+   - 2
+   - 4
+   - 8
+   motion_module_mid_block: true
+   motion_module_decoder_only: false
+   motion_module_type: Vanilla
+   motion_module_kwargs:
+     num_attention_heads: 8
+     num_transformer_block: 1
+     attention_block_types:
+     - Temporal_Self
+     - Temporal_Self
+     temporal_position_encoding: true
+     temporal_position_encoding_max_len: 32
+     temporal_attention_dim_div: 1
+
+ noise_scheduler_kwargs:
+   beta_start: 0.00085
+   beta_end: 0.012
+   beta_schedule: "linear"
+   clip_sample: false
+   steps_offset: 1
+   ### Zero-SNR params
+   prediction_type: "v_prediction"
+   rescale_betas_zero_snr: True
+   timestep_spacing: "trailing"
+
+ sampler: DDIM
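
handler.py (added later in this commit) builds its DDIM scheduler directly from the noise_scheduler_kwargs block of this config. A minimal sketch of that wiring, assuming only diffusers and omegaconf as pinned in requirements.txt:

    from omegaconf import OmegaConf
    from diffusers import DDIMScheduler

    # Load the inference config added in this commit and build the scheduler
    # the same way handler.py does (OmegaConf -> plain dict -> DDIMScheduler).
    infer_config = OmegaConf.load("./configs/inference/inference_v2.yaml")
    sched_kwargs = OmegaConf.to_container(infer_config.noise_scheduler_kwargs)
    scheduler = DDIMScheduler(**sched_kwargs)  # v-prediction, zero-SNR rescale, trailing spacing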
configs/prompts/animation.yaml ADDED
@@ -0,0 +1,26 @@
+ pretrained_base_model_path: "./pretrained_weights/stable-diffusion-v1-5/"
+ pretrained_vae_path: "./pretrained_weights/sd-vae-ft-mse"
+ image_encoder_path: "./pretrained_weights/image_encoder"
+ denoising_unet_path: "./pretrained_weights/denoising_unet.pth"
+ reference_unet_path: "./pretrained_weights/reference_unet.pth"
+ pose_guider_path: "./pretrained_weights/pose_guider.pth"
+ motion_module_path: "./pretrained_weights/motion_module.pth"
+
+ inference_config: "./configs/inference/inference_v2.yaml"
+ weight_dtype: 'fp16'
+
+ test_cases:
+   "./configs/inference/ref_images/anyone-2.png":
+     - "./configs/inference/pose_videos/anyone-video-2_kps.mp4"
+     - "./configs/inference/pose_videos/anyone-video-5_kps.mp4"
+   "./configs/inference/ref_images/anyone-10.png":
+     - "./configs/inference/pose_videos/anyone-video-1_kps.mp4"
+     - "./configs/inference/pose_videos/anyone-video-2_kps.mp4"
+   "./configs/inference/ref_images/anyone-11.png":
+     - "./configs/inference/pose_videos/anyone-video-1_kps.mp4"
+     - "./configs/inference/pose_videos/anyone-video-2_kps.mp4"
+   "./configs/inference/ref_images/anyone-3.png":
+     - "./configs/inference/pose_videos/anyone-video-2_kps.mp4"
+     - "./configs/inference/pose_videos/anyone-video-5_kps.mp4"
+   "./configs/inference/ref_images/anyone-5.png":
+     - "./configs/inference/pose_videos/anyone-video-2_kps.mp4"
gfpgan/weights/detection_Resnet50_Final.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6d1de9c2944f2ccddca5f5e010ea5ae64a39845a86311af6fdf30841b0a5a16d
+ size 109497761
gfpgan/weights/parsing_parsenet.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3d558d8d0e42c20224f13cf5a29c79eba2d59913419f945545d8cf7b72920de2
+ size 85331193
good_face.jpeg ADDED
handler.py ADDED
@@ -0,0 +1,247 @@
+ from typing import Dict, Any
+ import torch
+ from PIL import Image
+ import base64
+ from io import BytesIO
+ import numpy as np
+ from diffusers import AutoencoderKL, DDIMScheduler
+ from einops import repeat
+ from omegaconf import OmegaConf
+ from transformers import CLIPVisionModelWithProjection
+ import cv2
+ import os
+ from backgroundremover.bg import remove as remove_bg
+ from src.models.pose_guider import PoseGuider
+ from src.models.unet_2d_condition import UNet2DConditionModel
+ from src.models.unet_3d import UNet3DConditionModel
+ from src.pipelines.pipeline_pose2vid_long import Pose2VideoPipeline
+ from src.utils.util import read_frames, get_fps, save_videos_grid
+ import roop.globals
+ from roop.core import start, decode_execution_providers, suggest_max_memory, suggest_execution_threads
+ from roop.utilities import normalize_output_path
+ from roop.processors.frame.core import get_frame_processors_modules
+
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+ if device.type != 'cuda':
+     raise ValueError("The model requires a GPU for inference.")
+
+ class EndpointHandler():
+     def __init__(self, path=""):
+         self.config = OmegaConf.load("./configs/prompts/animation.yaml")
+         self.weight_dtype = torch.float16
+         self.pipeline = None
+         self._initialize_pipeline()
+
+     def _initialize_pipeline(self):
+         vae = AutoencoderKL.from_pretrained('./pretrained_weights/sd-vae-ft-mse').to(device, dtype=self.weight_dtype)
+
+         reference_unet = UNet2DConditionModel.from_pretrained(
+             self.config.pretrained_base_model_path,
+             subfolder="unet"
+         ).to(device, dtype=self.weight_dtype)
+
+         inference_config_path = self.config.inference_config
+         infer_config = OmegaConf.load(inference_config_path)
+         denoising_unet = UNet3DConditionModel.from_pretrained_2d(
+             self.config.pretrained_base_model_path,
+             self.config.motion_module_path,
+             subfolder="unet",
+             unet_additional_kwargs=infer_config.unet_additional_kwargs,
+         ).to(device, dtype=self.weight_dtype)
+
+         pose_guider = PoseGuider(320, block_out_channels=(16, 32, 96, 256)).to(device, dtype=self.weight_dtype)
+         image_enc = CLIPVisionModelWithProjection.from_pretrained(self.config.image_encoder_path).to(device, dtype=self.weight_dtype)
+         sched_kwargs = OmegaConf.to_container(infer_config.noise_scheduler_kwargs)
+         scheduler = DDIMScheduler(**sched_kwargs)
+
+         denoising_unet.load_state_dict(torch.load(self.config.denoising_unet_path, map_location="cpu"), strict=False)
+         reference_unet.load_state_dict(torch.load(self.config.reference_unet_path, map_location="cpu"))
+         pose_guider.load_state_dict(torch.load(self.config.pose_guider_path, map_location="cpu"))
+
+         self.pipeline = Pose2VideoPipeline(
+             vae=vae,
+             image_encoder=image_enc,
+             reference_unet=reference_unet,
+             denoising_unet=denoising_unet,
+             pose_guider=pose_guider,
+             scheduler=scheduler
+         ).to(device, dtype=self.weight_dtype)
+
+     def _crop_face(self, image, save_path="cropped_face.jpg", margin=0.3):
+         # Convert image to OpenCV format
+         cv_image = np.array(image)
+         cv_image = cv_image[:, :, ::-1].copy()
+
+         # Load OpenCV face detector
+         face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
+
+         # Detect faces
+         gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY)
+         faces = face_cascade.detectMultiScale(gray, 1.1, 4)
+
+         if len(faces) == 0:
+             raise ValueError("No faces detected in the reference image.")
+
+         # Crop the first face found with a margin
+         x, y, w, h = faces[0]
+         x_margin = int(margin * w)
+         y_margin = int(margin * h)
+
+         x1 = max(0, x - x_margin)
+         y1 = max(0, y - y_margin)
+         x2 = min(cv_image.shape[1], x + w + x_margin)
+         y2 = min(cv_image.shape[0], y + h + y_margin)
+
+         cropped_face = cv_image[y1:y2, x1:x2]
+
+         # Convert back to PIL format
+         cropped_face = Image.fromarray(cropped_face[:, :, ::-1]).convert("RGB")
+
+         # Save the cropped face
+         cropped_face.save(save_path, format="JPEG", quality=95)
+
+         return cropped_face
+
+     def _swap_face(self, source_image, target_video_path):
+         # Use a predefined face image instead of the provided source_image
+         source_path = "/root/AnimateAnyone/good_face.jpeg"  # Change this to your known good face image path
+         output_path = "output.mp4"
+
+         roop.globals.source_path = source_path
+         roop.globals.target_path = target_video_path
+         roop.globals.output_path = normalize_output_path(roop.globals.source_path, roop.globals.target_path, output_path)
+         roop.globals.frame_processors = ["face_swapper", "face_enhancer"]
+         roop.globals.headless = True
+         roop.globals.keep_fps = True
+         roop.globals.keep_audio = True
+         roop.globals.keep_frames = False
+         roop.globals.many_faces = False
+         roop.globals.video_encoder = "libx264"
+         roop.globals.video_quality = 50
+         roop.globals.max_memory = suggest_max_memory()
+         roop.globals.execution_providers = decode_execution_providers(["cpu"])
+         roop.globals.execution_threads = suggest_execution_threads()
+
+         for frame_processor in get_frame_processors_modules(roop.globals.frame_processors):
+             if not frame_processor.pre_check():
+                 raise ValueError("Frame processor pre-check failed.")
+
+         print(f"Starting face swap with source: {source_path} and target: {target_video_path}")
+         start()
+         print(f"Face swap completed. Output saved to: {output_path}")
+
+         return os.path.join(os.getcwd(), output_path)
+
+
+     def remove_bg_from_image(self, image_data):
+         model_name = "u2net"  # Choose your preferred model: "u2net", "u2net_human_seg", "u2netp"
+         processed_image_data = remove_bg(
+             image_data,
+             model_name=model_name,
+             alpha_matting=True,
+             alpha_matting_foreground_threshold=240,
+             alpha_matting_background_threshold=10,
+             alpha_matting_erode_structure_size=10,
+             alpha_matting_base_size=1000
+         )
+         return processed_image_data
+
+     def _remove_background(self, input_path, output_path):
+         cap = cv2.VideoCapture(input_path)
+         if not cap.isOpened():
+             raise IOError(f"Error opening video file {input_path}")
+
+         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+         fps = int(cap.get(cv2.CAP_PROP_FPS))
+
+         fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+         out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
+
+         frame_count = 0
+         while cap.isOpened():
+             ret, frame = cap.read()
+             if not ret:
+                 break
+
+             frame_count += 1
+             pil_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+             frame_data = BytesIO()
+             pil_frame.save(frame_data, format="PNG")
+             frame_data = frame_data.getvalue()
+             processed_frame_data = self.remove_bg_from_image(frame_data)
+             processed_pil_frame = Image.open(BytesIO(processed_frame_data))
+             processed_frame = cv2.cvtColor(np.array(processed_pil_frame), cv2.COLOR_RGB2BGR)
+
+             out.write(processed_frame)
+
+         cap.release()
+         out.release()
+
+         if frame_count == 0:
+             raise IOError(f"No frames processed. Error with video file {input_path}")
+
+     def __call__(self, data: Any) -> Dict[str, str]:
+         inputs = data.get("inputs", {})
+         ref_image_base64 = inputs.get("ref_image", "")
+         pose_video_path = inputs.get("pose_video_path", "")
+         width = inputs.get("width", 512)
+         height = inputs.get("height", 768)
+         length = inputs.get("length", 24)
+         num_inference_steps = inputs.get("num_inference_steps", 25)
+         cfg = inputs.get("cfg", 3.5)
+         seed = inputs.get("seed", 123)
+
+         ref_image = Image.open(BytesIO(base64.b64decode(ref_image_base64)))
+
+         torch.manual_seed(seed)
+         pose_images = read_frames(pose_video_path)
+         src_fps = get_fps(pose_video_path)
+
+         pose_list = []
+         total_length = min(length, len(pose_images))
+         for pose_image_pil in pose_images[:total_length]:
+             pose_list.append(pose_image_pil)
+
+         video = self.pipeline(
+             ref_image,
+             pose_list,
+             width=width,
+             height=height,
+             video_length=total_length,
+             num_inference_steps=num_inference_steps,
+             guidance_scale=cfg
+         ).videos
+
+         save_dir = f"./output/gradio"
+         if not os.path.exists(save_dir):
+             os.makedirs(save_dir, exist_ok=True)
+         animation_path = os.path.join(save_dir, "animation_output.mp4")
+         save_videos_grid(video, animation_path, n_rows=1, fps=src_fps)
+
+         # Crop the face from the reference image and save it
+         cropped_face_path = os.path.join(save_dir, "cropped_face.jpg")
+         cropped_face = self._crop_face(ref_image, save_path=cropped_face_path)
+
+         # Perform face swapping
+         print(f"Starting face swap with cropped face: {cropped_face_path} and animation: {animation_path}")
+         final_video_path = self._swap_face(cropped_face, animation_path)
+         print(f"Face swap completed. Final video path: {final_video_path}")
+
+         # Ensure the output file exists before trying to open it
+         if not os.path.exists(final_video_path):
+             raise FileNotFoundError(f"Expected output file not found: {final_video_path}")
+
+         # Remove the background from the final video
+         bg_removed_video_path = os.path.join(save_dir, "bg_removed_output.mp4")
+         self._remove_background(final_video_path, bg_removed_video_path)
+         print(f"Background removal completed. Output saved to: {bg_removed_video_path}")
+
+         # Encode the final video in base64
+         with open(bg_removed_video_path, "rb") as video_file:
+             video_base64 = base64.b64encode(video_file.read()).decode("utf-8")
+
+         torch.cuda.empty_cache()
+
+         return {"video": video_base64}
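
The handler expects a JSON-style payload with a base64-encoded reference image and a pose video path, and returns the final video as base64. A hedged sketch of invoking it locally ("face.jpg" and "result.mp4" are placeholder file names; pose_video.mp4 ships with this commit):

    import base64
    from handler import EndpointHandler

    handler = EndpointHandler()
    with open("face.jpg", "rb") as f:
        ref_image_b64 = base64.b64encode(f.read()).decode("utf-8")

    result = handler({
        "inputs": {
            "ref_image": ref_image_b64,
            "pose_video_path": "pose_video.mp4",
            "width": 512, "height": 768, "length": 24,
            "num_inference_steps": 25, "cfg": 3.5, "seed": 123,
        }
    })
    with open("result.mp4", "wb") as f:
        f.write(base64.b64decode(result["video"]))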
input.jpg ADDED
models/GFPGANv1.4.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e2cd4703ab14f4d01fd1383a8a8b266f9a5833dacee8e6a79d3bf21a1b6be5ad
+ size 348632874
models/inswapper_128.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e4a3f08c753cb72d04e10aa0f7dbe3deebbf39567d4ead6dce08e98aa49e16af
+ size 554253681
output.mp4 ADDED
Binary file (96.8 kB).
 
output/gradio/animation_output.mp4 ADDED
Binary file (79.9 kB).
 
output/gradio/cropped_face.jpg ADDED
output/gradio/output_video.mp4 ADDED
Binary file (840 kB).
 
pose_video.mp4 ADDED
Binary file (755 kB).
 
pretrained_weights/DWPose/dw-ll_ucoco_384.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:724f4ff2439ed61afb86fb8a1951ec39c6220682803b4a8bd4f598cd913b1843
+ size 134399116
pretrained_weights/DWPose/yolox_l.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7860ae79de6c89a3c1eb72ae9a2756c0ccfbe04b7791bb5880afabd97855a411
+ size 216746733
pretrained_weights/denoising_unet.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b9e5a2c34fac369e8a922972ca2210916c6af175a0dad907deccf6235816ad52
+ size 3438374293
pretrained_weights/image_encoder/config.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "_name_or_path": "/home/jpinkney/.cache/huggingface/diffusers/models--lambdalabs--sd-image-variations-diffusers/snapshots/ca6f97f838ae1b5bf764f31363a21f388f4d8f3e/image_encoder",
+   "architectures": [
+     "CLIPVisionModelWithProjection"
+   ],
+   "attention_dropout": 0.0,
+   "dropout": 0.0,
+   "hidden_act": "quick_gelu",
+   "hidden_size": 1024,
+   "image_size": 224,
+   "initializer_factor": 1.0,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "model_type": "clip_vision_model",
+   "num_attention_heads": 16,
+   "num_channels": 3,
+   "num_hidden_layers": 24,
+   "patch_size": 14,
+   "projection_dim": 768,
+   "torch_dtype": "float32",
+   "transformers_version": "4.25.1"
+ }
pretrained_weights/image_encoder/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89d2aa29b5fdf64f3ad4f45fb4227ea98bc45156bbae673b85be1af7783dbabb
+ size 1215993967
pretrained_weights/motion_module.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d11e01a281b39880da2efeea892215c1313e5713fca3d100a7fbb72ee312ef9
+ size 1817900227
pretrained_weights/pose_guider.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1a8b7c1b4db92980fd977b4fd003c1396bbae9a9cdea00c35d452136d5e4f488
+ size 4351337
pretrained_weights/reference_unet.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:beddccb08d49a8b29b0f4d6d456c6521d4382a8d8d48884fa60ba8802509c214
+ size 3438323817
pretrained_weights/sd-vae-ft-mse/config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "_class_name": "AutoencoderKL",
+   "_diffusers_version": "0.4.2",
+   "act_fn": "silu",
+   "block_out_channels": [
+     128,
+     256,
+     512,
+     512
+   ],
+   "down_block_types": [
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D"
+   ],
+   "in_channels": 3,
+   "latent_channels": 4,
+   "layers_per_block": 2,
+   "norm_num_groups": 32,
+   "out_channels": 3,
+   "sample_size": 256,
+   "up_block_types": [
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D"
+   ]
+ }
pretrained_weights/sd-vae-ft-mse/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b4889b6b1d4ce7ae320a02dedaeff1780ad77d415ea0d744b476155c6377ddc
+ size 334707217
pretrained_weights/sd-vae-ft-mse/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a1d993488569e928462932c8c38a0760b874d166399b14414135bd9c42df5815
+ size 334643276
pretrained_weights/stable-diffusion-v1-5/feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "crop_size": 224,
+   "do_center_crop": true,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_resize": true,
+   "feature_extractor_type": "CLIPFeatureExtractor",
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "resample": 3,
+   "size": 224
+ }
pretrained_weights/stable-diffusion-v1-5/model_index.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "_class_name": "StableDiffusionPipeline",
+   "_diffusers_version": "0.6.0",
+   "feature_extractor": [
+     "transformers",
+     "CLIPImageProcessor"
+   ],
+   "safety_checker": [
+     "stable_diffusion",
+     "StableDiffusionSafetyChecker"
+   ],
+   "scheduler": [
+     "diffusers",
+     "PNDMScheduler"
+   ],
+   "text_encoder": [
+     "transformers",
+     "CLIPTextModel"
+   ],
+   "tokenizer": [
+     "transformers",
+     "CLIPTokenizer"
+   ],
+   "unet": [
+     "diffusers",
+     "UNet2DConditionModel"
+   ],
+   "vae": [
+     "diffusers",
+     "AutoencoderKL"
+   ]
+ }
pretrained_weights/stable-diffusion-v1-5/unet/config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "_class_name": "UNet2DConditionModel",
+   "_diffusers_version": "0.6.0",
+   "act_fn": "silu",
+   "attention_head_dim": 8,
+   "block_out_channels": [
+     320,
+     640,
+     1280,
+     1280
+   ],
+   "center_input_sample": false,
+   "cross_attention_dim": 768,
+   "down_block_types": [
+     "CrossAttnDownBlock2D",
+     "CrossAttnDownBlock2D",
+     "CrossAttnDownBlock2D",
+     "DownBlock2D"
+   ],
+   "downsample_padding": 1,
+   "flip_sin_to_cos": true,
+   "freq_shift": 0,
+   "in_channels": 4,
+   "layers_per_block": 2,
+   "mid_block_scale_factor": 1,
+   "norm_eps": 1e-05,
+   "norm_num_groups": 32,
+   "out_channels": 4,
+   "sample_size": 64,
+   "up_block_types": [
+     "UpBlock2D",
+     "CrossAttnUpBlock2D",
+     "CrossAttnUpBlock2D",
+     "CrossAttnUpBlock2D"
+   ]
+ }
pretrained_weights/stable-diffusion-v1-5/unet/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c7da0e21ba7ea50637bee26e81c220844defdf01aafca02b2c42ecdadb813de4
+ size 3438354725
pretrained_weights/stable-diffusion-v1-5/v1-inference.yaml ADDED
@@ -0,0 +1,70 @@
+ model:
+   base_learning_rate: 1.0e-04
+   target: ldm.models.diffusion.ddpm.LatentDiffusion
+   params:
+     linear_start: 0.00085
+     linear_end: 0.0120
+     num_timesteps_cond: 1
+     log_every_t: 200
+     timesteps: 1000
+     first_stage_key: "jpg"
+     cond_stage_key: "txt"
+     image_size: 64
+     channels: 4
+     cond_stage_trainable: false   # Note: different from the one we trained before
+     conditioning_key: crossattn
+     monitor: val/loss_simple_ema
+     scale_factor: 0.18215
+     use_ema: False
+
+     scheduler_config: # 10000 warmup steps
+       target: ldm.lr_scheduler.LambdaLinearScheduler
+       params:
+         warm_up_steps: [ 10000 ]
+         cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+         f_start: [ 1.e-6 ]
+         f_max: [ 1. ]
+         f_min: [ 1. ]
+
+     unet_config:
+       target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+       params:
+         image_size: 32 # unused
+         in_channels: 4
+         out_channels: 4
+         model_channels: 320
+         attention_resolutions: [ 4, 2, 1 ]
+         num_res_blocks: 2
+         channel_mult: [ 1, 2, 4, 4 ]
+         num_heads: 8
+         use_spatial_transformer: True
+         transformer_depth: 1
+         context_dim: 768
+         use_checkpoint: True
+         legacy: False
+
+     first_stage_config:
+       target: ldm.models.autoencoder.AutoencoderKL
+       params:
+         embed_dim: 4
+         monitor: val/rec_loss
+         ddconfig:
+           double_z: true
+           z_channels: 4
+           resolution: 256
+           in_channels: 3
+           out_ch: 3
+           ch: 128
+           ch_mult:
+           - 1
+           - 2
+           - 4
+           - 4
+           num_res_blocks: 2
+           attn_resolutions: []
+           dropout: 0.0
+         lossconfig:
+           target: torch.nn.Identity
+
+     cond_stage_config:
+       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
requirements.txt ADDED
@@ -0,0 +1,39 @@
+ --extra-index-url https://download.pytorch.org/whl/cu118
+
+ numpy==1.23.5
+ opencv-python==4.7.0.72
+ onnx==1.14.0
+ insightface==0.7.3
+ psutil==5.9.5
+ tk==0.1.0
+ customtkinter==5.1.3
+ pillow==9.5.0
+ torch==2.0.1+cu118; sys_platform != 'darwin'
+ torch==2.0.1; sys_platform == 'darwin'
+ torchvision==0.15.2+cu118; sys_platform != 'darwin'
+ torchvision==0.15.2; sys_platform == 'darwin'
+ onnxruntime==1.15.0; sys_platform == 'darwin' and platform_machine != 'arm64'
+ onnxruntime-silicon==1.13.1; sys_platform == 'darwin' and platform_machine == 'arm64'
+ onnxruntime-gpu==1.15.0; sys_platform != 'darwin'
+ tensorflow==2.13.0rc1; sys_platform == 'darwin'
+ tensorflow==2.12.0; sys_platform != 'darwin'
+ opennsfw2==0.10.2
+ protobuf==4.23.2
+ tqdm==4.65.0
+ gfpgan==1.3.8
+ gradio==3.40.1
+ tkinterdnd2==0.3.0; sys_platform != 'darwin' and platform_machine != 'arm64'
+ tkinterdnd2-universal==1.7.3; sys_platform == 'darwin' and platform_machine == 'arm64'
+ onnxruntime-coreml==1.13.1; python_version == '3.9' and sys_platform == 'darwin' and platform_machine != 'arm64'
+
+ # Add additional dependencies
+ diffusers==0.24.0
+ omegaconf==2.2.3
+
+ # Face swap related dependencies
+ facenet-pytorch==2.5.2
+ dlib==19.22.0
+
+
+ # Background removal
+ backgroundremover
roop/__init__.py ADDED
File without changes
roop/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (129 Bytes).
 
roop/__pycache__/capturer.cpython-310.pyc ADDED
Binary file (803 Bytes).
 
roop/__pycache__/core.cpython-310.pyc ADDED
Binary file (8.33 kB).
 
roop/__pycache__/face_analyser.cpython-310.pyc ADDED
Binary file (1.25 kB).
 
roop/__pycache__/globals.cpython-310.pyc ADDED
Binary file (525 Bytes).
 
roop/__pycache__/metadata.cpython-310.pyc ADDED
Binary file (164 Bytes).
 
roop/__pycache__/predicter.cpython-310.pyc ADDED
Binary file (1.65 kB).
 
roop/__pycache__/typing.cpython-310.pyc ADDED
Binary file (267 Bytes).
 
roop/__pycache__/ui.cpython-310.pyc ADDED
Binary file (8.39 kB).
 
roop/__pycache__/utilities.cpython-310.pyc ADDED
Binary file (5.58 kB).
 
roop/capturer.py ADDED
@@ -0,0 +1,20 @@
+ from typing import Any
+ import cv2
+
+
+ def get_video_frame(video_path: str, frame_number: int = 0) -> Any:
+     capture = cv2.VideoCapture(video_path)
+     frame_total = capture.get(cv2.CAP_PROP_FRAME_COUNT)
+     capture.set(cv2.CAP_PROP_POS_FRAMES, min(frame_total, frame_number - 1))
+     has_frame, frame = capture.read()
+     capture.release()
+     if has_frame:
+         return frame
+     return None
+
+
+ def get_video_frame_total(video_path: str) -> int:
+     capture = cv2.VideoCapture(video_path)
+     video_frame_total = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
+     capture.release()
+     return video_frame_total
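
A short usage sketch of these helpers against the pose_video.mp4 bundled in this commit (get_video_frame returns a BGR numpy array, or None if the read fails):

    from roop.capturer import get_video_frame, get_video_frame_total

    total = get_video_frame_total("pose_video.mp4")
    middle_frame = get_video_frame("pose_video.mp4", total // 2)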
roop/core.py ADDED
@@ -0,0 +1,215 @@
+ #!/usr/bin/env python3
+
+ import os
+ import sys
+ # single thread doubles cuda performance - needs to be set before torch import
+ if any(arg.startswith('--execution-provider') for arg in sys.argv):
+     os.environ['OMP_NUM_THREADS'] = '1'
+ # reduce tensorflow log level
+ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+ import warnings
+ from typing import List
+ import platform
+ import signal
+ import shutil
+ import argparse
+ import torch
+ import onnxruntime
+ import tensorflow
+
+ import roop.globals
+ import roop.metadata
+ import roop.ui as ui
+ from roop.predicter import predict_image, predict_video
+ from roop.processors.frame.core import get_frame_processors_modules
+ from roop.utilities import has_image_extension, is_image, is_video, detect_fps, create_video, extract_frames, get_temp_frame_paths, restore_audio, create_temp, move_temp, clean_temp, normalize_output_path
+
+ if 'ROCMExecutionProvider' in roop.globals.execution_providers:
+     del torch
+
+ warnings.filterwarnings('ignore', category=FutureWarning, module='insightface')
+ warnings.filterwarnings('ignore', category=UserWarning, module='torchvision')
+
+
+ def parse_args() -> None:
+     signal.signal(signal.SIGINT, lambda signal_number, frame: destroy())
+     program = argparse.ArgumentParser(formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=100))
+     program.add_argument('-s', '--source', help='select an source image', dest='source_path')
+     program.add_argument('-t', '--target', help='select an target image or video', dest='target_path')
+     program.add_argument('-o', '--output', help='select output file or directory', dest='output_path')
+     program.add_argument('--frame-processor', help='frame processors (choices: face_swapper, face_enhancer, ...)', dest='frame_processor', default=['face_swapper'], nargs='+')
+     program.add_argument('--keep-fps', help='keep original fps', dest='keep_fps', action='store_true', default=False)
+     program.add_argument('--keep-audio', help='keep original audio', dest='keep_audio', action='store_true', default=True)
+     program.add_argument('--keep-frames', help='keep temporary frames', dest='keep_frames', action='store_true', default=False)
+     program.add_argument('--many-faces', help='process every face', dest='many_faces', action='store_true', default=False)
+     program.add_argument('--video-encoder', help='adjust output video encoder', dest='video_encoder', default='libx264', choices=['libx264', 'libx265', 'libvpx-vp9'])
+     program.add_argument('--video-quality', help='adjust output video quality', dest='video_quality', type=int, default=18, choices=range(52), metavar='[0-51]')
+     program.add_argument('--max-memory', help='maximum amount of RAM in GB', dest='max_memory', type=int, default=suggest_max_memory())
+     program.add_argument('--execution-provider', help='available execution provider (choices: cpu, ...)', dest='execution_provider', default=['cpu'], choices=suggest_execution_providers(), nargs='+')
+     program.add_argument('--execution-threads', help='number of execution threads', dest='execution_threads', type=int, default=suggest_execution_threads())
+     program.add_argument('-v', '--version', action='version', version=f'{roop.metadata.name} {roop.metadata.version}')
+
+     args = program.parse_args()
+
+     roop.globals.source_path = args.source_path
+     roop.globals.target_path = args.target_path
+     roop.globals.output_path = normalize_output_path(roop.globals.source_path, roop.globals.target_path, args.output_path)
+     roop.globals.frame_processors = args.frame_processor
+     roop.globals.headless = args.source_path or args.target_path or args.output_path
+     roop.globals.keep_fps = args.keep_fps
+     roop.globals.keep_audio = args.keep_audio
+     roop.globals.keep_frames = args.keep_frames
+     roop.globals.many_faces = args.many_faces
+     roop.globals.video_encoder = args.video_encoder
+     roop.globals.video_quality = args.video_quality
+     roop.globals.max_memory = args.max_memory
+     roop.globals.execution_providers = decode_execution_providers(args.execution_provider)
+     roop.globals.execution_threads = args.execution_threads
+
+
+ def encode_execution_providers(execution_providers: List[str]) -> List[str]:
+     return [execution_provider.replace('ExecutionProvider', '').lower() for execution_provider in execution_providers]
+
+
+ def decode_execution_providers(execution_providers: List[str]) -> List[str]:
+     return [provider for provider, encoded_execution_provider in zip(onnxruntime.get_available_providers(), encode_execution_providers(onnxruntime.get_available_providers()))
+             if any(execution_provider in encoded_execution_provider for execution_provider in execution_providers)]
+
+
+ def suggest_max_memory() -> int:
+     if platform.system().lower() == 'darwin':
+         return 10
+     return 14
+
+
+ def suggest_execution_providers() -> List[str]:
+     return encode_execution_providers(onnxruntime.get_available_providers())
+
+
+ def suggest_execution_threads() -> int:
+     if 'DmlExecutionProvider' in roop.globals.execution_providers:
+         return 1
+     if 'ROCMExecutionProvider' in roop.globals.execution_providers:
+         return 1
+     return 8
+
+
+ def limit_resources() -> None:
+     # prevent tensorflow memory leak
+     gpus = tensorflow.config.experimental.list_physical_devices('GPU')
+     for gpu in gpus:
+         tensorflow.config.experimental.set_virtual_device_configuration(gpu, [
+             tensorflow.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)
+         ])
+     # limit memory usage
+     if roop.globals.max_memory:
+         memory = roop.globals.max_memory * 1024 ** 3
+         if platform.system().lower() == 'darwin':
+             memory = roop.globals.max_memory * 1024 ** 6
+         if platform.system().lower() == 'windows':
+             import ctypes
+             kernel32 = ctypes.windll.kernel32
+             kernel32.SetProcessWorkingSetSize(-1, ctypes.c_size_t(memory), ctypes.c_size_t(memory))
+         else:
+             import resource
+             resource.setrlimit(resource.RLIMIT_DATA, (memory, memory))
+
+
+ def release_resources() -> None:
+     if 'CUDAExecutionProvider' in roop.globals.execution_providers:
+         torch.cuda.empty_cache()
+
+
+ def pre_check() -> bool:
+     if sys.version_info < (3, 9):
+         update_status('Python version is not supported - please upgrade to 3.9 or higher.')
+         return False
+     if not shutil.which('ffmpeg'):
+         update_status('ffmpeg is not installed.')
+         return False
+     return True
+
+
+ def update_status(message: str, scope: str = 'ROOP.CORE') -> None:
+     print(f'[{scope}] {message}')
+     if not roop.globals.headless:
+         ui.update_status(message)
+
+
+ def start() -> None:
+     for frame_processor in get_frame_processors_modules(roop.globals.frame_processors):
+         if not frame_processor.pre_start():
+             return
+     # process image to image
+     if has_image_extension(roop.globals.target_path):
+         if predict_image(roop.globals.target_path):
+             destroy()
+         shutil.copy2(roop.globals.target_path, roop.globals.output_path)
+         for frame_processor in get_frame_processors_modules(roop.globals.frame_processors):
+             update_status('Progressing...', frame_processor.NAME)
+             frame_processor.process_image(roop.globals.source_path, roop.globals.output_path, roop.globals.output_path)
+             frame_processor.post_process()
+             release_resources()
+         if is_image(roop.globals.target_path):
+             update_status('Processing to image succeed!')
+         else:
+             update_status('Processing to image failed!')
+         return
+     # process image to videos
+     if predict_video(roop.globals.target_path):
+         destroy()
+     update_status('Creating temp resources...')
+     create_temp(roop.globals.target_path)
+     update_status('Extracting frames...')
+     extract_frames(roop.globals.target_path)
+     temp_frame_paths = get_temp_frame_paths(roop.globals.target_path)
+     for frame_processor in get_frame_processors_modules(roop.globals.frame_processors):
+         update_status('Progressing...', frame_processor.NAME)
+         frame_processor.process_video(roop.globals.source_path, temp_frame_paths)
+         frame_processor.post_process()
+         release_resources()
+     # handles fps
+     if roop.globals.keep_fps:
+         update_status('Detecting fps...')
+         fps = detect_fps(roop.globals.target_path)
+         update_status(f'Creating video with {fps} fps...')
+         create_video(roop.globals.target_path, fps)
+     else:
+         update_status('Creating video with 30.0 fps...')
+         create_video(roop.globals.target_path)
+     # handle audio
+     if roop.globals.keep_audio:
+         if roop.globals.keep_fps:
+             update_status('Restoring audio...')
+         else:
+             update_status('Restoring audio might cause issues as fps are not kept...')
+         restore_audio(roop.globals.target_path, roop.globals.output_path)
+     else:
+         move_temp(roop.globals.target_path, roop.globals.output_path)
+     # clean and validate
+     clean_temp(roop.globals.target_path)
+     if is_video(roop.globals.target_path):
+         update_status('Processing to video succeed!')
+     else:
+         update_status('Processing to video failed!')
+
+
+ def destroy() -> None:
+     if roop.globals.target_path:
+         clean_temp(roop.globals.target_path)
+     quit()
+
+
+ def run() -> None:
+     parse_args()
+     if not pre_check():
+         return
+     for frame_processor in get_frame_processors_modules(roop.globals.frame_processors):
+         if not frame_processor.pre_check():
+             return
+     limit_resources()
+     if roop.globals.headless:
+         start()
+     else:
+         window = ui.init(start, destroy)
+         window.mainloop()
+
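
start() reads all of its settings from roop.globals, which is how handler.py drives the face swap headlessly without the Tk UI. A minimal sketch of that programmatic setup, mirroring handler.py._swap_face (the source/target paths below are placeholders):

    import roop.globals
    from roop.core import start, decode_execution_providers, suggest_max_memory, suggest_execution_threads
    from roop.utilities import normalize_output_path

    roop.globals.source_path = "good_face.jpeg"                       # placeholder source face
    roop.globals.target_path = "output/gradio/animation_output.mp4"   # placeholder target video
    roop.globals.output_path = normalize_output_path(roop.globals.source_path, roop.globals.target_path, "output.mp4")
    roop.globals.frame_processors = ["face_swapper"]
    roop.globals.headless = True
    roop.globals.keep_fps = True
    roop.globals.keep_audio = True
    roop.globals.keep_frames = False
    roop.globals.many_faces = False
    roop.globals.video_encoder = "libx264"
    roop.globals.video_quality = 18
    roop.globals.max_memory = suggest_max_memory()
    roop.globals.execution_providers = decode_execution_providers(["cpu"])
    roop.globals.execution_threads = suggest_execution_threads()
    start()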
roop/face_analyser.py ADDED
@@ -0,0 +1,34 @@
+ import threading
+ from typing import Any
+ import insightface
+
+ import roop.globals
+ from roop.typing import Frame
+
+ FACE_ANALYSER = None
+ THREAD_LOCK = threading.Lock()
+
+
+ def get_face_analyser() -> Any:
+     global FACE_ANALYSER
+
+     with THREAD_LOCK:
+         if FACE_ANALYSER is None:
+             FACE_ANALYSER = insightface.app.FaceAnalysis(name='buffalo_l', providers=roop.globals.execution_providers)
+             FACE_ANALYSER.prepare(ctx_id=0, det_size=(640, 640))
+     return FACE_ANALYSER
+
+
+ def get_one_face(frame: Frame) -> Any:
+     face = get_face_analyser().get(frame)
+     try:
+         return min(face, key=lambda x: x.bbox[0])
+     except ValueError:
+         return None
+
+
+ def get_many_faces(frame: Frame) -> Any:
+     try:
+         return get_face_analyser().get(frame)
+     except IndexError:
+         return None
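
get_one_face returns the left-most detected face (smallest bounding-box x), or None when nothing is found. A brief sketch against the input.jpg bundled in this commit:

    import cv2
    from roop.face_analyser import get_one_face

    # Detect the left-most face in the bundled input.jpg (None if no face is found).
    frame = cv2.imread("input.jpg")
    face = get_one_face(frame)
    if face is not None:
        print("Face bbox:", face.bbox)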
roop/globals.py ADDED
@@ -0,0 +1,17 @@
+ from typing import List
+
+ source_path = None
+ target_path = None
+ output_path = None
+ frame_processors: List[str] = []
+ keep_fps = None
+ keep_audio = None
+ keep_frames = None
+ many_faces = None
+ video_encoder = None
+ video_quality = None
+ max_memory = None
+ execution_providers: List[str] = []
+ execution_threads = None
+ headless = None
+ log_level = 'error'
roop/metadata.py ADDED
@@ -0,0 +1,2 @@
+ name = 'roop'
+ version = '1.1.0'
roop/predicter.py ADDED
@@ -0,0 +1,43 @@
+ import threading
+ import numpy
+ import opennsfw2
+ from PIL import Image
+ from keras import Model
+
+ from roop.typing import Frame
+
+ PREDICTOR = None
+ THREAD_LOCK = threading.Lock()
+ MAX_PROBABILITY = 0.85
+
+
+ def get_predictor() -> Model:
+     global PREDICTOR
+
+     with THREAD_LOCK:
+         if PREDICTOR is None:
+             PREDICTOR = opennsfw2.make_open_nsfw_model()
+     return PREDICTOR
+
+
+ def clear_predictor() -> None:
+     global PREDICTOR
+
+     PREDICTOR = None
+
+
+ def predict_frame(target_frame: Frame) -> bool:
+     image = Image.fromarray(target_frame)
+     image = opennsfw2.preprocess_image(image, opennsfw2.Preprocessing.YAHOO)
+     views = numpy.expand_dims(image, axis=0)
+     _, probability = get_predictor().predict(views)[0]
+     return probability > MAX_PROBABILITY
+
+
+ def predict_image(target_path: str) -> bool:
+     return opennsfw2.predict_image(target_path) > MAX_PROBABILITY
+
+
+ def predict_video(target_path: str) -> bool:
+     _, probabilities = opennsfw2.predict_video_frames(video_path=target_path, frame_interval=100)
+     return any(probability > MAX_PROBABILITY for probability in probabilities)
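
roop.core.start() calls predict_image/predict_video as an NSFW gate and aborts (destroy()) when they return True. A hedged sketch of running the same check standalone on the animation output produced by handler.py:

    from roop.predicter import predict_video

    # Mirrors the gate in roop.core.start(): reject the target before face swapping.
    if predict_video("output/gradio/animation_output.mp4"):
        raise SystemExit("Target video rejected by the NSFW check.")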