root committed on
Commit 0be44e5 · 1 Parent(s): f13cdf8
Files changed (48)
  1. MusePose +1 -0
  2. __pycache__/handler.cpython-310.pyc +0 -0
  3. assets/poses/align/img_downloaded_image_video_downloaded_video.mp4 +0 -0
  4. assets/poses/align/img_rithwik_video_pose-2.mp4 +0 -0
  5. extract_dwpose_from_vid.py +101 -0
  6. handler.py +142 -92
  7. input.jpg +0 -0
  8. me.jpeg +0 -0
  9. output.mp4 +0 -0
  10. output/gradio/animation_output.mp4 +0 -0
  11. post_install.sh +14 -0
  12. pretrained_weights/DWPose/dw-ll_ucoco_384.onnx +0 -3
  13. pretrained_weights/DWPose/yolox_l.onnx +0 -3
  14. pretrained_weights/denoising_unet.pth +0 -3
  15. pretrained_weights/image_encoder/config.json +0 -23
  16. pretrained_weights/image_encoder/pytorch_model.bin +0 -3
  17. pretrained_weights/motion_module.pth +0 -3
  18. pretrained_weights/pose_guider.pth +0 -3
  19. pretrained_weights/reference_unet.pth +0 -3
  20. pretrained_weights/sd-vae-ft-mse/config.json +0 -29
  21. pretrained_weights/sd-vae-ft-mse/diffusion_pytorch_model.bin +0 -3
  22. pretrained_weights/stable-diffusion-v1-5/unet/config.json +0 -36
  23. pretrained_weights/stable-diffusion-v1-5/unet/diffusion_pytorch_model.bin +0 -3
  24. requirements.txt +4 -1
  25. roop-unleashed +1 -0
  26. roop/__pycache__/metadata.cpython-310.pyc +0 -0
  27. roop/__pycache__/typing.cpython-310.pyc +0 -0
  28. sampler.py +7 -11
  29. sped_up_pose_video.mp4 +0 -0
  30. src/__init__.py +0 -0
  31. src/__pycache__/__init__.cpython-310.pyc +0 -0
  32. src/dataset/dance_image.py +124 -0
  33. src/dataset/dance_video.py +137 -0
  34. src/dwpose/__init__.py +123 -0
  35. src/dwpose/__pycache__/__init__.cpython-310.pyc +0 -0
  36. src/dwpose/__pycache__/onnxdet.cpython-310.pyc +0 -0
  37. src/dwpose/__pycache__/onnxpose.cpython-310.pyc +0 -0
  38. src/dwpose/__pycache__/util.cpython-310.pyc +0 -0
  39. src/dwpose/__pycache__/wholebody.cpython-310.pyc +0 -0
  40. src/dwpose/onnxdet.py +130 -0
  41. src/dwpose/onnxpose.py +370 -0
  42. src/dwpose/util.py +378 -0
  43. src/dwpose/wholebody.py +48 -0
  44. src/utils/__pycache__/util.cpython-310.pyc +0 -0
  45. tools/download_weights.py +111 -0
  46. tools/extract_meta_info.py +37 -0
  47. tools/facetracker_api.py +62 -0
  48. tools/vid2pose.py +38 -0
MusePose ADDED
@@ -0,0 +1 @@
+ Subproject commit 124543e3ff347b508a2c489c4344f5f40190c5d3
__pycache__/handler.cpython-310.pyc CHANGED
Binary files a/__pycache__/handler.cpython-310.pyc and b/__pycache__/handler.cpython-310.pyc differ
 
assets/poses/align/img_downloaded_image_video_downloaded_video.mp4 ADDED
Binary file (543 kB).
 
assets/poses/align/img_rithwik_video_pose-2.mp4 ADDED
Binary file (816 kB).
 
extract_dwpose_from_vid.py ADDED
@@ -0,0 +1,101 @@
+ import concurrent.futures
+ import os
+ import random
+ from pathlib import Path
+
+ import numpy as np
+
+ from src.dwpose import DWposeDetector
+ from src.utils.util import get_fps, read_frames, save_videos_from_pil
+
+ # Extract dwpose mp4 videos from raw videos
+ # /path/to/video_dataset/*/*.mp4 -> /path/to/video_dataset_dwpose/*/*.mp4
+
+
+ def process_single_video(video_path, detector, root_dir, save_dir):
+     relative_path = os.path.relpath(video_path, root_dir)
+     print(relative_path, video_path, root_dir)
+     out_path = os.path.join(save_dir, relative_path)
+     if os.path.exists(out_path):
+         return
+
+     output_dir = Path(os.path.dirname(os.path.join(save_dir, relative_path)))
+     if not output_dir.exists():
+         output_dir.mkdir(parents=True, exist_ok=True)
+
+     fps = get_fps(video_path)
+     frames = read_frames(video_path)
+     kps_results = []
+     for i, frame_pil in enumerate(frames):
+         result, score = detector(frame_pil)
+         score = np.mean(score, axis=-1)
+
+         kps_results.append(result)
+
+     save_videos_from_pil(kps_results, out_path, fps=fps)
+
+
+ def process_batch_videos(video_list, detector, root_dir, save_dir):
+     for i, video_path in enumerate(video_list):
+         print(f"Process {i}/{len(video_list)} video")
+         process_single_video(video_path, detector, root_dir, save_dir)
+
+
+ if __name__ == "__main__":
+     # -----
+     # NOTE:
+     # python tools/extract_dwpose_from_vid.py --video_root /path/to/video_dir
+     # -----
+     import argparse
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--video_root", type=str)
+     parser.add_argument(
+         "--save_dir", type=str, help="Path to save extracted pose videos"
+     )
+     parser.add_argument("-j", type=int, default=4, help="Num workers")
+     args = parser.parse_args()
+     num_workers = args.j
+     if args.save_dir is None:
+         save_dir = args.video_root + "_dwpose"
+     else:
+         save_dir = args.save_dir
+     if not os.path.exists(save_dir):
+         os.makedirs(save_dir)
+     cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
+     gpu_ids = [int(id) for id in range(len(cuda_visible_devices.split(",")))]
+     print(f"available gpu ids: {gpu_ids}")
+
+     # collect all video_folder paths
+     video_mp4_paths = set()
+     for root, dirs, files in os.walk(args.video_root):
+         for name in files:
+             if name.endswith(".mp4"):
+                 video_mp4_paths.add(os.path.join(root, name))
+     video_mp4_paths = list(video_mp4_paths)
+     random.shuffle(video_mp4_paths)
+
+     # split into chunks
+     batch_size = (len(video_mp4_paths) + num_workers - 1) // num_workers
+     print(f"Num videos: {len(video_mp4_paths)} {batch_size = }")
+     video_chunks = [
+         video_mp4_paths[i : i + batch_size]
+         for i in range(0, len(video_mp4_paths), batch_size)
+     ]
+
+     with concurrent.futures.ThreadPoolExecutor() as executor:
+         futures = []
+         for i, chunk in enumerate(video_chunks):
+             # init detector
+             gpu_id = gpu_ids[i % len(gpu_ids)]
+             detector = DWposeDetector()
+             # torch.cuda.set_device(gpu_id)
+             detector = detector.to(f"cuda:{gpu_id}")
+
+             futures.append(
+                 executor.submit(
+                     process_batch_videos, chunk, detector, args.video_root, save_dir
+                 )
+             )
+         for future in concurrent.futures.as_completed(futures):
+             future.result()

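The extraction script above can also be driven per video for a quick sanity check. The following is a minimal sketch only: the paths are placeholders, and it assumes the src.dwpose and src.utils.util helpers imported by the script are importable and a CUDA device is available.

    # Minimal sketch: run DWpose extraction on a single clip (paths are placeholders).
    from src.dwpose import DWposeDetector
    from extract_dwpose_from_vid import process_single_video

    detector = DWposeDetector().to("cuda:0")      # the ONNX sessions are created inside .to()
    process_single_video(
        video_path="/data/videos/clip_0001.mp4",  # hypothetical input video
        detector=detector,
        root_dir="/data/videos",                  # dataset root being mirrored
        save_dir="/data/videos_dwpose",           # output root; the sub-directory tree is recreated here
    )
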
handler.py CHANGED
@@ -4,28 +4,32 @@ from PIL import Image
  import base64
  from io import BytesIO
  import numpy as np
- from diffusers import AutoencoderKL, DDIMScheduler
+ # from diffusers import AutoencoderKL, DDIMScheduler
  from einops import repeat
  from omegaconf import OmegaConf
- from transformers import CLIPVisionModelWithProjection
+ # from transformers import CLIPVisionModelWithProjection
  import cv2
  import os
  import sys
  import skvideo.io
- from src.models.pose_guider import PoseGuider
- from src.models.unet_2d_condition import UNet2DConditionModel
- from src.models.unet_3d import UNet3DConditionModel
- from src.pipelines.pipeline_pose2vid_long import Pose2VideoPipeline
- from src.utils.util import read_frames, get_fps, save_videos_grid
+ # from src.models.pose_guider import PoseGuider
+ # from src.models.unet_2d_condition import UNet2DConditionModel
+ # from src.models.unet_3d import UNet3DConditionModel
+ # from src.pipelines.pipeline_pose2vid_long import Pose2VideoPipeline
+ # from src.utils.util import read_frames, get_fps, save_videos_grid
  import roop.globals
  from roop.core import start, decode_execution_providers, suggest_max_memory, suggest_execution_threads
  from roop.utilities import normalize_output_path
  from roop.processors.frame.core import get_frame_processors_modules

- import onnxruntime as ort
+ # import onnxruntime as ort
  import gc
  import subprocess

+ import requests
+ import tempfile
+
+
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  if device.type != 'cuda':
@@ -39,11 +43,21 @@ class EndpointHandler():
          if not os.path.exists(config_path):
              raise FileNotFoundError(f"The configuration file was not found at: {config_path}")

+         self.run_post_install()
          self.config = OmegaConf.load(config_path)
          self.weight_dtype = torch.float16
          self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
          self.pipeline = None
-         self._initialize_pipeline()
+         # self._initialize_pipeline()
+
+     def run_post_install(self):
+         try:
+             result = subprocess.run(['bash', 'post_install.sh'], check=True, capture_output=True, text=True)
+             print("Post-install script ran successfully.")
+             print(result.stdout)
+         except subprocess.CalledProcessError as e:
+             print("Error running post-install script.")
+             print(e.stderr)

      def _initialize_pipeline(self):
          base_dir = os.path.dirname(os.path.abspath(__file__))
@@ -127,9 +141,9 @@

          return cropped_face

-     def _swap_face(self, source_image, target_video_path):
-         source_path = "input.jpg"
-         source_image.save(source_path, format="JPEG", quality=95)
+     def _swap_face(self, source_path, target_video_path):
+         # source_path = "input.jpg"
+         # source_image.save(source_path, format="JPEG", quality=95)
          output_path = "output.mp4"

          roop.globals.source_path = source_path
@@ -141,8 +155,8 @@
          roop.globals.keep_audio = True
          roop.globals.keep_frames = False
          roop.globals.many_faces = False
-         roop.globals.video_encoder = "libx264"
-         roop.globals.video_quality = 100
+         # roop.globals.video_encoder = "libx264"
+         roop.globals.video_quality = 50
          roop.globals.max_memory = suggest_max_memory()

          # Set GPU execution provider
@@ -250,83 +264,119 @@
          if result.returncode != 0:
              raise RuntimeError(f"FFmpeg slow down failed with exit code {result.returncode}")

+     def download_file(self, url: str, save_path: str):
+         response = requests.get(url, stream=True)
+         if response.status_code == 200:
+             with open(save_path, 'wb') as f:
+                 for chunk in response.iter_content(chunk_size=8192):
+                     f.write(chunk)
+         else:
+             raise ValueError(f"Failed to download file from {url}")
+
+     def print_directory_contents(self, directory):
+         for root, dirs, files in os.walk(directory):
+             level = root.replace(directory, '').count(os.sep)
+             indent = ' ' * 4 * (level)
+             print(f"{indent}{os.path.basename(root)}/")
+             subindent = ' ' * 4 * (level + 1)
+             for f in files:
+                 print(f"{subindent}{f}")
+
      def __call__(self, data: Any) -> Dict[str, str]:
          inputs = data.get("inputs", {})
-         ref_image_base64 = inputs.get("ref_image", "")
-         pose_video_path = inputs.get("pose_video_path", "")
-         width = inputs.get("width", 512)
-         height = inputs.get("height", 768)
-         length = inputs.get("length", 24)
-         num_inference_steps = inputs.get("num_inference_steps", 25)
-         cfg = inputs.get("cfg", 3.5)
-         seed = inputs.get("seed", 123)
-
-         ref_image = Image.open(BytesIO(base64.b64decode(ref_image_base64)))
-
-         # Get the base directory of the current file
-         base_dir = os.path.dirname(os.path.abspath(__file__))
-
-         # Update pose_video_path to use the base directory
-         pose_video_path = os.path.join(base_dir, pose_video_path)
-
-         if not os.path.exists(pose_video_path):
-             raise FileNotFoundError(f"The pose video was not found at: {pose_video_path}")
-
-         # Speed up the pose video by 4x
-         sped_up_pose_video_path = os.path.join(base_dir, "sped_up_pose_video.mp4")
-         self.speed_up_video(pose_video_path, sped_up_pose_video_path, factor=4)
-
-         torch.manual_seed(seed)
-         pose_images = read_frames(sped_up_pose_video_path)
-         src_fps = get_fps(sped_up_pose_video_path)
-
-         pose_list = []
-         total_length = min(length, len(pose_images))
-         for pose_image_pil in pose_images[:total_length]:
-             pose_list.append(pose_image_pil)
-
-         video = self.pipeline(
-             ref_image,
-             pose_list,
-             width=width,
-             height=height,
-             video_length=total_length,
-             num_inference_steps=num_inference_steps,
-             guidance_scale=cfg
-         ).videos
-
-         save_dir = os.path.join(base_dir, "output", "gradio")
-         if not os.path.exists(save_dir):
-             os.makedirs(save_dir, exist_ok=True)
-         animation_path = os.path.join(save_dir, "animation_output.mp4")
-         save_videos_grid(video, animation_path, n_rows=1, fps=src_fps)
-
-         # Crop the face from the reference image and save it
-         cropped_face_path = os.path.join(save_dir, "cropped_face.jpg")
-         cropped_face = self._crop_face(ref_image, save_path=cropped_face_path)
-
-         # Delete the pipeline and clear CUDA cache to free up memory
-         del self.pipeline
-         torch.cuda.empty_cache()
-
-         # Perform face swapping
-         swapped_face_video_path = self._swap_face(cropped_face, animation_path)
-
-         # Slow down the produced video by 4x
-         slowed_down_animation_path = os.path.join(save_dir, "slowed_down_animation_output.mp4")
-         self.slow_down_video(swapped_face_video_path, slowed_down_animation_path, factor=4)
-
-         # Clear CUDA cache before RIFE interpolation
-         torch.cuda.empty_cache()
-
-         # Perform RIFE interpolation
-         rife_output_path = os.path.join(save_dir, "completed_result.mp4")
-         self.run_rife_interpolation(slowed_down_animation_path, rife_output_path, multi=2, scale=0.5)
-
-         # Encode the final video in base64
-         with open(rife_output_path, "rb") as video_file:
-             video_base64 = base64.b64encode(video_file.read()).decode("utf-8")
-
-         torch.cuda.empty_cache()
-
-         return {"video": video_base64}
+         ref_image_url = inputs.get("ref_image_url", "")
+         video_url = inputs.get("video_url", "")
+
+         # Create a unique temporary directory for this request
+         with tempfile.TemporaryDirectory() as temp_dir:
+             print(f"Temporary directory created at {temp_dir}")  # Debug statement
+             video_root = os.path.join(temp_dir, "dw_poses_videos")
+             os.makedirs(video_root, exist_ok=True)
+             downloaded_video_path = os.path.join(video_root, "downloaded_video.mp4")
+             downloaded_image_path = os.path.join(video_root, "downloaded_image.jpg")
+
+             # Download the video from the URL
+             self.download_file(video_url, downloaded_video_path)
+
+             # Download the reference image from the URL
+             self.download_file(ref_image_url, downloaded_image_path)
+             ref_image = Image.open(downloaded_image_path)
+
+             pose_output_path = os.path.join(temp_dir, "pose_videos")
+
+             # Run the MusePose pose alignment script
+             command = [
+                 "python", "./MusePose/pose_align.py",
+                 "--imgfn_refer", downloaded_image_path,
+                 "--vidfn", './pose_video.mp4',
+                 "--output_dir", pose_output_path
+             ]
+             result = subprocess.run(command, capture_output=True, text=True)
+             if result.returncode != 0:
+                 raise RuntimeError(f"Error running pose_align.py: {result.stderr}")
+
+             # Locate the extracted pose video
+             pose_video_path = os.path.join(pose_output_path, "pose_video.mp4")
+
+             if not os.path.exists(pose_video_path):
+                 print(f"Error running pose_align.py: {result.stderr}")
+                 print("Contents of the temporary directory:")
+                 self.print_directory_contents(temp_dir)
+                 raise FileNotFoundError(f"The pose video was not found at: {pose_video_path}")
+
+             # Speed up the pose video
+             sped_up_pose_video_path = os.path.join(temp_dir, "sped_up_pose_video.mp4")
+             self.speed_up_video(pose_video_path, sped_up_pose_video_path, factor=1)
+
+             dancing_video_dir = os.path.join(temp_dir, "dancing_video")
+             dancing_video_path_final = os.path.join(temp_dir, "dancing_video", "dance.mp4")  # This is in create_video, can change there
+
+             command = [
+                 "python", "./MusePose/create_video.py",
+                 "--ref_image_path", downloaded_image_path,
+                 "--pose_video_path", sped_up_pose_video_path,
+                 "-W", "512",
+                 "-H", "512",
+                 "--output_dir", dancing_video_dir
+             ]
+             result = subprocess.run(command, capture_output=True, text=True)
+             if result.returncode != 0:
+                 raise RuntimeError(f"Error running create_video.py: {result.stderr}")
+
+             # save_dir = os.path.join(temp_dir, "output")
+             # if not os.path.exists(save_dir):
+             #     os.makedirs(save_dir, exist_ok=True)
+             # animation_path = os.path.join(save_dir, "animation_output.mp4")
+             # save_videos_grid(video, animation_path, n_rows=1, fps=src_fps)
+
+             # Crop the face from the reference image and save it
+             cropped_face_path = os.path.join(temp_dir, "cropped_face.jpg")
+             cropped_face = self._crop_face(ref_image, save_path=cropped_face_path)
+
+             # Delete the pipeline and clear CUDA cache to free up memory
+             del self.pipeline
+             torch.cuda.empty_cache()
+
+             # Perform face swapping
+             # self.print_directory_contents(temp_dir)
+             # swapped_face_video_path = self._swap_face(cropped_face_path, animation_path)
+
+             # Slow down the produced video
+             self.print_directory_contents(temp_dir)
+             slowed_down_animation_path = os.path.join(temp_dir, "slowed_down_animation_output.mp4")
+             self.slow_down_video(dancing_video_path_final, slowed_down_animation_path, factor=1)
+
+             # Clear CUDA cache before RIFE interpolation
+             torch.cuda.empty_cache()
+
+             # Perform RIFE interpolation
+             # rife_output_path = os.path.join(temp_dir, "completed_result.mp4")
+             # self.run_rife_interpolation(slowed_down_animation_path, rife_output_path, multi=2, scale=0.5)
+
+             # Encode the final video in base64
+             with open(slowed_down_animation_path, "rb") as video_file:
+                 video_base64 = base64.b64encode(video_file.read()).decode("utf-8")
+
+             torch.cuda.empty_cache()
+
+             return {"video": video_base64}

input.jpg CHANGED
me.jpeg ADDED
output.mp4 DELETED
Binary file (79.8 kB)
 
output/gradio/animation_output.mp4 CHANGED
Binary files a/output/gradio/animation_output.mp4 and b/output/gradio/animation_output.mp4 differ
 
post_install.sh ADDED
@@ -0,0 +1,14 @@
+ #!/bin/bash
+
+ pip install --no-cache-dir -U openmim
+ mim install mmengine
+ mim install "mmcv>=2.0.1"
+ mim install "mmdet>=3.1.0"
+ mim install "mmpose>=1.1.0"
+
+ # onnxruntime==1.16.3; sys_platform == 'darwin' and platform_machine != 'arm64'
+ # onnxruntime-silicon==1.13.1; sys_platform == 'darwin' and platform_machine == 'arm64'
+ # onnxruntime-gpu==1.16.3; sys_platform != 'darwin'
+ # onnxruntime-coreml==1.13.1; python_version == '3.9' and sys_platform == 'darwin' and platform_machine != 'arm64'
+ #onnx==1.14.0
+ #protobuf==4.23.2
pretrained_weights/DWPose/dw-ll_ucoco_384.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:724f4ff2439ed61afb86fb8a1951ec39c6220682803b4a8bd4f598cd913b1843
- size 134399116

pretrained_weights/DWPose/yolox_l.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:7860ae79de6c89a3c1eb72ae9a2756c0ccfbe04b7791bb5880afabd97855a411
- size 216746733

pretrained_weights/denoising_unet.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b9e5a2c34fac369e8a922972ca2210916c6af175a0dad907deccf6235816ad52
- size 3438374293

pretrained_weights/image_encoder/config.json DELETED
@@ -1,23 +0,0 @@
- {
-   "_name_or_path": "/home/jpinkney/.cache/huggingface/diffusers/models--lambdalabs--sd-image-variations-diffusers/snapshots/ca6f97f838ae1b5bf764f31363a21f388f4d8f3e/image_encoder",
-   "architectures": [
-     "CLIPVisionModelWithProjection"
-   ],
-   "attention_dropout": 0.0,
-   "dropout": 0.0,
-   "hidden_act": "quick_gelu",
-   "hidden_size": 1024,
-   "image_size": 224,
-   "initializer_factor": 1.0,
-   "initializer_range": 0.02,
-   "intermediate_size": 4096,
-   "layer_norm_eps": 1e-05,
-   "model_type": "clip_vision_model",
-   "num_attention_heads": 16,
-   "num_channels": 3,
-   "num_hidden_layers": 24,
-   "patch_size": 14,
-   "projection_dim": 768,
-   "torch_dtype": "float32",
-   "transformers_version": "4.25.1"
- }

pretrained_weights/image_encoder/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:89d2aa29b5fdf64f3ad4f45fb4227ea98bc45156bbae673b85be1af7783dbabb
- size 1215993967

pretrained_weights/motion_module.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:0d11e01a281b39880da2efeea892215c1313e5713fca3d100a7fbb72ee312ef9
- size 1817900227

pretrained_weights/pose_guider.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:1a8b7c1b4db92980fd977b4fd003c1396bbae9a9cdea00c35d452136d5e4f488
- size 4351337

pretrained_weights/reference_unet.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:beddccb08d49a8b29b0f4d6d456c6521d4382a8d8d48884fa60ba8802509c214
- size 3438323817

pretrained_weights/sd-vae-ft-mse/config.json DELETED
@@ -1,29 +0,0 @@
- {
-   "_class_name": "AutoencoderKL",
-   "_diffusers_version": "0.4.2",
-   "act_fn": "silu",
-   "block_out_channels": [
-     128,
-     256,
-     512,
-     512
-   ],
-   "down_block_types": [
-     "DownEncoderBlock2D",
-     "DownEncoderBlock2D",
-     "DownEncoderBlock2D",
-     "DownEncoderBlock2D"
-   ],
-   "in_channels": 3,
-   "latent_channels": 4,
-   "layers_per_block": 2,
-   "norm_num_groups": 32,
-   "out_channels": 3,
-   "sample_size": 256,
-   "up_block_types": [
-     "UpDecoderBlock2D",
-     "UpDecoderBlock2D",
-     "UpDecoderBlock2D",
-     "UpDecoderBlock2D"
-   ]
- }

pretrained_weights/sd-vae-ft-mse/diffusion_pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:1b4889b6b1d4ce7ae320a02dedaeff1780ad77d415ea0d744b476155c6377ddc
- size 334707217

pretrained_weights/stable-diffusion-v1-5/unet/config.json DELETED
@@ -1,36 +0,0 @@
- {
-   "_class_name": "UNet2DConditionModel",
-   "_diffusers_version": "0.6.0",
-   "act_fn": "silu",
-   "attention_head_dim": 8,
-   "block_out_channels": [
-     320,
-     640,
-     1280,
-     1280
-   ],
-   "center_input_sample": false,
-   "cross_attention_dim": 768,
-   "down_block_types": [
-     "CrossAttnDownBlock2D",
-     "CrossAttnDownBlock2D",
-     "CrossAttnDownBlock2D",
-     "DownBlock2D"
-   ],
-   "downsample_padding": 1,
-   "flip_sin_to_cos": true,
-   "freq_shift": 0,
-   "in_channels": 4,
-   "layers_per_block": 2,
-   "mid_block_scale_factor": 1,
-   "norm_eps": 1e-05,
-   "norm_num_groups": 32,
-   "out_channels": 4,
-   "sample_size": 64,
-   "up_block_types": [
-     "UpBlock2D",
-     "CrossAttnUpBlock2D",
-     "CrossAttnUpBlock2D",
-     "CrossAttnUpBlock2D"
-   ]
- }

pretrained_weights/stable-diffusion-v1-5/unet/diffusion_pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c7da0e21ba7ea50637bee26e81c220844defdf01aafca02b2c42ecdadb813de4
- size 3438354725

requirements.txt CHANGED
@@ -23,6 +23,7 @@ gfpgan==1.3.8
  gradio==3.41.2
  onnxruntime-coreml==1.13.1; python_version == '3.9' and sys_platform == 'darwin' and platform_machine != 'arm64'
  transformers==4.41.1
+ controlnet-aux==0.0.7

  # Add additional dependencies
  diffusers==0.24.0
@@ -53,4 +54,6 @@ torchsde==0.2.5

  # Additional dependencies for RIFE
  sk-video==1.1.10
- moviepy==1.0.3
+ moviepy==1.0.3
+
+ requests==2.32.3
roop-unleashed ADDED
@@ -0,0 +1 @@
+ Subproject commit ed6e3dbcf875213051dbc3b095e570afd3557463
roop/__pycache__/metadata.cpython-310.pyc CHANGED
Binary files a/roop/__pycache__/metadata.cpython-310.pyc and b/roop/__pycache__/metadata.cpython-310.pyc differ
 
roop/__pycache__/typing.cpython-310.pyc CHANGED
Binary files a/roop/__pycache__/typing.cpython-310.pyc and b/roop/__pycache__/typing.cpython-310.pyc differ
 
sampler.py CHANGED
@@ -7,15 +7,11 @@ import io
  # Initialize the handler
  handler = EndpointHandler()

- # Read a sample reference image and encode it in base64
- with open("rithwik.png", "rb") as image_file:
-     ref_image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
-
  # Define sample inputs
  inputs = {
      "inputs": {
-         "ref_image": ref_image_base64,
-         "pose_video_path": "pose_video.mp4",
+         "ref_image_url": "https://media.discordapp.net/attachments/1183633414612594708/1245882096116043887/image.jpg?ex=665a5d9f&is=66590c1f&hm=3065fed7b8f5bd13aa2c8ad7d97e625dd4c2977589dbe7d8c13d024b782ab25a&=&format=webp&width=672&height=1194",
+         "video_url": "https://cdn.discordapp.com/attachments/1237667074210267217/1245971599660679208/pose.mov?ex=665ab0fa&is=66595f7a&hm=63691e23a23ebd8657a10ec708d63a06046a124c3940aa133de22a94aa1fd6c5&",
          "width": 378,
          "height": 504,
          "length": 24,
@@ -29,11 +25,11 @@ inputs = {
  output = handler(inputs)

  # # Decode the base64 video output
- # video_base64 = output.get("video", "")
- # video_bytes = base64.b64decode(video_base64)
+ video_base64 = output.get("video", "")
+ video_bytes = base64.b64decode(video_base64)

- # # Save the video to a file
- # with open("output_video.mp4", "wb") as video_file:
- #     video_file.write(video_bytes)
+ # Save the video to a file
+ with open("output_video.mp4", "wb") as video_file:
+     video_file.write(video_bytes)

  print("Inference completed. Output video saved as output_video.mp4")

sped_up_pose_video.mp4 DELETED
Binary file (131 kB)
 
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (128 Bytes).
 
src/dataset/dance_image.py ADDED
@@ -0,0 +1,124 @@
+ import json
+ import random
+
+ import torch
+ import torchvision.transforms as transforms
+ from decord import VideoReader
+ from PIL import Image
+ from torch.utils.data import Dataset
+ from transformers import CLIPImageProcessor
+
+
+ class HumanDanceDataset(Dataset):
+     def __init__(
+         self,
+         img_size,
+         img_scale=(1.0, 1.0),
+         img_ratio=(0.9, 1.0),
+         drop_ratio=0.1,
+         data_meta_paths=["./data/fahsion_meta.json"],
+         sample_margin=30,
+     ):
+         super().__init__()
+
+         self.img_size = img_size
+         self.img_scale = img_scale
+         self.img_ratio = img_ratio
+         self.sample_margin = sample_margin
+
+         # -----
+         # vid_meta format:
+         # [{'video_path': , 'kps_path': , 'other':},
+         #  {'video_path': , 'kps_path': , 'other':}]
+         # -----
+         vid_meta = []
+         for data_meta_path in data_meta_paths:
+             vid_meta.extend(json.load(open(data_meta_path, "r")))
+         self.vid_meta = vid_meta
+
+         self.clip_image_processor = CLIPImageProcessor()
+
+         self.transform = transforms.Compose(
+             [
+                 transforms.RandomResizedCrop(
+                     self.img_size,
+                     scale=self.img_scale,
+                     ratio=self.img_ratio,
+                     interpolation=transforms.InterpolationMode.BILINEAR,
+                 ),
+                 transforms.ToTensor(),
+                 transforms.Normalize([0.5], [0.5]),
+             ]
+         )
+
+         self.cond_transform = transforms.Compose(
+             [
+                 transforms.RandomResizedCrop(
+                     self.img_size,
+                     scale=self.img_scale,
+                     ratio=self.img_ratio,
+                     interpolation=transforms.InterpolationMode.BILINEAR,
+                 ),
+                 transforms.ToTensor(),
+             ]
+         )
+
+         self.drop_ratio = drop_ratio
+
+     def augmentation(self, image, transform, state=None):
+         if state is not None:
+             torch.set_rng_state(state)
+         return transform(image)
+
+     def __getitem__(self, index):
+         video_meta = self.vid_meta[index]
+         video_path = video_meta["video_path"]
+         kps_path = video_meta["kps_path"]
+
+         video_reader = VideoReader(video_path)
+         kps_reader = VideoReader(kps_path)
+
+         assert len(video_reader) == len(
+             kps_reader
+         ), f"{len(video_reader) = } != {len(kps_reader) = } in {video_path}"
+
+         video_length = len(video_reader)
+
+         margin = min(self.sample_margin, video_length)
+
+         ref_img_idx = random.randint(0, video_length - 1)
+         if ref_img_idx + margin < video_length:
+             tgt_img_idx = random.randint(ref_img_idx + margin, video_length - 1)
+         elif ref_img_idx - margin > 0:
+             tgt_img_idx = random.randint(0, ref_img_idx - margin)
+         else:
+             tgt_img_idx = random.randint(0, video_length - 1)
+
+         ref_img = video_reader[ref_img_idx]
+         ref_img_pil = Image.fromarray(ref_img.asnumpy())
+         tgt_img = video_reader[tgt_img_idx]
+         tgt_img_pil = Image.fromarray(tgt_img.asnumpy())
+
+         tgt_pose = kps_reader[tgt_img_idx]
+         tgt_pose_pil = Image.fromarray(tgt_pose.asnumpy())
+
+         state = torch.get_rng_state()
+         tgt_img = self.augmentation(tgt_img_pil, self.transform, state)
+         tgt_pose_img = self.augmentation(tgt_pose_pil, self.cond_transform, state)
+         ref_img_vae = self.augmentation(ref_img_pil, self.transform, state)
+         clip_image = self.clip_image_processor(
+             images=ref_img_pil, return_tensors="pt"
+         ).pixel_values[0]
+
+         sample = dict(
+             video_dir=video_path,
+             img=tgt_img,
+             tgt_pose=tgt_pose_img,
+             ref_img=ref_img_vae,
+             clip_images=clip_image,
+         )
+
+         return sample
+
+     def __len__(self):
+         return len(self.vid_meta)

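As a usage note, HumanDanceDataset yields single-frame training pairs (target frame, target pose, reference frame, CLIP-processed reference). A minimal sketch, assuming a meta JSON file with video_path/kps_path entries exists at the placeholder path:

    # Minimal sketch: wire the image dataset into a DataLoader (meta path is a placeholder).
    from torch.utils.data import DataLoader
    from src.dataset.dance_image import HumanDanceDataset

    dataset = HumanDanceDataset(img_size=(768, 512), data_meta_paths=["./data/my_meta.json"])
    loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2)
    batch = next(iter(loader))
    print(batch["img"].shape, batch["tgt_pose"].shape, batch["ref_img"].shape, batch["clip_images"].shape)
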
src/dataset/dance_video.py ADDED
@@ -0,0 +1,137 @@
+ import json
+ import random
+ from typing import List
+
+ import numpy as np
+ import pandas as pd
+ import torch
+ import torchvision.transforms as transforms
+ from decord import VideoReader
+ from PIL import Image
+ from torch.utils.data import Dataset
+ from transformers import CLIPImageProcessor
+
+
+ class HumanDanceVideoDataset(Dataset):
+     def __init__(
+         self,
+         sample_rate,
+         n_sample_frames,
+         width,
+         height,
+         img_scale=(1.0, 1.0),
+         img_ratio=(0.9, 1.0),
+         drop_ratio=0.1,
+         data_meta_paths=["./data/fashion_meta.json"],
+     ):
+         super().__init__()
+         self.sample_rate = sample_rate
+         self.n_sample_frames = n_sample_frames
+         self.width = width
+         self.height = height
+         self.img_scale = img_scale
+         self.img_ratio = img_ratio
+
+         vid_meta = []
+         for data_meta_path in data_meta_paths:
+             vid_meta.extend(json.load(open(data_meta_path, "r")))
+         self.vid_meta = vid_meta
+
+         self.clip_image_processor = CLIPImageProcessor()
+
+         self.pixel_transform = transforms.Compose(
+             [
+                 transforms.RandomResizedCrop(
+                     (height, width),
+                     scale=self.img_scale,
+                     ratio=self.img_ratio,
+                     interpolation=transforms.InterpolationMode.BILINEAR,
+                 ),
+                 transforms.ToTensor(),
+                 transforms.Normalize([0.5], [0.5]),
+             ]
+         )
+
+         self.cond_transform = transforms.Compose(
+             [
+                 transforms.RandomResizedCrop(
+                     (height, width),
+                     scale=self.img_scale,
+                     ratio=self.img_ratio,
+                     interpolation=transforms.InterpolationMode.BILINEAR,
+                 ),
+                 transforms.ToTensor(),
+             ]
+         )
+
+         self.drop_ratio = drop_ratio
+
+     def augmentation(self, images, transform, state=None):
+         if state is not None:
+             torch.set_rng_state(state)
+         if isinstance(images, List):
+             transformed_images = [transform(img) for img in images]
+             ret_tensor = torch.stack(transformed_images, dim=0)  # (f, c, h, w)
+         else:
+             ret_tensor = transform(images)  # (c, h, w)
+         return ret_tensor
+
+     def __getitem__(self, index):
+         video_meta = self.vid_meta[index]
+         video_path = video_meta["video_path"]
+         kps_path = video_meta["kps_path"]
+
+         video_reader = VideoReader(video_path)
+         kps_reader = VideoReader(kps_path)
+
+         assert len(video_reader) == len(
+             kps_reader
+         ), f"{len(video_reader) = } != {len(kps_reader) = } in {video_path}"
+
+         video_length = len(video_reader)
+
+         clip_length = min(
+             video_length, (self.n_sample_frames - 1) * self.sample_rate + 1
+         )
+         start_idx = random.randint(0, video_length - clip_length)
+         batch_index = np.linspace(
+             start_idx, start_idx + clip_length - 1, self.n_sample_frames, dtype=int
+         ).tolist()
+
+         # read frames and kps
+         vid_pil_image_list = []
+         pose_pil_image_list = []
+         for index in batch_index:
+             img = video_reader[index]
+             vid_pil_image_list.append(Image.fromarray(img.asnumpy()))
+             img = kps_reader[index]
+             pose_pil_image_list.append(Image.fromarray(img.asnumpy()))
+
+         ref_img_idx = random.randint(0, video_length - 1)
+         ref_img = Image.fromarray(video_reader[ref_img_idx].asnumpy())
+
+         # transform
+         state = torch.get_rng_state()
+         pixel_values_vid = self.augmentation(
+             vid_pil_image_list, self.pixel_transform, state
+         )
+         pixel_values_pose = self.augmentation(
+             pose_pil_image_list, self.cond_transform, state
+         )
+         pixel_values_ref_img = self.augmentation(ref_img, self.pixel_transform, state)
+         clip_ref_img = self.clip_image_processor(
+             images=ref_img, return_tensors="pt"
+         ).pixel_values[0]
+
+         sample = dict(
+             video_dir=video_path,
+             pixel_values_vid=pixel_values_vid,
+             pixel_values_pose=pixel_values_pose,
+             pixel_values_ref_img=pixel_values_ref_img,
+             clip_ref_img=clip_ref_img,
+         )
+
+         return sample
+
+     def __len__(self):
+         return len(self.vid_meta)

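HumanDanceVideoDataset is the clip-level counterpart used for the temporal stage. A minimal sketch under the same placeholder-meta-file assumption as above:

    # Minimal sketch: sample one 16-frame clip with aligned pose frames and a reference image.
    from src.dataset.dance_video import HumanDanceVideoDataset

    dataset = HumanDanceVideoDataset(
        sample_rate=4, n_sample_frames=16, width=512, height=768,
        data_meta_paths=["./data/my_meta.json"],  # placeholder meta file
    )
    sample = dataset[0]
    print(sample["pixel_values_vid"].shape)   # (f, c, h, w)
    print(sample["pixel_values_pose"].shape)  # (f, c, h, w)
    print(sample["pixel_values_ref_img"].shape, sample["clip_ref_img"].shape)
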
src/dwpose/__init__.py ADDED
@@ -0,0 +1,123 @@
+ # https://github.com/IDEA-Research/DWPose
+ # Openpose
+ # Original from CMU https://github.com/CMU-Perceptual-Computing-Lab/openpose
+ # 2nd Edited by https://github.com/Hzzone/pytorch-openpose
+ # 3rd Edited by ControlNet
+ # 4th Edited by ControlNet (added face and correct hands)
+
+ import copy
+ import os
+
+ os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
+ import cv2
+ import numpy as np
+ import torch
+ from controlnet_aux.util import HWC3, resize_image
+ from PIL import Image
+
+ from . import util
+ from .wholebody import Wholebody
+
+
+ def draw_pose(pose, H, W):
+     bodies = pose["bodies"]
+     faces = pose["faces"]
+     hands = pose["hands"]
+     candidate = bodies["candidate"]
+     subset = bodies["subset"]
+     canvas = np.zeros(shape=(H, W, 3), dtype=np.uint8)
+
+     canvas = util.draw_bodypose(canvas, candidate, subset)
+
+     canvas = util.draw_handpose(canvas, hands)
+
+     canvas = util.draw_facepose(canvas, faces)
+
+     return canvas
+
+
+ class DWposeDetector:
+     def __init__(self):
+         pass
+
+     def to(self, device):
+         self.pose_estimation = Wholebody(device)
+         return self
+
+     def cal_height(self, input_image):
+         input_image = cv2.cvtColor(
+             np.array(input_image, dtype=np.uint8), cv2.COLOR_RGB2BGR
+         )
+
+         input_image = HWC3(input_image)
+         H, W, C = input_image.shape
+         with torch.no_grad():
+             candidate, subset = self.pose_estimation(input_image)
+             nums, keys, locs = candidate.shape
+             # candidate[..., 0] /= float(W)
+             # candidate[..., 1] /= float(H)
+             body = candidate
+         return body[0, ..., 1].min(), body[..., 1].max() - body[..., 1].min()
+
+     def __call__(
+         self,
+         input_image,
+         detect_resolution=512,
+         image_resolution=512,
+         output_type="pil",
+         **kwargs,
+     ):
+         input_image = cv2.cvtColor(
+             np.array(input_image, dtype=np.uint8), cv2.COLOR_RGB2BGR
+         )
+
+         input_image = HWC3(input_image)
+         input_image = resize_image(input_image, detect_resolution)
+         H, W, C = input_image.shape
+         with torch.no_grad():
+             candidate, subset = self.pose_estimation(input_image)
+             nums, keys, locs = candidate.shape
+             candidate[..., 0] /= float(W)
+             candidate[..., 1] /= float(H)
+             score = subset[:, :18]
+             max_ind = np.mean(score, axis=-1).argmax(axis=0)
+             score = score[[max_ind]]
+             body = candidate[:, :18].copy()
+             body = body[[max_ind]]
+             nums = 1
+             body = body.reshape(nums * 18, locs)
+             body_score = copy.deepcopy(score)
+             for i in range(len(score)):
+                 for j in range(len(score[i])):
+                     if score[i][j] > 0.3:
+                         score[i][j] = int(18 * i + j)
+                     else:
+                         score[i][j] = -1
+
+             un_visible = subset < 0.3
+             candidate[un_visible] = -1
+
+             foot = candidate[:, 18:24]
+
+             faces = candidate[[max_ind], 24:92]
+
+             hands = candidate[[max_ind], 92:113]
+             hands = np.vstack([hands, candidate[[max_ind], 113:]])
+
+             bodies = dict(candidate=body, subset=score)
+             pose = dict(bodies=bodies, hands=hands, faces=faces)
+
+             detected_map = draw_pose(pose, H, W)
+             detected_map = HWC3(detected_map)
+
+             img = resize_image(input_image, image_resolution)
+             H, W, C = img.shape
+
+             detected_map = cv2.resize(
+                 detected_map, (W, H), interpolation=cv2.INTER_LINEAR
+             )
+
+             if output_type == "pil":
+                 detected_map = Image.fromarray(detected_map)
+
+             return detected_map, body_score

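DWposeDetector is what extract_dwpose_from_vid.py calls frame by frame. A minimal single-image sketch — it assumes the DWPose ONNX weights expected by Wholebody are available on disk:

    # Minimal sketch: render a DWpose skeleton map for one image.
    from PIL import Image
    from src.dwpose import DWposeDetector

    detector = DWposeDetector().to("cuda:0")   # the Wholebody ONNX sessions are created in .to()
    image = Image.open("me.jpeg").convert("RGB")
    pose_map, body_score = detector(image, detect_resolution=512, image_resolution=512)
    pose_map.save("pose_map.png")              # a PIL image when output_type="pil" (the default)
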
src/dwpose/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (3.09 kB).
 
src/dwpose/__pycache__/onnxdet.cpython-310.pyc ADDED
Binary file (4.15 kB).
 
src/dwpose/__pycache__/onnxpose.cpython-310.pyc ADDED
Binary file (10.3 kB).
 
src/dwpose/__pycache__/util.cpython-310.pyc ADDED
Binary file (7.88 kB).
 
src/dwpose/__pycache__/wholebody.cpython-310.pyc ADDED
Binary file (1.81 kB).
 
src/dwpose/onnxdet.py ADDED
@@ -0,0 +1,130 @@
+ # https://github.com/IDEA-Research/DWPose
+ import cv2
+ import numpy as np
+ import onnxruntime
+
+
+ def nms(boxes, scores, nms_thr):
+     """Single class NMS implemented in Numpy."""
+     x1 = boxes[:, 0]
+     y1 = boxes[:, 1]
+     x2 = boxes[:, 2]
+     y2 = boxes[:, 3]
+
+     areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+     order = scores.argsort()[::-1]
+
+     keep = []
+     while order.size > 0:
+         i = order[0]
+         keep.append(i)
+         xx1 = np.maximum(x1[i], x1[order[1:]])
+         yy1 = np.maximum(y1[i], y1[order[1:]])
+         xx2 = np.minimum(x2[i], x2[order[1:]])
+         yy2 = np.minimum(y2[i], y2[order[1:]])
+
+         w = np.maximum(0.0, xx2 - xx1 + 1)
+         h = np.maximum(0.0, yy2 - yy1 + 1)
+         inter = w * h
+         ovr = inter / (areas[i] + areas[order[1:]] - inter)
+
+         inds = np.where(ovr <= nms_thr)[0]
+         order = order[inds + 1]
+
+     return keep
+
+
+ def multiclass_nms(boxes, scores, nms_thr, score_thr):
+     """Multiclass NMS implemented in Numpy. Class-aware version."""
+     final_dets = []
+     num_classes = scores.shape[1]
+     for cls_ind in range(num_classes):
+         cls_scores = scores[:, cls_ind]
+         valid_score_mask = cls_scores > score_thr
+         if valid_score_mask.sum() == 0:
+             continue
+         else:
+             valid_scores = cls_scores[valid_score_mask]
+             valid_boxes = boxes[valid_score_mask]
+             keep = nms(valid_boxes, valid_scores, nms_thr)
+             if len(keep) > 0:
+                 cls_inds = np.ones((len(keep), 1)) * cls_ind
+                 dets = np.concatenate(
+                     [valid_boxes[keep], valid_scores[keep, None], cls_inds], 1
+                 )
+                 final_dets.append(dets)
+     if len(final_dets) == 0:
+         return None
+     return np.concatenate(final_dets, 0)
+
+
+ def demo_postprocess(outputs, img_size, p6=False):
+     grids = []
+     expanded_strides = []
+     strides = [8, 16, 32] if not p6 else [8, 16, 32, 64]
+
+     hsizes = [img_size[0] // stride for stride in strides]
+     wsizes = [img_size[1] // stride for stride in strides]
+
+     for hsize, wsize, stride in zip(hsizes, wsizes, strides):
+         xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize))
+         grid = np.stack((xv, yv), 2).reshape(1, -1, 2)
+         grids.append(grid)
+         shape = grid.shape[:2]
+         expanded_strides.append(np.full((*shape, 1), stride))
+
+     grids = np.concatenate(grids, 1)
+     expanded_strides = np.concatenate(expanded_strides, 1)
+     outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides
+     outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides
+
+     return outputs
+
+
+ def preprocess(img, input_size, swap=(2, 0, 1)):
+     if len(img.shape) == 3:
+         padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114
+     else:
+         padded_img = np.ones(input_size, dtype=np.uint8) * 114
+
+     r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
+     resized_img = cv2.resize(
+         img,
+         (int(img.shape[1] * r), int(img.shape[0] * r)),
+         interpolation=cv2.INTER_LINEAR,
+     ).astype(np.uint8)
+     padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img
+
+     padded_img = padded_img.transpose(swap)
+     padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
+     return padded_img, r
+
+
+ def inference_detector(session, oriImg):
+     input_shape = (640, 640)
+     img, ratio = preprocess(oriImg, input_shape)
+
+     ort_inputs = {session.get_inputs()[0].name: img[None, :, :, :]}
+     output = session.run(None, ort_inputs)
+     predictions = demo_postprocess(output[0], input_shape)[0]
+
+     boxes = predictions[:, :4]
+     scores = predictions[:, 4:5] * predictions[:, 5:]
+
+     boxes_xyxy = np.ones_like(boxes)
+     boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2] / 2.0
+     boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3] / 2.0
+     boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2] / 2.0
+     boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3] / 2.0
+     boxes_xyxy /= ratio
+     dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.1)
+     if dets is not None:
+         final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5]
+         isscore = final_scores > 0.3
+         iscat = final_cls_inds == 0
+         isbbox = [i and j for (i, j) in zip(isscore, iscat)]
+         final_boxes = final_boxes[isbbox]
+     else:
+         return []
+
+     return final_boxes

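inference_detector takes an already-created ONNX Runtime session for the YOLOX detector and a BGR image, and returns person boxes in xyxy format. A minimal sketch — the weight path simply mirrors the pretrained_weights/DWPose naming used elsewhere in this commit and is an assumption:

    # Minimal sketch: person detection with the YOLOX ONNX model (weight path is assumed).
    import cv2
    import onnxruntime as ort
    from src.dwpose.onnxdet import inference_detector

    session = ort.InferenceSession(
        "pretrained_weights/DWPose/yolox_l.onnx",
        providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
    )
    img_bgr = cv2.imread("input.jpg")          # BGR image, as OpenCV loads it
    boxes = inference_detector(session, img_bgr)
    print(boxes)                               # (N, 4) xyxy person boxes, or [] if nothing is found
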
src/dwpose/onnxpose.py ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://github.com/IDEA-Research/DWPose
2
+ from typing import List, Tuple
3
+
4
+ import cv2
5
+ import numpy as np
6
+ import onnxruntime as ort
7
+
8
+
9
+ def preprocess(
10
+ img: np.ndarray, out_bbox, input_size: Tuple[int, int] = (192, 256)
11
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
12
+ """Do preprocessing for RTMPose model inference.
13
+
14
+ Args:
15
+ img (np.ndarray): Input image in shape.
16
+ input_size (tuple): Input image size in shape (w, h).
17
+
18
+ Returns:
19
+ tuple:
20
+ - resized_img (np.ndarray): Preprocessed image.
21
+ - center (np.ndarray): Center of image.
22
+ - scale (np.ndarray): Scale of image.
23
+ """
24
+ # get shape of image
25
+ img_shape = img.shape[:2]
26
+ out_img, out_center, out_scale = [], [], []
27
+ if len(out_bbox) == 0:
28
+ out_bbox = [[0, 0, img_shape[1], img_shape[0]]]
29
+ for i in range(len(out_bbox)):
30
+ x0 = out_bbox[i][0]
31
+ y0 = out_bbox[i][1]
32
+ x1 = out_bbox[i][2]
33
+ y1 = out_bbox[i][3]
34
+ bbox = np.array([x0, y0, x1, y1])
35
+
36
+ # get center and scale
37
+ center, scale = bbox_xyxy2cs(bbox, padding=1.25)
38
+
39
+ # do affine transformation
40
+ resized_img, scale = top_down_affine(input_size, scale, center, img)
41
+
42
+ # normalize image
43
+ mean = np.array([123.675, 116.28, 103.53])
44
+ std = np.array([58.395, 57.12, 57.375])
45
+ resized_img = (resized_img - mean) / std
46
+
47
+ out_img.append(resized_img)
48
+ out_center.append(center)
49
+ out_scale.append(scale)
50
+
51
+ return out_img, out_center, out_scale
52
+
53
+
54
+ def inference(sess: ort.InferenceSession, img: np.ndarray) -> np.ndarray:
55
+ """Inference RTMPose model.
56
+
57
+ Args:
58
+ sess (ort.InferenceSession): ONNXRuntime session.
59
+ img (np.ndarray): Input image in shape.
60
+
61
+ Returns:
62
+ outputs (np.ndarray): Output of RTMPose model.
63
+ """
64
+ all_out = []
65
+ # build input
66
+ for i in range(len(img)):
67
+ input = [img[i].transpose(2, 0, 1)]
68
+
69
+ # build output
70
+ sess_input = {sess.get_inputs()[0].name: input}
71
+ sess_output = []
72
+ for out in sess.get_outputs():
73
+ sess_output.append(out.name)
74
+
75
+ # run model
76
+ outputs = sess.run(sess_output, sess_input)
77
+ all_out.append(outputs)
78
+
79
+ return all_out
80
+
81
+
82
+ def postprocess(
83
+ outputs: List[np.ndarray],
84
+ model_input_size: Tuple[int, int],
85
+ center: Tuple[int, int],
86
+ scale: Tuple[int, int],
87
+ simcc_split_ratio: float = 2.0,
88
+ ) -> Tuple[np.ndarray, np.ndarray]:
89
+ """Postprocess for RTMPose model output.
90
+
91
+ Args:
92
+ outputs (np.ndarray): Output of RTMPose model.
93
+ model_input_size (tuple): RTMPose model Input image size.
94
+ center (tuple): Center of bbox in shape (x, y).
95
+ scale (tuple): Scale of bbox in shape (w, h).
96
+ simcc_split_ratio (float): Split ratio of simcc.
97
+
98
+ Returns:
99
+ tuple:
100
+ - keypoints (np.ndarray): Rescaled keypoints.
101
+ - scores (np.ndarray): Model predict scores.
102
+ """
103
+ all_key = []
104
+ all_score = []
105
+ for i in range(len(outputs)):
106
+ # use simcc to decode
107
+ simcc_x, simcc_y = outputs[i]
108
+ keypoints, scores = decode(simcc_x, simcc_y, simcc_split_ratio)
109
+
110
+ # rescale keypoints
111
+ keypoints = keypoints / model_input_size * scale[i] + center[i] - scale[i] / 2
112
+ all_key.append(keypoints[0])
113
+ all_score.append(scores[0])
114
+
115
+ return np.array(all_key), np.array(all_score)
116
+
117
+
118
+ def bbox_xyxy2cs(
119
+ bbox: np.ndarray, padding: float = 1.0
120
+ ) -> Tuple[np.ndarray, np.ndarray]:
121
+ """Transform the bbox format from (x,y,w,h) into (center, scale)
122
+
123
+ Args:
124
+ bbox (ndarray): Bounding box(es) in shape (4,) or (n, 4), formatted
125
+ as (left, top, right, bottom)
126
+ padding (float): BBox padding factor that will be multilied to scale.
127
+ Default: 1.0
128
+
129
+ Returns:
130
+ tuple: A tuple containing center and scale.
131
+ - np.ndarray[float32]: Center (x, y) of the bbox in shape (2,) or
132
+ (n, 2)
133
+ - np.ndarray[float32]: Scale (w, h) of the bbox in shape (2,) or
134
+ (n, 2)
135
+ """
136
+ # convert single bbox from (4, ) to (1, 4)
137
+ dim = bbox.ndim
138
+ if dim == 1:
139
+ bbox = bbox[None, :]
140
+
141
+ # get bbox center and scale
142
+ x1, y1, x2, y2 = np.hsplit(bbox, [1, 2, 3])
143
+ center = np.hstack([x1 + x2, y1 + y2]) * 0.5
144
+ scale = np.hstack([x2 - x1, y2 - y1]) * padding
145
+
146
+ if dim == 1:
147
+ center = center[0]
148
+ scale = scale[0]
149
+
150
+ return center, scale
151
+
152
+
153
+ def _fix_aspect_ratio(bbox_scale: np.ndarray, aspect_ratio: float) -> np.ndarray:
154
+ """Extend the scale to match the given aspect ratio.
155
+
156
+ Args:
157
+ scale (np.ndarray): The image scale (w, h) in shape (2, )
158
+ aspect_ratio (float): The ratio of ``w/h``
159
+
160
+ Returns:
161
+ np.ndarray: The reshaped image scale in (2, )
162
+ """
163
+ w, h = np.hsplit(bbox_scale, [1])
164
+ bbox_scale = np.where(
165
+ w > h * aspect_ratio,
166
+ np.hstack([w, w / aspect_ratio]),
167
+ np.hstack([h * aspect_ratio, h]),
168
+ )
169
+ return bbox_scale
170
+
171
+
172
+ def _rotate_point(pt: np.ndarray, angle_rad: float) -> np.ndarray:
173
+ """Rotate a point by an angle.
174
+
175
+ Args:
176
+ pt (np.ndarray): 2D point coordinates (x, y) in shape (2, )
177
+ angle_rad (float): rotation angle in radian
178
+
179
+ Returns:
180
+ np.ndarray: Rotated point in shape (2, )
181
+ """
182
+ sn, cs = np.sin(angle_rad), np.cos(angle_rad)
183
+ rot_mat = np.array([[cs, -sn], [sn, cs]])
184
+ return rot_mat @ pt
185
+
186
+
187
+ def _get_3rd_point(a: np.ndarray, b: np.ndarray) -> np.ndarray:
188
+ """To calculate the affine matrix, three pairs of points are required. This
189
+ function is used to get the 3rd point, given 2D points a & b.
190
+
191
+ The 3rd point is defined by rotating vector `a - b` by 90 degrees
192
+ anticlockwise, using b as the rotation center.
193
+
194
+ Args:
195
+ a (np.ndarray): The 1st point (x,y) in shape (2, )
196
+ b (np.ndarray): The 2nd point (x,y) in shape (2, )
197
+
198
+ Returns:
199
+ np.ndarray: The 3rd point.
200
+ """
201
+ direction = a - b
202
+ c = b + np.r_[-direction[1], direction[0]]
203
+ return c
204
+
205
+
206
+ def get_warp_matrix(
207
+ center: np.ndarray,
208
+ scale: np.ndarray,
209
+ rot: float,
210
+ output_size: Tuple[int, int],
211
+ shift: Tuple[float, float] = (0.0, 0.0),
212
+ inv: bool = False,
213
+ ) -> np.ndarray:
214
+ """Calculate the affine transformation matrix that can warp the bbox area
215
+ in the input image to the output size.
216
+
217
+ Args:
218
+ center (np.ndarray[2, ]): Center of the bounding box (x, y).
219
+ scale (np.ndarray[2, ]): Scale of the bounding box
220
+ wrt [width, height].
221
+ rot (float): Rotation angle (degree).
222
+ output_size (np.ndarray[2, ] | list(2,)): Size of the
223
+ destination heatmaps.
224
+ shift (0-100%): Shift translation ratio wrt the width/height.
225
+ Default (0., 0.).
226
+ inv (bool): Option to inverse the affine transform direction.
227
+ (inv=False: src->dst or inv=True: dst->src)
228
+
229
+ Returns:
230
+ np.ndarray: A 2x3 transformation matrix
231
+ """
232
+ shift = np.array(shift)
233
+ src_w = scale[0]
234
+ dst_w = output_size[0]
235
+ dst_h = output_size[1]
236
+
237
+ # compute transformation matrix
238
+ rot_rad = np.deg2rad(rot)
239
+ src_dir = _rotate_point(np.array([0.0, src_w * -0.5]), rot_rad)
240
+ dst_dir = np.array([0.0, dst_w * -0.5])
241
+
242
+ # get four corners of the src rectangle in the original image
243
+ src = np.zeros((3, 2), dtype=np.float32)
244
+ src[0, :] = center + scale * shift
245
+ src[1, :] = center + src_dir + scale * shift
246
+ src[2, :] = _get_3rd_point(src[0, :], src[1, :])
247
+
248
+ # get four corners of the dst rectangle in the input image
249
+ dst = np.zeros((3, 2), dtype=np.float32)
250
+ dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
251
+ dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
252
+ dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])
253
+
254
+ if inv:
255
+ warp_mat = cv2.getAffineTransform(np.float32(dst), np.float32(src))
256
+ else:
257
+ warp_mat = cv2.getAffineTransform(np.float32(src), np.float32(dst))
258
+
259
+ return warp_mat
260
+
261
+
262
+ def top_down_affine(
263
+ input_size: dict, bbox_scale: dict, bbox_center: dict, img: np.ndarray
264
+ ) -> Tuple[np.ndarray, np.ndarray]:
265
+ """Get the bbox image as the model input by affine transform.
266
+
267
+ Args:
268
+ input_size (dict): The input size of the model.
269
+ bbox_scale (dict): The bbox scale of the img.
270
+ bbox_center (dict): The bbox center of the img.
271
+ img (np.ndarray): The original image.
272
+
273
+ Returns:
274
+ tuple: A tuple containing center and scale.
275
+ - np.ndarray[float32]: img after affine transform.
276
+ - np.ndarray[float32]: bbox scale after affine transform.
277
+ """
278
+ w, h = input_size
279
+ warp_size = (int(w), int(h))
280
+
281
+ # reshape bbox to fixed aspect ratio
282
+ bbox_scale = _fix_aspect_ratio(bbox_scale, aspect_ratio=w / h)
283
+
284
+ # get the affine matrix
285
+ center = bbox_center
286
+ scale = bbox_scale
287
+ rot = 0
288
+ warp_mat = get_warp_matrix(center, scale, rot, output_size=(w, h))
289
+
290
+ # do affine transform
291
+ img = cv2.warpAffine(img, warp_mat, warp_size, flags=cv2.INTER_LINEAR)
292
+
293
+ return img, bbox_scale
294
+
295
+
296
+ def get_simcc_maximum(
297
+ simcc_x: np.ndarray, simcc_y: np.ndarray
298
+ ) -> Tuple[np.ndarray, np.ndarray]:
299
+ """Get maximum response location and value from simcc representations.
300
+
301
+ Note:
302
+ instance number: N
303
+ num_keypoints: K
304
+ heatmap height: H
305
+ heatmap width: W
306
+
307
+ Args:
308
+ simcc_x (np.ndarray): x-axis SimCC in shape (K, Wx) or (N, K, Wx)
309
+ simcc_y (np.ndarray): y-axis SimCC in shape (K, Wy) or (N, K, Wy)
310
+
311
+ Returns:
312
+ tuple:
313
+ - locs (np.ndarray): locations of maximum heatmap responses in shape
314
+ (K, 2) or (N, K, 2)
315
+ - vals (np.ndarray): values of maximum heatmap responses in shape
316
+ (K,) or (N, K)
317
+ """
318
+ N, K, Wx = simcc_x.shape
319
+ simcc_x = simcc_x.reshape(N * K, -1)
320
+ simcc_y = simcc_y.reshape(N * K, -1)
321
+
322
+ # get maximum value locations
323
+ x_locs = np.argmax(simcc_x, axis=1)
324
+ y_locs = np.argmax(simcc_y, axis=1)
325
+ locs = np.stack((x_locs, y_locs), axis=-1).astype(np.float32)
326
+ max_val_x = np.amax(simcc_x, axis=1)
327
+ max_val_y = np.amax(simcc_y, axis=1)
328
+
329
+ # get maximum value across x and y axis
330
+ mask = max_val_x > max_val_y
331
+ max_val_x[mask] = max_val_y[mask]
332
+ vals = max_val_x
333
+ locs[vals <= 0.0] = -1
334
+
335
+ # reshape
336
+ locs = locs.reshape(N, K, 2)
337
+ vals = vals.reshape(N, K)
338
+
339
+ return locs, vals
340
+
341
+
342
+ def decode(
343
+ simcc_x: np.ndarray, simcc_y: np.ndarray, simcc_split_ratio
344
+ ) -> Tuple[np.ndarray, np.ndarray]:
345
+ """Decode SimCC representations into keypoint locations and scores.
346
+
347
+ Args:
348
+ simcc_x (np.ndarray[N, K, Wx]): model predicted simcc in x.
349
+ simcc_y (np.ndarray[N, K, Wy]): model predicted simcc in y.
350
+ simcc_split_ratio (int): The split ratio of simcc.
351
+
352
+ Returns:
353
+ tuple: A tuple containing keypoints and scores.
354
+ - np.ndarray[float32]: keypoints in shape (K, 2) or (n, K, 2)
355
+ - np.ndarray[float32]: scores in shape (K,) or (n, K)
356
+ """
357
+ keypoints, scores = get_simcc_maximum(simcc_x, simcc_y)
358
+ keypoints /= simcc_split_ratio
359
+
360
+ return keypoints, scores
361
+
362
+
363
+ def inference_pose(session, out_bbox, oriImg):
364
+ h, w = session.get_inputs()[0].shape[2:]
365
+ model_input_size = (w, h)
366
+ resized_img, center, scale = preprocess(oriImg, out_bbox, model_input_size)
367
+ outputs = inference(session, resized_img)
368
+ keypoints, scores = postprocess(outputs, model_input_size, center, scale)
369
+
370
+ return keypoints, scores
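
For reference, a minimal sketch of how the SimCC decode above behaves on synthetic data (assuming decode is importable as src.dwpose.onnxpose.decode, i.e. the module added in this commit):

import numpy as np
from src.dwpose.onnxpose import decode

# One instance, two keypoints, 8 x-bins and 6 y-bins, split ratio 2.0.
simcc_x = np.zeros((1, 2, 8), dtype=np.float32)
simcc_y = np.zeros((1, 2, 6), dtype=np.float32)
simcc_x[0, 0, 3] = 0.9   # keypoint 0 peaks at x-bin 3
simcc_y[0, 0, 2] = 0.8   # keypoint 0 peaks at y-bin 2
simcc_x[0, 1, 5] = 0.4
simcc_y[0, 1, 1] = 0.7

keypoints, scores = decode(simcc_x, simcc_y, simcc_split_ratio=2.0)
# keypoints[0] -> [[1.5, 1.0], [2.5, 0.5]]  (argmax bin index / split ratio)
# scores[0]    -> [0.8, 0.4]                (elementwise min of the x/y maxima)
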
src/dwpose/util.py ADDED
@@ -0,0 +1,378 @@
1
+ # https://github.com/IDEA-Research/DWPose
2
+ import math
3
+ import numpy as np
4
+ import matplotlib
5
+ import cv2
6
+
7
+
8
+ eps = 0.01
9
+
10
+
11
+ def smart_resize(x, s):
12
+ Ht, Wt = s
13
+ if x.ndim == 2:
14
+ Ho, Wo = x.shape
15
+ Co = 1
16
+ else:
17
+ Ho, Wo, Co = x.shape
18
+ if Co == 3 or Co == 1:
19
+ k = float(Ht + Wt) / float(Ho + Wo)
20
+ return cv2.resize(
21
+ x,
22
+ (int(Wt), int(Ht)),
23
+ interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4,
24
+ )
25
+ else:
26
+ return np.stack([smart_resize(x[:, :, i], s) for i in range(Co)], axis=2)
27
+
28
+
29
+ def smart_resize_k(x, fx, fy):
30
+ if x.ndim == 2:
31
+ Ho, Wo = x.shape
32
+ Co = 1
33
+ else:
34
+ Ho, Wo, Co = x.shape
35
+ Ht, Wt = Ho * fy, Wo * fx
36
+ if Co == 3 or Co == 1:
37
+ k = float(Ht + Wt) / float(Ho + Wo)
38
+ return cv2.resize(
39
+ x,
40
+ (int(Wt), int(Ht)),
41
+ interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4,
42
+ )
43
+ else:
44
+ return np.stack([smart_resize_k(x[:, :, i], fx, fy) for i in range(Co)], axis=2)
45
+
46
+
47
+ def padRightDownCorner(img, stride, padValue):
48
+ h = img.shape[0]
49
+ w = img.shape[1]
50
+
51
+ pad = 4 * [None]
52
+ pad[0] = 0 # up
53
+ pad[1] = 0 # left
54
+ pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down
55
+ pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right
56
+
57
+ img_padded = img
58
+ pad_up = np.tile(img_padded[0:1, :, :] * 0 + padValue, (pad[0], 1, 1))
59
+ img_padded = np.concatenate((pad_up, img_padded), axis=0)
60
+ pad_left = np.tile(img_padded[:, 0:1, :] * 0 + padValue, (1, pad[1], 1))
61
+ img_padded = np.concatenate((pad_left, img_padded), axis=1)
62
+ pad_down = np.tile(img_padded[-2:-1, :, :] * 0 + padValue, (pad[2], 1, 1))
63
+ img_padded = np.concatenate((img_padded, pad_down), axis=0)
64
+ pad_right = np.tile(img_padded[:, -2:-1, :] * 0 + padValue, (1, pad[3], 1))
65
+ img_padded = np.concatenate((img_padded, pad_right), axis=1)
66
+
67
+ return img_padded, pad
68
+
69
+
70
+ def transfer(model, model_weights):
71
+ transfered_model_weights = {}
72
+ for weights_name in model.state_dict().keys():
73
+ transfered_model_weights[weights_name] = model_weights[
74
+ ".".join(weights_name.split(".")[1:])
75
+ ]
76
+ return transfered_model_weights
77
+
78
+
79
+ def draw_bodypose(canvas, candidate, subset):
80
+ H, W, C = canvas.shape
81
+ candidate = np.array(candidate)
82
+ subset = np.array(subset)
83
+
84
+ stickwidth = 4
85
+
86
+ limbSeq = [
87
+ [2, 3],
88
+ [2, 6],
89
+ [3, 4],
90
+ [4, 5],
91
+ [6, 7],
92
+ [7, 8],
93
+ [2, 9],
94
+ [9, 10],
95
+ [10, 11],
96
+ [2, 12],
97
+ [12, 13],
98
+ [13, 14],
99
+ [2, 1],
100
+ [1, 15],
101
+ [15, 17],
102
+ [1, 16],
103
+ [16, 18],
104
+ [3, 17],
105
+ [6, 18],
106
+ ]
107
+
108
+ colors = [
109
+ [255, 0, 0],
110
+ [255, 85, 0],
111
+ [255, 170, 0],
112
+ [255, 255, 0],
113
+ [170, 255, 0],
114
+ [85, 255, 0],
115
+ [0, 255, 0],
116
+ [0, 255, 85],
117
+ [0, 255, 170],
118
+ [0, 255, 255],
119
+ [0, 170, 255],
120
+ [0, 85, 255],
121
+ [0, 0, 255],
122
+ [85, 0, 255],
123
+ [170, 0, 255],
124
+ [255, 0, 255],
125
+ [255, 0, 170],
126
+ [255, 0, 85],
127
+ ]
128
+
129
+ for i in range(17):
130
+ for n in range(len(subset)):
131
+ index = subset[n][np.array(limbSeq[i]) - 1]
132
+ if -1 in index:
133
+ continue
134
+ Y = candidate[index.astype(int), 0] * float(W)
135
+ X = candidate[index.astype(int), 1] * float(H)
136
+ mX = np.mean(X)
137
+ mY = np.mean(Y)
138
+ length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
139
+ angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
140
+ polygon = cv2.ellipse2Poly(
141
+ (int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1
142
+ )
143
+ cv2.fillConvexPoly(canvas, polygon, colors[i])
144
+
145
+ canvas = (canvas * 0.6).astype(np.uint8)
146
+
147
+ for i in range(18):
148
+ for n in range(len(subset)):
149
+ index = int(subset[n][i])
150
+ if index == -1:
151
+ continue
152
+ x, y = candidate[index][0:2]
153
+ x = int(x * W)
154
+ y = int(y * H)
155
+ cv2.circle(canvas, (int(x), int(y)), 4, colors[i], thickness=-1)
156
+
157
+ return canvas
158
+
159
+
160
+ def draw_handpose(canvas, all_hand_peaks):
161
+ H, W, C = canvas.shape
162
+
163
+ edges = [
164
+ [0, 1],
165
+ [1, 2],
166
+ [2, 3],
167
+ [3, 4],
168
+ [0, 5],
169
+ [5, 6],
170
+ [6, 7],
171
+ [7, 8],
172
+ [0, 9],
173
+ [9, 10],
174
+ [10, 11],
175
+ [11, 12],
176
+ [0, 13],
177
+ [13, 14],
178
+ [14, 15],
179
+ [15, 16],
180
+ [0, 17],
181
+ [17, 18],
182
+ [18, 19],
183
+ [19, 20],
184
+ ]
185
+
186
+ for peaks in all_hand_peaks:
187
+ peaks = np.array(peaks)
188
+
189
+ for ie, e in enumerate(edges):
190
+ x1, y1 = peaks[e[0]]
191
+ x2, y2 = peaks[e[1]]
192
+ x1 = int(x1 * W)
193
+ y1 = int(y1 * H)
194
+ x2 = int(x2 * W)
195
+ y2 = int(y2 * H)
196
+ if x1 > eps and y1 > eps and x2 > eps and y2 > eps:
197
+ cv2.line(
198
+ canvas,
199
+ (x1, y1),
200
+ (x2, y2),
201
+ matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0])
202
+ * 255,
203
+ thickness=2,
204
+ )
205
+
206
+ for i, keypoint in enumerate(peaks):
207
+ x, y = keypoint
208
+ x = int(x * W)
209
+ y = int(y * H)
210
+ if x > eps and y > eps:
211
+ cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1)
212
+ return canvas
213
+
214
+
215
+ def draw_facepose(canvas, all_lmks):
216
+ H, W, C = canvas.shape
217
+ for lmks in all_lmks:
218
+ lmks = np.array(lmks)
219
+ for lmk in lmks:
220
+ x, y = lmk
221
+ x = int(x * W)
222
+ y = int(y * H)
223
+ if x > eps and y > eps:
224
+ cv2.circle(canvas, (x, y), 3, (255, 255, 255), thickness=-1)
225
+ return canvas
226
+
227
+
228
+ # detect hand according to body pose keypoints
229
+ # please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp
230
+ def handDetect(candidate, subset, oriImg):
231
+ # right hand: wrist 4, elbow 3, shoulder 2
232
+ # left hand: wrist 7, elbow 6, shoulder 5
233
+ ratioWristElbow = 0.33
234
+ detect_result = []
235
+ image_height, image_width = oriImg.shape[0:2]
236
+ for person in subset.astype(int):
237
+ # if any of three not detected
238
+ has_left = np.sum(person[[5, 6, 7]] == -1) == 0
239
+ has_right = np.sum(person[[2, 3, 4]] == -1) == 0
240
+ if not (has_left or has_right):
241
+ continue
242
+ hands = []
243
+ # left hand
244
+ if has_left:
245
+ left_shoulder_index, left_elbow_index, left_wrist_index = person[[5, 6, 7]]
246
+ x1, y1 = candidate[left_shoulder_index][:2]
247
+ x2, y2 = candidate[left_elbow_index][:2]
248
+ x3, y3 = candidate[left_wrist_index][:2]
249
+ hands.append([x1, y1, x2, y2, x3, y3, True])
250
+ # right hand
251
+ if has_right:
252
+ right_shoulder_index, right_elbow_index, right_wrist_index = person[
253
+ [2, 3, 4]
254
+ ]
255
+ x1, y1 = candidate[right_shoulder_index][:2]
256
+ x2, y2 = candidate[right_elbow_index][:2]
257
+ x3, y3 = candidate[right_wrist_index][:2]
258
+ hands.append([x1, y1, x2, y2, x3, y3, False])
259
+
260
+ for x1, y1, x2, y2, x3, y3, is_left in hands:
261
+ # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbox) = (1 + ratio) * pos_wrist - ratio * pos_elbox
262
+ # handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]);
263
+ # handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]);
264
+ # const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow);
265
+ # const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder);
266
+ # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder);
267
+ x = x3 + ratioWristElbow * (x3 - x2)
268
+ y = y3 + ratioWristElbow * (y3 - y2)
269
+ distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2)
270
+ distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
271
+ width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder)
272
+ # x-y refers to the center --> offset to topLeft point
273
+ # handRectangle.x -= handRectangle.width / 2.f;
274
+ # handRectangle.y -= handRectangle.height / 2.f;
275
+ x -= width / 2
276
+ y -= width / 2 # width = height
277
+ # overflow the image
278
+ if x < 0:
279
+ x = 0
280
+ if y < 0:
281
+ y = 0
282
+ width1 = width
283
+ width2 = width
284
+ if x + width > image_width:
285
+ width1 = image_width - x
286
+ if y + width > image_height:
287
+ width2 = image_height - y
288
+ width = min(width1, width2)
289
+ # the max hand box value is 20 pixels
290
+ if width >= 20:
291
+ detect_result.append([int(x), int(y), int(width), is_left])
292
+
293
+ """
294
+ return value: [[x, y, w, True if left hand else False]].
295
+ width=height since the network require squared input.
296
+ x, y is the coordinate of top left
297
+ """
298
+ return detect_result
299
+
300
+
301
+ # Written by Lvmin
302
+ def faceDetect(candidate, subset, oriImg):
303
+ # left right eye ear 14 15 16 17
304
+ detect_result = []
305
+ image_height, image_width = oriImg.shape[0:2]
306
+ for person in subset.astype(int):
307
+ has_head = person[0] > -1
308
+ if not has_head:
309
+ continue
310
+
311
+ has_left_eye = person[14] > -1
312
+ has_right_eye = person[15] > -1
313
+ has_left_ear = person[16] > -1
314
+ has_right_ear = person[17] > -1
315
+
316
+ if not (has_left_eye or has_right_eye or has_left_ear or has_right_ear):
317
+ continue
318
+
319
+ head, left_eye, right_eye, left_ear, right_ear = person[[0, 14, 15, 16, 17]]
320
+
321
+ width = 0.0
322
+ x0, y0 = candidate[head][:2]
323
+
324
+ if has_left_eye:
325
+ x1, y1 = candidate[left_eye][:2]
326
+ d = max(abs(x0 - x1), abs(y0 - y1))
327
+ width = max(width, d * 3.0)
328
+
329
+ if has_right_eye:
330
+ x1, y1 = candidate[right_eye][:2]
331
+ d = max(abs(x0 - x1), abs(y0 - y1))
332
+ width = max(width, d * 3.0)
333
+
334
+ if has_left_ear:
335
+ x1, y1 = candidate[left_ear][:2]
336
+ d = max(abs(x0 - x1), abs(y0 - y1))
337
+ width = max(width, d * 1.5)
338
+
339
+ if has_right_ear:
340
+ x1, y1 = candidate[right_ear][:2]
341
+ d = max(abs(x0 - x1), abs(y0 - y1))
342
+ width = max(width, d * 1.5)
343
+
344
+ x, y = x0, y0
345
+
346
+ x -= width
347
+ y -= width
348
+
349
+ if x < 0:
350
+ x = 0
351
+
352
+ if y < 0:
353
+ y = 0
354
+
355
+ width1 = width * 2
356
+ width2 = width * 2
357
+
358
+ if x + width > image_width:
359
+ width1 = image_width - x
360
+
361
+ if y + width > image_height:
362
+ width2 = image_height - y
363
+
364
+ width = min(width1, width2)
365
+
366
+ if width >= 20:
367
+ detect_result.append([int(x), int(y), int(width)])
368
+
369
+ return detect_result
370
+
371
+
372
+ # get max index of 2d array
373
+ def npmax(array):
374
+ arrayindex = array.argmax(1)
375
+ arrayvalue = array.max(1)
376
+ i = arrayvalue.argmax()
377
+ j = arrayindex[i]
378
+ return i, j
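
A minimal usage sketch of the drawing helpers above (assuming the src/dwpose/util.py layout shown in this commit); keypoints are expected in normalized [0, 1] coordinates, and subset holds per-person indices into candidate with -1 marking a missing joint:

import numpy as np
from src.dwpose.util import draw_bodypose, draw_facepose

canvas = np.zeros((512, 512, 3), dtype=np.uint8)

# One synthetic person: 18 body keypoints spread diagonally across the image.
candidate = np.stack([np.linspace(0.2, 0.8, 18), np.linspace(0.1, 0.9, 18)], axis=1)
subset = np.arange(18, dtype=float)[None, :]   # all joints detected

canvas = draw_bodypose(canvas, candidate, subset)
canvas = draw_facepose(canvas, [np.random.rand(68, 2) * 0.2 + 0.4])
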
src/dwpose/wholebody.py ADDED
@@ -0,0 +1,48 @@
1
+ # https://github.com/IDEA-Research/DWPose
2
+ from pathlib import Path
3
+
4
+ import cv2
5
+ import numpy as np
6
+ import onnxruntime as ort
7
+
8
+ from .onnxdet import inference_detector
9
+ from .onnxpose import inference_pose
10
+
11
+ ModelDataPathPrefix = Path("./pretrained_weights")
12
+
13
+
14
+ class Wholebody:
15
+ def __init__(self, device="cuda:0"):
16
+ providers = (
17
+ ["CPUExecutionProvider"] if device == "cpu" else ["CUDAExecutionProvider"]
18
+ )
19
+ onnx_det = ModelDataPathPrefix.joinpath("DWPose/yolox_l.onnx")
20
+ onnx_pose = ModelDataPathPrefix.joinpath("DWPose/dw-ll_ucoco_384.onnx")
21
+
22
+ self.session_det = ort.InferenceSession(
23
+ path_or_bytes=onnx_det, providers=providers
24
+ )
25
+ self.session_pose = ort.InferenceSession(
26
+ path_or_bytes=onnx_pose, providers=providers
27
+ )
28
+
29
+ def __call__(self, oriImg):
30
+ det_result = inference_detector(self.session_det, oriImg)
31
+ keypoints, scores = inference_pose(self.session_pose, det_result, oriImg)
32
+
33
+ keypoints_info = np.concatenate((keypoints, scores[..., None]), axis=-1)
34
+ # compute neck joint
35
+ neck = np.mean(keypoints_info[:, [5, 6]], axis=1)
36
+ # neck score when visualizing pred
37
+ neck[:, 2:4] = np.logical_and(
38
+ keypoints_info[:, 5, 2:4] > 0.3, keypoints_info[:, 6, 2:4] > 0.3
39
+ ).astype(int)
40
+ new_keypoints_info = np.insert(keypoints_info, 17, neck, axis=1)
41
+ mmpose_idx = [17, 6, 8, 10, 7, 9, 12, 14, 16, 13, 15, 2, 1, 4, 3]
42
+ openpose_idx = [1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17]
43
+ new_keypoints_info[:, openpose_idx] = new_keypoints_info[:, mmpose_idx]
44
+ keypoints_info = new_keypoints_info
45
+
46
+ keypoints, scores = keypoints_info[..., :2], keypoints_info[..., 2]
47
+
48
+ return keypoints, scores
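
A minimal sketch of driving the Wholebody wrapper above on a single image; it assumes the two DWPose ONNX files are already under ./pretrained_weights/DWPose (e.g. via tools/download_weights.py further down):

import cv2
from src.dwpose.wholebody import Wholebody

model = Wholebody(device="cuda:0")        # or device="cpu" for the CPU provider
img = cv2.imread("input.jpg")             # BGR image, HxWx3
keypoints, scores = model(img)
# keypoints: (num_people, 134, 2) image coordinates (133 COCO-WholeBody points + inserted neck)
# scores:    (num_people, 134) per-keypoint confidences
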
src/utils/__pycache__/util.cpython-310.pyc CHANGED
Binary files a/src/utils/__pycache__/util.cpython-310.pyc and b/src/utils/__pycache__/util.cpython-310.pyc differ
 
tools/download_weights.py ADDED
@@ -0,0 +1,111 @@
1
+ import os
2
+ from pathlib import Path, PurePosixPath
3
+
4
+ from huggingface_hub import hf_hub_download
5
+
6
+
7
+ def prepare_base_model():
8
+ print(f'Preparing base stable-diffusion-v1-5 weights...')
9
+ local_dir = "./pretrained_weights/stable-diffusion-v1-5"
10
+ os.makedirs(local_dir, exist_ok=True)
11
+ for hub_file in ["unet/config.json", "unet/diffusion_pytorch_model.bin"]:
12
+ path = Path(hub_file)
13
+ saved_path = local_dir / path
14
+ if os.path.exists(saved_path):
15
+ continue
16
+ hf_hub_download(
17
+ repo_id="runwayml/stable-diffusion-v1-5",
18
+ subfolder=PurePosixPath(path.parent),
19
+ filename=PurePosixPath(path.name),
20
+ local_dir=local_dir,
21
+ )
22
+
23
+
24
+ def prepare_image_encoder():
25
+ print(f"Preparing image encoder weights...")
26
+ local_dir = "./pretrained_weights"
27
+ os.makedirs(local_dir, exist_ok=True)
28
+ for hub_file in ["image_encoder/config.json", "image_encoder/pytorch_model.bin"]:
29
+ path = Path(hub_file)
30
+ saved_path = local_dir / path
31
+ if os.path.exists(saved_path):
32
+ continue
33
+ hf_hub_download(
34
+ repo_id="lambdalabs/sd-image-variations-diffusers",
35
+ subfolder=PurePosixPath(path.parent),
36
+ filename=PurePosixPath(path.name),
37
+ local_dir=local_dir,
38
+ )
39
+
40
+
41
+ def prepare_dwpose():
42
+ print(f"Preparing DWPose weights...")
43
+ local_dir = "./pretrained_weights/DWPose"
44
+ os.makedirs(local_dir, exist_ok=True)
45
+ for hub_file in [
46
+ "dw-ll_ucoco_384.onnx",
47
+ "yolox_l.onnx",
48
+ ]:
49
+ path = Path(hub_file)
50
+ saved_path = local_dir / path
51
+ if os.path.exists(saved_path):
52
+ continue
53
+
54
+ hf_hub_download(
55
+ repo_id="yzd-v/DWPose",
56
+ subfolder=PurePosixPath(path.parent),
57
+ filename=PurePosixPath(path.name),
58
+ local_dir=local_dir,
59
+ )
60
+
61
+
62
+ def prepare_vae():
63
+ print(f"Preparing vae weights...")
64
+ local_dir = "./pretrained_weights/sd-vae-ft-mse"
65
+ os.makedirs(local_dir, exist_ok=True)
66
+ for hub_file in [
67
+ "config.json",
68
+ "diffusion_pytorch_model.bin",
69
+ ]:
70
+ path = Path(hub_file)
71
+ saved_path = local_dir / path
72
+ if os.path.exists(saved_path):
73
+ continue
74
+
75
+ hf_hub_download(
76
+ repo_id="stabilityai/sd-vae-ft-mse",
77
+ subfolder=PurePosixPath(path.parent),
78
+ filename=PurePosixPath(path.name),
79
+ local_dir=local_dir,
80
+ )
81
+
82
+
83
+ def prepare_anyone():
84
+ print(f"Preparing AnimateAnyone weights...")
85
+ local_dir = "./pretrained_weights"
86
+ os.makedirs(local_dir, exist_ok=True)
87
+ for hub_file in [
88
+ "denoising_unet.pth",
89
+ "motion_module.pth",
90
+ "pose_guider.pth",
91
+ "reference_unet.pth",
92
+ ]:
93
+ path = Path(hub_file)
94
+ saved_path = local_dir / path
95
+ if os.path.exists(saved_path):
96
+ continue
97
+
98
+ hf_hub_download(
99
+ repo_id="patrolli/AnimateAnyone",
100
+ subfolder=PurePosixPath(path.parent),
101
+ filename=PurePosixPath(path.name),
102
+ local_dir=local_dir,
103
+ )
104
+
105
+ if __name__ == '__main__':
106
+ prepare_base_model()
107
+ prepare_image_encoder()
108
+ prepare_dwpose()
109
+ prepare_vae()
110
+ prepare_anyone()
111
+
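
The downloader above can also be called function by function from another script instead of running python tools/download_weights.py; a minimal sketch, assuming the tools package is importable from the repo root:

from tools.download_weights import prepare_dwpose, prepare_vae

prepare_dwpose()   # yolox_l.onnx + dw-ll_ucoco_384.onnx -> ./pretrained_weights/DWPose
prepare_vae()      # sd-vae-ft-mse config + weights      -> ./pretrained_weights/sd-vae-ft-mse
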
tools/extract_meta_info.py ADDED
@@ -0,0 +1,37 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+
5
+ # -----
6
+ # [{'vid': , 'kps': , 'other':},
7
+ # {'vid': , 'kps': , 'other':}]
8
+ # -----
9
+ # python tools/extract_meta_info.py --root_path /path/to/video_dir --dataset_name fashion
10
+ # -----
11
+ parser = argparse.ArgumentParser()
12
+ parser.add_argument("--root_path", type=str)
13
+ parser.add_argument("--dataset_name", type=str)
14
+ parser.add_argument("--meta_info_name", type=str)
15
+
16
+ args = parser.parse_args()
17
+
18
+ if args.meta_info_name is None:
19
+ args.meta_info_name = args.dataset_name
20
+
21
+ pose_dir = args.root_path + "_dwpose"
22
+
23
+ # collect all video_folder paths
24
+ video_mp4_paths = set()
25
+ for root, dirs, files in os.walk(args.root_path):
26
+ for name in files:
27
+ if name.endswith(".mp4"):
28
+ video_mp4_paths.add(os.path.join(root, name))
29
+ video_mp4_paths = list(video_mp4_paths)
30
+
31
+ meta_infos = []
32
+ for video_mp4_path in video_mp4_paths:
33
+ relative_video_name = os.path.relpath(video_mp4_path, args.root_path)
34
+ kps_path = os.path.join(pose_dir, relative_video_name)
35
+ meta_infos.append({"video_path": video_mp4_path, "kps_path": kps_path})
36
+
37
+ json.dump(meta_infos, open(f"./data/{args.meta_info_name}_meta.json", "w"))
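
For reference, the JSON written by the script above to ./data/<meta_info_name>_meta.json is a flat list of video/pose-path pairs, with each pose video expected at the same relative path under <root_path>_dwpose (the paths below are illustrative only):

example_meta = [
    {
        "video_path": "/path/to/video_dir/clip_0001.mp4",
        "kps_path": "/path/to/video_dir_dwpose/clip_0001.mp4",
    },
]
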
tools/facetracker_api.py ADDED
@@ -0,0 +1,62 @@
1
+ import copy
2
+ import os, sys
3
+ import math
4
+ import numpy as np
5
+ import cv2
6
+ sys.path.append("OpenSeeFace/")
7
+ from tracker import Tracker, get_model_base_path
8
+
9
+ features = ["eye_l", "eye_r", "eyebrow_steepness_l", "eyebrow_updown_l", "eyebrow_quirk_l", "eyebrow_steepness_r", "eyebrow_updown_r", "eyebrow_quirk_r", "mouth_corner_updown_l", "mouth_corner_inout_l", "mouth_corner_updown_r", "mouth_corner_inout_r", "mouth_open", "mouth_wide"]
10
+
11
+
12
+ def face_image(frame, save_path=None):
13
+ height, width, c = frame.shape
14
+ tracker = Tracker(width, height, threshold=None, max_threads=1, max_faces=1, discard_after=10, scan_every=3, silent=False, model_type=3, model_dir=None,
15
+ no_gaze=False, detection_threshold=0.4, use_retinaface=0, max_feature_updates=900, static_model=True, try_hard=False)
16
+ faces = tracker.predict(frame)
17
+ frame = np.zeros_like(frame)
18
+ detected = False
19
+ face_lms = None
20
+ for face_num, f in enumerate(faces):
21
+ f = copy.copy(f)
22
+ if f.eye_blink is None:
23
+ f.eye_blink = [1, 1]
24
+ right_state = "O" if f.eye_blink[0] > 0.30 else "-"
25
+ left_state = "O" if f.eye_blink[1] > 0.30 else "-"
26
+ detected = True
27
+ if not f.success:
28
+ pts_3d = np.zeros((70, 3), np.float32)
29
+ if face_num == 0:
30
+ face_lms = f.lms
31
+ for pt_num, (x,y,c) in enumerate(f.lms):
32
+ if pt_num == 66 and (f.eye_blink[0] < 0.30 or c < 0.20):
33
+ continue
34
+ if pt_num == 67 and (f.eye_blink[1] < 0.30 or c < 0.20):
35
+ continue
36
+ x = int(x + 0.5)
37
+ y = int(y + 0.5)
38
+
39
+ color = (0, 255, 0)
40
+ if pt_num >= 66:
41
+ color = (255, 255, 0)
42
+ if not (x < 0 or y < 0 or x >= height or y >= width):
43
+ cv2.circle(frame, (y, x), 1, color, -1)
44
+ if f.rotation is not None:
45
+ projected = cv2.projectPoints(f.contour, f.rotation, f.translation, tracker.camera, tracker.dist_coeffs)
46
+ for [(x,y)] in projected[0]:
47
+ x = int(x + 0.5)
48
+ y = int(y + 0.5)
49
+ if not (x < 0 or y < 0 or x >= height or y >= width):
50
+ frame[int(x), int(y)] = (0, 255, 255)
51
+ x += 1
52
+ if not (x < 0 or y < 0 or x >= height or y >= width):
53
+ frame[int(x), int(y)] = (0, 255, 255)
54
+ y += 1
55
+ if not (x < 0 or y < 0 or x >= height or y >= width):
56
+ frame[int(x), int(y)] = (0, 255, 255)
57
+ x -= 1
58
+ if not (x < 0 or y < 0 or x >= height or y >= width):
59
+ frame[int(x), int(y)] = (0, 255, 255)
60
+ if save_path is not None:
61
+ cv2.imwrite(save_path, frame)
62
+ return frame, face_lms
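
A minimal sketch of calling face_image above on one frame; it assumes the OpenSeeFace checkout referenced by sys.path.append is present so that the tracker import resolves, and the save_path name is illustrative:

import cv2
from tools.facetracker_api import face_image

frame = cv2.imread("me.jpeg")
landmark_canvas, face_lms = face_image(frame, save_path="face_landmarks.png")
# landmark_canvas: a black frame with the detected landmarks drawn on it
# face_lms: (x, y, confidence) landmarks of the first detected face, or None if no face was found
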
tools/vid2pose.py ADDED
@@ -0,0 +1,38 @@
1
+ from src.dwpose import DWposeDetector
2
+ import os
3
+ from pathlib import Path
4
+
5
+ from src.utils.util import get_fps, read_frames, save_videos_from_pil
6
+ import numpy as np
7
+
8
+
9
+ if __name__ == "__main__":
10
+ import argparse
11
+
12
+ parser = argparse.ArgumentParser()
13
+ parser.add_argument("--video_path", type=str)
14
+ args = parser.parse_args()
15
+
16
+ if not os.path.exists(args.video_path):
17
+ raise ValueError(f"Path: {args.video_path} not exists")
18
+
19
+ dir_path, video_name = (
20
+ os.path.dirname(args.video_path),
21
+ os.path.splitext(os.path.basename(args.video_path))[0],
22
+ )
23
+ out_path = os.path.join(dir_path, video_name + "_kps.mp4")
24
+
25
+ detector = DWposeDetector()
26
+ detector = detector.to(f"cuda")
27
+
28
+ fps = get_fps(args.video_path)
29
+ frames = read_frames(args.video_path)
30
+ kps_results = []
31
+ for i, frame_pil in enumerate(frames):
32
+ result, score = detector(frame_pil)
33
+ score = np.mean(score, axis=-1)
34
+
35
+ kps_results.append(result)
36
+
37
+ print(out_path)
38
+ save_videos_from_pil(kps_results, out_path, fps=fps)
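
Usage note: the script above is run as python tools/vid2pose.py --video_path /path/to/video.mp4; it extracts DWPose keypoints frame by frame on the GPU and writes the rendered pose video next to the input as <video_name>_kps.mp4 at the source frame rate (the averaged per-frame score is computed but not used further).
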