aidenyzhang commited on
Commit
da237e1
·
1 Parent(s): 385419f
Files changed (46) hide show
  1. README.md +196 -138
  2. musetalk/models/unet.py +5 -1
  3. musetalk/utils/__init__.py +0 -0
  4. musetalk/utils/audio_processor.py +99 -0
  5. musetalk/utils/blending.py +79 -44
  6. musetalk/utils/dwpose/default_runtime.py +0 -0
  7. musetalk/utils/dwpose/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py +0 -0
  8. musetalk/utils/face_detection/README.md +0 -0
  9. musetalk/utils/face_detection/__init__.py +0 -0
  10. musetalk/utils/face_detection/api.py +0 -0
  11. musetalk/utils/face_detection/detection/__init__.py +0 -0
  12. musetalk/utils/face_detection/detection/core.py +0 -0
  13. musetalk/utils/face_detection/detection/sfd/__init__.py +0 -0
  14. musetalk/utils/face_detection/detection/sfd/bbox.py +0 -0
  15. musetalk/utils/face_detection/detection/sfd/detect.py +0 -0
  16. musetalk/utils/face_detection/detection/sfd/net_s3fd.py +0 -0
  17. musetalk/utils/face_detection/detection/sfd/sfd_detector.py +0 -0
  18. musetalk/utils/face_detection/models.py +0 -0
  19. musetalk/utils/face_detection/utils.py +0 -0
  20. musetalk/utils/face_parsing/__init__.py +65 -4
  21. musetalk/utils/preprocessing.py +0 -0
  22. musetalk/utils/utils.py +28 -14
  23. musetalk/whisper/audio2feature.py +0 -0
  24. musetalk/whisper/whisper/__init__.py +0 -0
  25. musetalk/whisper/whisper/__main__.py +0 -0
  26. musetalk/whisper/whisper/assets/gpt2/merges.txt +0 -0
  27. musetalk/whisper/whisper/assets/gpt2/special_tokens_map.json +0 -0
  28. musetalk/whisper/whisper/assets/gpt2/tokenizer_config.json +0 -0
  29. musetalk/whisper/whisper/assets/gpt2/vocab.json +0 -0
  30. musetalk/whisper/whisper/assets/multilingual/added_tokens.json +0 -0
  31. musetalk/whisper/whisper/assets/multilingual/merges.txt +0 -0
  32. musetalk/whisper/whisper/assets/multilingual/special_tokens_map.json +0 -0
  33. musetalk/whisper/whisper/assets/multilingual/tokenizer_config.json +0 -0
  34. musetalk/whisper/whisper/assets/multilingual/vocab.json +0 -0
  35. musetalk/whisper/whisper/audio.py +0 -0
  36. musetalk/whisper/whisper/decoding.py +0 -0
  37. musetalk/whisper/whisper/model.py +0 -0
  38. musetalk/whisper/whisper/normalizers/__init__.py +0 -0
  39. musetalk/whisper/whisper/normalizers/basic.py +0 -0
  40. musetalk/whisper/whisper/normalizers/english.json +0 -0
  41. musetalk/whisper/whisper/normalizers/english.py +0 -0
  42. musetalk/whisper/whisper/tokenizer.py +0 -0
  43. musetalk/whisper/whisper/transcribe.py +0 -0
  44. musetalk/whisper/whisper/utils.py +0 -0
  45. requirements.txt +1 -0
  46. scripts/inference_alpha.py +253 -0
README.md CHANGED
@@ -1,15 +1,16 @@
1
  # MuseTalk
2
 
3
- MuseTalk: Real-Time High Quality Lip Synchronization with Latent Space Inpainting
4
- </br>
5
- Yue Zhang <sup>\*</sup>,
 
6
  Minhao Liu<sup>\*</sup>,
7
  Zhaokang Chen,
8
  Bin Wu<sup>†</sup>,
9
  Yubin Zeng,
10
  Chao Zhan,
11
- Yingjie He,
12
  Junxin Huang,
 
13
  Wenjiang Zhou
14
  (<sup>*</sup>Equal Contribution, <sup>†</sup>Corresponding Author, benbinwu@tencent.com)
15
 
@@ -19,7 +20,10 @@ Lyra Lab, Tencent Music Entertainment
19
 
20
  We introduce `MuseTalk`, a **real-time high quality** lip-syncing model (30fps+ on an NVIDIA Tesla V100). MuseTalk can be applied with input videos, e.g., generated by [MuseV](https://github.com/TMElyralab/MuseV), as a complete virtual human solution.
21
 
22
- :new: Update: We are thrilled to announce that [MusePose](https://github.com/TMElyralab/MusePose/) has been released. MusePose is an image-to-video generation framework for virtual human under control signal like pose. Together with MuseV and MuseTalk, we hope the community can join us and march towards the vision where a virtual human can be generated end2end with native ability of full body movement and interaction.
 
 
 
23
 
24
  # Overview
25
  `MuseTalk` is a real-time high quality audio-driven lip-syncing model trained in the latent space of `ft-mse-vae`, which
@@ -28,152 +32,104 @@ We introduce `MuseTalk`, a **real-time high quality** lip-syncing model (30fps+
28
  1. supports audio in various languages, such as Chinese, English, and Japanese.
29
  1. supports real-time inference with 30fps+ on an NVIDIA Tesla V100.
30
  1. supports modification of the center point of the face region proposes, which **SIGNIFICANTLY** affects generation results.
31
- 1. checkpoint available trained on the HDTF dataset.
32
- 1. training codes (comming soon).
33
 
34
  # News
35
- - [04/02/2024] Release MuseTalk project and pretrained models.
 
 
36
  - [04/16/2024] Release Gradio [demo](https://huggingface.co/spaces/TMElyralab/MuseTalk) on HuggingFace Spaces (thanks to HF team for their community grant)
37
- - [04/17/2024] : We release a pipeline that utilizes MuseTalk for real-time inference.
38
- - [10/18/2024] :mega: We release the [technical report](https://arxiv.org/abs/2410.10122). Our report details a superior model to the open-source L1 loss version. It includes GAN and perceptual losses for improved clarity, and sync loss for enhanced performance.
39
 
40
  ## Model
41
- ![Model Structure](assets/figs/musetalk_arc.jpg)
42
  MuseTalk was trained in latent spaces, where the images were encoded by a frozen VAE. The audio was encoded by a frozen `whisper-tiny` model. The architecture of the generation network was borrowed from the UNet of the `stable-diffusion-v1-4`, where the audio embeddings were fused to the image embeddings by cross-attention.
43
 
44
  Note that although we use a very similar architecture as Stable Diffusion, MuseTalk is distinct in that it is **NOT** a diffusion model. Instead, MuseTalk operates by inpainting in the latent space with a single step.
45
 
46
  ## Cases
47
- ### MuseV + MuseTalk make human photos alive!
48
- <table class="center">
49
- <tr style="font-weight: bolder;text-align:center;">
50
- <td width="33%">Image</td>
51
- <td width="33%">MuseV</td>
52
- <td width="33%">+MuseTalk</td>
53
- </tr>
54
- <tr>
55
- <td>
56
- <img src=assets/demo/musk/musk.png width="95%">
57
- </td>
58
- <td >
59
- <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/4a4bb2d1-9d14-4ca9-85c8-7f19c39f712e controls preload></video>
60
- </td>
61
- <td >
62
- <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/b2a879c2-e23a-4d39-911d-51f0343218e4 controls preload></video>
63
- </td>
64
- </tr>
65
- <tr>
66
- <td>
67
- <img src=assets/demo/yongen/yongen.jpeg width="95%">
68
- </td>
69
- <td >
70
- <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/57ef9dee-a9fd-4dc8-839b-3fbbbf0ff3f4 controls preload></video>
71
- </td>
72
- <td >
73
- <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/94d8dcba-1bcd-4b54-9d1d-8b6fc53228f0 controls preload></video>
74
- </td>
75
- </tr>
76
- <tr>
77
- <td>
78
- <img src=assets/demo/sit/sit.jpeg width="95%">
79
- </td>
80
- <td >
81
- <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/5fbab81b-d3f2-4c75-abb5-14c76e51769e controls preload></video>
82
- </td>
83
- <td >
84
- <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/f8100f4a-3df8-4151-8de2-291b09269f66 controls preload></video>
85
- </td>
86
- </tr>
87
- <tr>
88
- <td>
89
- <img src=assets/demo/man/man.png width="95%">
90
- </td>
91
- <td >
92
- <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/a6e7d431-5643-4745-9868-8b423a454153 controls preload></video>
93
- </td>
94
- <td >
95
- <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/6ccf7bc7-cb48-42de-85bd-076d5ee8a623 controls preload></video>
96
- </td>
97
- </tr>
98
- <tr>
99
- <td>
100
- <img src=assets/demo/monalisa/monalisa.png width="95%">
101
- </td>
102
- <td >
103
- <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/1568f604-a34f-4526-a13a-7d282aa2e773 controls preload></video>
104
- </td>
105
- <td >
106
- <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/a40784fc-a885-4c1f-9b7e-8f87b7caf4e0 controls preload></video>
107
- </td>
108
- </tr>
109
- <tr>
110
- <td>
111
- <img src=assets/demo/sun1/sun.png width="95%">
112
- </td>
113
- <td >
114
- <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/37a3a666-7b90-4244-8d3a-058cb0e44107 controls preload></video>
115
- </td>
116
- <td >
117
- <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/172f4ff1-d432-45bd-a5a7-a07dec33a26b controls preload></video>
118
- </td>
119
- </tr>
120
- <tr>
121
- <td>
122
- <img src=assets/demo/sun2/sun.png width="95%">
123
- </td>
124
- <td >
125
- <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/37a3a666-7b90-4244-8d3a-058cb0e44107 controls preload></video>
126
- </td>
127
- <td >
128
- <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/85a6873d-a028-4cce-af2b-6c59a1f2971d controls preload></video>
129
- </td>
130
- </tr>
131
- </table >
132
 
133
- * The character of the last two rows, `Xinying Sun`, is a supermodel KOL. You can follow her on [douyin](https://www.douyin.com/user/MS4wLjABAAAAWDThbMPN_6Xmm_JgXexbOii1K-httbu2APdG8DvDyM8).
 
 
134
 
135
- ## Video dubbing
136
- <table class="center">
137
- <tr style="font-weight: bolder;text-align:center;">
138
- <td width="70%">MuseTalk</td>
139
- <td width="30%">Original videos</td>
140
- </tr>
141
- <tr>
142
- <td>
143
- <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/4d7c5fa1-3550-4d52-8ed2-52f158150f24 controls preload></video>
144
- </td>
145
- <td>
146
- <a href="//www.bilibili.com/video/BV1wT411b7HU">Link</a>
147
- <href src=""></href>
148
- </td>
149
- </tr>
150
- </table>
151
 
152
- * For video dubbing, we applied a self-developed tool which can identify the talking person.
 
153
 
154
- ## Some interesting videos!
155
- <table class="center">
156
- <tr style="font-weight: bolder;text-align:center;">
157
- <td width="50%">Image</td>
158
- <td width="50%">MuseV + MuseTalk</td>
159
- </tr>
160
- <tr>
161
- <td>
162
- <img src=assets/demo/video1/video1.png width="95%">
163
- </td>
164
- <td>
165
- <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/1f02f9c6-8b98-475e-86b8-82ebee82fe0d controls preload></video>
166
- </td>
167
- </tr>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  </table>
169
 
 
170
  # TODO:
171
  - [x] trained models and inference codes.
172
  - [x] Huggingface Gradio [demo](https://huggingface.co/spaces/TMElyralab/MuseTalk).
173
  - [x] codes for real-time inference.
174
- - [ ] technical report.
175
- - [ ] training codes.
176
- - [ ] a better model (may take longer).
 
177
 
178
 
179
  # Getting Started
@@ -232,6 +188,9 @@ Finally, these weights should be organized in `models` as follows:
232
  ├── musetalk
233
  │ └── musetalk.json
234
  │ └── pytorch_model.bin
 
 
 
235
  ├── dwpose
236
  │ └── dw-ll_ucoco_384.pth
237
  ├── face-parse-bisent
@@ -246,16 +205,112 @@ Finally, these weights should be organized in `models` as follows:
246
  ## Quickstart
247
 
248
  ### Inference
249
- Here, we provide the inference script.
250
- ```
251
- python -m scripts.inference --inference_config configs/inference/test.yaml
 
 
252
  ```
 
 
 
 
253
  configs/inference/test.yaml is the path to the inference configuration file, including video_path and audio_path.
254
  The video_path should be either a video file, an image file or a directory of images.
255
 
 
 
 
 
256
  You are recommended to input video with `25fps`, the same fps used when training the model. If your video is far less than 25fps, you are recommended to apply frame interpolation or directly convert the video to 25fps using ffmpeg.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
 
258
- #### Use of bbox_shift to have adjustable results
259
  :mag_right: We have found that upper-bound of the mask has an important impact on mouth openness. Thus, to control the mask region, we suggest using the `bbox_shift` parameter. Positive values (moving towards the lower half) increase mouth openness, while negative values (moving towards the upper half) decrease mouth openness.
260
 
261
  You can start by running with the default configuration to obtain the adjustable value range, and then re-run the script within this range.
@@ -266,6 +321,9 @@ python -m scripts.inference --inference_config configs/inference/test.yaml --bbo
266
  ```
267
  :pushpin: More technical details can be found in [bbox_shift](assets/BBOX_SHIFT.md).
268
 
 
 
 
269
  #### Combining MuseV and MuseTalk
270
 
271
  As a complete solution to virtual human generation, you are suggested to first apply [MuseV](https://github.com/TMElyralab/MuseV) to generate a video (text-to-video, image-to-video or pose-to-video) by referring [this](https://github.com/TMElyralab/MuseV?tab=readme-ov-file#text2video). Frame interpolation is suggested to increase frame rate. Then, you can use `MuseTalk` to generate a lip-sync video by referring [this](https://github.com/TMElyralab/MuseTalk?tab=readme-ov-file#inference).
@@ -312,10 +370,10 @@ If you need higher resolution, you could apply super resolution models such as [
312
  # Citation
313
  ```bib
314
  @article{musetalk,
315
- title={MuseTalk: Real-Time High Quality Lip Synchorization with Latent Space Inpainting},
316
- author={Zhang, Yue and Liu, Minhao and Chen, Zhaokang and Wu, Bin and Zeng, Yubin and Zhan, Chao and He, Yingjie and Huang, Junxin and Zhou, Wenjiang},
317
  journal={arxiv},
318
- year={2024}
319
  }
320
  ```
321
  # Disclaimer/License
 
1
  # MuseTalk
2
 
3
+ <strong>MuseTalk: Real-Time High-Fidelity Video Dubbing via Spatio-Temporal Sampling</strong>
4
+
5
+ Yue Zhang<sup>\*</sup>,
6
+ Zhizhou Zhong<sup>\*</sup>,
7
  Minhao Liu<sup>\*</sup>,
8
  Zhaokang Chen,
9
  Bin Wu<sup>†</sup>,
10
  Yubin Zeng,
11
  Chao Zhan,
 
12
  Junxin Huang,
13
+ Yingjie He,
14
  Wenjiang Zhou
15
  (<sup>*</sup>Equal Contribution, <sup>†</sup>Corresponding Author, benbinwu@tencent.com)
16
 
 
20
 
21
  We introduce `MuseTalk`, a **real-time high quality** lip-syncing model (30fps+ on an NVIDIA Tesla V100). MuseTalk can be applied with input videos, e.g., generated by [MuseV](https://github.com/TMElyralab/MuseV), as a complete virtual human solution.
22
 
23
+ ## 🔥 Updates
24
+ We're excited to unveil MuseTalk 1.5.
25
+ This version **(1)** integrates training with perceptual loss, GAN loss, and sync loss, significantly boosting its overall performance. **(2)** We've implemented a two-stage training strategy and a spatio-temporal data sampling approach to strike a balance between visual quality and lip-sync accuracy.
26
+ Learn more details [here](https://arxiv.org/abs/2410.10122)
27
 
28
  # Overview
29
  `MuseTalk` is a real-time high quality audio-driven lip-syncing model trained in the latent space of `ft-mse-vae`, which
 
32
  1. supports audio in various languages, such as Chinese, English, and Japanese.
33
  1. supports real-time inference with 30fps+ on an NVIDIA Tesla V100.
34
  1. supports modification of the center point of the proposed face region, which **SIGNIFICANTLY** affects generation results.
35
+ 1. checkpoint available trained on the HDTF and private dataset.
 
36
 
37
  # News
38
+ - [03/28/2025] :mega: We are thrilled to announce the release of our 1.5 version. This version is a significant improvement over the 1.0 version, with enhanced clarity, identity consistency, and precise lip-speech synchronization. We update the [technical report](https://arxiv.org/abs/2410.10122) with more details.
39
+ - [10/18/2024] We release the [technical report](https://arxiv.org/abs/2410.10122v2). Our report details a superior model to the open-source L1 loss version. It includes GAN and perceptual losses for improved clarity, and sync loss for enhanced performance.
40
+ - [04/17/2024] We release a pipeline that utilizes MuseTalk for real-time inference.
41
  - [04/16/2024] Release Gradio [demo](https://huggingface.co/spaces/TMElyralab/MuseTalk) on HuggingFace Spaces (thanks to HF team for their community grant)
42
+ - [04/02/2024] Release MuseTalk project and pretrained models.
43
+
44
 
45
  ## Model
46
+ ![Model Structure](https://github.com/user-attachments/assets/02f4a214-1bdd-4326-983c-e70b478accba)
47
  MuseTalk was trained in latent spaces, where the images were encoded by a frozen VAE. The audio was encoded by a frozen `whisper-tiny` model. The architecture of the generation network was borrowed from the UNet of the `stable-diffusion-v1-4`, where the audio embeddings were fused to the image embeddings by cross-attention.
48
 
49
  Note that although we use a very similar architecture as Stable Diffusion, MuseTalk is distinct in that it is **NOT** a diffusion model. Instead, MuseTalk operates by inpainting in the latent space with a single step.
50
 
51
  ## Cases
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
+ <table>
54
+ <tr>
55
+ <td width="33%">
56
 
57
+ ### Input Video
58
+ ---
59
+ https://github.com/TMElyralab/MuseTalk/assets/163980830/37a3a666-7b90-4244-8d3a-058cb0e44107
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
+ ---
62
+ https://github.com/user-attachments/assets/1ce3e850-90ac-4a31-a45f-8dfa4f2960ac
63
 
64
+ ---
65
+ https://github.com/user-attachments/assets/fa3b13a1-ae26-4d1d-899e-87435f8d22b3
66
+
67
+ ---
68
+ https://github.com/user-attachments/assets/15800692-39d1-4f4c-99f2-aef044dc3251
69
+
70
+ ---
71
+ https://github.com/user-attachments/assets/a843f9c9-136d-4ed4-9303-4a7269787a60
72
+
73
+ ---
74
+ https://github.com/user-attachments/assets/6eb4e70e-9e19-48e9-85a9-bbfa589c5fcb
75
+
76
+ </td>
77
+ <td width="33%">
78
+
79
+ ### MuseTalk 1.0
80
+ ---
81
+ https://github.com/user-attachments/assets/c04f3cd5-9f77-40e9-aafd-61978380d0ef
82
+
83
+ ---
84
+ https://github.com/user-attachments/assets/2051a388-1cef-4c1d-b2a2-3c1ceee5dc99
85
+
86
+ ---
87
+ https://github.com/user-attachments/assets/b5f56f71-5cdc-4e2e-a519-454242000d32
88
+
89
+ ---
90
+ https://github.com/user-attachments/assets/a5843835-04ab-4c31-989f-0995cfc22f34
91
+
92
+ ---
93
+ https://github.com/user-attachments/assets/3dc7f1d7-8747-4733-bbdd-97874af0c028
94
+
95
+ ---
96
+ https://github.com/user-attachments/assets/3c78064e-faad-4637-83ae-28452a22b09a
97
+
98
+ </td>
99
+ <td width="33%">
100
+
101
+ ### MuseTalk 1.5
102
+ ---
103
+ https://github.com/user-attachments/assets/999a6f5b-61dd-48e1-b902-bb3f9cbc7247
104
+
105
+ ---
106
+ https://github.com/user-attachments/assets/d26a5c9a-003c-489d-a043-c9a331456e75
107
+
108
+ ---
109
+ https://github.com/user-attachments/assets/471290d7-b157-4cf6-8a6d-7e899afa302c
110
+
111
+ ---
112
+ https://github.com/user-attachments/assets/1ee77c4c-8c70-4add-b6db-583a12faa7dc
113
+
114
+ ---
115
+ https://github.com/user-attachments/assets/370510ea-624c-43b7-bbb0-ab5333e0fcc4
116
+
117
+ ---
118
+ https://github.com/user-attachments/assets/b011ece9-a332-4bc1-b8b7-ef6e383d7bde
119
+
120
+ </td>
121
+ </tr>
122
  </table>
123
 
124
+
125
  # TODO:
126
  - [x] trained models and inference codes.
127
  - [x] Huggingface Gradio [demo](https://huggingface.co/spaces/TMElyralab/MuseTalk).
128
  - [x] codes for real-time inference.
129
+ - [x] [technical report](https://arxiv.org/abs/2410.10122v2).
130
+ - [x] a better model with updated [technical report](https://arxiv.org/abs/2410.10122).
131
+ - [ ] training and dataloader code (Expected completion on 04/04/2025).
132
+
133
 
134
 
135
  # Getting Started
 
188
  ├── musetalk
189
  │ └── musetalk.json
190
  │ └── pytorch_model.bin
191
+ ├── musetalkV15
192
+ │ └── musetalk.json
193
+ │ └── unet.pth
194
  ├── dwpose
195
  │ └── dw-ll_ucoco_384.pth
196
  ├── face-parse-bisent
 
205
  ## Quickstart
206
 
207
  ### Inference
208
+ We provide inference scripts for both versions of MuseTalk:
209
+
210
+ #### MuseTalk 1.5 (Recommended)
211
+ ```bash
212
+ python3 -m scripts.inference_alpha --inference_config configs/inference/test.yaml --unet_model_path ./models/musetalkV15/unet.pth
213
  ```
214
+ This inference script supports both MuseTalk 1.5 and 1.0 models:
215
+ - For MuseTalk 1.5: Use the command above with the V1.5 model path
216
+ - For MuseTalk 1.0: Use the same script but point to the V1.0 model path
217
+
218
  configs/inference/test.yaml is the path to the inference configuration file, including video_path and audio_path.
219
  The video_path should be either a video file, an image file or a directory of images.
220
 
221
+ #### MuseTalk 1.0
222
+ ```bash
223
+ python3 -m scripts.inference --inference_config configs/inference/test.yaml
224
+ ```
225
  We recommend using input video at `25fps`, the same fps used when training the model. If your video's frame rate is much lower than 25fps, apply frame interpolation or convert the video to 25fps using ffmpeg.
226
+ <details close>
227
+ ## TestCases For 1.0
228
+ <table class="center">
229
+ <tr style="font-weight: bolder;text-align:center;">
230
+ <td width="33%">Image</td>
231
+ <td width="33%">MuseV</td>
232
+ <td width="33%">+MuseTalk</td>
233
+ </tr>
234
+ <tr>
235
+ <td>
236
+ <img src=assets/demo/musk/musk.png width="95%">
237
+ </td>
238
+ <td >
239
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/4a4bb2d1-9d14-4ca9-85c8-7f19c39f712e controls preload></video>
240
+ </td>
241
+ <td >
242
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/b2a879c2-e23a-4d39-911d-51f0343218e4 controls preload></video>
243
+ </td>
244
+ </tr>
245
+ <tr>
246
+ <td>
247
+ <img src=assets/demo/yongen/yongen.jpeg width="95%">
248
+ </td>
249
+ <td >
250
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/57ef9dee-a9fd-4dc8-839b-3fbbbf0ff3f4 controls preload></video>
251
+ </td>
252
+ <td >
253
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/94d8dcba-1bcd-4b54-9d1d-8b6fc53228f0 controls preload></video>
254
+ </td>
255
+ </tr>
256
+ <tr>
257
+ <td>
258
+ <img src=assets/demo/sit/sit.jpeg width="95%">
259
+ </td>
260
+ <td >
261
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/5fbab81b-d3f2-4c75-abb5-14c76e51769e controls preload></video>
262
+ </td>
263
+ <td >
264
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/f8100f4a-3df8-4151-8de2-291b09269f66 controls preload></video>
265
+ </td>
266
+ </tr>
267
+ <tr>
268
+ <td>
269
+ <img src=assets/demo/man/man.png width="95%">
270
+ </td>
271
+ <td >
272
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/a6e7d431-5643-4745-9868-8b423a454153 controls preload></video>
273
+ </td>
274
+ <td >
275
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/6ccf7bc7-cb48-42de-85bd-076d5ee8a623 controls preload></video>
276
+ </td>
277
+ </tr>
278
+ <tr>
279
+ <td>
280
+ <img src=assets/demo/monalisa/monalisa.png width="95%">
281
+ </td>
282
+ <td >
283
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/1568f604-a34f-4526-a13a-7d282aa2e773 controls preload></video>
284
+ </td>
285
+ <td >
286
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/a40784fc-a885-4c1f-9b7e-8f87b7caf4e0 controls preload></video>
287
+ </td>
288
+ </tr>
289
+ <tr>
290
+ <td>
291
+ <img src=assets/demo/sun1/sun.png width="95%">
292
+ </td>
293
+ <td >
294
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/37a3a666-7b90-4244-8d3a-058cb0e44107 controls preload></video>
295
+ </td>
296
+ <td >
297
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/172f4ff1-d432-45bd-a5a7-a07dec33a26b controls preload></video>
298
+ </td>
299
+ </tr>
300
+ <tr>
301
+ <td>
302
+ <img src=assets/demo/sun2/sun.png width="95%">
303
+ </td>
304
+ <td >
305
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/37a3a666-7b90-4244-8d3a-058cb0e44107 controls preload></video>
306
+ </td>
307
+ <td >
308
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/85a6873d-a028-4cce-af2b-6c59a1f2971d controls preload></video>
309
+ </td>
310
+ </tr>
311
+ </table >
312
 
313
+ #### Use of bbox_shift to have adjustable results(For 1.0)
314
  :mag_right: We have found that upper-bound of the mask has an important impact on mouth openness. Thus, to control the mask region, we suggest using the `bbox_shift` parameter. Positive values (moving towards the lower half) increase mouth openness, while negative values (moving towards the upper half) decrease mouth openness.
315
 
316
  You can start by running with the default configuration to obtain the adjustable value range, and then re-run the script within this range.
 
321
  ```
322
  :pushpin: More technical details can be found in [bbox_shift](assets/BBOX_SHIFT.md).
323
 
324
+ </details>
325
+
326
+
327
  #### Combining MuseV and MuseTalk
328
 
329
  As a complete solution to virtual human generation, you are suggested to first apply [MuseV](https://github.com/TMElyralab/MuseV) to generate a video (text-to-video, image-to-video or pose-to-video) by referring [this](https://github.com/TMElyralab/MuseV?tab=readme-ov-file#text2video). Frame interpolation is suggested to increase frame rate. Then, you can use `MuseTalk` to generate a lip-sync video by referring [this](https://github.com/TMElyralab/MuseTalk?tab=readme-ov-file#inference).
 
370
  # Citation
371
  ```bib
372
  @article{musetalk,
373
+ title={MuseTalk: Real-Time High-Fidelity Video Dubbing via Spatio-Temporal Sampling},
374
+ author={Zhang, Yue and Zhong, Zhizhou and Liu, Minhao and Chen, Zhaokang and Wu, Bin and Zeng, Yubin and Zhan, Chao and He, Yingjie and Huang, Junxin and Zhou, Wenjiang},
375
  journal={arxiv},
376
+ year={2025}
377
  }
378
  ```
379
  # Disclaimer/License
musetalk/models/unet.py CHANGED
@@ -31,12 +31,16 @@ class UNet():
31
  unet_config,
32
  model_path,
33
  use_float16=False,
 
34
  ):
35
  with open(unet_config, 'r') as f:
36
  unet_config = json.load(f)
37
  self.model = UNet2DConditionModel(**unet_config)
38
  self.pe = PositionalEncoding(d_model=384)
39
- self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 
 
40
  weights = torch.load(model_path) if torch.cuda.is_available() else torch.load(model_path, map_location=self.device)
41
  self.model.load_state_dict(weights)
42
  if use_float16:
 
31
  unet_config,
32
  model_path,
33
  use_float16=False,
34
+ device=None
35
  ):
36
  with open(unet_config, 'r') as f:
37
  unet_config = json.load(f)
38
  self.model = UNet2DConditionModel(**unet_config)
39
  self.pe = PositionalEncoding(d_model=384)
40
+ if device != None:
41
+ self.device = device
42
+ else:
43
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
44
  weights = torch.load(model_path) if torch.cuda.is_available() else torch.load(model_path, map_location=self.device)
45
  self.model.load_state_dict(weights)
46
  if use_float16:
musetalk/utils/__init__.py CHANGED
File without changes
musetalk/utils/audio_processor.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import math
3
+ import librosa
4
+ import numpy as np
5
+ import torch
6
+
7
+ from einops import rearrange
8
+ from transformers import AutoFeatureExtractor
9
+
10
+ class AudioProcessor:
11
+ def __init__(self, feature_extractor_path="openai/whisper-tiny/"):
12
+ self.feature_extractor = AutoFeatureExtractor.from_pretrained(feature_extractor_path)
13
+
14
+ def get_audio_feature(self, wav_path, start_index=0):
15
+ if not os.path.exists(wav_path):
16
+ return None
17
+ librosa_output, sampling_rate = librosa.load(wav_path, sr=16000)
18
+ assert sampling_rate == 16000
19
+ # Split audio into 30s segments
20
+ segment_length = 30 * sampling_rate
21
+ segments = [librosa_output[i:i + segment_length] for i in range(0, len(librosa_output), segment_length)]
22
+
23
+ features = []
24
+ for segment in segments:
25
+ audio_feature = self.feature_extractor(
26
+ segment,
27
+ return_tensors="pt",
28
+ sampling_rate=sampling_rate
29
+ ).input_features
30
+ features.append(audio_feature)
31
+
32
+ return features, len(librosa_output)
33
+
34
+ def get_whisper_chunk(
35
+ self,
36
+ whisper_input_features,
37
+ device,
38
+ weight_dtype,
39
+ whisper,
40
+ librosa_length,
41
+ fps=25,
42
+ audio_padding_length_left=2,
43
+ audio_padding_length_right=2,
44
+ ):
45
+ audio_feature_length_per_frame = 2 * (audio_padding_length_left + audio_padding_length_right + 1)
46
+ whisper_feature = []
47
+ # Process multiple 30s mel input features
48
+ for input_feature in whisper_input_features:
49
+ audio_feats = whisper.encoder(input_feature.to(device), output_hidden_states=True).hidden_states
50
+ audio_feats = torch.stack(audio_feats, dim=2).to(weight_dtype)
51
+ whisper_feature.append(audio_feats)
52
+
53
+ whisper_feature = torch.cat(whisper_feature, dim=1)
54
+ # Trim the last segment to remove padding
55
+ sr = 16000
56
+ audio_fps = 50
57
+ fps = int(fps)
58
+ whisper_idx_multiplier = audio_fps / fps
59
+ num_frames = math.floor((librosa_length / sr)) * fps
60
+ actual_length = math.floor((librosa_length / sr)) * audio_fps
61
+ whisper_feature = whisper_feature[:,:actual_length,...]
62
+
63
+ # Calculate padding amount
64
+ padding_nums = math.floor(whisper_idx_multiplier)
65
+ # Add padding at start and end
66
+ whisper_feature = torch.cat([
67
+ torch.zeros_like(whisper_feature[:, :padding_nums * audio_padding_length_left]),
68
+ whisper_feature,
69
+ # Add extra padding to prevent out of bounds
70
+ torch.zeros_like(whisper_feature[:, :padding_nums * 3 * audio_padding_length_right])
71
+ ], 1)
72
+
73
+ audio_prompts = []
74
+ for frame_index in range(num_frames):
75
+ try:
76
+ audio_index = math.floor(frame_index * whisper_idx_multiplier)
77
+ audio_clip = whisper_feature[:, audio_index: audio_index + audio_feature_length_per_frame]
78
+ assert audio_clip.shape[1] == audio_feature_length_per_frame
79
+ audio_prompts.append(audio_clip)
80
+ except Exception as e:
81
+ print(f"Error occurred: {e}")
82
+ print(f"whisper_feature.shape: {whisper_feature.shape}")
83
+ print(f"audio_clip.shape: {audio_clip.shape}")
84
+ print(f"num frames: {num_frames}, fps: {fps}, whisper_idx_multiplier: {whisper_idx_multiplier}")
85
+ print(f"frame_index: {frame_index}, audio_index: {audio_index}-{audio_index + audio_feature_length_per_frame}")
86
+ exit()
87
+
88
+ audio_prompts = torch.cat(audio_prompts, dim=0) # T, 10, 5, 384
89
+ audio_prompts = rearrange(audio_prompts, 'b c h w -> b (c h) w')
90
+ return audio_prompts
91
+
92
+ if __name__ == "__main__":
93
+ audio_processor = AudioProcessor()
94
+ wav_path = "/cfs-workspace/users/gozhong/codes/musetalk_opensource2/data/audio/2.wav"
95
+ audio_feature, librosa_feature_length = audio_processor.get_audio_feature(wav_path)
96
+ print("Audio Feature shape:", audio_feature.shape)
97
+ print("librosa_feature_length:", librosa_feature_length)
98
+
99
+
musetalk/utils/blending.py CHANGED
@@ -2,9 +2,6 @@ from PIL import Image
2
  import numpy as np
3
  import cv2
4
  import copy
5
- from face_parsing import FaceParsing
6
-
7
- fp = FaceParsing()
8
 
9
  def get_crop_box(box, expand):
10
  x, y, x1, y1 = box
@@ -14,46 +11,98 @@ def get_crop_box(box, expand):
14
  crop_box = [x_c-s, y_c-s, x_c+s, y_c+s]
15
  return crop_box, s
16
 
17
- def face_seg(image):
18
- seg_image = fp(image)
 
 
 
 
 
 
 
 
 
19
  if seg_image is None:
20
- print("error, no person_segment")
21
  return None
22
 
23
- seg_image = seg_image.resize(image.size)
24
  return seg_image
25
 
26
- def get_image(image,face,face_box,upper_boundary_ratio = 0.5,expand=1.2):
27
- #print(image.shape)
28
- #print(face.shape)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  body = Image.fromarray(image[:,:,::-1])
31
  face = Image.fromarray(face[:,:,::-1])
32
 
33
- x, y, x1, y1 = face_box
34
- #print(x1-x,y1-y)
35
- crop_box, s = get_crop_box(face_box, expand)
36
  x_s, y_s, x_e, y_e = crop_box
37
- face_position = (x, y)
38
-
39
  face_large = body.crop(crop_box)
40
- ori_shape = face_large.size
41
 
42
- mask_image = face_seg(face_large)
43
- mask_small = mask_image.crop((x-x_s, y-y_s, x1-x_s, y1-y_s))
44
- mask_image = Image.new('L', ori_shape, 0)
45
- mask_image.paste(mask_small, (x-x_s, y-y_s, x1-x_s, y1-y_s))
46
-
47
- # keep upper_boundary_ratio of talking area
48
- width, height = mask_image.size
49
- top_boundary = int(height * upper_boundary_ratio)
50
- modified_mask_image = Image.new('L', ori_shape, 0)
51
- modified_mask_image.paste(mask_image.crop((0, top_boundary, width, height)), (0, top_boundary))
52
-
53
- blur_kernel_size = int(0.1 * ori_shape[0] // 2 * 2) + 1
54
- mask_array = cv2.GaussianBlur(np.array(modified_mask_image), (blur_kernel_size, blur_kernel_size), 0)
55
  mask_image = Image.fromarray(mask_array)
56
-
57
  face_large.paste(face, (x-x_s, y-y_s, x1-x_s, y1-y_s))
58
  body.paste(face_large, crop_box[:2], mask_image)
59
  body = np.array(body)
@@ -84,17 +133,3 @@ def get_image_prepare_material(image,face_box,upper_boundary_ratio = 0.5,expand=
84
  blur_kernel_size = int(0.1 * ori_shape[0] // 2 * 2) + 1
85
  mask_array = cv2.GaussianBlur(np.array(modified_mask_image), (blur_kernel_size, blur_kernel_size), 0)
86
  return mask_array,crop_box
87
-
88
- def get_image_blending(image,face,face_box,mask_array,crop_box):
89
- body = image
90
- x, y, x1, y1 = face_box
91
- x_s, y_s, x_e, y_e = crop_box
92
- face_large = copy.deepcopy(body[y_s:y_e, x_s:x_e])
93
- face_large[y-y_s:y1-y_s, x-x_s:x1-x_s]=face
94
-
95
- mask_image = cv2.cvtColor(mask_array,cv2.COLOR_BGR2GRAY)
96
- mask_image = (mask_image/255).astype(np.float32)
97
-
98
- body[y_s:y_e, x_s:x_e] = cv2.blendLinear(face_large,body[y_s:y_e, x_s:x_e],mask_image,1-mask_image)
99
-
100
- return body
 
2
  import numpy as np
3
  import cv2
4
  import copy
 
 
 
5
 
6
  def get_crop_box(box, expand):
7
  x, y, x1, y1 = box
 
11
  crop_box = [x_c-s, y_c-s, x_c+s, y_c+s]
12
  return crop_box, s
13
 
14
def face_seg(image, mode="jaw", fp=None):
    """Run face parsing on an image and return a face-region mask.

    Args:
        image (PIL.Image): Input image to segment.
        mode (str): Mask-construction mode forwarded to the parser
            (e.g. "jaw", "neck", "raw").
        fp: FaceParsing model instance; must be provided by the caller.

    Returns:
        PIL.Image or None: Mask resized to ``image.size``, or ``None``
        when no person segment was detected.

    Raises:
        ValueError: If ``fp`` was not supplied.
    """
    # Fail fast with a clear message instead of the opaque
    # "'NoneType' object is not callable" the default would produce.
    if fp is None:
        raise ValueError("face_seg requires a FaceParsing instance via 'fp'")

    seg_image = fp(image, mode=mode)  # run the face-parsing model
    if seg_image is None:
        print("error, no person_segment")
        return None

    # Match the mask size to the input image for downstream blending.
    seg_image = seg_image.resize(image.size)
    return seg_image
31
 
32
+
33
def get_image(image, face, face_box, upper_boundary_ratio=0.5, expand=1.5, mode="raw", fp=None):
    """Paste a generated face crop back into the full frame through a feathered mask.

    Args:
        image (numpy.ndarray): Original full frame (BGR).
        face (numpy.ndarray): Generated face crop (BGR).
        face_box (tuple): Face bounding box (x, y, x1, y1) in frame coordinates.
        upper_boundary_ratio (float): Fraction of the crop height excluded from
            the top of the mask, restricting blending to the talking area.
        expand (float): Expansion factor for the square crop around the face.
        mode (str): Mask-construction mode forwarded to the face parser.
        fp: FaceParsing instance used for segmentation.

    Returns:
        numpy.ndarray: Composited full frame (BGR).
    """
    # Convert BGR numpy arrays to RGB PIL images.
    body = Image.fromarray(image[:, :, ::-1])
    face = Image.fromarray(face[:, :, ::-1])

    x, y, x1, y1 = face_box
    crop_box, _ = get_crop_box(face_box, expand)  # expanded square crop box
    x_s, y_s, x_e, y_e = crop_box

    # Crop the expanded face region (leaves margin below the jaw).
    face_large = body.crop(crop_box)
    ori_shape = face_large.size

    # Segment the cropped region to build the blending mask.
    mask_image = face_seg(face_large, mode=mode, fp=fp)
    if mask_image is None:
        # No person segment detected: fall back to a hard (unfeathered)
        # paste of the face rectangle instead of crashing on a None mask.
        face_large.paste(face, (x - x_s, y - y_s, x1 - x_s, y1 - y_s))
        body.paste(face_large, crop_box[:2])
        return np.array(body)[:, :, ::-1]

    # Keep only the mask inside the original face box.
    mask_small = mask_image.crop((x - x_s, y - y_s, x1 - x_s, y1 - y_s))
    mask_image = Image.new('L', ori_shape, 0)
    mask_image.paste(mask_small, (x - x_s, y - y_s, x1 - x_s, y1 - y_s))

    # Keep only the lower part of the mask (the talking area).
    width, height = mask_image.size
    top_boundary = int(height * upper_boundary_ratio)
    modified_mask_image = Image.new('L', ori_shape, 0)
    modified_mask_image.paste(mask_image.crop((0, top_boundary, width, height)), (0, top_boundary))

    # Feather the mask edge with a Gaussian blur (kernel size forced odd).
    blur_kernel_size = int(0.05 * ori_shape[0] // 2 * 2) + 1
    mask_array = cv2.GaussianBlur(np.array(modified_mask_image), (blur_kernel_size, blur_kernel_size), 0)
    mask_image = Image.fromarray(mask_array)

    # Paste the generated face into the expanded crop, then composite the
    # crop back into the full frame through the feathered mask.
    face_large.paste(face, (x - x_s, y - y_s, x1 - x_s, y1 - y_s))
    body.paste(face_large, crop_box[:2], mask_image)

    body = np.array(body)
    return body[:, :, ::-1]  # RGB back to BGR
95
+
96
+ def get_image_blending(image,face,face_box,mask_array,crop_box):
97
  body = Image.fromarray(image[:,:,::-1])
98
  face = Image.fromarray(face[:,:,::-1])
99
 
100
+ x, y, x1, y1 = face_box
 
 
101
  x_s, y_s, x_e, y_e = crop_box
 
 
102
  face_large = body.crop(crop_box)
 
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  mask_image = Image.fromarray(mask_array)
105
+ mask_image = mask_image.convert("L")
106
  face_large.paste(face, (x-x_s, y-y_s, x1-x_s, y1-y_s))
107
  body.paste(face_large, crop_box[:2], mask_image)
108
  body = np.array(body)
 
133
  blur_kernel_size = int(0.1 * ori_shape[0] // 2 * 2) + 1
134
  mask_array = cv2.GaussianBlur(np.array(modified_mask_image), (blur_kernel_size, blur_kernel_size), 0)
135
  return mask_array,crop_box
 
 
 
 
 
 
 
 
 
 
 
 
 
 
musetalk/utils/dwpose/default_runtime.py CHANGED
File without changes
musetalk/utils/dwpose/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py CHANGED
File without changes
musetalk/utils/face_detection/README.md CHANGED
File without changes
musetalk/utils/face_detection/__init__.py CHANGED
File without changes
musetalk/utils/face_detection/api.py CHANGED
File without changes
musetalk/utils/face_detection/detection/__init__.py CHANGED
File without changes
musetalk/utils/face_detection/detection/core.py CHANGED
File without changes
musetalk/utils/face_detection/detection/sfd/__init__.py CHANGED
File without changes
musetalk/utils/face_detection/detection/sfd/bbox.py CHANGED
File without changes
musetalk/utils/face_detection/detection/sfd/detect.py CHANGED
File without changes
musetalk/utils/face_detection/detection/sfd/net_s3fd.py CHANGED
File without changes
musetalk/utils/face_detection/detection/sfd/sfd_detector.py CHANGED
File without changes
musetalk/utils/face_detection/models.py CHANGED
File without changes
musetalk/utils/face_detection/utils.py CHANGED
File without changes
musetalk/utils/face_parsing/__init__.py CHANGED
@@ -8,9 +8,53 @@ from .model import BiSeNet
8
  import torchvision.transforms as transforms
9
 
10
  class FaceParsing():
11
- def __init__(self):
12
  self.net = self.model_init()
13
  self.preprocess = self.image_preprocess()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  def model_init(self,
16
  resnet_path='./models/face-parse-bisent/resnet18-5c106cde.pth',
@@ -30,7 +74,7 @@ class FaceParsing():
30
  transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
31
  ])
32
 
33
- def __call__(self, image, size=(512, 512)):
34
  if isinstance(image, str):
35
  image = Image.open(image)
36
 
@@ -44,8 +88,25 @@ class FaceParsing():
44
  img = torch.unsqueeze(img, 0)
45
  out = self.net(img)[0]
46
  parsing = out.squeeze(0).cpu().numpy().argmax(0)
47
- parsing[np.where(parsing>13)] = 0
48
- parsing[np.where(parsing>=1)] = 255
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  parsing = Image.fromarray(parsing.astype(np.uint8))
50
  return parsing
51
 
 
8
  import torchvision.transforms as transforms
9
 
10
  class FaceParsing():
11
    def __init__(self, left_cheek_width=80, right_cheek_width=80):
        """Set up the parsing network and the morphology kernels used by the
        "jaw" mask mode of ``__call__``.

        Args:
            left_cheek_width (int): Pixels left of the horizontal center
                (at the parser's 512x512 working resolution) that are
                excluded from cheek erosion, protecting the chin column.
            right_cheek_width (int): Same, to the right of center.
        """
        self.net = self.model_init()               # BiSeNet segmentation model
        self.preprocess = self.image_preprocess()  # tensor conversion + normalization
        # Ensure all size parameters are integers
        cone_height = 21
        tail_height = 12
        total_size = cone_height + tail_height

        # Create kernel with explicit integer dimensions.  The result is a
        # downward-widening cone with a vertical stem; "jaw" mode dilates the
        # face region with it to extend the mask downward.
        kernel = np.zeros((total_size, total_size), dtype=np.uint8)
        center_x = total_size // 2  # Ensure center coordinates are integers

        # Cone part: rows widen by 2 px per row, starting from the middle row.
        for row in range(cone_height):
            if row < cone_height//2:
                continue
            width = int(2 * (row - cone_height//2) + 1)
            start = int(center_x - (width // 2))
            end = int(center_x + (width // 2) + 1)
            kernel[row, start:end] = 1

        # Vertical extension part: continue straight down at the cone's
        # final width.
        if cone_height > 0:
            base_width = int(kernel[cone_height-1].sum())
        else:
            base_width = 1

        for row in range(cone_height, total_size):
            start = max(0, int(center_x - (base_width//2)))
            end = min(total_size, int(center_x + (base_width//2) + 1))
            kernel[row, start:end] = 1
        self.kernel = kernel

        # Cheek erosion kernel: a wide, flat ellipse so erosion trims the
        # mask sideways without eating into it vertically.
        self.cheek_kernel = cv2.getStructuringElement(
            cv2.MORPH_ELLIPSE, (35, 3))

        # Cheek area mask (protects the central chin area from erosion).
        self.cheek_mask = self._create_cheek_mask(left_cheek_width=left_cheek_width, right_cheek_width=right_cheek_width)
51
+ def _create_cheek_mask(self, left_cheek_width=80, right_cheek_width=80):
52
+ """Create cheek area mask (1/4 area on both sides)"""
53
+ mask = np.zeros((512, 512), dtype=np.uint8)
54
+ center = 512 // 2
55
+ cv2.rectangle(mask, (0, 0), (center - left_cheek_width, 512), 255, -1) # Left cheek
56
+ cv2.rectangle(mask, (center + right_cheek_width, 0), (512, 512), 255, -1) # Right cheek
57
+ return mask
58
 
59
  def model_init(self,
60
  resnet_path='./models/face-parse-bisent/resnet18-5c106cde.pth',
 
74
  transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
75
  ])
76
 
77
+ def __call__(self, image, size=(512, 512), mode="jaw"):
78
  if isinstance(image, str):
79
  image = Image.open(image)
80
 
 
88
  img = torch.unsqueeze(img, 0)
89
  out = self.net(img)[0]
90
  parsing = out.squeeze(0).cpu().numpy().argmax(0)
91
+
92
+ # Add 14:neck, remove 10:nose and 7:8:9
93
+ if mode == "neck":
94
+ parsing[np.isin(parsing, [1, 11, 12, 13, 14])] = 255
95
+ parsing[np.where(parsing!=255)] = 0
96
+ elif mode == "jaw":
97
+ face_region = np.isin(parsing, [1])*255
98
+ face_region = face_region.astype(np.uint8)
99
+ original_dilated = cv2.dilate(face_region, self.kernel, iterations=1)
100
+ eroded = cv2.erode(original_dilated, self.cheek_kernel, iterations=2)
101
+ face_region = cv2.bitwise_and(eroded, self.cheek_mask)
102
+ face_region = cv2.bitwise_or(face_region, cv2.bitwise_and(original_dilated, ~self.cheek_mask))
103
+ parsing[(face_region==255) & (~np.isin(parsing, [10]))] = 255
104
+ parsing[np.isin(parsing, [11, 12, 13])] = 255
105
+ parsing[np.where(parsing!=255)] = 0
106
+ else:
107
+ parsing[np.isin(parsing, [1, 11, 12, 13])] = 255
108
+ parsing[np.where(parsing!=255)] = 0
109
+
110
  parsing = Image.fromarray(parsing.astype(np.uint8))
111
  return parsing
112
 
musetalk/utils/preprocessing.py CHANGED
File without changes
musetalk/utils/utils.py CHANGED
@@ -15,13 +15,24 @@ from musetalk.whisper.audio2feature import Audio2Feature
15
  from musetalk.models.vae import VAE
16
  from musetalk.models.unet import UNet,PositionalEncoding
17
 
18
- def load_all_model():
19
- audio_processor = Audio2Feature(model_path="./models/whisper/tiny.pt")
20
- vae = VAE(model_path = "./models/sd-vae-ft-mse/")
21
- unet = UNet(unet_config="./models/musetalk/musetalk.json",
22
- model_path ="./models/musetalk/pytorch_model.bin")
 
 
 
 
 
 
 
 
 
 
 
23
  pe = PositionalEncoding(d_model=384)
24
- return audio_processor,vae,unet,pe
25
 
26
  def get_file_type(video_path):
27
  _, ext = os.path.splitext(video_path)
@@ -39,10 +50,13 @@ def get_video_fps(video_path):
39
  video.release()
40
  return fps
41
 
42
- def datagen(whisper_chunks,
43
- vae_encode_latents,
44
- batch_size=8,
45
- delay_frame=0):
 
 
 
46
  whisper_batch, latent_batch = [], []
47
  for i, w in enumerate(whisper_chunks):
48
  idx = (i+delay_frame)%len(vae_encode_latents)
@@ -51,14 +65,14 @@ def datagen(whisper_chunks,
51
  latent_batch.append(latent)
52
 
53
  if len(latent_batch) >= batch_size:
54
- whisper_batch = np.stack(whisper_batch)
55
  latent_batch = torch.cat(latent_batch, dim=0)
56
  yield whisper_batch, latent_batch
57
- whisper_batch, latent_batch = [], []
58
 
59
  # the last batch may smaller than batch size
60
  if len(latent_batch) > 0:
61
- whisper_batch = np.stack(whisper_batch)
62
  latent_batch = torch.cat(latent_batch, dim=0)
63
 
64
- yield whisper_batch, latent_batch
 
15
  from musetalk.models.vae import VAE
16
  from musetalk.models.unet import UNet,PositionalEncoding
17
 
18
+
19
def load_all_model(
    unet_model_path="./models/musetalk/pytorch_model.bin",
    vae_type="sd-vae-ft-mse",
    unet_config="./models/musetalk/musetalk.json",
    device=None,
):
    """Build and return the three inference modules.

    Args:
        unet_model_path (str): Path to the UNet checkpoint to load.
        vae_type (str): Subdirectory name under ./models/ holding the VAE weights.
        unet_config (str): Path to the UNet JSON config file.
        device: Optional device forwarded to the UNet loader.

    Returns:
        tuple: (vae, unet, pe) — the VAE wrapper, the UNet wrapper, and a
        positional-encoding module with d_model=384.
    """
    vae_wrapper = VAE(model_path=f"./models/{vae_type}/")
    print(f"load unet model from {unet_model_path}")
    unet_wrapper = UNet(unet_config=unet_config, model_path=unet_model_path, device=device)
    positional_encoder = PositionalEncoding(d_model=384)
    return vae_wrapper, unet_wrapper, positional_encoder
36
 
37
  def get_file_type(video_path):
38
  _, ext = os.path.splitext(video_path)
 
50
  video.release()
51
  return fps
52
 
53
+ def datagen(
54
+ whisper_chunks,
55
+ vae_encode_latents,
56
+ batch_size=8,
57
+ delay_frame=0,
58
+ device="cuda:0",
59
+ ):
60
  whisper_batch, latent_batch = [], []
61
  for i, w in enumerate(whisper_chunks):
62
  idx = (i+delay_frame)%len(vae_encode_latents)
 
65
  latent_batch.append(latent)
66
 
67
  if len(latent_batch) >= batch_size:
68
+ whisper_batch = torch.stack(whisper_batch)
69
  latent_batch = torch.cat(latent_batch, dim=0)
70
  yield whisper_batch, latent_batch
71
+ whisper_batch, latent_batch = [], []
72
 
73
  # the last batch may smaller than batch size
74
  if len(latent_batch) > 0:
75
+ whisper_batch = torch.stack(whisper_batch)
76
  latent_batch = torch.cat(latent_batch, dim=0)
77
 
78
+ yield whisper_batch.to(device), latent_batch.to(device)
musetalk/whisper/audio2feature.py CHANGED
File without changes
musetalk/whisper/whisper/__init__.py CHANGED
File without changes
musetalk/whisper/whisper/__main__.py CHANGED
File without changes
musetalk/whisper/whisper/assets/gpt2/merges.txt CHANGED
File without changes
musetalk/whisper/whisper/assets/gpt2/special_tokens_map.json CHANGED
File without changes
musetalk/whisper/whisper/assets/gpt2/tokenizer_config.json CHANGED
File without changes
musetalk/whisper/whisper/assets/gpt2/vocab.json CHANGED
File without changes
musetalk/whisper/whisper/assets/multilingual/added_tokens.json CHANGED
File without changes
musetalk/whisper/whisper/assets/multilingual/merges.txt CHANGED
File without changes
musetalk/whisper/whisper/assets/multilingual/special_tokens_map.json CHANGED
File without changes
musetalk/whisper/whisper/assets/multilingual/tokenizer_config.json CHANGED
File without changes
musetalk/whisper/whisper/assets/multilingual/vocab.json CHANGED
File without changes
musetalk/whisper/whisper/audio.py CHANGED
File without changes
musetalk/whisper/whisper/decoding.py CHANGED
File without changes
musetalk/whisper/whisper/model.py CHANGED
File without changes
musetalk/whisper/whisper/normalizers/__init__.py CHANGED
File without changes
musetalk/whisper/whisper/normalizers/basic.py CHANGED
File without changes
musetalk/whisper/whisper/normalizers/english.json CHANGED
File without changes
musetalk/whisper/whisper/normalizers/english.py CHANGED
File without changes
musetalk/whisper/whisper/tokenizer.py CHANGED
File without changes
musetalk/whisper/whisper/transcribe.py CHANGED
File without changes
musetalk/whisper/whisper/utils.py CHANGED
File without changes
requirements.txt CHANGED
@@ -9,6 +9,7 @@ tensorboard==2.12.0
9
  opencv-python==4.9.0.80
10
  soundfile==0.12.1
11
  transformers==4.39.2
 
12
 
13
  gdown
14
  requests
 
9
  opencv-python==4.9.0.80
10
  soundfile==0.12.1
11
  transformers==4.39.2
12
+ huggingface_hub==0.25.0
13
 
14
  gdown
15
  requests
scripts/inference_alpha.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import math
4
+ import copy
5
+ import torch
6
+ import glob
7
+ import shutil
8
+ import pickle
9
+ import argparse
10
+ import subprocess
11
+ import numpy as np
12
+ from tqdm import tqdm
13
+ from omegaconf import OmegaConf
14
+ from transformers import WhisperModel
15
+
16
+ from musetalk.utils.blending import get_image
17
+ from musetalk.utils.face_parsing import FaceParsing
18
+ from musetalk.utils.audio_processor import AudioProcessor
19
+ from musetalk.utils.utils import get_file_type, get_video_fps, datagen, load_all_model
20
+ from musetalk.utils.preprocessing import get_landmark_and_bbox, read_imgs, coord_placeholder
21
+
22
+
23
@torch.no_grad()
def main(args):
    """Run MuseTalk lip-sync inference for every task in the inference config.

    For each (video, audio) task: extracts frames, computes Whisper audio
    features, encodes face crops to VAE latents, runs the UNet to generate
    lip-synced faces, blends them back into the original frames, and muxes
    the result with ffmpeg.  A failure in one task is logged and does not
    abort the remaining tasks.
    """
    # Configure ffmpeg path
    if args.ffmpeg_path not in os.getenv('PATH'):
        print("Adding ffmpeg to PATH")
        os.environ["PATH"] = f"{args.ffmpeg_path}:{os.environ['PATH']}"

    # Set computing device
    device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")

    # Load model weights
    vae, unet, pe = load_all_model(
        unet_model_path=args.unet_model_path,
        vae_type=args.vae_type,
        unet_config=args.unet_config,
        device=device
    )
    timesteps = torch.tensor([0], device=device)

    # Convert models to half precision if float16 is enabled
    if args.use_float16:
        pe = pe.half()
        vae.vae = vae.vae.half()
        unet.model = unet.model.half()

    # Move models to specified device
    pe = pe.to(device)
    vae.vae = vae.vae.to(device)
    unet.model = unet.model.to(device)

    # Initialize audio processor and Whisper model
    audio_processor = AudioProcessor(feature_extractor_path=args.whisper_dir)
    weight_dtype = unet.model.dtype
    whisper = WhisperModel.from_pretrained(args.whisper_dir)
    whisper = whisper.to(device=device, dtype=weight_dtype).eval()
    whisper.requires_grad_(False)

    # Initialize face parser
    fp = FaceParsing(left_cheek_width=args.left_cheek_width, right_cheek_width=args.right_cheek_width)

    # Load inference configuration
    inference_config = OmegaConf.load(args.inference_config)
    print("Loaded inference config:", inference_config)

    # Process each task
    for task_id in inference_config:
        try:
            # Get task configuration
            video_path = inference_config[task_id]["video_path"]
            audio_path = inference_config[task_id]["audio_path"]
            # Keep the per-task result name in a local instead of mutating
            # args.output_vid_name, so a task without "result_name" cannot
            # inherit the name set by an earlier task.
            result_name = inference_config[task_id].get("result_name", args.output_vid_name)
            bbox_shift = inference_config[task_id].get("bbox_shift", args.bbox_shift)

            # Set output paths
            input_basename = os.path.basename(video_path).split('.')[0]
            audio_basename = os.path.basename(audio_path).split('.')[0]
            output_basename = f"{input_basename}_{audio_basename}"

            # Create temporary directories
            temp_dir = os.path.join(args.result_dir, "frames_result")
            os.makedirs(temp_dir, exist_ok=True)

            # Set result save paths
            result_img_save_path = os.path.join(temp_dir, output_basename)  # related to video & audio inputs
            crop_coord_save_path = os.path.join(args.result_dir, "../", input_basename+".pkl")  # only related to video input
            os.makedirs(result_img_save_path, exist_ok=True)
            # Set output video paths
            if result_name is None:
                output_vid_name = os.path.join(temp_dir, output_basename + ".mp4")
            else:
                output_vid_name = os.path.join(temp_dir, result_name)
            output_vid_name_concat = os.path.join(temp_dir, output_basename + "_concat.mp4")

            # Skip if output file already exists
            if os.path.exists(output_vid_name):
                print(f"{output_vid_name} already exists, skipping!")
                continue

            # Extract frames from source video.  save_dir_full is only
            # created on the video branch; remember that for cleanup below.
            save_dir_full = None
            if get_file_type(video_path) == "video":
                save_dir_full = os.path.join(temp_dir, input_basename)
                os.makedirs(save_dir_full, exist_ok=True)
                cmd = f"ffmpeg -v fatal -i {video_path} -start_number 0 {save_dir_full}/%08d.png"
                os.system(cmd)
                input_img_list = sorted(glob.glob(os.path.join(save_dir_full, '*.[jpJP][pnPN]*[gG]')))
                fps = get_video_fps(video_path)
            elif get_file_type(video_path) == "image":
                input_img_list = [video_path]
                fps = args.fps
            elif os.path.isdir(video_path):
                input_img_list = glob.glob(os.path.join(video_path, '*.[jpJP][pnPN]*[gG]'))
                input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
                fps = args.fps
            else:
                raise ValueError(f"{video_path} should be a video file, an image file or a directory of images")

            # Extract audio features
            whisper_input_features, librosa_length = audio_processor.get_audio_feature(audio_path)
            whisper_chunks = audio_processor.get_whisper_chunk(
                whisper_input_features,
                device,
                weight_dtype,
                whisper,
                librosa_length,
                fps=fps,
                audio_padding_length_left=args.audio_padding_length_left,
                audio_padding_length_right=args.audio_padding_length_right,
            )

            # Preprocess input images
            if os.path.exists(crop_coord_save_path) and args.use_saved_coord:
                print("Using saved coordinates")
                with open(crop_coord_save_path, 'rb') as f:
                    coord_list = pickle.load(f)
                frame_list = read_imgs(input_img_list)
            else:
                print("Extracting landmarks... time-consuming operation")
                coord_list, frame_list = get_landmark_and_bbox(input_img_list, bbox_shift)
                with open(crop_coord_save_path, 'wb') as f:
                    pickle.dump(coord_list, f)

            print(f"Number of frames: {len(frame_list)}")

            # Encode each valid face crop to VAE latents
            input_latent_list = []
            for bbox, frame in zip(coord_list, frame_list):
                if bbox == coord_placeholder:
                    continue
                x1, y1, x2, y2 = bbox
                y2 = y2 + args.extra_margin
                y2 = min(y2, frame.shape[0])
                crop_frame = frame[y1:y2, x1:x2]
                crop_frame = cv2.resize(crop_frame, (256,256), interpolation=cv2.INTER_LANCZOS4)
                latents = vae.get_latents_for_unet(crop_frame)
                input_latent_list.append(latents)

            # Smooth first and last frames
            frame_list_cycle = frame_list + frame_list[::-1]
            coord_list_cycle = coord_list + coord_list[::-1]
            input_latent_list_cycle = input_latent_list + input_latent_list[::-1]

            # Batch inference
            print("Starting inference")
            video_num = len(whisper_chunks)
            batch_size = args.batch_size
            gen = datagen(
                whisper_chunks=whisper_chunks,
                vae_encode_latents=input_latent_list_cycle,
                batch_size=batch_size,
                delay_frame=0,
                device=device,
            )

            res_frame_list = []
            total = int(np.ceil(float(video_num) / batch_size))

            # Execute inference
            for i, (whisper_batch, latent_batch) in enumerate(tqdm(gen, total=total)):
                audio_feature_batch = pe(whisper_batch)
                latent_batch = latent_batch.to(dtype=unet.model.dtype)

                pred_latents = unet.model(latent_batch, timesteps, encoder_hidden_states=audio_feature_batch).sample
                recon = vae.decode_latents(pred_latents)
                for res_frame in recon:
                    res_frame_list.append(res_frame)

            # Pad generated images to original video size
            print("Padding generated images to original video size")
            for i, res_frame in enumerate(tqdm(res_frame_list)):
                bbox = coord_list_cycle[i%(len(coord_list_cycle))]
                ori_frame = copy.deepcopy(frame_list_cycle[i%(len(frame_list_cycle))])
                x1, y1, x2, y2 = bbox
                y2 = y2 + args.extra_margin
                # Clamp against the frame actually being composited
                # (was: stale `frame` left over from the encoding loop).
                y2 = min(y2, ori_frame.shape[0])
                try:
                    res_frame = cv2.resize(res_frame.astype(np.uint8), (x2-x1, y2-y1))
                except Exception:
                    # Degenerate bbox (zero/negative size); skip this frame.
                    continue

                # Merge results
                combine_frame = get_image(ori_frame, res_frame, [x1, y1, x2, y2], mode=args.parsing_mode, fp=fp)
                cv2.imwrite(f"{result_img_save_path}/{str(i).zfill(8)}.png", combine_frame)

            # Save prediction results
            temp_vid_path = f"{temp_dir}/temp_{input_basename}_{audio_basename}.mp4"
            cmd_img2video = f"ffmpeg -y -v warning -r {fps} -f image2 -i {result_img_save_path}/%08d.png -vcodec libx264 -vf format=yuv420p -crf 18 {temp_vid_path}"
            print("Video generation command:", cmd_img2video)
            os.system(cmd_img2video)

            cmd_combine_audio = f"ffmpeg -y -v warning -i {audio_path} -i {temp_vid_path} {output_vid_name}"
            print("Audio combination command:", cmd_combine_audio)
            os.system(cmd_combine_audio)

            # Clean up temporary files
            shutil.rmtree(result_img_save_path)
            os.remove(temp_vid_path)

            # Frames directory only exists for video inputs (the previous
            # unconditional rmtree raised NameError for image/dir inputs).
            if save_dir_full is not None:
                shutil.rmtree(save_dir_full)
            if not args.saved_coord:
                os.remove(crop_coord_save_path)

            print(f"Results saved to {output_vid_name}")
        except Exception as e:
            print("Error occurred during processing:", e)
228
+
229
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # --- Environment / hardware ---
    parser.add_argument("--ffmpeg_path", type=str, default="/cfs-workspace/users/gozhong/ffmpeg-4.4-amd64-static/", help="Path to ffmpeg executable")
    parser.add_argument("--gpu_id", type=int, default=0, help="GPU ID to use")
    # --- Model paths and configuration ---
    parser.add_argument("--vae_type", type=str, default="sd-vae", help="Type of VAE model")
    parser.add_argument("--unet_config", type=str, default="./models/musetalk/config.json", help="Path to UNet configuration file")
    parser.add_argument("--unet_model_path", type=str, default="/cfs-datasets/users/gozhong/codes/musetalk_exp/exp_out/stage1_bs40/unet-20000.pth", help="Path to UNet model weights")
    parser.add_argument("--whisper_dir", type=str, default="/cfs-datasets/public_models/whisper-tiny", help="Directory containing Whisper model")
    # --- Task input / output ---
    parser.add_argument("--inference_config", type=str, default="configs/inference/test_img.yaml", help="Path to inference configuration file")
    parser.add_argument("--bbox_shift", type=int, default=0, help="Bounding box shift value")
    parser.add_argument("--result_dir", default='./results', help="Directory for output results")
    parser.add_argument("--extra_margin", type=int, default=10, help="Extra margin for face cropping")
    parser.add_argument("--fps", type=int, default=25, help="Video frames per second")
    # --- Audio feature windowing ---
    parser.add_argument("--audio_padding_length_left", type=int, default=2, help="Left padding length for audio")
    parser.add_argument("--audio_padding_length_right", type=int, default=2, help="Right padding length for audio")
    # --- Inference behavior ---
    parser.add_argument("--batch_size", type=int, default=8, help="Batch size for inference")
    parser.add_argument("--output_vid_name", type=str, default=None, help="Name of output video file")
    parser.add_argument("--use_saved_coord", action="store_true", help='Use saved coordinates to save time')
    parser.add_argument("--saved_coord", action="store_true", help='Save coordinates for future use')
    parser.add_argument("--use_float16", action="store_true", help="Use float16 for faster inference")
    # --- Face blending / parsing ---
    parser.add_argument("--parsing_mode", default='jaw', help="Face blending parsing mode")
    parser.add_argument("--left_cheek_width", type=int, default=90, help="Width of left cheek region")
    parser.add_argument("--right_cheek_width", type=int, default=90, help="Width of right cheek region")
    args = parser.parse_args()
    main(args)