<enhance>(inference): support using an image as video input(#17 #34)
Browse files- README.md +5 -5
- scripts/inference.py +12 -7
- scripts/realtime_inference.py +3 -3
README.md
CHANGED
|
@@ -244,7 +244,7 @@ Here, we provide the inference script.
|
|
| 244 |
python -m scripts.inference --inference_config configs/inference/test.yaml
|
| 245 |
```
|
| 246 |
configs/inference/test.yaml is the path to the inference configuration file, including video_path and audio_path.
|
| 247 |
-
The video_path should be either a video file or a directory of images.
|
| 248 |
|
| 249 |
You are recommended to input video with `25fps`, the same fps used when training the model. If your video is far less than 25fps, you are recommended to apply frame interpolation or directly convert the video to 25fps using ffmpeg.
|
| 250 |
|
|
@@ -276,12 +276,12 @@ configs/inference/realtime.yaml is the path to the real-time inference configura
|
|
| 276 |
```
|
| 277 |
Inferring using: data/audio/yongen.wav
|
| 278 |
```
|
| 279 |
-
1. While MuseTalk is inferring, sub-threads can simultaneously stream the results to the users. The generation process can achieve
|
| 280 |
```
|
| 281 |
2%|βββ | 3/141 [00:00<00:32, 4.30it/s] # inference process
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
3%|ββββ | 4/141 [00:00<00:32, 4.21it/s]
|
| 286 |
```
|
| 287 |
1. Set `preparation` to `False` and run this script if you want to generate more videos using the same avatar.
|
|
|
|
| 244 |
python -m scripts.inference --inference_config configs/inference/test.yaml
|
| 245 |
```
|
| 246 |
configs/inference/test.yaml is the path to the inference configuration file, including video_path and audio_path.
|
| 247 |
+
The video_path should be either a video file, an image file or a directory of images.
|
| 248 |
|
| 249 |
You are recommended to input video with `25fps`, the same fps used when training the model. If your video is far less than 25fps, you are recommended to apply frame interpolation or directly convert the video to 25fps using ffmpeg.
|
| 250 |
|
|
|
|
| 276 |
```
|
| 277 |
Inferring using: data/audio/yongen.wav
|
| 278 |
```
|
| 279 |
+
1. While MuseTalk is inferring, sub-threads can simultaneously stream the results to the users. The generation process can achieve 30fps+ on an NVIDIA Tesla V100.
|
| 280 |
```
|
| 281 |
2%|βββ | 3/141 [00:00<00:32, 4.30it/s] # inference process
|
| 282 |
+
Displaying the 6-th frame with FPS: 48.58 # display process
|
| 283 |
+
Displaying the 7-th frame with FPS: 48.74
|
| 284 |
+
Displaying the 8-th frame with FPS: 49.17
|
| 285 |
3%|ββββ | 4/141 [00:00<00:32, 4.21it/s]
|
| 286 |
```
|
| 287 |
1. Set `preparation` to `False` and run this script if you want to generate more videos using the same avatar.
|
scripts/inference.py
CHANGED
|
@@ -36,7 +36,7 @@ def main(args):
|
|
| 36 |
crop_coord_save_path = os.path.join(result_img_save_path, input_basename+".pkl") # only related to video input
|
| 37 |
os.makedirs(result_img_save_path,exist_ok =True)
|
| 38 |
|
| 39 |
-
if args.output_vid_name
|
| 40 |
output_vid_name = os.path.join(args.result_dir, output_basename+".mp4")
|
| 41 |
else:
|
| 42 |
output_vid_name = os.path.join(args.result_dir, args.output_vid_name)
|
|
@@ -48,10 +48,16 @@ def main(args):
|
|
| 48 |
os.system(cmd)
|
| 49 |
input_img_list = sorted(glob.glob(os.path.join(save_dir_full, '*.[jpJP][pnPN]*[gG]')))
|
| 50 |
fps = get_video_fps(video_path)
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
| 52 |
input_img_list = glob.glob(os.path.join(video_path, '*.[jpJP][pnPN]*[gG]'))
|
| 53 |
input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
|
| 54 |
fps = args.fps
|
|
|
|
|
|
|
|
|
|
| 55 |
#print(input_img_list)
|
| 56 |
############################################## extract audio feature ##############################################
|
| 57 |
whisper_feature = audio_processor.audio2feat(audio_path)
|
|
@@ -114,12 +120,12 @@ def main(args):
|
|
| 114 |
|
| 115 |
combine_frame = get_image(ori_frame,res_frame,bbox)
|
| 116 |
cv2.imwrite(f"{result_img_save_path}/{str(i).zfill(8)}.png",combine_frame)
|
| 117 |
-
|
| 118 |
-
cmd_img2video = f"ffmpeg -y -v
|
| 119 |
print(cmd_img2video)
|
| 120 |
os.system(cmd_img2video)
|
| 121 |
|
| 122 |
-
cmd_combine_audio = f"ffmpeg -y -v
|
| 123 |
print(cmd_combine_audio)
|
| 124 |
os.system(cmd_combine_audio)
|
| 125 |
|
|
@@ -135,7 +141,7 @@ if __name__ == "__main__":
|
|
| 135 |
|
| 136 |
parser.add_argument("--fps", type=int, default=25)
|
| 137 |
parser.add_argument("--batch_size", type=int, default=8)
|
| 138 |
-
parser.add_argument("--output_vid_name", type=str,default=
|
| 139 |
parser.add_argument("--use_saved_coord",
|
| 140 |
action="store_true",
|
| 141 |
help='use saved coordinate to save time')
|
|
@@ -143,4 +149,3 @@ if __name__ == "__main__":
|
|
| 143 |
|
| 144 |
args = parser.parse_args()
|
| 145 |
main(args)
|
| 146 |
-
|
|
|
|
| 36 |
crop_coord_save_path = os.path.join(result_img_save_path, input_basename+".pkl") # only related to video input
|
| 37 |
os.makedirs(result_img_save_path,exist_ok =True)
|
| 38 |
|
| 39 |
+
if args.output_vid_name is None:
|
| 40 |
output_vid_name = os.path.join(args.result_dir, output_basename+".mp4")
|
| 41 |
else:
|
| 42 |
output_vid_name = os.path.join(args.result_dir, args.output_vid_name)
|
|
|
|
| 48 |
os.system(cmd)
|
| 49 |
input_img_list = sorted(glob.glob(os.path.join(save_dir_full, '*.[jpJP][pnPN]*[gG]')))
|
| 50 |
fps = get_video_fps(video_path)
|
| 51 |
+
elif get_file_type(video_path)=="image":
|
| 52 |
+
input_img_list = [video_path, ]
|
| 53 |
+
fps = args.fps
|
| 54 |
+
elif os.path.isdir(video_path): # input img folder
|
| 55 |
input_img_list = glob.glob(os.path.join(video_path, '*.[jpJP][pnPN]*[gG]'))
|
| 56 |
input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
|
| 57 |
fps = args.fps
|
| 58 |
+
else:
|
| 59 |
+
raise ValueError(f"{video_path} should be a video file, an image file or a directory of images")
|
| 60 |
+
|
| 61 |
#print(input_img_list)
|
| 62 |
############################################## extract audio feature ##############################################
|
| 63 |
whisper_feature = audio_processor.audio2feat(audio_path)
|
|
|
|
| 120 |
|
| 121 |
combine_frame = get_image(ori_frame,res_frame,bbox)
|
| 122 |
cv2.imwrite(f"{result_img_save_path}/{str(i).zfill(8)}.png",combine_frame)
|
| 123 |
+
|
| 124 |
+
cmd_img2video = f"ffmpeg -y -v warning -r {fps} -f image2 -i {result_img_save_path}/%08d.png -vcodec libx264 -vf format=rgb24,scale=out_color_matrix=bt709,format=yuv420p -crf 18 temp.mp4"
|
| 125 |
print(cmd_img2video)
|
| 126 |
os.system(cmd_img2video)
|
| 127 |
|
| 128 |
+
cmd_combine_audio = f"ffmpeg -y -v warning -i {audio_path} -i temp.mp4 {output_vid_name}"
|
| 129 |
print(cmd_combine_audio)
|
| 130 |
os.system(cmd_combine_audio)
|
| 131 |
|
|
|
|
| 141 |
|
| 142 |
parser.add_argument("--fps", type=int, default=25)
|
| 143 |
parser.add_argument("--batch_size", type=int, default=8)
|
| 144 |
+
parser.add_argument("--output_vid_name", type=str, default=None)
|
| 145 |
parser.add_argument("--use_saved_coord",
|
| 146 |
action="store_true",
|
| 147 |
help='use saved coordinate to save time')
|
|
|
|
| 149 |
|
| 150 |
args = parser.parse_args()
|
| 151 |
main(args)
|
|
|
scripts/realtime_inference.py
CHANGED
|
@@ -206,7 +206,7 @@ class Avatar:
|
|
| 206 |
combine_frame = get_image_blending(ori_frame,res_frame,bbox,mask,mask_crop_box)
|
| 207 |
|
| 208 |
fps = 1/(time.time()-start+1e-6)
|
| 209 |
-
print(f"
|
| 210 |
cv2.imwrite(f"{self.avatar_path}/tmp/{str(self.idx).zfill(8)}.png",combine_frame)
|
| 211 |
self.idx = self.idx + 1
|
| 212 |
|
|
@@ -244,12 +244,12 @@ class Avatar:
|
|
| 244 |
|
| 245 |
if out_vid_name is not None:
|
| 246 |
# optional
|
| 247 |
-
cmd_img2video = f"ffmpeg -y -v
|
| 248 |
print(cmd_img2video)
|
| 249 |
os.system(cmd_img2video)
|
| 250 |
|
| 251 |
output_vid = os.path.join(self.video_out_path, out_vid_name+".mp4") # on
|
| 252 |
-
cmd_combine_audio = f"ffmpeg -y -v
|
| 253 |
print(cmd_combine_audio)
|
| 254 |
os.system(cmd_combine_audio)
|
| 255 |
|
|
|
|
| 206 |
combine_frame = get_image_blending(ori_frame,res_frame,bbox,mask,mask_crop_box)
|
| 207 |
|
| 208 |
fps = 1/(time.time()-start+1e-6)
|
| 209 |
+
print(f"Displaying the {self.idx}-th frame with FPS: {fps:.2f}")
|
| 210 |
cv2.imwrite(f"{self.avatar_path}/tmp/{str(self.idx).zfill(8)}.png",combine_frame)
|
| 211 |
self.idx = self.idx + 1
|
| 212 |
|
|
|
|
| 244 |
|
| 245 |
if out_vid_name is not None:
|
| 246 |
# optional
|
| 247 |
+
cmd_img2video = f"ffmpeg -y -v warning -r {fps} -f image2 -i {self.avatar_path}/tmp/%08d.png -vcodec libx264 -vf format=rgb24,scale=out_color_matrix=bt709,format=yuv420p -crf 18 {self.avatar_path}/temp.mp4"
|
| 248 |
print(cmd_img2video)
|
| 249 |
os.system(cmd_img2video)
|
| 250 |
|
| 251 |
output_vid = os.path.join(self.video_out_path, out_vid_name+".mp4") # on
|
| 252 |
+
cmd_combine_audio = f"ffmpeg -y -v warning -i {audio_path} -i {self.avatar_path}/temp.mp4 {output_vid}"
|
| 253 |
print(cmd_combine_audio)
|
| 254 |
os.system(cmd_combine_audio)
|
| 255 |
|