czk32611 committed on
Commit
41bcdc8
Β·
1 Parent(s): 9e59599

<enhance>(inference): support using an image as video input (#17, #34)

Browse files
README.md CHANGED
@@ -244,7 +244,7 @@ Here, we provide the inference script.
244
  python -m scripts.inference --inference_config configs/inference/test.yaml
245
  ```
246
  configs/inference/test.yaml is the path to the inference configuration file, including video_path and audio_path.
247
- The video_path should be either a video file or a directory of images.
248
 
249
  You are recommended to input video with `25fps`, the same fps used when training the model. If your video is far less than 25fps, you are recommended to apply frame interpolation or directly convert the video to 25fps using ffmpeg.
250
 
@@ -276,12 +276,12 @@ configs/inference/realtime.yaml is the path to the real-time inference configura
276
  ```
277
  Inferring using: data/audio/yongen.wav
278
  ```
279
- 1. While MuseTalk is inferring, sub-threads can simultaneously stream the results to the users. The generation process can achieve up to 50fps on an NVIDIA Tesla V100.
280
  ```
281
  2%|β–ˆβ–ˆβ– | 3/141 [00:00<00:32, 4.30it/s] # inference process
282
- Generating the 6-th frame with FPS: 48.58 # playing process
283
- Generating the 7-th frame with FPS: 48.74
284
- Generating the 8-th frame with FPS: 49.17
285
  3%|β–ˆβ–ˆβ–ˆβ–Ž | 4/141 [00:00<00:32, 4.21it/s]
286
  ```
287
 1. Set `preparation` to `False` and run this script if you want to generate more videos using the same avatar.
 
244
  python -m scripts.inference --inference_config configs/inference/test.yaml
245
  ```
246
  configs/inference/test.yaml is the path to the inference configuration file, including video_path and audio_path.
247
+ The video_path should be either a video file, an image file or a directory of images.
248
 
249
  You are recommended to input video with `25fps`, the same fps used when training the model. If your video is far less than 25fps, you are recommended to apply frame interpolation or directly convert the video to 25fps using ffmpeg.
250
 
 
276
  ```
277
  Inferring using: data/audio/yongen.wav
278
  ```
279
+ 1. While MuseTalk is inferring, sub-threads can simultaneously stream the results to the users. The generation process can achieve 30fps+ on an NVIDIA Tesla V100.
280
  ```
281
  2%|β–ˆβ–ˆβ– | 3/141 [00:00<00:32, 4.30it/s] # inference process
282
+ Displaying the 6-th frame with FPS: 48.58 # display process
283
+ Displaying the 7-th frame with FPS: 48.74
284
+ Displaying the 8-th frame with FPS: 49.17
285
  3%|β–ˆβ–ˆβ–ˆβ–Ž | 4/141 [00:00<00:32, 4.21it/s]
286
  ```
287
 1. Set `preparation` to `False` and run this script if you want to generate more videos using the same avatar.
scripts/inference.py CHANGED
@@ -36,7 +36,7 @@ def main(args):
36
  crop_coord_save_path = os.path.join(result_img_save_path, input_basename+".pkl") # only related to video input
37
  os.makedirs(result_img_save_path,exist_ok =True)
38
 
39
- if args.output_vid_name=="":
40
  output_vid_name = os.path.join(args.result_dir, output_basename+".mp4")
41
  else:
42
  output_vid_name = os.path.join(args.result_dir, args.output_vid_name)
@@ -48,10 +48,16 @@ def main(args):
48
  os.system(cmd)
49
  input_img_list = sorted(glob.glob(os.path.join(save_dir_full, '*.[jpJP][pnPN]*[gG]')))
50
  fps = get_video_fps(video_path)
51
- else: # input img folder
 
 
 
52
  input_img_list = glob.glob(os.path.join(video_path, '*.[jpJP][pnPN]*[gG]'))
53
  input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
54
  fps = args.fps
 
 
 
55
  #print(input_img_list)
56
  ############################################## extract audio feature ##############################################
57
  whisper_feature = audio_processor.audio2feat(audio_path)
@@ -114,12 +120,12 @@ def main(args):
114
 
115
  combine_frame = get_image(ori_frame,res_frame,bbox)
116
  cv2.imwrite(f"{result_img_save_path}/{str(i).zfill(8)}.png",combine_frame)
117
-
118
- cmd_img2video = f"ffmpeg -y -v fatal -r {fps} -f image2 -i {result_img_save_path}/%08d.png -vcodec libx264 -vf format=rgb24,scale=out_color_matrix=bt709,format=yuv420p -crf 18 temp.mp4"
119
  print(cmd_img2video)
120
  os.system(cmd_img2video)
121
 
122
- cmd_combine_audio = f"ffmpeg -y -v fatal -i {audio_path} -i temp.mp4 {output_vid_name}"
123
  print(cmd_combine_audio)
124
  os.system(cmd_combine_audio)
125
 
@@ -135,7 +141,7 @@ if __name__ == "__main__":
135
 
136
  parser.add_argument("--fps", type=int, default=25)
137
  parser.add_argument("--batch_size", type=int, default=8)
138
- parser.add_argument("--output_vid_name", type=str,default='')
139
  parser.add_argument("--use_saved_coord",
140
  action="store_true",
141
  help='use saved coordinate to save time')
@@ -143,4 +149,3 @@ if __name__ == "__main__":
143
 
144
  args = parser.parse_args()
145
  main(args)
146
-
 
36
  crop_coord_save_path = os.path.join(result_img_save_path, input_basename+".pkl") # only related to video input
37
  os.makedirs(result_img_save_path,exist_ok =True)
38
 
39
+ if args.output_vid_name is None:
40
  output_vid_name = os.path.join(args.result_dir, output_basename+".mp4")
41
  else:
42
  output_vid_name = os.path.join(args.result_dir, args.output_vid_name)
 
48
  os.system(cmd)
49
  input_img_list = sorted(glob.glob(os.path.join(save_dir_full, '*.[jpJP][pnPN]*[gG]')))
50
  fps = get_video_fps(video_path)
51
+ elif get_file_type(video_path)=="image":
52
+ input_img_list = [video_path, ]
53
+ fps = args.fps
54
+ elif os.path.isdir(video_path): # input img folder
55
  input_img_list = glob.glob(os.path.join(video_path, '*.[jpJP][pnPN]*[gG]'))
56
  input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
57
  fps = args.fps
58
+ else:
59
+ raise ValueError(f"{video_path} should be a video file, an image file or a directory of images")
60
+
61
  #print(input_img_list)
62
  ############################################## extract audio feature ##############################################
63
  whisper_feature = audio_processor.audio2feat(audio_path)
 
120
 
121
  combine_frame = get_image(ori_frame,res_frame,bbox)
122
  cv2.imwrite(f"{result_img_save_path}/{str(i).zfill(8)}.png",combine_frame)
123
+
124
+ cmd_img2video = f"ffmpeg -y -v warning -r {fps} -f image2 -i {result_img_save_path}/%08d.png -vcodec libx264 -vf format=rgb24,scale=out_color_matrix=bt709,format=yuv420p -crf 18 temp.mp4"
125
  print(cmd_img2video)
126
  os.system(cmd_img2video)
127
 
128
+ cmd_combine_audio = f"ffmpeg -y -v warning -i {audio_path} -i temp.mp4 {output_vid_name}"
129
  print(cmd_combine_audio)
130
  os.system(cmd_combine_audio)
131
 
 
141
 
142
  parser.add_argument("--fps", type=int, default=25)
143
  parser.add_argument("--batch_size", type=int, default=8)
144
+ parser.add_argument("--output_vid_name", type=str, default=None)
145
  parser.add_argument("--use_saved_coord",
146
  action="store_true",
147
  help='use saved coordinate to save time')
 
149
 
150
  args = parser.parse_args()
151
  main(args)
 
scripts/realtime_inference.py CHANGED
@@ -206,7 +206,7 @@ class Avatar:
206
  combine_frame = get_image_blending(ori_frame,res_frame,bbox,mask,mask_crop_box)
207
 
208
  fps = 1/(time.time()-start+1e-6)
209
- print(f"Generating the {self.idx}-th frame with FPS: {fps:.2f}")
210
  cv2.imwrite(f"{self.avatar_path}/tmp/{str(self.idx).zfill(8)}.png",combine_frame)
211
  self.idx = self.idx + 1
212
 
@@ -244,12 +244,12 @@ class Avatar:
244
 
245
  if out_vid_name is not None:
246
  # optional
247
- cmd_img2video = f"ffmpeg -y -v fatal -r {fps} -f image2 -i {self.avatar_path}/tmp/%08d.png -vcodec libx264 -vf format=rgb24,scale=out_color_matrix=bt709,format=yuv420p -crf 18 {self.avatar_path}/temp.mp4"
248
  print(cmd_img2video)
249
  os.system(cmd_img2video)
250
 
251
  output_vid = os.path.join(self.video_out_path, out_vid_name+".mp4") # on
252
- cmd_combine_audio = f"ffmpeg -y -v fatal -i {audio_path} -i {self.avatar_path}/temp.mp4 {output_vid}"
253
  print(cmd_combine_audio)
254
  os.system(cmd_combine_audio)
255
 
 
206
  combine_frame = get_image_blending(ori_frame,res_frame,bbox,mask,mask_crop_box)
207
 
208
  fps = 1/(time.time()-start+1e-6)
209
+ print(f"Displaying the {self.idx}-th frame with FPS: {fps:.2f}")
210
  cv2.imwrite(f"{self.avatar_path}/tmp/{str(self.idx).zfill(8)}.png",combine_frame)
211
  self.idx = self.idx + 1
212
 
 
244
 
245
  if out_vid_name is not None:
246
  # optional
247
+ cmd_img2video = f"ffmpeg -y -v warning -r {fps} -f image2 -i {self.avatar_path}/tmp/%08d.png -vcodec libx264 -vf format=rgb24,scale=out_color_matrix=bt709,format=yuv420p -crf 18 {self.avatar_path}/temp.mp4"
248
  print(cmd_img2video)
249
  os.system(cmd_img2video)
250
 
251
  output_vid = os.path.join(self.video_out_path, out_vid_name+".mp4") # on
252
+ cmd_combine_audio = f"ffmpeg -y -v warning -i {audio_path} -i {self.avatar_path}/temp.mp4 {output_vid}"
253
  print(cmd_combine_audio)
254
  os.system(cmd_combine_audio)
255