zejunyang committed · Commit e24f684 · 1 Parent(s): d1af78b

debug

app.py CHANGED
@@ -118,6 +118,8 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     while os.path.exists(save_dir):
         save_dir = Path(f"a2v_output/{date_str}/{save_dir_name}_{np.random.randint(10000):04d}")
     save_dir.mkdir(exist_ok=True, parents=True)
+
+    print('=====1======')

     ref_image_np = cv2.cvtColor(ref_img, cv2.COLOR_RGB2BGR)
     ref_image_np = crop_face(ref_image_np, lmk_extractor)
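For reference, a minimal standalone sketch of the output-directory pattern in the hunk above: keep drawing a random 4-digit suffix until the path is free, then create it. The helper name make_output_dir and the date format are illustrative, not part of app.py.

    import os
    from datetime import datetime
    from pathlib import Path

    import numpy as np

    def make_output_dir(root="a2v_output", name="demo"):
        # Illustrative helper: retry with a random 4-digit suffix until the path is unused.
        date_str = datetime.now().strftime("%Y%m%d")
        save_dir = Path(f"{root}/{date_str}/{name}")
        while os.path.exists(save_dir):
            save_dir = Path(f"{root}/{date_str}/{name}_{np.random.randint(10000):04d}")
        save_dir.mkdir(exist_ok=True, parents=True)
        return save_dir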
@@ -127,16 +129,22 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     ref_image_np = cv2.resize(ref_image_np, (size, size))
     ref_image_pil = Image.fromarray(cv2.cvtColor(ref_image_np, cv2.COLOR_BGR2RGB))

+    print('=====2======')
+
     face_result = lmk_extractor(ref_image_np)
     if face_result is None:
         return None, ref_image_pil

+    print('=====3======')
+
     lmks = face_result['lmks'].astype(np.float32)
     ref_pose = vis.draw_landmarks((ref_image_np.shape[1], ref_image_np.shape[0]), lmks, normed=True)

     sample = prepare_audio_feature(input_audio, wav2vec_model_path=audio_infer_config['a2m_model']['model_path'])
     sample['audio_feature'] = torch.from_numpy(sample['audio_feature']).float().cuda()
     sample['audio_feature'] = sample['audio_feature'].unsqueeze(0)
+
+    print('=====4======')

     # inference
     pred = a2m_model.infer(sample['audio_feature'], sample['seq_len'])
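The block above turns the wav2vec features into a batched CUDA tensor before inference. A minimal sketch of that conversion, with a random array standing in for the output of prepare_audio_feature (the shape and the CUDA guard are assumptions):

    import numpy as np
    import torch

    audio_feature = np.random.randn(250, 768).astype(np.float32)  # stand-in for wav2vec features

    feat = torch.from_numpy(audio_feature).float()
    if torch.cuda.is_available():  # app.py calls .cuda() unconditionally; guarded here for the sketch
        feat = feat.cuda()
    feat = feat.unsqueeze(0)       # add a batch dimension -> (1, seq_len, feat_dim)
    print(feat.shape)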
@@ -144,6 +152,8 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     pred = pred.reshape(pred.shape[0], -1, 3)
     pred = pred + face_result['lmks3d']

+    print('=====5======')
+
     if headpose_video is not None:
         pose_seq = get_headpose_temp(headpose_video)
     else:
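The reshape-and-add step above broadcasts the reference 3D landmarks over every predicted frame. A toy numpy sketch of the same arithmetic (the sizes are made up):

    import numpy as np

    n_frames, n_points = 4, 68
    pred = np.random.randn(n_frames, n_points * 3).astype(np.float32)   # flat per-frame offsets
    lmks3d_ref = np.random.randn(n_points, 3).astype(np.float32)        # reference landmarks

    pred = pred.reshape(pred.shape[0], -1, 3)  # (frames, points, 3)
    pred = pred + lmks3d_ref                   # broadcast over the frame axis
    print(pred.shape)                          # (4, 68, 3)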
@@ -158,6 +168,8 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     for i, verts in enumerate(projected_vertices):
         lmk_img = vis.draw_landmarks((width, height), verts, normed=False)
         pose_images.append(lmk_img)
+
+    print('=====6======')

     pose_list = []
     # pose_tensor_list = []
@@ -176,6 +188,8 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     pose_list = np.array(pose_list)

     video_length = len(pose_list)
+
+    print('=====7======')

     video = pipe(
         ref_image_pil,
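The seven numbered print() calls added in this commit act as progress checkpoints for locating where audio2video stalls or fails. An equivalent, switchable sketch using the standard logging module (the logger name and messages are illustrative):

    import logging

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
    log = logging.getLogger("audio2video")

    # Same idea as the numbered print checkpoints, with timestamps and a toggleable level.
    log.info("=====1====== output dir ready")
    log.info("=====7====== starting diffusion pipeline")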
@@ -383,7 +397,7 @@ with gr.Blocks() as demo:
                 a2v_headpose_video = gr.Video(label="Option: upload head pose reference video", sources="upload")

                 with gr.Row():
-                    a2v_size_slider = gr.Slider(minimum=256, maximum=512, step=8, value=
+                    a2v_size_slider = gr.Slider(minimum=256, maximum=512, step=8, value=384, label="Video size (-W & -H)")
                     a2v_step_slider = gr.Slider(minimum=5, maximum=20, step=1, value=15, label="Steps (--steps)")

                 with gr.Row():
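This hunk and the next one only change the default video size shown in the UI to 384; in Gradio, value= sets the slider's initial position. A minimal standalone sketch of the same sliders (not the full app layout):

    import gradio as gr

    with gr.Blocks() as demo:
        # value= is the slider's initial position; min/max/step match the app's settings.
        size_slider = gr.Slider(minimum=256, maximum=512, step=8, value=384, label="Video size (-W & -H)")
        step_slider = gr.Slider(minimum=5, maximum=20, step=1, value=15, label="Steps (--steps)")

    demo.launch()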
@@ -411,7 +425,7 @@
                 v2v_source_video = gr.Video(label="Upload source video", sources="upload")

                 with gr.Row():
-                    v2v_size_slider = gr.Slider(minimum=256, maximum=512, step=8, value=
+                    v2v_size_slider = gr.Slider(minimum=256, maximum=512, step=8, value=384, label="Video size (-W & -H)")
                     v2v_step_slider = gr.Slider(minimum=5, maximum=20, step=1, value=15, label="Steps (--steps)")

                 with gr.Row():