Spaces:
Paused
Paused
output pose coords
Browse files
app.py
CHANGED
|
@@ -26,6 +26,7 @@ with gr.Blocks() as demo:
|
|
| 26 |
animation = gr.Video(label="Result")
|
| 27 |
frames = gr.Gallery(type="pil", label="Frames", format="png")
|
| 28 |
frames_thumb = gr.Gallery(type="pil", label="Thumbnails", format="png")
|
|
|
|
| 29 |
|
| 30 |
submit_btn.click(
|
| 31 |
run_app, inputs=[char_imgs, mocap, tr_steps, inf_steps, fps, remove_bg, resize_inputs], outputs=[animation, frames]
|
|
@@ -36,7 +37,7 @@ with gr.Blocks() as demo:
|
|
| 36 |
)
|
| 37 |
|
| 38 |
inference_btn.click(
|
| 39 |
-
run_inference, inputs=[char_imgs, mocap, tr_steps, inf_steps, fps, modelId, img_width, img_height, remove_bg, resize_inputs], outputs=[animation, frames, frames_thumb]
|
| 40 |
)
|
| 41 |
|
| 42 |
|
|
|
|
| 26 |
animation = gr.Video(label="Result")
|
| 27 |
frames = gr.Gallery(type="pil", label="Frames", format="png")
|
| 28 |
frames_thumb = gr.Gallery(type="pil", label="Thumbnails", format="png")
|
| 29 |
+
pose_coords = gr.JSON(label="Pose Coordinates")
|
| 30 |
|
| 31 |
submit_btn.click(
|
| 32 |
run_app, inputs=[char_imgs, mocap, tr_steps, inf_steps, fps, remove_bg, resize_inputs], outputs=[animation, frames]
|
|
|
|
| 37 |
)
|
| 38 |
|
| 39 |
inference_btn.click(
|
| 40 |
+
run_inference, inputs=[char_imgs, mocap, tr_steps, inf_steps, fps, modelId, img_width, img_height, remove_bg, resize_inputs], outputs=[animation, frames, frames_thumb, pose_coords]
|
| 41 |
)
|
| 42 |
|
| 43 |
|
main.py
CHANGED
|
@@ -58,7 +58,7 @@ import uuid
|
|
| 58 |
import gc
|
| 59 |
from numba import cuda
|
| 60 |
import requests
|
| 61 |
-
import
|
| 62 |
|
| 63 |
from huggingface_hub import hf_hub_download, HfApi
|
| 64 |
|
|
@@ -221,7 +221,7 @@ def get_pose(img, dwpose, outfile, crop=False):
|
|
| 221 |
out_img = out_img.crop(bbox)
|
| 222 |
out_img = ImageOps.expand(out_img, border=int(out_img.width*0.2), fill=(0,0,0))
|
| 223 |
|
| 224 |
-
return out_img
|
| 225 |
|
| 226 |
|
| 227 |
def extract_frames(video_path, fps):
|
|
@@ -272,12 +272,13 @@ def prepare_inputs_train(images, bg_remove, dwpose, rembg_session):
|
|
| 272 |
images = [removebg(img, rembg_session) for img in images]
|
| 273 |
|
| 274 |
in_img = images[0]
|
| 275 |
-
in_pose = get_pose(in_img, dwpose, "in_pose.png")
|
| 276 |
train_poses = []
|
| 277 |
train_imgs = [resize_and_pad(img, in_img) for img in images[1:]]
|
| 278 |
|
| 279 |
for i, img in enumerate(train_imgs):
|
| 280 |
-
|
|
|
|
| 281 |
|
| 282 |
return in_img, in_pose, train_imgs, train_poses
|
| 283 |
|
|
@@ -287,7 +288,7 @@ def prepare_inputs_inference(in_img, in_vid, fps, dwpose, rembg_session, bg_remo
|
|
| 287 |
|
| 288 |
print("prepare_inputs_inference")
|
| 289 |
|
| 290 |
-
in_pose = get_pose(in_img, dwpose, "in_pose.png")
|
| 291 |
|
| 292 |
frames = extract_frames(in_vid, fps)
|
| 293 |
print("remove background", bg_remove)
|
|
@@ -302,14 +303,21 @@ def prepare_inputs_inference(in_img, in_vid, fps, dwpose, rembg_session, bg_remo
|
|
| 302 |
|
| 303 |
progress_bar = tqdm(range(len(frames)), initial=0, desc="Frames")
|
| 304 |
target_poses = []
|
|
|
|
| 305 |
max_left = max_top = 999999
|
| 306 |
max_right = max_bottom = 0
|
| 307 |
it = frames
|
| 308 |
if is_app:
|
| 309 |
it = progress.tqdm(frames, desc="Pose Detection")
|
| 310 |
for f in it:
|
| 311 |
-
tpose = get_pose(f, dwpose, "tar_pose"+str(len(target_poses))+".png")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
target_poses.append(tpose)
|
|
|
|
| 313 |
progress_bar.update(1)
|
| 314 |
|
| 315 |
bbox = tpose.getbbox()
|
|
@@ -332,14 +340,14 @@ def prepare_inputs_inference(in_img, in_vid, fps, dwpose, rembg_session, bg_remo
|
|
| 332 |
tpose.save("out/"+"tar_pose"+str(len(target_poses_cropped))+".png")
|
| 333 |
target_poses_cropped.append(tpose)
|
| 334 |
|
| 335 |
-
return in_img, target_poses_cropped, in_pose
|
| 336 |
|
| 337 |
|
| 338 |
def prepare_inputs(images, in_vid, fps, bg_remove, dwpose, rembg_session, resize='target', is_app=False):
|
| 339 |
|
| 340 |
in_img, in_pose, train_imgs, train_poses = prepare_inputs_train(images, bg_remove, dwpose, rembg_session)
|
| 341 |
|
| 342 |
-
in_img, target_poses_cropped, _ = prepare_inputs_inference(in_img, in_vid, fps, dwpose, rembg_session, bg_remove, resize, is_app)
|
| 343 |
|
| 344 |
|
| 345 |
return in_img, in_pose, train_imgs, train_poses, target_poses_cropped
|
|
@@ -1125,7 +1133,7 @@ def run_inference(images, video_path, train_steps=100, inference_steps=10, fps=1
|
|
| 1125 |
images = [img[0] for img in images]
|
| 1126 |
in_img = images[0]
|
| 1127 |
|
| 1128 |
-
in_img, target_poses, in_pose = prepare_inputs_inference(in_img, video_path, fps, dwpose, rembg_session, bg_remove, 'target', is_app)
|
| 1129 |
|
| 1130 |
results = inference(modelId, in_img, in_pose, target_poses, inference_steps, None, vae, unet, image_encoder_p, is_app)
|
| 1131 |
#urls = save_temp_imgs(results)
|
|
@@ -1143,7 +1151,7 @@ def run_inference(images, video_path, train_steps=100, inference_steps=10, fps=1
|
|
| 1143 |
|
| 1144 |
print("Done!")
|
| 1145 |
|
| 1146 |
-
return out_vid+'.webm', results, getThumbnails(results)
|
| 1147 |
|
| 1148 |
|
| 1149 |
def run_app(images, video_path, train_steps=100, inference_steps=10, fps=12, bg_remove=False, resize_inputs=True):
|
|
|
|
| 58 |
import gc
|
| 59 |
from numba import cuda
|
| 60 |
import requests
|
| 61 |
+
import json
|
| 62 |
|
| 63 |
from huggingface_hub import hf_hub_download, HfApi
|
| 64 |
|
|
|
|
| 221 |
out_img = out_img.crop(bbox)
|
| 222 |
out_img = ImageOps.expand(out_img, border=int(out_img.width*0.2), fill=(0,0,0))
|
| 223 |
|
| 224 |
+
return out_img, pose
|
| 225 |
|
| 226 |
|
| 227 |
def extract_frames(video_path, fps):
|
|
|
|
| 272 |
images = [removebg(img, rembg_session) for img in images]
|
| 273 |
|
| 274 |
in_img = images[0]
|
| 275 |
+
in_pose, _ = get_pose(in_img, dwpose, "in_pose.png")
|
| 276 |
train_poses = []
|
| 277 |
train_imgs = [resize_and_pad(img, in_img) for img in images[1:]]
|
| 278 |
|
| 279 |
for i, img in enumerate(train_imgs):
|
| 280 |
+
train_pose, _ = get_pose(img, dwpose, "tr_pose"+str(i)+".png")
|
| 281 |
+
train_poses.append(train_pose)
|
| 282 |
|
| 283 |
return in_img, in_pose, train_imgs, train_poses
|
| 284 |
|
|
|
|
| 288 |
|
| 289 |
print("prepare_inputs_inference")
|
| 290 |
|
| 291 |
+
in_pose, _ = get_pose(in_img, dwpose, "in_pose.png")
|
| 292 |
|
| 293 |
frames = extract_frames(in_vid, fps)
|
| 294 |
print("remove background", bg_remove)
|
|
|
|
| 303 |
|
| 304 |
progress_bar = tqdm(range(len(frames)), initial=0, desc="Frames")
|
| 305 |
target_poses = []
|
| 306 |
+
target_poses_coords = []
|
| 307 |
max_left = max_top = 999999
|
| 308 |
max_right = max_bottom = 0
|
| 309 |
it = frames
|
| 310 |
if is_app:
|
| 311 |
it = progress.tqdm(frames, desc="Pose Detection")
|
| 312 |
for f in it:
|
| 313 |
+
tpose, tpose_coords = get_pose(f, dwpose, "tar_pose"+str(len(target_poses))+".png")
|
| 314 |
+
#print(tpose_coords)
|
| 315 |
+
coords = {}
|
| 316 |
+
for k in tpose_coords:
|
| 317 |
+
coords[k] = tpose_coords[k].tolist()
|
| 318 |
+
#print(coords)
|
| 319 |
target_poses.append(tpose)
|
| 320 |
+
target_poses_coords.append(json.dumps(coords))
|
| 321 |
progress_bar.update(1)
|
| 322 |
|
| 323 |
bbox = tpose.getbbox()
|
|
|
|
| 340 |
tpose.save("out/"+"tar_pose"+str(len(target_poses_cropped))+".png")
|
| 341 |
target_poses_cropped.append(tpose)
|
| 342 |
|
| 343 |
+
return in_img, target_poses_cropped, in_pose, target_poses_coords
|
| 344 |
|
| 345 |
|
| 346 |
def prepare_inputs(images, in_vid, fps, bg_remove, dwpose, rembg_session, resize='target', is_app=False):
|
| 347 |
|
| 348 |
in_img, in_pose, train_imgs, train_poses = prepare_inputs_train(images, bg_remove, dwpose, rembg_session)
|
| 349 |
|
| 350 |
+
in_img, target_poses_cropped, _, _ = prepare_inputs_inference(in_img, in_vid, fps, dwpose, rembg_session, bg_remove, resize, is_app)
|
| 351 |
|
| 352 |
|
| 353 |
return in_img, in_pose, train_imgs, train_poses, target_poses_cropped
|
|
|
|
| 1133 |
images = [img[0] for img in images]
|
| 1134 |
in_img = images[0]
|
| 1135 |
|
| 1136 |
+
in_img, target_poses, in_pose, target_poses_coords = prepare_inputs_inference(in_img, video_path, fps, dwpose, rembg_session, bg_remove, 'target', is_app)
|
| 1137 |
|
| 1138 |
results = inference(modelId, in_img, in_pose, target_poses, inference_steps, None, vae, unet, image_encoder_p, is_app)
|
| 1139 |
#urls = save_temp_imgs(results)
|
|
|
|
| 1151 |
|
| 1152 |
print("Done!")
|
| 1153 |
|
| 1154 |
+
return out_vid+'.webm', results, getThumbnails(results), target_poses_coords
|
| 1155 |
|
| 1156 |
|
| 1157 |
def run_app(images, video_path, train_steps=100, inference_steps=10, fps=12, bg_remove=False, resize_inputs=True):
|