Spaces:
Runtime error
Runtime error
multiple improvements
Browse files- app.py +201 -148
- sample_images/sample11.jpg +2 -2
- sbatch/sbatch_demo.sh +5 -3
- sbatch/sbatch_demo2.sh +1 -1
app.py
CHANGED
|
@@ -19,6 +19,7 @@ import random
|
|
| 19 |
from copy import deepcopy
|
| 20 |
from huggingface_hub import hf_hub_download
|
| 21 |
from gradio_toggle import Toggle
|
|
|
|
| 22 |
try:
|
| 23 |
import spaces
|
| 24 |
except:
|
|
@@ -27,11 +28,17 @@ except:
|
|
| 27 |
MAX_N = 6
|
| 28 |
FIX_MAX_N = 6
|
| 29 |
LENGTH = 480
|
| 30 |
-
|
| 31 |
placeholder = cv2.cvtColor(cv2.imread("placeholder.png"), cv2.COLOR_BGR2RGB)
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
pre_device = "cpu" if HF else "cuda"
|
| 36 |
spaces_60_fn = spaces.GPU(duration=60) if HF else (lambda f: f)
|
| 37 |
# Decorator for the longer-running GPU entry points: on HF it requests a
# 120-second allocation via spaces.GPU (matching the name; it previously
# duplicated the 60-second variant), otherwise it is a no-op pass-through.
spaces_120_fn = spaces.GPU(duration=120) if HF else (lambda f: f)
|
|
@@ -214,6 +221,7 @@ if NEW_MODEL:
|
|
| 214 |
# ckpt_state_dict = torch.load(model_path)['model_state_dict']
|
| 215 |
ckpt_state_dict = torch.load(model_path, map_location='cpu')['ema_state_dict']
|
| 216 |
missing_keys, extra_keys = model.load_state_dict(ckpt_state_dict, strict=False)
|
|
|
|
| 217 |
model = model.to(device)
|
| 218 |
model.eval()
|
| 219 |
print(missing_keys, extra_keys)
|
|
@@ -233,6 +241,29 @@ if NEW_MODEL:
|
|
| 233 |
print(f"encoder after eval() max: {max([p.max() for p in autoencoder.encoder.parameters()])}")
|
| 234 |
print(f"autoencoder encoder after eval() dtype: {next(autoencoder.encoder.parameters()).dtype}")
|
| 235 |
assert len(missing_keys) == 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
|
| 237 |
sam_path = "sam_vit_h_4b8939.pth"
|
| 238 |
if not os.path.exists(sam_path):
|
|
@@ -249,6 +280,7 @@ hands = mp_hands.Hands(
|
|
| 249 |
no_hands_open = cv2.resize(np.array(Image.open("no_hands_open.jpeg"))[..., :3], (LENGTH, LENGTH))
|
| 250 |
|
| 251 |
def prepare_anno(ref, ref_is_user):
|
|
|
|
| 252 |
if not ref_is_user: # no_hands_open.jpeg
|
| 253 |
return gr.update(value=None), gr.update(value=None)
|
| 254 |
if ref is None or ref["background"] is None or ref["background"].sum()==0: # clear_all
|
|
@@ -284,6 +316,7 @@ def prepare_anno(ref, ref_is_user):
|
|
| 284 |
|
| 285 |
@spaces_60_fn
|
| 286 |
def get_ref_anno(img, keypts, use_mask, use_pose):
|
|
|
|
| 287 |
no_mask, no_pose = not use_mask, not use_pose
|
| 288 |
if img.sum() == 0: # clear_all
|
| 289 |
return None, gr.update(), None, gr.update(), True
|
|
@@ -407,6 +440,7 @@ def get_ref_anno(img, keypts, use_mask, use_pose):
|
|
| 407 |
return img, ref_pose, ref_cond, gr.update(), True
|
| 408 |
|
| 409 |
def get_target_anno(img, keypts):
|
|
|
|
| 410 |
if img.sum() == 0: # clear_all
|
| 411 |
return None, gr.update(), None, gr.update(), True
|
| 412 |
if keypts is None: # hands not detected
|
|
@@ -447,6 +481,7 @@ def get_target_anno(img, keypts):
|
|
| 447 |
return img, target_pose, target_cond, keypts, gr.update(), True
|
| 448 |
|
| 449 |
def visualize_ref(ref, ex_mask):
|
|
|
|
| 450 |
if ref is None:
|
| 451 |
return None
|
| 452 |
|
|
@@ -598,6 +633,12 @@ def process_crop(img, crop_coord, evt:gr.SelectData):
|
|
| 598 |
cropped_vis = image.copy()
|
| 599 |
cropped_vis[:,:,-1] = 255
|
| 600 |
else: # will add second click
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 601 |
crop_coord.append(new_coord)
|
| 602 |
x1, y1 = crop_coord[0]
|
| 603 |
x2, y2 = crop_coord[1]
|
|
@@ -889,6 +930,7 @@ def flip_hand(
|
|
| 889 |
cond, auto_cond, manual_cond,
|
| 890 |
keypts=None, auto_keypts=None, manual_keypts=None
|
| 891 |
):
|
|
|
|
| 892 |
if cond is None: # clear clicked
|
| 893 |
return
|
| 894 |
img["composite"] = img["composite"][:, ::-1, :]
|
|
@@ -923,7 +965,7 @@ def flip_hand(
|
|
| 923 |
manual_keypts[:21, 0] = opts.image_size[1] - manual_keypts[:21, 0]
|
| 924 |
if manual_keypts[21:, :].sum() != 0:
|
| 925 |
manual_keypts[21:, 0] = opts.image_size[1] - manual_keypts[21:, 0]
|
| 926 |
-
return img, img_raw, pose_img, pose_manual_img, manual_kp_right, manual_kp_left, cond, auto_cond, manual_cond, keypts, auto_keypts, manual_keypts
|
| 927 |
|
| 928 |
def resize_to_full(img):
|
| 929 |
img["background"] = cv2.resize(img["background"], (LENGTH, LENGTH))
|
|
@@ -1117,75 +1159,80 @@ def unvisible_component(decider, component):
|
|
| 1117 |
|
| 1118 |
example_ref_imgs = [
|
| 1119 |
[
|
| 1120 |
-
"sample_images/
|
| 1121 |
],
|
| 1122 |
[
|
| 1123 |
-
"sample_images/
|
| 1124 |
],
|
| 1125 |
[
|
| 1126 |
-
"sample_images/sample3.jpg",
|
| 1127 |
],
|
| 1128 |
[
|
| 1129 |
-
"sample_images/
|
| 1130 |
],
|
| 1131 |
[
|
| 1132 |
-
"sample_images/
|
| 1133 |
],
|
| 1134 |
]
|
| 1135 |
example_target_imgs = [
|
| 1136 |
[
|
| 1137 |
-
"sample_images/
|
| 1138 |
],
|
| 1139 |
[
|
| 1140 |
"sample_images/sample9.jpg",
|
| 1141 |
],
|
|
|
|
|
|
|
|
|
|
| 1142 |
[
|
| 1143 |
"sample_images/sample10.jpg",
|
| 1144 |
],
|
| 1145 |
[
|
| 1146 |
-
"
|
| 1147 |
],
|
| 1148 |
-
["pose_images/pose1.jpg"],
|
| 1149 |
-
]
|
| 1150 |
-
fix_example_imgs = [
|
| 1151 |
-
["bad_hands/1.jpg"],
|
| 1152 |
-
["bad_hands/3.jpg"],
|
| 1153 |
-
["bad_hands/4.jpg"],
|
| 1154 |
-
["bad_hands/5.jpg"],
|
| 1155 |
-
["bad_hands/6.jpg"],
|
| 1156 |
-
["bad_hands/7.jpg"],
|
| 1157 |
-
]
|
| 1158 |
-
fix_example_brush = [
|
| 1159 |
-
["bad_hands/1_composite.png"],
|
| 1160 |
-
["bad_hands/3_composite.png"],
|
| 1161 |
-
["bad_hands/4_composite.png"],
|
| 1162 |
-
["bad_hands/5_composite.png"],
|
| 1163 |
-
["bad_hands/6_composite.png"],
|
| 1164 |
-
["bad_hands/7_composite.png"],
|
| 1165 |
-
]
|
| 1166 |
-
fix_example_kpts = [
|
| 1167 |
-
["bad_hands/1_kpts.png", 3.0, 1224],
|
| 1168 |
-
["bad_hands/3_kpts.png", 1.0, 42],
|
| 1169 |
-
["bad_hands/4_kpts.png", 2.0, 42],
|
| 1170 |
-
["bad_hands/5_kpts.png", 3.0, 42],
|
| 1171 |
-
["bad_hands/6_kpts.png", 3.0, 1348],
|
| 1172 |
-
["bad_hands/7_kpts.png", 3.0, 42],
|
| 1173 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1174 |
fix_example_all = [
|
| 1175 |
["bad_hands/1.jpg", "bad_hands/1_composite.png", "bad_hands/1_mask.jpg", "bad_hands/1_kpts.png", 3.0, 1224],
|
| 1176 |
["bad_hands/3.jpg", "bad_hands/3_composite.png", "bad_hands/3_mask.jpg", "bad_hands/3_kpts.png", 1.0, 42],
|
| 1177 |
-
["bad_hands/4.jpg", "bad_hands/4_composite.png", "bad_hands/4_mask.jpg", "bad_hands/4_kpts.png", 2.0, 42],
|
| 1178 |
["bad_hands/5.jpg", "bad_hands/5_composite.png", "bad_hands/5_mask.jpg", "bad_hands/5_kpts.png", 3.0, 42],
|
| 1179 |
["bad_hands/6.jpg", "bad_hands/6_composite.png", "bad_hands/6_mask.jpg", "bad_hands/6_kpts.png", 3.0, 1348],
|
| 1180 |
["bad_hands/7.jpg", "bad_hands/7_composite.png", "bad_hands/7_mask.jpg", "bad_hands/7_kpts.png", 3.0, 42],
|
| 1181 |
]
|
| 1182 |
-
for i in range(len(fix_example_kpts)):
|
| 1183 |
-
|
| 1184 |
-
|
| 1185 |
for i in range(len(fix_example_all)):
|
| 1186 |
npy_path = fix_example_all[i][3].replace("_kpts.png", ".npy")
|
| 1187 |
fix_example_all[i].append(npy_path)
|
| 1188 |
|
|
|
|
|
|
|
|
|
|
| 1189 |
custom_css = """
|
| 1190 |
.gradio-container .examples img {
|
| 1191 |
width: 240px !important;
|
|
@@ -1237,9 +1284,6 @@ custom_css = """
|
|
| 1237 |
#fix_examples_all table tr td:nth-child(7) {
|
| 1238 |
display: none !important;
|
| 1239 |
}
|
| 1240 |
-
#fix_examples_all table tr:first-child {
|
| 1241 |
-
display: none !important;
|
| 1242 |
-
}
|
| 1243 |
#repose_tutorial video {
|
| 1244 |
width: 50% !important;
|
| 1245 |
display: block;
|
|
@@ -1280,6 +1324,13 @@ custom_css = """
|
|
| 1280 |
#gradio-app {
|
| 1281 |
flex-direction: row; !important;
|
| 1282 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1283 |
"""
|
| 1284 |
##no_wrap_row {
|
| 1285 |
# display: flex !important;
|
|
@@ -1411,6 +1462,37 @@ with gr.Blocks(css=custom_css, theme="soft") as demo:
|
|
| 1411 |
# config
|
| 1412 |
use_pose = gr.State(value=True)
|
| 1413 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1414 |
# main tabs
|
| 1415 |
with gr.Row():
|
| 1416 |
# ref column
|
|
@@ -1431,7 +1513,6 @@ with gr.Blocks(css=custom_css, theme="soft") as demo:
|
|
| 1431 |
layers=False,
|
| 1432 |
crop_size="1:1",
|
| 1433 |
)
|
| 1434 |
-
gr.Examples(example_ref_imgs, [ref], examples_per_page=20)
|
| 1435 |
use_mask = Toggle(label="Use mask", value=False, interactive=True)
|
| 1436 |
with gr.Accordion(label="See hand pose & mask", open=False):
|
| 1437 |
with gr.Tab("Automatic hand keypoints"):
|
|
@@ -1550,7 +1631,7 @@ with gr.Blocks(css=custom_css, theme="soft") as demo:
|
|
| 1550 |
layers=False,
|
| 1551 |
crop_size="1:1",
|
| 1552 |
)
|
| 1553 |
-
gr.Examples(example_target_imgs, [target], examples_per_page=20)
|
| 1554 |
with gr.Accordion(label="See hand pose", open=False):
|
| 1555 |
with gr.Tab("Automatic hand keypoints"):
|
| 1556 |
target_pose = gr.Image(
|
|
@@ -1685,36 +1766,8 @@ with gr.Blocks(css=custom_css, theme="soft") as demo:
|
|
| 1685 |
# )
|
| 1686 |
clear = gr.ClearButton(elem_id="clear_button")
|
| 1687 |
|
| 1688 |
-
|
| 1689 |
-
|
| 1690 |
-
with gr.Row():
|
| 1691 |
-
n_generation = gr.Slider(
|
| 1692 |
-
label="Number of generations",
|
| 1693 |
-
value=1,
|
| 1694 |
-
minimum=1,
|
| 1695 |
-
maximum=MAX_N,
|
| 1696 |
-
step=1,
|
| 1697 |
-
randomize=False,
|
| 1698 |
-
interactive=True,
|
| 1699 |
-
)
|
| 1700 |
-
seed = gr.Slider(
|
| 1701 |
-
label="Seed",
|
| 1702 |
-
value=42,
|
| 1703 |
-
minimum=0,
|
| 1704 |
-
maximum=10000,
|
| 1705 |
-
step=1,
|
| 1706 |
-
randomize=False,
|
| 1707 |
-
interactive=True,
|
| 1708 |
-
)
|
| 1709 |
-
cfg = gr.Slider(
|
| 1710 |
-
label="Classifier free guidance scale",
|
| 1711 |
-
value=2.5,
|
| 1712 |
-
minimum=0.0,
|
| 1713 |
-
maximum=10.0,
|
| 1714 |
-
step=0.1,
|
| 1715 |
-
randomize=False,
|
| 1716 |
-
interactive=True,
|
| 1717 |
-
)
|
| 1718 |
|
| 1719 |
# tutorial video
|
| 1720 |
with gr.Accordion("Tutorial Video of Demo 1", elem_id="accordion_bold_large_center"):
|
|
@@ -1791,7 +1844,7 @@ with gr.Blocks(css=custom_css, theme="soft") as demo:
|
|
| 1791 |
ref_flip.select(
|
| 1792 |
flip_hand,
|
| 1793 |
[ref, ref_im_raw, ref_pose, ref_manual_pose, ref_manual_kp_right, ref_manual_kp_left, ref_cond, ref_auto_cond, ref_manual_cond],
|
| 1794 |
-
[ref, ref_im_raw, ref_pose, ref_manual_pose, ref_manual_kp_right, ref_manual_kp_left, ref_cond, ref_auto_cond, ref_manual_cond]
|
| 1795 |
)
|
| 1796 |
|
| 1797 |
# target listeners
|
|
@@ -1858,7 +1911,7 @@ with gr.Blocks(css=custom_css, theme="soft") as demo:
|
|
| 1858 |
target_flip.select(
|
| 1859 |
flip_hand,
|
| 1860 |
[target, target_im_raw, target_pose, target_manual_pose, target_manual_kp_right, target_manual_kp_left, target_cond, target_auto_cond, target_manual_cond, target_keypts, target_auto_keypts, target_manual_keypts],
|
| 1861 |
-
[target, target_im_raw, target_pose, target_manual_pose, target_manual_kp_right, target_manual_kp_left, target_cond, target_auto_cond, target_manual_cond, target_keypts, target_auto_keypts, target_manual_keypts],
|
| 1862 |
)
|
| 1863 |
|
| 1864 |
# run listeners
|
|
@@ -2020,7 +2073,7 @@ with gr.Blocks(css=custom_css, theme="soft") as demo:
|
|
| 2020 |
)
|
| 2021 |
with gr.Column():
|
| 2022 |
gr.Markdown(
|
| 2023 |
-
"""<p style="text-align: center; font-size: 18px; font-weight: bold;">2.
|
| 2024 |
)
|
| 2025 |
# gr.Markdown(
|
| 2026 |
# """<p style="text-align: center;">Don't brush the entire hand!</p>"""
|
|
@@ -2055,17 +2108,6 @@ with gr.Blocks(css=custom_css, theme="soft") as demo:
|
|
| 2055 |
# gr.Markdown(
|
| 2056 |
# """<p style="text-align: center;">Either get hand pose from Examples, or manually give hand pose (located at the bottom)</p>"""
|
| 2057 |
# )
|
| 2058 |
-
fix_kp_all = gr.Image(
|
| 2059 |
-
type="numpy",
|
| 2060 |
-
label="Target Hand Pose",
|
| 2061 |
-
show_label=False,
|
| 2062 |
-
height=LENGTH,
|
| 2063 |
-
width=LENGTH,
|
| 2064 |
-
interactive=False,
|
| 2065 |
-
visible=True,
|
| 2066 |
-
sources=(),
|
| 2067 |
-
image_mode="RGBA"
|
| 2068 |
-
)
|
| 2069 |
# with gr.Accordion(open=True):
|
| 2070 |
# fix_ex_kpts = gr.Examples(
|
| 2071 |
# fix_example_kpts,
|
|
@@ -2074,68 +2116,79 @@ with gr.Blocks(css=custom_css, theme="soft") as demo:
|
|
| 2074 |
# postprocess=False,
|
| 2075 |
# elem_id="kpts_examples"
|
| 2076 |
# )
|
| 2077 |
-
with gr.Accordion("[Your own image] Manually give hand pose", open=False, elem_id="accordion_bold"):
|
| 2078 |
-
|
| 2079 |
-
|
| 2080 |
-
|
| 2081 |
-
|
| 2082 |
-
|
| 2083 |
-
|
| 2084 |
-
|
| 2085 |
-
|
| 2086 |
-
|
| 2087 |
-
|
| 2088 |
-
|
| 2089 |
-
|
| 2090 |
-
|
| 2091 |
-
|
| 2092 |
-
|
| 2093 |
-
|
| 2094 |
-
|
| 2095 |
-
|
| 2096 |
-
|
| 2097 |
-
|
| 2098 |
-
|
| 2099 |
-
|
| 2100 |
-
|
| 2101 |
-
|
| 2102 |
-
|
| 2103 |
-
)
|
| 2104 |
-
fix_reset_right = gr.Button(
|
| 2105 |
-
value="Reset", interactive=False, visible=False
|
| 2106 |
-
)
|
| 2107 |
-
fix_kp_l_info = gr.Markdown(
|
| 2108 |
-
"""<p style="text-align: center;">② Click 21 keypoints on the image to provide the target hand pose of <b>left hand</b>. See the \"OpenPose keypoints convention\" for guidance.</p>""",
|
| 2109 |
-
visible=False
|
| 2110 |
)
|
| 2111 |
-
|
| 2112 |
-
|
| 2113 |
-
label="Keypoint Selection (left hand)",
|
| 2114 |
-
show_label=True,
|
| 2115 |
-
height=LENGTH,
|
| 2116 |
-
width=LENGTH,
|
| 2117 |
-
interactive=False,
|
| 2118 |
-
visible=False,
|
| 2119 |
-
sources=[],
|
| 2120 |
)
|
| 2121 |
-
|
| 2122 |
-
|
| 2123 |
-
|
| 2124 |
-
|
| 2125 |
-
|
| 2126 |
-
|
| 2127 |
-
|
| 2128 |
-
|
| 2129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2130 |
)
|
| 2131 |
-
|
| 2132 |
-
value="
|
| 2133 |
-
type="numpy",
|
| 2134 |
-
show_label=False,
|
| 2135 |
-
height=LENGTH // 2,
|
| 2136 |
-
width=LENGTH // 2,
|
| 2137 |
-
interactive=False,
|
| 2138 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2139 |
|
| 2140 |
# result column
|
| 2141 |
with gr.Column():
|
|
|
|
| 19 |
from copy import deepcopy
|
| 20 |
from huggingface_hub import hf_hub_download
|
| 21 |
from gradio_toggle import Toggle
|
| 22 |
+
import argparse
|
| 23 |
try:
|
| 24 |
import spaces
|
| 25 |
except:
|
|
|
|
| 28 |
MAX_N = 6
|
| 29 |
FIX_MAX_N = 6
|
| 30 |
LENGTH = 480
|
|
|
|
| 31 |
placeholder = cv2.cvtColor(cv2.imread("placeholder.png"), cv2.COLOR_BGR2RGB)
|
| 32 |
+
|
| 33 |
+
parser = argparse.ArgumentParser()
|
| 34 |
+
parser.add_argument("--not_hf", action="store_true", default=False)
|
| 35 |
+
parser.add_argument("--old_model", action="store_true", default=False)
|
| 36 |
+
parser.add_argument("--model_epoch", type=int, default=6)
|
| 37 |
+
args = parser.parse_args()
|
| 38 |
+
NEW_MODEL = not args.old_model
|
| 39 |
+
MODEL_EPOCH = args.model_epoch
|
| 40 |
+
HF = not args.not_hf
|
| 41 |
+
|
| 42 |
pre_device = "cpu" if HF else "cuda"
|
| 43 |
spaces_60_fn = spaces.GPU(duration=60) if HF else (lambda f: f)
|
| 44 |
# Decorator for the longer-running GPU entry points: on HF it requests a
# 120-second allocation via spaces.GPU (matching the name; it previously
# duplicated the 60-second variant), otherwise it is a no-op pass-through.
spaces_120_fn = spaces.GPU(duration=120) if HF else (lambda f: f)
|
|
|
|
| 221 |
# ckpt_state_dict = torch.load(model_path)['model_state_dict']
|
| 222 |
ckpt_state_dict = torch.load(model_path, map_location='cpu')['ema_state_dict']
|
| 223 |
missing_keys, extra_keys = model.load_state_dict(ckpt_state_dict, strict=False)
|
| 224 |
+
print(f"Loaded {model_path}")
|
| 225 |
model = model.to(device)
|
| 226 |
model.eval()
|
| 227 |
print(missing_keys, extra_keys)
|
|
|
|
| 241 |
print(f"encoder after eval() max: {max([p.max() for p in autoencoder.encoder.parameters()])}")
|
| 242 |
print(f"autoencoder encoder after eval() dtype: {next(autoencoder.encoder.parameters()).dtype}")
|
| 243 |
assert len(missing_keys) == 0
|
| 244 |
+
else:
|
| 245 |
+
opts = HandDiffOpts()
|
| 246 |
+
model_path = './finetune_epoch=5-step=130000.ckpt'
|
| 247 |
+
sd_path = './sd-v1-4.ckpt'
|
| 248 |
+
print('Load diffusion model...')
|
| 249 |
+
diffusion = create_diffusion(str(opts.test_sampling_steps))
|
| 250 |
+
model = vit.DiT_XL_2(
|
| 251 |
+
input_size=opts.latent_size[0],
|
| 252 |
+
latent_dim=opts.latent_dim,
|
| 253 |
+
in_channels=opts.latent_dim+opts.n_keypoints+opts.n_mask,
|
| 254 |
+
learn_sigma=True,
|
| 255 |
+
).cuda()
|
| 256 |
+
ckpt_state_dict = torch.load(model_path)['state_dict']
|
| 257 |
+
print(f"Loaded {model_path}")
|
| 258 |
+
dit_state_dict = {remove_prefix(k, 'diffusion_backbone.'): v for k, v in ckpt_state_dict.items() if k.startswith('diffusion_backbone')}
|
| 259 |
+
vae_state_dict = {remove_prefix(k, 'autoencoder.'): v for k, v in ckpt_state_dict.items() if k.startswith('autoencoder')}
|
| 260 |
+
missing_keys, extra_keys = model.load_state_dict(dit_state_dict, strict=False)
|
| 261 |
+
model.eval()
|
| 262 |
+
assert len(missing_keys) == 0 and len(extra_keys) == 0
|
| 263 |
+
autoencoder = vqvae.create_model(3, 3, opts.latent_dim).eval().requires_grad_(False).cuda()
|
| 264 |
+
missing_keys, extra_keys = autoencoder.load_state_dict(vae_state_dict, strict=False)
|
| 265 |
+
autoencoder.eval()
|
| 266 |
+
assert len(missing_keys) == 0 and len(extra_keys) == 0
|
| 267 |
|
| 268 |
sam_path = "sam_vit_h_4b8939.pth"
|
| 269 |
if not os.path.exists(sam_path):
|
|
|
|
| 280 |
no_hands_open = cv2.resize(np.array(Image.open("no_hands_open.jpeg"))[..., :3], (LENGTH, LENGTH))
|
| 281 |
|
| 282 |
def prepare_anno(ref, ref_is_user):
|
| 283 |
+
print("inside prepare_anno")
|
| 284 |
if not ref_is_user: # no_hands_open.jpeg
|
| 285 |
return gr.update(value=None), gr.update(value=None)
|
| 286 |
if ref is None or ref["background"] is None or ref["background"].sum()==0: # clear_all
|
|
|
|
| 316 |
|
| 317 |
@spaces_60_fn
|
| 318 |
def get_ref_anno(img, keypts, use_mask, use_pose):
|
| 319 |
+
print("inside get_ref_anno")
|
| 320 |
no_mask, no_pose = not use_mask, not use_pose
|
| 321 |
if img.sum() == 0: # clear_all
|
| 322 |
return None, gr.update(), None, gr.update(), True
|
|
|
|
| 440 |
return img, ref_pose, ref_cond, gr.update(), True
|
| 441 |
|
| 442 |
def get_target_anno(img, keypts):
|
| 443 |
+
print("inside get_target_anno")
|
| 444 |
if img.sum() == 0: # clear_all
|
| 445 |
return None, gr.update(), None, gr.update(), True
|
| 446 |
if keypts is None: # hands not detected
|
|
|
|
| 481 |
return img, target_pose, target_cond, keypts, gr.update(), True
|
| 482 |
|
| 483 |
def visualize_ref(ref, ex_mask):
|
| 484 |
+
print("inside visualize_ref")
|
| 485 |
if ref is None:
|
| 486 |
return None
|
| 487 |
|
|
|
|
| 633 |
cropped_vis = image.copy()
|
| 634 |
cropped_vis[:,:,-1] = 255
|
| 635 |
else: # will add second click
|
| 636 |
+
x_length = new_coord[0] - crop_coord[0][0]
|
| 637 |
+
y_length = new_coord[1] - crop_coord[0][1]
|
| 638 |
+
if x_length > y_length:
|
| 639 |
+
new_coord[0] = crop_coord[0][0] + y_length
|
| 640 |
+
else:
|
| 641 |
+
new_coord[1] = crop_coord[0][1] + x_length
|
| 642 |
crop_coord.append(new_coord)
|
| 643 |
x1, y1 = crop_coord[0]
|
| 644 |
x2, y2 = crop_coord[1]
|
|
|
|
| 930 |
cond, auto_cond, manual_cond,
|
| 931 |
keypts=None, auto_keypts=None, manual_keypts=None
|
| 932 |
):
|
| 933 |
+
print("inside flip_hand")
|
| 934 |
if cond is None: # clear clicked
|
| 935 |
return
|
| 936 |
img["composite"] = img["composite"][:, ::-1, :]
|
|
|
|
| 965 |
manual_keypts[:21, 0] = opts.image_size[1] - manual_keypts[:21, 0]
|
| 966 |
if manual_keypts[21:, :].sum() != 0:
|
| 967 |
manual_keypts[21:, 0] = opts.image_size[1] - manual_keypts[21:, 0]
|
| 968 |
+
return img, img_raw, pose_img, pose_manual_img, manual_kp_right, manual_kp_left, cond, auto_cond, manual_cond, False, keypts, auto_keypts, manual_keypts
|
| 969 |
|
| 970 |
def resize_to_full(img):
|
| 971 |
img["background"] = cv2.resize(img["background"], (LENGTH, LENGTH))
|
|
|
|
| 1159 |
|
| 1160 |
example_ref_imgs = [
|
| 1161 |
[
|
| 1162 |
+
"sample_images/sample2.jpg", "sample_images/sample10.jpg"
|
| 1163 |
],
|
| 1164 |
[
|
| 1165 |
+
"sample_images/sample10.jpg", "sample_images/sample9.jpg"
|
| 1166 |
],
|
| 1167 |
[
|
| 1168 |
+
"sample_images/sample3.jpg", "sample_images/sample5.jpg"
|
| 1169 |
],
|
| 1170 |
[
|
| 1171 |
+
"sample_images/sample11.jpg", "sample_images/sample10.jpg"
|
| 1172 |
],
|
| 1173 |
[
|
| 1174 |
+
"sample_images/sample4.jpg", "pose_images/pose4.jpg"
|
| 1175 |
],
|
| 1176 |
]
|
| 1177 |
example_target_imgs = [
|
| 1178 |
[
|
| 1179 |
+
"sample_images/sample10.jpg",
|
| 1180 |
],
|
| 1181 |
[
|
| 1182 |
"sample_images/sample9.jpg",
|
| 1183 |
],
|
| 1184 |
+
[
|
| 1185 |
+
"sample_images/sample5.jpg",
|
| 1186 |
+
],
|
| 1187 |
[
|
| 1188 |
"sample_images/sample10.jpg",
|
| 1189 |
],
|
| 1190 |
[
|
| 1191 |
+
"pose_images/pose4.jpg"
|
| 1192 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1193 |
]
|
| 1194 |
+
# fix_example_imgs = [
|
| 1195 |
+
# ["bad_hands/1.jpg"],
|
| 1196 |
+
# ["bad_hands/3.jpg"],
|
| 1197 |
+
# # ["bad_hands/4.jpg"],
|
| 1198 |
+
# ["bad_hands/5.jpg"],
|
| 1199 |
+
# ["bad_hands/6.jpg"],
|
| 1200 |
+
# ["bad_hands/7.jpg"],
|
| 1201 |
+
# ]
|
| 1202 |
+
# fix_example_brush = [
|
| 1203 |
+
# ["bad_hands/1_composite.png"],
|
| 1204 |
+
# ["bad_hands/3_composite.png"],
|
| 1205 |
+
# # ["bad_hands/4_composite.png"],
|
| 1206 |
+
# ["bad_hands/5_composite.png"],
|
| 1207 |
+
# ["bad_hands/6_composite.png"],
|
| 1208 |
+
# ["bad_hands/7_composite.png"],
|
| 1209 |
+
# ]
|
| 1210 |
+
# fix_example_kpts = [
|
| 1211 |
+
# ["bad_hands/1_kpts.png", 3.0, 1224],
|
| 1212 |
+
# ["bad_hands/3_kpts.png", 1.0, 42],
|
| 1213 |
+
# # ["bad_hands/4_kpts.png", 2.0, 42],
|
| 1214 |
+
# ["bad_hands/5_kpts.png", 3.0, 42],
|
| 1215 |
+
# ["bad_hands/6_kpts.png", 3.0, 1348],
|
| 1216 |
+
# ["bad_hands/7_kpts.png", 3.0, 42],
|
| 1217 |
+
# ]
|
| 1218 |
fix_example_all = [
|
| 1219 |
["bad_hands/1.jpg", "bad_hands/1_composite.png", "bad_hands/1_mask.jpg", "bad_hands/1_kpts.png", 3.0, 1224],
|
| 1220 |
["bad_hands/3.jpg", "bad_hands/3_composite.png", "bad_hands/3_mask.jpg", "bad_hands/3_kpts.png", 1.0, 42],
|
| 1221 |
+
# ["bad_hands/4.jpg", "bad_hands/4_composite.png", "bad_hands/4_mask.jpg", "bad_hands/4_kpts.png", 2.0, 42],
|
| 1222 |
["bad_hands/5.jpg", "bad_hands/5_composite.png", "bad_hands/5_mask.jpg", "bad_hands/5_kpts.png", 3.0, 42],
|
| 1223 |
["bad_hands/6.jpg", "bad_hands/6_composite.png", "bad_hands/6_mask.jpg", "bad_hands/6_kpts.png", 3.0, 1348],
|
| 1224 |
["bad_hands/7.jpg", "bad_hands/7_composite.png", "bad_hands/7_mask.jpg", "bad_hands/7_kpts.png", 3.0, 42],
|
| 1225 |
]
|
| 1226 |
+
# for i in range(len(fix_example_kpts)):
|
| 1227 |
+
# npy_path = fix_example_kpts[i][0].replace("_kpts.png", ".npy")
|
| 1228 |
+
# fix_example_kpts[i].append(npy_path)
|
| 1229 |
for i in range(len(fix_example_all)):
|
| 1230 |
npy_path = fix_example_all[i][3].replace("_kpts.png", ".npy")
|
| 1231 |
fix_example_all[i].append(npy_path)
|
| 1232 |
|
| 1233 |
+
# #fix_examples_all table tr:first-child {
|
| 1234 |
+
# display: none !important;
|
| 1235 |
+
# }
|
| 1236 |
custom_css = """
|
| 1237 |
.gradio-container .examples img {
|
| 1238 |
width: 240px !important;
|
|
|
|
| 1284 |
#fix_examples_all table tr td:nth-child(7) {
|
| 1285 |
display: none !important;
|
| 1286 |
}
|
|
|
|
|
|
|
|
|
|
| 1287 |
#repose_tutorial video {
|
| 1288 |
width: 50% !important;
|
| 1289 |
display: block;
|
|
|
|
| 1324 |
#gradio-app {
|
| 1325 |
flex-direction: row; !important;
|
| 1326 |
}
|
| 1327 |
+
#example_ref_target {
|
| 1328 |
+
display: block !important;
|
| 1329 |
+
width: 66.6667% !important;
|
| 1330 |
+
margin-left: 0 !important;
|
| 1331 |
+
margin-right: auto !important;
|
| 1332 |
+
align-self: flex-start !important;
|
| 1333 |
+
}
|
| 1334 |
"""
|
| 1335 |
##no_wrap_row {
|
| 1336 |
# display: flex !important;
|
|
|
|
| 1462 |
# config
|
| 1463 |
use_pose = gr.State(value=True)
|
| 1464 |
|
| 1465 |
+
# more options
|
| 1466 |
+
with gr.Accordion(label="More options", open=False):
|
| 1467 |
+
with gr.Row():
|
| 1468 |
+
n_generation = gr.Slider(
|
| 1469 |
+
label="Number of generations",
|
| 1470 |
+
value=1,
|
| 1471 |
+
minimum=1,
|
| 1472 |
+
maximum=MAX_N,
|
| 1473 |
+
step=1,
|
| 1474 |
+
randomize=False,
|
| 1475 |
+
interactive=True,
|
| 1476 |
+
)
|
| 1477 |
+
seed = gr.Slider(
|
| 1478 |
+
label="Seed",
|
| 1479 |
+
value=42,
|
| 1480 |
+
minimum=0,
|
| 1481 |
+
maximum=10000,
|
| 1482 |
+
step=1,
|
| 1483 |
+
randomize=False,
|
| 1484 |
+
interactive=True,
|
| 1485 |
+
)
|
| 1486 |
+
cfg = gr.Slider(
|
| 1487 |
+
label="Classifier free guidance scale",
|
| 1488 |
+
value=2.5,
|
| 1489 |
+
minimum=0.0,
|
| 1490 |
+
maximum=10.0,
|
| 1491 |
+
step=0.1,
|
| 1492 |
+
randomize=False,
|
| 1493 |
+
interactive=True,
|
| 1494 |
+
)
|
| 1495 |
+
|
| 1496 |
# main tabs
|
| 1497 |
with gr.Row():
|
| 1498 |
# ref column
|
|
|
|
| 1513 |
layers=False,
|
| 1514 |
crop_size="1:1",
|
| 1515 |
)
|
|
|
|
| 1516 |
use_mask = Toggle(label="Use mask", value=False, interactive=True)
|
| 1517 |
with gr.Accordion(label="See hand pose & mask", open=False):
|
| 1518 |
with gr.Tab("Automatic hand keypoints"):
|
|
|
|
| 1631 |
layers=False,
|
| 1632 |
crop_size="1:1",
|
| 1633 |
)
|
| 1634 |
+
# gr.Examples(example_target_imgs, [target], examples_per_page=20)
|
| 1635 |
with gr.Accordion(label="See hand pose", open=False):
|
| 1636 |
with gr.Tab("Automatic hand keypoints"):
|
| 1637 |
target_pose = gr.Image(
|
|
|
|
| 1766 |
# )
|
| 1767 |
clear = gr.ClearButton(elem_id="clear_button")
|
| 1768 |
|
| 1769 |
+
with gr.Row():
|
| 1770 |
+
gr.Examples(example_ref_imgs, [ref, target], examples_per_page=20, elem_id="example_ref_target")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1771 |
|
| 1772 |
# tutorial video
|
| 1773 |
with gr.Accordion("Tutorial Video of Demo 1", elem_id="accordion_bold_large_center"):
|
|
|
|
| 1844 |
ref_flip.select(
|
| 1845 |
flip_hand,
|
| 1846 |
[ref, ref_im_raw, ref_pose, ref_manual_pose, ref_manual_kp_right, ref_manual_kp_left, ref_cond, ref_auto_cond, ref_manual_cond],
|
| 1847 |
+
[ref, ref_im_raw, ref_pose, ref_manual_pose, ref_manual_kp_right, ref_manual_kp_left, ref_cond, ref_auto_cond, ref_manual_cond, ref_is_user]
|
| 1848 |
)
|
| 1849 |
|
| 1850 |
# target listeners
|
|
|
|
| 1911 |
target_flip.select(
|
| 1912 |
flip_hand,
|
| 1913 |
[target, target_im_raw, target_pose, target_manual_pose, target_manual_kp_right, target_manual_kp_left, target_cond, target_auto_cond, target_manual_cond, target_keypts, target_auto_keypts, target_manual_keypts],
|
| 1914 |
+
[target, target_im_raw, target_pose, target_manual_pose, target_manual_kp_right, target_manual_kp_left, target_cond, target_auto_cond, target_manual_cond, target_is_user, target_keypts, target_auto_keypts, target_manual_keypts],
|
| 1915 |
)
|
| 1916 |
|
| 1917 |
# run listeners
|
|
|
|
| 2073 |
)
|
| 2074 |
with gr.Column():
|
| 2075 |
gr.Markdown(
|
| 2076 |
+
"""<p style="text-align: center; font-size: 18px; font-weight: bold;">2. Select area to fix <br>(⚠️and surrounding area)</p>"""
|
| 2077 |
)
|
| 2078 |
# gr.Markdown(
|
| 2079 |
# """<p style="text-align: center;">Don't brush the entire hand!</p>"""
|
|
|
|
| 2108 |
# gr.Markdown(
|
| 2109 |
# """<p style="text-align: center;">Either get hand pose from Examples, or manually give hand pose (located at the bottom)</p>"""
|
| 2110 |
# )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2111 |
# with gr.Accordion(open=True):
|
| 2112 |
# fix_ex_kpts = gr.Examples(
|
| 2113 |
# fix_example_kpts,
|
|
|
|
| 2116 |
# postprocess=False,
|
| 2117 |
# elem_id="kpts_examples"
|
| 2118 |
# )
|
| 2119 |
+
# with gr.Accordion("[Your own image] Manually give hand pose", open=False, elem_id="accordion_bold"):
|
| 2120 |
+
gr.Markdown(
|
| 2121 |
+
"""<p style="text-align: center;">① Tell us if this is right, left, or both hands, if it wasn't from Example</p>"""
|
| 2122 |
+
)
|
| 2123 |
+
fix_checkbox = gr.CheckboxGroup(
|
| 2124 |
+
["Right hand", "Left hand"],
|
| 2125 |
+
show_label=False,
|
| 2126 |
+
interactive=False,
|
| 2127 |
+
)
|
| 2128 |
+
fix_kp_r_info = gr.Markdown(
|
| 2129 |
+
"""<p style="text-align: center;">② Click 21 keypoints on the image to provide the target hand pose of <b>right hand</b>. See the \"OpenPose keypoints convention\" for guidance.</p>""",
|
| 2130 |
+
visible=False
|
| 2131 |
+
)
|
| 2132 |
+
fix_kp_right = gr.Image(
|
| 2133 |
+
type="numpy",
|
| 2134 |
+
label="Keypoint Selection (right hand)",
|
| 2135 |
+
show_label=True,
|
| 2136 |
+
height=LENGTH,
|
| 2137 |
+
width=LENGTH,
|
| 2138 |
+
interactive=False,
|
| 2139 |
+
visible=False,
|
| 2140 |
+
sources=[],
|
| 2141 |
+
)
|
| 2142 |
+
with gr.Row():
|
| 2143 |
+
fix_undo_right = gr.Button(
|
| 2144 |
+
value="Undo", interactive=False, visible=False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2145 |
)
|
| 2146 |
+
fix_reset_right = gr.Button(
|
| 2147 |
+
value="Reset", interactive=False, visible=False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2148 |
)
|
| 2149 |
+
fix_kp_l_info = gr.Markdown(
|
| 2150 |
+
"""<p style="text-align: center;">② Click 21 keypoints on the image to provide the target hand pose of <b>left hand</b>. See the \"OpenPose keypoints convention\" for guidance.</p>""",
|
| 2151 |
+
visible=False
|
| 2152 |
+
)
|
| 2153 |
+
fix_kp_left = gr.Image(
|
| 2154 |
+
type="numpy",
|
| 2155 |
+
label="Keypoint Selection (left hand)",
|
| 2156 |
+
show_label=True,
|
| 2157 |
+
height=LENGTH,
|
| 2158 |
+
width=LENGTH,
|
| 2159 |
+
interactive=False,
|
| 2160 |
+
visible=False,
|
| 2161 |
+
sources=[],
|
| 2162 |
+
)
|
| 2163 |
+
with gr.Row():
|
| 2164 |
+
fix_undo_left = gr.Button(
|
| 2165 |
+
value="Undo", interactive=False, visible=False
|
| 2166 |
)
|
| 2167 |
+
fix_reset_left = gr.Button(
|
| 2168 |
+
value="Reset", interactive=False, visible=False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2169 |
)
|
| 2170 |
+
fix_kp_all = gr.Image(
|
| 2171 |
+
type="numpy",
|
| 2172 |
+
label="Keypoint Selection (from Example)",
|
| 2173 |
+
show_label=True,
|
| 2174 |
+
height=LENGTH,
|
| 2175 |
+
width=LENGTH,
|
| 2176 |
+
interactive=False,
|
| 2177 |
+
visible=True,
|
| 2178 |
+
sources=(),
|
| 2179 |
+
image_mode="RGBA"
|
| 2180 |
+
)
|
| 2181 |
+
gr.Markdown(
|
| 2182 |
+
"""<p style="text-align: left; font-weight: bold; ">OpenPose keypoints convention</p>"""
|
| 2183 |
+
)
|
| 2184 |
+
fix_openpose = gr.Image(
|
| 2185 |
+
value="openpose.png",
|
| 2186 |
+
type="numpy",
|
| 2187 |
+
show_label=False,
|
| 2188 |
+
height=LENGTH // 2,
|
| 2189 |
+
width=LENGTH // 2,
|
| 2190 |
+
interactive=False,
|
| 2191 |
+
)
|
| 2192 |
|
| 2193 |
# result column
|
| 2194 |
with gr.Column():
|
sample_images/sample11.jpg
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
sbatch/sbatch_demo.sh
CHANGED
|
@@ -4,8 +4,10 @@
|
|
| 4 |
#SBATCH -J demo_foundhand
|
| 5 |
|
| 6 |
# partition
|
| 7 |
-
#SBATCH
|
| 8 |
-
|
|
|
|
|
|
|
| 9 |
|
| 10 |
# ensures all allocated cores are on the same node
|
| 11 |
#SBATCH -N 1
|
|
@@ -35,4 +37,4 @@ conda activate handdiff
|
|
| 35 |
cd $HOME/hdd/FoundHand_demo
|
| 36 |
echo Directory is `pwd`
|
| 37 |
|
| 38 |
-
python -u app.py
|
|
|
|
| 4 |
#SBATCH -J demo_foundhand
|
| 5 |
|
| 6 |
# partition
|
| 7 |
+
#SBATCH -p 3090-gcondo --gres=gpu:1
|
| 8 |
+
|
| 9 |
+
##SBATCH --partition=ssrinath-gcondo --gres=gpu:1 --gres-flags=enforce-binding
|
| 10 |
+
##SBATCH --account=ssrinath-gcondo
|
| 11 |
|
| 12 |
# ensures all allocated cores are on the same node
|
| 13 |
#SBATCH -N 1
|
|
|
|
| 37 |
cd $HOME/hdd/FoundHand_demo
|
| 38 |
echo Directory is `pwd`
|
| 39 |
|
| 40 |
+
python -u app.py --not_hf --model_epoch 4
|
sbatch/sbatch_demo2.sh
CHANGED
|
@@ -37,4 +37,4 @@ conda activate handdiff
|
|
| 37 |
cd $HOME/hdd/FoundHand_demo
|
| 38 |
echo Directory is `pwd`
|
| 39 |
|
| 40 |
-
python -u app.py
|
|
|
|
| 37 |
cd $HOME/hdd/FoundHand_demo
|
| 38 |
echo Directory is `pwd`
|
| 39 |
|
| 40 |
+
python -u app.py --not_hf --model_epoch 6
|