add a checkbox to make grounded-sam optional
app.py (CHANGED)
@@ -125,7 +125,10 @@ def draw_box(box, draw, label):
 
 
 @torch.no_grad()
-def inference(raw_image, specified_tags, tagging_model_type, tagging_model, grounding_dino_model, sam_model):
+def inference(
+    raw_image, specified_tags, do_det_seg,
+    tagging_model_type, tagging_model, grounding_dino_model, sam_model
+):
     print(f"Start processing, image size {raw_image.size}")
     raw_image = raw_image.convert("RGB")
 
@@ -155,6 +158,13 @@ def inference(raw_image, specified_tags, tagging_model_type, tagging_model, grounding_dino_model, sam_model):
     print(f"Tags: {tags}")
     print(f"Caption: {caption}")
 
+    # return
+    if not do_det_seg:
+        if tagging_model_type == "RAM":
+            return tags.replace(", ", " | "), tags_chinese.replace(", ", " | "), None
+        else:
+            return tags.replace(", ", " | "), caption, None
+
     # run groundingDINO
     transform = T.Compose([
         T.RandomResize([800], max_size=1333),
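With `do_det_seg=False`, `inference` now returns right after tagging and never runs GroundingDINO or SAM; the third slot of the output tuple (the visualization image) is `None` on that path. A minimal sketch of the new call contract, assuming the model handles loaded elsewhere in app.py:

```python
# Sketch of the tagging-only path introduced above (not part of the commit):
# with do_det_seg=False the function returns early, so no boxes or masks are
# computed and the image slot of the result is None.
tags, tags_chinese, vis = inference(
    raw_image, None, False,  # specified_tags=None, do_det_seg=False
    "RAM", ram_model, grounding_dino_model, sam_model,
)
assert vis is None  # detection/segmentation was skipped
```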
@@ -255,14 +265,17 @@ if __name__ == "__main__":
     <br>
     Welcome to the RAM/Tag2Text + Grounded-SAM demo! <br><br>
     <li>
-    <b>Recognize Anything Model
+    <b>Recognize Anything Model:</b> Upload your image to get the <b>English and Chinese tags</b>!
     </li>
     <li>
-    <b>Tag2Text Model
+    <b>Tag2Text Model:</b> Upload your image to get the <b>tags and caption</b>!
     (Optional: Specify tags to get the corresponding caption.)
     </li>
+    <li>
+    <b>Grounded-SAM:</b> Tick the checkbox to get <b>boxes</b> and <b>masks</b> of tags!
+    </li>
     <br>
-
+    Great thanks to <a href='https://huggingface.co/majinyu' target='_blank'>Ma Jinyu</a>, the major contributor of this demo!
     """ # noqa
 
     article = """
@@ -277,11 +290,17 @@ if __name__ == "__main__":
     </p>
     """ # noqa
 
-    def inference_with_ram(img):
-        return inference(img, None, "RAM", ram_model, grounding_dino_model, sam_model)
+    def inference_with_ram(img, do_det_seg):
+        return inference(
+            img, None, do_det_seg,
+            "RAM", ram_model, grounding_dino_model, sam_model
+        )
 
-    def inference_with_t2t(img, input_tags):
-        return inference(img, input_tags, "Tag2Text", tag2text_model, grounding_dino_model, sam_model)
+    def inference_with_t2t(img, input_tags, do_det_seg):
+        return inference(
+            img, input_tags, do_det_seg,
+            "Tag2Text", tag2text_model, grounding_dino_model, sam_model
+        )
 
     with gr.Blocks(title="Recognize Anything Model") as demo:
         ###############
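Gradio passes `fn` only the values of the components listed in `inputs`, so the fixed model arguments have to be baked in; that is all the two wrappers above do. The same binding could be written inline (an equivalent sketch, not what the commit does):

```python
# Equivalent inline binding (sketch): close over the fixed model handles so
# Gradio only supplies the component values (img, input_tags, do_det_seg).
inference_with_t2t = lambda img, input_tags, do_det_seg: inference(
    img, input_tags, do_det_seg,
    "Tag2Text", tag2text_model, grounding_dino_model, sam_model,
)
```

Named functions were presumably kept for readability and because the same callables are reused by both the `gr.Examples` blocks and the button handlers below.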
@@ -293,6 +312,7 @@ if __name__ == "__main__":
         with gr.Row():
             with gr.Column():
                 ram_in_img = gr.Image(type="pil")
+                ram_opt_det_seg = gr.Checkbox(label="Get Boxes and Masks with Grounded-SAM", value=True)
                 with gr.Row():
                     ram_btn_run = gr.Button(value="Run")
                     ram_btn_clear = gr.ClearButton()
@@ -302,12 +322,12 @@ if __name__ == "__main__":
                 ram_out_biaoqian = gr.Textbox(label="标签")
             gr.Examples(
                 examples=[
-                    ["images/demo1.jpg"],
-                    ["images/demo2.jpg"],
-                    ["images/demo4.jpg"],
+                    ["images/demo1.jpg", True],
+                    ["images/demo2.jpg", True],
+                    ["images/demo4.jpg", True],
                 ],
                 fn=inference_with_ram,
-                inputs=[ram_in_img],
+                inputs=[ram_in_img, ram_opt_det_seg],
                 outputs=[ram_out_tag, ram_out_biaoqian, ram_out_img],
                 cache_examples=True
             )
@@ -317,6 +337,7 @@ if __name__ == "__main__":
             with gr.Column():
                 t2t_in_img = gr.Image(type="pil")
                 t2t_in_tag = gr.Textbox(label="User Specified Tags (Optional, separated by comma)")
+                t2t_opt_det_seg = gr.Checkbox(label="Get Boxes and Masks with Grounded-SAM", value=True)
                 with gr.Row():
                     t2t_btn_run = gr.Button(value="Run")
                     t2t_btn_clear = gr.ClearButton()
@@ -326,12 +347,12 @@ if __name__ == "__main__":
                 t2t_out_cap = gr.Textbox(label="Caption")
             gr.Examples(
                 examples=[
-                    ["images/demo4.jpg", ""],
-                    ["images/demo4.jpg", "power line"],
-                    ["images/demo4.jpg", "track, train"],
+                    ["images/demo4.jpg", "", True],
+                    ["images/demo4.jpg", "power line", False],
+                    ["images/demo4.jpg", "track, train", False],
                 ],
                 fn=inference_with_t2t,
-                inputs=[t2t_in_img, t2t_in_tag],
+                inputs=[t2t_in_img, t2t_in_tag, t2t_opt_det_seg],
                 outputs=[t2t_out_tag, t2t_out_cap, t2t_out_img],
                 cache_examples=True
             )
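Because both `gr.Examples` blocks set `cache_examples=True`, Gradio runs `fn` on every example row at startup and stores the outputs, so each row must now also carry a value for the new checkbox; passing `False` for two of the Tag2Text rows plausibly keeps that startup pass cheap. Conceptually the caching pass does something like this (a sketch, not Gradio internals):

```python
from PIL import Image

# Conceptual sketch of the cache_examples=True startup pass: each example row
# is fed positionally to fn (one value per component in `inputs`) and the
# outputs are stored so clicking the example replays them instantly.
for img_path, user_tags, flag in [
    ["images/demo4.jpg", "", True],
    ["images/demo4.jpg", "power line", False],
]:
    cached_outputs = inference_with_t2t(Image.open(img_path), user_tags, flag)
```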
@@ -344,23 +365,22 @@ if __name__ == "__main__":
         # run inference
         ram_btn_run.click(
             fn=inference_with_ram,
-            inputs=[ram_in_img],
+            inputs=[ram_in_img, ram_opt_det_seg],
             outputs=[ram_out_tag, ram_out_biaoqian, ram_out_img]
         )
         t2t_btn_run.click(
             fn=inference_with_t2t,
-            inputs=[t2t_in_img, t2t_in_tag],
+            inputs=[t2t_in_img, t2t_in_tag, t2t_opt_det_seg],
             outputs=[t2t_out_tag, t2t_out_cap, t2t_out_img]
         )
 
-
-
-
-
-
-
-
-        ])
+        # hide or show image output
+        ram_opt_det_seg.change(fn=lambda b: gr.update(visible=b), inputs=[ram_opt_det_seg], outputs=[ram_out_img])
+        t2t_opt_det_seg.change(fn=lambda b: gr.update(visible=b), inputs=[t2t_opt_det_seg], outputs=[t2t_out_img])
+
+        # clear
+        ram_btn_clear.add([ram_in_img, ram_out_img, ram_out_tag, ram_out_biaoqian])
+        t2t_btn_clear.add([t2t_in_img, t2t_in_tag, t2t_out_img, t2t_out_tag, t2t_out_cap])
 
     return demo
 
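The `.change` handlers above are the heart of the commit: the checkbox both feeds `do_det_seg` into `inference` and hides the image output whenever detection is skipped. A minimal self-contained sketch of that pattern (hypothetical component names, Gradio 3.x API):

```python
import gradio as gr

def greet(name, verbose):
    # the second output only carries data when the checkbox is ticked
    detail = f"({len(name)} characters)" if verbose else None
    return f"Hello, {name}!", detail

with gr.Blocks() as demo:
    name_in = gr.Textbox(label="Name")
    verbose_opt = gr.Checkbox(label="Show detail", value=True)
    btn = gr.Button(value="Run")
    greeting_out = gr.Textbox(label="Greeting")
    detail_out = gr.Textbox(label="Detail")

    # the checkbox is both a function input and a visibility switch,
    # mirroring ram_opt_det_seg / t2t_opt_det_seg in the commit
    btn.click(fn=greet, inputs=[name_in, verbose_opt],
              outputs=[greeting_out, detail_out])
    verbose_opt.change(fn=lambda b: gr.update(visible=b),
                       inputs=[verbose_opt], outputs=[detail_out])

demo.launch()
```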