add a checkbox to make grounded-sam optional
app.py (CHANGED)
@@ -125,7 +125,10 @@ def draw_box(box, draw, label):
 
 
 @torch.no_grad()
-def inference(raw_image, specified_tags, tagging_model_type, tagging_model, grounding_dino_model, sam_model):
+def inference(
+    raw_image, specified_tags, do_det_seg,
+    tagging_model_type, tagging_model, grounding_dino_model, sam_model
+):
     print(f"Start processing, image size {raw_image.size}")
     raw_image = raw_image.convert("RGB")
 
@@ -155,6 +158,13 @@ def inference(raw_image, specified_tags, tagging_model_type, tagging_model, grounding_dino_model, sam_model):
     print(f"Tags: {tags}")
     print(f"Caption: {caption}")
 
+    # return
+    if not do_det_seg:
+        if tagging_model_type == "RAM":
+            return tags.replace(", ", " | "), tags_chinese.replace(", ", " | "), None
+        else:
+            return tags.replace(", ", " | "), caption, None
+
     # run groundingDINO
     transform = T.Compose([
         T.RandomResize([800], max_size=1333),
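With `do_det_seg=False`, `inference` now returns right after tagging and never runs GroundingDINO or SAM; the third slot of the output tuple (the visualization image) is `None` on that path. A minimal sketch of the new call contract, assuming the model handles loaded elsewhere in app.py:

```python
# Sketch of the tagging-only path introduced above (not part of the commit):
# with do_det_seg=False the function returns early, so no boxes or masks are
# computed and the image slot of the result is None.
tags, tags_chinese, vis = inference(
    raw_image, None, False,  # specified_tags=None, do_det_seg=False
    "RAM", ram_model, grounding_dino_model, sam_model,
)
assert vis is None  # detection/segmentation was skipped
```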
@@ -255,14 +265,17 @@ if __name__ == "__main__":
     <br>
     Welcome to the RAM/Tag2Text + Grounded-SAM demo! <br><br>
     <li>
-    <b>Recognize Anything Model
+    <b>Recognize Anything Model:</b> Upload your image to get the <b>English and Chinese tags</b>!
     </li>
     <li>
-    <b>Tag2Text Model
+    <b>Tag2Text Model:</b> Upload your image to get the <b>tags and caption</b>!
     (Optional: Specify tags to get the corresponding caption.)
     </li>
+    <li>
+    <b>Grounded-SAM:</b> Tick the checkbox to get <b>boxes</b> and <b>masks</b> of tags!
+    </li>
     <br>
-
+    Great thanks to <a href='https://huggingface.co/majinyu' target='_blank'>Ma Jinyu</a>, the major contributor of this demo!
     """ # noqa
 
     article = """
@@ -277,11 +290,17 @@ if __name__ == "__main__":
     </p>
     """ # noqa
 
-    def inference_with_ram(img):
-        return inference(img, None, "RAM", ram_model, grounding_dino_model, sam_model)
+    def inference_with_ram(img, do_det_seg):
+        return inference(
+            img, None, do_det_seg,
+            "RAM", ram_model, grounding_dino_model, sam_model
+        )
 
-    def inference_with_t2t(img, input_tags):
-        return inference(img, input_tags, "Tag2Text", tag2text_model, grounding_dino_model, sam_model)
+    def inference_with_t2t(img, input_tags, do_det_seg):
+        return inference(
+            img, input_tags, do_det_seg,
+            "Tag2Text", tag2text_model, grounding_dino_model, sam_model
+        )
 
     with gr.Blocks(title="Recognize Anything Model") as demo:
         ###############
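Gradio passes `fn` only the values of the components listed in `inputs`, so the fixed model arguments have to be baked in; that is all the two wrappers above do. The same binding could be written inline (an equivalent sketch, not what the commit does):

```python
# Equivalent inline binding (sketch): close over the fixed model handles so
# Gradio only supplies the component values (img, input_tags, do_det_seg).
inference_with_t2t = lambda img, input_tags, do_det_seg: inference(
    img, input_tags, do_det_seg,
    "Tag2Text", tag2text_model, grounding_dino_model, sam_model,
)
```

Named functions were presumably kept for readability and because the same callables are reused by both the `gr.Examples` blocks and the button handlers below.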
@@ -293,6 +312,7 @@ if __name__ == "__main__":
         with gr.Row():
             with gr.Column():
                 ram_in_img = gr.Image(type="pil")
+                ram_opt_det_seg = gr.Checkbox(label="Get Boxes and Masks with Grounded-SAM", value=True)
                 with gr.Row():
                     ram_btn_run = gr.Button(value="Run")
                     ram_btn_clear = gr.ClearButton()
@@ -302,12 +322,12 @@ if __name__ == "__main__":
                 ram_out_biaoqian = gr.Textbox(label="标签")
             gr.Examples(
                 examples=[
-                    ["images/demo1.jpg"],
-                    ["images/demo2.jpg"],
-                    ["images/demo4.jpg"],
+                    ["images/demo1.jpg", True],
+                    ["images/demo2.jpg", True],
+                    ["images/demo4.jpg", True],
                 ],
                 fn=inference_with_ram,
-                inputs=[ram_in_img],
+                inputs=[ram_in_img, ram_opt_det_seg],
                 outputs=[ram_out_tag, ram_out_biaoqian, ram_out_img],
                 cache_examples=True
             )
@@ -317,6 +337,7 @@ if __name__ == "__main__":
             with gr.Column():
                 t2t_in_img = gr.Image(type="pil")
                 t2t_in_tag = gr.Textbox(label="User Specified Tags (Optional, separated by comma)")
+                t2t_opt_det_seg = gr.Checkbox(label="Get Boxes and Masks with Grounded-SAM", value=True)
                 with gr.Row():
                     t2t_btn_run = gr.Button(value="Run")
                     t2t_btn_clear = gr.ClearButton()
@@ -326,12 +347,12 @@ if __name__ == "__main__":
                 t2t_out_cap = gr.Textbox(label="Caption")
             gr.Examples(
                 examples=[
-                    ["images/demo4.jpg", ""],
-                    ["images/demo4.jpg", "power line"],
-                    ["images/demo4.jpg", "track, train"],
+                    ["images/demo4.jpg", "", True],
+                    ["images/demo4.jpg", "power line", False],
+                    ["images/demo4.jpg", "track, train", False],
                 ],
                 fn=inference_with_t2t,
-                inputs=[t2t_in_img, t2t_in_tag],
+                inputs=[t2t_in_img, t2t_in_tag, t2t_opt_det_seg],
                 outputs=[t2t_out_tag, t2t_out_cap, t2t_out_img],
                 cache_examples=True
             )
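Because both `gr.Examples` blocks set `cache_examples=True`, Gradio runs `fn` on every example row at startup and stores the outputs, so each row must now also carry a value for the new checkbox; passing `False` for two of the Tag2Text rows plausibly keeps that startup pass cheap. Conceptually the caching pass does something like this (a sketch, not Gradio internals):

```python
from PIL import Image

# Conceptual sketch of the cache_examples=True startup pass: each example row
# is fed positionally to fn (one value per component in `inputs`) and the
# outputs are stored so clicking the example replays them instantly.
for img_path, user_tags, flag in [
    ["images/demo4.jpg", "", True],
    ["images/demo4.jpg", "power line", False],
]:
    cached_outputs = inference_with_t2t(Image.open(img_path), user_tags, flag)
```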
@@ -344,23 +365,22 @@ if __name__ == "__main__":
         # run inference
         ram_btn_run.click(
             fn=inference_with_ram,
-            inputs=[ram_in_img],
+            inputs=[ram_in_img, ram_opt_det_seg],
             outputs=[ram_out_tag, ram_out_biaoqian, ram_out_img]
         )
         t2t_btn_run.click(
             fn=inference_with_t2t,
-            inputs=[t2t_in_img, t2t_in_tag],
+            inputs=[t2t_in_img, t2t_in_tag, t2t_opt_det_seg],
             outputs=[t2t_out_tag, t2t_out_cap, t2t_out_img]
         )
 
-
-
-
-
-
-
-
-        ])
+        # hide or show image output
+        ram_opt_det_seg.change(fn=lambda b: gr.update(visible=b), inputs=[ram_opt_det_seg], outputs=[ram_out_img])
+        t2t_opt_det_seg.change(fn=lambda b: gr.update(visible=b), inputs=[t2t_opt_det_seg], outputs=[t2t_out_img])
+
+        # clear
+        ram_btn_clear.add([ram_in_img, ram_out_img, ram_out_tag, ram_out_biaoqian])
+        t2t_btn_clear.add([t2t_in_img, t2t_in_tag, t2t_out_img, t2t_out_tag, t2t_out_cap])
 
     return demo
 
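The `.change` handlers above are the heart of the commit: the checkbox both feeds `do_det_seg` into `inference` and hides the image output whenever detection is skipped. A minimal self-contained sketch of that pattern (hypothetical component names, Gradio 3.x API):

```python
import gradio as gr

def greet(name, verbose):
    # the second output only carries data when the checkbox is ticked
    detail = f"({len(name)} characters)" if verbose else None
    return f"Hello, {name}!", detail

with gr.Blocks() as demo:
    name_in = gr.Textbox(label="Name")
    verbose_opt = gr.Checkbox(label="Show detail", value=True)
    btn = gr.Button(value="Run")
    greeting_out = gr.Textbox(label="Greeting")
    detail_out = gr.Textbox(label="Detail")

    # the checkbox is both a function input and a visibility switch,
    # mirroring ram_opt_det_seg / t2t_opt_det_seg in the commit
    btn.click(fn=greet, inputs=[name_in, verbose_opt],
              outputs=[greeting_out, detail_out])
    verbose_opt.change(fn=lambda b: gr.update(visible=b),
                       inputs=[verbose_opt], outputs=[detail_out])

demo.launch()
```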