| """ | |
| Gradio app for pollen-vision | |
| This script creates a Gradio app for pollen-vision. The app allows users to perform object detection and object segmentation using the OWL-ViT and MobileSAM models. | |
| """ | |
from typing import Any, Dict, List

import gradio as gr
import numpy as np
import numpy.typing as npt
from datasets import load_dataset

from pollen_vision.utils import Annotator, get_bboxes
from pollen_vision.vision_models.object_detection import OwlVitWrapper
from pollen_vision.vision_models.object_segmentation import MobileSamWrapper
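# Instantiate the model wrappers and the annotator once at module level,
# so model weights are loaded a single time rather than on every request.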
owl_vit = OwlVitWrapper()
mobile_sam = MobileSamWrapper()
annotator = Annotator()
def object_detection(
    img: npt.NDArray[np.uint8], text_queries: List[str], score_threshold: float
) -> List[Dict[str, Any]]:
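    """Run OWL-ViT zero-shot object detection on an RGB image.

    Returns one prediction dict per object found among the text queries.
    """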
    predictions: List[Dict[str, Any]] = owl_vit.infer(
        im=img, candidate_labels=text_queries, detection_threshold=score_threshold
    )
    return predictions
def object_segmentation(
    img: npt.NDArray[np.uint8], object_detection_predictions: List[Dict[str, Any]]
) -> List[npt.NDArray[np.uint8]]:
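    """Segment the detected objects with MobileSAM, prompted by the detection bounding boxes.

    Returns one mask per detected object.
    """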
    bboxes = get_bboxes(predictions=object_detection_predictions)
    masks: List[npt.NDArray[np.uint8]] = mobile_sam.infer(im=img, bboxes=bboxes)
    return masks
def query(
    task: str,
    img: npt.NDArray[np.uint8],
    text_queries: List[str],
    score_threshold: float,
) -> npt.NDArray[np.uint8]:
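    """Gradio entry point: detect objects, optionally segment them, and return the annotated image."""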
    # The Gradio text input yields a single comma-separated string
    # (the examples pass lists directly); normalize to the list of
    # labels the detector expects.
    if isinstance(text_queries, str):
        text_queries = [q.strip() for q in text_queries.split(",")]
    object_detection_predictions = object_detection(
        img=img, text_queries=text_queries, score_threshold=score_threshold
    )
    if task == "Object detection + segmentation (OWL-ViT + MobileSAM)":
        masks = object_segmentation(
            img=img, object_detection_predictions=object_detection_predictions
        )
        img = annotator.annotate(
            im=img, detection_predictions=object_detection_predictions, masks=masks
        )
        return img
    img = annotator.annotate(im=img, detection_predictions=object_detection_predictions)
    return img
| description = """ | |
| Welcome to the demo of pollen-vision, a simple and unified Python library to zero-shot computer vision models curated | |
| for robotics use cases. **Pollen-vision** is designed for ease of installation and use, composed of independent modules | |
| that can be combined to create a 3D object detection pipeline, getting the position of the objects in 3D space (x, y, z). | |
| \n\nIn this demo, you have the option to choose between two tasks: object detection and object detection + segmentation. | |
| The models available are: | |
| - **OWL-VIT** (Open World Localization - Vision Transformer, By Google Research): this model performs text-conditionned | |
| zero-shot 2D object localization in RGB images. | |
| - **Mobile SAM**: A lightweight version of the Segment Anything Model (SAM) by Meta AI. SAM is a zero shot image | |
| segmentation model. It can be prompted with bounding boxes or points. (https://github.com/ChaoningZhang/MobileSAM) | |
| \n\nYou can input images in this demo in three ways: either by trying out the provided examples, by uploading an image | |
| of your choice, or by capturing an image from your computer's webcam. | |
| Additionally, you should provide text queries representing a list of objects to detect. Separate each object with a comma. | |
| The last input parameter is the detection threshold (ranging from 0 to 1), which defaults to 0.1. | |
| \n\nCheck out our blog post introducing pollen-vision or its <a href="https://github.com/pollen-robotics/pollen-vision"> | |
| Github repository</a> for more info! | |
| """ | |
demo_inputs = [
    gr.Dropdown(
        [
            "Object detection (OWL-ViT)",
            "Object detection + segmentation (OWL-ViT + MobileSAM)",
        ],
        label="Choose a task",
        value="Object detection (OWL-ViT)",
    ),
    gr.Image(),
    "text",
    gr.Slider(0, 1, value=0.1),
]
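# Example images come from the "Reachy doing things" dataset published by Pollen Robotics.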
rdt_dataset = load_dataset("pollen-robotics/reachy-doing-things", split="train")
img_kitchen_detection = rdt_dataset[11]["image"]
img_kitchen_segmentation = rdt_dataset[12]["image"]
demo_examples = [
    [
        "Object detection (OWL-ViT)",
        img_kitchen_detection,
        ["kettle", "black mug", "sink", "blue mug", "sponge", "bag of chips"],
        0.15,
    ],
    [
        "Object detection + segmentation (OWL-ViT + MobileSAM)",
        img_kitchen_segmentation,
        ["blue mug", "paper cup", "kettle", "sponge"],
        0.12,
    ],
]
demo = gr.Interface(
    fn=query,
    inputs=demo_inputs,
    outputs="image",
    title="Use zero-shot computer vision models with pollen-vision",
    description=description,
    examples=demo_examples,
)
demo.launch()
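# Note: on Hugging Face Spaces the app is served automatically; when run locally,
# demo.launch() starts a local server (pass share=True for a temporary public link).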