Spaces:
Sleeping
Sleeping
| import spaces | |
| import gradio as gr | |
| from PIL import Image, ImageDraw, ImageFont | |
| from ultralytics import YOLO | |
| import cv2 | |
| import tempfile | |
| import numpy as np | |
# -----------------------------
# Config
# -----------------------------
# Weight files offered in the "Model" dropdowns: the five sizes (n, s, m, l, x)
# for each variant, grouped by variant in the order plain, -seg, -pose, -obb, -cls.
MODEL_CHOICES = [
    f"yolo12{size}{variant}.pt"
    for variant in ("", "-seg", "-pose", "-obb", "-cls")
    for size in "nsmlx"
]

# Inference sizes selectable in the UI, and the one selected by default.
IMG_SIZE_CHOICES = [128, 160, 256, 384, 480, 640, 736, 1024, 1440, 2176]
DEFAULT_IMG_SIZE = 640
| # ----------------------------- | |
| # Inference | |
| # ----------------------------- | |
def yolo_inference_image(image, model_id, conf_threshold, iou_threshold, max_detection, img_size):
    """Run a YOLO model on one image and return the annotated picture.

    Args:
        image: Input image handed to ``model.predict`` (the UI supplies a PIL
            image), or ``None`` when nothing was provided.
        model_id: Weights name passed to ``YOLO`` (e.g. ``"yolo12n.pt"``).
        conf_threshold: Confidence threshold forwarded as ``conf``.
        iou_threshold: IoU threshold forwarded as ``iou``.
        max_detection: Maximum number of detections forwarded as ``max_det``.
        img_size: Inference size forwarded as ``imgsz``.

    Returns:
        A PIL image with the predictions drawn on it. When ``image`` is
        ``None`` a white 640x480 placeholder with a centered message is
        returned instead. ``None`` if the model produced no results.
    """
    # Handle missing input first so no weights are loaded (or downloaded)
    # just to return the placeholder.
    if image is None:
        w, h = 640, 480
        blank = Image.new("RGB", (w, h), color="white")
        draw = ImageDraw.Draw(blank)
        msg = "No image provided"
        font = ImageFont.load_default()
        bbox = draw.textbbox((0, 0), msg, font=font)
        tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
        draw.text(((w - tw) / 2, (h - th) / 2), msg, fill="black", font=font)
        return blank

    model = YOLO(model_id)
    if getattr(model, "task", None) != "classify":
        # Strip the `one2one_cv2` attribute from the last layer when present.
        # NOTE(review): presumably this disables a one-to-one (end-to-end)
        # branch of the detection head - confirm against the installed
        # Ultralytics version.
        head = model.model.model[-1]
        if hasattr(head, "one2one_cv2"):
            delattr(head, "one2one_cv2")

    results = model.predict(
        source=image,
        conf=conf_threshold,
        iou=iou_threshold,
        imgsz=int(img_size),
        # Coerced like imgsz: UI/API clients may deliver the slider value as a float.
        max_det=int(max_detection),
        show_labels=True,
        show_conf=True,
        verbose=False,
    )

    annotated_image = None
    for r in results:
        img_bgr = r.plot()
        # Reverse the channel axis (BGR -> RGB) before handing the array to PIL.
        annotated_image = Image.fromarray(img_bgr[..., ::-1])
    return annotated_image
def yolo_inference_video(video, model_id, conf_threshold, iou_threshold, max_detection, img_size):
    """Run a YOLO model on every frame of a video and write an annotated copy.

    Args:
        video: Path of the input video as accepted by ``cv2.VideoCapture``,
            or ``None`` when nothing was provided.
        model_id: Weights name passed to ``YOLO`` (e.g. ``"yolo12n.pt"``).
        conf_threshold: Confidence threshold forwarded as ``conf``.
        iou_threshold: IoU threshold forwarded as ``iou``.
        max_detection: Maximum number of detections forwarded as ``max_det``.
        img_size: Inference size forwarded as ``imgsz``.

    Returns:
        Path of a temporary ``.mp4`` file (``mp4v`` fourcc) holding the
        annotated frames. When ``video`` is ``None`` the file contains a
        single white 640x480 placeholder frame with a centered message.
        ``None`` if the video cannot be opened or no frame could be read.
    """
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")

    # Handle missing input first so no weights are loaded just for a placeholder.
    if video is None:
        w, h = 640, 480
        blank = Image.new("RGB", (w, h), color="white")
        draw = ImageDraw.Draw(blank)
        msg = "No video provided"
        font = ImageFont.load_default()
        bbox = draw.textbbox((0, 0), msg, font=font)
        tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
        draw.text(((w - tw) / 2, (h - th) / 2), msg, fill="black", font=font)
        # Close our own handle before OpenCV opens the same path for writing.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as handle:
            tmp = handle.name
        out = cv2.VideoWriter(tmp, fourcc, 1, (w, h))
        try:
            out.write(cv2.cvtColor(np.array(blank), cv2.COLOR_RGB2BGR))
        finally:
            out.release()
        return tmp

    cap = cv2.VideoCapture(video)
    if not cap.isOpened():
        cap.release()
        return None

    out = None  # created lazily, once the first annotated frame is available
    tmp = None  # stays None when no frame is ever written
    try:
        fps_val = cap.get(cv2.CAP_PROP_FPS)
        fps = fps_val if fps_val and fps_val > 0 else 25

        model = YOLO(model_id)
        if getattr(model, "task", None) != "classify":
            # Strip the `one2one_cv2` attribute from the last layer when present.
            # NOTE(review): presumably this disables a one-to-one (end-to-end)
            # branch of the detection head - confirm against the installed
            # Ultralytics version.
            head = model.model.model[-1]
            if hasattr(head, "one2one_cv2"):
                delattr(head, "one2one_cv2")

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            results = model.predict(
                source=frame,
                conf=conf_threshold,
                iou=iou_threshold,
                imgsz=int(img_size),
                # Coerced like imgsz: UI/API clients may deliver the slider value as a float.
                max_det=int(max_detection),
                show_labels=True,
                show_conf=True,
                verbose=False,
            )
            # Fall back to the raw frame if the model returned no result object.
            anno_bgr = frame
            for r in results:
                anno_bgr = r.plot()

            if out is None:
                # Size the writer from the actual frame instead of the
                # container metadata, so the declared size always matches
                # the frames that get written.
                frame_h, frame_w = anno_bgr.shape[:2]
                with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as handle:
                    tmp = handle.name
                out = cv2.VideoWriter(tmp, fourcc, fps, (frame_w, frame_h))
            out.write(anno_bgr)
    finally:
        # Release the capture and writer even when inference raises.
        cap.release()
        if out is not None:
            out.release()

    return tmp
def yolo_inference_for_examples(image, model_id, conf_threshold, iou_threshold, max_detection, img_size):
    """Callback used by the examples gallery; delegates to ``yolo_inference_image``."""
    example_args = (image, model_id, conf_threshold, iou_threshold, max_detection, img_size)
    return yolo_inference_image(*example_args)
# Gradio UI: a header, a collapsible citation, and one tab each for image and
# video inference. Both tabs expose the same model / size / threshold controls.
with gr.Blocks() as app:
    gr.Markdown("# YOLO12")
    gr.Markdown(
        "Image or video inference with detection, segmentation, pose, oriented bounding boxes, and classification using the latest Ultralytics YOLO12 models."
    )

    with gr.Accordion("Reference", open=False):
        gr.Markdown(
            """
**BibTeX:**
```
@software{yolo12_ultralytics,
author = {Glenn Jocher and Jing Qiu},
title = {Ultralytics YOLO12},
version = {12.0.0},
year = {2025},
url = {https://github.com/ultralytics/ultralytics},
orcid = {0000-0001-5950-6979, 0000-0003-3783-7069},
license = {AGPL-3.0}
}
```
"""
        )

    with gr.Tabs() as media_tabs:
        # ---- Image tab ----
        with gr.Tab("Image") as image_tab:
            with gr.Row():
                with gr.Column():
                    image = gr.Image(type="pil", label="Image")
                    model_id_img = gr.Dropdown(choices=MODEL_CHOICES, value="yolo12n.pt", label="Model")
                    img_size_img = gr.Radio(choices=IMG_SIZE_CHOICES, value=DEFAULT_IMG_SIZE, label="Image Size")
                    conf_img = gr.Slider(minimum=0, maximum=1, value=0.25, label="Confidence Threshold")
                    iou_img = gr.Slider(minimum=0, maximum=1, value=0.45, label="IoU Threshold")
                    max_det_img = gr.Slider(minimum=1, maximum=300, step=1, value=300, label="Max Detection")
                    infer_image_button = gr.Button("Detect Objects", variant="primary")
                with gr.Column():
                    output_image = gr.Image(type="pil", show_label=False)
                    gr.DeepLinkButton(variant="primary")

            # Component order matches the positional parameters of the
            # image inference callbacks.
            image_inputs = [image, model_id_img, conf_img, iou_img, max_det_img, img_size_img]

            # Each example pairs a sample picture with the weights to run on it;
            # thresholds, max detections and image size are the same for all rows.
            example_sources = [
                ("zidane.jpg", "yolo12s.pt"),
                ("bus.jpg", "yolo12m.pt"),
                ("yolo_vision.jpg", "yolo12x.pt"),
                ("Tricycle.jpg", "yolo12x.pt"),
                ("tcganadolu.jpg", "yolo12m.pt"),
                ("San Diego Airport.jpg", "yolo12x.pt"),
                ("Theodore_Roosevelt.png", "yolo12l.pt"),
            ]
            gr.Examples(
                examples=[
                    [picture, weights, 0.25, 0.45, 300, DEFAULT_IMG_SIZE]
                    for picture, weights in example_sources
                ],
                fn=yolo_inference_for_examples,
                inputs=image_inputs,
                outputs=[output_image],
                label="Examples",
                cache_examples=False,
            )

        # ---- Video tab ----
        with gr.Tab("Video") as video_tab:
            with gr.Row():
                with gr.Column():
                    video = gr.Video(label="Video")
                    model_id_vid = gr.Dropdown(choices=MODEL_CHOICES, value="yolo12n.pt", label="Model")
                    img_size_vid = gr.Radio(choices=IMG_SIZE_CHOICES, value=DEFAULT_IMG_SIZE, label="Image Size")
                    conf_vid = gr.Slider(minimum=0, maximum=1, value=0.25, label="Confidence Threshold")
                    iou_vid = gr.Slider(minimum=0, maximum=1, value=0.45, label="IoU Threshold")
                    max_det_vid = gr.Slider(minimum=1, maximum=300, step=1, value=300, label="Max Detection")
                    infer_video_button = gr.Button("Detect Objects", variant="primary")
                with gr.Column():
                    output_video = gr.Video(show_label=False)
                    gr.DeepLinkButton(variant="primary")

            # Component order matches the positional parameters of the
            # video inference callback.
            video_inputs = [video, model_id_vid, conf_vid, iou_vid, max_det_vid, img_size_vid]

    # Wire each "Detect Objects" button to its inference function.
    infer_image_button.click(
        fn=yolo_inference_image,
        inputs=image_inputs,
        outputs=[output_image],
    )
    infer_video_button.click(
        fn=yolo_inference_video,
        inputs=video_inputs,
        outputs=[output_video],
    )
# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    ocean_theme = gr.themes.Ocean(primary_hue="indigo", secondary_hue="blue")
    app.launch(mcp_server=True, theme=ocean_theme)