"""Gradio demo for Ultralytics YOLO12 inference on images and videos."""

import contextlib
import os
import tempfile

import cv2
import gradio as gr
import numpy as np
import spaces
from PIL import Image, ImageDraw, ImageFont
from ultralytics import YOLO

# -----------------------------
# Config
# -----------------------------
# One checkpoint per (task suffix, model size); the plain suffix is detection.
MODEL_CHOICES = [
    f"yolo12{size}{suffix}.pt"
    for suffix in ("", "-seg", "-pose", "-obb", "-cls")
    for size in "nsmlx"
]
IMG_SIZE_CHOICES = [128, 160, 256, 384, 480, 640, 736, 1024, 1440, 2176]
DEFAULT_IMG_SIZE = 640

# Size of the placeholder frame returned when no input media is supplied.
_PLACEHOLDER_SIZE = (640, 480)

# Frame rate used when the input container does not report a usable one.
_FALLBACK_FPS = 25

# Markdown shown in the "Reference" accordion.
_REFERENCE_MD = """
**BibTeX:**
```
@software{yolo12_ultralytics,
  author = {Glenn Jocher and Jing Qiu},
  title = {Ultralytics YOLO12},
  version = {12.0.0},
  year = {2025},
  url = {https://github.com/ultralytics/ultralytics},
  orcid = {0000-0001-5950-6979, 0000-0003-3783-7069},
  license = {AGPL-3.0}
}
```
"""

# (example image file, model checkpoint) pairs shown under the image tab.
_EXAMPLE_IMAGES = (
    ("zidane.jpg", "yolo12s.pt"),
    ("bus.jpg", "yolo12m.pt"),
    ("yolo_vision.jpg", "yolo12x.pt"),
    ("Tricycle.jpg", "yolo12x.pt"),
    ("tcganadolu.jpg", "yolo12m.pt"),
    ("San Diego Airport.jpg", "yolo12x.pt"),
    ("Theodore_Roosevelt.png", "yolo12l.pt"),
)


# -----------------------------
# Helpers
# -----------------------------
def _load_model(model_id):
    """Load the checkpoint *model_id* and return the YOLO model."""
    model = YOLO(model_id)
    if getattr(model, "task", None) != "classify":
        head = model.model.model[-1]
        # NOTE(review): presumably a compatibility workaround for heads that
        # still carry a `one2one_cv2` branch -- confirm before removing.
        if hasattr(head, "one2one_cv2"):
            delattr(head, "one2one_cv2")
    return model


def _placeholder_image(message):
    """Return a white RGB image with *message* drawn centred in black."""
    width, height = _PLACEHOLDER_SIZE
    canvas = Image.new("RGB", (width, height), color="white")
    draw = ImageDraw.Draw(canvas)
    font = ImageFont.load_default()
    left, top, right, bottom = draw.textbbox((0, 0), message, font=font)
    text_w, text_h = right - left, bottom - top
    draw.text(
        ((width - text_w) / 2, (height - text_h) / 2),
        message,
        fill="black",
        font=font,
    )
    return canvas


def _new_temp_mp4():
    """Create an empty temporary ``.mp4`` file and return its path.

    The Python-side handle is closed before returning so that only OpenCV
    holds the file open while writing.
    """
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as handle:
        return handle.name


def _placeholder_video(message):
    """Write a one-frame video showing *message* and return its path."""
    frame_rgb = _placeholder_image(message)
    path = _new_temp_mp4()
    writer = cv2.VideoWriter(
        path, cv2.VideoWriter_fourcc(*"mp4v"), 1, frame_rgb.size
    )
    try:
        writer.write(cv2.cvtColor(np.array(frame_rgb), cv2.COLOR_RGB2BGR))
    finally:
        writer.release()
    return path


def _predict(model, source, conf_threshold, iou_threshold, max_detection, img_size):
    """Run ``model.predict`` on *source* with the settings shared by both tabs."""
    return model.predict(
        source=source,
        conf=conf_threshold,
        iou=iou_threshold,
        imgsz=int(img_size),
        max_det=int(max_detection),
        show_labels=True,
        show_conf=True,
        verbose=False,
    )


# -----------------------------
# Inference
# -----------------------------
@spaces.GPU
def yolo_inference_image(image, model_id, conf_threshold, iou_threshold, max_detection, img_size):
    """Run a YOLO12 model on one image and return the annotated image.

    Args:
        image: Input image (PIL image from the UI), or None.
        model_id: Checkpoint name, one of ``MODEL_CHOICES``.
        conf_threshold: Minimum confidence for a prediction to be kept.
        iou_threshold: IoU threshold passed to the predictor.
        max_detection: Maximum number of detections per image.
        img_size: Inference image size.

    Returns:
        The annotated PIL image, a "No image provided" placeholder when
        *image* is None, or None when the predictor yields no results.
    """
    # Check the input first so a missing image never triggers a model load.
    if image is None:
        return _placeholder_image("No image provided")

    model = _load_model(model_id)
    results = _predict(
        model, image, conf_threshold, iou_threshold, max_detection, img_size
    )

    annotated_image = None
    for result in results:
        # plot() output is treated as BGR; flip the channel axis for PIL.
        annotated_image = Image.fromarray(result.plot()[..., ::-1])
    return annotated_image


@spaces.GPU
def yolo_inference_video(video, model_id, conf_threshold, iou_threshold, max_detection, img_size):
    """Run a YOLO12 model on every frame of a video and return the result path.

    Args:
        video: Path to the input video file, or None.
        model_id: Checkpoint name, one of ``MODEL_CHOICES``.
        conf_threshold: Minimum confidence for a prediction to be kept.
        iou_threshold: IoU threshold passed to the predictor.
        max_detection: Maximum number of detections per frame.
        img_size: Inference image size.

    Returns:
        Path to the annotated ``.mp4`` file, a one-frame "No video provided"
        placeholder video when *video* is None, or None when the input
        cannot be opened or contains no readable frame.
    """
    if video is None:
        return _placeholder_video("No video provided")

    cap = cv2.VideoCapture(video)
    writer = None
    out_path = None
    succeeded = False
    try:
        if not cap.isOpened():
            return None

        model = _load_model(model_id)

        fps_val = cap.get(cv2.CAP_PROP_FPS)
        fps = fps_val if fps_val and fps_val > 0 else _FALLBACK_FPS
        w_val = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h_val = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        width = w_val if w_val > 0 else _PLACEHOLDER_SIZE[0]
        height = h_val if h_val > 0 else _PLACEHOLDER_SIZE[1]

        out_path = _new_temp_mp4()
        writer = cv2.VideoWriter(
            out_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height)
        )

        wrote_any = False
        while True:
            ok, frame = cap.read()
            if not ok:
                break
            results = _predict(
                model, frame, conf_threshold, iou_threshold, max_detection, img_size
            )
            # Fall back to the raw frame if the predictor yields nothing.
            annotated = frame
            for result in results:
                annotated = result.plot()
            # VideoWriter only accepts frames of the size it was opened with;
            # this matters when the reported size was missing or inaccurate.
            if annotated.shape[1] != width or annotated.shape[0] != height:
                annotated = cv2.resize(annotated, (width, height))
            writer.write(annotated)
            wrote_any = True
        succeeded = wrote_any
    finally:
        # Always free the native handles, including when inference raises.
        cap.release()
        if writer is not None:
            writer.release()
        if out_path is not None and not succeeded:
            # Best-effort removal of an empty or partial output file.
            with contextlib.suppress(OSError):
                os.remove(out_path)

    return out_path if succeeded else None


def yolo_inference_for_examples(image, model_id, conf_threshold, iou_threshold, max_detection, img_size):
    """Run image inference for the example gallery.

    Takes the same arguments as ``yolo_inference_image`` and returns its
    result unchanged.
    """
    return yolo_inference_image(
        image, model_id, conf_threshold, iou_threshold, max_detection, img_size
    )


# -----------------------------
# UI
# -----------------------------
with gr.Blocks() as app:
    gr.Markdown("# YOLO12")
    gr.Markdown(
        "Image or video inference with detection, segmentation, pose, "
        "oriented bounding boxes, and classification using the latest "
        "Ultralytics YOLO12 models."
    )
    with gr.Accordion("Reference", open=False):
        gr.Markdown(_REFERENCE_MD)

    with gr.Tabs() as media_tabs:
        with gr.Tab("Image") as image_tab:
            with gr.Row():
                with gr.Column():
                    image = gr.Image(type="pil", label="Image")
                    model_id_img = gr.Dropdown(label="Model", choices=MODEL_CHOICES, value="yolo12n.pt")
                    img_size_img = gr.Radio(choices=IMG_SIZE_CHOICES, value=DEFAULT_IMG_SIZE, label="Image Size")
                    conf_img = gr.Slider(0, 1, value=0.25, label="Confidence Threshold")
                    iou_img = gr.Slider(0, 1, value=0.45, label="IoU Threshold")
                    max_det_img = gr.Slider(1, 300, step=1, value=300, label="Max Detection")
                    infer_image_button = gr.Button("Detect Objects", variant="primary")
                with gr.Column():
                    output_image = gr.Image(type="pil", show_label=False)
                    gr.DeepLinkButton(variant="primary")

            gr.Examples(
                examples=[
                    [filename, checkpoint, 0.25, 0.45, 300, DEFAULT_IMG_SIZE]
                    for filename, checkpoint in _EXAMPLE_IMAGES
                ],
                fn=yolo_inference_for_examples,
                inputs=[image, model_id_img, conf_img, iou_img, max_det_img, img_size_img],
                outputs=[output_image],
                label="Examples",
                cache_examples=False,
            )

        with gr.Tab("Video") as video_tab:
            with gr.Row():
                with gr.Column():
                    video = gr.Video(label="Video")
                    model_id_vid = gr.Dropdown(label="Model", choices=MODEL_CHOICES, value="yolo12n.pt")
                    img_size_vid = gr.Radio(choices=IMG_SIZE_CHOICES, value=DEFAULT_IMG_SIZE, label="Image Size")
                    conf_vid = gr.Slider(0, 1, value=0.25, label="Confidence Threshold")
                    iou_vid = gr.Slider(0, 1, value=0.45, label="IoU Threshold")
                    max_det_vid = gr.Slider(1, 300, step=1, value=300, label="Max Detection")
                    infer_video_button = gr.Button("Detect Objects", variant="primary")
                with gr.Column():
                    output_video = gr.Video(show_label=False)
                    gr.DeepLinkButton(variant="primary")

    infer_image_button.click(
        fn=yolo_inference_image,
        inputs=[image, model_id_img, conf_img, iou_img, max_det_img, img_size_img],
        outputs=[output_image],
    )
    infer_video_button.click(
        fn=yolo_inference_video,
        inputs=[video, model_id_vid, conf_vid, iou_vid, max_det_vid, img_size_vid],
        outputs=[output_video],
    )

if __name__ == "__main__":
    app.launch(mcp_server=True, theme=gr.themes.Ocean(primary_hue="indigo", secondary_hue="blue"))