Spaces:

arihant3704
/

Yolo12-MultiTask-Vision

Sleeping

App Files Files Community

atalaydenknalbant committed on Nov 25, 2025

Commit

3355ffb

verified ·

1 Parent(s): d360d5f

Update app.py

Browse files

Files changed (1) hide show

app.py +205 -205

app.py CHANGED Viewed

@@ -1,205 +1,205 @@
-import spaces
-import gradio as gr
-from PIL import Image, ImageDraw, ImageFont
-from ultralytics import YOLO
-import cv2
-import tempfile
-import numpy as np
-# -----------------------------
-# Inference
-# -----------------------------
-@spaces.GPU
-def yolo_inference(input_type, image, video, model_id, conf_threshold, iou_threshold, max_detection):
-    """
-    Ultralytics YOLO26 inference for image or video.
-    Accepts detect/seg/pose/obb/cls checkpoints and renders r.plot().
-    """
-    model = YOLO(model_id)
-    if input_type == "Image":
-        if image is None:
-            w, h = 640, 480
-            blank = Image.new("RGB", (w, h), color="white")
-            draw = ImageDraw.Draw(blank)
-            msg = "No image provided"
-            font = ImageFont.load_default(size=40)
-            bbox = draw.textbbox((0, 0), msg, font=font)
-            tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
-            draw.text(((w - tw) / 2, (h - th) / 2), msg, fill="black", font=font)
-            return blank, None
-        results = model.predict(
-            source=image,
-            conf=conf_threshold,
-            iou=iou_threshold,
-            imgsz=640,
-            max_det=max_detection,
-            show_labels=True,
-            show_conf=True,
-        )
-        annotated_image = None
-        for r in results:
-            img_bgr = r.plot()
-            annotated_image = Image.fromarray(img_bgr[..., ::-1])
-        return annotated_image, None
-    if input_type == "Video":
-        if video is None:
-            w, h = 640, 480
-            blank = Image.new("RGB", (w, h), color="white")
-            draw = ImageDraw.Draw(blank)
-            msg = "No video provided"
-            font = ImageFont.load_default(size=40)
-            bbox = draw.textbbox((0, 0), msg, font=font)
-            tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
-            draw.text(((w - tw) / 2, (h - th) / 2), msg, fill="black", font=font)
-            tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
-            fourcc = cv2.VideoWriter_fourcc(*"mp4v")
-            out = cv2.VideoWriter(tmp, fourcc, 1, (w, h))
-            out.write(cv2.cvtColor(np.array(blank), cv2.COLOR_RGB2BGR))
-            out.release()
-            return None, tmp
-        cap = cv2.VideoCapture(video)
-        fps = cap.get(cv2.CAP_PROP_FPS) if cap.get(cv2.CAP_PROP_FPS) > 0 else 25
-        frames = []
-        while True:
-            ret, frame = cap.read()
-            if not ret:
-                break
-            pil_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
-            results = model.predict(
-                source=pil_frame,
-                conf=conf_threshold,
-                iou=iou_threshold,
-                imgsz=640,
-                max_det=max_detection,
-                show_labels=True,
-                show_conf=True,
-            )
-            for r in results:
-                anno_bgr = r.plot()
-                anno_rgb = cv2.cvtColor(anno_bgr, cv2.COLOR_BGR2RGB)
-            frames.append(anno_rgb)
-        cap.release()
-        if not frames:
-            return None, None
-        h, w, _ = frames[0].shape
-        tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
-        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
-        out = cv2.VideoWriter(tmp, fourcc, fps, (w, h))
-        for f in frames:
-            out.write(cv2.cvtColor(f, cv2.COLOR_RGB2BGR))
-        out.release()
-        return None, tmp
-    return None, None
-def update_visibility(input_type):
-    if input_type == "Image":
-        return gr.update(visible=True), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
-    else:
-        return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)
-def yolo_inference_for_examples(image, model_id, conf_threshold, iou_threshold, max_detection):
-    annotated_image, _ = yolo_inference(
-        input_type="Image",
-        image=image,
-        video=None,
-        model_id=model_id,
-        conf_threshold=conf_threshold,
-        iou_threshold=iou_threshold,
-        max_detection=max_detection
-    )
-    return annotated_image
-theme = gr.themes.Ocean(primary_hue="indigo", secondary_hue="blue")
-with gr.Blocks(theme=theme) as app:
-    gr.Markdown("# Ultralytics YOLO26")
-    gr.Markdown("Image or video inference with detection, segmentation, pose, oriented bounding boxes, and classification.")
-    with gr.Accordion("Reference", open=False):
-        gr.Markdown("""
-        **BibTeX:**
-        ```
-        @software{yolo26_ultralytics,
-          author = {Glenn Jocher and Jing Qiu},
-          title = {Ultralytics YOLO26},
-          version = {26.0.0},
-          year = {2025},
-          url = {https://github.com/ultralytics/ultralytics},
-          orcid = {0000-0001-5950-6979, 0000-0003-3783-7069},
-          license = {AGPL-3.0}
-        }
-        ```
-        """
-        )
-    with gr.Row():
-        with gr.Column():
-            image = gr.Image(type="pil", label="Image", visible=True)
-            video = gr.Video(label="Video", visible=False)
-            input_type = gr.Radio(choices=["Image", "Video"], value="Image", label="Input Type")
-            model_id = gr.Dropdown(
-                label="Model",
-                choices=[
-                    # detect
-                    "yolo26n.pt","yolo26s.pt","yolo26m.pt","yolo26l.pt","yolo26x.pt",
-                    # seg
-                    "yolo26n-seg.pt","yolo26s-seg.pt","yolo26m-seg.pt","yolo26l-seg.pt","yolo26x-seg.pt",
-                    # pose
-                    "yolo26n-pose.pt","yolo26s-pose.pt","yolo26m-pose.pt","yolo26l-pose.pt","yolo26x-pose.pt",
-                    # obb
-                    "yolo26n-obb.pt","yolo26s-obb.pt","yolo26m-obb.pt","yolo26l-obb.pt","yolo26x-obb.pt",
-                    # cls
-                    "yolo26n-cls.pt","yolo26s-cls.pt","yolo26m-cls.pt","yolo26l-cls.pt","yolo26x-cls.pt",
-                ],
-                value="yolo26n.pt",
-            )
-            conf_threshold = gr.Slider(minimum=0, maximum=1, value=0.25, label="Confidence Threshold")
-            iou_threshold = gr.Slider(minimum=0, maximum=1, value=0.45, label="IoU Threshold")
-            max_detection = gr.Slider(minimum=1, maximum=300, step=1, value=300, label="Max Detection")
-            infer_button = gr.Button("Detect Objects", variant="primary")
-        with gr.Column():
-            output_image = gr.Image(type="pil", show_label=False, show_share_button=False, visible=True)
-            output_video = gr.Video(show_label=False, show_share_button=False, visible=False)
-            gr.DeepLinkButton(variant="primary")
-    input_type.change(
-        fn=update_visibility,
-        inputs=input_type,
-        outputs=[image, video, output_image, output_video],
-    )
-    infer_button.click(
-        fn=yolo_inference,
-        inputs=[input_type, image, video, model_id, conf_threshold, iou_threshold, max_detection],
-        outputs=[output_image, output_video],
-    )
-    gr.Examples(
-        examples=[
-            ["zidane.jpg", "yolo26s.pt", 0.25, 0.45, 300],
-            ["bus.jpg", "yolo26m.pt", 0.25, 0.45, 300],
-            ["yolo_vision.jpg", "yolo26x.pt", 0.25, 0.45, 300],
-            ["Tricycle.jpg", "yolo26x-cls.pt", 0.25, 0.45, 300],
-            ["tcganadolu.jpg", "yolo26m-obb.pt", 0.25, 0.45, 300],
-            ["San Diego Airport.jpg", "yolo26x-seg.pt", 0.25, 0.45, 300],
-            ["Theodore_Roosevelt.png", "yolo26l-pose.pt", 0.25, 0.45, 300],
-        ],
-        fn=yolo_inference_for_examples,
-        inputs=[image, model_id, conf_threshold, iou_threshold, max_detection],
-        outputs=[output_image],
-        label="Examples",
-    )
-if __name__ == "__main__":
-    app.launch(mcp_server=True)

+import spaces
+import gradio as gr
+from PIL import Image, ImageDraw, ImageFont
+from ultralytics import YOLO
+import cv2
+import tempfile
+import numpy as np
+# -----------------------------
+# Inference
+# -----------------------------
+@spaces.GPU
+def yolo_inference(input_type, image, video, model_id, conf_threshold, iou_threshold, max_detection):
+    """
+    Ultralytics YOLO26 inference for image or video.
+    Accepts detect/seg/pose/obb/cls checkpoints and renders r.plot().
+    """
+    model = YOLO(model_id)
+    if input_type == "Image":
+        if image is None:
+            w, h = 640, 480
+            blank = Image.new("RGB", (w, h), color="white")
+            draw = ImageDraw.Draw(blank)
+            msg = "No image provided"
+            font = ImageFont.load_default(size=40)
+            bbox = draw.textbbox((0, 0), msg, font=font)
+            tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
+            draw.text(((w - tw) / 2, (h - th) / 2), msg, fill="black", font=font)
+            return blank, None
+        results = model.predict(
+            source=image,
+            conf=conf_threshold,
+            iou=iou_threshold,
+            imgsz=640,
+            max_det=max_detection,
+            show_labels=True,
+            show_conf=True,
+        )
+        annotated_image = None
+        for r in results:
+            img_bgr = r.plot()
+            annotated_image = Image.fromarray(img_bgr[..., ::-1])
+        return annotated_image, None
+    if input_type == "Video":
+        if video is None:
+            w, h = 640, 480
+            blank = Image.new("RGB", (w, h), color="white")
+            draw = ImageDraw.Draw(blank)
+            msg = "No video provided"
+            font = ImageFont.load_default(size=40)
+            bbox = draw.textbbox((0, 0), msg, font=font)
+            tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
+            draw.text(((w - tw) / 2, (h - th) / 2), msg, fill="black", font=font)
+            tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
+            fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+            out = cv2.VideoWriter(tmp, fourcc, 1, (w, h))
+            out.write(cv2.cvtColor(np.array(blank), cv2.COLOR_RGB2BGR))
+            out.release()
+            return None, tmp
+        cap = cv2.VideoCapture(video)
+        fps = cap.get(cv2.CAP_PROP_FPS) if cap.get(cv2.CAP_PROP_FPS) > 0 else 25
+        frames = []
+        while True:
+            ret, frame = cap.read()
+            if not ret:
+                break
+            pil_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+            results = model.predict(
+                source=pil_frame,
+                conf=conf_threshold,
+                iou=iou_threshold,
+                imgsz=640,
+                max_det=max_detection,
+                show_labels=True,
+                show_conf=True,
+            )
+            for r in results:
+                anno_bgr = r.plot()
+                anno_rgb = cv2.cvtColor(anno_bgr, cv2.COLOR_BGR2RGB)
+            frames.append(anno_rgb)
+        cap.release()
+        if not frames:
+            return None, None
+        h, w, _ = frames[0].shape
+        tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
+        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+        out = cv2.VideoWriter(tmp, fourcc, fps, (w, h))
+        for f in frames:
+            out.write(cv2.cvtColor(f, cv2.COLOR_RGB2BGR))
+        out.release()
+        return None, tmp
+    return None, None
+def update_visibility(input_type):
+    if input_type == "Image":
+        return gr.update(visible=True), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
+    else:
+        return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)
+def yolo_inference_for_examples(image, model_id, conf_threshold, iou_threshold, max_detection):
+    annotated_image, _ = yolo_inference(
+        input_type="Image",
+        image=image,
+        video=None,
+        model_id=model_id,
+        conf_threshold=conf_threshold,
+        iou_threshold=iou_threshold,
+        max_detection=max_detection
+    )
+    return annotated_image
+theme = gr.themes.Ocean(primary_hue="indigo", secondary_hue="blue")  # theme passed to gr.Blocks below
+with gr.Blocks(theme=theme) as app:  # UI container; launched at the bottom of the file
+    gr.Markdown("# Ultralytics YOLO26")
+    gr.Markdown("Image or video inference with detection, segmentation, pose, oriented bounding boxes, and classification.")
+    with gr.Accordion("Reference", open=False):  # citation block, created with open=False
+        gr.Markdown("""
+        **BibTeX:**
+        ```
+        @software{yolo26_ultralytics,
+          author = {Glenn Jocher and Jing Qiu},
+          title = {Ultralytics YOLO26},
+          version = {26.0.0},
+          year = {2025},
+          url = {https://github.com/ultralytics/ultralytics},
+          orcid = {0000-0001-5950-6979, 0000-0003-3783-7069},
+          license = {AGPL-3.0}
+        }
+        ```
+        """
+        )
+    with gr.Row():  # first column: inputs and controls; second column: outputs
+        with gr.Column():
+            image = gr.Image(type="pil", label="Image", visible=True)  # initially visible; toggled by update_visibility
+            video = gr.Video(label="Video", visible=False)  # initially hidden; toggled by update_visibility
+            input_type = gr.Radio(choices=["Image", "Video"], value="Image", label="Input Type")
+            model_id = gr.Dropdown(
+                label="Model",
+                choices=[
+                    # detect
+                    "yolo26n.pt","yolo26s.pt","yolo26m.pt","yolo26l.pt","yolo26x.pt",
+                    # seg
+                    "yolo26n-seg.pt","yolo26s-seg.pt","yolo26m-seg.pt","yolo26l-seg.pt","yolo26x-seg.pt",
+                    # pose
+                    "yolo26n-pose.pt","yolo26s-pose.pt","yolo26m-pose.pt","yolo26l-pose.pt","yolo26x-pose.pt",
+                    # obb
+                    "yolo26n-obb.pt","yolo26s-obb.pt","yolo26m-obb.pt","yolo26l-obb.pt","yolo26x-obb.pt",
+                    # cls
+                    "yolo26n-cls.pt","yolo26s-cls.pt","yolo26m-cls.pt","yolo26l-cls.pt","yolo26x-cls.pt",
+                ],
+                value="yolo26n.pt",
+            )
+            conf_threshold = gr.Slider(minimum=0, maximum=1, value=0.25, label="Confidence Threshold")  # forwarded to model.predict(conf=...)
+            iou_threshold = gr.Slider(minimum=0, maximum=1, value=0.45, label="IoU Threshold")  # forwarded to model.predict(iou=...)
+            max_detection = gr.Slider(minimum=1, maximum=300, step=1, value=300, label="Max Detection")  # forwarded to model.predict(max_det=...)
+            infer_button = gr.Button("Detect Objects", variant="primary")
+        with gr.Column():
+            output_image = gr.Image(type="pil", show_label=False, visible=True)  # receives the first item returned by yolo_inference
+            output_video = gr.Video(show_label=False, visible=False)  # receives the second item returned by yolo_inference
+            gr.DeepLinkButton(variant="primary")
+    input_type.change(  # swap image/video widgets whenever the radio selection changes
+        fn=update_visibility,
+        inputs=input_type,
+        outputs=[image, video, output_image, output_video],
+    )
+    infer_button.click(  # run inference on the current inputs and settings
+        fn=yolo_inference,
+        inputs=[input_type, image, video, model_id, conf_threshold, iou_threshold, max_detection],
+        outputs=[output_image, output_video],
+    )
+    gr.Examples(  # image-only examples; each row is [image file, model, conf, iou, max detections]
+        examples=[
+            ["zidane.jpg", "yolo26s.pt", 0.25, 0.45, 300],
+            ["bus.jpg", "yolo26m.pt", 0.25, 0.45, 300],
+            ["yolo_vision.jpg", "yolo26x.pt", 0.25, 0.45, 300],
+            ["Tricycle.jpg", "yolo26x-cls.pt", 0.25, 0.45, 300],
+            ["tcganadolu.jpg", "yolo26m-obb.pt", 0.25, 0.45, 300],
+            ["San Diego Airport.jpg", "yolo26x-seg.pt", 0.25, 0.45, 300],
+            ["Theodore_Roosevelt.png", "yolo26l-pose.pt", 0.25, 0.45, 300],
+        ],
+        fn=yolo_inference_for_examples,
+        inputs=[image, model_id, conf_threshold, iou_threshold, max_detection],
+        outputs=[output_image],
+        label="Examples",
+    )
+if __name__ == "__main__":  # only launch when run as a script
+    app.launch(mcp_server=True)