import spaces
import gradio as gr
from PIL import Image, ImageDraw, ImageFont
from ultralytics import YOLO
import cv2
import tempfile
import numpy as np

# -----------------------------
# Config
# -----------------------------
# Checkpoint names follow "yolo12<scale><task-suffix>.pt". The dropdown lists
# them grouped by task (detection, segmentation, pose, oriented boxes,
# classification) and, inside each group, by model scale.
_MODEL_SCALES = "nsmlx"
_TASK_SUFFIXES = ("", "-seg", "-pose", "-obb", "-cls")
MODEL_CHOICES = [
    f"yolo12{scale}{suffix}.pt"
    for suffix in _TASK_SUFFIXES
    for scale in _MODEL_SCALES
]

# Inference sizes offered by the "Image Size" radio buttons, and the one
# that is pre-selected.
IMG_SIZE_CHOICES = [128, 160, 256, 384, 480, 640, 736, 1024, 1440, 2176]
DEFAULT_IMG_SIZE = 640

# -----------------------------
# Inference
# -----------------------------
@spaces.GPU
def yolo_inference_image(image, model_id, conf_threshold, iou_threshold, max_detection, img_size):
    """Run a YOLO12 checkpoint on one image and return the annotated picture.

    Args:
        image: Input image (the UI supplies a PIL image), or ``None`` when
            nothing was uploaded.
        model_id: Checkpoint name to load, e.g. ``"yolo12n.pt"``.
        conf_threshold: Confidence threshold, forwarded to ``predict`` as ``conf``.
        iou_threshold: IoU threshold, forwarded to ``predict`` as ``iou``.
        max_detection: Maximum number of detections, forwarded as ``max_det``.
        img_size: Inference size, forwarded as ``imgsz``.

    Returns:
        A PIL image with the predictions drawn on it. When ``image`` is
        ``None`` a white 640x480 placeholder reading "No image provided" is
        returned instead. ``None`` is returned if ``predict`` yields no results.
    """
    # Handle the empty request first so it never pays for loading a checkpoint.
    if image is None:
        w, h = 640, 480
        blank = Image.new("RGB", (w, h), color="white")
        draw = ImageDraw.Draw(blank)
        msg = "No image provided"
        font = ImageFont.load_default()
        bbox = draw.textbbox((0, 0), msg, font=font)
        tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
        # Centre the message on the canvas.
        draw.text(((w - tw) / 2, (h - th) / 2), msg, fill="black", font=font)
        return blank

    model = YOLO(model_id)
    if getattr(model, "task", None) != "classify":
        # NOTE(review): presumably a compatibility workaround for checkpoints
        # whose last layer still carries a `one2one_cv2` attribute; confirm
        # against the installed ultralytics version.
        head = model.model.model[-1]
        if hasattr(head, "one2one_cv2"):
            delattr(head, "one2one_cv2")

    results = model.predict(
        source=image,
        conf=conf_threshold,
        iou=iou_threshold,
        imgsz=int(img_size),
        # Cast like imgsz: UI/API clients may deliver the slider value as a float.
        max_det=int(max_detection),
        show_labels=True,
        show_conf=True,
        verbose=False,
    )

    if not results:
        return None

    # plot() yields a BGR array; reverse the channel axis to get RGB for PIL.
    # With several results only the last one is shown.
    img_bgr = results[-1].plot()
    return Image.fromarray(img_bgr[..., ::-1])


@spaces.GPU
def yolo_inference_video(video, model_id, conf_threshold, iou_threshold, max_detection, img_size):
    """Run a YOLO12 checkpoint on every frame of a video and save the result.

    Args:
        video: Path of the input video, or ``None`` when nothing was uploaded.
        model_id: Checkpoint name to load, e.g. ``"yolo12n.pt"``.
        conf_threshold: Confidence threshold, forwarded to ``predict`` as ``conf``.
        iou_threshold: IoU threshold, forwarded to ``predict`` as ``iou``.
        max_detection: Maximum number of detections, forwarded as ``max_det``.
        img_size: Inference size, forwarded as ``imgsz``.

    Returns:
        Path of a temporary ``.mp4`` file holding the annotated frames. When
        ``video`` is ``None`` the file is a one-frame placeholder reading
        "No video provided". ``None`` is returned when the input cannot be
        opened, contains no readable frame, or the output writer cannot be
        opened.
    """
    import os  # local import: only needed to remove an unfinished output file

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")

    # Handle the empty request first so it never pays for loading a checkpoint.
    if video is None:
        w, h = 640, 480
        blank = Image.new("RGB", (w, h), color="white")
        draw = ImageDraw.Draw(blank)
        msg = "No video provided"
        font = ImageFont.load_default()
        bbox = draw.textbbox((0, 0), msg, font=font)
        tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
        draw.text(((w - tw) / 2, (h - th) / 2), msg, fill="black", font=font)

        # Close our own handle right away; only the reserved path is needed.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as handle:
            tmp = handle.name
        out = cv2.VideoWriter(tmp, fourcc, 1, (w, h))
        try:
            out.write(cv2.cvtColor(np.array(blank), cv2.COLOR_RGB2BGR))
        finally:
            out.release()
        return tmp

    cap = cv2.VideoCapture(video)
    out = None
    tmp = None
    completed = False
    try:
        if not cap.isOpened():
            return None

        model = YOLO(model_id)
        if getattr(model, "task", None) != "classify":
            # NOTE(review): presumably a compatibility workaround for
            # checkpoints whose last layer still carries a `one2one_cv2`
            # attribute; confirm against the installed ultralytics version.
            head = model.model.model[-1]
            if hasattr(head, "one2one_cv2"):
                delattr(head, "one2one_cv2")

        # Fall back to 25 fps when the container reports no usable frame rate.
        fps_val = cap.get(cv2.CAP_PROP_FPS)
        fps = fps_val if fps_val and fps_val > 0 else 25

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            results = model.predict(
                source=frame,
                conf=conf_threshold,
                iou=iou_threshold,
                imgsz=int(img_size),
                # Cast like imgsz: UI/API clients may deliver a float here.
                max_det=int(max_detection),
                show_labels=True,
                show_conf=True,
                verbose=False,
            )

            # Keep the raw frame if the model produced no result for it.
            anno_bgr = frame
            for r in results:
                anno_bgr = r.plot()

            if out is None:
                # Open the writer lazily, sized from the first real frame:
                # the capture's reported width/height can be missing, and no
                # output file is created at all for a video without frames.
                frame_h, frame_w = anno_bgr.shape[:2]
                with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as handle:
                    tmp = handle.name
                out = cv2.VideoWriter(tmp, fourcc, fps, (frame_w, frame_h))
                if not out.isOpened():
                    return None

            out.write(anno_bgr)

        completed = True
        return tmp  # still None when no frame could be read
    finally:
        # Release native handles on every exit path, including exceptions.
        cap.release()
        if out is not None:
            out.release()
        if tmp is not None and not completed:
            # Best-effort removal of a partial or unusable output file.
            try:
                os.remove(tmp)
            except OSError:
                pass


def yolo_inference_for_examples(image, model_id, conf_threshold, iou_threshold, max_detection, img_size):
    """Callback registered with the examples gallery.

    Forwards all arguments, unchanged and in the same order, to
    ``yolo_inference_image`` and returns whatever it returns.
    """
    forwarded = (image, model_id, conf_threshold, iou_threshold, max_detection, img_size)
    return yolo_inference_image(*forwarded)


# Build the UI: a title and description, a collapsible citation, and one tab
# each for image and video inference. Components appear in the page in the
# order they are created here.
with gr.Blocks() as app:
    gr.Markdown("# YOLO12")
    gr.Markdown(
        "Image or video inference with detection, segmentation, pose, oriented bounding boxes, and classification using the latest Ultralytics YOLO12 models."
    )

    # Citation block, collapsed by default.
    with gr.Accordion("Reference", open=False):
        gr.Markdown(
            """
        **BibTeX:**
        ```
        @software{yolo12_ultralytics,
          author = {Glenn Jocher and Jing Qiu},
          title = {Ultralytics YOLO12},
          version = {12.0.0},
          year = {2025},
          url = {https://github.com/ultralytics/ultralytics},
          orcid = {0000-0001-5950-6979, 0000-0003-3783-7069},
          license = {AGPL-3.0}
        }
        ```
        """
        )

    with gr.Tabs() as media_tabs:
        # ---- Image tab: inputs in the first column, result in the second ----
        with gr.Tab("Image") as image_tab:
            with gr.Row():
                with gr.Column():
                    image = gr.Image(type="pil", label="Image")
                    model_id_img = gr.Dropdown(label="Model", choices=MODEL_CHOICES, value="yolo12n.pt")
                    img_size_img = gr.Radio(choices=IMG_SIZE_CHOICES, value=DEFAULT_IMG_SIZE, label="Image Size")
                    conf_img = gr.Slider(0, 1, value=0.25, label="Confidence Threshold")
                    iou_img = gr.Slider(0, 1, value=0.45, label="IoU Threshold")
                    max_det_img = gr.Slider(1, 300, step=1, value=300, label="Max Detection")

                    infer_image_button = gr.Button("Detect Objects", variant="primary")

                with gr.Column():
                    output_image = gr.Image(type="pil", show_label=False)
                    gr.DeepLinkButton(variant="primary")

            # Each example row is ordered like `inputs` below:
            # image, model, confidence, IoU, max detections, image size.
            # NOTE(review): the image paths are relative; presumably the files
            # ship next to this script -- verify.
            gr.Examples(
                examples=[
                    ["zidane.jpg", "yolo12s.pt", 0.25, 0.45, 300, DEFAULT_IMG_SIZE],
                    ["bus.jpg", "yolo12m.pt", 0.25, 0.45, 300, DEFAULT_IMG_SIZE],
                    ["yolo_vision.jpg", "yolo12x.pt", 0.25, 0.45, 300, DEFAULT_IMG_SIZE],
                    ["Tricycle.jpg", "yolo12x.pt", 0.25, 0.45, 300, DEFAULT_IMG_SIZE],
                    ["tcganadolu.jpg", "yolo12m.pt", 0.25, 0.45, 300, DEFAULT_IMG_SIZE],
                    ["San Diego Airport.jpg", "yolo12x.pt", 0.25, 0.45, 300, DEFAULT_IMG_SIZE],
                    ["Theodore_Roosevelt.png", "yolo12l.pt", 0.25, 0.45, 300, DEFAULT_IMG_SIZE],
                ],
                fn=yolo_inference_for_examples,
                inputs=[image, model_id_img, conf_img, iou_img, max_det_img, img_size_img],
                outputs=[output_image],
                label="Examples",
                cache_examples=False,
            )

        # ---- Video tab: same controls as the image tab, with video in/out ----
        with gr.Tab("Video") as video_tab:
            with gr.Row():
                with gr.Column():
                    video = gr.Video(label="Video")
                    model_id_vid = gr.Dropdown(label="Model", choices=MODEL_CHOICES, value="yolo12n.pt")
                    img_size_vid = gr.Radio(choices=IMG_SIZE_CHOICES, value=DEFAULT_IMG_SIZE, label="Image Size")
                    conf_vid = gr.Slider(0, 1, value=0.25, label="Confidence Threshold")
                    iou_vid = gr.Slider(0, 1, value=0.45, label="IoU Threshold")
                    max_det_vid = gr.Slider(1, 300, step=1, value=300, label="Max Detection")

                    infer_video_button = gr.Button("Detect Objects", variant="primary")

                with gr.Column():
                    output_video = gr.Video(show_label=False)
                    gr.DeepLinkButton(variant="primary")

    # Event wiring: the order of `inputs` must match the parameter order of
    # the inference functions (media, model, conf, iou, max detections, size).
    infer_image_button.click(
        fn=yolo_inference_image,
        inputs=[image, model_id_img, conf_img, iou_img, max_det_img, img_size_img],
        outputs=[output_image],
    )

    infer_video_button.click(
        fn=yolo_inference_video,
        inputs=[video, model_id_vid, conf_vid, iou_vid, max_det_vid, img_size_vid],
        outputs=[output_video],
    )

if __name__ == "__main__":
    # Start the app only when this file is executed directly, not on import.
    ocean_theme = gr.themes.Ocean(primary_hue="indigo", secondary_hue="blue")
    app.launch(mcp_server=True, theme=ocean_theme)