Spaces:

arihant3704
/

Yolo12-MultiTask-Vision

Sleeping

App Files Files Community

Yolo12-MultiTask-Vision / app.py

arihant3704

Disabled example caching and used stable model variants to fix startup crash

5d77b8a about 2 months ago

raw

history blame contribute delete

8.42 kB

	import spaces
	import gradio as gr
	from PIL import Image, ImageDraw, ImageFont
	from ultralytics import YOLO
	import cv2
	import tempfile
	import numpy as np

	# -----------------------------
	# Config
	# -----------------------------
	# Weight files offered in the "Model" dropdowns: five task variants
	# (detect, segment, pose, oriented boxes, classify), each in the five
	# scales n/s/m/l/x. Grouped by task, scales ascending within a task.
	MODEL_CHOICES = [
	    f"yolo12{scale}{task_suffix}.pt"
	    for task_suffix in ("", "-seg", "-pose", "-obb", "-cls")
	    for scale in "nsmlx"
	]

	# Inference resolutions selectable in the UI; the default must be one of them.
	IMG_SIZE_CHOICES = [
	    128,
	    160,
	    256,
	    384,
	    480,
	    640,
	    736,
	    1024,
	    1440,
	    2176,
	]
	DEFAULT_IMG_SIZE = 640

	# -----------------------------
	# Inference
	# -----------------------------
	@spaces.GPU
	def yolo_inference_image(image, model_id, conf_threshold, iou_threshold, max_detection, img_size):
	    """Run a YOLO12 model on a single image and return the annotated result.

	    Args:
	        image: Input image from the Gradio image component, or ``None`` when
	            nothing was uploaded.
	        model_id: Weight file name handed to ``YOLO`` (e.g. ``"yolo12n.pt"``).
	        conf_threshold: Forwarded to ``predict`` as ``conf``.
	        iou_threshold: Forwarded to ``predict`` as ``iou``.
	        max_detection: Forwarded to ``predict`` as ``max_det`` (cast to int).
	        img_size: Forwarded to ``predict`` as ``imgsz`` (cast to int).

	    Returns:
	        A PIL image. This is the annotated prediction, or a white 640x480
	        placeholder reading "No image provided" when ``image`` is ``None``.
	        ``None`` is returned if ``predict`` yields no results.
	    """
	    # Validate the input first so no weights are loaded (or downloaded)
	    # when there is nothing to run inference on.
	    if image is None:
	        w, h = 640, 480
	        blank = Image.new("RGB", (w, h), color="white")
	        draw = ImageDraw.Draw(blank)
	        msg = "No image provided"
	        font = ImageFont.load_default()
	        bbox = draw.textbbox((0, 0), msg, font=font)
	        tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
	        # Centre the message on the canvas.
	        draw.text(((w - tw) / 2, (h - th) / 2), msg, fill="black", font=font)
	        return blank

	    model = YOLO(model_id)
	    if getattr(model, "task", None) != "classify":
	        # NOTE(review): drops the "one2one_cv2" attribute from the last layer
	        # when present; presumably a compatibility workaround for some
	        # checkpoints - confirm before removing.
	        head = model.model.model[-1]
	        if hasattr(head, "one2one_cv2"):
	            delattr(head, "one2one_cv2")

	    results = model.predict(
	        source=image,
	        conf=conf_threshold,
	        iou=iou_threshold,
	        imgsz=int(img_size),
	        # UI/API values may arrive as floats; cast like imgsz above.
	        max_det=int(max_detection),
	        show_labels=True,
	        show_conf=True,
	        verbose=False,
	    )

	    annotated_image = None
	    for r in results:
	        img_bgr = r.plot()
	        # Reverse the channel axis (BGR -> RGB) before wrapping in a PIL image.
	        annotated_image = Image.fromarray(img_bgr[..., ::-1])
	    return annotated_image


	@spaces.GPU
	def yolo_inference_video(video, model_id, conf_threshold, iou_threshold, max_detection, img_size):
	    """Run a YOLO12 model on every frame of a video and write an annotated MP4.

	    Args:
	        video: Path of the uploaded video as passed to ``cv2.VideoCapture``,
	            or ``None`` when nothing was uploaded.
	        model_id: Weight file name handed to ``YOLO`` (e.g. ``"yolo12n.pt"``).
	        conf_threshold: Forwarded to ``predict`` as ``conf``.
	        iou_threshold: Forwarded to ``predict`` as ``iou``.
	        max_detection: Forwarded to ``predict`` as ``max_det`` (cast to int).
	        img_size: Forwarded to ``predict`` as ``imgsz`` (cast to int).

	    Returns:
	        Path of a temporary ``.mp4`` file holding the annotated frames. When
	        ``video`` is ``None`` the file is a one-frame placeholder reading
	        "No video provided". ``None`` is returned when the source cannot be
	        opened, contains no readable frame, or the output writer fails to open.
	    """
	    import os  # local import: only needed to clean up unused temp files

	    fourcc = cv2.VideoWriter_fourcc(*"mp4v")

	    if video is None:
	        w, h = 640, 480
	        blank = Image.new("RGB", (w, h), color="white")
	        draw = ImageDraw.Draw(blank)
	        msg = "No video provided"
	        font = ImageFont.load_default()
	        bbox = draw.textbbox((0, 0), msg, font=font)
	        tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
	        draw.text(((w - tw) / 2, (h - th) / 2), msg, fill="black", font=font)

	        # Close our own handle right away; OpenCV reopens the path itself.
	        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as handle:
	            tmp = handle.name
	        out = cv2.VideoWriter(tmp, fourcc, 1, (w, h))
	        try:
	            out.write(cv2.cvtColor(np.array(blank), cv2.COLOR_RGB2BGR))
	        finally:
	            out.release()
	        return tmp

	    cap = cv2.VideoCapture(video)
	    if not cap.isOpened():
	        cap.release()
	        return None

	    tmp = None
	    out = None
	    succeeded = False
	    try:
	        # Load weights only once the source is known to be readable.
	        model = YOLO(model_id)
	        if getattr(model, "task", None) != "classify":
	            # NOTE(review): drops the "one2one_cv2" attribute from the last
	            # layer when present; presumably a compatibility workaround for
	            # some checkpoints - confirm before removing.
	            head = model.model.model[-1]
	            if hasattr(head, "one2one_cv2"):
	                delattr(head, "one2one_cv2")

	        # Fall back to 25 fps when the container does not report a usable rate.
	        fps_val = cap.get(cv2.CAP_PROP_FPS)
	        fps = fps_val if fps_val and fps_val > 0 else 25

	        while True:
	            ret, frame = cap.read()
	            if not ret:
	                break

	            results = model.predict(
	                source=frame,
	                conf=conf_threshold,
	                iou=iou_threshold,
	                imgsz=int(img_size),
	                # UI/API values may arrive as floats; cast like imgsz above.
	                max_det=int(max_detection),
	                show_labels=True,
	                show_conf=True,
	                verbose=False,
	            )

	            # Keep the raw frame if the model produced no result object.
	            anno_bgr = frame
	            for r in results:
	                anno_bgr = r.plot()

	            if out is None:
	                # Size the writer from the first real frame rather than from
	                # container metadata, which may be missing or disagree with
	                # the decoded frames (mismatched frames are silently dropped).
	                frame_h, frame_w = anno_bgr.shape[:2]
	                with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as handle:
	                    tmp = handle.name
	                out = cv2.VideoWriter(tmp, fourcc, fps, (frame_w, frame_h))
	                if not out.isOpened():
	                    break

	            out.write(anno_bgr)

	        succeeded = out is not None and out.isOpened()
	    finally:
	        # Always release native handles, even if inference raised.
	        cap.release()
	        if out is not None:
	            out.release()
	        # Do not leave an unusable temp file behind on failure.
	        if tmp is not None and not succeeded:
	            try:
	                os.remove(tmp)
	            except OSError:
	                pass

	    return tmp if succeeded else None


	def yolo_inference_for_examples(image, model_id, conf_threshold, iou_threshold, max_detection, img_size):
	    """Callback used by the examples gallery; delegates to ``yolo_inference_image``.

	    All six arguments are passed through unchanged and in the same order,
	    and the delegate's return value is handed back as-is.
	    """
	    example_args = (image, model_id, conf_threshold, iou_threshold, max_detection, img_size)
	    annotated = yolo_inference_image(*example_args)
	    return annotated


	# Gradio UI definition: page title and description, a citation accordion,
	# an "Image" tab and a "Video" tab with parallel controls, and the click
	# handlers that connect each tab's button to its inference function.
	# NOTE(review): the nesting of the layout contexts below is not recoverable
	# from this flattened text - restore indentation from the original file.
	with gr.Blocks() as app:
	gr.Markdown("# YOLO12")
	gr.Markdown(
	"Image or video inference with detection, segmentation, pose, oriented bounding boxes, and classification using the latest Ultralytics YOLO12 models."
	)

	# Citation block, collapsed by default (open=False).
	with gr.Accordion("Reference", open=False):
	gr.Markdown(
	"""
	BibTeX:
	```
	@software{yolo12_ultralytics,
	author = {Glenn Jocher and Jing Qiu},
	title = {Ultralytics YOLO12},
	version = {12.0.0},
	year = {2025},
	url = {https://github.com/ultralytics/ultralytics},
	orcid = {0000-0001-5950-6979, 0000-0003-3783-7069},
	license = {AGPL-3.0}
	}
	```
	"""
	)

	# One tab per media type; each tab owns its own set of input components.
	with gr.Tabs() as media_tabs:
	with gr.Tab("Image") as image_tab:
	with gr.Row():
	with gr.Column():
	# Image inputs: source image, model weights, inference size,
	# confidence / IoU thresholds, and the detection cap.
	image = gr.Image(type="pil", label="Image")
	model_id_img = gr.Dropdown(label="Model", choices=MODEL_CHOICES, value="yolo12n.pt")
	img_size_img = gr.Radio(choices=IMG_SIZE_CHOICES, value=DEFAULT_IMG_SIZE, label="Image Size")
	conf_img = gr.Slider(0, 1, value=0.25, label="Confidence Threshold")
	iou_img = gr.Slider(0, 1, value=0.45, label="IoU Threshold")
	max_det_img = gr.Slider(1, 300, step=1, value=300, label="Max Detection")

	infer_image_button = gr.Button("Detect Objects", variant="primary")

	with gr.Column():
	# Annotated result plus a deep-link button.
	output_image = gr.Image(type="pil", show_label=False)
	gr.DeepLinkButton(variant="primary")

	# Each example row follows the order of `inputs` below: image file, model,
	# confidence, IoU, max detections, image size. The image files are
	# referenced by bare name - presumably shipped next to this script (verify).
	# cache_examples=False: example outputs are not precomputed.
	gr.Examples(
	examples=[
	["zidane.jpg", "yolo12s.pt", 0.25, 0.45, 300, DEFAULT_IMG_SIZE],
	["bus.jpg", "yolo12m.pt", 0.25, 0.45, 300, DEFAULT_IMG_SIZE],
	["yolo_vision.jpg", "yolo12x.pt", 0.25, 0.45, 300, DEFAULT_IMG_SIZE],
	["Tricycle.jpg", "yolo12x.pt", 0.25, 0.45, 300, DEFAULT_IMG_SIZE],
	["tcganadolu.jpg", "yolo12m.pt", 0.25, 0.45, 300, DEFAULT_IMG_SIZE],
	["San Diego Airport.jpg", "yolo12x.pt", 0.25, 0.45, 300, DEFAULT_IMG_SIZE],
	["Theodore_Roosevelt.png", "yolo12l.pt", 0.25, 0.45, 300, DEFAULT_IMG_SIZE],
	],
	fn=yolo_inference_for_examples,
	inputs=[image, model_id_img, conf_img, iou_img, max_det_img, img_size_img],
	outputs=[output_image],
	label="Examples",
	cache_examples=False,
	)

	with gr.Tab("Video") as video_tab:
	with gr.Row():
	with gr.Column():
	# Video inputs mirror the image tab's controls and defaults.
	video = gr.Video(label="Video")
	model_id_vid = gr.Dropdown(label="Model", choices=MODEL_CHOICES, value="yolo12n.pt")
	img_size_vid = gr.Radio(choices=IMG_SIZE_CHOICES, value=DEFAULT_IMG_SIZE, label="Image Size")
	conf_vid = gr.Slider(0, 1, value=0.25, label="Confidence Threshold")
	iou_vid = gr.Slider(0, 1, value=0.45, label="IoU Threshold")
	max_det_vid = gr.Slider(1, 300, step=1, value=300, label="Max Detection")

	infer_video_button = gr.Button("Detect Objects", variant="primary")

	with gr.Column():
	output_video = gr.Video(show_label=False)
	gr.DeepLinkButton(variant="primary")

	# The order of `inputs` matches the positional parameters of
	# yolo_inference_image: image, model, conf, iou, max detections, image size.
	infer_image_button.click(
	fn=yolo_inference_image,
	inputs=[image, model_id_img, conf_img, iou_img, max_det_img, img_size_img],
	outputs=[output_image],
	)

	# Same parameter order for the video handler, with the video path first.
	infer_video_button.click(
	fn=yolo_inference_video,
	inputs=[video, model_id_vid, conf_vid, iou_vid, max_det_vid, img_size_vid],
	outputs=[output_video],
	)

	if __name__ == "__main__":
	    # Start the app only when executed as a script: MCP server enabled,
	    # Ocean theme with indigo/blue hues.
	    ocean_theme = gr.themes.Ocean(primary_hue="indigo", secondary_hue="blue")
	    app.launch(mcp_server=True, theme=ocean_theme)