"""Gradio demo: YOLO pose keypoints augmented with MiDaS monocular depth.

Upload an image; the app returns a colorized depth map and, for every
detected person, per-joint (x, y, z, confidence) where z is sampled from
the MiDaS depth map at the keypoint location.
"""

import numpy as np
import torch
import cv2
import gradio as gr
from PIL import Image
from ultralytics import YOLO

# -------- Load MiDaS model ----------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
midas = torch.hub.load("isl-org/MiDaS", "MiDaS_small", trust_repo=True)
midas.to(device)
midas.eval()
# Avoid shadowing the common `transforms` name: fetch the hub bundle, keep
# only the transform matched to MiDaS_small.
_midas_transforms = torch.hub.load("isl-org/MiDaS", "transforms", trust_repo=True)
transform = _midas_transforms.small_transform

# -------- Load YOLO pose model ----------
pose_model = YOLO("yolo11n-pose.pt")  # YOLO11 nano pose checkpoint


def run_pose_depth(image: Image.Image):
    """Run pose estimation + depth estimation on one image.

    Args:
        image: Input image from Gradio (``type="pil"`` → RGB PIL image).

    Returns:
        Tuple of (colorized depth map as PIL image,
                  list of persons, each a list of joint dicts with keys
                  ``x``, ``y``, ``z``, ``confidence``).
    """
    img = np.array(image)

    # Normalize to a 3-channel RGB array. Gradio's PIL images are RGB,
    # but uploads may be grayscale (2-D) or RGBA (4-channel).
    if img.ndim == 2:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    elif img.shape[2] == 4:
        img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
    # NOTE: no BGR<->RGB swap here — the PIL input is already RGB, which is
    # what both the MiDaS transform and Ultralytics expect for ndarrays
    # created from PIL. A COLOR_BGR2RGB call on RGB data would silently
    # swap channels and degrade both models.
    img_rgb = img
    h, w, _ = img_rgb.shape

    # --- Depth (MiDaS) ---
    input_batch = transform(img_rgb).to(device)
    with torch.no_grad():
        prediction = midas(input_batch)
        # Upsample the low-res prediction back to the input resolution.
        prediction = torch.nn.functional.interpolate(
            prediction.unsqueeze(1),
            size=img_rgb.shape[:2],
            mode="bicubic",
            align_corners=False,
        ).squeeze()
    depth_map = prediction.cpu().numpy()

    # Normalize to [0, 1]; guard against a constant depth map (range 0)
    # which would otherwise divide by zero and fill the output with NaNs.
    depth_range = depth_map.max() - depth_map.min()
    if depth_range > 0:
        depth_norm = (depth_map - depth_map.min()) / depth_range
    else:
        depth_norm = np.zeros_like(depth_map)
    depth_img = (depth_norm * 255).astype(np.uint8)
    depth_img = cv2.applyColorMap(depth_img, cv2.COLORMAP_MAGMA)
    depth_pil = Image.fromarray(depth_img)

    # --- Pose (YOLO) ---
    results = pose_model(img_rgb)[0]

    keypoints_list = []
    if results.keypoints is not None:
        for person in results.keypoints:
            # person.data: (1, num_kpts, 3) tensor of (x, y, conf)
            keypoints_array = person.data.cpu().numpy()[0]
            joints = []
            for x, y, conf in keypoints_array:
                # Clip to image bounds so the depth lookup never indexes
                # outside the map (keypoints can land slightly off-frame).
                px = int(np.clip(x, 0, w - 1))
                py = int(np.clip(y, 0, h - 1))
                z = float(depth_map[py, px])
                joints.append(
                    {
                        "x": float(x),
                        "y": float(y),
                        "z": z,
                        "confidence": float(conf),
                    }
                )
            keypoints_list.append(joints)

    return depth_pil, keypoints_list


iface = gr.Interface(
    fn=run_pose_depth,
    inputs=gr.Image(type="pil"),
    outputs=[gr.Image(type="pil"), gr.JSON()],
    title="YOLOv11 Pose + MiDaS Depth",
    description="Upload image → Pose keypoints (X,Y,Z) with depth.",
)

# Guard the launch so importing this module (e.g. for testing) does not
# start a web server as a side effect.
if __name__ == "__main__":
    iface.launch()