File size: 2,274 Bytes
8e36f3b
 
 
 
 
 
018189f
8e36f3b
47e7e45
8e36f3b
 
018189f
8e36f3b
 
73dcbc4
018189f
73dcbc4
018189f
 
8e36f3b
018189f
8e36f3b
 
018189f
8e36f3b
73dcbc4
8e36f3b
 
 
 
 
 
 
018189f
8e36f3b
 
 
 
 
 
 
73dcbc4
018189f
 
 
 
73dcbc4
 
018189f
73dcbc4
018189f
 
 
73dcbc4
89a69e2
018189f
 
 
 
8e36f3b
018189f
8e36f3b
018189f
73dcbc4
 
8e36f3b
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import torch
import gradio as gr
import cv2
import numpy as np
from PIL import Image

# -------- Load MiDaS model ----------
# Pick GPU when available; all inference tensors are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# MiDaS_small: lightweight monocular depth model fetched via torch.hub
# (downloads weights on first run; requires network access).
midas = torch.hub.load("isl-org/MiDaS", "MiDaS_small", trust_repo=True)
midas.to(device)
midas.eval()  # inference only — disables dropout/batchnorm updates
# The matching preprocessing pipeline for the small model variant.
transforms = torch.hub.load("isl-org/MiDaS", "transforms", trust_repo=True)
transform = transforms.small_transform

# -------- Load YOLOv11 Pose Model ----------
# NOTE(review): mid-file import — conventionally this belongs at the top
# with the other imports.
from ultralytics import YOLO
pose_model = YOLO("yolo11n-pose.pt")  # YOLOv11 pose

def run_pose_depth(image: Image.Image):
    """Estimate per-joint 3D-ish keypoints from a single image.

    Runs MiDaS monocular depth and YOLO pose estimation, then samples the
    depth map at each detected keypoint.

    Parameters:
        image: input image (PIL). Grayscale and RGBA inputs are normalized
            to 3-channel RGB.

    Returns:
        (depth_pil, keypoints_list):
            depth_pil — MAGMA-colorized depth visualization (PIL RGB).
            keypoints_list — one list per detected person; each joint is a
            dict with "x", "y" (pixel coords), "z" (MiDaS relative inverse
            depth at that pixel — not metric), and "confidence".
    """
    img = np.array(image)
    # Normalize to 3-channel RGB. Checking ndim first avoids an IndexError
    # on grayscale (2-D) inputs that the original shape[2] test would raise.
    if img.ndim == 2:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    elif img.shape[2] == 4:
        img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
    # PIL arrays are already RGB — the original BGR2RGB swap silently fed
    # channel-swapped (BGR) data to both models.
    img_rgb = img
    h, w, _ = img_rgb.shape

    # --- Depth (MiDaS, relative inverse depth) ---
    input_batch = transform(img_rgb).to(device)
    with torch.no_grad():
        prediction = midas(input_batch)
    # Upsample the low-res prediction back to the input resolution.
    prediction = torch.nn.functional.interpolate(
        prediction.unsqueeze(1),
        size=img_rgb.shape[:2],
        mode="bicubic",
        align_corners=False,
    ).squeeze()
    depth_map = prediction.cpu().numpy()

    # Normalize to [0, 1]; guard against a constant map (flat image) to
    # avoid division by zero.
    d_min, d_max = depth_map.min(), depth_map.max()
    span = float(d_max - d_min)
    depth_norm = (depth_map - d_min) / (span if span > 0 else 1.0)
    depth_img = cv2.applyColorMap(
        (depth_norm * 255).astype(np.uint8), cv2.COLORMAP_MAGMA
    )
    # applyColorMap emits BGR; convert so PIL renders the colors correctly.
    depth_pil = Image.fromarray(cv2.cvtColor(depth_img, cv2.COLOR_BGR2RGB))

    # --- Pose keypoints, with per-joint depth sampled from the map ---
    results = pose_model(img_rgb)[0]
    keypoints_list = []

    if results.keypoints is not None:
        for person in results.keypoints:
            # person.data: (1, K, 3) tensor of (x, y, confidence) per joint.
            keypoints_array = person.data.cpu().numpy()[0]
            joints = []
            for x, y, conf in keypoints_array:
                # Clamp to image bounds before indexing the depth map —
                # keypoints can fall slightly outside the frame.
                px = int(np.clip(x, 0, w - 1))
                py = int(np.clip(y, 0, h - 1))
                joints.append({
                    "x": float(x),
                    "y": float(y),
                    "z": float(depth_map[py, px]),
                    "confidence": float(conf),
                })
            keypoints_list.append(joints)

    return depth_pil, keypoints_list

# -------- Gradio UI ----------
iface = gr.Interface(
    fn=run_pose_depth,
    inputs=gr.Image(type="pil"),
    outputs=[gr.Image(type="pil"), gr.JSON()],
    title="YOLOv11 Pose + MiDaS Depth",
    description="Upload image → Pose keypoints (X,Y,Z) with depth."
)

# Launch only when executed as a script, so the module can be imported
# (e.g. by tests or a WSGI wrapper) without starting a server.
if __name__ == "__main__":
    iface.launch()