"""Gradio demo: YOLO pose keypoints augmented with MiDaS monocular depth.

Upload an image; the app returns a colorized depth map and, for every
detected person, per-joint (x, y, z, confidence) where z is sampled from
the MiDaS depth map at the keypoint location.
"""

import numpy as np
import torch
import cv2
import gradio as gr
from PIL import Image
from ultralytics import YOLO

# -------- Load MiDaS model ----------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
midas = torch.hub.load("isl-org/MiDaS", "MiDaS_small", trust_repo=True)
midas.to(device)
midas.eval()
# Avoid shadowing the common `transforms` name: fetch the hub bundle, keep
# only the transform matched to MiDaS_small.
_midas_transforms = torch.hub.load("isl-org/MiDaS", "transforms", trust_repo=True)
transform = _midas_transforms.small_transform

# -------- Load YOLO pose model ----------
pose_model = YOLO("yolo11n-pose.pt")  # YOLO11 nano pose checkpoint


def run_pose_depth(image: Image.Image):
    """Run pose estimation + depth estimation on one image.

    Args:
        image: Input image from Gradio (``type="pil"`` → RGB PIL image).

    Returns:
        Tuple of (colorized depth map as PIL image,
                  list of persons, each a list of joint dicts with keys
                  ``x``, ``y``, ``z``, ``confidence``).
    """
    img = np.array(image)

    # Normalize to a 3-channel RGB array. Gradio's PIL images are RGB,
    # but uploads may be grayscale (2-D) or RGBA (4-channel).
    if img.ndim == 2:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    elif img.shape[2] == 4:
        img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
    # NOTE: no BGR<->RGB swap here — the PIL input is already RGB, which is
    # what both the MiDaS transform and Ultralytics expect for ndarrays
    # created from PIL. A COLOR_BGR2RGB call on RGB data would silently
    # swap channels and degrade both models.
    img_rgb = img
    h, w, _ = img_rgb.shape

    # --- Depth (MiDaS) ---
    input_batch = transform(img_rgb).to(device)
    with torch.no_grad():
        prediction = midas(input_batch)
        # Upsample the low-res prediction back to the input resolution.
        prediction = torch.nn.functional.interpolate(
            prediction.unsqueeze(1),
            size=img_rgb.shape[:2],
            mode="bicubic",
            align_corners=False,
        ).squeeze()
    depth_map = prediction.cpu().numpy()

    # Normalize to [0, 1]; guard against a constant depth map (range 0)
    # which would otherwise divide by zero and fill the output with NaNs.
    depth_range = depth_map.max() - depth_map.min()
    if depth_range > 0:
        depth_norm = (depth_map - depth_map.min()) / depth_range
    else:
        depth_norm = np.zeros_like(depth_map)
    depth_img = (depth_norm * 255).astype(np.uint8)
    depth_img = cv2.applyColorMap(depth_img, cv2.COLORMAP_MAGMA)
    depth_pil = Image.fromarray(depth_img)

    # --- Pose (YOLO) ---
    results = pose_model(img_rgb)[0]

    keypoints_list = []
    if results.keypoints is not None:
        for person in results.keypoints:
            # person.data: (1, num_kpts, 3) tensor of (x, y, conf)
            keypoints_array = person.data.cpu().numpy()[0]
            joints = []
            for x, y, conf in keypoints_array:
                # Clip to image bounds so the depth lookup never indexes
                # outside the map (keypoints can land slightly off-frame).
                px = int(np.clip(x, 0, w - 1))
                py = int(np.clip(y, 0, h - 1))
                z = float(depth_map[py, px])
                joints.append(
                    {
                        "x": float(x),
                        "y": float(y),
                        "z": z,
                        "confidence": float(conf),
                    }
                )
            keypoints_list.append(joints)

    return depth_pil, keypoints_list


iface = gr.Interface(
    fn=run_pose_depth,
    inputs=gr.Image(type="pil"),
    outputs=[gr.Image(type="pil"), gr.JSON()],
    title="YOLOv11 Pose + MiDaS Depth",
    description="Upload image → Pose keypoints (X,Y,Z) with depth.",
)

# Guard the launch so importing this module (e.g. for testing) does not
# start a web server as a side effect.
if __name__ == "__main__":
    iface.launch()