import gradio as gr
import torch
from torchvision.ops import nms
from PIL import Image, ImageDraw

from transformers import AutoImageProcessor, AutoModelForObjectDetection

# Run inference on the GPU when CUDA is available, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Repository identifiers handed to `from_pretrained` below
# (presumably raccoon-detection fine-tunes, judging by the names).
YOLOS_REPO_ID = "adcelis/yolo_finetuned_raccoon"
DETR_REPO_ID  = "adcelis/detr_finetuned_raccoon"

# Load YOLOS: image processor plus model, moved to DEVICE and switched to
# evaluation mode. This happens once at import time.
proc_yolos = AutoImageProcessor.from_pretrained(YOLOS_REPO_ID)
model_yolos = AutoModelForObjectDetection.from_pretrained(YOLOS_REPO_ID).to(DEVICE)
model_yolos.eval()

# Load DETR: same procedure as for YOLOS above.
proc_detr = AutoImageProcessor.from_pretrained(DETR_REPO_ID)
model_detr = AutoModelForObjectDetection.from_pretrained(DETR_REPO_ID).to(DEVICE)
model_detr.eval()

@torch.no_grad()
def predict_tf(pil_img, processor, model, conf):
    """Detect objects in a single image with a processor/model pair.

    Args:
        pil_img: Input image exposing a PIL-style ``size`` of (width, height).
        processor: Image processor used both to build the model inputs and
            to post-process the raw model outputs.
        model: Detection model, called with the processed inputs on DEVICE.
        conf: Score threshold forwarded to the post-processing step.

    Returns:
        A ``(boxes, scores, labels)`` tuple of CPU tensors for the image.
    """
    batch = processor(images=[pil_img], return_tensors="pt").to(DEVICE)
    raw_outputs = model(**batch)

    # Post-processing expects (height, width); PIL reports (width, height).
    width, height = pil_img.size
    sizes = torch.tensor([[height, width]], device=DEVICE)

    detections = processor.post_process_object_detection(
        raw_outputs, threshold=conf, target_sizes=sizes
    )
    # Only one image was submitted, so only the first result is relevant.
    first = detections[0]
    return first["boxes"].cpu(), first["scores"].cpu(), first["labels"].cpu()

def ensemble_union_nms(boxes1, scores1, labels1, boxes2, scores2, labels2,
                       w2=0.8, iou_thr=0.5, score_thr=0.25):
    """Fuse two detection sets via union followed by per-class NMS.

    The detections of both sources are concatenated, the scores of the
    second source are scaled by ``w2``, low-scoring entries are dropped,
    and NMS is then applied separately for every label value.

    Args:
        boxes1, scores1, labels1: Detections of the first source.
        boxes2, scores2, labels2: Detections of the second source.
        w2: Multiplier applied to ``scores2`` before fusion.
        iou_thr: IoU threshold handed to ``nms``.
        score_thr: Minimum (weighted) score a detection needs to be kept.

    Returns:
        ``(boxes, scores, labels)`` of the surviving detections, ordered by
        descending score. When nothing passes ``score_thr`` the (empty)
        filtered tensors are returned as they are.
    """
    merged_boxes = torch.cat((boxes1, boxes2), dim=0)
    merged_scores = torch.cat((scores1, scores2 * w2), dim=0)
    merged_labels = torch.cat((labels1, labels2), dim=0)

    # Drop weak detections before running NMS.
    confident = merged_scores >= score_thr
    merged_boxes = merged_boxes[confident]
    merged_scores = merged_scores[confident]
    merged_labels = merged_labels[confident]
    if merged_boxes.numel() == 0:
        return merged_boxes, merged_scores, merged_labels

    # NMS per class, so boxes with different labels never suppress each other.
    survivors_per_class = []
    for class_id in merged_labels.unique():
        members = (merged_labels == class_id).nonzero(as_tuple=True)[0]
        winners = nms(merged_boxes[members], merged_scores[members], iou_thr)
        survivors_per_class.append(members[winners])

    survivors = torch.cat(survivors_per_class)
    # Re-rank globally: the per-class chunks are only sorted within a class.
    ranking = torch.argsort(merged_scores[survivors], descending=True)
    survivors = survivors[ranking]
    return merged_boxes[survivors], merged_scores[survivors], merged_labels[survivors]

def draw_boxes(pil_img, boxes, scores, labels, id2label):
    """Return a copy of ``pil_img`` annotated with the given detections.

    Args:
        pil_img: Source image; it is copied and left unmodified.
        boxes: Iterable of boxes, each yielding ``[x1, y1, x2, y2]`` through
            ``.tolist()``.
        scores: Iterable of scores, parallel to ``boxes``.
        labels: Iterable of class ids, parallel to ``boxes``.
        id2label: Mapping from integer class id to display name; ids that
            are missing are rendered as their number.

    Returns:
        The annotated copy of the image.
    """
    canvas = pil_img.copy()
    pen = ImageDraw.Draw(canvas)
    for box, score, label in zip(boxes, scores, labels):
        left, top, right, bottom = map(float, box.tolist())
        pen.rectangle((left, top, right, bottom), outline="green", width=2)

        class_id = int(label)
        caption = f"{id2label.get(class_id, str(class_id))} {float(score):.2f}"
        # The caption is anchored at the top-left corner of the box.
        pen.text((left, top), caption, fill="black")
    return canvas

def run(pil_img, yolos_conf, detr_conf, w2, iou_thr, score_thr):
    """Run both detectors on an image and return the fused detections.

    Args:
        pil_img: Input PIL image, or ``None`` when no image was supplied.
        yolos_conf: Score threshold used when post-processing YOLOS outputs.
        detr_conf: Score threshold used when post-processing DETR outputs.
        w2: Weight multiplied into the DETR scores before fusion.
        iou_thr: IoU threshold for the per-class NMS.
        score_thr: Minimum weighted score a fused detection must reach.

    Returns:
        ``(annotated_image, rows)`` where every row is
        ``[label, score, x1, y1, x2, y2]``. When ``pil_img`` is ``None`` the
        result is ``(None, [])``.
    """
    # Gradio passes None when the form is submitted without an image; return
    # empty outputs instead of failing with AttributeError on `.convert`.
    if pil_img is None:
        return None, []

    pil_img = pil_img.convert("RGB")

    b1, s1, l1 = predict_tf(pil_img, proc_yolos, model_yolos, yolos_conf)
    b2, s2, l2 = predict_tf(pil_img, proc_detr, model_detr, detr_conf)

    be, se, le = ensemble_union_nms(
        b1, s1, l1, b2, s2, l2, w2=w2, iou_thr=iou_thr, score_thr=score_thr
    )

    # id2label is taken from YOLOS only; both models are expected to share
    # the same single class (raccoon).
    id2label = model_yolos.config.id2label
    out_img = draw_boxes(pil_img, be, se, le, id2label)

    rows = []
    for box, score, label in zip(be, se, le):
        x1, y1, x2, y2 = [round(float(v), 2) for v in box.tolist()]
        class_id = int(label)
        name = id2label.get(class_id, str(class_id))
        rows.append([name, round(float(score), 3), x1, y1, x2, y2])

    return out_img, rows

# Gradio UI definition. The values of the input components are passed to
# `run` positionally, in the order they are listed here; the two outputs
# receive the `(image, rows)` tuple that `run` returns.
demo = gr.Interface(
    fn=run,
    inputs=[
        gr.Image(type="pil", label="Imagen"),
        # Per-model score thresholds (yolos_conf, detr_conf).
        gr.Slider(0.05, 0.9, value=0.5, step=0.05, label="YOLOS conf"),
        gr.Slider(0.05, 0.9, value=0.5, step=0.05, label="DETR conf"),
        # Ensemble parameters: DETR score weight, NMS IoU threshold and the
        # minimum score applied after weighting.
        gr.Slider(0.1, 1.5, value=0.8, step=0.05, label="Peso DETR (w2)"),
        gr.Slider(0.1, 0.9, value=0.5, step=0.05, label="NMS IoU"),
        gr.Slider(0.05, 0.9, value=0.25, step=0.05, label="Score mínimo (post-ensemble)"),
    ],
    outputs=[
        gr.Image(type="pil", label="Ensemble (NMS)"),
        # Column order matches the rows built in `run`.
        gr.Dataframe(headers=["label", "score", "x1", "y1", "x2", "y2"], label="Detecciones"),
    ],
    title="Ensemble YOLOS + DETR con Non-Maximum Suppression",
)

# Start the web app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()