import torch
import gradio as gr
from PIL import Image, ImageDraw, ImageFont

from transformers import DetrImageProcessor, DetrForObjectDetection

# =========================
# Configuración del modelo
# =========================
MODEL_NAME = "facebook/detr-resnet-50"
processor = DetrImageProcessor.from_pretrained(MODEL_NAME)
model = DetrForObjectDetection.from_pretrained(MODEL_NAME)
model.eval()

# En Spaces normalmente es CPU, pero dejamos esto robusto
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model.to(DEVICE)


def _draw_boxes(image: Image.Image, detections, line_width: int = 3) -> Image.Image:
    """
    Dibuja bounding boxes y etiquetas sobre una imagen PIL.
    detections: lista de dicts con keys: label, score, box=[x1,y1,x2,y2]
    """
    img = image.copy().convert("RGB")
    draw = ImageDraw.Draw(img)

    # Fuente (si no existe, usa default)
    try:
        font = ImageFont.truetype("DejaVuSans.ttf", 14)
    except Exception:
        font = ImageFont.load_default()

    for det in detections:
        x1, y1, x2, y2 = det["box"]
        label = det["label"]
        score = det["score"]

        # Caja
        draw.rectangle([x1, y1, x2, y2], width=line_width)

        # Etiqueta
        text = f"{label} {score:.2f}"

        bbox = draw.textbbox((0, 0), text, font=font)
        text_w = bbox[2] - bbox[0]
        text_h = bbox[3] - bbox[1]

        # Fondo del texto
        draw.rectangle(
            [x1, max(0, y1 - text_h - 6), x1 + text_w + 6, y1],
            fill="black"
        )
        draw.text((x1 + 3, max(0, y1 - text_h - 3)), text, fill="white", font=font)


    return img


def detect_objects(
    image: Image.Image,
    threshold: float = 0.7,
    top_k: int = 10,
    show_boxes: bool = True,
):
    """
    Detecta objetos con DETR y devuelve:
    1) Imagen anotada (opcional)
    2) Tabla con detecciones (label, score, box)
    3) Resumen textual (conteo por clase)
    """
    if image is None:
        return None, [], "Por favor sube una imagen."

    # Preprocesamiento
    inputs = processor(images=image, return_tensors="pt").to(DEVICE)

    # Inferencia
    with torch.no_grad():
        outputs = model(**inputs)

    # Post-procesamiento
    target_sizes = torch.tensor([image.size[::-1]], device=DEVICE)  # (alto, ancho)
    results = processor.post_process_object_detection(
        outputs, target_sizes=target_sizes, threshold=threshold
    )[0]

    labels = results["labels"].tolist()
    scores = results["scores"].tolist()
    boxes = results["boxes"].tolist()

    if len(labels) == 0:
        msg = (
            f"No se detectaron objetos con threshold={threshold:.2f}. "
            "Prueba bajándolo a 0.6–0.7 y usa una imagen con objetos claros (personas, carros, perros)."
        )
        return image, [], msg

    # Convertimos a detecciones con nombre legible
    detections = []
    for label_id, score, box in zip(labels, scores, boxes):
        label_name = model.config.id2label.get(label_id, str(label_id))
        x1, y1, x2, y2 = box
        detections.append(
            {
                "label": label_name,
                "score": float(score),
                "box": [float(x1), float(y1), float(x2), float(y2)],
            }
        )

    # Ordenar por score y limitar top-k
    detections = sorted(detections, key=lambda d: d["score"], reverse=True)[: int(top_k)]

    # Tabla para Gradio (Dataframe acepta lista de listas)
    table = [
        [d["label"], round(d["score"], 3),
         [round(v, 1) for v in d["box"]]]
        for d in detections
    ]

    # Resumen por clase
    counts = {}
    for d in detections:
        counts[d["label"]] = counts.get(d["label"], 0) + 1
    summary = "Resumen (top-k): " + ", ".join([f"{k}: {v}" for k, v in sorted(counts.items())])

    # Imagen anotada
    if show_boxes:
        annotated = _draw_boxes(image, detections)
    else:
        annotated = image

    return annotated, table, summary


# =========================
# Interfaz (UX mejorada)
# =========================
with gr.Blocks(title="Detección de Objetos con DETR (Transformers)") as demo:
    gr.Markdown(
        """
# Detección de Objetos con DETR (Hugging Face Transformers)
Sube una imagen y el modelo **DETR** detectará objetos del dataset **COCO**.

**Tip:** Si no detecta nada, baja el *threshold* a **0.6–0.7**.
        """
    )

    with gr.Row():
        inp_image = gr.Image(type="pil", label="Sube una imagen")
        out_image = gr.Image(type="pil", label="Imagen con detecciones")

    with gr.Row():
        threshold = gr.Slider(0.1, 0.99, value=0.7, step=0.01, label="Threshold (confianza)")
        top_k = gr.Slider(1, 50, value=10, step=1, label="Top-K detecciones")
        show_boxes = gr.Checkbox(value=True, label="Mostrar bounding boxes")

    btn = gr.Button("Detectar objetos")

    out_table = gr.Dataframe(
        headers=["Objeto", "Score", "Box [x1,y1,x2,y2]"],
        label="Detecciones (ordenadas por score)",
        wrap=True
    )
    out_summary = gr.Textbox(label="Resumen")

    btn.click(
        fn=detect_objects,
        inputs=[inp_image, threshold, top_k, show_boxes],
        outputs=[out_image, out_table, out_summary]
    )

if __name__ == "__main__":
    demo.launch()