"""Gradio demo: ensemble of fine-tuned YOLOS and DETR detectors merged with NMS."""

import gradio as gr
import torch
from torchvision.ops import nms
from PIL import Image, ImageDraw
from transformers import AutoImageProcessor, AutoModelForObjectDetection

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

YOLOS_REPO_ID = "adcelis/yolo_finetuned_raccoon"
DETR_REPO_ID = "adcelis/detr_finetuned_raccoon"

# Load YOLOS
proc_yolos = AutoImageProcessor.from_pretrained(YOLOS_REPO_ID)
model_yolos = AutoModelForObjectDetection.from_pretrained(YOLOS_REPO_ID).to(DEVICE)
model_yolos.eval()

# Load DETR
proc_detr = AutoImageProcessor.from_pretrained(DETR_REPO_ID)
model_detr = AutoModelForObjectDetection.from_pretrained(DETR_REPO_ID).to(DEVICE)
model_detr.eval()


@torch.no_grad()
def predict_tf(pil_img, processor, model, conf):
    """Run one detector on a single image.

    Args:
        pil_img: PIL image to analyse.
        processor: Image processor paired with ``model``.
        model: Object-detection model already moved to ``DEVICE``.
        conf: Score threshold passed to the processor's post-processing.

    Returns:
        A ``(boxes, scores, labels)`` tuple of CPU tensors for the image.
    """
    inputs = processor(images=[pil_img], return_tensors="pt").to(DEVICE)
    outputs = model(**inputs)

    # PIL reports (width, height); post-processing expects (height, width).
    width, height = pil_img.size
    target_sizes = torch.tensor([[height, width]], device=DEVICE)

    res = processor.post_process_object_detection(
        outputs, threshold=conf, target_sizes=target_sizes
    )[0]
    return res["boxes"].cpu(), res["scores"].cpu(), res["labels"].cpu()


def ensemble_union_nms(
    boxes1,
    scores1,
    labels1,
    boxes2,
    scores2,
    labels2,
    w2=0.8,
    iou_thr=0.5,
    score_thr=0.25,
):
    """Merge two detection sets and de-duplicate them with per-class NMS.

    The second model's scores are multiplied by ``w2`` before merging, so
    ``score_thr`` is applied to the weighted scores.

    Args:
        boxes1, scores1, labels1: Detections from the first model.
        boxes2, scores2, labels2: Detections from the second model.
        w2: Multiplicative weight applied to ``scores2``.
        iou_thr: IoU threshold handed to ``nms``.
        score_thr: Minimum (weighted) score a detection needs to be kept.

    Returns:
        ``(boxes, scores, labels)`` of the surviving detections, ordered by
        descending score. Empty tensors are returned when nothing survives
        the score filter.
    """
    boxes = torch.cat([boxes1, boxes2], dim=0)
    scores = torch.cat([scores1, scores2 * w2], dim=0)
    labels = torch.cat([labels1, labels2], dim=0)

    keep = scores >= score_thr
    boxes, scores, labels = boxes[keep], scores[keep], labels[keep]

    # Nothing left to suppress; torch.cat on an empty list below would fail.
    if boxes.numel() == 0:
        return boxes, scores, labels

    # NMS is run separately per class so boxes of different classes never
    # suppress each other.
    keep_all = []
    for cls in labels.unique():
        idx = torch.where(labels == cls)[0]
        kept = nms(boxes[idx], scores[idx], iou_thr)
        keep_all.append(idx[kept])

    keep_all = torch.cat(keep_all)
    keep_all = keep_all[scores[keep_all].argsort(descending=True)]
    return boxes[keep_all], scores[keep_all], labels[keep_all]


def _label_name(id2label, label):
    """Return the readable name for ``label``, falling back to its integer id."""
    label_id = int(label)
    return id2label.get(label_id, str(label_id))


def draw_boxes(pil_img, boxes, scores, labels, id2label):
    """Return a copy of ``pil_img`` with each detection drawn on it.

    Every box is outlined in green and annotated at its top-left corner with
    the class name and the score formatted to two decimals. The input image
    is not modified.
    """
    img = pil_img.copy()
    draw = ImageDraw.Draw(img)
    for box, score, label in zip(boxes, scores, labels):
        x1, y1, x2, y2 = [float(v) for v in box.tolist()]
        draw.rectangle((x1, y1, x2, y2), outline="green", width=2)
        name = _label_name(id2label, label)
        draw.text((x1, y1), f"{name} {float(score):.2f}", fill="black")
    return img


def run(pil_img, yolos_conf, detr_conf, w2, iou_thr, score_thr):
    """Gradio callback: detect with both models, ensemble, and render results.

    Args:
        pil_img: Uploaded PIL image (``None`` when the user submitted nothing).
        yolos_conf: Score threshold for YOLOS.
        detr_conf: Score threshold for DETR.
        w2: Weight applied to DETR scores in the ensemble.
        iou_thr: IoU threshold for NMS.
        score_thr: Minimum score after weighting.

    Returns:
        ``(annotated_image, rows)`` where each row is
        ``[label, score, x1, y1, x2, y2]``.

    Raises:
        gr.Error: If no image was provided.
    """
    # Gradio passes None when the form is submitted without an image; surface
    # a readable message instead of an AttributeError on ``.convert``.
    if pil_img is None:
        raise gr.Error("Sube una imagen antes de ejecutar la detección.")

    pil_img = pil_img.convert("RGB")

    b1, s1, l1 = predict_tf(pil_img, proc_yolos, model_yolos, yolos_conf)
    b2, s2, l2 = predict_tf(pil_img, proc_detr, model_detr, detr_conf)

    be, se, le = ensemble_union_nms(
        b1, s1, l1, b2, s2, l2, w2=w2, iou_thr=iou_thr, score_thr=score_thr
    )

    # id2label (same class in both models: raccoon)
    id2label = model_yolos.config.id2label

    out_img = draw_boxes(pil_img, be, se, le, id2label)

    rows = []
    for box, score, label in zip(be, se, le):
        x1, y1, x2, y2 = [round(float(v), 2) for v in box.tolist()]
        rows.append(
            [_label_name(id2label, label), round(float(score), 3), x1, y1, x2, y2]
        )

    return out_img, rows


demo = gr.Interface(
    fn=run,
    inputs=[
        gr.Image(type="pil", label="Imagen"),
        gr.Slider(0.05, 0.9, value=0.5, step=0.05, label="YOLOS conf"),
        gr.Slider(0.05, 0.9, value=0.5, step=0.05, label="DETR conf"),
        gr.Slider(0.1, 1.5, value=0.8, step=0.05, label="Peso DETR (w2)"),
        gr.Slider(0.1, 0.9, value=0.5, step=0.05, label="NMS IoU"),
        gr.Slider(0.05, 0.9, value=0.25, step=0.05, label="Score mínimo (post-ensemble)"),
    ],
    outputs=[
        gr.Image(type="pil", label="Ensemble (NMS)"),
        gr.Dataframe(
            headers=["label", "score", "x1", "y1", "x2", "y2"],
            label="Detecciones",
        ),
    ],
    title="Ensemble YOLOS + DETR con Non-Maximum Suppression",
)

if __name__ == "__main__":
    demo.launch()