"""Live webcam object detection with an IoU-based hard-voting ensemble.

Each webcam frame is run through three detectors (YOLOv8, Faster R-CNN,
Deformable DETR).  A box is kept only when at least two *different* models
produce overlapping detections (IoU >= 0.5); agreeing boxes are averaged.
The annotated frame is streamed back through a Gradio live interface.
"""

import gradio as gr
import torch
import numpy as np
import cv2
from PIL import Image
from ultralytics import YOLO
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from transformers import AutoImageProcessor, AutoModelForObjectDetection

# -------------------------------------------------
# Device
# -------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# -------------------------------------------------
# Load Models
# -------------------------------------------------
# YOLOv8 nano checkpoint (downloaded on first use by ultralytics).
yolo = YOLO("yolov8n.pt")

# Faster R-CNN.  NOTE: `pretrained=True` is deprecated and removed in
# recent torchvision; the "DEFAULT" weights alias selects the same
# pretrained COCO weights.
frcnn = fasterrcnn_resnet50_fpn(weights="DEFAULT")
frcnn.to(device).eval()

# Deformable DETR (HuggingFace).
processor = AutoImageProcessor.from_pretrained(
    "SenseTime/deformable-detr", use_fast=False
)
detr = AutoModelForObjectDetection.from_pretrained("SenseTime/deformable-detr")
detr.to(device).eval()


# -------------------------------------------------
# Utility Functions
# -------------------------------------------------
def compute_iou(box1, box2):
    """Return the intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.

    A small epsilon in the denominator avoids division by zero for
    degenerate (zero-area) boxes.
    """
    x1, y1 = max(box1[0], box2[0]), max(box1[1], box2[1])
    x2, y2 = min(box1[2], box2[2]), min(box1[3], box2[3])
    inter = max(0, x2 - x1) * max(0, y2 - y1)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    return inter / (area1 + area2 - inter + 1e-6)


def draw_boxes(image, detections):
    """Draw labeled green rectangles on a copy of *image*.

    Each detection is a dict with a ``"box"`` (x1, y1, x2, y2) and a
    ``"label"`` string.  Returns a new PIL image; the input is untouched.
    """
    img = np.array(image)  # copies, so the caller's image is preserved
    for d in detections:
        x1, y1, x2, y2 = map(int, d["box"])
        label = d["label"]
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(
            img, label, (x1, y1 - 6),
            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1,
        )
    return Image.fromarray(img)


# -------------------------------------------------
# Model Inference
# -------------------------------------------------
def yolo_detect(image):
    """Run YOLOv8 and return detections tagged ``"model": "YOLO"``.

    NOTE(review): no explicit score threshold is applied here, unlike the
    0.6 / 0.7 cutoffs used for FRCNN / DETR — the YOLO defaults are relied
    upon; confirm this is intentional.
    """
    results = yolo(image)[0]
    dets = []
    for b in results.boxes:
        dets.append({
            "box": b.xyxy[0].cpu().numpy(),
            "model": "YOLO",
        })
    return dets


def frcnn_detect(image):
    """Run Faster R-CNN; keep boxes with score > 0.6, tagged ``"FRCNN"``."""
    # Build a float32 CHW tensor in [0, 1] directly (avoids an
    # intermediate float64 array).
    img = torch.tensor(
        np.asarray(image, dtype=np.float32) / 255.0
    ).permute(2, 0, 1)
    img = img.unsqueeze(0).to(device)
    with torch.no_grad():
        out = frcnn(img)[0]
    dets = []
    for box, score in zip(out["boxes"], out["scores"]):
        if score > 0.6:
            dets.append({
                "box": box.cpu().numpy(),
                "model": "FRCNN",
            })
    return dets


def detr_detect(image):
    """Run Deformable DETR; keep boxes with score > 0.7, tagged ``"DETR"``."""
    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = detr(**inputs)
    # post-processing wants (height, width); PIL's .size is (width, height)
    size = torch.tensor([image.size[::-1]]).to(device)
    results = processor.post_process_object_detection(
        outputs, target_sizes=size, threshold=0.7
    )[0]
    dets = []
    for box in results["boxes"]:
        dets.append({
            "box": box.cpu().numpy(),
            "model": "DETR",
        })
    return dets


# -------------------------------------------------
# HARD VOTING
# -------------------------------------------------
def hard_vote(detections, vote_thresh=2, iou_thresh=0.5):
    """Keep only boxes that >= *vote_thresh* distinct models agree on.

    For every detection, collect detections from *other* models whose IoU
    with it is at least *iou_thresh*.  If the agreeing group spans enough
    distinct models, emit their coordinate-wise mean box.  Near-identical
    results (IoU > 0.8) produced by different seeds of the same cluster
    are deduplicated at the end.  O(n^2) in the number of detections,
    which is fine for per-frame counts.
    """
    final = []
    for d in detections:
        votes = [d]
        for o in detections:
            if d["model"] != o["model"]:
                if compute_iou(d["box"], o["box"]) >= iou_thresh:
                    votes.append(o)
        models = set(v["model"] for v in votes)
        if len(models) >= vote_thresh:
            avg_box = np.mean([v["box"] for v in votes], axis=0)
            final.append({
                "box": avg_box,
                "label": f"Ensemble ({len(models)})",
            })
    # Remove duplicates: every member of an agreeing cluster produced
    # (nearly) the same averaged box above.
    unique = []
    for d in final:
        if not any(compute_iou(d["box"], u["box"]) > 0.8 for u in unique):
            unique.append(d)
    return unique


# -------------------------------------------------
# LIVE FRAME FUNCTION
# -------------------------------------------------
def live_detect(frame):
    """Process one webcam frame (HWC uint8 array) through the ensemble.

    Returns the annotated frame as a numpy array, or ``None`` for the
    empty frames Gradio streaming can deliver during webcam warm-up.
    """
    if frame is None:
        return None
    image = Image.fromarray(frame)
    detections = (
        yolo_detect(image)
        + frcnn_detect(image)
        + detr_detect(image)
    )
    voted = hard_vote(detections)
    output = draw_boxes(image, voted)
    return np.array(output)


# -------------------------------------------------
# Gradio Interface (Webcam)
# -------------------------------------------------
# NOTE(review): `source=` is the Gradio 3.x keyword; Gradio 4+ renamed it
# to `sources=["webcam"]` — confirm against the pinned gradio version.
demo = gr.Interface(
    fn=live_detect,
    inputs=gr.Image(source="webcam", streaming=True),
    outputs=gr.Image(),
    live=True,
    title="Live Object Detection – Hard Voting Ensemble",
    description=(
        "YOLOv8 + Faster R-CNN + Deformable DETR\n"
        "Browser-based webcam with IoU-based hard voting."
    ),
)

if __name__ == "__main__":
    demo.launch()