Spaces:

nathbns
/

yolo3_from_scratch

Running

File size: 7,440 Bytes

import gradio as gr
import torch
import cv2
import numpy as np
from PIL import Image
from huggingface_hub import hf_hub_download
import os

# Runtime configuration: run on the GPU when one is available, else on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

IMAGE_SIZE = 416   # side length (pixels) of the square model input
NUM_CLASSES = 20   # number of object categories

# YOLOv3 anchors as (width, height) pairs normalised to the image size,
# one group of three per prediction scale.
ANCHORS = [
    [(0.28, 0.22), (0.38, 0.48), (0.9, 0.78)],
    [(0.07, 0.15), (0.15, 0.11), (0.14, 0.29)],
    [(0.02, 0.03), (0.04, 0.07), (0.08, 0.06)],
]

# Grid size of each prediction scale: the input size divided by strides 32, 16, 8.
S = [IMAGE_SIZE // stride for stride in (32, 16, 8)]

# Pascal VOC class names, indexed by class id.
PASCAL_CLASSES = [
    "aeroplane", "bicycle", "bird", "boat",
    "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog",
    "horse", "motorbike", "person", "pottedplant",
    "sheep", "sofa", "train", "tvmonitor",
]

# Import du modèle
from model import YOLOv3
from utils import cells_to_bboxes, non_max_suppression


class YOLOv3Detector:
    """YOLOv3 inference wrapper: load a checkpoint, detect objects, draw results.

    Relies on the module-level configuration (``DEVICE``, ``IMAGE_SIZE``,
    ``NUM_CLASSES``, ``ANCHORS``, ``S``, ``PASCAL_CLASSES``) and on the project
    helpers ``YOLOv3``, ``cells_to_bboxes`` and ``non_max_suppression``.
    """

    def __init__(self, checkpoint_path):
        """Load the model weights and precompute anchors and class colors.

        Args:
            checkpoint_path: Path of a checkpoint readable by ``torch.load``
                that stores the model weights under the ``"state_dict"`` key.
        """
        # Load the model
        self.model = YOLOv3(num_classes=NUM_CLASSES).to(DEVICE)
        # NOTE(review): torch.load unpickles the file, so only checkpoints from
        # a trusted source should be loaded here (consider weights_only=True
        # if the installed PyTorch version supports it).
        checkpoint = torch.load(checkpoint_path, map_location=DEVICE)
        self.model.load_state_dict(checkpoint["state_dict"])
        self.model.eval()

        # Scale each group of normalised anchors by the grid size of its scale.
        self.scaled_anchors = (
            torch.tensor(ANCHORS)
            * torch.tensor(S).unsqueeze(1).unsqueeze(1).repeat(1, 3, 2)
        ).to(DEVICE)

        # One fixed color per class. A private RandomState gives the same
        # values as seeding the global generator with 42, without reseeding
        # NumPy's global random state for the rest of the program.
        rng = np.random.RandomState(42)
        self.colors = rng.randint(
            0, 255, size=(len(PASCAL_CLASSES), 3), dtype=np.uint8
        )

    @staticmethod
    def _to_rgb_array(image):
        """Return *image* (PIL image or array) as a 3-channel NumPy array."""
        if not isinstance(image, np.ndarray):
            if isinstance(image, Image.Image):
                # Grayscale, palette and RGBA images all become plain RGB.
                image = image.convert("RGB")
            image = np.array(image)

        if image.ndim == 2:
            # Single-channel image: replicate it on three channels.
            return np.stack((image,) * 3, axis=-1)
        if image.ndim == 3 and image.shape[2] == 1:
            return np.repeat(image, 3, axis=2)
        if image.ndim == 3 and image.shape[2] == 4:
            # Drop the alpha channel.
            return image[:, :, :3]
        return image

    def preprocess_image(self, image):
        """Turn an image into a model-ready batch of one.

        Args:
            image: PIL image or NumPy array (height x width [x channels]).

        Returns:
            A tuple ``(tensor, original_shape)``: the image resized to
            ``IMAGE_SIZE`` x ``IMAGE_SIZE``, divided by 255, laid out as
            (1, channels, height, width) on ``DEVICE``, and the
            ``(height, width)`` of the input image.
        """
        image = self._to_rgb_array(image)

        original_shape = image.shape[:2]
        # cv2 needs a contiguous buffer (channel slicing can produce a view).
        image_resized = cv2.resize(
            np.ascontiguousarray(image), (IMAGE_SIZE, IMAGE_SIZE)
        )

        # Normalise and convert to a channels-first tensor with a batch axis.
        image_tensor = torch.from_numpy(image_resized).float() / 255.0
        image_tensor = image_tensor.permute(2, 0, 1).unsqueeze(0)

        return image_tensor.to(DEVICE), original_shape

    def detect(self, image, conf_threshold=0.5, iou_threshold=0.45):
        """Detect objects in an image.

        Args:
            image: PIL image or NumPy array.
            conf_threshold: Confidence threshold passed to
                ``non_max_suppression`` as ``threshold``.
            iou_threshold: IoU threshold passed to ``non_max_suppression``.

        Returns:
            The boxes kept by ``non_max_suppression``. ``draw_boxes`` reads
            each one as ``[class index, confidence, x_center, y_center,
            width, height]``.
        """
        image_tensor, _ = self.preprocess_image(image)

        with torch.no_grad():
            predictions = self.model(image_tensor)

        # Convert the raw predictions of every scale into bounding boxes.
        bboxes = []
        for scale_predictions, anchors in zip(predictions, self.scaled_anchors):
            grid_size = scale_predictions.shape[2]
            boxes_scale = cells_to_bboxes(
                scale_predictions, anchors, S=grid_size, is_preds=True
            )
            # preprocess_image builds a batch of exactly one image, so only
            # the first per-image box list is relevant.
            bboxes.extend(boxes_scale[0])

        # Apply non-maximum suppression.
        nms_boxes = non_max_suppression(
            bboxes,
            iou_threshold=iou_threshold,
            threshold=conf_threshold,
            box_format="midpoint",
        )

        return nms_boxes

    def draw_boxes(self, image, boxes):
        """Draw labelled bounding boxes on a copy of the image.

        Args:
            image: PIL image or NumPy array; it is not modified.
            boxes: Iterable of ``[class index, confidence, x_center,
                y_center, width, height]`` with coordinates expressed as
                fractions of the image width and height.

        Returns:
            A tuple ``(annotated_image, detections_info)``: the annotated
            NumPy array and one ``"• class: confidence"`` string per box.
        """
        # copy() also guarantees the contiguous buffer cv2 drawing requires.
        image = self._to_rgb_array(image).copy()
        height, width = image.shape[:2]

        detections_info = []

        for box in boxes:
            class_idx = int(box[0])
            confidence = box[1]
            x_center, y_center, box_width, box_height = box[2:]

            # Convert to pixel coordinates.
            x1 = int((x_center - box_width / 2) * width)
            y1 = int((y_center - box_height / 2) * height)
            x2 = int((x_center + box_width / 2) * width)
            y2 = int((y_center + box_height / 2) * height)

            # Color of this class.
            color = self.colors[class_idx].tolist()

            # Draw the rectangle.
            cv2.rectangle(image, (x1, y1), (x2, y2), color, 2)

            # Label text.
            label = f"{PASCAL_CLASSES[class_idx]}: {confidence:.2f}"

            (text_width, text_height), _ = cv2.getTextSize(
                label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1
            )

            # The label normally sits just above the box. When the box touches
            # the top (or left) edge that spot is outside the image, so the
            # label is moved inside the box to stay visible.
            label_x = max(x1, 0)
            label_bottom = y1
            if label_bottom - text_height - 4 < 0:
                label_bottom = max(y1, 0) + text_height + 4

            # Label background.
            cv2.rectangle(
                image,
                (label_x, label_bottom - text_height - 4),
                (label_x + text_width, label_bottom),
                color,
                -1,
            )

            # White text.
            cv2.putText(
                image,
                label,
                (label_x, label_bottom - 4),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                (255, 255, 255),
                1,
            )

            detections_info.append(f"• {PASCAL_CLASSES[class_idx]}: {confidence:.1%}")

        return image, detections_info


# Fetch the trained checkpoint from the Hugging Face Hub.
checkpoint_path = hf_hub_download(
    filename="checkpoint.pth.tar",
    repo_id="nathbns/yolov3_from_scratch",
)

# Build the module-level detector used by predict().
detector = YOLOv3Detector(checkpoint_path=checkpoint_path)


def predict(image, conf_threshold, iou_threshold):
    """Gradio callback: detect objects and return the annotated image and a summary.

    Args:
        image: Input image, or ``None`` when no image was provided.
        conf_threshold: Confidence threshold forwarded to ``detector.detect``.
        iou_threshold: IoU threshold forwarded to ``detector.detect``.

    Returns:
        A tuple ``(result_image, text)``; ``result_image`` is ``None`` when
        no image was provided.
    """
    # Guard clause: nothing to do without an input image.
    if image is None:
        return None, "Aucune image fournie"

    # Run the detection, then draw its boxes on the image.
    annotated, found = detector.draw_boxes(
        image,
        detector.detect(image, conf_threshold, iou_threshold),
    )

    if not found:
        return annotated, "Aucun objet détecté"

    # Markdown summary: a bold header followed by one line per detection.
    header = f"**{len(found)} objet(s) détecté(s) :**\n\n"
    return annotated, header + "\n".join(found)


# Gradio interface: inputs and settings on the left, results on the right.
with gr.Blocks(title="YOLOv3 Object Detection", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # YOLOv3 Object Detection - Pascal VOC
        """
    )

    with gr.Row():
        # Left column: image upload, thresholds and the trigger button.
        with gr.Column():
            input_image = gr.Image(type="pil", label="Image d'entrée")

            with gr.Accordion("Paramètres", open=True):
                # Both thresholds share the same range and step.
                _slider_range = dict(minimum=0.1, maximum=1.0, step=0.05)
                conf_slider = gr.Slider(
                    value=0.5,
                    label="Seuil de confiance",
                    info="Plus élevé = moins de détections mais plus sûres",
                    **_slider_range,
                )
                iou_slider = gr.Slider(
                    value=0.45,
                    label="Seuil NMS (IoU)",
                    info="Plus élevé = plus de boîtes qui se chevauchent",
                    **_slider_range,
                )

            detect_btn = gr.Button("Détecter les objets", variant="primary", size="lg")

        # Right column: annotated image and textual summary.
        with gr.Column():
            output_image = gr.Image(label="Résultat")
            output_text = gr.Markdown(label="Détections")

    # Run the prediction on a button click and whenever the input image changes.
    for _register in (detect_btn.click, input_image.change):
        _register(
            fn=predict,
            inputs=[input_image, conf_slider, iou_slider],
            outputs=[output_image, output_text],
        )

if __name__ == "__main__":
    demo.launch()