nathbns committed on
Commit 6698bc8 · verified · 1 parent: 4490e35

Upload 6 files

Files changed (6)
  1. app.py +269 -0
  2. dataset.py +69 -0
  3. loss.py +83 -0
  4. model.py +110 -0
  5. requirements.txt +8 -0
  6. utils.py +337 -0
app.py ADDED
@@ -0,0 +1,269 @@
import torch
import gradio as gr
import numpy as np
from PIL import Image
import torchvision.transforms as transforms
from model import Yolov1
from utils import cellboxes_to_boxes, non_max_suppression
import cv2
import os
import glob
import time

# PASCAL VOC classes
CLASSES = [
    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor"
]

np.random.seed(42)
COLORS = np.random.randint(50, 255, size=(len(CLASSES), 3), dtype=np.uint8)

DEVICE = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
MODEL_PATH = "checkpoint_epoch_50.pth.tar"

# Load the model
print(f"Loading model from {MODEL_PATH}...")
model = Yolov1(split_size=7, num_boxes=2, num_classes=20).to(DEVICE)
checkpoint = torch.load(MODEL_PATH, map_location=DEVICE)
model.load_state_dict(checkpoint["state_dict"])
model.eval()
print("Model loaded successfully!")

# Model info
MODEL_INFO = {
    "mAP": checkpoint.get("mAP", "N/A"),
    "epoch": checkpoint.get("epoch", "N/A"),
    "device": DEVICE,
    "classes": len(CLASSES)
}
print(f"Training mAP: {MODEL_INFO['mAP']}")
print(f"Device: {DEVICE}")

# Load example images from the data folder
EXAMPLE_IMAGES = []
if os.path.exists("data/images"):
    image_files = glob.glob("data/images/*.jpg")[:20]  # take 20 images
    EXAMPLE_IMAGES = sorted(image_files)
    print(f"{len(EXAMPLE_IMAGES)} example images loaded")

def draw_boxes(image, boxes):
    """Draws the bounding boxes on the image."""
    img_array = np.array(image)
    height, width = img_array.shape[:2]

    for box in boxes:
        # box format: [class_pred, prob_score, x, y, width, height]
        class_pred = int(box[0])
        confidence = float(box[1])
        x_center, y_center, box_width, box_height = box[2:6]

        # Convert normalized coordinates to pixels
        x1 = int((x_center - box_width / 2) * width)
        y1 = int((y_center - box_height / 2) * height)
        x2 = int((x_center + box_width / 2) * width)
        y2 = int((y_center + box_height / 2) * height)

        # Class color
        color = tuple(int(c) for c in COLORS[class_pred])

        # Draw the rectangle
        cv2.rectangle(img_array, (x1, y1), (x2, y2), color, 2)

        # Label text
        label = f"{CLASSES[class_pred]}: {confidence:.2f}"

        # Text background
        (text_width, text_height), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
        cv2.rectangle(img_array, (x1, y1 - text_height - 5), (x1 + text_width, y1), color, -1)

        # White text
        cv2.putText(img_array, label, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)

    return Image.fromarray(img_array)

def detect_objects(image, confidence_threshold, iou_threshold, show_confidence=True):
    """Detects objects in an image and reports detailed statistics."""
    if image is None:
        return None, None, "**Please upload or select an image**"

    start_time = time.time()

    # Preprocess the image
    transform = transforms.Compose([
        transforms.Resize((448, 448)),
        transforms.ToTensor(),
    ])

    # Keep the original image for display
    original_image = image.copy()
    original_size = image.size  # (width, height)

    # Transform the image
    img_tensor = transform(image).unsqueeze(0).to(DEVICE)

    # Prediction
    with torch.no_grad():
        predictions = model(img_tensor)

    # Convert the predictions into bounding boxes
    bboxes = cellboxes_to_boxes(predictions)

    # Non-maximum suppression
    nms_boxes = non_max_suppression(
        bboxes[0],
        iou_threshold=iou_threshold,
        threshold=confidence_threshold,
        box_format="midpoint"
    )

    inference_time = time.time() - start_time

    # Draw the boxes
    result_image = draw_boxes(original_image, nms_boxes)

    # Detailed statistics
    num_detections = len(nms_boxes)
    class_counts = {}
    confidence_scores = []

    for box in nms_boxes:
        cls = CLASSES[int(box[0])]
        conf = float(box[1])
        class_counts[cls] = class_counts.get(cls, 0) + 1
        confidence_scores.append(conf)

    # Build a detailed report
    stats = "## Detection results\n\n"
    stats += f"**{num_detections} object(s) detected**\n\n"

    if num_detections > 0:
        stats += f"Inference time: **{inference_time:.3f}s**\n"
        stats += f"Image size: **{original_size[0]}x{original_size[1]}**\n"
        stats += f"Mean confidence: **{np.mean(confidence_scores):.2%}**\n\n"

        stats += "### Detected objects:\n\n"
        for cls, count in sorted(class_counts.items(), key=lambda x: x[1], reverse=True):
            stats += f"- **{cls}**: {count}\n"

        if show_confidence:
            stats += "\n### Individual confidences:\n\n"
            for i, box in enumerate(nms_boxes[:10], 1):  # top 10
                cls = CLASSES[int(box[0])]
                conf = float(box[1])
                stats += f"{i}. {cls}: {conf:.1%}\n"
            if len(nms_boxes) > 10:
                stats += f"\n*...and {len(nms_boxes) - 10} more detection(s)*\n"
    else:
        stats += "No objects detected.\n\n"

    return original_image, result_image, stats

# Gradio interface
with gr.Blocks(title="YOLO v1 - Object detection", theme=gr.themes.Soft(), css="""
.gradio-container {max-width: 1400px !important}
.example-gallery {height: 400px; overflow-y: auto}
""") as demo:

    # Header
    mAP_display = f"{MODEL_INFO['mAP']:.4f}" if isinstance(MODEL_INFO['mAP'], (int, float)) else MODEL_INFO['mAP']

    gr.Markdown(f"""
    # YOLO v1 - Real-time object detection
    Training mAP: **{mAP_display}**
    ---
    """)

    with gr.Tabs():
        # Main tab - Detection
        with gr.Tab("Detection"):
            gr.Markdown("""
            ### Upload your image or select an example
            **PASCAL VOC classes:** aeroplane, bicycle, bird, boat, bottle, bus, car, cat, chair, cow,
            diningtable, dog, horse, motorbike, person, pottedplant, sheep, sofa, train, tvmonitor
            """)

            with gr.Row():
                with gr.Column(scale=1):
                    input_image = gr.Image(type="pil", label="Input image")

                    with gr.Accordion("Advanced settings", open=True):
                        confidence_slider = gr.Slider(
                            minimum=0.05,
                            maximum=0.95,
                            value=0.4,
                            step=0.05,
                            label="Confidence threshold",
                            info="Lower = more detections"
                        )
                        iou_slider = gr.Slider(
                            minimum=0.1,
                            maximum=0.9,
                            value=0.5,
                            step=0.05,
                            label="IoU threshold (NMS)",
                            info="Higher = keeps more overlapping boxes"
                        )
                        show_conf_check = gr.Checkbox(
                            value=True,
                            label="Show detailed confidences"
                        )

                    detect_btn = gr.Button("Detect objects", variant="primary", size="lg")

                with gr.Column(scale=2):
                    with gr.Row():
                        original_display = gr.Image(type="pil", label="Original image")
                        output_image = gr.Image(type="pil", label="Result with detections")

                    output_stats = gr.Markdown("**Upload an image and click 'Detect' to get started!**")

            # Example gallery
            if EXAMPLE_IMAGES:
                gr.Markdown("### Examples (click to try)")
                examples_list = [[img, 0.4, 0.5, True] for img in EXAMPLE_IMAGES[:12]]
                gr.Examples(
                    examples=examples_list,
                    inputs=[input_image, confidence_slider, iou_slider, show_conf_check],
                    outputs=[original_display, output_image, output_stats],
                    fn=detect_objects,
                    cache_examples=False,
                    examples_per_page=6,
                )

            # Actions
            detect_btn.click(
                fn=detect_objects,
                inputs=[input_image, confidence_slider, iou_slider, show_conf_check],
                outputs=[original_display, output_image, output_stats]
            )

            input_image.change(
                fn=detect_objects,
                inputs=[input_image, confidence_slider, iou_slider, show_conf_check],
                outputs=[original_display, output_image, output_stats]
            )

        # Info tab
        with gr.Tab("About"):
            mAP_info = f"{MODEL_INFO['mAP']:.4f}" if isinstance(MODEL_INFO['mAP'], (int, float)) else "N/A"
            epoch_info = MODEL_INFO['epoch']
            gr.Markdown(f"""
            ### About this demo
            YOLO v1 trained from scratch on PASCAL VOC (20 classes).
            - Training mAP: **{mAP_info}**
            - Checkpoint epoch: **{epoch_info}**
            - Device: **{MODEL_INFO['device']}**
            """)

# Launch the app
if __name__ == "__main__":
    print("\n" + "=" * 60)
    print("Launching the YOLO v1 Gradio app")
    print("=" * 60)
    print(f"Model: {MODEL_PATH}")
    print(f"Device: {DEVICE}")
    print(f"Examples loaded: {len(EXAMPLE_IMAGES)}")
    print("=" * 60 + "\n")

    demo.launch(
        share=True,
        server_name="0.0.0.0",  # reachable from the local network
        server_port=7860,
        show_error=True
    )
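For quick checks outside the Gradio UI, the same inference path can be exercised headlessly. The following is a minimal sketch, assuming the checkpoint above exists and using a placeholder image path (test.jpg is hypothetical):

# Headless sketch of the app's inference path (test.jpg is a placeholder).
import torch
import torchvision.transforms as transforms
from PIL import Image
from model import Yolov1
from utils import cellboxes_to_boxes, non_max_suppression

model = Yolov1(split_size=7, num_boxes=2, num_classes=20)
ckpt = torch.load("checkpoint_epoch_50.pth.tar", map_location="cpu")
model.load_state_dict(ckpt["state_dict"])
model.eval()

img = Image.open("test.jpg").convert("RGB")
x = transforms.Compose([transforms.Resize((448, 448)), transforms.ToTensor()])(img)
with torch.no_grad():
    preds = model(x.unsqueeze(0))
boxes = non_max_suppression(
    cellboxes_to_boxes(preds)[0],
    iou_threshold=0.5, threshold=0.4, box_format="midpoint",
)
print(boxes)  # [[class_idx, confidence, x_center, y_center, w, h], ...]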
dataset.py ADDED
@@ -0,0 +1,69 @@
import torch
import os
import pandas as pd
from PIL import Image


class VOCDataset(torch.utils.data.Dataset):
    """
    Uses the original parameters from the YOLOv1 paper:
    a 7x7 grid, 2 boxes per cell, 20 VOC classes.
    """
    def __init__(self, csv_file, img_dir, label_dir, S=7, B=2, C=20, transform=None):
        self.annotations = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.transform = transform  # function applied to the image (and its boxes)
        self.S = S
        self.B = B
        self.C = C

    def __len__(self):
        return len(self.annotations)  # number of rows in the CSV

    def __getitem__(self, index):
        label_path = os.path.join(self.label_dir, self.annotations.iloc[index, 1])
        boxes = []
        with open(label_path) as f:
            for label in f.readlines():
                class_label, x, y, width, height = [
                    float(v) if float(v) != int(float(v)) else int(float(v))
                    for v in label.replace("\n", "").split()
                ]
                boxes.append([class_label, x, y, width, height])

        img_path = os.path.join(self.img_dir, self.annotations.iloc[index, 0])
        image = Image.open(img_path)
        boxes = torch.tensor(boxes)

        if self.transform:
            image, boxes = self.transform(image, boxes)

        label_matrix = torch.zeros((self.S, self.S, self.C + 5 * self.B))
        for box in boxes:
            class_label, x, y, width, height = box.tolist()
            class_label = int(class_label)

            # (i, j) = row/column of the cell the box center falls into
            i, j = int(self.S * y), int(self.S * x)
            x_cell, y_cell = self.S * x - j, self.S * y - i

            # width/height expressed relative to the cell size
            width_cell, height_cell = (
                width * self.S,
                height * self.S,
            )

            # one object per cell: keep the first box that lands here
            if label_matrix[i, j, 20] == 0:
                label_matrix[i, j, 20] = 1

                box_coordinates = torch.tensor(
                    [x_cell, y_cell, width_cell, height_cell]
                )
                label_matrix[i, j, 21:25] = box_coordinates

                # one-hot encoding of the class
                label_matrix[i, j, class_label] = 1

        return image, label_matrix
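Note that VOCDataset calls self.transform(image, boxes) with two arguments, so a plain torchvision Compose will not drop in directly. A minimal sketch of a compatible wrapper follows; the CSV and directory paths are hypothetical placeholders:

import torchvision.transforms as T

class Compose:
    """Applies image-only transforms while passing the boxes through unchanged."""
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img, bboxes):
        for t in self.transforms:
            img = t(img)  # boxes are normalized, so a pure resize keeps them valid
        return img, bboxes

# Hypothetical paths, for illustration only.
transform = Compose([T.Resize((448, 448)), T.ToTensor()])
dataset = VOCDataset(
    csv_file="data/train.csv", img_dir="data/images",
    label_dir="data/labels", transform=transform,
)
image, label_matrix = dataset[0]  # label_matrix shape: (7, 7, 30)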
loss.py ADDED
@@ -0,0 +1,83 @@
import torch
import torch.nn as nn
from utils import intersection_over_union


class Loss_Yolo(nn.Module):
    def __init__(self, S=7, B=2, C=20):
        super().__init__()
        self.mse = nn.MSELoss(reduction="sum")

        self.S = S
        self.B = B
        self.C = C

        self.lambda_noobj = 0.5
        self.lambda_coord = 5

    def forward(self, predictions, target):
        predictions = predictions.reshape(-1, self.S, self.S, self.C + self.B * 5)

        # IoU of each predicted box against the target box
        iou_b1 = intersection_over_union(predictions[..., 21:25], target[..., 21:25])
        iou_b2 = intersection_over_union(predictions[..., 26:30], target[..., 21:25])
        ious = torch.cat([iou_b1.unsqueeze(0), iou_b2.unsqueeze(0)], dim=0)

        # bestbox = index of the responsible box (highest IoU)
        iou_maxes, bestbox = torch.max(ious, dim=0)
        exists_box = target[..., 20].unsqueeze(3)  # 1 if an object is in the cell

        # ===== box coordinate loss =====
        box_predictions = exists_box * (
            bestbox * predictions[..., 26:30]
            + (1 - bestbox) * predictions[..., 21:25]
        )

        box_targets = exists_box * target[..., 21:25]

        # sqrt of width/height as in the paper; sign/abs keep gradients stable
        box_predictions[..., 2:4] = torch.sign(box_predictions[..., 2:4]) * torch.sqrt(
            torch.abs(box_predictions[..., 2:4]) + 1e-6
        )
        box_targets[..., 2:4] = torch.sqrt(box_targets[..., 2:4])

        box_loss = self.mse(
            torch.flatten(box_predictions, end_dim=-2),
            torch.flatten(box_targets, end_dim=-2),
        )

        # ===== object loss =====
        pred_box = (
            bestbox * predictions[..., 25:26] + (1 - bestbox) * predictions[..., 20:21]
        )

        object_loss = self.mse(
            torch.flatten(exists_box * pred_box),
            torch.flatten(exists_box * target[..., 20:21]),
        )

        # ===== no-object loss (both boxes) =====
        no_object_loss = self.mse(
            torch.flatten((1 - exists_box) * predictions[..., 20:21], start_dim=1),
            torch.flatten((1 - exists_box) * target[..., 20:21], start_dim=1),
        )

        no_object_loss += self.mse(
            torch.flatten((1 - exists_box) * predictions[..., 25:26], start_dim=1),
            torch.flatten((1 - exists_box) * target[..., 20:21], start_dim=1)
        )

        # ===== class loss =====
        class_loss = self.mse(
            torch.flatten(exists_box * predictions[..., :20], end_dim=-2),
            torch.flatten(exists_box * target[..., :20], end_dim=-2),
        )

        loss = (
            self.lambda_coord * box_loss  # the first two rows in the paper
            + object_loss
            + self.lambda_noobj * no_object_loss
            + class_loss
        )

        return loss
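As a sanity check, the loss can be exercised on dummy tensors. A sketch, assuming the S=7, B=2, C=20 layout used throughout (20 class scores, then [obj, x, y, w, h] for each of the two boxes):

import torch

criterion = Loss_Yolo(S=7, B=2, C=20)
# predictions come flattened from the model head: (N, S*S*(C + B*5)) = (N, 1470)
predictions = torch.randn(2, 7 * 7 * 30)
target = torch.zeros(2, 7, 7, 30)
target[0, 3, 3, 20] = 1                                   # an object in cell (3, 3)
target[0, 3, 3, 21:25] = torch.tensor([0.5, 0.5, 1.0, 1.0])  # cell-relative x, y, w, h
target[0, 3, 3, 0] = 1                                    # class 0 ("aeroplane")
loss = criterion(predictions, target)
print(loss.item())  # a finite scalar; sum-reduced MSE over the batch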
model.py ADDED
@@ -0,0 +1,110 @@
import torch
import torch.nn as nn


class CNN(nn.Module):
    """
    Conv -> BatchNorm -> LeakyReLU block.
    **kwargs forwards the remaining conv arguments (kernel_size, stride, padding).
    bias=False because BatchNorm adds its own bias term.
    LeakyReLU: x if x > 0, else 0.1 * x.
    """
    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, x):
        return self.leakyrelu(self.batchnorm(self.conv(x)))


class Yolov1(nn.Module):
    def __init__(self, in_channels=3, split_size=7, num_boxes=2, num_classes=20):
        super().__init__()

        # Darknet backbone, built from scratch
        self.conv1 = CNN(in_channels, 64, kernel_size=7, stride=2, padding=3)
        self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv2 = CNN(64, 192, kernel_size=3, stride=1, padding=1)
        self.maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.conv3 = CNN(192, 128, kernel_size=1, stride=1, padding=0)
        self.conv4 = CNN(128, 256, kernel_size=3, stride=1, padding=1)
        self.conv5 = CNN(256, 256, kernel_size=1, stride=1, padding=0)
        self.conv6 = CNN(256, 512, kernel_size=3, stride=1, padding=1)
        self.maxpool3 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Block repeated 4 times: (1x1, 256) -> (3x3, 512)
        self.conv7 = CNN(512, 256, kernel_size=1, stride=1, padding=0)
        self.conv8 = CNN(256, 512, kernel_size=3, stride=1, padding=1)
        self.conv9 = CNN(512, 256, kernel_size=1, stride=1, padding=0)
        self.conv10 = CNN(256, 512, kernel_size=3, stride=1, padding=1)
        self.conv11 = CNN(512, 256, kernel_size=1, stride=1, padding=0)
        self.conv12 = CNN(256, 512, kernel_size=3, stride=1, padding=1)
        self.conv13 = CNN(512, 256, kernel_size=1, stride=1, padding=0)
        self.conv14 = CNN(256, 512, kernel_size=3, stride=1, padding=1)

        self.conv15 = CNN(512, 512, kernel_size=1, stride=1, padding=0)
        self.conv16 = CNN(512, 1024, kernel_size=3, stride=1, padding=1)
        self.maxpool4 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Block repeated 2 times: (1x1, 512) -> (3x3, 1024)
        self.conv17 = CNN(1024, 512, kernel_size=1, stride=1, padding=0)
        self.conv18 = CNN(512, 1024, kernel_size=3, stride=1, padding=1)
        self.conv19 = CNN(1024, 512, kernel_size=1, stride=1, padding=0)
        self.conv20 = CNN(512, 1024, kernel_size=3, stride=1, padding=1)

        self.conv21 = CNN(1024, 1024, kernel_size=3, stride=1, padding=1)
        self.conv22 = CNN(1024, 1024, kernel_size=3, stride=2, padding=1)
        self.conv23 = CNN(1024, 1024, kernel_size=3, stride=1, padding=1)
        self.conv24 = CNN(1024, 1024, kernel_size=3, stride=1, padding=1)

        # Model head
        S, B, C = split_size, num_boxes, num_classes
        self.fc1 = nn.Linear(1024 * S * S, 496)  # 496 instead of the paper's 4096, to keep the head small
        self.dropout = nn.Dropout(0.0)  # the paper uses 0.5
        self.leaky = nn.LeakyReLU(0.1)
        self.fc2 = nn.Linear(496, S * S * (C + B * 5))

    def forward(self, x):
        x = self.conv1(x)
        x = self.maxpool1(x)

        x = self.conv2(x)
        x = self.maxpool2(x)

        x = self.conv3(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = self.conv6(x)
        x = self.maxpool3(x)

        x = self.conv7(x)
        x = self.conv8(x)
        x = self.conv9(x)
        x = self.conv10(x)
        x = self.conv11(x)
        x = self.conv12(x)
        x = self.conv13(x)
        x = self.conv14(x)

        x = self.conv15(x)
        x = self.conv16(x)
        x = self.maxpool4(x)

        x = self.conv17(x)
        x = self.conv18(x)
        x = self.conv19(x)
        x = self.conv20(x)

        x = self.conv21(x)
        x = self.conv22(x)
        x = self.conv23(x)
        x = self.conv24(x)

        x = torch.flatten(x, start_dim=1)
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.leaky(x)
        x = self.fc2(x)
        return x
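A quick shape check: the three stride-2 convolutions/poolings plus the four max-pools reduce a 448x448 input to a 7x7 grid, so the head should emit S*S*(C + B*5) = 7*7*30 = 1470 values per image. A sketch:

import torch

model = Yolov1(in_channels=3, split_size=7, num_boxes=2, num_classes=20)
x = torch.randn(2, 3, 448, 448)  # batch of two 448x448 RGB images
out = model(x)
print(out.shape)  # expected: torch.Size([2, 1470]) == (N, S*S*(C + B*5))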
requirements.txt ADDED
@@ -0,0 +1,8 @@
torch
torchvision
gradio
numpy
pandas
opencv-python
pillow
matplotlib
utils.py ADDED
@@ -0,0 +1,337 @@
import torch
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from collections import Counter

def intersection_over_union(boxes_preds, boxes_labels, box_format="midpoint"):
    if box_format == "midpoint":
        box1_x1 = boxes_preds[..., 0:1] - boxes_preds[..., 2:3] / 2
        box1_y1 = boxes_preds[..., 1:2] - boxes_preds[..., 3:4] / 2
        box1_x2 = boxes_preds[..., 0:1] + boxes_preds[..., 2:3] / 2
        box1_y2 = boxes_preds[..., 1:2] + boxes_preds[..., 3:4] / 2
        box2_x1 = boxes_labels[..., 0:1] - boxes_labels[..., 2:3] / 2
        box2_y1 = boxes_labels[..., 1:2] - boxes_labels[..., 3:4] / 2
        box2_x2 = boxes_labels[..., 0:1] + boxes_labels[..., 2:3] / 2
        box2_y2 = boxes_labels[..., 1:2] + boxes_labels[..., 3:4] / 2

    if box_format == "corners":
        box1_x1 = boxes_preds[..., 0:1]
        box1_y1 = boxes_preds[..., 1:2]
        box1_x2 = boxes_preds[..., 2:3]
        box1_y2 = boxes_preds[..., 3:4]  # (N, 1)
        box2_x1 = boxes_labels[..., 0:1]
        box2_y1 = boxes_labels[..., 1:2]
        box2_x2 = boxes_labels[..., 2:3]
        box2_y2 = boxes_labels[..., 3:4]

    x1 = torch.max(box1_x1, box2_x1)
    y1 = torch.max(box1_y1, box2_y1)
    x2 = torch.min(box1_x2, box2_x2)
    y2 = torch.min(box1_y2, box2_y2)

    # .clamp(0) handles the case where the boxes do not intersect
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

    box1_area = abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
    box2_area = abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))

    return intersection / (box1_area + box2_area - intersection + 1e-6)


def non_max_suppression(bboxes, iou_threshold, threshold, box_format="corners"):
    """
    Performs Non-Max Suppression on the given bboxes.

    Parameters:
        bboxes (list): list of lists containing all bboxes, each specified
            as [class_pred, prob_score, x1, y1, x2, y2]
        iou_threshold (float): IoU above which a lower-scoring bbox of the
            same class is suppressed
        threshold (float): confidence threshold below which predicted bboxes
            are removed (independent of IoU)
        box_format (str): "midpoint" or "corners", specifying the bbox format

    Returns:
        list: bboxes remaining after NMS at the given IoU threshold
    """
    assert type(bboxes) == list

    bboxes = [box for box in bboxes if box[1] > threshold]
    bboxes = sorted(bboxes, key=lambda x: x[1], reverse=True)
    bboxes_after_nms = []

    while bboxes:
        chosen_box = bboxes.pop(0)

        bboxes = [
            box
            for box in bboxes
            if box[0] != chosen_box[0]
            or intersection_over_union(
                torch.tensor(chosen_box[2:]),
                torch.tensor(box[2:]),
                box_format=box_format,
            )
            < iou_threshold
        ]

        bboxes_after_nms.append(chosen_box)

    return bboxes_after_nms


def mean_average_precision(
    pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint", num_classes=20
):
    """
    Calculates mean average precision.

    Parameters:
        pred_boxes (list): list of lists containing all bboxes, each specified
            as [train_idx, class_prediction, prob_score, x1, y1, x2, y2]
        true_boxes (list): same format as pred_boxes, for the ground-truth boxes
        iou_threshold (float): IoU above which a predicted bbox counts as correct
        box_format (str): "midpoint" or "corners", specifying the bbox format
        num_classes (int): number of classes

    Returns:
        float: mAP across all classes at the given IoU threshold
    """
    # list storing the AP of each class
    average_precisions = []

    # used for numerical stability later on
    epsilon = 1e-6

    for c in range(num_classes):
        detections = []
        ground_truths = []

        # Go through all predictions and targets and keep only
        # the ones that belong to the current class c
        for detection in pred_boxes:
            if detection[1] == c:
                detections.append(detection)

        for true_box in true_boxes:
            if true_box[1] == c:
                ground_truths.append(true_box)

        # Count the ground-truth bboxes of each training example.
        # E.g. if img 0 has 3 boxes and img 1 has 5, Counter gives:
        # amount_bboxes = {0: 3, 1: 5}
        amount_bboxes = Counter([gt[0] for gt in ground_truths])

        # Convert each count to a zero tensor used to mark matched boxes:
        # amount_bboxes = {0: torch.tensor([0, 0, 0]), 1: torch.tensor([0, 0, 0, 0, 0])}
        for key, val in amount_bboxes.items():
            amount_bboxes[key] = torch.zeros(val)

        # sort by box probability, which is index 2
        detections.sort(key=lambda x: x[2], reverse=True)
        TP = torch.zeros((len(detections)))
        FP = torch.zeros((len(detections)))
        total_true_bboxes = len(ground_truths)

        # If no ground truth exists for this class we can safely skip it
        if total_true_bboxes == 0:
            continue

        for detection_idx, detection in enumerate(detections):
            # Only take the ground truths that share the
            # training idx of this detection
            ground_truth_img = [
                bbox for bbox in ground_truths if bbox[0] == detection[0]
            ]

            best_iou = 0
            best_gt_idx = 0

            for idx, gt in enumerate(ground_truth_img):
                iou = intersection_over_union(
                    torch.tensor(detection[3:]),
                    torch.tensor(gt[3:]),
                    box_format=box_format,
                )

                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = idx

            if best_iou > iou_threshold:
                # count each ground-truth box only once
                if amount_bboxes[detection[0]][best_gt_idx] == 0:
                    # true positive: mark this ground-truth box as seen
                    TP[detection_idx] = 1
                    amount_bboxes[detection[0]][best_gt_idx] = 1
                else:
                    FP[detection_idx] = 1

            # if the IoU is below the threshold, the detection is a false positive
            else:
                FP[detection_idx] = 1

        TP_cumsum = torch.cumsum(TP, dim=0)
        FP_cumsum = torch.cumsum(FP, dim=0)
        recalls = TP_cumsum / (total_true_bboxes + epsilon)
        precisions = torch.divide(TP_cumsum, (TP_cumsum + FP_cumsum + epsilon))
        precisions = torch.cat((torch.tensor([1]), precisions))
        recalls = torch.cat((torch.tensor([0]), recalls))
        # torch.trapz for numerical integration of the precision-recall curve
        average_precisions.append(torch.trapz(precisions, recalls))

    return sum(average_precisions) / len(average_precisions)


def plot_image(image, boxes):
    """Plots predicted bounding boxes on the image."""
    im = np.array(image)
    height, width, _ = im.shape

    # Create figure and axes
    fig, ax = plt.subplots(1)
    # Display the image
    ax.imshow(im)

    # box[0] is the x midpoint, box[2] the width
    # box[1] is the y midpoint, box[3] the height

    # Create a Rectangle patch for each box
    for box in boxes:
        box = box[2:]
        assert len(box) == 4, "Got more values than x, y, w, h in a box!"
        upper_left_x = box[0] - box[2] / 2
        upper_left_y = box[1] - box[3] / 2
        rect = patches.Rectangle(
            (upper_left_x * width, upper_left_y * height),
            box[2] * width,
            box[3] * height,
            linewidth=1,
            edgecolor="r",
            facecolor="none",
        )
        # Add the patch to the Axes
        ax.add_patch(rect)

    plt.show()

def get_bboxes(
    loader,
    model,
    iou_threshold,
    threshold,
    pred_format="cells",
    box_format="midpoint",
    device="cuda",
):
    all_pred_boxes = []
    all_true_boxes = []

    # make sure the model is in eval mode before collecting bboxes
    model.eval()
    train_idx = 0

    for batch_idx, (x, labels) in enumerate(loader):
        x = x.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            predictions = model(x)

        batch_size = x.shape[0]
        true_bboxes = cellboxes_to_boxes(labels)
        bboxes = cellboxes_to_boxes(predictions)

        for idx in range(batch_size):
            nms_boxes = non_max_suppression(
                bboxes[idx],
                iou_threshold=iou_threshold,
                threshold=threshold,
                box_format=box_format,
            )

            # if batch_idx == 0 and idx == 0:
            #     plot_image(x[idx].permute(1, 2, 0).to("cpu"), nms_boxes)
            #     print(nms_boxes)

            for nms_box in nms_boxes:
                all_pred_boxes.append([train_idx] + nms_box)

            for box in true_bboxes[idx]:
                # many boxes have confidence 0 and are filtered out here
                if box[1] > threshold:
                    all_true_boxes.append([train_idx] + box)

            train_idx += 1

    model.train()
    return all_pred_boxes, all_true_boxes


def convert_cellboxes(predictions, S=7):
    """
    Converts the YOLO output boxes for a grid of size S from cell-relative
    coordinates into whole-image ratios. This implementation is vectorized,
    which makes it harder to read; an equivalent version with two for loops
    over range(S), converting the cells one by one, would be slower but more
    intuitive.
    """
    predictions = predictions.to("cpu")
    batch_size = predictions.shape[0]
    predictions = predictions.reshape(batch_size, S, S, 30)  # 30 = 20 classes + 2 boxes * 5
    bboxes1 = predictions[..., 21:25]
    bboxes2 = predictions[..., 26:30]
    scores = torch.cat(
        (predictions[..., 20].unsqueeze(0), predictions[..., 25].unsqueeze(0)), dim=0
    )
    best_box = scores.argmax(0).unsqueeze(-1)
    best_boxes = bboxes1 * (1 - best_box) + best_box * bboxes2
    cell_indices = torch.arange(S).repeat(batch_size, S, 1).unsqueeze(-1)
    x = 1 / S * (best_boxes[..., :1] + cell_indices)
    y = 1 / S * (best_boxes[..., 1:2] + cell_indices.permute(0, 2, 1, 3))
    w_y = 1 / S * best_boxes[..., 2:4]
    converted_bboxes = torch.cat((x, y, w_y), dim=-1)
    predicted_class = predictions[..., :20].argmax(-1).unsqueeze(-1)
    best_confidence = torch.max(predictions[..., 20], predictions[..., 25]).unsqueeze(
        -1
    )
    converted_preds = torch.cat(
        (predicted_class, best_confidence, converted_bboxes), dim=-1
    )

    return converted_preds


def cellboxes_to_boxes(out, S=7):
    converted_pred = convert_cellboxes(out).reshape(out.shape[0], S * S, -1)
    converted_pred[..., 0] = converted_pred[..., 0].long()
    all_bboxes = []

    for ex_idx in range(out.shape[0]):
        bboxes = []

        for bbox_idx in range(S * S):
            bboxes.append([x.item() for x in converted_pred[ex_idx, bbox_idx, :]])
        all_bboxes.append(bboxes)

    return all_bboxes

def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)


def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])
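A toy run of non_max_suppression (a sketch) illustrating two heavily overlapping boxes of the same class collapsing to one, while a box of a different class survives:

from utils import non_max_suppression

boxes = [
    [0, 0.9, 0.50, 0.50, 0.40, 0.40],  # [class, score, x, y, w, h] (midpoint format)
    [0, 0.6, 0.52, 0.52, 0.40, 0.40],  # IoU ~0.82 with the first box -> suppressed
    [1, 0.8, 0.20, 0.20, 0.10, 0.10],  # different class -> kept regardless of IoU
]
kept = non_max_suppression(boxes, iou_threshold=0.5, threshold=0.4, box_format="midpoint")
print(kept)  # expected: the 0.9 class-0 box and the 0.8 class-1 box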