GunaKoppula commited on
Commit
2f51281
·
1 Parent(s): 70a1c01

Added files

Browse files
Files changed (14) hide show
  1. README.md +14 -5
  2. YOLOv3.pth +3 -0
  3. app.py +195 -0
  4. batch_sampler.py +47 -0
  5. config.py +103 -0
  6. dataset.py +215 -0
  7. dataset_org.py +127 -0
  8. examples/1.jpg +0 -0
  9. examples/2.jpg +0 -0
  10. loss.py +79 -0
  11. model.py +218 -0
  12. requirements.txt +9 -0
  13. train.py +180 -0
  14. utils.py +588 -0
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: Session13
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: red
6
  sdk: gradio
7
  sdk_version: 3.40.1
8
  app_file: app.py
@@ -10,4 +10,13 @@ pinned: false
10
  license: mit
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Object Detection With Yolov3
3
+ emoji: 📈
4
+ colorFrom: yellow
5
+ colorTo: purple
6
  sdk: gradio
7
  sdk_version: 3.40.1
8
  app_file: app.py
 
10
  license: mit
11
  ---
12
 
13
+ # YOLOv3 Object Detection App
14
+ Welcome to the YOLOv3 Object Detection App! This repository showcases an interactive application that combines the power of YOLOv3, a state-of-the-art object detection model, with the elegance of Gradio.
15
+
16
+ ## What is YOLOv3?
17
+ YOLO (You Only Look Once) is an advanced object detection algorithm that stands out for its real-time performance. YOLOv3, the third iteration of YOLO, further refines its predecessor's accuracy and speed by leveraging a series of convolutional layers to predict bounding boxes and class probabilities.
18
+
19
+ ## How does the App Work?
20
+ The YOLOv3 Object Detection App allows you to experience the magic of YOLOv3 firsthand. Simply upload an image, and watch as the app processes it through the YOLOv3 model to identify and highlight objects of interest. The app then presents you with a visual output, displaying the image with bounding boxes around the detected objects.
21
+
22
+ Link to Github: https://github.com/selvaraj-sembulingam/ERA-V1/tree/main/Assignments/S13
YOLOv3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14ad7d1fda29ed91ed955e38615e8b9c66a42e376ad115a6f5e1140f7aece657
3
+ size 250325919
app.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import cv2
4
+ import torch
5
+ from torchvision import datasets, transforms
6
+ from PIL import Image
7
+ from train import YOLOv3Lightning
8
+ from utils import non_max_suppression, plot_image, cells_to_bboxes
9
+ from dataset import YOLODataset
10
+ import config
11
+ import albumentations as A
12
+ from albumentations.pytorch import ToTensorV2
13
+
14
+ import matplotlib.pyplot as plt
15
+ import matplotlib.patches as patches
16
+
17
+
18
# ---------------------------------------------------------------------------
# Model setup: restore the trained YOLOv3 Lightning module on CPU.
# ---------------------------------------------------------------------------
model = YOLOv3Lightning(config)
model.load_state_dict(torch.load('YOLOv3.pth', map_location=torch.device('cpu')), strict=False)
model.eval()

# Anchors rescaled to the three prediction grids (config.S cells per side).
scaled_anchors = (
    torch.tensor(config.ANCHORS)
    * torch.tensor(config.S).unsqueeze(1).unsqueeze(1).repeat(1, 3, 2)
).to("cpu")

# Inference-time preprocessing: letterbox to 416x416, scale pixels to [0, 1].
test_transforms = A.Compose(
    [
        A.LongestMaxSize(max_size=416),
        A.PadIfNeeded(min_height=416, min_width=416, border_mode=cv2.BORDER_CONSTANT),
        A.Normalize(mean=[0, 0, 0], std=[1, 1, 1], max_pixel_value=255),
        ToTensorV2(),
    ]
)
40
+
41
+
42
class GradCAM:
    """Minimal Grad-CAM helper for a YOLOv3-style model.

    Hooks one sub-module (addressed by a dotted path in ``target_layer``),
    records its forward activation and the gradient flowing back into it,
    and converts the gradient-weighted activation into a heatmap in [0, 1].
    """

    def __init__(self, model, target_layer):
        self.model = model
        self.target_layer = target_layer
        self.gradients = None

        self.model.eval()
        self._register_hooks()

    def _register_hooks(self):
        def forward_hook(module, input, output):
            self.feature_map = output

        def backward_hook(module, grad_input, grad_output):
            self.gradients = grad_output[0]

        # Walk the dotted path (e.g. "model.layers.27.layers.0") down to
        # the concrete sub-module to hook.
        target_module = self.model
        for name in self.target_layer.split("."):
            target_module = target_module._modules[name]

        target_module.register_forward_hook(forward_hook)
        # register_backward_hook is deprecated and can report incorrect
        # gradients for composite modules; use the full variant instead.
        target_module.register_full_backward_hook(backward_hook)

    def _get_gradients_and_features(self, image):
        """Forward once, then backprop each output head separately.

        Returns (list of gradients captured at the hooked layer, one entry
        per output head — an entry is None when that head's graph does not
        reach the hooked layer — and the hooked layer's feature map).
        """
        self.model.zero_grad()
        outputs = self.model(image)

        gradients_list = []
        for output in outputs:
            self.gradients = None  # reset so an untriggered hook is visible as None
            output.backward(gradient=output, retain_graph=True)
            gradients_list.append(self.gradients)

        return gradients_list, self.feature_map

    def generate_heatmap(self, image):
        """Return an (H, W) numpy heatmap in [0, 1] for a (1, C, H, W) input."""
        gradients_list, feature_map = self._get_gradients_and_features(image)

        # Work on a detached copy so we never mutate a tensor that is still
        # part of the autograd graph.
        fmap = feature_map.detach().clone()
        for gradients in gradients_list:
            if gradients is not None:
                # Channel weights = global-average-pooled gradients.
                # (The original zipped the list against the feature map's
                # batch dimension and indexed a 3-D slice with 4 indices,
                # which raises IndexError whenever a gradient is captured.)
                pooled_gradients = torch.mean(gradients, dim=[0, 2, 3])
                fmap *= pooled_gradients.view(1, -1, 1, 1)

        heatmap = torch.mean(fmap, dim=1).squeeze().numpy()
        heatmap = np.maximum(heatmap, 0)
        # Guard the normalization: an all-zero map would otherwise become NaN.
        peak = np.max(heatmap)
        if peak > 0:
            heatmap /= peak

        return heatmap
93
+
94
+
95
def plot_image(image, boxes):
    """Draw predicted boxes + class labels on `image`; save as inference.png."""
    colormap = plt.get_cmap("tab20b")
    labels = config.PASCAL_CLASSES
    palette = [colormap(t) for t in np.linspace(0, 1, len(labels))]
    canvas = np.array(image)
    height, width, _ = canvas.shape

    # One figure with the image as background; rectangles layered on top.
    fig, ax = plt.subplots(1)
    ax.imshow(canvas)

    # Boxes arrive in midpoint format: [class, conf, x_mid, y_mid, w, h],
    # all coordinates normalized to [0, 1].
    for box in boxes:
        assert len(box) == 6, "box should contain class pred, confidence, x, y, width, height"
        cls_idx = int(box[0])
        x_mid, y_mid, box_w, box_h = box[2], box[3], box[4], box[5]
        left = x_mid - box_w / 2
        top = y_mid - box_h / 2
        ax.add_patch(
            patches.Rectangle(
                (left * width, top * height),
                box_w * width,
                box_h * height,
                linewidth=2,
                edgecolor=palette[cls_idx],
                facecolor="none",
            )
        )
        plt.text(
            left * width,
            top * height,
            s=labels[cls_idx],
            color="white",
            verticalalignment="top",
            bbox={"color": palette[cls_idx], "pad": 0},
        )
    ax.axis('off')
    plt.savefig('inference.png', bbox_inches='tight', pad_inches=0)
138
+
139
# Inference function
def inference(inp_image):
    """Detect objects in one RGB numpy image and build a Grad-CAM overlay.

    Returns the file names of the annotated image ('inference.png') and the
    Grad-CAM overlay ('gradcam.png').
    """
    source = inp_image
    x = test_transforms(image=inp_image)["image"].unsqueeze(0)
    out = model(x)

    raw_boxes = [[] for _ in range(x.shape[0])]

    # Decode every scale's cell predictions into normalized midpoint boxes.
    for i in range(3):
        grid_size = out[i].shape[2]
        anchor = scaled_anchors[i]
        for idx, box in enumerate(cells_to_bboxes(out[i], anchor, S=grid_size, is_preds=True)):
            raw_boxes[idx] += box

    # Keep only confident, non-overlapping boxes for the single input image.
    nms_boxes = non_max_suppression(
        raw_boxes[0], iou_threshold=0.5, threshold=0.6, box_format="midpoint",
    )
    plot_image(cv2.resize(source, (416, 416)), nms_boxes)
    plotted_img = 'inference.png'

    # Grad-CAM overlay on the resized input.
    grad_cam = GradCAM(model, target_layer="model.layers.27.layers.0")
    rgb = cv2.cvtColor(source, cv2.COLOR_BGR2RGB)
    rgb = cv2.resize(rgb, (416, 416))
    tensor = torch.from_numpy(rgb.transpose(2, 0, 1)).unsqueeze(0).float() / 255.0
    heatmap = grad_cam.generate_heatmap(tensor)
    heatmap = cv2.resize(heatmap, (tensor.shape[3], tensor.shape[2]))
    heatmap = cv2.applyColorMap(np.uint8(255 * heatmap), cv2.COLORMAP_JET)
    base = (tensor.squeeze().permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)
    overlay = np.clip(heatmap * 0.4 + base * 0.6, 0, 255).astype(np.uint8)
    overlay_bgr = cv2.cvtColor(overlay, cv2.COLOR_RGB2BGR)
    output_path = "gradcam.png"
    plt.imshow(overlay_bgr)
    plt.axis('off')
    plt.savefig(output_path, bbox_inches='tight', pad_inches=0)
    plt.close()
    gradcam_img = 'gradcam.png'

    return plotted_img, gradcam_img
185
+
186
# Gradio wiring: one image input, two image outputs (detections + Grad-CAM).
inputs = gr.inputs.Image(label="Original Image")
outputs = gr.outputs.Image(type="pil", label="Output Image")
gradcam = gr.outputs.Image(type="pil", label="GradCAM Image")
title = "YOLOv3 trained on PASCAL VOC"
description = """YOLOv3 Gradio demo for object detection
- Classes supported = aeroplane, bicycle, bird, boat, bottle, bus, car, cat, chair, cow, diningtable, dog, horse, motorbike, person, pottedplant, sheep, sofa, train, tvmonitor
"""
examples = [['examples/1.jpg'], ['examples/2.jpg']]
demo = gr.Interface(
    inference,
    inputs,
    [outputs, gradcam],
    title=title,
    examples=examples,
    description=description,
    theme='abidlabs/dracula_revamped',
)
demo.launch(debug=True)
batch_sampler.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch.utils.data import Sampler,RandomSampler,SequentialSampler
2
+ import numpy as np
3
+
4
class BatchSampler(object):
    """Batch sampler with optional multi-scale training support.

    Yields batches of ``[index, size]`` pairs. When ``multiscale_step`` is
    set, the target image ``size`` is re-drawn from ``img_sizes`` every
    ``multiscale_step`` batches, so a whole batch shares one resolution.

    Parameters
    ----------
    sampler : torch.utils.data.Sampler
        Source of dataset indices.
    batch_size : int
        Number of indices per yielded batch.
    drop_last : bool
        Drop the final incomplete batch if True.
    multiscale_step : int or None
        Re-draw the image size every this many batches (None disables).
    img_sizes : list[int] or None
        Candidate image sizes; required when ``multiscale_step`` is set.
    """

    def __init__(self, sampler, batch_size, drop_last, multiscale_step=None, img_sizes=None):
        if not isinstance(sampler, Sampler):
            raise ValueError("sampler should be an instance of "
                             "torch.utils.data.Sampler, but got sampler={}"
                             .format(sampler))
        if not isinstance(drop_last, bool):
            raise ValueError("drop_last should be a boolean value, but got "
                             "drop_last={}".format(drop_last))
        self.sampler = sampler
        self.batch_size = batch_size
        self.drop_last = drop_last
        if multiscale_step is not None and multiscale_step < 1:
            raise ValueError("multiscale_step should be > 0, but got "
                             "multiscale_step={}".format(multiscale_step))
        if multiscale_step is not None and img_sizes is None:
            # Fixed grammar of the original message ("img_sizes must a list").
            raise ValueError("img_sizes must be a list, but got img_sizes={} ".format(img_sizes))

        self.multiscale_step = multiscale_step
        self.img_sizes = img_sizes

    def __iter__(self):
        num_batch = 0
        batch = []
        size = 416  # default resolution until the first multiscale re-draw
        for idx in self.sampler:
            batch.append([idx, size])
            if len(batch) == self.batch_size:
                yield batch
                num_batch += 1
                batch = []
                # Re-draw the training resolution every `multiscale_step` batches.
                if self.multiscale_step and num_batch % self.multiscale_step == 0:
                    size = np.random.choice(self.img_sizes)
        if len(batch) > 0 and not self.drop_last:
            yield batch

    def __len__(self):
        if self.drop_last:
            return len(self.sampler) // self.batch_size
        # Ceiling division when the last partial batch is kept.
        return (len(self.sampler) + self.batch_size - 1) // self.batch_size
config.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import albumentations as A
2
+ import cv2
3
+ import torch
4
+ import os
5
+
6
+ from albumentations.pytorch import ToTensorV2
7
+ from utils import seed_everything
8
+
9
# ---------------------------------------------------------------------------
# Global configuration for training / evaluating YOLOv3 on PASCAL VOC.
# ---------------------------------------------------------------------------
DATASET = '/storage/PASCAL_VOC'
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
seed_everything()  # If you want deterministic behavior
# os.cpu_count() can return None (unknown CPU count); the original
# `os.cpu_count()-1` would then raise TypeError, and on a 1-core box it
# produced 0 only by luck. Clamp to a valid non-negative worker count.
NUM_WORKERS = max((os.cpu_count() or 1) - 1, 0)
BATCH_SIZE = 16
IMAGE_SIZE = 416
NUM_CLASSES = 20
LEARNING_RATE = 1e-5
WEIGHT_DECAY = 1e-4
NUM_EPOCHS = 40
CONF_THRESHOLD = 0.05  # minimum confidence kept before NMS
MAP_IOU_THRESH = 0.5   # IoU threshold used for mAP evaluation
NMS_IOU_THRESH = 0.45  # IoU threshold used for non-max suppression
S = [IMAGE_SIZE // 32, IMAGE_SIZE // 16, IMAGE_SIZE // 8]  # grids: 13/26/52
PIN_MEMORY = True
LOAD_MODEL = False
SAVE_MODEL = True
CHECKPOINT_FILE = "checkpoint.pth.tar"
IMG_DIR = DATASET + "/images/"
LABEL_DIR = DATASET + "/labels/"

# Anchor boxes per scale, (w, h) pairs.
ANCHORS = [
    [(0.28, 0.22), (0.38, 0.48), (0.9, 0.78)],
    [(0.07, 0.15), (0.15, 0.11), (0.14, 0.29)],
    [(0.02, 0.03), (0.04, 0.07), (0.08, 0.06)],
]  # Note these have been rescaled to be between [0, 1]

means = [0.485, 0.456, 0.406]

scale = 1.1
# Training pipeline: letterbox to a slightly larger canvas, random crop back
# to IMAGE_SIZE, then photometric / geometric augmentation.
train_transforms = A.Compose(
    [
        A.LongestMaxSize(max_size=int(IMAGE_SIZE * scale)),
        A.PadIfNeeded(
            min_height=int(IMAGE_SIZE * scale),
            min_width=int(IMAGE_SIZE * scale),
            border_mode=cv2.BORDER_CONSTANT,
        ),
        A.Rotate(limit=10, interpolation=1, border_mode=4),
        A.RandomCrop(width=IMAGE_SIZE, height=IMAGE_SIZE),
        A.ColorJitter(brightness=0.6, contrast=0.6, saturation=0.6, hue=0.6, p=0.4),
        A.OneOf(
            [
                A.ShiftScaleRotate(
                    rotate_limit=20, p=0.5, border_mode=cv2.BORDER_CONSTANT
                ),
                # A.Affine(shear=15, p=0.5, mode="constant"),
            ],
            p=1.0,
        ),
        A.HorizontalFlip(p=0.5),
        A.Blur(p=0.1),
        A.CLAHE(p=0.1),
        A.Posterize(p=0.1),
        A.ToGray(p=0.1),
        A.ChannelShuffle(p=0.05),
        A.Normalize(mean=[0, 0, 0], std=[1, 1, 1], max_pixel_value=255),
        ToTensorV2(),
    ],
    bbox_params=A.BboxParams(format="yolo", min_visibility=0.4, label_fields=[]),
)
# Evaluation pipeline: deterministic letterbox + scaling only.
test_transforms = A.Compose(
    [
        A.LongestMaxSize(max_size=IMAGE_SIZE),
        A.PadIfNeeded(
            min_height=IMAGE_SIZE, min_width=IMAGE_SIZE, border_mode=cv2.BORDER_CONSTANT
        ),
        A.Normalize(mean=[0, 0, 0], std=[1, 1, 1], max_pixel_value=255),
        ToTensorV2(),
    ],
    bbox_params=A.BboxParams(format="yolo", min_visibility=0.4, label_fields=[]),
)

PASCAL_CLASSES = [
    "aeroplane",
    "bicycle",
    "bird",
    "boat",
    "bottle",
    "bus",
    "car",
    "cat",
    "chair",
    "cow",
    "diningtable",
    "dog",
    "horse",
    "motorbike",
    "person",
    "pottedplant",
    "sheep",
    "sofa",
    "train",
    "tvmonitor",
]
dataset.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Creates a Pytorch dataset to load the Pascal VOC & MS COCO datasets
3
+ """
4
+
5
+ import config
6
+ import numpy as np
7
+ import os
8
+ import pandas as pd
9
+ import torch
10
+ from utils import xywhn2xyxy, xyxy2xywhn
11
+ import random
12
+ import torchvision.transforms as transforms
13
+ from batch_sampler import BatchSampler,RandomSampler,SequentialSampler
14
+
15
+ from PIL import Image, ImageFile
16
+ from torch.utils.data import Dataset, DataLoader
17
+ from utils import (
18
+ cells_to_bboxes,
19
+ iou_width_height as iou,
20
+ non_max_suppression as nms,
21
+ plot_image
22
+ )
23
+
24
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
25
+
26
class YOLODataset(Dataset):
    """Detection dataset yielding YOLOv3 training targets for three scales.

    Each item is ``(image, (t13, t26, t52))`` where every target tensor has
    shape (anchors_per_scale, S, S, 6) holding [obj, x, y, w, h, class].
    Mosaic augmentation is applied to roughly half of the samples, and the
    multi-resolution BatchSampler may pass ``[index, size]`` pairs instead
    of plain integer indices.
    """

    def __init__(
        self,
        csv_file,
        img_dir,
        label_dir,
        anchors,
        image_size=416,
        S=[13, 26, 52],
        C=20,
        transform=None,
    ):
        self.annotations = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.image_size = image_size
        self.mosaic_border = [image_size // 2, image_size // 2]
        self.transform = transform
        self.S = S
        # Flatten the per-scale anchor lists into one (9, 2) tensor.
        self.anchors = torch.tensor(anchors[0] + anchors[1] + anchors[2])
        self.num_anchors = self.anchors.shape[0]
        self.num_anchors_per_scale = self.num_anchors // 3
        self.C = C
        self.ignore_iou_thresh = 0.5

    def __len__(self):
        return len(self.annotations)

    def _load_sample(self, index):
        """Read one (image, boxes) pair; boxes are rolled to [x, y, w, h, class]."""
        label_path = os.path.join(self.label_dir, self.annotations.iloc[index, 1])
        boxes = np.roll(np.loadtxt(fname=label_path, delimiter=" ", ndmin=2), 4, axis=1).tolist()
        img_path = os.path.join(self.img_dir, self.annotations.iloc[index, 0])
        return np.array(Image.open(img_path).convert("RGB")), boxes

    def load_mosaic(self, index):
        """YOLOv5-style 4-image mosaic around a randomly chosen center."""
        labels4 = []
        s = self.image_size
        yc, xc = (int(random.uniform(x, 2 * s - x)) for x in self.mosaic_border)
        indices = [index] + random.choices(range(len(self)), k=3)
        random.shuffle(indices)
        for i, index in enumerate(indices):
            img, bboxes = self._load_sample(index)

            h, w = img.shape[0], img.shape[1]
            labels = np.array(bboxes)

            # Compute where this tile lands on the 2s x 2s canvas (..a
            # coordinates) and which crop of the source fills it (..b).
            if i == 0:  # top left
                img4 = np.full((s * 2, s * 2, img.shape[2]), 114, dtype=np.uint8)
                x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc
                x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h
            elif i == 1:  # top right
                x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc
                x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h
            elif i == 2:  # bottom left
                x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h)
                x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h)
            elif i == 3:  # bottom right
                x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, yc + h)
                x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h)

            img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b]
            padw = x1a - x1b
            padh = y1a - y1b

            # Shift normalized labels into mosaic pixel coordinates.
            if labels.size:
                labels[:, :-1] = xywhn2xyxy(labels[:, :-1], w, h, padw, padh)
            labels4.append(labels)

        # Merge, clip to the canvas, convert back to normalized xywh, and
        # drop boxes that collapsed to zero width or height.
        labels4 = np.concatenate(labels4, 0)
        for x in (labels4[:, :-1],):
            np.clip(x, 0, 2 * s, out=x)
        labels4[:, :-1] = xyxy2xywhn(labels4[:, :-1], 2 * s, 2 * s)
        labels4[:, :-1] = np.clip(labels4[:, :-1], 0, 1)
        labels4 = labels4[labels4[:, 2] > 0]
        labels4 = labels4[labels4[:, 3] > 0]
        return img4, labels4

    def resize(self, img, size):
        """Resize an image tensor for multi-resolution training."""
        return transforms.Resize((size, size))(img)

    def __getitem__(self, index):
        requested_size = None
        # The multiscale BatchSampler passes [index, size] pairs.
        if isinstance(index, list):
            requested_size = index[1]
            index = index[0]

        # Apply mosaic augmentation ~50% of the time.
        if random.random() >= 0.5:
            image, bboxes = self.load_mosaic(index)
        else:
            image, bboxes = self._load_sample(index)

        if self.transform:
            augmentations = self.transform(image=image, bboxes=bboxes)
            image = augmentations["image"]
            bboxes = augmentations["bboxes"]

        if requested_size:
            image = self.resize(image, requested_size)

        # Assign each ground-truth box to the best-matching anchor per scale
        # (paper setup: 3 scales, same number of anchors on each).
        targets = [torch.zeros((self.num_anchors // 3, S, S, 6)) for S in self.S]
        for box in bboxes:
            iou_anchors = iou(torch.tensor(box[2:4]), self.anchors)
            anchor_indices = iou_anchors.argsort(descending=True, dim=0)
            x, y, width, height, class_label = box
            has_anchor = [False] * 3  # one positive anchor per scale
            for anchor_idx in anchor_indices:
                scale_idx = anchor_idx // self.num_anchors_per_scale
                anchor_on_scale = anchor_idx % self.num_anchors_per_scale
                S = self.S[scale_idx]
                i, j = int(S * y), int(S * x)  # responsible cell
                anchor_taken = targets[scale_idx][anchor_on_scale, i, j, 0]
                if not anchor_taken and not has_anchor[scale_idx]:
                    targets[scale_idx][anchor_on_scale, i, j, 0] = 1
                    x_cell, y_cell = S * x - j, S * y - i  # offsets within the cell
                    # Width/height are cell-relative, so they may exceed 1.
                    targets[scale_idx][anchor_on_scale, i, j, 1:5] = torch.tensor(
                        [x_cell, y_cell, width * S, height * S]
                    )
                    targets[scale_idx][anchor_on_scale, i, j, 5] = int(class_label)
                    has_anchor[scale_idx] = True
                elif not anchor_taken and iou_anchors[anchor_idx] > self.ignore_iou_thresh:
                    targets[scale_idx][anchor_on_scale, i, j, 0] = -1  # ignore prediction

        return image, tuple(targets)
168
+
169
+
170
def test():
    """Smoke test: decode ground-truth targets back to boxes and plot them."""
    anchors = config.ANCHORS

    transform = config.test_transforms

    dataset = YOLODataset(
        config.DATASET + "/train.csv",
        config.DATASET + "/images/",
        config.DATASET + "/labels/",
        S=[13, 26, 52],
        anchors=anchors,
        transform=transform,
    )
    S = [13, 26, 52]
    # Scale normalized anchors up to grid units (anchors * S, broadcast).
    scaled_anchors = torch.tensor(anchors) / (
        1 / torch.tensor(S).unsqueeze(1).unsqueeze(1).repeat(1, 3, 2)
    )

    # Multi-resolution loader: single-sample batches, sizes in [320, 608].
    loader = DataLoader(
        dataset=dataset,
        batch_sampler=BatchSampler(
            SequentialSampler(dataset),
            batch_size=1,
            drop_last=True,
            multiscale_step=1,
            img_sizes=list(range(320, 608 + 1, 32)),
        ),
    )

    for x, y in loader:
        boxes = []
        for i in range(y[0].shape[1]):
            anchor = scaled_anchors[i]
            print(anchor.shape)
            print(y[i].shape)
            boxes += cells_to_bboxes(y[i], is_preds=False, S=y[i].shape[2], anchors=anchor)[0]
        boxes = nms(boxes, iou_threshold=1, threshold=0.7, box_format="midpoint")
        print(boxes)
        plot_image(x[0].permute(1, 2, 0).to("cpu"), boxes)


if __name__ == "__main__":
    test()
dataset_org.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Creates a Pytorch dataset to load the Pascal VOC & MS COCO datasets
3
+ """
4
+
5
+ import config
6
+ import numpy as np
7
+ import os
8
+ import pandas as pd
9
+ import torch
10
+
11
+ from PIL import Image, ImageFile
12
+ from torch.utils.data import Dataset, DataLoader
13
+ from utils import (
14
+ cells_to_bboxes,
15
+ iou_width_height as iou,
16
+ non_max_suppression as nms,
17
+ plot_image
18
+ )
19
+
20
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
21
+
22
class YOLODataset(Dataset):
    """Plain (non-mosaic) detection dataset producing YOLOv3 targets.

    Items are ``(image, (t13, t26, t52))``; each target tensor has shape
    (anchors_per_scale, S, S, 6) holding [obj, x, y, w, h, class].
    """

    def __init__(
        self,
        csv_file,
        img_dir,
        label_dir,
        anchors,
        image_size=416,
        S=[13, 26, 52],
        C=20,
        transform=None,
    ):
        self.annotations = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.image_size = image_size
        self.transform = transform
        self.S = S
        # All 9 anchors, flattened across the three scales.
        self.anchors = torch.tensor(anchors[0] + anchors[1] + anchors[2])
        self.num_anchors = self.anchors.shape[0]
        self.num_anchors_per_scale = self.num_anchors // 3
        self.C = C
        self.ignore_iou_thresh = 0.5

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        # Labels on disk are class-first; roll to [x, y, w, h, class].
        label_path = os.path.join(self.label_dir, self.annotations.iloc[index, 1])
        bboxes = np.roll(np.loadtxt(fname=label_path, delimiter=" ", ndmin=2), 4, axis=1).tolist()
        img_path = os.path.join(self.img_dir, self.annotations.iloc[index, 0])
        image = np.array(Image.open(img_path).convert("RGB"))

        if self.transform:
            augmented = self.transform(image=image, bboxes=bboxes)
            image = augmented["image"]
            bboxes = augmented["bboxes"]

        # One target grid per scale; the best-matching anchor per scale is
        # marked positive, near-misses above the IoU threshold are ignored.
        targets = [torch.zeros((self.num_anchors // 3, S, S, 6)) for S in self.S]
        for box in bboxes:
            iou_anchors = iou(torch.tensor(box[2:4]), self.anchors)
            anchor_indices = iou_anchors.argsort(descending=True, dim=0)
            x, y, width, height, class_label = box
            has_anchor = [False] * 3
            for anchor_idx in anchor_indices:
                scale_idx = anchor_idx // self.num_anchors_per_scale
                anchor_on_scale = anchor_idx % self.num_anchors_per_scale
                S = self.S[scale_idx]
                i, j = int(S * y), int(S * x)  # responsible cell
                anchor_taken = targets[scale_idx][anchor_on_scale, i, j, 0]
                if not anchor_taken and not has_anchor[scale_idx]:
                    targets[scale_idx][anchor_on_scale, i, j, 0] = 1
                    x_cell, y_cell = S * x - j, S * y - i  # offsets in [0, 1]
                    # Cell-relative sizes may exceed 1.
                    targets[scale_idx][anchor_on_scale, i, j, 1:5] = torch.tensor(
                        [x_cell, y_cell, width * S, height * S]
                    )
                    targets[scale_idx][anchor_on_scale, i, j, 5] = int(class_label)
                    has_anchor[scale_idx] = True
                elif not anchor_taken and iou_anchors[anchor_idx] > self.ignore_iou_thresh:
                    targets[scale_idx][anchor_on_scale, i, j, 0] = -1  # ignore prediction

        return image, tuple(targets)
91
+
92
+
93
def test():
    """Visual smoke test against a local COCO-format dump."""
    anchors = config.ANCHORS

    transform = config.test_transforms

    dataset = YOLODataset(
        "COCO/train.csv",
        "COCO/images/images/",
        "COCO/labels/labels_new/",
        S=[13, 26, 52],
        anchors=anchors,
        transform=transform,
    )
    S = [13, 26, 52]
    # Equivalent to multiplying the normalized anchors by each grid size.
    scaled_anchors = torch.tensor(anchors) / (
        1 / torch.tensor(S).unsqueeze(1).unsqueeze(1).repeat(1, 3, 2)
    )
    loader = DataLoader(dataset=dataset, batch_size=1, shuffle=True)
    for x, y in loader:
        boxes = []
        for i in range(y[0].shape[1]):
            anchor = scaled_anchors[i]
            print(anchor.shape)
            print(y[i].shape)
            boxes += cells_to_bboxes(y[i], is_preds=False, S=y[i].shape[2], anchors=anchor)[0]
        boxes = nms(boxes, iou_threshold=1, threshold=0.7, box_format="midpoint")
        print(boxes)
        plot_image(x[0].permute(1, 2, 0).to("cpu"), boxes)


if __name__ == "__main__":
    test()
examples/1.jpg ADDED
examples/2.jpg ADDED
loss.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Implementation of Yolo Loss Function similar to the one in Yolov3 paper,
3
+ the difference from what I can tell is I use CrossEntropy for the classes
4
+ instead of BinaryCrossEntropy.
5
+ """
6
+ import random
7
+ import torch
8
+ import torch.nn as nn
9
+
10
+ from utils import intersection_over_union
11
+
12
+
13
class YoloLoss(nn.Module):
    """YOLOv3 loss: box regression + objectness + no-object + class terms.

    Differs from the paper in using CrossEntropy for the classes instead of
    per-class BinaryCrossEntropy.
    """

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        self.bce = nn.BCEWithLogitsLoss()
        self.entropy = nn.CrossEntropyLoss()
        self.sigmoid = nn.Sigmoid()

        # Relative weights of the four loss components.
        self.lambda_class = 1
        self.lambda_noobj = 10
        self.lambda_obj = 1
        self.lambda_box = 10

    def forward(self, predictions, target, anchors):
        """Compute the loss for one prediction scale.

        predictions: (N, 3, S, S, 5 + num_classes) raw network output.
        target:      (N, 3, S, S, 6) with [obj, x, y, w, h, class].
        anchors:     (3, 2) anchors for this scale, in grid units.
        """
        # Cells with an object / without one (target == -1 means "ignore").
        obj = target[..., 0] == 1  # in paper this is Iobj_i
        noobj = target[..., 0] == 0  # in paper this is Inoobj_i

        # ======================= #
        #    FOR NO OBJECT LOSS   #
        # ======================= #
        no_object_loss = self.bce(
            (predictions[..., 0:1][noobj]), (target[..., 0:1][noobj]),
        )

        # ==================== #
        #    FOR OBJECT LOSS   #
        # ==================== #
        anchors = anchors.reshape(1, 3, 1, 1, 2)
        box_preds = torch.cat(
            [self.sigmoid(predictions[..., 1:3]), torch.exp(predictions[..., 3:5]) * anchors],
            dim=-1,
        )
        ious = intersection_over_union(box_preds[obj], target[..., 1:5][obj]).detach()
        object_loss = self.mse(self.sigmoid(predictions[..., 0:1][obj]), ious * target[..., 0:1][obj])

        # ======================== #
        #    FOR BOX COORDINATES   #
        # ======================== #
        # Build fresh tensors here: the original wrote the sigmoid / log
        # transforms back into `predictions` and `target` in place, silently
        # corrupting the caller's tensors (and any later reuse of them).
        pred_xy = self.sigmoid(predictions[..., 1:3])  # x, y offsets in [0, 1]
        pred_box = torch.cat([pred_xy, predictions[..., 3:5]], dim=-1)
        target_wh = torch.log(1e-16 + target[..., 3:5] / anchors)  # invert exp parametrization
        target_box = torch.cat([target[..., 1:3], target_wh], dim=-1)
        box_loss = self.mse(pred_box[obj], target_box[obj])

        # ================== #
        #    FOR CLASS LOSS  #
        # ================== #
        class_loss = self.entropy(
            (predictions[..., 5:][obj]), (target[..., 5][obj].long()),
        )

        return (
            self.lambda_box * box_loss
            + self.lambda_obj * object_loss
            + self.lambda_noobj * no_object_loss
            + self.lambda_class * class_loss
        )
model.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Implementation of YOLOv3 architecture
3
+ """
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+ """
9
+ Information about architecture config:
10
+ Tuple is structured by (filters, kernel_size, stride)
11
+ Every conv is a same convolution.
12
+ List is structured by "B" indicating a residual block followed by the number of repeats
13
+ "S" is for scale prediction block and computing the yolo loss
14
+ "U" is for upsampling the feature map and concatenating with a previous layer
15
+ """
16
# Backbone + head layout. Tuples are (filters, kernel_size, stride) same-padded
# convolutions, ["B", n] is a residual block repeated n times, "S1"/"S2"/"S3"
# are the scale-prediction heads, and "U" upsamples and concatenates a skip.
config = [
    (32, 3, 1),
    (64, 3, 2),
    ["B", 1],
    (128, 3, 2),
    ["B", 2],
    (256, 3, 2),
    ["B", 8],
    (512, 3, 2),
    ["B", 8],
    (1024, 3, 2),
    ["B", 4],  # To this point is Darknet-53
    (512, 1, 1),
    (1024, 3, 1),
    "S1",
    (256, 1, 1),
    "U",
    (256, 1, 1),
    (512, 3, 1),
    "S2",
    (128, 1, 1),
    "U",
    (128, 1, 1),
    (256, 3, 1),
    "S3",
]

# Output grid sizes for a 416x416 input (one per prediction scale).
S = [13, 26, 52]
44
+
45
class CNNBlock(nn.Module):
    """Conv2d optionally followed by BatchNorm + LeakyReLU(0.1).

    With ``bn_act=False`` the block degenerates to a bare (biased)
    convolution, as used by the detection heads.
    """

    def __init__(self, in_channels, out_channels, bn_act=True, **kwargs):
        super().__init__()
        # Bias is redundant when BatchNorm follows the convolution.
        self.conv = nn.Conv2d(in_channels, out_channels, bias=not bn_act, **kwargs)
        self.bn = nn.BatchNorm2d(out_channels)
        self.leaky = nn.LeakyReLU(0.1)
        self.use_bn_act = bn_act

    def forward(self, x):
        if not self.use_bn_act:
            return self.conv(x)
        return self.leaky(self.bn(self.conv(x)))
58
+
59
+
60
class ResidualBlock(nn.Module):
    """``num_repeats`` x (1x1 squeeze -> 3x3 expand) with optional skip add."""

    def __init__(self, channels, use_residual=True, num_repeats=1):
        super().__init__()
        self.layers = nn.ModuleList()
        for _ in range(num_repeats):
            pair = nn.Sequential(
                CNNBlock(channels, channels // 2, kernel_size=1),
                CNNBlock(channels // 2, channels, kernel_size=3, padding=1),
            )
            self.layers.append(pair)

        self.use_residual = use_residual
        self.num_repeats = num_repeats

    def forward(self, x):
        for layer in self.layers:
            out = layer(x)
            x = x + out if self.use_residual else out
        return x
83
+
84
+
85
class SPPBlock(nn.Module):
    """Spatial Pyramid Pooling: 1x1 reduce, parallel max-pools at several
    kernel sizes, channel-concat with the input, then 1x1 project to c2."""

    def __init__(self, c1, c2, k=(5, 9, 13)):
        super().__init__()
        c_ = c1 // 2  # Intermediate channels
        self.cv1 = nn.Conv2d(c1, c_, kernel_size=1, stride=1)
        # stride=1 with padding k//2 keeps the spatial size unchanged.
        self.pool_layers = nn.ModuleList(
            nn.MaxPool2d(kernel_size=size, stride=1, padding=size // 2) for size in k
        )
        self.cv2 = nn.Conv2d(c_ * (len(k) + 1), c2, kernel_size=1, stride=1)

    def forward(self, x):
        reduced = self.cv1(x)
        pooled = [reduced] + [pool(reduced) for pool in self.pool_layers]
        return self.cv2(torch.cat(pooled, dim=1))
102
+
103
+
104
class ScalePrediction(nn.Module):
    """Detection head for one scale.

    Applies SPP, pools to a fixed ``im_shape`` grid, and predicts
    ``3 * (num_classes + 5)`` channels per cell; the output is reshaped to
    (N, 3, H, W, num_classes + 5), i.e. one row per anchor box per cell.
    """

    def __init__(self, in_channels, num_classes, im_shape):
        super().__init__()
        self.im_shape = im_shape
        self.pred = nn.Sequential(
            SPPBlock(in_channels, in_channels),
            nn.AdaptiveMaxPool2d(self.im_shape),
            CNNBlock(in_channels, 2 * in_channels, kernel_size=3, padding=1),
            # Final layer is a plain conv (no BN/activation).
            CNNBlock(2 * in_channels, (num_classes + 5) * 3, bn_act=False, kernel_size=1),
        )
        self.num_classes = num_classes

    def forward(self, x):
        raw = self.pred(x)
        n, _, h, w = raw.shape
        # (N, 3*(C+5), H, W) -> (N, 3, C+5, H, W) -> (N, 3, H, W, C+5)
        return raw.reshape(n, 3, self.num_classes + 5, h, w).permute(0, 1, 3, 4, 2)
127
+
128
+
129
class YOLOv3(nn.Module):
    """YOLOv3 detector built from the module-level ``config`` spec.

    forward() returns a list of three predictions, coarse to fine:
    (N, 3, S_i, S_i, num_classes + 5) for S_i in S = [13, 26, 52].
    """

    def __init__(self, in_channels=3, num_classes=80):
        super().__init__()
        self.num_classes = num_classes
        self.in_channels = in_channels
        self.layers = self._create_conv_layers()

    def forward(self, x):
        outputs = []  # one entry per prediction scale
        route_connections = []  # skip features saved for later concatenation
        for layer in self.layers:
            if isinstance(layer, ScalePrediction):
                # Heads branch off; the main path continues with the pre-head x.
                outputs.append(layer(x))
                continue

            x = layer(x)

            if isinstance(layer, ResidualBlock) and layer.num_repeats == 8:
                # The two 8-repeat Darknet stages feed the upsample path.
                route_connections.append(x)

            elif isinstance(layer, nn.Upsample):
                x = torch.cat([x, route_connections[-1]], dim=1)
                route_connections.pop()

        return outputs

    def _create_conv_layers(self):
        """Translate ``config`` into an nn.ModuleList; its order defines forward()."""
        layers = nn.ModuleList()
        in_channels = self.in_channels

        for module in config:
            if isinstance(module, tuple):
                # (filters, kernel_size, stride) -> same-padded conv block.
                out_channels, kernel_size, stride = module
                layers.append(
                    CNNBlock(
                        in_channels,
                        out_channels,
                        kernel_size=kernel_size,
                        stride=stride,
                        padding=1 if kernel_size == 3 else 0,
                    )
                )
                in_channels = out_channels

            elif isinstance(module, list):
                # ["B", n] -> residual block with n repeats.
                num_repeats = module[1]
                layers.append(ResidualBlock(in_channels, num_repeats=num_repeats))

            elif isinstance(module, str):
                if module in ("S1", "S2", "S3"):
                    # "S<k>" -> prediction head on grid S[k-1].  The original
                    # code repeated this branch verbatim for each scale;
                    # layer order and construction are identical here.
                    scale_idx = int(module[1]) - 1
                    layers += [
                        ResidualBlock(in_channels, use_residual=False, num_repeats=1),
                        CNNBlock(in_channels, in_channels // 2, kernel_size=1),
                        ScalePrediction(
                            in_channels // 2,
                            num_classes=self.num_classes,
                            im_shape=S[scale_idx],
                        ),
                    ]
                    in_channels = in_channels // 2

                elif module == "U":
                    layers.append(nn.Upsample(scale_factor=2))
                    # Channels triple after concatenating with the route feature.
                    in_channels = in_channels * 3

        return layers
207
+
208
+
209
+ if __name__ == "__main__":
210
+ num_classes = 20
211
+ IMAGE_SIZE = 416
212
+ model = YOLOv3(num_classes=num_classes)
213
+ x = torch.randn((2, 3, IMAGE_SIZE, IMAGE_SIZE))
214
+ out = model(x)
215
+ assert model(x)[0].shape == (2, 3, IMAGE_SIZE//32, IMAGE_SIZE//32, num_classes + 5)
216
+ assert model(x)[1].shape == (2, 3, IMAGE_SIZE//16, IMAGE_SIZE//16, num_classes + 5)
217
+ assert model(x)[2].shape == (2, 3, IMAGE_SIZE//8, IMAGE_SIZE//8, num_classes + 5)
218
+ print("Success!")
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ grad-cam==1.4.8
2
+ gradio==3.39.0
3
+ gradio_client==0.3.0
4
+ numpy==1.22.4
5
+ torchvision
6
+ Pillow==9.4.0
7
+ torch
8
+ pytorch-lightning==2.0.6
9
+ albumentations
train.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Main file for training Yolo model on Pascal VOC and COCO dataset
3
+ """
4
+
5
+ import config
6
+ import torch
7
+ import torch.optim as optim
8
+
9
+ from model import YOLOv3
10
+ from tqdm import tqdm
11
+ from utils import (
12
+ mean_average_precision,
13
+ cells_to_bboxes,
14
+ get_evaluation_bboxes,
15
+ save_checkpoint,
16
+ load_checkpoint,
17
+ check_class_accuracy,
18
+ get_loaders,
19
+ plot_couple_examples
20
+ )
21
+ from loss import YoloLoss
22
+ import warnings
23
+ warnings.filterwarnings("ignore")
24
+
25
+ import pytorch_lightning as pl
26
+ from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
27
+ from torch.optim.lr_scheduler import OneCycleLR
28
+
29
+
30
class YOLOv3Lightning(pl.LightningModule):
    """LightningModule wrapping YOLOv3 with multi-scale YoloLoss.

    The `config` argument is the project's config *module* (not a dict):
    attributes such as NUM_CLASSES, ANCHORS, S, DEVICE, LEARNING_RATE,
    WEIGHT_DECAY, NUM_EPOCHS and DATASET are read from it.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.model = YOLOv3(num_classes=config.NUM_CLASSES)
        self.loss_fn = YoloLoss()
        # Anchors are given as fractions of the image; multiply by the grid
        # size of each scale so they are expressed in cell units.
        self.scaled_anchors = (
            torch.tensor(config.ANCHORS)
            * torch.tensor(config.S).unsqueeze(1).unsqueeze(1).repeat(1, 3, 2)
        ).to(config.DEVICE)
        # Per-step losses accumulated manually for epoch-end averaging.
        self.train_step_outputs = []
        self.validation_step_outputs = []

    def forward(self, x):
        return self.model(x)

    def get_loss(self, batch):
        """Sum of YoloLoss over the three prediction scales for one batch."""
        x, y = batch
        y0, y1, y2 = (
            y[0],
            y[1],
            y[2],
        )
        out = self(x)
        loss = (
            self.loss_fn(out[0], y0, self.scaled_anchors[0])
            + self.loss_fn(out[1], y1, self.scaled_anchors[1])
            + self.loss_fn(out[2], y2, self.scaled_anchors[2])
        )
        return loss

    def training_step(self, batch, batch_idx):
        loss = self.get_loss(batch)
        self.log("train/loss", loss, on_epoch=True, prog_bar=True, logger=True) # Logging the training loss for visualization
        self.train_step_outputs.append(loss)
        return loss

    def on_train_epoch_end(self):
        # NOTE(review): this hook reads the module-level `config` import in
        # some places and `self.config` in others — presumably the same
        # module; verify if they can ever diverge.
        print(f"\nCurrently epoch {self.current_epoch}")
        train_epoch_average = torch.stack(self.train_step_outputs).mean()
        self.train_step_outputs.clear()
        print(f"Train loss {train_epoch_average}")
        print("On Train loader:")
        class_accuracy, no_obj_accuracy, obj_accuracy = check_class_accuracy(self.model, self.train_loader, threshold=config.CONF_THRESHOLD)
        self.log("train/class_accuracy", class_accuracy, on_epoch=True, prog_bar=True, logger=True)
        self.log("train/no_obj_accuracy", no_obj_accuracy, on_epoch=True, prog_bar=True, logger=True)
        self.log("train/obj_accuracy", obj_accuracy, on_epoch=True, prog_bar=True, logger=True)

        val_epoch_average = torch.stack(self.validation_step_outputs).mean()
        self.validation_step_outputs.clear()
        print(f"Validation loss {val_epoch_average}")
        print("On Train Eval loader:")
        class_accuracy, no_obj_accuracy, obj_accuracy = check_class_accuracy(self.model, self.train_eval_loader, threshold=config.CONF_THRESHOLD)
        self.log("val/class_accuracy", class_accuracy, on_epoch=True, prog_bar=True, logger=True)
        self.log("val/no_obj_accuracy", no_obj_accuracy, on_epoch=True, prog_bar=True, logger=True)
        self.log("val/obj_accuracy", obj_accuracy, on_epoch=True, prog_bar=True, logger=True)

        # Plot sample detections every 10th epoch.
        if (self.current_epoch>0) and ((self.current_epoch+1) % 10 == 0):
            plot_couple_examples(self.model, self.test_loader, 0.6, 0.5, self.scaled_anchors)

        # Full test-set evaluation (accuracy + mAP) only at epoch 40.
        if (self.current_epoch>0) and (self.current_epoch+1 == 40):
            print("On Test loader:")
            test_class_accuracy, test_no_obj_accuracy, test_obj_accuracy = check_class_accuracy(self.model, self.test_loader, threshold=config.CONF_THRESHOLD)
            self.log("test/class_accuracy", test_class_accuracy, on_epoch=True, prog_bar=True, logger=True)
            self.log("test/no_obj_accuracy", test_no_obj_accuracy, on_epoch=True, prog_bar=True, logger=True)
            self.log("test/obj_accuracy", test_obj_accuracy, on_epoch=True, prog_bar=True, logger=True)
            pred_boxes, true_boxes = get_evaluation_bboxes(
                self.test_loader,
                self.model,
                iou_threshold=config.NMS_IOU_THRESH,
                anchors=config.ANCHORS,
                threshold=config.CONF_THRESHOLD,
            )
            mapval = mean_average_precision(
                pred_boxes,
                true_boxes,
                iou_threshold=config.MAP_IOU_THRESH,
                box_format="midpoint",
                num_classes=config.NUM_CLASSES,
            )
            print(f"MAP: {mapval.item()}")

            self.log("MAP", mapval.item(), on_epoch=True, prog_bar=True, logger=True)

    def validation_step(self, batch, batch_idx):
        loss = self.get_loss(batch)
        self.log("val/loss", loss, on_epoch=True, prog_bar=True, logger=True)
        self.validation_step_outputs.append(loss)
        return loss

    def configure_optimizers(self):
        """Adam + OneCycleLR stepped every batch (interval='step')."""
        optimizer = optim.Adam(
            self.parameters(),
            lr=self.config.LEARNING_RATE,
            weight_decay=self.config.WEIGHT_DECAY,
        )
        # Force the dataloaders to exist so steps_per_epoch can be computed.
        self.trainer.fit_loop.setup_data()
        dataloader = self.trainer.train_dataloader

        EPOCHS = config.NUM_EPOCHS
        lr_scheduler = OneCycleLR(
            optimizer,
            max_lr=1.0E-03,
            steps_per_epoch=len(dataloader),
            epochs=EPOCHS,
            pct_start=5/EPOCHS,  # warm up for roughly 5 epochs
            div_factor=100,
            three_phase=False,
            final_div_factor=100,
            anneal_strategy='linear'
        )

        scheduler = {"scheduler": lr_scheduler, "interval" : "step"}

        return [optimizer], [scheduler]

    def setup(self, stage=None):
        # Build all three loaders once; the *_dataloader hooks just return them.
        self.train_loader, self.test_loader, self.train_eval_loader = get_loaders(
            train_csv_path=self.config.DATASET + "/train.csv",
            test_csv_path=self.config.DATASET + "/test.csv",
        )

    def train_dataloader(self):
        return self.train_loader

    def val_dataloader(self):
        # Validation runs on the training set with test-time transforms.
        return self.train_eval_loader

    def test_dataloader(self):
        return self.test_loader
160
+
161
+
162
+ if __name__ == "__main__":
163
+
164
+ model = YOLOv3Lightning(config)
165
+
166
+ checkpoint = ModelCheckpoint(filename='last_epoch', save_last=True)
167
+ lr_rate_monitor = LearningRateMonitor(logging_interval="epoch")
168
+ trainer = pl.Trainer(
169
+ max_epochs=config.NUM_EPOCHS,
170
+ deterministic=False,
171
+ logger=True,
172
+ callbacks=[checkpoint, lr_rate_monitor],
173
+ enable_model_summary=False,
174
+ log_every_n_steps=1,
175
+ precision=16
176
+ )
177
+ print("Training Started by Selvaraj Sembulingam")
178
+ trainer.fit(model)
179
+ print("Training Completed by Selvaraj Sembulingam")
180
+ torch.save(model.state_dict(), 'YOLOv3.pth')
utils.py ADDED
@@ -0,0 +1,588 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import config
2
+ import matplotlib.pyplot as plt
3
+ import matplotlib.patches as patches
4
+ import numpy as np
5
+ import os
6
+ import random
7
+ import torch
8
+ from batch_sampler import BatchSampler,RandomSampler,SequentialSampler
9
+
10
+ from collections import Counter
11
+ from torch.utils.data import DataLoader
12
+ from tqdm import tqdm
13
+
14
+
15
def iou_width_height(boxes1, boxes2):
    """
    IoU between boxes compared by width/height only (anchor matching).

    Parameters:
        boxes1 (tensor): width and height of the first bounding boxes
        boxes2 (tensor): width and height of the second bounding boxes
    Returns:
        tensor: Intersection over union of the corresponding boxes
    """
    w1, h1 = boxes1[..., 0], boxes1[..., 1]
    w2, h2 = boxes2[..., 0], boxes2[..., 1]
    # Boxes are treated as sharing a corner, so overlap is min(w) * min(h).
    overlap = torch.min(w1, w2) * torch.min(h1, h2)
    combined = w1 * h1 + w2 * h2 - overlap
    return overlap / combined
30
+
31
+
32
def intersection_over_union(boxes_preds, boxes_labels, box_format="midpoint"):
    """
    Video explanation of this function:
    https://youtu.be/XXYG5ZWtjj0

    This function calculates intersection over union (iou) given pred boxes
    and target boxes.

    Parameters:
        boxes_preds (tensor): Predictions of Bounding Boxes (BATCH_SIZE, 4)
        boxes_labels (tensor): Correct labels of Bounding Boxes (BATCH_SIZE, 4)
        box_format (str): midpoint/corners, if boxes (x,y,w,h) or (x1,y1,x2,y2)

    Returns:
        tensor: Intersection over union for all examples

    Raises:
        ValueError: if box_format is neither "midpoint" nor "corners"
        (previously an unknown format fell through to a NameError).
    """

    if box_format == "midpoint":
        # Convert center+size to corner coordinates.
        box1_x1 = boxes_preds[..., 0:1] - boxes_preds[..., 2:3] / 2
        box1_y1 = boxes_preds[..., 1:2] - boxes_preds[..., 3:4] / 2
        box1_x2 = boxes_preds[..., 0:1] + boxes_preds[..., 2:3] / 2
        box1_y2 = boxes_preds[..., 1:2] + boxes_preds[..., 3:4] / 2
        box2_x1 = boxes_labels[..., 0:1] - boxes_labels[..., 2:3] / 2
        box2_y1 = boxes_labels[..., 1:2] - boxes_labels[..., 3:4] / 2
        box2_x2 = boxes_labels[..., 0:1] + boxes_labels[..., 2:3] / 2
        box2_y2 = boxes_labels[..., 1:2] + boxes_labels[..., 3:4] / 2

    elif box_format == "corners":
        box1_x1 = boxes_preds[..., 0:1]
        box1_y1 = boxes_preds[..., 1:2]
        box1_x2 = boxes_preds[..., 2:3]
        box1_y2 = boxes_preds[..., 3:4]
        box2_x1 = boxes_labels[..., 0:1]
        box2_y1 = boxes_labels[..., 1:2]
        box2_x2 = boxes_labels[..., 2:3]
        box2_y2 = boxes_labels[..., 3:4]

    else:
        raise ValueError(f"box_format must be 'midpoint' or 'corners', got {box_format!r}")

    x1 = torch.max(box1_x1, box2_x1)
    y1 = torch.max(box1_y1, box2_y1)
    x2 = torch.min(box1_x2, box2_x2)
    y2 = torch.min(box1_y2, box2_y2)

    # clamp(0) handles the disjoint-box case (negative overlap -> 0).
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)
    box1_area = abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
    box2_area = abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))

    # 1e-6 guards against division by zero for degenerate boxes.
    return intersection / (box1_area + box2_area - intersection + 1e-6)
79
+
80
+
81
def non_max_suppression(bboxes, iou_threshold, threshold, box_format="corners"):
    """
    Video explanation of this function:
    https://youtu.be/YDkjWEN8jNA

    Does Non Max Suppression given bboxes

    Parameters:
        bboxes (list): list of lists containing all bboxes with each bboxes
        specified as [class_pred, prob_score, x1, y1, x2, y2]
        iou_threshold (float): threshold where predicted bboxes is correct
        threshold (float): threshold to remove predicted bboxes (independent of IoU)
        box_format (str): "midpoint" or "corners" used to specify bboxes

    Returns:
        list: bboxes after performing NMS given a specific IoU threshold
    """

    # isinstance rather than type comparison (original used `type(...) ==`).
    assert isinstance(bboxes, list)

    # Drop low-confidence boxes, then process the rest best-first.
    bboxes = [box for box in bboxes if box[1] > threshold]
    bboxes = sorted(bboxes, key=lambda x: x[1], reverse=True)
    bboxes_after_nms = []

    while bboxes:
        chosen_box = bboxes.pop(0)

        # Keep boxes of a different class, or of the same class but with
        # little enough overlap with the chosen box.
        bboxes = [
            box
            for box in bboxes
            if box[0] != chosen_box[0]
            or intersection_over_union(
                torch.tensor(chosen_box[2:]),
                torch.tensor(box[2:]),
                box_format=box_format,
            )
            < iou_threshold
        ]

        bboxes_after_nms.append(chosen_box)

    return bboxes_after_nms
123
+
124
+
125
def mean_average_precision(
    pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint", num_classes=20
):
    """
    Video explanation of this function:
    https://youtu.be/FppOzcDvaDI

    This function calculates mean average precision (mAP)

    Parameters:
        pred_boxes (list): list of lists containing all bboxes with each bboxes
        specified as [train_idx, class_prediction, prob_score, x1, y1, x2, y2]
        true_boxes (list): Similar as pred_boxes except all the correct ones
        iou_threshold (float): threshold where predicted bboxes is correct
        box_format (str): "midpoint" or "corners" used to specify bboxes
        num_classes (int): number of classes

    Returns:
        float: mAP value across all classes given a specific IoU threshold
    """

    # list storing all AP for respective classes
    average_precisions = []

    # used for numerical stability later on
    epsilon = 1e-6

    for c in range(num_classes):
        detections = []
        ground_truths = []

        # Go through all predictions and targets,
        # and only add the ones that belong to the
        # current class c
        for detection in pred_boxes:
            if detection[1] == c:
                detections.append(detection)

        for true_box in true_boxes:
            if true_box[1] == c:
                ground_truths.append(true_box)

        # find the amount of bboxes for each training example
        # Counter here finds how many ground truth bboxes we get
        # for each training example, so let's say img 0 has 3,
        # img 1 has 5 then we will obtain a dictionary with:
        # amount_bboxes = {0:3, 1:5}
        amount_bboxes = Counter([gt[0] for gt in ground_truths])

        # We then go through each key, val in this dictionary
        # and convert to the following (w.r.t same example):
        # ammount_bboxes = {0:torch.tensor[0,0,0], 1:torch.tensor[0,0,0,0,0]}
        # Each zero flips to 1 once that ground-truth box has been matched,
        # so a second detection of the same GT counts as a false positive.
        for key, val in amount_bboxes.items():
            amount_bboxes[key] = torch.zeros(val)

        # sort by box probabilities which is index 2
        detections.sort(key=lambda x: x[2], reverse=True)
        TP = torch.zeros((len(detections)))
        FP = torch.zeros((len(detections)))
        total_true_bboxes = len(ground_truths)

        # If none exists for this class then we can safely skip
        if total_true_bboxes == 0:
            continue

        for detection_idx, detection in enumerate(detections):
            # Only take out the ground_truths that have the same
            # training idx as detection
            ground_truth_img = [
                bbox for bbox in ground_truths if bbox[0] == detection[0]
            ]

            num_gts = len(ground_truth_img)
            best_iou = 0

            for idx, gt in enumerate(ground_truth_img):
                iou = intersection_over_union(
                    torch.tensor(detection[3:]),
                    torch.tensor(gt[3:]),
                    box_format=box_format,
                )

                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = idx

            if best_iou > iou_threshold:
                # only detect ground truth detection once
                if amount_bboxes[detection[0]][best_gt_idx] == 0:
                    # true positive and add this bounding box to seen
                    TP[detection_idx] = 1
                    amount_bboxes[detection[0]][best_gt_idx] = 1
                else:
                    FP[detection_idx] = 1

            # if IOU is lower then the detection is a false positive
            else:
                FP[detection_idx] = 1

        # Running precision/recall over the ranked detections.
        TP_cumsum = torch.cumsum(TP, dim=0)
        FP_cumsum = torch.cumsum(FP, dim=0)
        recalls = TP_cumsum / (total_true_bboxes + epsilon)
        precisions = TP_cumsum / (TP_cumsum + FP_cumsum + epsilon)
        # Prepend the (recall=0, precision=1) point so the curve starts there.
        precisions = torch.cat((torch.tensor([1]), precisions))
        recalls = torch.cat((torch.tensor([0]), recalls))
        # torch.trapz for numerical integration
        average_precisions.append(torch.trapz(precisions, recalls))

    return sum(average_precisions) / len(average_precisions)
234
+
235
+
236
def plot_image(image, boxes):
    """Plots predicted bounding boxes on the image

    `boxes` entries are [class_pred, confidence, x, y, w, h] with coordinates
    normalized to [0, 1] in midpoint format; they are scaled by the image's
    pixel size before drawing.  Calls plt.show(), so this blocks in
    interactive backends.
    """
    cmap = plt.get_cmap("tab20b")
    # One fixed color per class, shared across calls.
    class_labels = config.COCO_LABELS if config.DATASET=='COCO' else config.PASCAL_CLASSES
    colors = [cmap(i) for i in np.linspace(0, 1, len(class_labels))]
    im = np.array(image)
    height, width, _ = im.shape

    # Create figure and axes
    fig, ax = plt.subplots(1)
    # Display the image
    ax.imshow(im)

    # box[0] is x midpoint, box[2] is width
    # box[1] is y midpoint, box[3] is height

    # Create a Rectangle patch
    for box in boxes:
        assert len(box) == 6, "box should contain class pred, confidence, x, y, width, height"
        class_pred = box[0]
        box = box[2:]
        # Midpoint -> top-left corner (still normalized).
        upper_left_x = box[0] - box[2] / 2
        upper_left_y = box[1] - box[3] / 2
        rect = patches.Rectangle(
            (upper_left_x * width, upper_left_y * height),
            box[2] * width,
            box[3] * height,
            linewidth=2,
            edgecolor=colors[int(class_pred)],
            facecolor="none",
        )
        # Add the patch to the Axes
        ax.add_patch(rect)
        plt.text(
            upper_left_x * width,
            upper_left_y * height,
            s=class_labels[int(class_pred)],
            color="white",
            verticalalignment="top",
            bbox={"color": colors[int(class_pred)], "pad": 0},
        )

    plt.show()
279
+
280
+
281
def get_evaluation_bboxes(
    loader,
    model,
    iou_threshold,
    anchors,
    threshold,
    box_format="midpoint",
    device="cuda",
):
    """Collect NMS-filtered predictions and ground truths for the whole loader.

    Returns two lists of [train_idx, class, score, x, y, w, h] rows
    (all_pred_boxes, all_true_boxes) suitable for mean_average_precision.
    `train_idx` is a global running image index across batches.
    """
    # make sure model is in eval before get bboxes
    model.eval()
    train_idx = 0
    all_pred_boxes = []
    all_true_boxes = []
    for batch_idx, (x, labels) in enumerate(loader):
        x = x.to(device)

        with torch.no_grad():
            predictions = model(x)

        batch_size = x.shape[0]
        bboxes = [[] for _ in range(batch_size)]
        for i in range(3):
            S = predictions[i].shape[2]
            # Anchors are normalized; scale to cell units for this grid.
            anchor = torch.tensor([*anchors[i]]).to(device) * S
            boxes_scale_i = cells_to_bboxes(
                predictions[i], anchor, S=S, is_preds=True
            )
            for idx, (box) in enumerate(boxes_scale_i):
                bboxes[idx] += box

        # we just want one bbox for each label, not one for each scale
        # NOTE(review): this reuses `anchor` and `S` from the last loop
        # iteration (the finest scale, labels[2]) — verify intended.
        true_bboxes = cells_to_bboxes(
            labels[2], anchor, S=S, is_preds=False
        )

        for idx in range(batch_size):
            nms_boxes = non_max_suppression(
                bboxes[idx],
                iou_threshold=iou_threshold,
                threshold=threshold,
                box_format=box_format,
            )

            for nms_box in nms_boxes:
                all_pred_boxes.append([train_idx] + nms_box)

            for box in true_bboxes[idx]:
                if box[1] > threshold:
                    all_true_boxes.append([train_idx] + box)

            train_idx += 1

    model.train()
    return all_pred_boxes, all_true_boxes
336
+
337
+
338
def cells_to_bboxes(predictions, anchors, S, is_preds=True):
    """
    Scales the predictions coming from the model to
    be relative to the entire image such that they for example later
    can be plotted or.
    INPUT:
    predictions: tensor of size (N, 3, S, S, num_classes+5)
    anchors: the anchors used for the predictions
    S: the number of cells the image is divided in on the width (and height)
    is_preds: whether the input is predictions or the true bounding boxes
    OUTPUT:
    converted_bboxes: the converted boxes of sizes (N, num_anchors, S, S, 1+5) with class index,
                      object score, bounding box coordinates
    """
    BATCH_SIZE = predictions.shape[0]
    num_anchors = len(anchors)
    # NOTE(review): this is a *view* into `predictions`; the sigmoid/exp
    # assignments below therefore mutate the caller's tensor in place.
    box_predictions = predictions[..., 1:5]
    if is_preds:
        anchors = anchors.reshape(1, len(anchors), 1, 1, 2)
        # Decode raw outputs: sigmoid for cell-relative xy, exp * anchor for wh.
        box_predictions[..., 0:2] = torch.sigmoid(box_predictions[..., 0:2])
        box_predictions[..., 2:] = torch.exp(box_predictions[..., 2:]) * anchors
        scores = torch.sigmoid(predictions[..., 0:1])
        best_class = torch.argmax(predictions[..., 5:], dim=-1).unsqueeze(-1)
    else:
        # Ground truth is already decoded; class id is stored directly.
        scores = predictions[..., 0:1]
        best_class = predictions[..., 5:6]

    # Per-cell column indices, used to shift cell-relative coords to image coords.
    cell_indices = (
        torch.arange(S)
        .repeat(predictions.shape[0], 3, S, 1)
        .unsqueeze(-1)
        .to(predictions.device)
    )
    x = 1 / S * (box_predictions[..., 0:1] + cell_indices)
    # Permute swaps the row/column axes so the same indices act as row offsets.
    y = 1 / S * (box_predictions[..., 1:2] + cell_indices.permute(0, 1, 3, 2, 4))
    w_h = 1 / S * box_predictions[..., 2:4]
    converted_bboxes = torch.cat((best_class, scores, x, y, w_h), dim=-1).reshape(BATCH_SIZE, num_anchors * S * S, 6)
    return converted_bboxes.tolist()
376
+
377
def check_class_accuracy(model, loader, threshold):
    """Compute class / no-obj / obj accuracies (in percent) over `loader`.

    Accuracy is measured per anchor cell: class accuracy on cells that contain
    an object, objectness accuracy against `threshold` on object and
    non-object cells separately.  Returns the three percentages as tensors.
    """
    model.eval()
    tot_class_preds, correct_class = 0, 0
    tot_noobj, correct_noobj = 0, 0
    tot_obj, correct_obj = 0, 0

    for idx, (x, y) in enumerate(loader):
        x = x.to(config.DEVICE)
        with torch.no_grad():
            out = model(x)

        for i in range(3):
            y[i] = y[i].to(config.DEVICE)
            obj = y[i][..., 0] == 1  # in paper this is Iobj_i
            noobj = y[i][..., 0] == 0  # in paper this is Inoobj_i

            correct_class += torch.sum(
                torch.argmax(out[i][..., 5:][obj], dim=-1) == y[i][..., 5][obj]
            )
            tot_class_preds += torch.sum(obj)

            obj_preds = torch.sigmoid(out[i][..., 0]) > threshold
            correct_obj += torch.sum(obj_preds[obj] == y[i][..., 0][obj])
            tot_obj += torch.sum(obj)
            correct_noobj += torch.sum(obj_preds[noobj] == y[i][..., 0][noobj])
            tot_noobj += torch.sum(noobj)

    # 1e-16 guards against division by zero on an empty loader.
    class_acc = (correct_class / (tot_class_preds + 1e-16)) * 100
    noobj_acc = (correct_noobj / (tot_noobj + 1e-16)) * 100
    obj_acc = (correct_obj / (tot_obj + 1e-16)) * 100

    # Bug fix: the original format spec was ":2f" (minimum field width 2,
    # default 6 decimals); ".2f" (two decimal places) was clearly intended.
    print(f"Class accuracy is: {class_acc:.2f}%")
    print(f"No obj accuracy is: {noobj_acc:.2f}%")
    print(f"Obj accuracy is: {obj_acc:.2f}%")
    model.train()

    return class_acc, noobj_acc, obj_acc
410
+
411
+
412
def get_mean_std(loader):
    """Per-channel mean and std over a loader of (data, label) batches.

    Uses the identity var[X] = E[X^2] - E[X]^2, averaging batch statistics;
    exact when all batches have the same size.
    """
    running_mean = 0
    running_sq_mean = 0
    batch_count = 0

    for data, _ in loader:
        running_mean = running_mean + data.mean(dim=[0, 2, 3])
        running_sq_mean = running_sq_mean + (data ** 2).mean(dim=[0, 2, 3])
        batch_count += 1

    mean = running_mean / batch_count
    std = (running_sq_mean / batch_count - mean ** 2) ** 0.5
    return mean, std
425
+
426
+
427
def save_checkpoint(model, optimizer, filename="my_checkpoint.pth.tar"):
    """Serialize model and optimizer state dicts to `filename`."""
    print("=> Saving checkpoint")
    torch.save(
        {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        },
        filename,
    )
434
+
435
+
436
def load_checkpoint(checkpoint_file, model, optimizer, lr):
    """Restore model/optimizer state and force every param group to `lr`."""
    print("=> Loading checkpoint")
    state = torch.load(checkpoint_file, map_location=config.DEVICE)
    model.load_state_dict(state["state_dict"])
    optimizer.load_state_dict(state["optimizer"])

    # If we don't do this then it will just have learning rate of old checkpoint
    # and it will lead to many hours of debugging \:
    for group in optimizer.param_groups:
        group["lr"] = lr
446
+
447
+
448
def get_loaders(train_csv_path, test_csv_path):
    """Build the three DataLoaders used for training and evaluation.

    Returns (train_loader, test_loader, train_eval_loader):
    - train_loader: training transforms, multi-scale batch sampler drawing
      image sizes from 320..608 in steps of 32.
    - test_loader: test transforms over the test CSV, fixed batch size.
    - train_eval_loader: the *training* CSV with test transforms, used for
      validation-style evaluation on training data.
    """
    # Imported here (not at module top) to avoid a circular import with dataset.py.
    from dataset import YOLODataset

    IMAGE_SIZE = config.IMAGE_SIZE
    train_dataset = YOLODataset(
        train_csv_path,
        transform=config.train_transforms,
        S=[IMAGE_SIZE // 32, IMAGE_SIZE // 16, IMAGE_SIZE // 8],
        img_dir=config.IMG_DIR,
        label_dir=config.LABEL_DIR,
        anchors=config.ANCHORS,
    )
    test_dataset = YOLODataset(
        test_csv_path,
        transform=config.test_transforms,
        S=[IMAGE_SIZE // 32, IMAGE_SIZE // 16, IMAGE_SIZE // 8],
        img_dir=config.IMG_DIR,
        label_dir=config.LABEL_DIR,
        anchors=config.ANCHORS,
    )
    # multiscale_step=1 re-samples the image size every batch.
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_sampler= BatchSampler(RandomSampler(train_dataset),
                            batch_size=config.BATCH_SIZE,
                            drop_last=False,
                            multiscale_step=1,
                            img_sizes=list(range(320, 608 + 1, 32))
                        ),
        num_workers=config.NUM_WORKERS,
        pin_memory=config.PIN_MEMORY,
    )
    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=config.BATCH_SIZE,
        num_workers=config.NUM_WORKERS,
        pin_memory=config.PIN_MEMORY,
        shuffle=False,
        drop_last=False,
    )

    train_eval_dataset = YOLODataset(
        train_csv_path,
        transform=config.test_transforms,
        S=[IMAGE_SIZE // 32, IMAGE_SIZE // 16, IMAGE_SIZE // 8],
        img_dir=config.IMG_DIR,
        label_dir=config.LABEL_DIR,
        anchors=config.ANCHORS,
    )
    train_eval_loader = DataLoader(
        dataset=train_eval_dataset,
        batch_size=config.BATCH_SIZE,
        num_workers=config.NUM_WORKERS,
        pin_memory=config.PIN_MEMORY,
        shuffle=False,
        drop_last=False,
    )

    return train_loader, test_loader, train_eval_loader
506
+
507
def plot_couple_examples(model, loader, thresh, iou_thresh, anchors):
    """Run one batch through `model` and plot NMS-filtered detections for the
    first quarter of the batch.

    Parameters:
        model: YOLOv3-style model producing three-scale predictions
        loader: DataLoader yielding (images, targets)
        thresh (float): confidence threshold for keeping boxes
        iou_thresh (float): IoU threshold for NMS
        anchors: per-scale anchors already scaled to cell units
    """
    model.eval()
    x, y = next(iter(loader))
    # Bug fix: the device was hard-coded to "cuda", which crashes on
    # CPU-only machines; follow the model's own device instead.
    device = next(model.parameters()).device
    x = x.to(device)
    with torch.no_grad():
        out = model(x)
        bboxes = [[] for _ in range(x.shape[0])]
        for i in range(3):
            batch_size, A, S, _, _ = out[i].shape
            anchor = anchors[i]
            boxes_scale_i = cells_to_bboxes(out[i], anchor, S=S, is_preds=True)
            # Accumulate boxes from all scales per image.
            for idx, box in enumerate(boxes_scale_i):
                bboxes[idx] += box

        model.train()

    for i in range(batch_size // 4):
        nms_boxes = non_max_suppression(
            bboxes[i], iou_threshold=iou_thresh, threshold=thresh, box_format="midpoint",
        )
        plot_image(x[i].permute(1, 2, 0).detach().cpu(), nms_boxes)
530
+
531
+
532
+
533
def seed_everything(seed=42):
    """Seed every RNG in play (hash, random, numpy, torch CPU and CUDA) and
    make cuDNN deterministic for reproducible runs."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,      # no-op on CPU-only installs
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)
    # Deterministic convolutions at the cost of autotuning.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
542
+
543
+
544
def clip_coords(boxes, img_shape):
    """In-place clamp of xyxy boxes (n, 4) to an image of (height, width)."""
    height, width = img_shape[0], img_shape[1]
    boxes[:, 0].clamp_(0, width)   # x1
    boxes[:, 1].clamp_(0, height)  # y1
    boxes[:, 2].clamp_(0, width)   # x2
    boxes[:, 3].clamp_(0, height)  # y2
550
+
551
+ def xywhn2xyxy(x, w=640, h=640, padw=0, padh=0):
552
+ # Convert nx4 boxes from [x, y, w, h] normalized to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
553
+ y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
554
+ y[..., 0] = w * (x[..., 0] - x[..., 2] / 2) + padw # top left x
555
+ y[..., 1] = h * (x[..., 1] - x[..., 3] / 2) + padh # top left y
556
+ y[..., 2] = w * (x[..., 0] + x[..., 2] / 2) + padw # bottom right x
557
+ y[..., 3] = h * (x[..., 1] + x[..., 3] / 2) + padh # bottom right y
558
+ return y
559
+
560
+
561
def xyn2xy(x, w=640, h=640, padw=0, padh=0):
    """Normalized segment points (n, 2) -> pixel points, with optional
    padding offsets. Works on torch tensors or numpy arrays; returns a copy."""
    out = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    out[..., 0] = x[..., 0] * w + padw  # x
    out[..., 1] = x[..., 1] * h + padh  # y
    return out
567
+
568
def xyxy2xywhn(x, w=640, h=640, clip=False, eps=0.0):
    """Pixel corner-format [x1, y1, x2, y2] -> normalized center-format
    [x, y, w, h]. With clip=True the input is first clamped (in place!) to
    the image, shrunk by `eps`. Returns a copy."""
    if clip:
        clip_boxes(x, (h - eps, w - eps))  # warning: inplace clip
    out = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    out[..., 0] = (x[..., 0] + x[..., 2]) / 2 / w  # x center
    out[..., 1] = (x[..., 1] + x[..., 3]) / 2 / h  # y center
    out[..., 2] = (x[..., 2] - x[..., 0]) / w      # width
    out[..., 3] = (x[..., 3] - x[..., 1]) / h      # height
    return out
578
+
579
def clip_boxes(boxes, shape):
    """In-place clamp of xyxy boxes to image `shape` = (height, width).

    Accepts either a torch tensor (clamped column by column, which is faster
    for tensors) or a numpy array (clipped in grouped column pairs).
    """
    height, width = shape[0], shape[1]
    if isinstance(boxes, torch.Tensor):  # faster individually
        boxes[..., 0].clamp_(0, width)   # x1
        boxes[..., 1].clamp_(0, height)  # y1
        boxes[..., 2].clamp_(0, width)   # x2
        boxes[..., 3].clamp_(0, height)  # y2
    else:  # np.array (faster grouped)
        boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, width)   # x1, x2
        boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, height)  # y1, y2