# NOTE: removed non-Python web-scrape artifacts that preceded this file
# ("stevfoy's picture", "extra", commit hash "985c437") — they broke parsing.
import math
import torch
import torch.nn as nn
from .utils import to_cpu
# This new loss function is based on https://github.com/ultralytics/yolov3/blob/master/utils/loss.py
def bbox_iou(box1, box2, x1y1x2y2=True, GIoU=False, DIoU=False, CIoU=False, eps=1e-9):
    """Return the IoU of box1 (shape 4 or 4xn) against box2 (shape nx4).

    With one of GIoU/DIoU/CIoU set, the corresponding generalized metric is
    returned instead of plain IoU. When several flags are set, DIoU takes
    precedence over CIoU, and either takes precedence over GIoU (this mirrors
    the original branch order).
    """
    box2 = box2.T
    if x1y1x2y2:
        # Inputs are already corner coordinates (x1, y1, x2, y2).
        b1_x1, b1_y1, b1_x2, b1_y2 = box1
        b2_x1, b2_y1, b2_x2, b2_y2 = box2
    else:
        # Inputs are (cx, cy, w, h); convert to corner coordinates.
        half_w1, half_h1 = box1[2] / 2, box1[3] / 2
        half_w2, half_h2 = box2[2] / 2, box2[3] / 2
        b1_x1, b1_x2 = box1[0] - half_w1, box1[0] + half_w1
        b1_y1, b1_y2 = box1[1] - half_h1, box1[1] + half_h1
        b2_x1, b2_x2 = box2[0] - half_w2, box2[0] + half_w2
        b2_y1, b2_y2 = box2[1] - half_h2, box2[1] + half_h2

    # Overlap area, clamped so disjoint boxes contribute zero.
    overlap_w = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(0)
    overlap_h = (torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1)).clamp(0)
    inter = overlap_w * overlap_h

    # Box sizes; eps on the heights guards the w/h atan ratios used by CIoU.
    w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
    w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
    union = w1 * h1 + w2 * h2 - inter + eps
    iou = inter / union

    if not (GIoU or DIoU or CIoU):
        return iou  # plain IoU

    # Width/height of the smallest enclosing (convex) box.
    cw = torch.max(b1_x2, b2_x2) - torch.min(b1_x1, b2_x1)
    ch = torch.max(b1_y2, b2_y2) - torch.min(b1_y1, b2_y1)

    if DIoU or CIoU:  # Distance / Complete IoU, https://arxiv.org/abs/1911.08287v1
        c2 = cw ** 2 + ch ** 2 + eps  # squared diagonal of the enclosing box
        # Squared distance between the two box centers.
        rho2 = ((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2
                + (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4
        if DIoU:
            return iou - rho2 / c2
        # CIoU adds an aspect-ratio consistency penalty on top of DIoU.
        # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47
        v = (4 / math.pi ** 2) * torch.pow(torch.atan(w2 / h2) - torch.atan(w1 / h1), 2)
        with torch.no_grad():
            alpha = v / ((1 + eps) - iou + v)
        return iou - (rho2 / c2 + v * alpha)

    # GIoU, https://arxiv.org/pdf/1902.09630.pdf
    c_area = cw * ch + eps  # enclosing box area
    return iou - (c_area - union) / c_area
def compute_loss(predictions, targets, model):
    """Compute the total YOLO training loss (box + objectness + class).

    Args:
        predictions: list with one tensor per yolo layer, indexed below as
            [img_in_batch, anchor, grid_j, grid_i, (x, y, w, h, obj, cls...)]
            — this layout is what the fancy indexing relies on; confirm
            against the model's output format.
        targets: tensor of target rows (image_id, class, x, y, w, h);
            see build_targets() for the expected row layout.
        model: network object; only used here to reach its yolo layers via
            build_targets().

    Returns:
        (loss, components): the scalar total loss (still attached to the
        graph) and a detached CPU tensor holding (lbox, lobj, lcls, loss).
    """
    # Check which device was used
    device = targets.device
    # Add placeholder varables for the different losses
    lcls, lbox, lobj = torch.zeros(1, device=device), torch.zeros(1, device=device), torch.zeros(1, device=device)
    # Build yolo targets
    tcls, tbox, indices, anchors = build_targets(predictions, targets, model) # targets
    # Define different loss functions classification
    BCEcls = nn.BCEWithLogitsLoss(
        pos_weight=torch.tensor([1.0], device=device))
    BCEobj = nn.BCEWithLogitsLoss(
        pos_weight=torch.tensor([1.0], device=device))
    # Calculate losses for each yolo layer
    for layer_index, layer_predictions in enumerate(predictions):
        # Get image ids, anchors, grid index i and j for each target in the current yolo layer
        b, anchor, grid_j, grid_i = indices[layer_index]
        # Build empty object target tensor with the same shape as the object prediction
        tobj = torch.zeros_like(layer_predictions[..., 0], device=device) # target obj
        # Get the number of targets for this layer.
        # Each target is a label box with some scaling and the association of an anchor box.
        # Label boxes may be associated to 0 or multiple anchors. So they are multiple times or not at all in the targets.
        num_targets = b.shape[0]
        # Check if there are targets for this batch
        if num_targets:
            # Load the corresponding values from the predictions for each of the targets
            ps = layer_predictions[b, anchor, grid_j, grid_i]
            # Regression of the box
            # Apply sigmoid to xy offset predictions in each cell that has a target
            pxy = ps[:, :2].sigmoid()
            # Apply exponent to wh predictions and multiply with the anchor box that matched best with the label for each cell that has a target
            pwh = torch.exp(ps[:, 2:4]) * anchors[layer_index]
            # Build box out of xy and wh
            pbox = torch.cat((pxy, pwh), 1)
            # Calculate CIoU or GIoU for each target with the predicted box for its cell + anchor
            # (pbox.T is 4xn; tbox entries are (x_offset, y_offset, w, h), hence x1y1x2y2=False)
            iou = bbox_iou(pbox.T, tbox[layer_index], x1y1x2y2=False, CIoU=True)
            # We want to minimize our loss so we and the best possible IoU is 1 so we take 1 - IoU and reduce it with a mean
            lbox += (1.0 - iou).mean() # iou loss
            # Classification of the objectness
            # Fill our empty object target tensor with the IoU we just calculated for each target at the targets position
            # detach() keeps the objectness target out of the box-regression graph
            tobj[b, anchor, grid_j, grid_i] = iou.detach().clamp(0).type(tobj.dtype) # Use cells with iou > 0 as object targets
            # Classification of the class
            # Check if we need to do a classification (number of classes > 1)
            if ps.size(1) - 5 > 1:
                # Hot one class encoding
                t = torch.zeros_like(ps[:, 5:], device=device) # targets
                t[range(num_targets), tcls[layer_index]] = 1
                # Use the tensor to calculate the BCE loss
                lcls += BCEcls(ps[:, 5:], t) # BCE
        # Classification of the objectness the sequel
        # Calculate the BCE loss between the on the fly generated target and the network prediction
        lobj += BCEobj(layer_predictions[..., 4], tobj) # obj loss
    # Fixed weighting of the three loss components (hyperparameters)
    lbox *= 0.05
    lobj *= 1.0
    lcls *= 0.5
    # Merge losses
    loss = lbox + lobj + lcls
    return loss, to_cpu(torch.cat((lbox, lobj, lcls, loss)))
def build_targets(p, targets, model):
    # Build targets for compute_loss(), input targets(image,class,x,y,w,h)
    """Assign each label box to yolo layers, anchors and grid cells.

    Args:
        p: list of per-layer prediction tensors, used only for their grid
            shape (p[i].shape[2:4] = grid_j, grid_i).
        targets: tensor of rows (image_id, class, x, y, w, h); x, y, w, h are
            presumably normalized to [0, 1] since they are scaled by the grid
            size below — confirm against the dataloader.
        model: must expose model.yolo_layers with .anchors and .stride.

    Returns:
        (tcls, tbox, indices, anch) — per-layer lists of target class ids,
        target boxes as (x_offset, y_offset, w, h) in grid-cell units,
        (image, anchor, grid_j, grid_i) index tuples, and matched anchors.
    """
    na, nt = 3, targets.shape[0]  # number of anchors, targets #TODO: na is hard-coded to 3
    tcls, tbox, indices, anch = [], [], [], []
    gain = torch.ones(7, device=targets.device)  # normalized to gridspace gain
    # Make a tensor that iterates 0-2 for 3 anchors and repeat that as many times as we have target boxes
    ai = torch.arange(na, device=targets.device).float().view(na, 1).repeat(1, nt)
    # Copy target boxes anchor size times and append an anchor index to each copy the anchor index is also expressed by the new first dimension
    targets = torch.cat((targets.repeat(na, 1, 1), ai[:, :, None]), 2)
    for i, yolo_layer in enumerate(model.yolo_layers):
        # Scale anchors by the yolo grid cell size so that an anchor with the size of the cell would result in 1
        anchors = yolo_layer.anchors / yolo_layer.stride
        # Add the number of yolo cells in this layer the gain tensor
        # The gain tensor matches the collums of our targets (img id, class, x, y, w, h, anchor id)
        gain[2:6] = torch.tensor(p[i].shape)[[3, 2, 3, 2]]  # xyxy gain
        # Scale targets by the number of yolo layer cells, they are now in the yolo cell coordinate system
        t = targets * gain
        # Check if we have targets
        if nt:
            # Calculate ration between anchor and target box for both width and height
            r = t[:, :, 4:6] / anchors[:, None]
            # Select the ratios that have the highest divergence in any axis and check if the ratio is less than 4
            j = torch.max(r, 1. / r).max(2)[0] < 4  # compare #TODO: the threshold 4 is hard-coded
            # Only use targets that have the correct ratios for their anchors
            # That means we only keep ones that have a matching anchor and we loose the anchor dimension
            # The anchor id is still saved in the 7th value of each target
            t = t[j]
        else:
            t = targets[0]
        # Extract image id in batch and class id
        b, c = t[:, :2].long().T
        # We isolate the target cell associations.
        # x, y, w, h are allready in the cell coordinate system meaning an x = 1.2 would be 1.2 times cellwidth
        gxy = t[:, 2:4]
        gwh = t[:, 4:6]  # grid wh
        # Cast to int to get an cell index e.g. 1.2 gets associated to cell 1
        gij = gxy.long()
        # Isolate x and y index dimensions
        gi, gj = gij.T  # grid xy indices
        # Convert anchor indexes to int
        a = t[:, 6].long()
        # Add target tensors for this yolo layer to the output lists
        # Add to index list and limit index range to prevent out of bounds
        # NOTE: clamp_ is in-place and gi/gj are views into gij, so this also
        # clamps the gij values used for the cell offsets appended just below.
        indices.append((b, a, gj.clamp_(0, gain[3].long() - 1), gi.clamp_(0, gain[2].long() - 1)))
        # Add to target box list and convert box coordinates from global grid coordinates to local offsets in the grid cell
        tbox.append(torch.cat((gxy - gij, gwh), 1))  # box
        # Add correct anchor for each target to the list
        anch.append(anchors[a])
        # Add class for each target to the list
        tcls.append(c)
    return tcls, tbox, indices, anch