# nbm_v1 / faster_utils.py
# unknown
# add app files
# 82b8273
import numpy as np
import torch
import torch.nn as nn
# Utility functions for Faster R-CNN layers
class Config:
    """Static namespace of settings, read as ``Config.<name>``.

    Every value is a plain class attribute. ``scales`` and ``n_anchors`` are
    derived from the attributes declared just before them, so the declaration
    order inside the "Anchors" section matters.
    """

    # Runtime / bookkeeping
    verbose = False
    save_dir = './models'
    conv_net_path = 'checkpoints/vgg16_bn-6c64b313.pth'
    backbone = 'vgg'
    pretrain_conv_net = True
    device = 'cuda'
    classification = False

    # General params
    num_classes = 1
    input_channels = 1
    img_width = 1024
    img_height = 256
    use_biophonia = True
    fpn = True
    fpn_rpn = False
    fpn_p_channels = 256
    fpn_o_channels = 256
    normalize_input = False
    noise_strength = 0
    self_attention = False
    encode_frequency = False
    position_encoding = False
    transform = False

    # Anchors
    anchor_stride = 16
    base_size = 16
    ratios = [0.5, 1, 2]
    scales_factor_low = 0
    scales_factor_high = 4
    # Powers of two from 2**scales_factor_low up to 2**(scales_factor_high - 1)
    scales = np.power(2, np.arange(scales_factor_low, scales_factor_high))
    # One anchor per (ratio, scale) pair
    n_anchors = len(scales) * len(ratios)

    # Anchor Target Layer
    rpn_neg_label = 0.3
    rpn_pos_label = 0.7  # maybe lower this a bit
    rpn_batchsize = 16  # worth tuning; try reducing it drastically?
    rpn_fg_fraction = 0.5

    # Proposal Layer
    pre_nms_topN = 3000  # worth tuning
    min_threshold = 5  # minimum proposal size in px
    nms_thresh = 0.7
    post_nms_topN = 1000  # worth tuning
    post_nms_topN_eval = 50
    pre_nms_topN_eval = 500

    # Proposal Target Layer
    rcnn_batch_size = 16  # worth tuning
    rcnn_fg_prop = 0.4  # 0.25 in the original paper; try other values
    fg_threshold = 0.5
    bg_threshold_lo = 0.1
    bg_threshold_hi = 0.5

    # ROI Pooling
    roi_pool_h = 2  # to be changed (3? 4?)
    roi_pool_w = 2  # to be changed (3? 4?)
    hidden_size = 4096
    top_pyramid_roi_size = 128
    rcnn_attention = False

    # Inference
    proposal_number = 50  # number of proposals per class after last nms

    # Training
    lambda_reg_rpn_loss = 1.0  # try a few other values
    lambda_reg_rcnn_loss = 1.0  # try a few other values
    batch_size = 2
    val_size = 20
    learning_rate = 1e-4  # worth tuning
    validation_prop = 0.01
    n_epochs = 10
    save_every = 10
    scheduler_gamma = 0.1
    scheduler_milestones = [15, 25]
    cv_idx = -1
def generate_anchors(base_size, ratios, scales):
    """Build the reference anchors attached to a single grid cell.

    One anchor is produced per (scale, ratio) pair. For a ratio ``r`` the
    width is multiplied by ``sqrt(r)`` and the height by ``1 / sqrt(r)``, so
    the area is unchanged and width / height equals ``r``.

    Args:
        base_size: side length of the square base anchor.
        ratios: sequence (list, tuple or array) of width / height ratios.
        scales: sequence (list, tuple or array) of multipliers applied to the
            anchor sides.

    Returns:
        Integer array of shape ``(len(scales) * len(ratios), 4)`` whose rows
        are ``x1, y1, x2, y2``, ordered scale-major (every ratio of the first
        scale, then every ratio of the second scale, ...).
    """
    # Coerce to arrays so plain Python sequences work too: the column
    # broadcast on ``scales`` below is not supported by lists or tuples.
    ratios = np.asarray(ratios)
    scales = np.asarray(scales)

    base_anchor_wh = np.array([base_size, base_size])

    # Deform the base anchor dimensions to the given ratios
    sqrt_ratios = np.sqrt(ratios)
    coeffs = np.hstack([sqrt_ratios[:, np.newaxis], (1 / sqrt_ratios)[:, np.newaxis]])
    ratios_anchors_wh = coeffs * np.sqrt(np.prod(base_anchor_wh))

    # Expand the resulting anchor dimensions to the given scales
    all_anchor_whs = (ratios_anchors_wh.flatten() * scales[:, np.newaxis]).reshape(-1, 2)

    # Convert from (w, h) to (x1, y1, x2, y2) around the centre int(base_size / 2).
    # astype(int) truncates toward zero, so negative coordinates are not floored.
    center = int(base_size / 2)
    all_anchor = (np.hstack([-all_anchor_whs / 2, all_anchor_whs / 2]) + center).astype(int)
    return all_anchor
def get_anchor_shifts(width, height, anchor_stride):
    """Return the (x, y, x, y) offset of every cell of a ``width`` x ``height`` grid.

    Offsets are multiples of ``anchor_stride``. Cells are enumerated row by
    row (x varies fastest).

    Returns:
        Array of shape ``(width * height, 1, 4)``; the middle axis has size 1
        so the result broadcasts against an ``(n_anchors, 4)`` array.
    """
    xs = np.arange(0, width) * anchor_stride
    ys = np.arange(0, height) * anchor_stride
    # meshgrid yields (height, width) grids; ravel walks them row by row.
    grid_x, grid_y = np.meshgrid(xs, ys)
    flat_x = grid_x.ravel()
    flat_y = grid_y.ravel()
    # The same offset applies to both corners of a box.
    shifts = np.stack([flat_x, flat_y, flat_x, flat_y], axis=1)
    return shifts.reshape(-1, 1, 4)
def bbox_overlap(anchors, bbox):
    """
    Computes a K (anchors) x N (bbox) intersection over union matrix.

    Both inputs are indexed as 2-D tensors whose first four columns are
    x1, y1, x2, y2. Widths and heights are measured as ``x2 - x1 + 1``.
    Entry ``[k, n]`` of the result is the IoU of anchor ``k`` with box ``n``.
    """
    # Column vectors for anchors (K, 1), row vectors for boxes (1, N):
    # broadcasting then yields every anchor / box pair at once.
    a_x1, a_y1 = anchors[:, 0].unsqueeze(1), anchors[:, 1].unsqueeze(1)
    a_x2, a_y2 = anchors[:, 2].unsqueeze(1), anchors[:, 3].unsqueeze(1)
    b_x1, b_y1 = bbox[:, 0].unsqueeze(0), bbox[:, 1].unsqueeze(0)
    b_x2, b_y2 = bbox[:, 2].unsqueeze(0), bbox[:, 3].unsqueeze(0)

    # Disjoint pairs give a non-positive extent, clamped to zero.
    overlap_w = (torch.min(a_x2, b_x2) - torch.max(a_x1, b_x1) + 1).clamp(min=0)
    overlap_h = (torch.min(a_y2, b_y2) - torch.max(a_y1, b_y1) + 1).clamp(min=0)
    intersection = overlap_w * overlap_h

    area_a = (a_x2 - a_x1 + 1) * (a_y2 - a_y1 + 1)
    area_b = (b_x2 - b_x1 + 1) * (b_y2 - b_y1 + 1)
    union = (area_a + area_b) - intersection

    return intersection / union
def bbox_transform(anchors, bbox):
    """Encode boxes as regression targets relative to their anchors.

    Row ``i`` of ``bbox`` is paired with row ``i`` of ``anchors``; both are
    indexed as 2-D tensors with x1, y1, x2, y2 in the first four columns.

    Returns:
        Tensor of shape ``(n, 4)`` holding ``(t_x, t_y, t_w, t_h)``: centre
        offsets divided by the anchor size, and the log of the size ratios.
    """
    # Anchor size and centre (sizes are measured as x2 - x1 + 1)
    anchor_w = anchors[:, 2] - anchors[:, 0] + 1
    anchor_h = anchors[:, 3] - anchors[:, 1] + 1
    anchor_cx = anchors[:, 0] + 0.5 * anchor_w
    anchor_cy = anchors[:, 1] + 0.5 * anchor_h

    # Target box size and centre
    box_w = bbox[:, 2] - bbox[:, 0] + 1
    box_h = bbox[:, 3] - bbox[:, 1] + 1
    box_cx = bbox[:, 0] + 0.5 * box_w
    box_cy = bbox[:, 1] + 0.5 * box_h

    targets = [
        (box_cx - anchor_cx) / anchor_w,
        (box_cy - anchor_cy) / anchor_h,
        torch.log(box_w / anchor_w),
        torch.log(box_h / anchor_h),
    ]
    return torch.stack(targets, dim=1)
def weight_init(m):
    """Initialise ``m.weight`` in place, based on the class name of ``m``.

    * name contains ``BatchNorm``: weights drawn from N(0, 0.02);
    * name contains ``Linear`` but not ``LinearLayer``: Kaiming normal;
    * name contains ``Conv2d``: Kaiming normal.

    The three checks are independent, and any other module is left untouched.
    """
    classname = m.__class__.__name__
    if 'BatchNorm' in classname:
        m.weight.data.normal_(0.0, 0.02)
    if 'Linear' in classname and 'LinearLayer' not in classname:
        nn.init.kaiming_normal_(m.weight)
    if 'Conv2d' in classname:
        nn.init.kaiming_normal_(m.weight)
def collate_fn(list_batch):
    """Merge a list of ``(img, bb_coord, bird_id, img_info)`` samples into a batch.

    Images are stacked along a new leading axis, whereas box coordinates and
    bird ids are concatenated. ``lengths`` records how many boxes each sample
    contributed, so the concatenated tensors can be split back per image.

    Returns:
        ``[img_batch, bb_coord_batch, lengths, bird_ids, img_infos]``
    """
    images, boxes, bird_labels, img_infos = [], [], [], []
    for img, bb_cord, bird_id, img_info in list_batch:
        images.append(img)
        boxes.append(bb_cord)
        bird_labels.append(bird_id)
        img_infos.append(img_info)

    lengths = [sample_boxes.size(0) for sample_boxes in boxes]
    img_batch = torch.stack(images)
    bb_coord_batch = torch.cat(boxes, dim=0)
    bird_ids = torch.cat(bird_labels)
    return [img_batch, bb_coord_batch, lengths, bird_ids, img_infos]
def bbox_reg_to_coord(bbox_pred, anchors):
    """Decode regression outputs into rounded x1, y1, x2, y2 box coordinates.

    Args:
        bbox_pred: tensor whose last axis holds ``(t_x, t_y, t_w, t_h)``. The
            final stack on ``dim=2`` implies a ``(batch, n_boxes, 4)`` layout.
        anchors: 2-D tensor with x1, y1, x2, y2 in its first four columns,
            broadcast against the ``n_boxes`` axis of ``bbox_pred``.

    Returns:
        Tensor with the decoded corners on axis 2, each rounded to the
        nearest integer value.
    """
    # Anchor size and centre (sizes are measured as x2 - x1 + 1)
    anchor_w = anchors[:, 2] - anchors[:, 0] + 1
    anchor_h = anchors[:, 3] - anchors[:, 1] + 1
    anchor_cx = anchors[:, 0] + 0.5 * anchor_w
    anchor_cy = anchors[:, 1] + 0.5 * anchor_h

    dx, dy, dw, dh = (bbox_pred[..., i] for i in range(4))

    # Undo the encoding: offsets are relative to the anchor size, sizes are
    # predicted in log space.
    center_x = (dx * anchor_w) + anchor_cx
    center_y = (dy * anchor_h) + anchor_cy
    width = torch.exp(dw) * anchor_w
    height = torch.exp(dh) * anchor_h

    corners = (
        center_x - 0.5 * width,
        center_y - 0.5 * height,
        center_x + 0.5 * width,
        center_y + 0.5 * height,
    )
    return torch.stack([corner.round() for corner in corners], dim=2)
def batch_self_overlap(bbox_pred):
    """Pairwise intersection over union between the boxes of each batch element.

    Args:
        bbox_pred: tensor of shape ``(batch, n_boxes, 4)`` (only the first
            four entries of the last axis are read) holding x1, y1, x2, y2.
            Widths and heights are measured as ``x2 - x1 + 1``.

    Returns:
        Tensor of shape ``(batch, n_boxes, n_boxes)`` where entry
        ``[b, i, j]`` is the IoU of boxes ``i`` and ``j`` of batch element
        ``b``. With ``n_boxes == 0`` the result has shape ``(batch, 0, 0)``.
    """
    x1, y1, x2, y2 = (bbox_pred[..., i] for i in range(4))

    # Broadcasting (batch, n, 1) against (batch, 1, n) compares every pair
    # directly. This avoids materialising tiled (batch, n * n) copies of each
    # coordinate and is well defined when there are no boxes at all.
    right_boundaries = torch.min(x2.unsqueeze(2), x2.unsqueeze(1))
    left_boundaries = torch.max(x1.unsqueeze(2), x1.unsqueeze(1))
    x_intersec = (right_boundaries - left_boundaries + 1).clamp(min=0)

    top_boundaries = torch.min(y2.unsqueeze(2), y2.unsqueeze(1))
    bottom_boundaries = torch.max(y1.unsqueeze(2), y1.unsqueeze(1))
    y_intersec = (top_boundaries - bottom_boundaries + 1).clamp(min=0)

    intersection = x_intersec * y_intersec
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    union = (areas.unsqueeze(2) + areas.unsqueeze(1)) - intersection
    return intersection / union
def nms(bbox_pred, scores, nms_thresh=0.7, post_nms_topN=300, return_idx=False):
    """
    Applies non maximum suppression to the predicted bbox coordinates bbox_pred (shape batch_size * n_boxes * 4)
    scores are sorted in decreasing order, and bbox_pred coordinates are sorted accordingly for each batch idx

    Boxes are visited in the given order. A box is kept unless an earlier
    kept box overlaps it with IoU >= ``nms_thresh``.

    Args:
        bbox_pred: tensor of boxes, one row of x1, y1, x2, y2 per box.
        scores: tensor indexed as ``scores[batch_idx, box_idx]``.
        nms_thresh: IoU at or above which a later box is suppressed.
        post_nms_topN: maximum number of boxes kept per batch element.
        return_idx: when true, also return the kept indices.

    Returns:
        ``(bbox_pred, scores)`` restricted to the kept boxes. Every batch
        element is truncated to the same length (the smallest number of
        survivors, capped by ``post_nms_topN``) so the results can be stacked.
        With ``return_idx`` a third item is appended: the per-batch lists of
        kept indices, NOT truncated.
    """
    iou = batch_self_overlap(bbox_pred)
    # Threshold once and move the boolean matrix to host memory: the greedy
    # loop below is sequential, and doing it on plain arrays avoids one
    # device round trip per kept box.
    over_thresh = (iou >= nms_thresh).cpu().numpy()

    batch_keep = []
    batch_size = len(bbox_pred)
    for b_idx in range(batch_size):
        b_over = over_thresh[b_idx]
        n_boxes = len(b_over)
        # A boolean mask gives O(1) membership tests, where a growing list of
        # suppressed indices (with duplicates) had to be scanned linearly.
        suppressed = np.zeros(n_boxes, dtype=bool)
        keep_idx = []
        for idx in range(n_boxes):
            if suppressed[idx]:
                continue
            keep_idx.append(idx)
            # Only later (lower-ranked) boxes can be suppressed by this one.
            suppressed[idx + 1:] |= b_over[idx, idx + 1:]
        batch_keep.append(keep_idx)

    # Truncate idx vectors if one has length < post nms topN
    post_nms_topN = min(min(len(b_keep) for b_keep in batch_keep), post_nms_topN)
    scores = torch.stack([scores[i, batch_keep[i][:post_nms_topN]] for i in range(batch_size)])
    bbox_pred = torch.stack([bbox_pred[i, batch_keep[i][:post_nms_topN], :] for i in range(batch_size)])

    out = bbox_pred, scores
    if return_idx:
        out += (batch_keep,)
    return out
def get_bbox_regression_targets(bbox_targets, b_labels, num_classes):
    """
    One regression objective per object class

    Each row of ``bbox_targets`` is copied into the 4-column slot of its
    class. Slot 0 (columns 0-3) belongs to label 0 and is never written;
    every other unused slot also stays zero.

    Args:
        bbox_targets: tensor of shape ``(n, 4)`` with the regression targets.
        b_labels: tensor of ``n`` class indices (0 for background, judging by
            the loop starting at 1).
        num_classes: number of object classes, background excluded.

    Returns:
        Tensor of shape ``(n, 4 * (1 + num_classes))``, allocated on the same
        device as ``bbox_targets``.
    """
    # Allocate next to the inputs instead of forcing CUDA, so the function
    # also works on CPU-only hosts and never mixes devices.
    expanded_bbox_targets = torch.zeros(
        len(bbox_targets), 4 * (1 + num_classes), device=bbox_targets.device
    )
    for i in range(1, num_classes + 1):
        class_idx = torch.nonzero(b_labels == i)[:, 0]
        col_idx = 4 * i
        expanded_bbox_targets[class_idx, col_idx:col_idx + 4] = bbox_targets[class_idx]
    return expanded_bbox_targets
def cross_entropy_loss(bbox_classes, labels):
    """
    labels must be a flatten (numpy) array of class indices (0 for background)

    Picks, for every row of ``bbox_classes``, the entry at that row's label
    and returns the summed (not averaged) negative log of the picked values.
    The values are passed to ``log`` directly, so ``bbox_classes`` presumably
    holds probabilities rather than logits.
    """
    row_idx = range(len(bbox_classes))
    picked = bbox_classes[row_idx, labels]
    return torch.neg(torch.log(picked)).sum()
def smooth_l1_loss(bbox_reg, bbox_targets):
    """Element-wise smooth L1 distance between predictions and targets.

    With ``d = |bbox_reg - bbox_targets|`` the result is ``0.5 * d**2`` where
    ``d < 1`` and ``d - 0.5`` where ``d >= 1``. No reduction is applied: the
    output has the broadcast shape of the inputs.
    """
    abs_diff = (bbox_reg - bbox_targets).abs()
    is_linear = abs_diff >= 1
    # Blend the two branches with 0/1 float masks.
    quadratic_part = (~is_linear).float() * 0.5 * abs_diff.pow(2)
    linear_part = is_linear.float() * (abs_diff - 0.5)
    return quadratic_part + linear_part
def bool_parser(string):
    """Parse a textual flag: ``'false'`` (in any letter case) gives False.

    Every other string, including ``''``, ``'0'`` and ``'no'``, gives True.
    """
    return string.lower() != 'false'
def train_test_split(length, val_prop):
    """Randomly split ``range(length)`` into train and validation indices.

    The indices are shuffled with NumPy's global random state. The first
    ``int(val_prop * length)`` shuffled indices form the validation set and
    the remainder the training set.

    Returns:
        ``(train_indices, val_indices)`` as NumPy arrays.
    """
    order = np.arange(length)
    np.random.shuffle(order)
    n_val = int(val_prop * length)
    train_indices = order[n_val:]
    val_indices = order[:n_val]
    return train_indices, val_indices
def position_encodings(x, device):
    """Build sinusoidal position encodings matching the spatial size of ``x``.

    With ``q = int(channels / 4)`` the output channels are, in order:
    ``q`` sines then ``q`` cosines of the column index (constant along the
    height), followed by ``q`` sines then ``q`` cosines of the row index
    (constant along the width). The phase for frequency ``k`` is
    ``index * 128 / (extent * 1e4 ** (2 * k / channels))``.

    Args:
        x: 4-D tensor ``(batch, channels, height, width)``; only its shape is used.
        device: device the encodings are moved to.

    Returns:
        float32 tensor of shape ``(batch, 4 * q, height, width)``, identical
        for every batch element. The channel count equals ``channels`` only
        when ``channels`` is a multiple of 4.
    """
    bs, channels, height, width = x.shape
    n_freq = int(channels / 4)
    col_idx = np.arange(width)
    row_idx = np.arange(height)

    def phase(index, extent, k):
        # Position scaled by the axis length, at the k-th frequency.
        return index * 128 / (extent * (1e4 ** (2 * k / channels)))

    planes = []
    # Column-dependent planes: one row of values repeated down the height.
    for wave in (np.sin, np.cos):
        for k in range(n_freq):
            planes.append(np.tile(wave(phase(col_idx, width, k)), (height, 1)))
    # Row-dependent planes: one column of values repeated across the width.
    for wave in (np.sin, np.cos):
        for k in range(n_freq):
            planes.append(np.tile(wave(phase(row_idx, height, k))[:, np.newaxis], (1, width)))

    encodings = torch.from_numpy(np.stack(planes)).to(device)
    return encodings.unsqueeze(0).repeat(bs, 1, 1, 1).float()