import hashlib
import os
from pathlib import Path
from time import localtime, strftime, time

import numpy as np
import requests
import torch
from matplotlib import cm as colormap
from matplotlib import pyplot as plt
from matplotlib.colors import Normalize
from PIL import Image, ImageDraw, ImageFont

YOLOV3_WEIGHTS_PATH = 'https://pjreddie.com/media/files/yolov3.weights'
YOLOV3_WEIGHTS_MD5 = 'c84e5b99d0e52cd466ae710cadf6d84c'


def md5_hash(path):
    '''Returns the md5 checksum of the file at path as a hex string.'''
    with open(path, "rb") as f:
        content = f.read()
    return hashlib.md5(content).hexdigest()


def check_if_file_exists_else_download(path, chunk_size=1024):
    path = Path(path)

    if not path.exists() or (md5_hash(path) != YOLOV3_WEIGHTS_MD5):
        print(path, 'does not exist or its md5sum is incorrect; downloading...')
        path.parent.mkdir(exist_ok=True, parents=True)

        with requests.get(YOLOV3_WEIGHTS_PATH, stream=True) as r:
            total_size = int(r.headers.get('content-length', 0))
            with open(path, 'wb') as f:
                for data in r.iter_content(chunk_size=chunk_size):
                    if data:
                        f.write(data)

        print('downloaded from', YOLOV3_WEIGHTS_PATH, 'md5 of the file:', md5_hash(path))

    return path
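
# Usage sketch (the 'weights/yolov3.weights' path is an assumption; any writable
# path works). The file is downloaded on the first call and verified afterwards:
# >>> weights_path = check_if_file_exists_else_download('weights/yolov3.weights')
# >>> md5_hash(weights_path) == YOLOV3_WEIGHTS_MD5
# True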


def parse_cfg(file):
    '''
    Parses the original darknet .cfg file.

    Argument
    --------
    file: str
        A path to the cfg file.

    Output
    ------
    layers: list
        A list of dicts with the config for each layer.
        Note: the 0th element of the list contains the config for the network itself.
    '''
    layers = []
    layer = {}

    with open(file, 'r') as readf:
        lines = readf.read().split('\n')
        # skip commented lines
        lines = [line for line in lines if not line.startswith('#')]
        # skip empty lines
        lines = [line for line in lines if not len(line) == 0]
        # remove all whitespaces
        lines = [line.replace(' ', '') for line in lines]

        for line in lines:
            # if the line is the name of a layer (they are of the form: [*])
            if line.startswith('[') and line.endswith(']'):
                # save the previous layer, as the next lines contain info for the next layer
                if len(layer) > 0:
                    layers.append(layer)
                    layer = {}
                # add the layer's name/type
                layer['name'] = line.replace('[', '').replace(']', '')
            # if not the name, then parse the arguments
            else:
                # all arguments follow the pattern: 'key=value'
                key, value = line.split('=')
                # add info to the layer
                layer[key] = value

        # append the last layer
        layers.append(layer)

    return layers
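
# Example of the parsed structure (a sketch, assuming a standard darknet
# yolov3.cfg is available at the given path; all values are kept as strings):
# >>> layers = parse_cfg('cfg/yolov3.cfg')
# >>> layers[0]['name']            # the 0th entry describes the network itself
# 'net'
# >>> layers[1]['name'], layers[1]['filters']
# ('convolutional', '32')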


def get_center_coords(bboxes):
    '''
    Transforms bboxes given in top-left coordinates (top_left_x, top_left_y, w, h)
    into center coordinates (center_x, center_y, w, h).

    Argument
    --------
    bboxes: torch.FloatTensor
        A tensor of size (P, D) where D should contain info about the coords
        in the following order: (top_left_x, top_left_y, width, height).
        Note: D can be higher than 4.

    Output
    ------
    bboxes: torch.FloatTensor
        A tensor similar to the input but with center coordinates
        in the 0th and 1st columns.
    '''
    # true division (not floor division) to stay consistent with get_corner_coords
    bboxes[:, 0] = bboxes[:, 0] + bboxes[:, 2] / 2
    bboxes[:, 1] = bboxes[:, 1] + bboxes[:, 3] / 2
    return bboxes
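
# Quick sanity check of the top-left -> center transform (toy numbers):
# >>> get_center_coords(torch.tensor([[6., 4., 4., 4.]]))
# tensor([[8., 6., 4., 4.]])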


def get_corner_coords(bboxes):
    '''
    Transforms the bounding box coordinates from (center_x, center_y, w, h) into
    (top_left_x, top_left_y, bottom_right_x, bottom_right_y),
    i.e. into corner coordinates.

    Argument
    --------
    bboxes: torch.FloatTensor
        A tensor of size (P, D) where D should contain info about the coords
        in the following order: (center_x, center_y, width, height).
        Note: D can be higher than 4.

    Outputs
    -------
    top_left_x, top_left_y, bottom_right_x, bottom_right_y: torch.FloatTensors
        Transformed coordinates for the bboxes: top-left corner coordinates for x and y
        and bottom-right corner coordinates for x and y, respectively.
    '''
    top_left_x = bboxes[:, 0] - bboxes[:, 2] / 2
    top_left_y = bboxes[:, 1] - bboxes[:, 3] / 2
    bottom_right_x = bboxes[:, 0] + bboxes[:, 2] / 2
    bottom_right_y = bboxes[:, 1] + bboxes[:, 3] / 2
    return top_left_x, top_left_y, bottom_right_x, bottom_right_y
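
# Quick sanity check of the center -> corner transform (toy numbers),
# the inverse of the example above:
# >>> get_corner_coords(torch.tensor([[8., 6., 4., 4.]]))
# (tensor([6.]), tensor([4.]), tensor([10.]), tensor([8.]))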


def iou_vectorized(bboxes1, bboxes2, without_center_coords=False):
    '''
    Calculates intersection over union between every bbox in bboxes1 and
    every bbox in bboxes2, i.e. the Cartesian product of both sets.

    Arguments
    ---------
    bboxes1: torch.FloatTensor
        (M, 4 + *) shaped tensor with M bboxes with 4 bbox coordinates (cx, cy, w, h, *).
    bboxes2: torch.FloatTensor
        (N, 4 + *) shaped tensor with N bboxes with 4 bbox coordinates (cx, cy, w, h, *).
    without_center_coords: bool
        True: IoU is calculated using only width and height (no center coordinates).
        It is useful during training when the best bbox is selected to replace the gt bbox.
        Note: bboxes1 and bboxes2 are then expected to have shapes (M, 2 + *) and
        (N, 2 + *), respectively.

    Output
    ------
    : torch.FloatTensor
        (M, N) shaped tensor where element (i, j) is the IoU between the i-th bbox
        from bboxes1 and the j-th bbox from bboxes2.
    '''
    # The pixel shift is 0 if we calculate without center coordinates and 1 otherwise.
    # Why? Say we want the number of pixels the width of a box covers, given two pixel
    # x coordinates: 0 and 5. The side is 6 pixels long but 5 - 0 = 5, so we add 1.
    # However, we don't need to do it when we don't have center coordinates,
    # i.e. without_center_coords = True.
    px_shift = 1

    # add 'fake' center coordinates. Any value works; we use zeros
    if without_center_coords:
        bboxes1 = torch.cat([torch.zeros_like(bboxes1[:, :2]), bboxes1], dim=1)
        bboxes2 = torch.cat([torch.zeros_like(bboxes2[:, :2]), bboxes2], dim=1)
        px_shift = 0

    M, D = bboxes1.shape
    N, D = bboxes2.shape

    # transform coords of the 1st bboxes (y=0 is at the top and increases downwards)
    top_left_x1, top_left_y1, bottom_right_x1, bottom_right_y1 = get_corner_coords(bboxes1)
    # transform coords of the 2nd bboxes
    top_left_x2, top_left_y2, bottom_right_x2, bottom_right_y2 = get_corner_coords(bboxes2)

    # broadcasting the 1st bboxes
    top_left_x1 = top_left_x1.view(M, 1)
    top_left_y1 = top_left_y1.view(M, 1)
    bottom_right_x1 = bottom_right_x1.view(M, 1)
    bottom_right_y1 = bottom_right_y1.view(M, 1)
    # broadcasting the 2nd bboxes
    top_left_x2 = top_left_x2.view(1, N)
    top_left_y2 = top_left_y2.view(1, N)
    bottom_right_x2 = bottom_right_x2.view(1, N)
    bottom_right_y2 = bottom_right_y2.view(1, N)

    # calculate the coords of the intersection
    inner_top_left_x = torch.max(top_left_x1, top_left_x2)
    inner_top_left_y = torch.max(top_left_y1, top_left_y2)
    inner_bottom_right_x = torch.min(bottom_right_x1, bottom_right_x2)
    inner_bottom_right_y = torch.min(bottom_right_y1, bottom_right_y2)

    # area = side_a * side_b
    # clamp(x, min=0) = max(x, 0)
    # we make sure the area is 0 if the size of a side is negative, which happens
    # when inner_top_left_x > inner_bottom_right_x, i.e. the boxes do not overlap.
    # Note: adding one (px_shift) because the coordinates start at 0.
    a = torch.clamp(inner_bottom_right_x - inner_top_left_x + px_shift, min=0)
    b = torch.clamp(inner_bottom_right_y - inner_top_left_y + px_shift, min=0)
    inner_area = a * b

    # finally, we calculate the union for each pair of bboxes
    out_area1 = (bottom_right_x1 - top_left_x1 + px_shift) * (bottom_right_y1 - top_left_y1 + px_shift)
    out_area2 = (bottom_right_x2 - top_left_x2 + px_shift) * (bottom_right_y2 - top_left_y2 + px_shift)
    out_area = out_area1 + out_area2 - inner_area

    return inner_area / out_area
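
# Toy example of the Cartesian-product behaviour: one box against two boxes,
# one identical to it (IoU = 1) and one disjoint from it (IoU = 0):
# >>> a = torch.tensor([[5., 5., 10., 10.]])                          # (cx, cy, w, h)
# >>> b = torch.tensor([[5., 5., 10., 10.], [100., 100., 10., 10.]])
# >>> iou_vectorized(a, b)
# tensor([[1., 0.]])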


def objectness_filter_and_nms(predictions, classes, obj_thresh=0.8, nms_thresh=0.4):
    '''
    Performs filtering according to the objectness score and non-maximum suppression
    on the predictions.

    Arguments
    ---------
    predictions: torch.FloatTensor
        A tensor of size (B, P, 5+classes) with predictions.
        B -- batch size; P -- number of predictions for an image,
        i.e. 3 scales with 3 anchor boxes each.
        For example: P = (13*13 + 26*26 + 52*52) * 3 = 10647;
        5 + classes -- (cx, cy, w, h, obj_score, {prob_class}).
    classes: int
        The number of classes to detect.
    obj_thresh: float
        The lowest objectness score the detector allows.
    nms_thresh: float
        The highest IoU the detector allows.

    Output
    ------
    predictions: torch.FloatTensor or None
        Predictions after objectness filtering and non-maximum suppression (same size
        as the predictions in the arguments but with a different P). Returns None when
        no detections are found.
    '''
    # iterate over the images in a batch
    for i, prediction in enumerate(predictions):
        ## objectness thresholding
        # if a prediction's (bbox's) objectness score is higher than obj_thresh, keep it.
        # The fourth (fifth) element is the objectness score; if there are no
        # detections with an obj score higher than obj_thresh, return None
        objectness_mask = (prediction[:, 4] > obj_thresh)

        if len(torch.nonzero(objectness_mask)) == 0:
            return None

        prediction = prediction[objectness_mask]

        # if no object is found on an image, continue with the next image
        if prediction.size(0) == 0:
            continue

        ## non-maximum suppression
        # The idea is as follows. If a prediction "survived" objectness filtering,
        # it is considered meaningful. Since we may have multiple detections of
        # one object on an image, we need to filter out those predictions that have
        # substantial (more than nms_thresh) overlap, or IoU, with the box with the
        # highest class score. Also note that one image might contain more than one
        # object of the same class. As those don't have a high IoU with the box with
        # the highest class score, they will be kept in the list of predictions.

        # for each prediction we save the class with the maximum class score
        pred_score, pred_classes = torch.max(prediction[:, 5:5+classes], dim=-1)

        # we are going to iterate through classes, so, first, we select the set of unique classes
        unique_classes = pred_classes.unique().float()

        # initialize the list of filtered detections
        detections_after_nms = []

        for cls in unique_classes:
            # select only the entries for a specific class.
            # pred_classes is of torch.LongTensor type but we need torch.FloatTensor
            prediction_4_cls = prediction[pred_classes.float() == cls]
            # then we sort the predictions for a specific class by objectness score (high -> low)
            sort_pred_idxs = torch.sort(prediction_4_cls[:, 4], descending=True)[1]
            prediction_4_cls = prediction_4_cls[sort_pred_idxs]

            # next we want to fill detections_after_nms with only those objects
            # that have a unique position, i.e. a low IoU with the other predictions.
            # The idea is to append (save) the first prediction in prediction_4_cls
            # and calculate IoUs between it and the rest of the ordered list.
            # The predictions with a high IoU with that first prediction are discarded.
            # On the next iteration, the first prediction is the one with the highest
            # obj score among those that are left.
            # Exit the loop when there are no predictions left after the nms
            while len(prediction_4_cls) > 0:
                # we append the first prediction for a specific class to the list of predictions.
                # We can do this because we ordered prediction_4_cls beforehand.
                detections_after_nms.append(prediction_4_cls[0].unsqueeze(0))

                # also stop when this is the last prediction in prediction_4_cls
                if len(prediction_4_cls) == 1:
                    break

                # calculate IoUs between the first pred in prediction_4_cls and the rest of them
                ious = iou_vectorized(prediction_4_cls[0, :5].unsqueeze(0), prediction_4_cls[1:, :5])
                # when iou_vectorized takes two tensors, ious.shape is (N, M) but here N = 1
                # and the mask [ious < nms_thresh] should be one-dimensional
                ious = ious.reshape(-1)

                # filter out the first prediction (1:) and the ones with a high IoU with the 0th pred
                prediction_4_cls = prediction_4_cls[1:][ious < nms_thresh]

        # as detections_after_nms is a list, we concatenate its elements into a tensor
        predictions = torch.cat(detections_after_nms)

    return predictions
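
# Toy example (a sketch): a batch with one image, two confident near-duplicate
# boxes of the same class and one low-objectness box, with 2 classes.
# NMS keeps a single detection of 7 = 5 + 2 values:
# >>> preds = torch.tensor([[[50., 50., 20., 20., 0.90, 0.8, 0.2],
# ...                        [51., 51., 20., 20., 0.85, 0.7, 0.3],
# ...                        [10., 10.,  5.,  5., 0.10, 0.5, 0.5]]])
# >>> objectness_filter_and_nms(preds, classes=2).shape
# torch.Size([1, 7])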


def scale_numbers(num1, num2, largest_num_target):
    '''
    Scales two numbers (for example, dimensions) keeping the aspect ratio.

    Arguments
    ---------
    num1: float or int
        The 1st number (dim1).
    num2: float or int
        The 2nd number (dim2).
    largest_num_target: int
        The expected size of the largest of the two numbers.

    Outputs
    -------
    (int, int, float)
        The two scaled numbers such that the largest equals largest_num_target
        while maintaining the same aspect ratio as num1 and num2 in the input,
        plus the scaling coefficient.
        Note: the two scaled numbers are returned as rounded ints.

    Examples
    --------
    scale_numbers(832, 832, 416) -> (416, 416, 0.5)
    scale_numbers(223, 111, 416) -> (416, 207, 1.865...)
    scale_numbers(100, 200, 416) -> (208, 416, 2.08)
    scale_numbers(200, 832, 416) -> (100, 416, 0.5)
    '''
    # make sure the arguments are of correct types
    assert isinstance(largest_num_target, int), 'largest_num_target should be "int"'

    # to make the largest number equal to largest_num_target while keeping the aspect
    # ratio, we, first, estimate by how much the largest number is smaller (larger)
    # than largest_num_target and, second, scale both numbers by this ratio.

    # select the maximum of the two numbers
    max_num = max(num1, num2)
    # calculate the scaling coefficient
    scale_coeff = largest_num_target / max_num
    # scale both numbers
    num1 = num1 * scale_coeff
    num2 = num2 * scale_coeff

    return round(num1), round(num2), scale_coeff


def letterbox_pad(img, color=127.5):
    '''
    Adds letterbox padding to an image, following the original darknet implementation.
    Specifically, it pads the image to a square of size (max_side x max_side),
    where max_side is the larger of the image's height and width.

    Arguments
    ---------
    img: numpy.ndarray
        An image to pad.
    color: (float or int) in [0, 255]
        The intensity to pad the image with (applied to all channels).

    Output
    ------
    img: numpy.ndarray
        The padded image.
    pad_sizes: (int, int, int, int)
        The sizes of the paddings. Used in the show_prediction module where we need to
        shift predictions by the size of the padding. Order: top, bottom, left, right.
    '''
    # make sure the arguments are of correct types
    assert isinstance(img, np.ndarray), '"img" should have numpy.ndarray type'
    assert isinstance(color, (int, float)), '"color" should be an int or float'

    H, W, C = img.shape
    max_side_len = max(H, W)

    # if the width is larger than the height then, to make a square-shaped image,
    # we need to pad the height; otherwise, we pad the width.
    if W > H:
        # how much to pad "on top": half of the difference
        # between the target size and the current height
        pad_top = (max_side_len - H) // 2
        # the other half is added to the bottom
        pad_bottom = max_side_len - (H + pad_top)
        pad_left = 0
        pad_right = 0
    else:
        pad_top = 0
        pad_bottom = 0
        # how much to pad "on the left": half of the difference
        # between the target size and the current width
        pad_left = (max_side_len - W) // 2
        pad_right = max_side_len - (W + pad_left)

    # pad_widths should contain three pairs (because of 3 dims) of padding sizes:
    # the first pair adds rows (top and bottom),
    # the second adds columns (left and right),
    # the third adds nothing because we pad only spatially, not channel-wise
    pad_widths = [[pad_top, pad_bottom], [pad_left, pad_right], [0, 0]]
    # for each padding we specify a color (the constant_values parameter)
    color = [[color, color], [color, color], [0, 0]]
    # perform the padding
    img = np.pad(img, pad_widths, 'constant', constant_values=color)
    # save the padding sizes
    pad_sizes = (pad_top, pad_bottom, pad_left, pad_right)

    return img, pad_sizes
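
# Toy example: a wide image gets equal top/bottom padding to become square:
# >>> padded, pads = letterbox_pad(np.zeros((100, 200, 3)))
# >>> padded.shape, pads          # pads order: top, bottom, left, right
# ((200, 200, 3), (50, 50, 0, 0))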


def fix_orientation_if_needed(pil_img, orientation):
    '''
    Motivation: sometimes when a user uploads a photo from their phone, the
    photo is rotated by 90 degrees even though it looks fine on the phone. This
    function fixes this problem by correcting the orientation using info
    from EXIF. For more info regarding this issue, please see:
    https://magnushoff.com/jpeg-orientation.html

    Argument
    --------
    pil_img: PIL.Image.Image
        The target image.
    orientation: str
        Orientation which the front-end tries to extract from the EXIF of an image.
        Can be 'undefined' or some integer which can be used to orient the image.

    Output
    ------
    pil_img: PIL.Image.Image
        The original image with fixed orientation, or the same image if
        no EXIF info is available.
    '''
    # expand=True resizes the canvas to fit the rotated image
    # (with expand=False the dimensions of the image would remain the same)
    if orientation == '3':
        pil_img = pil_img.rotate(180, expand=True)
    elif orientation == '6':
        pil_img = pil_img.rotate(270, expand=True)
    elif orientation == '8':
        pil_img = pil_img.rotate(90, expand=True)

    return pil_img
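
# Sketch of how the orientation string could be obtained on the Python side
# (an assumption; in this app the front-end supplies it). EXIF tag 274 is the
# standard orientation tag:
# >>> pil_img = Image.open('photo.jpg')
# >>> orientation = str(pil_img.getexif().get(274, 'undefined'))
# >>> pil_img = fix_orientation_if_needed(pil_img, orientation)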


# TODO: test for different devices
def predict_and_save(source_img, model, device, labels_path, font_path, orientation, show=False, save=True):
    '''
    Performs inference on an image and saves the image with the bounding boxes drawn on it.

    Arguments
    ---------
    source_img: PIL.Image.Image
        The image to perform inference on.
    model: Darknet
        The model which will be used for inference.
    device: torch.device or str
        Device for calculations.
    labels_path: str
        The path to the object names.
    font_path: str
        The path to the font which is going to be used to tag the bounding boxes.
    orientation: str
        Orientation which the front-end tries to extract from the EXIF of an image.
        Can be 'undefined' or some integer which can be used to orient the image.
        Used in fix_orientation_if_needed().
    show: bool
        Whether to show the output image with bounding boxes, for example, in a jupyter notebook.
    save: bool
        Whether to save the output image with bounding boxes.

    Outputs
    -------
    machine_readable_preds: str or None
        A csv-formatted string (wrapped in a markdown code block) with one row per
        detected object: class, confidence and bbox coordinates normalized by the
        image size. None when no object has been detected on the image.
    source_img: PIL.Image.Image
        The source image with the detections drawn on it.
    '''
    assert isinstance(labels_path, (str, Path)), '"labels_path" should be str or Path'
    assert isinstance(device, (torch.device, str)), 'device should be either torch.device or str'
    assert isinstance(show, bool), 'show should be boolean'

    # parameters of the visualization: color palette, figsize to show,
    # label parameters, thresholds
    norm = Normalize(vmin=0, vmax=model.classes)
    color_map = colormap.tab10
    figsize = (15, 15)
    line_thickness = 2
    obj_thresh = 0.8
    nms_thresh = 0.4

    # make a dict {class_number: class_name} if we have more than 1 class
    if model.classes > 1:
        # replace '\n' with a whitespace so that the class name and the confidence
        # are separated in the bbox tag text; the space is stripped later for the csv output
        names = [name.replace('\n', ' ') for name in open(labels_path, 'r').readlines()]
        num2name = {num: name for num, name in enumerate(names)}
    else:
        # we don't need class names if the number of classes is 1
        num2name = {0: ''}

    source_img = fix_orientation_if_needed(source_img, orientation)
    W, H = source_img.size

    # resize, add letterbox padding and save the pad sizes and the scaling coefficient
    # to use them later when drawing bboxes on the original image
    H_new, W_new, scale = scale_numbers(H, W, model.model_width)
    img = source_img.resize((W_new, H_new))
    img = np.array(img)
    img, pad_sizes = letterbox_pad(img)

    # HWC -> CHW, scale intensities to [0, 1], send to pytorch, add the 'batch' dimension
    img = img.astype(np.float32)
    img = img.transpose(2, 0, 1)
    img = img / 255
    img = torch.from_numpy(img)
    img = img.unsqueeze(0)
    img = img.to(device)

    # make the prediction
    prediction, loss = model(img, device=device)
    # and apply objectness filtering and nms. If it returns None, draw a box that states it
    prediction = objectness_filter_and_nms(prediction, model.classes, obj_thresh, nms_thresh)

    # if show, initialize a figure environment
    if show:
        plt.figure(figsize=figsize)

    ### if no objects have been detected, draw one black tag in the middle of the
    # source_img with text saying that no objects were found. For comments on this
    # if-branch please see the for-loop below
    if prediction is None:
        text = "Couldn't find any objects that I was trained to detect :-("
        font = ImageFont.truetype(str(font_path), 20)
        # note: font.getsize was removed in Pillow 10; font.getbbox can be used there
        text_size = font.getsize(text)
        top_left_coords = ((W - text_size[0]) // 2, H // 2)
        black = (0, 0, 0)
        # create a black tag (the background for the text) and paste it onto the source image
        tag = Image.new('RGB', text_size, black)
        source_img.paste(tag, top_left_coords)
        # create a draw object for the source image
        tag_draw = ImageDraw.Draw(source_img)
        # add the text
        tag_draw.text(top_left_coords, text, font=font)

        if show:
            plt.imshow(source_img)
        if save:
            source_img.save('output.jpg', 'JPEG')

        return None, source_img

    ###
    # since the predictions are made for a resized and padded image,
    # the bounding boxes have to be scaled and shifted back
    pad_top, pad_bottom, pad_left, pad_right = pad_sizes
    prediction[:, 0] = (prediction[:, 0] - pad_left) / scale
    prediction[:, 1] = (prediction[:, 1] - pad_top) / scale
    prediction[:, 2] = prediction[:, 2] / scale
    prediction[:, 3] = prediction[:, 3] / scale

    # then, transform the coordinates (cx, cy, w, h) into corner coordinates:
    # (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
    top_left_x, top_left_y, bottom_right_x, bottom_right_y = get_corner_coords(prediction)

    # detach the values from the computation graph, take the int part and transform to np.ndarray
    top_left_x = top_left_x.cpu().detach().int().numpy()
    top_left_y = top_left_y.cpu().detach().int().numpy()
    bottom_right_x = bottom_right_x.cpu().detach().int().numpy()
    bottom_right_y = bottom_right_y.cpu().detach().int().numpy()

    # draw each prediction on the image and caption it with its class
    machine_readable_preds = []
    machine_readable_preds.append('class,confidence,bx,by,bw,bh')

    for i in range(len(prediction)):
        ## ADD BBOXES
        # first, extract the coords for both the top-left and bottom-right corners.
        # note: sometimes the corner coordinates lie outside of the image itself,
        # hence we need to keep them on the image -> min and max
        top_left_coords = max(0, top_left_x[i]), max(0, top_left_y[i])
        bottom_right_coords = min(W, bottom_right_x[i]), min(H, bottom_right_y[i])

        # predicted class number
        # todo dim (also see NMS with batch dim)
        class_score, class_int = torch.max(prediction[i, 5:5+model.classes], dim=-1)
        class_score, class_int = float(class_score), int(class_int)

        # select the color for the class according to its label number and scale it to [0, 255]
        bbox_color = color_map(class_int / model.classes)[:3]
        bbox_color = tuple(map(lambda x: int(x * 255), bbox_color))

        ## ADD A LABEL FOR EACH BBOX INSIDE A FILLED RECTANGLE OF THE SAME COLOR
        ## AS THE BBOX ITSELF
        # predicted class name to put on the bbox
        class_name = num2name[class_int]
        # text for the tag: the class name and the probability in percent
        text = f'{class_name}{(class_score * 100):.0f}%'
        font = ImageFont.truetype(str(font_path), 14)
        text_size = font.getsize(text)
        # create a tag object and paste it onto the source image just above the bbox
        tag = Image.new('RGB', text_size, bbox_color)
        top_left_coords_tag = top_left_coords[0], max(0, top_left_coords[1] - text_size[1])
        source_img.paste(tag, top_left_coords_tag)
        # create a draw object and draw the rectangle on the source image
        bbox_draw = ImageDraw.Draw(source_img)
        bbox_draw.rectangle((top_left_coords, bottom_right_coords),
                            width=line_thickness, outline=bbox_color)
        # add the class label with the confidence
        bbox_draw.text(top_left_coords_tag, text, font=font)

        # add the prediction to the list of machine-readable predictions as a csv row
        machine_readable_preds.append(
            ','.join([
                f'{class_name.strip()}',
                f'{class_score:.2f}',
                f'{prediction[i, 0].item() / W:.2f}',
                f'{prediction[i, 1].item() / H:.2f}',
                f'{prediction[i, 2].item() / W:.2f}',
                f'{prediction[i, 3].item() / H:.2f}',
            ])
        )

    # enclose the list of machine-readable predictions into a markdown code block
    machine_readable_preds = '\n'.join(machine_readable_preds)
    machine_readable_preds = f'```\n{machine_readable_preds}\n```'

    # if show, then show the image
    if show:
        plt.imshow(source_img)
    if save:
        source_img.save('output.jpg', 'JPEG')

    return machine_readable_preds, source_img
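
# Self-contained round-trip sketch for the bbox re-mapping step inside
# predict_and_save: a point in the network frame is shifted by the padding and
# divided by the scale to land in the original image frame (toy numbers):
# >>> H, W = 480, 640
# >>> H_new, W_new, scale = scale_numbers(H, W, 416)                # (312, 416, 0.65)
# >>> _, (pad_top, _, pad_left, _) = letterbox_pad(np.zeros((H_new, W_new, 3)))
# >>> (208. - pad_left) / scale   # net-frame center x -> original-frame center x
# 320.0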


def show_image_w_bboxes_for_server(
        img_path: str,
        out_path: str,
        archive_path: str,
        labels_path: str,
        font_path: str,
        model: torch.nn.Module,
        device: torch.device,
        orientation: str) -> None:
    '''
    Reads an image from the disk and applies the detection algorithm specified in model.

    Arguments
    ---------
    img_path: str
        A path to an image.
    out_path: str
        A path where to save the resulting image with detections. This image
        is sent back to the user.
    archive_path: str
        Another path where the resulting image is saved (the archive).
        Since `out_path` is always the same, we also use the archive path.
    labels_path: str
        A path to the model labels (COCO).
    font_path: str
        A path to the font-face used to draw the prediction labels.
    model: Darknet
        The model to apply to the image.
    device: str
        PyTorch device. Use this argument to control 'cuda' vs 'cpu'.
    orientation: str
        Orientation which the front-end tries to extract from the EXIF of an image.
        Can be 'undefined' or some integer which can be used to orient the image.
        Used in predict_and_save().
    '''
    # we want to log the processing time for each image
    start = time()

    # make sure the arguments are of correct types
    assert out_path is None or isinstance(out_path, str), '"out_path" should be either NoneType or str'
    assert isinstance(img_path, (str, Path)), '"img_path" should be str or Path'

    # read the image
    source_img = Image.open(img_path).convert('RGB')

    # predict_and_save returns both the machine-readable predictions and
    # the image with the predictions drawn on it
    with torch.no_grad():
        predictions, img = predict_and_save(
            source_img, model, device, labels_path, font_path, orientation, show=False
        )

    # select a name for the archived file
    filename = f'{strftime("%y-%m-%dT%H-%M-%S", localtime())}.jpg'
    archive_full_path = os.path.join(archive_path, filename)
    img.save(archive_full_path, 'JPEG')
    img.save(out_path, 'JPEG')

    # calculate the elapsed time and print it to the flask console
    elapsed_time = round(time() - start, 2)
    print(f'Processing time of {filename}: {elapsed_time} sec.')
    print('=' * 70)
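
# Server-side usage sketch (all paths and the `model` instance are assumptions;
# the Darknet model class lives elsewhere in this repo):
# >>> show_image_w_bboxes_for_server(
# ...     'uploads/photo.jpg', 'static/output.jpg', 'archive', 'data/coco.names',
# ...     'data/FreeMono.ttf', model, torch.device('cpu'), 'undefined')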


### SOME CODE FOR WIDER DATASET HANDLING
'''
The dataset folder is expected to have the following structure:

    ./Submission_example/
        0--Parade/
            0_Parade_marchingband_1_20.txt
    ./wider_face_split/
        wider_face_test.mat
        wider_face_train.mat
        wider_face_test_filelist.txt
        wider_face_val.mat
        wider_face_train_bbx_gt.txt
        readme.txt
        wider_face_val_bbx_gt.txt
    ./WIDER_train/
        images/
            0--Parade/
                0_Parade_marchingband_1_100.jpg
                ...
            1--Handshaking/
                1_Handshaking_Handshaking_1_102.jpg
                ...
            ...
    ./WIDER_val/
        (similar to ./WIDER_train/)
    ./WIDER_test/
        (similar to ./WIDER_train/)
'''


def read_meta_from_file(data_root_path):
    '''
    Parses the WIDER ground truth data.

    Argument
    --------
    data_root_path: str
        A path to the root of the dataset (see the expected folder structure above).
        The ground truth itself is read from the '.txt' meta file inside it.

    Output
    ------
    meta: dict
        A map between a file path and the ground truth bounding box coordinates with some
        attributes (x1, y1, w, h, blur, expression, illumination, invalid, occlusion, pose)
        stored as a list of lists. For more information about the attributes see readme.txt.
    '''
    split_path = os.path.join(data_root_path, 'wider_face_split')
    train_data_path = os.path.join(data_root_path, 'WIDER_train/images')
    train_meta_path = os.path.join(split_path, 'wider_face_train_bbx_gt.txt')

    meta = {}

    with open(train_meta_path, 'r') as rfile:
        while True:
            short_file_path = rfile.readline()
            bbox_count = rfile.readline()

            # stop at the end of the file (the with-statement closes it)
            if short_file_path == '' or bbox_count == '':
                break

            short_file_path = short_file_path.replace('\n', '')
            bbox_count = int(bbox_count.replace('\n', ''))
            full_file_path = os.path.join(train_data_path, short_file_path)

            gt_bboxes = []

            for _ in range(bbox_count):
                attributes = rfile.readline()
                attributes = attributes.replace('\n', '').split(' ')
                attributes = [int(att) for att in attributes if len(att) > 0]
                gt_bboxes.append(attributes)

            meta[full_file_path] = gt_bboxes

    return meta
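
# Usage sketch (assumes the WIDER layout above is unpacked under './WIDER'):
# >>> meta = read_meta_from_file('./WIDER')
# >>> img_path, gt_bboxes = next(iter(meta.items()))
# >>> gt_bboxes[0][:4]            # x1, y1, w, h of the first face on that image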