Spaces:
Sleeping
Sleeping
| from typing import Any, Callable, List, Optional, Tuple, Union | |
| import torch | |
| import torch.nn.functional as F | |
| from torch import nn | |
| from torchvision.ops import MultiScaleRoIAlign | |
| from torchvision.ops import misc as misc_nn_ops | |
| from torchvision.transforms._presets import ObjectDetection | |
| from torchvision.models._api import register_model, Weights, WeightsEnum | |
| from torchvision.models._meta import _COCO_CATEGORIES | |
| from torchvision.models._utils import _ovewrite_value_param, handle_legacy_interface | |
| from torchvision.models.resnet import resnet50, ResNet50_Weights | |
| from ._utils import overwrite_eps | |
| from .anchor_utils import AnchorGenerator | |
| from .backbone_utils import _resnet_fpn_extractor, _validate_trainable_layers | |
| from .generalized_rcnn import GeneralizedRCNN | |
| from .roi_heads import RoIHeads | |
| from .rpn import RegionProposalNetwork, RPNHead | |
| from .transform import GeneralizedRCNNTransform | |
# Public API of this module: the detector class, its pretrained-weights enum and the builder.
__all__ = ["FasterRCNN", "FasterRCNN_ResNet50_FPN_Weights", "fasterrcnn_resnet50_fpn"]
def _default_anchorgen():
    """
    Build the anchor generator used when the caller does not supply one.

    Five single-size groups (32, 64, 128, 256, 512) are each paired with the same three
    aspect ratios (0.5, 1.0, 2.0).
    NOTE(review): presumably one group per feature-map level -- confirm against AnchorGenerator.

    Returns:
        AnchorGenerator: generator configured with the sizes and ratios above.
    """
    sizes = tuple((side,) for side in (32, 64, 128, 256, 512))
    ratios = tuple((0.5, 1.0, 2.0) for _ in sizes)
    return AnchorGenerator(sizes, ratios)
class FasterRCNN(GeneralizedRCNN):
    """
    Faster R-CNN detector assembled from a backbone, an RPN, RoI heads and an input transform.

    The model takes a list of tensors, one per image, each of shape [C, H, W] and with values in
    the 0-1 range. Images in the same list may have different sizes.

    The behavior depends on whether the model is in training or evaluation mode.

    During training, the model expects both the input tensors and targets (a list of
    dictionaries), each containing:
        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format,
          with ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
        - labels (``Int64Tensor[N]``): the class label of each ground-truth box.

    In that mode it returns a Dict[Tensor] with the classification and regression losses of both
    the RPN and the R-CNN.

    During inference, only the input tensors are required and the model returns the
    post-processed predictions as a List[Dict[Tensor]], one entry per input image, with fields:
        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format,
          with ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
        - labels (``Int64Tensor[N]``): the predicted label of each box.
        - scores (``Tensor[N]``): the score of each prediction.

    NOTE(review): the default box predictor built here (``FastRCNNPredictor``) also returns a
    ``theta_preds`` output; whether it appears in the losses/predictions depends on ``RoIHeads``
    and is not visible from this class -- confirm there.

    Args:
        backbone (nn.Module): the network used to compute the features for the model.
            It must expose an ``out_channels`` attribute giving the number of output channels of
            each feature map (assumed to be the same for all feature maps).
            The backbone should return a single Tensor or an OrderedDict[Tensor].
        num_classes (int): number of output classes of the model (including the background).
            Must be None when ``box_predictor`` is given, and must be set otherwise.
        min_size (int): minimum size of the image to be rescaled before feeding it to the backbone
        max_size (int): maximum size of the image to be rescaled before feeding it to the backbone
        image_mean (Tuple[float, float, float]): mean values used for input normalization.
            They are generally the mean values of the dataset on which the backbone was trained.
            Defaults to ``[0.485, 0.456, 0.406]`` when None.
        image_std (Tuple[float, float, float]): std values used for input normalization.
            They are generally the std values of the dataset on which the backbone was trained.
            Defaults to ``[0.229, 0.224, 0.225]`` when None.
        rpn_anchor_generator (AnchorGenerator): module that generates the anchors for a set of
            feature maps. Defaults to ``_default_anchorgen()`` when None.
        rpn_head (nn.Module): module that computes the objectness and regression deltas from the RPN
        rpn_pre_nms_top_n_train (int): number of proposals to keep before applying NMS during training
        rpn_pre_nms_top_n_test (int): number of proposals to keep before applying NMS during testing
        rpn_post_nms_top_n_train (int): number of proposals to keep after applying NMS during training
        rpn_post_nms_top_n_test (int): number of proposals to keep after applying NMS during testing
        rpn_nms_thresh (float): NMS threshold used for postprocessing the RPN proposals
        rpn_fg_iou_thresh (float): minimum IoU between the anchor and the GT box so that they can be
            considered as positive during training of the RPN.
        rpn_bg_iou_thresh (float): maximum IoU between the anchor and the GT box so that they can be
            considered as negative during training of the RPN.
        rpn_batch_size_per_image (int): number of anchors that are sampled during training of the RPN
            for computing the loss
        rpn_positive_fraction (float): proportion of positive anchors in a mini-batch during training
            of the RPN
        rpn_score_thresh (float): only return proposals with an objectness score greater than
            rpn_score_thresh
        box_roi_pool (MultiScaleRoIAlign): the module which crops and resizes the feature maps in
            the locations indicated by the bounding boxes
        box_head (nn.Module): module that takes the cropped feature maps as input
        box_predictor (nn.Module): module that takes the output of box_head and returns the
            classification logits and box regression deltas.
        box_score_thresh (float): during inference, only return proposals with a classification score
            greater than box_score_thresh
        box_nms_thresh (float): NMS threshold for the prediction head. Used during inference
        box_detections_per_img (int): maximum number of detections per image, for all classes.
        box_fg_iou_thresh (float): minimum IoU between the proposals and the GT box so that they can be
            considered as positive during training of the classification head
        box_bg_iou_thresh (float): maximum IoU between the proposals and the GT box so that they can be
            considered as negative during training of the classification head
        box_batch_size_per_image (int): number of proposals that are sampled during training of the
            classification head
        box_positive_fraction (float): proportion of positive proposals in a mini-batch during training
            of the classification head
        bbox_reg_weights (Tuple[float, float, float, float]): weights for the encoding/decoding of the
            bounding boxes
        **kwargs: extra keyword arguments forwarded unchanged to ``GeneralizedRCNNTransform``.

    Raises:
        ValueError: if ``backbone`` has no ``out_channels`` attribute, or if ``num_classes`` and
            ``box_predictor`` are both given or both omitted.
        TypeError: if ``rpn_anchor_generator`` or ``box_roi_pool`` has an unsupported type.
    """

    def __init__(
        self,
        backbone,
        num_classes=None,
        # transform parameters
        min_size=800,
        max_size=1333,
        image_mean=None,
        image_std=None,
        # RPN parameters
        rpn_anchor_generator=None,
        rpn_head=None,
        rpn_pre_nms_top_n_train=2000,
        rpn_pre_nms_top_n_test=1000,
        rpn_post_nms_top_n_train=2000,
        rpn_post_nms_top_n_test=1000,
        rpn_nms_thresh=0.7,
        rpn_fg_iou_thresh=0.7,
        rpn_bg_iou_thresh=0.3,
        rpn_batch_size_per_image=256,
        rpn_positive_fraction=0.5,
        rpn_score_thresh=0.0,
        # Box parameters
        box_roi_pool=None,
        box_head=None,
        box_predictor=None,
        box_score_thresh=0.05,
        box_nms_thresh=0.5,
        box_detections_per_img=100,
        box_fg_iou_thresh=0.5,
        box_bg_iou_thresh=0.5,
        box_batch_size_per_image=512,
        box_positive_fraction=0.25,
        bbox_reg_weights=None,
        **kwargs,
    ):
        # ---- argument validation (checked in this order so callers see the same first error) ----
        if not hasattr(backbone, "out_channels"):
            raise ValueError(
                "backbone should contain an attribute out_channels "
                "specifying the number of output channels (assumed to be the "
                "same for all the levels)"
            )

        if rpn_anchor_generator is not None and not isinstance(rpn_anchor_generator, AnchorGenerator):
            raise TypeError(
                f"rpn_anchor_generator should be of type AnchorGenerator or None instead of {type(rpn_anchor_generator)}"
            )

        if box_roi_pool is not None and not isinstance(box_roi_pool, MultiScaleRoIAlign):
            raise TypeError(
                f"box_roi_pool should be of type MultiScaleRoIAlign or None instead of {type(box_roi_pool)}"
            )

        # Exactly one of num_classes / box_predictor has to be provided.
        if num_classes is None:
            if box_predictor is None:
                raise ValueError("num_classes should not be None when box_predictor is not specified")
        elif box_predictor is not None:
            raise ValueError("num_classes should be None when box_predictor is specified")

        feature_channels = backbone.out_channels

        # ---- region proposal network ----
        if rpn_anchor_generator is None:
            rpn_anchor_generator = _default_anchorgen()
        if rpn_head is None:
            anchors_per_location = rpn_anchor_generator.num_anchors_per_location()[0]
            rpn_head = RPNHead(feature_channels, anchors_per_location)

        pre_nms_top_n = {"training": rpn_pre_nms_top_n_train, "testing": rpn_pre_nms_top_n_test}
        post_nms_top_n = {"training": rpn_post_nms_top_n_train, "testing": rpn_post_nms_top_n_test}

        rpn = RegionProposalNetwork(
            rpn_anchor_generator,
            rpn_head,
            rpn_fg_iou_thresh,
            rpn_bg_iou_thresh,
            rpn_batch_size_per_image,
            rpn_positive_fraction,
            pre_nms_top_n,
            post_nms_top_n,
            rpn_nms_thresh,
            score_thresh=rpn_score_thresh,
        )

        # ---- RoI heads ----
        # Width of the default box head output and of the default predictor input.
        representation_size = 1024

        if box_roi_pool is None:
            box_roi_pool = MultiScaleRoIAlign(featmap_names=["0", "1", "2", "3"], output_size=7, sampling_ratio=2)

        if box_head is None:
            pooled_side = box_roi_pool.output_size[0]
            box_head = TwoMLPHead(feature_channels * pooled_side**2, representation_size)

        if box_predictor is None:
            box_predictor = FastRCNNPredictor(representation_size, num_classes)

        roi_heads = RoIHeads(
            # Box
            box_roi_pool,
            box_head,
            box_predictor,
            box_fg_iou_thresh,
            box_bg_iou_thresh,
            box_batch_size_per_image,
            box_positive_fraction,
            bbox_reg_weights,
            box_score_thresh,
            box_nms_thresh,
            box_detections_per_img,
        )

        # ---- input transform ----
        image_mean = [0.485, 0.456, 0.406] if image_mean is None else image_mean
        image_std = [0.229, 0.224, 0.225] if image_std is None else image_std
        transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std, **kwargs)

        super().__init__(backbone, rpn, roi_heads, transform)
class TwoMLPHead(nn.Module):
    """
    Two-layer fully connected head: flatten, then two Linear + ReLU stages (``fc6``, ``fc7``).

    Described in the original source as the standard head for FPN-based models.

    Args:
        in_channels (int): number of input features after flattening
        representation_size (int): size of the intermediate (and output) representation
    """

    def __init__(self, in_channels, representation_size):
        super().__init__()

        # Attribute names are state-dict keys; keep them as-is.
        self.fc6 = nn.Linear(in_channels, representation_size)
        self.fc7 = nn.Linear(representation_size, representation_size)

    def forward(self, x):
        """Flatten every dimension after the first and apply both ReLU-activated layers."""
        flat = x.flatten(start_dim=1)
        hidden = F.relu(self.fc6(flat))
        return F.relu(self.fc7(hidden))
class FastRCNNConvFCHead(nn.Sequential):
    """
    Sequential head made of convolution blocks, a flatten step, and Linear + ReLU layers.
    """

    def __init__(
        self,
        input_size: Tuple[int, int, int],
        conv_layers: List[int],
        fc_layers: List[int],
        norm_layer: Optional[Callable[..., nn.Module]] = None,
    ):
        """
        Args:
            input_size (Tuple[int, int, int]): the input size in CHW format.
            conv_layers (list): feature dimensions of each Convolution layer
            fc_layers (list): feature dimensions of each FCN layer
            norm_layer (callable, optional): Module specifying the normalization layer to use. Default: None
        """
        in_channels, in_height, in_width = input_size

        # Convolution stage: consecutive (input, output) channel pairs, in order.
        conv_dims = [in_channels, *conv_layers]
        layers = [
            misc_nn_ops.Conv2dNormActivation(c_in, c_out, norm_layer=norm_layer)
            for c_in, c_out in zip(conv_dims, conv_dims[1:])
        ]

        layers.append(nn.Flatten())

        # Fully connected stage: the first Linear consumes the flattened C*H*W features,
        # using the spatial size given in input_size.
        fc_dims = [conv_dims[-1] * in_height * in_width, *fc_layers]
        for f_in, f_out in zip(fc_dims, fc_dims[1:]):
            layers.append(nn.Linear(f_in, f_out))
            layers.append(nn.ReLU(inplace=True))

        super().__init__(*layers)

        # Re-initialise every convolution: Kaiming-normal weights, zero bias when present.
        for module in self.modules():
            if not isinstance(module, nn.Conv2d):
                continue
            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
            if module.bias is not None:
                nn.init.zeros_(module.bias)
class FastRCNNPredictor(nn.Module):
    """
    Standard classification + bounding box regression layers for Fast R-CNN, plus an extra
    ``theta`` prediction layer.

    Args:
        in_channels (int): number of input channels
        num_classes (int): number of output classes (including background)
        num_theta_bins (int): the theta layer emits ``1 + num_theta_bins`` values per input row.
            Default: 1. NOTE(review): the meaning of the individual theta outputs is not visible
            in this class -- confirm against the code that consumes them.
    """

    def __init__(self, in_channels, num_classes, num_theta_bins=1):
        super().__init__()

        # Attribute names are state-dict keys; creation order is kept so random init is unchanged.
        self.cls_score = nn.Linear(in_channels, num_classes)
        self.bbox_pred = nn.Linear(in_channels, 4 * num_classes)
        self.theta_pred = nn.Linear(in_channels, num_theta_bins + 1)

    def forward(self, x):
        """
        Args:
            x (Tensor): 2-D features, or 4-D features whose last two dimensions are ``[1, 1]``.

        Returns:
            Tuple[Tensor, Tensor, Tensor]: ``(scores, bbox_deltas, theta_preds)``, the outputs of
            ``cls_score``, ``bbox_pred`` and ``theta_pred`` on the flattened input.
        """
        # A 4-D input is only accepted when its spatial extent is 1x1.
        if x.dim() == 4:
            torch._assert(
                list(x.shape[2:]) == [1, 1],
                f"x has the wrong shape, expecting the last two dimensions to be [1,1] instead of {list(x.shape[2:])}",
            )

        features = x.flatten(start_dim=1)
        return self.cls_score(features), self.bbox_pred(features), self.theta_pred(features)
# Metadata entries shared by the weights defined in this module: the COCO category names and
# a ``min_size`` of (1, 1).
_COMMON_META = dict(categories=_COCO_CATEGORIES, min_size=(1, 1))
class FasterRCNN_ResNet50_FPN_Weights(WeightsEnum):
    """Pretrained checkpoints accepted by ``fasterrcnn_resnet50_fpn``."""

    # Checkpoint evaluated on COCO-val2017; its metadata extends the shared _COMMON_META entries.
    COCO_V1 = Weights(
        url="https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth",
        transforms=ObjectDetection,
        meta=dict(
            _COMMON_META,
            num_params=41755286,
            recipe="https://github.com/pytorch/vision/tree/main/references/detection#faster-r-cnn-resnet-50-fpn",
            _metrics={"COCO-val2017": {"box_map": 37.0}},
            _ops=134.38,
            _file_size=159.743,
            _docs="""These weights were produced by following a similar training recipe as on the paper.""",
        ),
    )
    # Alias selecting the checkpoint used by default.
    DEFAULT = COCO_V1
# NOTE(review): model registration is disabled in this file -- presumably so this variant does
# not collide with torchvision's own registered ``fasterrcnn_resnet50_fpn``; confirm before
# re-enabling.
# @register_model()
def fasterrcnn_resnet50_fpn(
    *,
    weights: Optional[FasterRCNN_ResNet50_FPN_Weights] = None,
    progress: bool = True,
    num_classes: Optional[int] = None,
    weights_backbone: Optional[ResNet50_Weights] = ResNet50_Weights.IMAGENET1K_V1,
    trainable_backbone_layers: Optional[int] = None,
    **kwargs: Any,
) -> FasterRCNN:
    """
    Build a Faster R-CNN model with a ResNet-50-FPN backbone and a ``theta_pred`` box-head output.

    Args:
        weights (FasterRCNN_ResNet50_FPN_Weights, optional): pretrained detector weights to load.
            When given, ``weights_backbone`` is ignored and ``num_classes`` must agree with the
            number of categories stored in the weights' metadata.
        progress (bool): forwarded to the backbone constructor and to the checkpoint download.
        num_classes (int, optional): number of output classes (including the background).
            Defaults to the checkpoint's category count when ``weights`` is given, else to 91.
        weights_backbone (ResNet50_Weights, optional): pretrained weights for the backbone, used
            only when ``weights`` is None.
        trainable_backbone_layers (int, optional): validated by ``_validate_trainable_layers``
            (called with 5 and 3 -- presumably the maximum and the default; confirm there).
        **kwargs: forwarded to the ``FasterRCNN`` constructor.

    Returns:
        FasterRCNN: the constructed model, with checkpoint weights loaded when ``weights`` is given.

    Warns:
        UserWarning: if the checkpoint and the model disagree on any parameter other than the
            ``theta_pred`` head, which the checkpoint is expected not to contain.
    """
    import warnings

    weights = FasterRCNN_ResNet50_FPN_Weights.verify(weights)
    weights_backbone = ResNet50_Weights.verify(weights_backbone)

    if weights is not None:
        # Full detector weights already include the backbone; skip the separate backbone download.
        weights_backbone = None
        num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
    elif num_classes is None:
        num_classes = 91

    is_trained = weights is not None or weights_backbone is not None
    trainable_backbone_layers = _validate_trainable_layers(is_trained, trainable_backbone_layers, 5, 3)
    # Pretrained backbones use frozen batch-norm; a backbone trained from scratch uses BatchNorm2d.
    norm_layer = misc_nn_ops.FrozenBatchNorm2d if is_trained else nn.BatchNorm2d

    backbone = resnet50(weights=weights_backbone, progress=progress, norm_layer=norm_layer)
    backbone = _resnet_fpn_extractor(backbone, trainable_backbone_layers)
    model = FasterRCNN(backbone, num_classes=num_classes, **kwargs)

    if weights is not None:
        # Non-strict loading is needed because the checkpoint has no entries for theta_pred.
        # Inspect the result so that any OTHER mismatch is reported instead of silently ignored.
        theta_prefix = "roi_heads.box_predictor.theta_pred."
        load_result = model.load_state_dict(weights.get_state_dict(progress=progress), strict=False)
        missing_elsewhere = [key for key in load_result.missing_keys if not key.startswith(theta_prefix)]
        unexpected = list(load_result.unexpected_keys)
        if missing_elsewhere or unexpected:
            warnings.warn(
                "Pretrained checkpoint does not fully match the model: "
                f"missing keys {missing_elsewhere}, unexpected keys {unexpected}. "
                "Parameters listed as missing keep their initial values.",
                stacklevel=2,
            )

        # NOTE(review): indentation was lost in the reviewed source; the theta_pred
        # re-initialisation is assumed to belong to this branch -- confirm.
        theta_head = model.roi_heads.box_predictor.theta_pred
        torch.nn.init.kaiming_normal_(theta_head.weight, mode="fan_out", nonlinearity="relu")
        torch.nn.init.constant_(theta_head.bias, 0)

        if weights == FasterRCNN_ResNet50_FPN_Weights.COCO_V1:
            overwrite_eps(model, 0.0)

    return model