File size: 5,290 Bytes
8514022
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from typing import Dict, List, Optional, Union

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class RTMDetConfig(PretrainedConfig):
    """
    Configuration class for RTMDet models from OpenMMLab.

    Args:
        backbone_arch (`str`, *optional*, defaults to `"P5"`):
            Architecture of the backbone. Can be either "P5" or "P6".
        backbone_expand_ratio (`float`, *optional*, defaults to `0.5`):
            Expand ratio of the backbone channels.
        backbone_deepen_factor (`float`, *optional*, defaults to `1.0`):
            Factor to deepen the backbone stages.
        backbone_widen_factor (`float`, *optional*, defaults to `1.0`):
            Factor to widen the backbone channels.
        backbone_channel_attention (`bool`, *optional*, defaults to `True`):
            Whether to use channel attention in the backbone.
        neck_in_channels (`List[int]`, *optional*, defaults to `[256, 512, 1024]`):
            Input channels for the neck.
        neck_out_channels (`int`, *optional*, defaults to `256`):
            Output channels for the neck.
        neck_num_csp_blocks (`int`, *optional*, defaults to `3`):
            Number of CSP blocks in the neck.
        neck_expand_ratio (`float`, *optional*, defaults to `0.5`):
            Expand ratio for the neck channels.
        num_classes (`int`, *optional*, defaults to `80`):
            Number of classes to predict.
        head_in_channels (`int`, *optional*, defaults to `256`):
            Input channels for the detection head.
        head_stacked_convs (`int`, *optional*, defaults to `2`):
            Number of stacked convolutions in the head.
        head_feat_channels (`int`, *optional*, defaults to `256`):
            Number of feature channels in the head.
        head_with_objectness (`bool`, *optional*, defaults to `False`):
            Whether to use objectness in the head.
        head_exp_on_reg (`bool`, *optional*, defaults to `True`):
            Whether to use exponential function on the regression branch.
        head_share_conv (`bool`, *optional*, defaults to `True`):
            Whether to share convolutions between classes in the head.
        head_pred_kernel_size (`int`, *optional*, defaults to `1`):
            Kernel size for the prediction layer in the head.
        strides (`List[int]`, *optional*, defaults to `[8, 16, 32]`):
            Strides for multi-scale feature maps.
        input_size (`List[int]`, *optional*, defaults to `[640, 640]`):
            Default input image size [width, height].
        score_threshold (`float`, *optional*, defaults to `0.05`):
            Score threshold for detections.
        nms_threshold (`float`, *optional*, defaults to `0.6`):
            NMS IoU threshold.
        max_detections (`int`, *optional*, defaults to `100`):
            Maximum number of detections to return.
        **kwargs:
            Additional parameters passed to the parent class.
    """

    model_type = "rtmdet"

    def __init__(
        self,
        backbone_arch: str = "P5",
        backbone_expand_ratio: float = 0.5,
        backbone_deepen_factor: float = 1.0,
        backbone_widen_factor: float = 1.0,
        backbone_channel_attention: bool = True,
        # List-valued parameters default to None (not a list literal) to avoid
        # the shared-mutable-default pitfall: a literal default would be one
        # list object shared by every call and every config instance.
        neck_in_channels: Optional[List[int]] = None,
        neck_out_channels: int = 256,
        neck_num_csp_blocks: int = 3,
        neck_expand_ratio: float = 0.5,
        num_classes: int = 80,
        head_in_channels: int = 256,
        head_stacked_convs: int = 2,
        head_feat_channels: int = 256,
        head_with_objectness: bool = False,
        head_exp_on_reg: bool = True,
        head_share_conv: bool = True,
        head_pred_kernel_size: int = 1,
        strides: Optional[List[int]] = None,
        input_size: Optional[List[int]] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Backbone config
        self.backbone_arch = backbone_arch
        self.backbone_expand_ratio = backbone_expand_ratio
        self.backbone_deepen_factor = backbone_deepen_factor
        self.backbone_widen_factor = backbone_widen_factor
        self.backbone_channel_attention = backbone_channel_attention

        # Neck config — fresh list per instance when the caller passed nothing.
        self.neck_in_channels = neck_in_channels if neck_in_channels is not None else [256, 512, 1024]
        self.neck_out_channels = neck_out_channels
        self.neck_num_csp_blocks = neck_num_csp_blocks
        self.neck_expand_ratio = neck_expand_ratio

        # Head config
        self.num_classes = num_classes
        self.head_in_channels = head_in_channels
        self.head_stacked_convs = head_stacked_convs
        self.head_feat_channels = head_feat_channels
        self.head_with_objectness = head_with_objectness
        self.head_exp_on_reg = head_exp_on_reg
        self.head_share_conv = head_share_conv
        self.head_pred_kernel_size = head_pred_kernel_size
        self.strides = strides if strides is not None else [8, 16, 32]

        # Inference config — input_size is [width, height].
        self.input_size = input_size if input_size is not None else [640, 640]