File size: 5,290 Bytes
8514022
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from typing import Dict, List, Optional, Union

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class RTMDetConfig(PretrainedConfig):
    """
    Configuration class for RTMDet models from OpenMMLab.

    Args:
        backbone_arch (`str`, *optional*, defaults to `"P5"`):
            Architecture of the backbone. Can be either "P5" or "P6".
        backbone_expand_ratio (`float`, *optional*, defaults to `0.5`):
            Expand ratio of the backbone channels.
        backbone_deepen_factor (`float`, *optional*, defaults to `1.0`):
            Factor to deepen the backbone stages.
        backbone_widen_factor (`float`, *optional*, defaults to `1.0`):
            Factor to widen the backbone channels.
        backbone_channel_attention (`bool`, *optional*, defaults to `True`):
            Whether to use channel attention in the backbone.
        neck_in_channels (`List[int]`, *optional*, defaults to `[256, 512, 1024]`):
            Input channels for the neck.
        neck_out_channels (`int`, *optional*, defaults to `256`):
            Output channels for the neck.
        neck_num_csp_blocks (`int`, *optional*, defaults to `3`):
            Number of CSP blocks in the neck.
        neck_expand_ratio (`float`, *optional*, defaults to `0.5`):
            Expand ratio for the neck channels.
        num_classes (`int`, *optional*, defaults to `80`):
            Number of classes to predict.
        head_in_channels (`int`, *optional*, defaults to `256`):
            Input channels for the detection head.
        head_stacked_convs (`int`, *optional*, defaults to `2`):
            Number of stacked convolutions in the head.
        head_feat_channels (`int`, *optional*, defaults to `256`):
            Number of feature channels in the head.
        head_with_objectness (`bool`, *optional*, defaults to `False`):
            Whether to use objectness in the head.
        head_exp_on_reg (`bool`, *optional*, defaults to `True`):
            Whether to use exponential function on the regression branch.
        head_share_conv (`bool`, *optional*, defaults to `True`):
            Whether to share convolutions between classes in the head.
        head_pred_kernel_size (`int`, *optional*, defaults to `1`):
            Kernel size for the prediction layer in the head.
        strides (`List[int]`, *optional*, defaults to `[8, 16, 32]`):
            Strides for multi-scale feature maps.
        input_size (`List[int]`, *optional*, defaults to `[640, 640]`):
            Default input image size [width, height].
        score_threshold (`float`, *optional*, defaults to `0.05`):
            Score threshold for detections.
        nms_threshold (`float`, *optional*, defaults to `0.6`):
            NMS IoU threshold.
        max_detections (`int`, *optional*, defaults to `100`):
            Maximum number of detections to return.
        **kwargs:
            Additional parameters passed to the parent class.
    """

    model_type = "rtmdet"

    def __init__(
        self,
        backbone_arch: str = "P5",
        backbone_expand_ratio: float = 0.5,
        backbone_deepen_factor: float = 1.0,
        backbone_widen_factor: float = 1.0,
        backbone_channel_attention: bool = True,
        # List-valued parameters default to None (not a list literal) to avoid
        # the shared-mutable-default pitfall: a literal default would be one
        # list object shared by every call and every config instance.
        neck_in_channels: Optional[List[int]] = None,
        neck_out_channels: int = 256,
        neck_num_csp_blocks: int = 3,
        neck_expand_ratio: float = 0.5,
        num_classes: int = 80,
        head_in_channels: int = 256,
        head_stacked_convs: int = 2,
        head_feat_channels: int = 256,
        head_with_objectness: bool = False,
        head_exp_on_reg: bool = True,
        head_share_conv: bool = True,
        head_pred_kernel_size: int = 1,
        strides: Optional[List[int]] = None,
        input_size: Optional[List[int]] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Backbone config
        self.backbone_arch = backbone_arch
        self.backbone_expand_ratio = backbone_expand_ratio
        self.backbone_deepen_factor = backbone_deepen_factor
        self.backbone_widen_factor = backbone_widen_factor
        self.backbone_channel_attention = backbone_channel_attention

        # Neck config — fresh list per instance when the caller passed nothing.
        self.neck_in_channels = neck_in_channels if neck_in_channels is not None else [256, 512, 1024]
        self.neck_out_channels = neck_out_channels
        self.neck_num_csp_blocks = neck_num_csp_blocks
        self.neck_expand_ratio = neck_expand_ratio

        # Head config
        self.num_classes = num_classes
        self.head_in_channels = head_in_channels
        self.head_stacked_convs = head_stacked_convs
        self.head_feat_channels = head_feat_channels
        self.head_with_objectness = head_with_objectness
        self.head_exp_on_reg = head_exp_on_reg
        self.head_share_conv = head_share_conv
        self.head_pred_kernel_size = head_pred_kernel_size
        self.strides = strides if strides is not None else [8, 16, 32]

        # Inference config — input_size is [width, height].
        self.input_size = input_size if input_size is not None else [640, 640]