akore commited on
Commit
8514022
·
verified ·
1 Parent(s): cbce20a

Add rtmdet-tiny RTMW/RTMDet HF port

Browse files
README.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - object-detection
5
+ - person-detection
6
+ - rtmdet
7
+ - real-time
8
+ - computer-vision
9
+ pipeline_tag: object-detection
10
+ ---
11
+
12
+ # rtmdet-tiny
13
+
14
+ This is a Hugging Face-compatible port of **rtmdet-tiny** from [OpenMMLab MMDetection](https://github.com/open-mmlab/mmdetection).
15
+
16
+ RTMDet is a family of real-time object detectors based on the CSPNeXt architecture. This checkpoint is pretrained on COCO and is particularly well-suited for **person detection** as a first stage before wholebody pose estimation with [RTMW](https://huggingface.co/akore/rtmw-l-384x288).
17
+
18
+ ## Model description
19
+
20
+ - **Architecture**: CSPNeXt backbone + CSPNeXtPAFPN neck + RTMDetHead
21
+ - **Backbone scale**: deepen=0.167, widen=0.375 (~~5M parameters)
22
+ - **Input size**: 640×640
23
+ - **Classes**: 80 (COCO)
24
+ - **Uses custom code** — load with `trust_remote_code=True`
25
+
26
+ ## Usage
27
+
28
+ ```python
29
+ from transformers import AutoImageProcessor
30
+ from PIL import Image
31
+ import torch
32
+
33
+ from rtmdet_modules.configuration_rtmdet import RTMDetConfig
34
+ from rtmdet_modules.modeling_rtmdet import RTMDetModel
35
+
36
+ config = RTMDetConfig.from_pretrained("akore/rtmdet-tiny", trust_remote_code=True)
37
+ model = RTMDetModel.from_pretrained("akore/rtmdet-tiny", trust_remote_code=True)
38
+ model.eval()
39
+
40
+ processor = AutoImageProcessor.from_pretrained("akore/rtmdet-tiny")
41
+ image = Image.open("your_image.jpg").convert("RGB")
42
+ inputs = processor(images=image, return_tensors="pt")
43
+
44
+ with torch.no_grad():
45
+ outputs = model(pixel_values=inputs["pixel_values"])
46
+
47
+ # outputs["boxes"]: (N, 4) in [x1, y1, x2, y2]
48
+ # outputs["scores"]: (N,)
49
+ # outputs["labels"]: (N,) — 0 = person in COCO
50
+ print(outputs)
51
+ ```
52
+
53
+ ## Citation
54
+
55
+ ```bibtex
56
+ @misc{lyu2022rtmdet,
57
+ title={RTMDet: An Empirical Study of Designing Real-Time Object Detectors},
58
+ author={Chengqi Lyu and Wenwei Zhang and Haian Huang and Yue Zhou and Yudong Wang and Yanyi Liu and Shilong Zhang and Kai Chen},
59
+ year={2022},
60
+ eprint={2212.07784},
61
+ archivePrefix={arXiv},
62
+ primaryClass={cs.CV}
63
+ }
64
+ ```
config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backbone_arch": "P5",
3
+ "backbone_channel_attention": true,
4
+ "backbone_deepen_factor": 0.167,
5
+ "backbone_expand_ratio": 0.5,
6
+ "backbone_widen_factor": 0.375,
7
+ "head_exp_on_reg": false,
8
+ "head_feat_channels": 96,
9
+ "head_in_channels": 96,
10
+ "head_pred_kernel_size": 1,
11
+ "head_share_conv": true,
12
+ "head_stacked_convs": 2,
13
+ "head_with_objectness": false,
14
+ "input_size": [
15
+ 640,
16
+ 640
17
+ ],
18
+ "max_detections": 100,
19
+ "model_type": "rtmdet",
20
+ "neck_expand_ratio": 0.5,
21
+ "neck_in_channels": [
22
+ 96,
23
+ 192,
24
+ 384
25
+ ],
26
+ "neck_num_csp_blocks": 1,
27
+ "neck_out_channels": 96,
28
+ "nms_threshold": 0.6,
29
+ "num_classes": 80,
30
+ "score_threshold": 0.05,
31
+ "strides": [
32
+ 8,
33
+ 16,
34
+ 32
35
+ ],
36
+ "transformers_version": "5.2.0",
37
+ "auto_map": {
38
+ "AutoConfig": "configuration_rtmdet.RTMDetConfig",
39
+ "AutoModelForImageProcessing": "modeling_rtmdet.RTMDetModel"
40
+ }
41
+ }
configuration_rtmdet.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Optional, Union
2
+
3
+ from transformers.configuration_utils import PretrainedConfig
4
+ from transformers.utils import logging
5
+
6
+
7
+ logger = logging.get_logger(__name__)
8
+
9
+
10
class RTMDetConfig(PretrainedConfig):
    """
    Configuration class for RTMDet models from OpenMMLab.

    Args:
        backbone_arch (`str`, *optional*, defaults to `"P5"`):
            Architecture of the backbone. Can be either "P5" or "P6".
        backbone_expand_ratio (`float`, *optional*, defaults to `0.5`):
            Expand ratio of the backbone channels.
        backbone_deepen_factor (`float`, *optional*, defaults to `1.0`):
            Factor to deepen the backbone stages.
        backbone_widen_factor (`float`, *optional*, defaults to `1.0`):
            Factor to widen the backbone channels.
        backbone_channel_attention (`bool`, *optional*, defaults to `True`):
            Whether to use channel attention in the backbone.
        neck_in_channels (`List[int]`, *optional*, defaults to `[256, 512, 1024]`):
            Input channels for the neck.
        neck_out_channels (`int`, *optional*, defaults to `256`):
            Output channels for the neck.
        neck_num_csp_blocks (`int`, *optional*, defaults to `3`):
            Number of CSP blocks in the neck.
        neck_expand_ratio (`float`, *optional*, defaults to `0.5`):
            Expand ratio for the neck channels.
        num_classes (`int`, *optional*, defaults to `80`):
            Number of classes to predict.
        head_in_channels (`int`, *optional*, defaults to `256`):
            Input channels for the detection head.
        head_stacked_convs (`int`, *optional*, defaults to `2`):
            Number of stacked convolutions in the head.
        head_feat_channels (`int`, *optional*, defaults to `256`):
            Number of feature channels in the head.
        head_with_objectness (`bool`, *optional*, defaults to `False`):
            Whether to use objectness in the head.
        head_exp_on_reg (`bool`, *optional*, defaults to `True`):
            Whether to use exponential function on the regression branch.
        head_share_conv (`bool`, *optional*, defaults to `True`):
            Whether to share convolutions between classes in the head.
        head_pred_kernel_size (`int`, *optional*, defaults to `1`):
            Kernel size for the prediction layer in the head.
        strides (`List[int]`, *optional*, defaults to `[8, 16, 32]`):
            Strides for multi-scale feature maps.
        input_size (`List[int]`, *optional*, defaults to `[640, 640]`):
            Default input image size [width, height].
        score_threshold (`float`, *optional*, defaults to `0.05`):
            Score threshold for detections.
        nms_threshold (`float`, *optional*, defaults to `0.6`):
            NMS IoU threshold.
        max_detections (`int`, *optional*, defaults to `100`):
            Maximum number of detections to return.
        **kwargs:
            Additional parameters passed to the parent class.
    """

    model_type = "rtmdet"

    def __init__(
        self,
        backbone_arch: str = "P5",
        backbone_expand_ratio: float = 0.5,
        backbone_deepen_factor: float = 1.0,
        backbone_widen_factor: float = 1.0,
        backbone_channel_attention: bool = True,
        neck_in_channels: Optional[List[int]] = None,
        neck_out_channels: int = 256,
        neck_num_csp_blocks: int = 3,
        neck_expand_ratio: float = 0.5,
        num_classes: int = 80,
        head_in_channels: int = 256,
        head_stacked_convs: int = 2,
        head_feat_channels: int = 256,
        head_with_objectness: bool = False,
        head_exp_on_reg: bool = True,
        head_share_conv: bool = True,
        head_pred_kernel_size: int = 1,
        strides: Optional[List[int]] = None,
        input_size: Optional[List[int]] = None,
        score_threshold: float = 0.05,
        nms_threshold: float = 0.6,
        max_detections: int = 100,
        **kwargs
    ):
        super().__init__(**kwargs)

        # Backbone config
        self.backbone_arch = backbone_arch
        self.backbone_expand_ratio = backbone_expand_ratio
        self.backbone_deepen_factor = backbone_deepen_factor
        self.backbone_widen_factor = backbone_widen_factor
        self.backbone_channel_attention = backbone_channel_attention

        # Neck config
        # List defaults are materialized per instance here (instead of as
        # mutable default arguments in the signature) so that mutating one
        # config's list can never leak into another config.
        self.neck_in_channels = (
            list(neck_in_channels) if neck_in_channels is not None
            else [256, 512, 1024]
        )
        self.neck_out_channels = neck_out_channels
        self.neck_num_csp_blocks = neck_num_csp_blocks
        self.neck_expand_ratio = neck_expand_ratio

        # Head config
        self.num_classes = num_classes
        self.head_in_channels = head_in_channels
        self.head_stacked_convs = head_stacked_convs
        self.head_feat_channels = head_feat_channels
        self.head_with_objectness = head_with_objectness
        self.head_exp_on_reg = head_exp_on_reg
        self.head_share_conv = head_share_conv
        self.head_pred_kernel_size = head_pred_kernel_size
        self.strides = list(strides) if strides is not None else [8, 16, 32]

        # Inference config
        self.input_size = (
            list(input_size) if input_size is not None else [640, 640]
        )
        self.score_threshold = score_threshold
        self.nms_threshold = nms_threshold
        self.max_detections = max_detections
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:931fc4f5ab39801c5079167c825774f233781c72611104168774bc045d5b92f6
3
+ size 22362944
modeling_rtmdet.py ADDED
@@ -0,0 +1,1886 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Tuple, Union, Sequence, Dict
2
+ from dataclasses import dataclass
3
+ import inspect
4
+ from functools import partial
5
+ import warnings
6
+
7
+ import math
8
+ import torch
9
+ import torchvision
10
+ import torch.nn as nn
11
+ from torch import Tensor
12
+ import torch.nn.functional as F
13
+ from torch.nn.modules.batchnorm import _BatchNorm, SyncBatchNorm
14
+
15
+ from transformers.modeling_outputs import ModelOutput
16
+ from transformers.modeling_utils import PreTrainedModel
17
+ from transformers.utils import logging
18
+
19
+ from .configuration_rtmdet import RTMDetConfig
20
+
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
@dataclass
class DetectionOutput(ModelOutput):
    """
    Output type for object detection models.

    Args:
        boxes (`torch.FloatTensor` of shape `(batch_size, num_boxes, 4)`):
            Detection boxes in format [x1, y1, x2, y2].
        scores (`torch.FloatTensor` of shape `(batch_size, num_boxes)`):
            Detection confidence scores.
        labels (`torch.LongTensor` of shape `(batch_size, num_boxes)`):
            Detection class indices.
        loss (`torch.FloatTensor`, *optional*):
            Loss value if training.
    """

    # All fields default to None; the Optional annotations make that
    # contract explicit for type checkers.
    boxes: Optional[torch.FloatTensor] = None
    scores: Optional[torch.FloatTensor] = None
    labels: Optional[torch.LongTensor] = None
    loss: Optional[torch.FloatTensor] = None
44
+
45
+
46
# Replace MODELS registry with direct class mappings
ACTIVATION_LAYERS = {
    'ReLU': nn.ReLU,
    'LeakyReLU': nn.LeakyReLU,
    'PReLU': nn.PReLU,
    'SiLU': nn.SiLU,
    'Sigmoid': nn.Sigmoid,
    'Tanh': nn.Tanh,
    'GELU': nn.GELU,
    'Swish': nn.SiLU,  # Swish is equivalent to SiLU
    'Hardsigmoid': nn.Hardsigmoid,
    'HSigmoid': nn.Hardsigmoid
}

# Simple Config Type replacement
ConfigType = Dict
OptConfigType = Optional[Dict]
OptMultiConfig = Optional[Union[Dict, List[Dict]]]


def build_activation_layer(cfg: Dict) -> nn.Module:
    """Build an activation layer from a config dict.

    Args:
        cfg (dict): Must contain the key ``'type'`` naming an entry in
            ``ACTIVATION_LAYERS``; every other key is forwarded to the
            layer's constructor.

    Returns:
        nn.Module: The instantiated activation layer.

    Raises:
        TypeError: If ``cfg`` is not a dict.
        KeyError: If ``'type'`` is missing or names an unknown layer.
    """
    if not isinstance(cfg, dict):
        raise TypeError('cfg must be a dict')
    if 'type' not in cfg:
        raise KeyError('the cfg dict must contain the key "type"')

    # Work on a copy so the caller's config dict is never mutated.
    options = dict(cfg)
    layer_name = options.pop('type')
    if layer_name not in ACTIVATION_LAYERS:
        raise KeyError(f'Unrecognized activation type {layer_name}')

    return ACTIVATION_LAYERS[layer_name](**options)
87
+
88
def kaiming_init(module,
                 a=0,
                 mode='fan_out',
                 nonlinearity='relu',
                 bias=0,
                 distribution='normal'):
    """Initialize a module with Kaiming (He) weight init and constant bias.

    Modules without a ``weight`` or ``bias`` attribute (or with them set to
    ``None``) are silently skipped for that part.
    """
    assert distribution in ['uniform', 'normal']
    weight = getattr(module, 'weight', None)
    if weight is not None:
        initializer = (nn.init.kaiming_uniform_ if distribution == 'uniform'
                       else nn.init.kaiming_normal_)
        initializer(weight, a=a, mode=mode, nonlinearity=nonlinearity)
    if getattr(module, 'bias', None) is not None:
        nn.init.constant_(module.bias, bias)
104
+
105
def constant_init(module, val, bias=0):
    """Fill ``module.weight`` with ``val`` and ``module.bias`` with ``bias``.

    Either attribute is skipped when absent or ``None``.
    """
    weight = getattr(module, 'weight', None)
    if weight is not None:
        nn.init.constant_(weight, val)
    if getattr(module, 'bias', None) is not None:
        nn.init.constant_(module.bias, bias)
110
+
111
class _InstanceNorm(nn.modules.instancenorm._InstanceNorm):
    """Instance Normalization Base Class."""
    pass


# Custom implementation of methods with asterisks that couldn't be included in
# the original code. These methods need to be renamed without asterisks in
# actual implementation.

def infer_abbr(class_type):
    """Infer a short abbreviation ('bn', 'gn', 'ln', 'in', ...) for a norm class.

    Resolution order: an explicit ``_abbr_`` attribute wins, then subclass
    checks (instance norm before batch norm, since IN subclasses share base
    machinery with BN), then a substring match on the class name, and finally
    the generic fallback ``'norm_layer'``.
    """
    if not inspect.isclass(class_type):
        raise TypeError(
            f'class_type must be a type, but got {type(class_type)}')
    if hasattr(class_type, '_abbr_'):
        return class_type._abbr_

    # Subclass-based resolution; order matters (IN is a subclass of BN).
    for base, abbr in ((_InstanceNorm, 'in'), (_BatchNorm, 'bn'),
                       (nn.GroupNorm, 'gn'), (nn.LayerNorm, 'ln')):
        if issubclass(class_type, base):
            return abbr

    # Fall back to matching tokens in the (lower-cased) class name.
    lowered = class_type.__name__.lower()
    for token, abbr in (('batch', 'bn'), ('group', 'gn'),
                        ('layer', 'ln'), ('instance', 'in')):
        if token in lowered:
            return abbr
    return 'norm_layer'
145
+
146
# Create mapping from strings to layer classes
# Registry of normalization layers, keyed by the 'type' string used in
# config dicts (mirrors the mmcv registry this port replaces).
NORM_LAYERS = {
    'BN': nn.BatchNorm2d,
    'BN1d': nn.BatchNorm1d,
    'BN2d': nn.BatchNorm2d,
    'BN3d': nn.BatchNorm3d,
    'SyncBN': SyncBatchNorm,
    'GN': nn.GroupNorm,
    'LN': nn.LayerNorm,
    'IN': nn.InstanceNorm2d,
    'IN1d': nn.InstanceNorm1d,
    'IN2d': nn.InstanceNorm2d,
    'IN3d': nn.InstanceNorm3d
}

# Registry of convolution layers; bare 'Conv' defaults to 2-D convolution.
CONV_LAYERS = {
    'Conv1d': nn.Conv1d,
    'Conv2d': nn.Conv2d,
    'Conv3d': nn.Conv3d,
    'Conv': nn.Conv2d
}

# Registry of explicit padding layers used when a ConvModule is configured
# with a non-official padding_mode.
PADDING_LAYERS = {
    'zero': nn.ZeroPad2d,
    'reflect': nn.ReflectionPad2d,
    'replicate': nn.ReplicationPad2d
}
173
+
174
def build_norm_layer(cfg: Dict,
                     num_features: int,
                     postfix: Union[int, str] = '') -> Tuple[str, nn.Module]:
    """Build a normalization layer from a config dict.

    Args:
        cfg (dict): Must contain ``'type'`` naming an entry in
            ``NORM_LAYERS``; may also carry ``requires_grad`` and any
            constructor kwargs (``eps`` defaults to 1e-5).
        num_features (int): Channel count the norm layer normalizes over.
        postfix (int | str): Appended to the inferred abbreviation to form
            the returned attribute name.

    Returns:
        tuple[str, nn.Module]: The layer's name (e.g. ``'bn1'``) and the
        instantiated layer.
    """
    if not isinstance(cfg, dict):
        raise TypeError('cfg must be a dict')
    if 'type' not in cfg:
        raise KeyError('the cfg dict must contain the key "type"')

    options = dict(cfg)
    layer_type = options.pop('type')
    if layer_type not in NORM_LAYERS:
        raise KeyError(f'Unrecognized norm type {layer_type}')
    norm_cls = NORM_LAYERS[layer_type]

    assert isinstance(postfix, (int, str))
    name = infer_abbr(norm_cls) + str(postfix)

    # requires_grad is a registry-level option, not a constructor kwarg.
    requires_grad = options.pop('requires_grad', True)
    options.setdefault('eps', 1e-5)

    if norm_cls is nn.GroupNorm:
        assert 'num_groups' in options
        layer = norm_cls(num_channels=num_features, **options)
    else:
        layer = norm_cls(num_features, **options)
        # Legacy hook for older SyncBatchNorm implementations.
        if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'):
            layer._specify_ddp_gpu_num(1)

    for param in layer.parameters():
        param.requires_grad = requires_grad

    return name, layer
210
+
211
def build_conv_layer(cfg: Optional[Dict], *args, **kwargs) -> nn.Module:
    """Build a convolution layer from a config dict.

    Args:
        cfg (dict | None): Conv config with a ``'type'`` key plus extra
            constructor kwargs; ``None`` falls back to a plain ``Conv2d``.
        *args: Positional arguments forwarded to the conv constructor.
        **kwargs: Keyword arguments forwarded to the conv constructor.

    Returns:
        nn.Module: The instantiated convolution layer.
    """
    if cfg is None:
        options = {'type': 'Conv2d'}
    elif not isinstance(cfg, dict):
        raise TypeError('cfg must be a dict')
    elif 'type' not in cfg:
        raise KeyError('the cfg dict must contain the key "type"')
    else:
        options = dict(cfg)

    layer_type = options.pop('type')
    if layer_type not in CONV_LAYERS:
        raise KeyError(f'Unrecognized conv type {layer_type}')

    return CONV_LAYERS[layer_type](*args, **kwargs, **options)
231
+
232
def build_padding_layer(cfg: Dict, *args, **kwargs) -> nn.Module:
    """Build an explicit padding layer from a config dict.

    Args:
        cfg (dict): Must contain ``'type'`` naming an entry in
            ``PADDING_LAYERS``; remaining keys go to the constructor.
        *args: Positional arguments forwarded to the padding constructor.
        **kwargs: Keyword arguments forwarded to the padding constructor.

    Returns:
        nn.Module: The instantiated padding layer.
    """
    if not isinstance(cfg, dict):
        raise TypeError('cfg must be a dict')
    if 'type' not in cfg:
        raise KeyError('the cfg dict must contain the key "type"')

    options = dict(cfg)
    padding_type = options.pop('type')
    if padding_type not in PADDING_LAYERS:
        raise KeyError(f'Unrecognized padding type {padding_type}')

    return PADDING_LAYERS[padding_type](*args, **kwargs, **options)
249
+
250
def efficient_conv_bn_eval_forward(bn: _BatchNorm,
                                   conv: nn.modules.conv._ConvNd,
                                   x: torch.Tensor):
    """
    Implementation based on https://arxiv.org/abs/2305.11624
    "Tune-Mode ConvBN Blocks For Efficient Transfer Learning"
    It leverages the associative law between convolution and affine transform,
    i.e., normalize (weight conv feature) = (normalize weight) conv feature.
    It works for Eval mode of ConvBN blocks during validation, and can be used
    for training as well. It reduces memory and computation cost.
    Args:
        bn (_BatchNorm): a BatchNorm module.
        conv (nn._ConvNd): a conv module
        x (torch.Tensor): Input feature map.
    """
    # These lines of code are designed to deal with various cases
    # like bn without affine transform, and conv without bias
    weight_on_the_fly = conv.weight
    if conv.bias is not None:
        bias_on_the_fly = conv.bias
    else:
        # No conv bias: substitute zeros so the algebra below stays uniform.
        bias_on_the_fly = torch.zeros_like(bn.running_var)
    if bn.weight is not None:
        bn_weight = bn.weight
    else:
        # Non-affine BN: gamma defaults to ones.
        bn_weight = torch.ones_like(bn.running_var)
    if bn.bias is not None:
        bn_bias = bn.bias
    else:
        # Non-affine BN: beta defaults to zeros.
        bn_bias = torch.zeros_like(bn.running_var)
    # shape of [C_out, 1, 1, 1] in Conv2d
    # 1 / sqrt(running_var + eps), broadcastable against the conv weight.
    weight_coeff = torch.rsqrt(bn.running_var +
                               bn.eps).reshape([-1] + [1] *
                                               (len(conv.weight.shape) - 1))
    # shape of [C_out, 1, 1, 1] in Conv2d
    coefff_on_the_fly = bn_weight.view_as(weight_coeff) * weight_coeff
    # shape of [C_out, C_in, k, k] in Conv2d
    # Fold the BN scale into the conv weight ...
    weight_on_the_fly = weight_on_the_fly * coefff_on_the_fly
    # shape of [C_out] in Conv2d
    # ... and the BN shift (minus the scaled running mean) into the bias.
    bias_on_the_fly = bn_bias + coefff_on_the_fly.flatten() *\
        (bias_on_the_fly - bn.running_mean)
    return conv._conv_forward(x, weight_on_the_fly, bias_on_the_fly)
292
+
293
class ConvModule(nn.Module):
    """A conv block that bundles conv/norm/activation layers.

    The three sub-layers are applied in the order given by ``order``
    (default conv -> norm -> act). When ``bias='auto'`` the conv bias is
    disabled whenever a norm layer follows, since the norm's shift makes
    it redundant.
    """
    _abbr_ = 'conv_block'

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: Union[int, Tuple[int, int]],
                 stride: Union[int, Tuple[int, int]] = 1,
                 padding: Union[int, Tuple[int, int]] = 0,
                 dilation: Union[int, Tuple[int, int]] = 1,
                 groups: int = 1,
                 bias: Union[bool, str] = 'auto',
                 conv_cfg: Optional[Dict] = None,
                 norm_cfg: Optional[Dict] = None,
                 act_cfg: Optional[Dict] = dict(type='ReLU'),
                 inplace: bool = True,
                 with_spectral_norm: bool = False,
                 padding_mode: str = 'zeros',
                 order: tuple = ('conv', 'norm', 'act'),
                 efficient_conv_bn_eval: bool = False):
        super().__init__()
        assert conv_cfg is None or isinstance(conv_cfg, dict)
        assert norm_cfg is None or isinstance(norm_cfg, dict)
        assert act_cfg is None or isinstance(act_cfg, dict)
        # Padding modes PyTorch's Conv handles natively; anything else is
        # implemented with an explicit padding layer before the conv.
        official_padding_mode = ['zeros', 'circular']
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg
        self.inplace = inplace
        self.with_spectral_norm = with_spectral_norm
        self.with_explicit_padding = padding_mode not in official_padding_mode
        self.order = order
        assert isinstance(self.order, tuple) and len(self.order) == 3
        assert set(order) == {'conv', 'norm', 'act'}
        self.with_norm = norm_cfg is not None
        self.with_activation = act_cfg is not None
        # if the conv layer is before a norm layer, bias is unnecessary.
        if bias == 'auto':
            bias = not self.with_norm
        self.with_bias = bias

        if self.with_explicit_padding:
            pad_cfg = dict(type=padding_mode)
            self.padding_layer = build_padding_layer(pad_cfg, padding)

        # reset padding to 0 for conv module
        conv_padding = 0 if self.with_explicit_padding else padding

        # build convolution layer
        self.conv = build_conv_layer(
            conv_cfg,
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=conv_padding,
            dilation=dilation,
            groups=groups,
            bias=bias)

        # export the attributes of self.conv to a higher level for convenience
        self.in_channels = self.conv.in_channels
        self.out_channels = self.conv.out_channels
        self.kernel_size = self.conv.kernel_size
        self.stride = self.conv.stride
        self.padding = padding
        self.dilation = self.conv.dilation
        self.transposed = self.conv.transposed
        self.output_padding = self.conv.output_padding
        self.groups = self.conv.groups

        if self.with_spectral_norm:
            self.conv = nn.utils.spectral_norm(self.conv)

        # build normalization layers
        if self.with_norm:
            # norm layer is after conv layer
            if order.index('norm') > order.index('conv'):
                norm_channels = out_channels
            else:
                norm_channels = in_channels
            self.norm_name, norm = build_norm_layer(
                norm_cfg, norm_channels)  # type: ignore
            self.add_module(self.norm_name, norm)
            if self.with_bias:
                if isinstance(norm, (_BatchNorm, _InstanceNorm)):
                    warnings.warn(
                        'Unnecessary conv bias before batch/instance norm')
        else:
            self.norm_name = None  # type: ignore

        self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval)

        # build activation layer
        if self.with_activation:
            act_cfg_ = act_cfg.copy()  # type: ignore
            # nn.Tanh has no 'inplace' argument
            if act_cfg_['type'] not in [
                    'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish', 'GELU'
            ]:
                act_cfg_.setdefault('inplace', inplace)
            self.activate = build_activation_layer(act_cfg_)

        # Use msra init by default
        self.init_weights()

    @property
    def norm(self):
        # Resolve the norm submodule by the name recorded at build time
        # (None when the block was configured without normalization).
        if self.norm_name:
            return getattr(self, self.norm_name)
        else:
            return None

    def init_weights(self):
        # Kaiming ("msra") init, matched to the activation's nonlinearity.
        # Skipped when the conv defines its own init_weights (e.g. a custom
        # conv implementation with pretrained-aware initialization).
        if not hasattr(self.conv, 'init_weights'):
            if self.with_activation and self.act_cfg['type'] == 'LeakyReLU':
                nonlinearity = 'leaky_relu'
                a = self.act_cfg.get('negative_slope', 0.01)
            else:
                nonlinearity = 'relu'
                a = 0
            kaiming_init(self.conv, a=a, nonlinearity=nonlinearity)
        if self.with_norm:
            constant_init(self.norm, 1, bias=0)

    def forward(self,
                x: torch.Tensor,
                activate: bool = True,
                norm: bool = True) -> torch.Tensor:
        """Run the configured conv/norm/act sequence on ``x``.

        ``activate``/``norm`` allow callers to skip those stages for a
        single call without reconfiguring the module.
        """
        layer_index = 0
        while layer_index < len(self.order):
            layer = self.order[layer_index]
            if layer == 'conv':
                if self.with_explicit_padding:
                    x = self.padding_layer(x)
                # if the next operation is norm and we have a norm layer in
                # eval mode and we have enabled `efficient_conv_bn_eval` for
                # the conv operator, then activate the optimized forward and
                # skip the next norm operator since it has been fused
                if layer_index + 1 < len(self.order) and \
                        self.order[layer_index + 1] == 'norm' and norm and \
                        self.with_norm and not self.norm.training and \
                        self.efficient_conv_bn_eval_forward is not None:
                    # Temporarily shadow conv.forward with the fused
                    # conv+BN path, call it, then restore the original by
                    # deleting the instance attribute.
                    self.conv.forward = partial(
                        self.efficient_conv_bn_eval_forward, self.norm,
                        self.conv)
                    layer_index += 1
                    x = self.conv(x)
                    del self.conv.forward
                else:
                    x = self.conv(x)
            elif layer == 'norm' and norm and self.with_norm:
                x = self.norm(x)
            elif layer == 'act' and activate and self.with_activation:
                x = self.activate(x)
            layer_index += 1
        return x

    def turn_on_efficient_conv_bn_eval(self, efficient_conv_bn_eval=True):
        # efficient_conv_bn_eval works for conv + bn
        # with `track_running_stats` option
        if efficient_conv_bn_eval and self.norm \
                and isinstance(self.norm, _BatchNorm) \
                and self.norm.track_running_stats:
            self.efficient_conv_bn_eval_forward = efficient_conv_bn_eval_forward  # noqa: E501
        else:
            self.efficient_conv_bn_eval_forward = None  # type: ignore

    @staticmethod
    def create_from_conv_bn(conv: torch.nn.modules.conv._ConvNd,
                            bn: torch.nn.modules.batchnorm._BatchNorm,
                            efficient_conv_bn_eval=True) -> 'ConvModule':
        """Create a ConvModule from a conv and a bn module.

        Uses ``__new__`` to bypass ``__init__`` (which would build fresh
        sub-layers) and wires the existing modules in directly.
        """
        self = ConvModule.__new__(ConvModule)
        super(ConvModule, self).__init__()
        self.conv_cfg = None
        self.norm_cfg = None
        self.act_cfg = None
        self.inplace = False
        self.with_spectral_norm = False
        self.with_explicit_padding = False
        self.order = ('conv', 'norm', 'act')
        self.with_norm = True
        self.with_activation = False
        self.with_bias = conv.bias is not None
        # build convolution layer
        self.conv = conv
        # export the attributes of self.conv to a higher level for convenience
        self.in_channels = self.conv.in_channels
        self.out_channels = self.conv.out_channels
        self.kernel_size = self.conv.kernel_size
        self.stride = self.conv.stride
        self.padding = self.conv.padding
        self.dilation = self.conv.dilation
        self.transposed = self.conv.transposed
        self.output_padding = self.conv.output_padding
        self.groups = self.conv.groups
        # build normalization layers
        self.norm_name, norm = 'bn', bn
        self.add_module(self.norm_name, norm)
        self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval)
        return self
496
+
497
class DepthwiseSeparableConvModule(nn.Module):
    """Depthwise separable convolution module.

    Applies a depthwise conv (``groups=in_channels``) followed by a 1x1
    pointwise conv, each wrapped in a :class:`ConvModule` so it can carry
    its own norm/activation. The per-branch ``dw_*``/``pw_*`` configs fall
    back to the shared ``norm_cfg``/``act_cfg`` when left at ``'default'``.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: Union[int, Tuple[int, int]],
                 stride: Union[int, Tuple[int, int]] = 1,
                 padding: Union[int, Tuple[int, int]] = 0,
                 dilation: Union[int, Tuple[int, int]] = 1,
                 norm_cfg: Optional[Dict] = None,
                 act_cfg: Dict = dict(type='ReLU'),
                 dw_norm_cfg: Union[Dict, str] = 'default',
                 dw_act_cfg: Union[Dict, str] = 'default',
                 pw_norm_cfg: Union[Dict, str] = 'default',
                 pw_act_cfg: Union[Dict, str] = 'default',
                 **kwargs):
        super().__init__()
        assert 'groups' not in kwargs, 'groups should not be specified'
        # Defensive copy: the default ``act_cfg`` dict is created once at
        # function-definition time and shared by every call (the classic
        # mutable-default-argument pitfall). Copying ensures no instance
        # can mutate the shared default. An explicit ``act_cfg=None``
        # (meaning "no activation") is passed through unchanged.
        if isinstance(act_cfg, dict):
            act_cfg = dict(act_cfg)
        # if norm/activation config of depthwise/pointwise ConvModule is not
        # specified, use default config.
        dw_norm_cfg = dw_norm_cfg if dw_norm_cfg != 'default' else norm_cfg  # type: ignore # noqa E501
        dw_act_cfg = dw_act_cfg if dw_act_cfg != 'default' else act_cfg
        pw_norm_cfg = pw_norm_cfg if pw_norm_cfg != 'default' else norm_cfg  # type: ignore # noqa E501
        pw_act_cfg = pw_act_cfg if pw_act_cfg != 'default' else act_cfg

        # depthwise convolution: one filter per input channel
        self.depthwise_conv = ConvModule(
            in_channels,
            in_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=in_channels,
            norm_cfg=dw_norm_cfg,  # type: ignore
            act_cfg=dw_act_cfg,  # type: ignore
            **kwargs)

        # pointwise 1x1 convolution: mixes information across channels
        self.pointwise_conv = ConvModule(
            in_channels,
            out_channels,
            1,
            norm_cfg=pw_norm_cfg,  # type: ignore
            act_cfg=pw_act_cfg,  # type: ignore
            **kwargs)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the depthwise then the pointwise convolution."""
        x = self.depthwise_conv(x)
        x = self.pointwise_conv(x)
        return x
547
+
548
class SPPBottleneck(nn.Module):
    """Spatial pyramid pooling layer used in YOLOv3-SPP.

    Reduces channels with a 1x1 conv, applies several parallel max-pool
    branches of increasing kernel size, concatenates the identity and all
    pooled branches, and fuses them with a second 1x1 conv.

    Args:
        in_channels (int): Input channels.
        out_channels (int): Output channels.
        kernel_sizes (tuple[int]): Kernel sizes of the parallel max-pool
            branches. Defaults to (5, 9, 13).
        conv_cfg (dict, optional): Config for the conv layers.
        norm_cfg (dict): Config for the normalization layers.
        act_cfg (dict): Config for the activation layers.
        init_cfg (dict, optional): Accepted for mmdet-style config
            compatibility; not used by this implementation.
    """
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_sizes=(5, 9, 13),
                 conv_cfg=None,
                 norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
                 act_cfg=dict(type='Swish'),
                 init_cfg=None):
        super().__init__()
        # Halve the channels before pooling so the concat stays affordable.
        mid_channels = in_channels // 2
        self.conv1 = ConvModule(
            in_channels,
            mid_channels,
            1,
            stride=1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        # stride=1 with padding ks // 2 keeps the spatial size unchanged,
        # so all branches can be concatenated along the channel dim.
        self.poolings = nn.ModuleList([
            nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2)
            for ks in kernel_sizes
        ])
        # "+ 1" accounts for the un-pooled identity branch in the concat.
        conv2_channels = mid_channels * (len(kernel_sizes) + 1)
        self.conv2 = ConvModule(
            conv2_channels,
            out_channels,
            1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)

    def forward(self, x):
        """Reduce, multi-scale max-pool, concatenate, and fuse."""
        x = self.conv1(x)
        # Pool and concat with autocast disabled (i.e. in the tensor's own
        # dtype, not a reduced autocast dtype) for numerical stability.
        with torch.amp.autocast(enabled=False, device_type=x.device.type):
            x = torch.cat(
                [x] + [pooling(x) for pooling in self.poolings], dim=1)
        x = self.conv2(x)
        return x
588
+
589
class DarknetBottleneck(nn.Module):
    """The basic bottleneck block used in Darknet.

    A 1x1 squeeze conv followed by a 3x3 conv, with an optional residual
    connection that is only enabled when input and output channel counts
    match.
    """
    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 expansion: float = 0.5,
                 add_identity: bool = True,
                 use_depthwise: bool = False,
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(
                     type='BN', momentum=0.03, eps=0.001),
                 act_cfg: ConfigType = dict(type='Swish'),
                 init_cfg: OptMultiConfig = None) -> None:
        super().__init__()
        hidden_channels = int(out_channels * expansion)
        # The 3x3 conv becomes depthwise-separable when requested.
        conv3x3 = (DepthwiseSeparableConvModule
                   if use_depthwise else ConvModule)
        self.conv1 = ConvModule(
            in_channels,
            hidden_channels,
            1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.conv2 = conv3x3(
            hidden_channels,
            out_channels,
            3,
            stride=1,
            padding=1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        # Residual add is only shape-safe when channels are unchanged.
        self.add_identity = add_identity and in_channels == out_channels

    def forward(self, x: Tensor) -> Tensor:
        """Forward function."""
        out = self.conv2(self.conv1(x))
        return out + x if self.add_identity else out
633
+
634
class CSPNeXtBlock(nn.Module):
    """The basic bottleneck block used in CSPNeXt.

    A 3x3 conv followed by a large-kernel (default 5x5) depthwise-separable
    conv, with an optional residual connection that is only enabled when
    input and output channel counts match.
    """
    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 expansion: float = 0.5,
                 add_identity: bool = True,
                 use_depthwise: bool = False,
                 kernel_size: int = 5,
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(
                     type='BN', momentum=0.03, eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU'),
                 init_cfg: OptMultiConfig = None) -> None:
        super().__init__()
        hidden_channels = int(out_channels * expansion)
        # Only the first conv honours use_depthwise; note it takes no
        # conv_cfg, matching the upstream CSPNeXt block.
        first_conv = (DepthwiseSeparableConvModule
                      if use_depthwise else ConvModule)
        self.conv1 = first_conv(
            in_channels,
            hidden_channels,
            3,
            stride=1,
            padding=1,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        # The second conv is always depthwise-separable, with a large
        # kernel padded to preserve spatial size.
        self.conv2 = DepthwiseSeparableConvModule(
            hidden_channels,
            out_channels,
            kernel_size,
            stride=1,
            padding=kernel_size // 2,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        # Residual add is only shape-safe when channels are unchanged.
        self.add_identity = add_identity and in_channels == out_channels

    def forward(self, x: Tensor) -> Tensor:
        """Forward function."""
        out = self.conv2(self.conv1(x))
        return out + x if self.add_identity else out
680
+
681
class ChannelAttention(nn.Module):
    """Channel attention Module.

    Squeeze-and-excitation style gating: global average pooling, a 1x1
    conv, and a hard sigmoid produce a per-channel scale in [0, 1] that
    multiplies the input.
    """
    def __init__(self, channels: int, init_cfg: OptMultiConfig = None) -> None:
        super().__init__()
        self.global_avgpool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Conv2d(channels, channels, 1, 1, 0, bias=True)
        self.act = nn.Hardsigmoid(inplace=True)

    def forward(self, x: Tensor) -> Tensor:
        """Forward function for ChannelAttention."""
        # Attention branch runs with autocast disabled for numerical
        # stability of the pooled statistics.
        with torch.amp.autocast(enabled=False, device_type=x.device.type):
            gate = self.global_avgpool(x)
            gate = self.fc(gate)
            gate = self.act(gate)
        return x * gate
696
+
697
class CSPLayer(nn.Module):
    """Cross Stage Partial Layer.

    Splits the input into a "main" path (1x1 conv + a stack of bottleneck
    blocks) and a "short" path (1x1 conv only), concatenates the two, and
    fuses with a final 1x1 conv — optionally gated by channel attention.

    Args:
        in_channels (int): The input channels of the CSP layer.
        out_channels (int): The output channels of the CSP layer.
        expand_ratio (float): Ratio to adjust the number of channels of the
            hidden layer. Defaults to 0.5.
        num_blocks (int): Number of blocks. Defaults to 1.
        add_identity (bool): Whether to add identity in blocks.
            Defaults to True.
        use_cspnext_block (bool): Whether to use CSPNeXt block.
            Defaults to False.
        use_depthwise (bool): Whether to use depthwise separable convolution in
            blocks. Defaults to False.
        channel_attention (bool): Whether to add channel attention in each
            stage. Defaults to False.
        conv_cfg (dict, optional): Config dict for convolution layer.
            Defaults to None, which means using conv2d.
        norm_cfg (dict): Config dict for normalization layer.
            Defaults to dict(type='BN')
        act_cfg (dict): Config dict for activation layer.
            Defaults to dict(type='Swish')
    """
    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 expand_ratio: float = 0.5,
                 num_blocks: int = 1,
                 add_identity: bool = True,
                 use_depthwise: bool = False,
                 use_cspnext_block: bool = False,
                 channel_attention: bool = False,
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(
                     type='BN', momentum=0.03, eps=0.001),
                 act_cfg: ConfigType = dict(type='Swish'),
                 init_cfg: OptMultiConfig = None) -> None:
        super().__init__()
        # Bottleneck flavor: large-kernel CSPNeXt block or classic Darknet.
        block = CSPNeXtBlock if use_cspnext_block else DarknetBottleneck
        mid_channels = int(out_channels * expand_ratio)
        self.channel_attention = channel_attention

        # "Main" branch entry: 1x1 reduction feeding the block stack.
        self.main_conv = ConvModule(
            in_channels,
            mid_channels,
            1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)

        # "Short" branch: 1x1 reduction only, bypassing the blocks.
        self.short_conv = ConvModule(
            in_channels,
            mid_channels,
            1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)

        # Fuses the concatenated main + short branches.
        self.final_conv = ConvModule(
            2 * mid_channels,
            out_channels,
            1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)

        # Positional args: (in, out, expansion, add_identity, use_depthwise).
        self.blocks = nn.Sequential(*[
            block(
                mid_channels,
                mid_channels,
                1.0,
                add_identity,
                use_depthwise,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg) for _ in range(num_blocks)
        ])

        if channel_attention:
            # Applied to the concatenated 2 * mid_channels tensor.
            self.attention = ChannelAttention(2 * mid_channels)

    def forward(self, x: Tensor) -> Tensor:
        """Forward function."""
        x_short = self.short_conv(x)
        x_main = self.main_conv(x)
        x_main = self.blocks(x_main)
        x_final = torch.cat((x_main, x_short), dim=1)

        if self.channel_attention:
            x_final = self.attention(x_final)

        return self.final_conv(x_final)
789
+
790
+
791
class CSPNeXt(nn.Module):
    """CSPNeXt backbone used in RTMDet.
    This is a standalone implementation without requiring the mmdet registry.

    Args:
        arch (str): Architecture of CSPNeXt, from {P5, P6}.
            Defaults to P5.
        expand_ratio (float): Ratio to adjust the number of channels of the
            hidden layer. Defaults to 0.5.
        deepen_factor (float): Depth multiplier, multiply number of
            blocks in CSP layer by this amount. Defaults to 1.0.
        widen_factor (float): Width multiplier, multiply number of
            channels in each layer by this amount. Defaults to 1.0.
        out_indices (Sequence[int]): Output from which stages.
            Defaults to (2, 3, 4).
        frozen_stages (int): Stages to be frozen (stop grad and set eval
            mode). -1 means not freezing any parameters. Defaults to -1.
        use_depthwise (bool): Whether to use depthwise separable convolution.
            Defaults to False.
        arch_ovewrite (list): Overwrite default arch settings.
            Defaults to None.
        spp_kernel_sizes: (tuple[int]): Sequential of kernel sizes of SPP
            layers. Defaults to (5, 9, 13).
        channel_attention (bool): Whether to add channel attention in each
            stage. Defaults to True.
        conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
            convolution layer. Defaults to None.
        norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and
            config norm layer. Defaults to dict(type='BN', momentum=0.03,
            eps=0.001).
        act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer.
            Defaults to dict(type='SiLU').
        norm_eval (bool): Whether to set norm layers to eval mode, namely,
            freeze running stats (mean and var). Note: Effect on Batch Norm
            and its variants only.
        init_cfg (dict, optional): mmdet-style init config; accepted for
            config compatibility (no registry-driven init is performed here).
    """

    # From left to right:
    # in_channels, out_channels, num_blocks, add_identity, use_spp
    arch_settings = {
        'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False],
               [256, 512, 6, True, False], [512, 1024, 3, False, True]],
        'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False],
               [256, 512, 6, True, False], [512, 768, 3, True, False],
               [768, 1024, 3, False, True]]
    }

    def __init__(
        self,
        arch: str = 'P5',
        deepen_factor: float = 1.0,
        widen_factor: float = 1.0,
        out_indices: Sequence[int] = (2, 3, 4),
        frozen_stages: int = -1,
        use_depthwise: bool = False,
        expand_ratio: float = 0.5,
        arch_ovewrite: dict = None,
        spp_kernel_sizes: Sequence[int] = (5, 9, 13),
        channel_attention: bool = True,
        conv_cfg: OptConfigType = None,
        norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001),
        act_cfg: ConfigType = dict(type='SiLU'),
        norm_eval: bool = False,
        init_cfg: OptMultiConfig = dict(
            type='Kaiming',
            layer='Conv2d',
            a=math.sqrt(5),
            distribution='uniform',
            mode='fan_in',
            nonlinearity='leaky_relu')
    ) -> None:
        super().__init__()
        arch_setting = self.arch_settings[arch]
        if arch_ovewrite:
            arch_setting = arch_ovewrite
        # Index 0 is the stem, so valid out indices are 0..len(arch_setting).
        assert set(out_indices).issubset(
            i for i in range(len(arch_setting) + 1))
        if frozen_stages not in range(-1, len(arch_setting) + 1):
            raise ValueError('frozen_stages must be in range(-1, '
                             'len(arch_setting) + 1). But received '
                             f'{frozen_stages}')

        self.out_indices = out_indices
        self.frozen_stages = frozen_stages
        self.use_depthwise = use_depthwise
        self.norm_eval = norm_eval

        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule

        # Stem: three 3x3 convs; only the first has stride 2 (overall /2).
        # Channel width is arch base width scaled by widen_factor.
        self.stem = nn.Sequential(
            ConvModule(
                3,
                int(arch_setting[0][0] * widen_factor // 2),
                3,
                padding=1,
                stride=2,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg),
            ConvModule(
                int(arch_setting[0][0] * widen_factor // 2),
                int(arch_setting[0][0] * widen_factor // 2),
                3,
                padding=1,
                stride=1,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg),
            ConvModule(
                int(arch_setting[0][0] * widen_factor // 2),
                int(arch_setting[0][0] * widen_factor),
                3,
                padding=1,
                stride=1,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg))

        # Ordered list of stage attribute names; index aligns with
        # out_indices and frozen_stages (0 == stem).
        self.layers = ['stem']

        for i, (in_channels, out_channels, num_blocks, add_identity,
                use_spp) in enumerate(arch_setting):
            # Scale widths and depths by the model-size multipliers.
            in_channels = int(in_channels * widen_factor)
            out_channels = int(out_channels * widen_factor)
            num_blocks = max(round(num_blocks * deepen_factor), 1)
            stage = []

            # Stride-2 conv halves resolution at the start of each stage.
            conv_layer = conv(
                in_channels,
                out_channels,
                3,
                stride=2,
                padding=1,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg)
            stage.append(conv_layer)

            if use_spp:
                # SPP only on the last stage (per arch_settings).
                spp = SPPBottleneck(
                    out_channels,
                    out_channels,
                    kernel_sizes=spp_kernel_sizes,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg)
                stage.append(spp)

            csp_layer = CSPLayer(
                out_channels,
                out_channels,
                num_blocks=num_blocks,
                add_identity=add_identity,
                use_depthwise=use_depthwise,
                use_cspnext_block=True,
                expand_ratio=expand_ratio,
                channel_attention=channel_attention,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg)
            stage.append(csp_layer)

            self.add_module(f'stage{i + 1}', nn.Sequential(*stage))
            self.layers.append(f'stage{i + 1}')

    def freeze_stages(self) -> None:
        """Freeze stages parameters.

        Puts stages 0..frozen_stages into eval mode and disables their
        gradients. No-op when frozen_stages is -1.
        """
        if self.frozen_stages >= 0:
            for i in range(self.frozen_stages + 1):
                m = getattr(self, self.layers[i])
                m.eval()
                for param in m.parameters():
                    param.requires_grad = False

    def train(self, mode=True) -> None:
        """Convert the model into training mode while keeping normalization layer
        frozen."""
        super().train(mode)
        # Re-apply freezing: nn.Module.train() switches everything back
        # to train mode, undoing the earlier .eval() calls.
        self.freeze_stages()
        if mode and self.norm_eval:
            for m in self.modules():
                if isinstance(m, _BatchNorm):
                    m.eval()

    def forward(self, x: Tensor) -> Tuple[Tensor, ...]:
        """Run the stem and all stages, collecting outputs at out_indices."""
        outs = []
        for i, layer_name in enumerate(self.layers):
            layer = getattr(self, layer_name)
            x = layer(x)
            if i in self.out_indices:
                outs.append(x)
        return tuple(outs)
979
+
980
+
981
class CSPNeXtPAFPN(nn.Module):
    """Path Aggregation Network with CSPNeXt blocks.
    This is a standalone implementation that works with the CSPNeXt backbone.

    Args:
        in_channels (Sequence[int]): Number of input channels per scale.
        out_channels (int): Number of output channels (used at each scale).
            If None, the per-scale channel counts are kept and no output
            convs are built.
        out_indices (Sequence[int]): Output from which stages.
        num_csp_blocks (int): Number of bottlenecks in CSPLayer.
            Defaults to 3.
        use_depthwise (bool): Whether to use depthwise separable convolution in
            blocks. Defaults to False.
        expand_ratio (float): Ratio to adjust the number of channels of the
            hidden layer. Default: 0.5
        upsample_cfg (dict): Config dict for interpolate layer.
            Default: `dict(scale_factor=2, mode='nearest')`
        conv_cfg (dict, optional): Config dict for convolution layer.
            Default: None, which means using conv2d.
        norm_cfg (dict): Config dict for normalization layer.
            Default: dict(type='BN')
        act_cfg (dict): Config dict for activation layer.
            Default: dict(type='Swish')
        init_cfg (dict, optional): mmdet-style init config; accepted for
            config compatibility.
    """

    def __init__(
        self,
        in_channels: Sequence[int],
        out_channels: int,
        out_indices=(0, 1, 2),
        num_csp_blocks: int = 3,
        use_depthwise: bool = False,
        expand_ratio: float = 0.5,
        upsample_cfg: ConfigType = dict(scale_factor=2, mode='nearest'),
        conv_cfg: OptConfigType = None,
        norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001),
        act_cfg: ConfigType = dict(type='Swish'),
        init_cfg: OptMultiConfig = dict(
            type='Kaiming',
            layer='Conv2d',
            a=math.sqrt(5),
            distribution='uniform',
            mode='fan_in',
            nonlinearity='leaky_relu')
    ) -> None:
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.out_indices = out_indices

        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule

        # build top-down blocks
        # reduce_layers[k] / top_down_blocks[k] serve level
        # len(in_channels) - 1 - k (highest level first).
        self.upsample = nn.Upsample(**upsample_cfg)
        self.reduce_layers = nn.ModuleList()
        self.top_down_blocks = nn.ModuleList()
        for idx in range(len(in_channels) - 1, 0, -1):
            # 1x1 conv shrinks the higher level to the lower level's width
            # before upsampling and concatenation.
            self.reduce_layers.append(
                ConvModule(
                    in_channels[idx],
                    in_channels[idx - 1],
                    1,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg))
            # CSP block fuses concat([upsampled_high, low]) -> low width.
            self.top_down_blocks.append(
                CSPLayer(
                    in_channels[idx - 1] * 2,
                    in_channels[idx - 1],
                    num_blocks=num_csp_blocks,
                    add_identity=False,
                    use_depthwise=use_depthwise,
                    use_cspnext_block=True,
                    expand_ratio=expand_ratio,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg))

        # build bottom-up blocks (ordered lowest level first)
        self.downsamples = nn.ModuleList()
        self.bottom_up_blocks = nn.ModuleList()
        for idx in range(len(in_channels) - 1):
            # Stride-2 conv brings the lower level down to the next scale.
            self.downsamples.append(
                conv(
                    in_channels[idx],
                    in_channels[idx],
                    3,
                    stride=2,
                    padding=1,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg))
            # CSP block fuses concat([downsampled_low, high]) -> high width.
            self.bottom_up_blocks.append(
                CSPLayer(
                    in_channels[idx] * 2,
                    in_channels[idx + 1],
                    num_blocks=num_csp_blocks,
                    add_identity=False,
                    use_depthwise=use_depthwise,
                    use_cspnext_block=True,
                    expand_ratio=expand_ratio,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg))

        if self.out_channels is not None:
            # Per-level 3x3 convs projecting every scale to out_channels.
            self.out_convs = nn.ModuleList()
            for i in range(len(in_channels)):
                self.out_convs.append(
                    conv(
                        in_channels[i],
                        out_channels,
                        3,
                        padding=1,
                        conv_cfg=conv_cfg,
                        norm_cfg=norm_cfg,
                        act_cfg=act_cfg))

    def forward(self, inputs: Tuple[Tensor, ...]) -> Tuple[Tensor, ...]:
        """
        Args:
            inputs (tuple[Tensor]): input features, lowest stride first.

        Returns:
            tuple[Tensor]: PAFPN features at self.out_indices.
        """
        assert len(inputs) == len(self.in_channels)

        # top-down path: inner_outs accumulates fused features with the
        # lowest level at index 0.
        inner_outs = [inputs[-1]]
        for idx in range(len(self.in_channels) - 1, 0, -1):
            feat_high = inner_outs[0]
            feat_low = inputs[idx - 1]
            # Map loop level to module index (modules stored high->low).
            feat_high = self.reduce_layers[len(self.in_channels) - 1 - idx](
                feat_high)
            # Keep the reduced feature so the bottom-up path sees it too.
            inner_outs[0] = feat_high

            upsample_feat = self.upsample(feat_high)

            inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx](
                torch.cat([upsample_feat, feat_low], 1))
            inner_outs.insert(0, inner_out)

        # bottom-up path
        outs = [inner_outs[0]]
        for idx in range(len(self.in_channels) - 1):
            feat_low = outs[-1]
            feat_high = inner_outs[idx + 1]
            downsample_feat = self.downsamples[idx](feat_low)
            out = self.bottom_up_blocks[idx](
                torch.cat([downsample_feat, feat_high], 1))
            outs.append(out)

        if self.out_channels is not None:
            # out convs: unify the channel count across scales.
            for idx in range(len(outs)):
                outs[idx] = self.out_convs[idx](outs[idx])

        return tuple([outs[i] for i in self.out_indices])
1139
+
1140
+
1141
class MlvlPointGenerator:
    """Standard points generator for multi-level feature maps.

    Produces, for each feature level, the (x, y) coordinates of every grid
    cell in input-image space: ``(index + offset) * stride``.

    Args:
        strides (int | list[int] | tuple[int]): Downsample stride of each
            feature level relative to the input image. A bare int is
            treated as a single level.
        offset (float): Sub-cell offset applied to each grid index before
            scaling (0.5 = cell centers). Defaults to 0.5.
    """

    def __init__(
        self,
        strides,
        offset: float = 0.5
    ) -> None:
        if not isinstance(strides, (list, tuple)):
            strides = [strides]

        self.strides = strides
        self.offset = offset

    def grid_priors(
        self,
        featmap_sizes,
        dtype=torch.float32,
        device='cuda',
        with_stride=False
    ):
        """Generate grid points of multiple feature levels.

        Args:
            featmap_sizes (list[tuple[int, int]]): (H, W) per level; must
                align with ``self.strides``.
            dtype (torch.dtype): Dtype of the returned tensors.
            device (str | torch.device): Target device. Defaults to 'cuda'.
            with_stride (bool): Append (stride_x, stride_y) columns.

        Returns:
            list[Tensor]: One (H*W, 2) tensor per level — (H*W, 4) when
            ``with_stride`` is True.
        """
        num_levels = len(featmap_sizes)
        multi_level_priors = []

        for i in range(num_levels):
            priors = self.single_level_grid_priors(
                featmap_sizes[i],
                level_idx=i,
                dtype=dtype,
                device=device,
                with_stride=with_stride)
            multi_level_priors.append(priors)

        return multi_level_priors

    def single_level_grid_priors(
        self,
        featmap_size,
        level_idx,
        dtype=torch.float32,
        device='cuda',
        with_stride=False
    ):
        """Generate grid points for a single feature level.

        Points are returned row-major: x varies fastest.
        """
        feat_h, feat_w = featmap_size
        stride = self.strides[level_idx]

        # Cell coordinates in input-image space.
        shift_x = (torch.arange(0, feat_w, device=device) + self.offset) * stride
        shift_y = (torch.arange(0, feat_h, device=device) + self.offset) * stride

        shift_x = shift_x.to(dtype)
        shift_y = shift_y.to(dtype)

        # Full grid, flattened so x is the fastest-varying coordinate.
        shift_yy, shift_xx = torch.meshgrid(shift_y, shift_x, indexing="ij")
        shift_xx = shift_xx.reshape(-1)
        shift_yy = shift_yy.reshape(-1)

        if not with_stride:
            shifts = torch.stack([shift_xx, shift_yy], dim=-1)
        else:
            # Append per-point stride columns. Pass the stride as a plain
            # scalar: torch.full_like documents fill_value as a Number, and
            # the previous 0-dim tensor argument raises a TypeError on
            # older torch releases.
            stride_xx = torch.full_like(shift_xx, float(stride))
            stride_yy = torch.full_like(shift_yy, float(stride))
            shifts = torch.stack([shift_xx, shift_yy, stride_xx, stride_yy], dim=-1)

        return shifts
1211
+
1212
+
1213
# Helper functions needed for geometric mean sigmoid
def sigmoid_geometric_mean(x, y):
    """Return the element-wise geometric mean of sigmoid(x) and sigmoid(y)."""
    return (x.sigmoid() * y.sigmoid()).sqrt()
1219
+
1220
+
1221
def inverse_sigmoid(x, eps=1e-5):
    """Inverse of sigmoid (logit), numerically clamped.

    ``x`` is first clamped to [0, 1]; numerator and denominator are each
    floored at ``eps`` so the log never sees zero.
    """
    x = x.clamp(min=0, max=1)
    num = x.clamp(min=eps)
    den = (1 - x).clamp(min=eps)
    return (num / den).log()
1227
+
1228
+
1229
+ class RTMDetSepBNHead(nn.Module):
1230
+ """RTMDetHead with separated BN layers and shared conv layers."""
1231
+
1232
    def __init__(
        self,
        num_classes: int,
        in_channels: int,
        share_conv: bool = True,
        use_depthwise: bool = False,
        pred_kernel_size: int = 1,
        stacked_convs: int = 2,
        feat_channels: int = 256,
        strides: List[int] = [8, 16, 32],
        with_objectness: bool = False,
        exp_on_reg: bool = False,
    ) -> None:
        """Build the RTMDet detection head.

        Args:
            num_classes: Number of object categories. Classification is
                sigmoid-based, so there is no extra background channel.
            in_channels: Channels of each incoming feature map.
            share_conv: Share the stacked tower convs across levels.
            use_depthwise: Use depthwise-separable convs in the towers.
            pred_kernel_size: Kernel size of the final prediction convs.
            stacked_convs: Number of conv layers per tower.
            feat_channels: Channels inside the towers.
            strides: Downsample stride of each feature level.
                NOTE(review): mutable default list is shared across
                instances — safe only as long as it is never mutated.
            with_objectness: Add an objectness branch whose score is fused
                into the classification logits.
            exp_on_reg: Apply exp() to raw regression outputs before
                scaling by the stride.
        """
        super().__init__()
        self.num_classes = num_classes
        self.cls_out_channels = num_classes  # For sigmoid
        self.in_channels = in_channels
        self.feat_channels = feat_channels
        self.stacked_convs = stacked_convs
        self.share_conv = share_conv
        self.use_depthwise = use_depthwise
        self.pred_kernel_size = pred_kernel_size
        self.with_objectness = with_objectness
        self.exp_on_reg = exp_on_reg
        self.strides = strides

        # Number of anchors per grid point (RTMDet is anchor-free/point-based)
        self.num_base_priors = 1

        self._init_layers()
1262
+
1263
    def _init_layers(self) -> None:
        """Initialize layers of the head.

        Per feature level this builds: a classification tower and a
        regression tower of ``stacked_convs`` conv blocks each, plus the
        final ``pred_kernel_size`` prediction convs (cls, reg, and
        optionally objectness). With ``share_conv`` the tower modules of
        level 0 are reused by all levels.
        """
        self.cls_convs = nn.ModuleList()
        self.reg_convs = nn.ModuleList()

        self.rtm_cls = nn.ModuleList()
        self.rtm_reg = nn.ModuleList()
        if self.with_objectness:
            self.rtm_obj = nn.ModuleList()

        for n in range(len(self.strides)):
            cls_convs = nn.ModuleList()
            reg_convs = nn.ModuleList()
            for i in range(self.stacked_convs):
                # First tower layer adapts from the neck's channel count.
                chn = self.in_channels if i == 0 else self.feat_channels

                if self.use_depthwise:
                    cls_conv = DepthwiseSeparableConvModule(
                        chn,
                        self.feat_channels,
                        3,
                        stride=1,
                        padding=1,
                        bias=False,
                        act_cfg=dict(type='SiLU'),
                        norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)
                    )
                    reg_conv = DepthwiseSeparableConvModule(
                        chn,
                        self.feat_channels,
                        3,
                        stride=1,
                        padding=1,
                        bias=False,
                        act_cfg=dict(type='SiLU'),
                        norm_cfg=dict(type='BN', momentum=0.03, eps=0.001)
                    )
                else:
                    cls_conv = ConvModule(
                        chn,
                        self.feat_channels,
                        3,
                        stride=1,
                        padding=1,
                        bias=False,
                        act_cfg=dict(type='SiLU'),
                        norm_cfg=dict(type='BN', momentum=0.03, eps=0.001))
                    reg_conv = ConvModule(
                        chn,
                        self.feat_channels,
                        3,
                        stride=1,
                        padding=1,
                        bias=False,
                        act_cfg=dict(type='SiLU'),
                        norm_cfg=dict(type='BN', momentum=0.03, eps=0.001))
                # Append conv layers to the list
                cls_convs.append(cls_conv)
                reg_convs.append(reg_conv)

            self.cls_convs.append(cls_convs)
            self.reg_convs.append(reg_convs)

            # Final 1x1 (or pred_kernel_size) prediction convs, one per level.
            self.rtm_cls.append(
                nn.Conv2d(
                    self.feat_channels,
                    self.num_base_priors * self.cls_out_channels,
                    self.pred_kernel_size,
                    padding=self.pred_kernel_size // 2))
            self.rtm_reg.append(
                nn.Conv2d(
                    self.feat_channels,
                    self.num_base_priors * 4,
                    self.pred_kernel_size,
                    padding=self.pred_kernel_size // 2))
            if self.with_objectness:
                self.rtm_obj.append(
                    nn.Conv2d(
                        self.feat_channels,
                        1,
                        self.pred_kernel_size,
                        padding=self.pred_kernel_size // 2))

        if self.share_conv:
            # NOTE(review): this aliases the ENTIRE ConvModule (conv + BN)
            # of level 0 into every other level, so BN statistics are shared
            # too. The class docstring ("separated BN layers") and upstream
            # mmdet (which shares only `.conv`, keeping per-level BN) suggest
            # only the conv weights should be shared — verify against the
            # checkpoint this port was converted from before changing.
            for n in range(1, len(self.strides)):
                for i in range(self.stacked_convs):
                    self.cls_convs[n][i] = self.cls_convs[0][i]
                    self.reg_convs[n][i] = self.reg_convs[0][i]

        # Initialize MlvlPointGenerator for anchor-free detection
        self.prior_generator = MlvlPointGenerator(self.strides, offset=0.0)
1354
+
1355
    def init_weights(self):
        """Initialize weights of the head.

        Conv weights get N(0, 0.01) and zero bias; BN is reset to identity.
        The final classification (and objectness) biases are set so the
        initial predicted probability is ~0.01 — the standard focal-loss
        prior initialization.
        """
        # Initialize conv layers with normal distribution
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.normal_(m.weight, mean=0, std=0.01)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            if isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # bias = -log((1 - p) / p) with p = 0.01, so sigmoid(bias) ~= 0.01.
        bias_init = -torch.log(torch.tensor((1 - 0.01) / 0.01))
        for rtm_cls in self.rtm_cls:
            nn.init.normal_(rtm_cls.weight, mean=0, std=0.01)
            nn.init.constant_(rtm_cls.bias, bias_init)

        for rtm_reg in self.rtm_reg:
            nn.init.normal_(rtm_reg.weight, mean=0, std=0.01)
            nn.init.constant_(rtm_reg.bias, 0)

        if self.with_objectness:
            for rtm_obj in self.rtm_obj:
                nn.init.normal_(rtm_obj.weight, mean=0, std=0.01)
                nn.init.constant_(rtm_obj.bias, bias_init)
1381
+
1382
    def forward(self, feats):
        """Forward features from the upstream network.

        Args:
            feats (tuple[Tensor]): Features from the upstream network, each is
                a 4D-tensor. Must have one entry per stride in
                ``self.strides``.

        Returns:
            tuple: Usually a tuple of classification scores and bbox prediction
            - cls_scores (list[Tensor]): Classification scores for all scale
              levels, each is a 4D-tensor.
            - bbox_preds (list[Tensor]): Box energies / deltas for all scale
              levels, each is a 4D-tensor, already multiplied by the
              level's stride (image-space distances).
        """
        cls_scores = []
        bbox_preds = []
        for idx, (x, stride) in enumerate(
                zip(feats, self.strides)):
            # Both towers start from the same neck feature.
            cls_feat = x
            reg_feat = x

            for cls_layer in self.cls_convs[idx]:
                cls_feat = cls_layer(cls_feat)
            cls_score = self.rtm_cls[idx](cls_feat)

            for reg_layer in self.reg_convs[idx]:
                reg_feat = reg_layer(reg_feat)

            if self.with_objectness:
                # Fuse objectness into the class logits: geometric mean of
                # the two sigmoids, mapped back to logit space.
                objectness = self.rtm_obj[idx](reg_feat)
                cls_score = inverse_sigmoid(
                    sigmoid_geometric_mean(cls_score, objectness))

            if self.exp_on_reg:
                # Convert anchor-free to distance prediction, with stride scale
                reg_dist = self.rtm_reg[idx](reg_feat).exp() * stride
            else:
                reg_dist = self.rtm_reg[idx](reg_feat) * stride

            cls_scores.append(cls_score)
            bbox_preds.append(reg_dist)

        return tuple(cls_scores), tuple(bbox_preds)
1425
+
1426
+ def predict(self, cls_scores, bbox_preds, batch_img_metas=None, cfg=None,
1427
+ rescale=False, with_nms=True, score_thr=0.05,
1428
+ nms_iou_threshold=0.6, max_per_img=100):
1429
+ """Transform network outputs into bbox predictions.
1430
+
1431
+ This is a simplified version for inference only.
1432
+ """
1433
+ assert len(cls_scores) == len(bbox_preds)
1434
+ num_levels = len(cls_scores)
1435
+ device = cls_scores[0].device
1436
+ batch_size = cls_scores[0].shape[0]
1437
+
1438
+ # If no image metadata is provided, create default ones
1439
+ if batch_img_metas is None:
1440
+ # Use input feature size to estimate image size
1441
+ featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)]
1442
+ strides = self.strides
1443
+
1444
+ # Calculate original image size based on feature map sizes and strides
1445
+ # This is approximate but works for most cases
1446
+ upscaled_sizes = []
1447
+ for i, featmap_size in enumerate(featmap_sizes):
1448
+ h, w = featmap_size
1449
+ upscaled_sizes.append((h * strides[i], w * strides[i]))
1450
+
1451
+ # Use the maximum size across levels
1452
+ img_h = max(s[0] for s in upscaled_sizes)
1453
+ img_w = max(s[1] for s in upscaled_sizes)
1454
+
1455
+ batch_img_metas = [{
1456
+ 'img_shape': (img_h, img_w, 3),
1457
+ 'scale_factor': [1.0, 1.0, 1.0, 1.0]
1458
+ } for _ in range(batch_size)]
1459
+
1460
+ # Get feature map sizes
1461
+ featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)]
1462
+
1463
+ # Generate grid points for each level
1464
+ mlvl_priors = self.prior_generator.grid_priors(
1465
+ featmap_sizes,
1466
+ dtype=cls_scores[0].dtype,
1467
+ device=device,
1468
+ with_stride=True)
1469
+
1470
+ result_list = []
1471
+ for img_id in range(batch_size):
1472
+ img_meta = batch_img_metas[img_id]
1473
+ cls_score_list = [
1474
+ cls_scores[i][img_id].detach() for i in range(num_levels)
1475
+ ]
1476
+ bbox_pred_list = [
1477
+ bbox_preds[i][img_id].detach() for i in range(num_levels)
1478
+ ]
1479
+
1480
+ results = self._predict_by_feat_single(
1481
+ cls_score_list,
1482
+ bbox_pred_list,
1483
+ mlvl_priors,
1484
+ img_meta,
1485
+ score_thr=score_thr,
1486
+ nms_iou_threshold=nms_iou_threshold,
1487
+ max_per_img=max_per_img,
1488
+ rescale=rescale,
1489
+ with_nms=with_nms
1490
+ )
1491
+ result_list.append(results)
1492
+
1493
+ # Convert the results to a more standardized format
1494
+ boxes_batch = []
1495
+ scores_batch = []
1496
+ labels_batch = []
1497
+
1498
+ for result in result_list:
1499
+ boxes = result['bboxes']
1500
+ scores = result.get('scores', boxes[:, -1])
1501
+ labels = result['labels']
1502
+
1503
+ # Ensure boxes have only coordinates (some implementations add score as 5th column)
1504
+ if boxes.shape[1] > 4:
1505
+ boxes = boxes[:, :4]
1506
+
1507
+ boxes_batch.append(boxes)
1508
+ scores_batch.append(scores)
1509
+ labels_batch.append(labels)
1510
+
1511
+ # Stack results if there's at least one detection in each image
1512
+ if all(len(boxes) > 0 for boxes in boxes_batch):
1513
+ return DetectionOutput(
1514
+ boxes=torch.stack(boxes_batch),
1515
+ scores=torch.stack(scores_batch),
1516
+ labels=torch.stack(labels_batch)
1517
+ )
1518
+
1519
+ # Handle case where some images have no detections
1520
+ max_num = max(len(boxes) for boxes in boxes_batch)
1521
+ if max_num == 0:
1522
+ # No detections at all
1523
+ dummy = torch.zeros((batch_size, 0, 4), device=device)
1524
+ return DetectionOutput(
1525
+ boxes=dummy,
1526
+ scores=torch.zeros((batch_size, 0), device=device),
1527
+ labels=torch.zeros((batch_size, 0), dtype=torch.long, device=device)
1528
+ )
1529
+
1530
+ # Pad results to have consistent tensor shapes
1531
+ padded_boxes = []
1532
+ padded_scores = []
1533
+ padded_labels = []
1534
+
1535
+ for boxes, scores, labels in zip(boxes_batch, scores_batch, labels_batch):
1536
+ num_dets = len(boxes)
1537
+ if num_dets == 0:
1538
+ padded_boxes.append(torch.zeros((max_num, 4), device=device))
1539
+ padded_scores.append(torch.zeros(max_num, device=device))
1540
+ padded_labels.append(torch.zeros(max_num, dtype=torch.long, device=device))
1541
+ else:
1542
+ padding = torch.zeros((max_num - num_dets, 4), device=device)
1543
+ padded_boxes.append(torch.cat([boxes, padding], dim=0))
1544
+
1545
+ padding = torch.zeros(max_num - num_dets, device=device)
1546
+ padded_scores.append(torch.cat([scores, padding], dim=0))
1547
+
1548
+ padding = torch.zeros(max_num - num_dets, dtype=torch.long, device=device)
1549
+ padded_labels.append(torch.cat([labels, padding], dim=0))
1550
+
1551
+ return DetectionOutput(
1552
+ boxes=torch.stack(padded_boxes),
1553
+ scores=torch.stack(padded_scores),
1554
+ labels=torch.stack(padded_labels)
1555
+ )
1556
+
1557
+ def _predict_by_feat_single(self, cls_score_list, bbox_pred_list, mlvl_priors,
1558
+ img_meta, score_thr=0.05, nms_iou_threshold=0.6,
1559
+ max_per_img=100, rescale=False, with_nms=True):
1560
+ """Transform outputs of a single image into bbox predictions.
1561
+
1562
+ This is a simplified version for inference only.
1563
+ """
1564
+ # For each scale level
1565
+ mlvl_bboxes = []
1566
+ mlvl_scores = []
1567
+
1568
+ for level_idx, (cls_score, bbox_pred, priors) in enumerate(
1569
+ zip(cls_score_list, bbox_pred_list, mlvl_priors)):
1570
+ assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
1571
+
1572
+ # Reshape
1573
+ cls_score = cls_score.permute(1, 2, 0).reshape(-1, self.cls_out_channels)
1574
+ bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
1575
+
1576
+ # Get scores
1577
+ scores = torch.sigmoid(cls_score)
1578
+
1579
+ # Find high-scoring predictions
1580
+ max_scores, _ = scores.max(dim=1)
1581
+ keep_mask = max_scores > score_thr
1582
+ scores = scores[keep_mask]
1583
+ bbox_pred = bbox_pred[keep_mask]
1584
+ priors = priors[keep_mask]
1585
+
1586
+ # If no valid predictions for this level, continue
1587
+ if scores.numel() == 0:
1588
+ continue
1589
+
1590
+ # Decode bboxes
1591
+ bboxes = self._decode_bboxes(priors, bbox_pred, img_meta.get('img_shape'))
1592
+
1593
+ mlvl_bboxes.append(bboxes)
1594
+ mlvl_scores.append(scores)
1595
+
1596
+ # Combine all levels
1597
+ if len(mlvl_bboxes) == 0:
1598
+ # Return empty result if no valid predictions
1599
+ return {
1600
+ 'bboxes': torch.zeros((0, 4), device=cls_score_list[0].device),
1601
+ 'scores': torch.zeros((0,), device=cls_score_list[0].device),
1602
+ 'labels': torch.zeros((0,), device=cls_score_list[0].device, dtype=torch.long)
1603
+ }
1604
+
1605
+ bboxes = torch.cat(mlvl_bboxes)
1606
+ scores = torch.cat(mlvl_scores)
1607
+
1608
+ # Optional rescaling to original image size
1609
+ if rescale and 'scale_factor' in img_meta:
1610
+ bboxes /= bboxes.new_tensor(img_meta['scale_factor']).repeat((1, 2))
1611
+
1612
+ # Apply NMS for each class
1613
+ if with_nms:
1614
+ det_bboxes, det_labels = self._nms(bboxes, scores,
1615
+ nms_iou_threshold,
1616
+ max_per_img)
1617
+ else:
1618
+ # Just return top k scores without NMS
1619
+ scores_flattened = scores.flatten()
1620
+ if scores_flattened.size(0) > max_per_img:
1621
+ top_scores, indices = scores_flattened.topk(max_per_img)
1622
+ scores_top_k = scores.view(-1, self.num_classes).index_select(0, indices)
1623
+ bboxes_top_k = bboxes.index_select(0, indices)
1624
+ labels_top_k = indices % self.num_classes
1625
+ det_bboxes = torch.cat([bboxes_top_k, top_scores.unsqueeze(-1)], dim=1)
1626
+ det_labels = labels_top_k
1627
+ else:
1628
+ # Convert to the same format with NMS
1629
+ num_bboxes = bboxes.size(0)
1630
+ max_scores, labels = scores.max(dim=1)
1631
+ det_bboxes = torch.cat([bboxes, max_scores.unsqueeze(-1)], dim=1)
1632
+ det_labels = labels
1633
+
1634
+ return {
1635
+ 'bboxes': det_bboxes,
1636
+ 'scores': det_bboxes[:, -1],
1637
+ 'labels': det_labels
1638
+ }
1639
+
1640
+ def _decode_bboxes(self, priors, distance, max_shape=None):
1641
+ """Decode distance predictions to bounding box coordinates."""
1642
+ # Get xy coordinates of priors (grid points)
1643
+ xy = priors[..., :2]
1644
+
1645
+ # Distance predictions to 4 boundaries (left, top, right, bottom)
1646
+ # distances = [l, t, r, b]
1647
+
1648
+ # Calculate bbox coordinates
1649
+ x1 = xy[..., 0] - distance[..., 0]
1650
+ y1 = xy[..., 1] - distance[..., 1]
1651
+ x2 = xy[..., 0] + distance[..., 2]
1652
+ y2 = xy[..., 1] + distance[..., 3]
1653
+
1654
+ bboxes = torch.stack([x1, y1, x2, y2], -1)
1655
+
1656
+ # Clip boxes to image boundaries if needed
1657
+ if max_shape is not None:
1658
+ bboxes[..., 0].clamp_(min=0, max=max_shape[1])
1659
+ bboxes[..., 1].clamp_(min=0, max=max_shape[0])
1660
+ bboxes[..., 2].clamp_(min=0, max=max_shape[1])
1661
+ bboxes[..., 3].clamp_(min=0, max=max_shape[0])
1662
+
1663
+ return bboxes
1664
+
1665
+ def _nms(self, bboxes, scores, iou_threshold, max_per_img):
1666
+ """Apply NMS to detection results."""
1667
+ # For each class
1668
+ num_classes = scores.shape[1]
1669
+ det_bboxes = []
1670
+ det_labels = []
1671
+
1672
+ for cls_idx in range(num_classes):
1673
+ cls_scores = scores[:, cls_idx]
1674
+ keep_idx = cls_scores > 0.05 # Apply score threshold
1675
+
1676
+ if not keep_idx.any():
1677
+ continue
1678
+
1679
+ cls_bboxes = bboxes[keep_idx]
1680
+ cls_scores = cls_scores[keep_idx]
1681
+
1682
+ # Apply NMS for this class
1683
+ keep = self._batched_nms(cls_bboxes, cls_scores, iou_threshold)
1684
+ keep = keep[:max_per_img]
1685
+
1686
+ det_bboxes.append(torch.cat([cls_bboxes[keep], cls_scores[keep].unsqueeze(-1)], dim=1))
1687
+ det_labels.append(cls_bboxes.new_full((keep.size(0),), cls_idx, dtype=torch.long))
1688
+
1689
+ if len(det_bboxes) > 0:
1690
+ det_bboxes = torch.cat(det_bboxes, dim=0)
1691
+ det_labels = torch.cat(det_labels, dim=0)
1692
+
1693
+ # Sort by score
1694
+ _, indices = det_bboxes[:, -1].sort(descending=True)
1695
+ det_bboxes = det_bboxes[indices]
1696
+ det_labels = det_labels[indices]
1697
+
1698
+ # Limit to max_per_img
1699
+ det_bboxes = det_bboxes[:max_per_img]
1700
+ det_labels = det_labels[:max_per_img]
1701
+ else:
1702
+ # Return empty tensors if no detections
1703
+ det_bboxes = bboxes.new_zeros((0, 5))
1704
+ det_labels = bboxes.new_zeros((0,), dtype=torch.long)
1705
+
1706
+ return det_bboxes, det_labels
1707
+
1708
+ def _batched_nms(self, boxes, scores, iou_threshold):
1709
+ """Performs non-maximum suppression on a batch of boxes."""
1710
+ if boxes.shape[0] == 0:
1711
+ return boxes.new_zeros(0, dtype=torch.long)
1712
+
1713
+ try:
1714
+ # Try to use torchvision NMS for speed if available
1715
+ return torchvision.ops.nms(boxes, scores, iou_threshold)
1716
+ except:
1717
+ # Fall back to manual NMS implementation
1718
+ x1 = boxes[:, 0]
1719
+ y1 = boxes[:, 1]
1720
+ x2 = boxes[:, 2]
1721
+ y2 = boxes[:, 3]
1722
+ areas = (x2 - x1) * (y2 - y1)
1723
+ _, order = scores.sort(descending=True)
1724
+
1725
+ keep = []
1726
+ while order.size(0) > 0:
1727
+ i = order[0].item()
1728
+ keep.append(i)
1729
+
1730
+ if order.size(0) == 1:
1731
+ break
1732
+
1733
+ xx1 = torch.max(x1[order[1:]], x1[i])
1734
+ yy1 = torch.max(y1[order[1:]], y1[i])
1735
+ xx2 = torch.min(x2[order[1:]], x2[i])
1736
+ yy2 = torch.min(y2[order[1:]], y2[i])
1737
+
1738
+ w = torch.clamp(xx2 - xx1, min=0)
1739
+ h = torch.clamp(yy2 - yy1, min=0)
1740
+ inter = w * h
1741
+
1742
+ iou = inter / (areas[i] + areas[order[1:]] - inter)
1743
+
1744
+ inds = torch.where(iou <= iou_threshold)[0]
1745
+ order = order[inds + 1]
1746
+
1747
+ return torch.tensor(keep, dtype=torch.long, device=boxes.device)
1748
+
1749
+
1750
class RTMDetModel(PreTrainedModel):
    """
    RTMDet object detection model compatible with Hugging Face transformers.
    Updated implementation using PyTorch only with no NumPy or OpenCV dependencies.

    This model consists of a backbone (CSPNeXt), neck (CSPNeXtPAFPN), and
    detection head (RTMDetSepBNHead) for object detection. All submodule
    hyperparameters are read from an `RTMDetConfig`.
    """

    # HF integration: config class, checkpoint key prefix, and the name of
    # the tensor argument `from_pretrained`/pipelines feed to `forward`.
    config_class = RTMDetConfig
    base_model_prefix = "rtmdet"
    main_input_name = "pixel_values"

    def __init__(self, config):
        """Build backbone, neck, and head from `config`, then init weights."""
        super().__init__(config)

        # Build backbone
        self.backbone = CSPNeXt(
            arch=config.backbone_arch,
            deepen_factor=config.backbone_deepen_factor,
            widen_factor=config.backbone_widen_factor,
            expand_ratio=config.backbone_expand_ratio,
            channel_attention=config.backbone_channel_attention,
            use_depthwise=False,
        )

        # Build neck
        self.neck = CSPNeXtPAFPN(
            in_channels=config.neck_in_channels,
            out_channels=config.neck_out_channels,
            num_csp_blocks=config.neck_num_csp_blocks,
            expand_ratio=config.neck_expand_ratio,
            use_depthwise=False,
        )

        # Build head
        self.bbox_head = RTMDetSepBNHead(
            num_classes=config.num_classes,
            in_channels=config.head_in_channels,
            stacked_convs=config.head_stacked_convs,
            feat_channels=config.head_feat_channels,
            with_objectness=config.head_with_objectness,
            exp_on_reg=config.head_exp_on_reg,
            share_conv=config.head_share_conv,
            pred_kernel_size=config.head_pred_kernel_size,
            strides=config.strides,
            use_depthwise=False
        )

        # Initialize weights
        self.init_weights()

    def init_weights(self):
        """Initialize the weights of the model."""
        # Backbone is usually initialized from pre-trained weights
        # so we don't need special initialization

        # Initialize head only; backbone/neck keep their module defaults
        # (overwritten anyway when loading a pretrained checkpoint).
        self.bbox_head.init_weights()

    def forward(
        self,
        pixel_values=None,
        labels=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """
        Forward pass of the model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
                Pixel values. Pixel values can be obtained using
                RTMDetImageProcessor.
            labels (`List[Dict]`, *optional*):
                Labels for computing the detection loss. Expected format:
                List of dicts with 'boxes' and 'labels' keys. NOTE: loss
                computation is not implemented; when `labels` is given this
                returns a constant zero loss.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers.
                NOTE(review): accepted for HF API compatibility but never
                read in this implementation.
            return_dict (`bool`, *optional*):
                Whether or not to return a ModelOutput instead of a plain tuple.

        Returns:
            `DetectionOutput` or `tuple`:
                If return_dict=True, `DetectionOutput` is returned.
                If return_dict=False, a tuple is returned where the first element
                is the detection output tensor.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Get inputs
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        batch_size, channels, height, width = pixel_values.shape

        # Extract features from backbone
        backbone_features = self.backbone(pixel_values)

        # Process features through neck
        neck_features = self.neck(backbone_features)

        # Get cls_scores and bbox_preds from head
        cls_scores, bbox_preds = self.bbox_head(neck_features)

        if labels is not None:
            # Training mode: calculate loss (not implemented in this simplified version)
            loss = torch.tensor(0.0, device=pixel_values.device)
            if return_dict:
                return DetectionOutput(loss=loss)
            else:
                return (loss,)

        # Inference mode: Get detection results
        # Create default batch_img_metas for prediction.
        # NOTE(review): 'scale_factor' has 4 elements here while the head's
        # rescale path may expect a 2-element (w, h) factor — harmless while
        # rescale=False below, but confirm before enabling rescaling.
        batch_img_metas = [{
            'img_shape': (height, width, 3),
            'scale_factor': [1.0, 1.0, 1.0, 1.0]
        } for _ in range(batch_size)]

        # Call predict method with thresholds taken from the model config.
        results = self.bbox_head.predict(
            cls_scores=cls_scores,
            bbox_preds=bbox_preds,
            batch_img_metas=batch_img_metas,
            rescale=False,
            with_nms=True,
            score_thr=self.config.score_threshold,
            nms_iou_threshold=self.config.nms_threshold,
            max_per_img=self.config.max_detections
        )

        if return_dict:
            return results
        else:
            # Return as tuple (boxes, scores, labels)
            return (results.boxes, results.scores, results.labels)
1886
+
preprocessor_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_valid_processor_keys": [
3
+ "images",
4
+ "do_resize",
5
+ "size",
6
+ "keep_aspect_ratio",
7
+ "ensure_multiple_of",
8
+ "resample",
9
+ "do_rescale",
10
+ "rescale_factor",
11
+ "do_normalize",
12
+ "image_mean",
13
+ "image_std",
14
+ "do_pad",
15
+ "size_divisor",
16
+ "return_tensors",
17
+ "data_format",
18
+ "input_data_format"
19
+ ],
20
+ "do_normalize": true,
21
+ "do_rescale": false,
22
+ "do_resize": true,
23
+ "image_mean": [
24
+ 123.675,
25
+ 116.28,
26
+ 103.53
27
+ ],
28
+ "image_processor_type": "DPTImageProcessor",
29
+ "image_std": [
30
+ 58.395,
31
+ 57.12,
32
+ 57.375
33
+ ],
34
+ "size": {
35
+ "height": 640,
36
+ "width": 640
37
+ }
38
+ }
39
+