| | from transformers import PretrainedConfig |
| |
|
| | from transformers.models.auto import CONFIG_MAPPING |
| | from transformers.utils.backbone_utils import verify_backbone_config_arguments |
| |
|
| | from transformers.utils import logging, PushToHubMixin |
| |
|
# Module-level logger, named after this module, obtained from the transformers logging helper.
logger = logging.get_logger(__name__)
| |
|
class DiffusionDetConfig(PretrainedConfig):
    """Configuration holder for a DiffusionDet object-detection model.

    The constructor resolves the backbone settings, validates them with
    `verify_backbone_config_arguments`, stores every hyper-parameter as an
    attribute of the same name, and forwards any remaining keyword arguments
    to `PretrainedConfig.__init__`.

    Backbone arguments:
        use_timm_backbone: When true and `backbone_kwargs` is `None`, a default
            `backbone_kwargs` dict is built (`out_indices=[1, 2, 3, 4]`,
            `in_chans=num_channels`, plus `output_stride=16` if `dilation`).
            When false and `backbone` is `None` or `"resnet50"`, a backbone
            config is resolved instead and `backbone`/`dilation` are reset to
            `None`.
        backbone_config: Backbone configuration object or dict. In the
            non-timm path, `None` is replaced by a default ResNet config with
            `out_features=["stage4"]`, and a dict is converted through
            `CONFIG_MAPPING[<its "model_type">].from_dict`.
        backbone: Backbone identifier (default `"resnet50"`).
        use_pretrained_backbone: Passed to the backbone argument validation
            and stored.
        backbone_kwargs: Extra backbone keyword arguments; a caller-supplied
            dict is stored as-is.
        dilation: See `use_timm_backbone`.
        num_channels: Used as `in_chans` in the default timm kwargs.

    All remaining named arguments (`pixel_mean`, `pixel_std`,
    `resnet_out_features`, `resnet_in_features`, `roi_head_in_features`,
    `fpn_out_channels`, `pooler_resolution`, `sampling_ratio`,
    `num_proposals`, `num_attn_heads`, `dropout`, `dim_feedforward`,
    `activation`, `hidden_dim`, `num_cls`, `num_reg`, `num_heads`,
    `num_dynamic`, `dim_dynamic`, `class_weight`, `giou_weight`, `l1_weight`,
    `deep_supervision`, `no_object_weight`, `use_focal`, `use_fed_loss`,
    `alpha`, `gamma`, `prior_prob`, `ota_k`, `snr_scale`, `sample_step`,
    `use_nms`, `swin_size`, `use_swin_checkpoint`, `swin_out_features`,
    `optimizer`, `backbone_multiplier`) are stored unchanged; their meaning is
    defined by the modeling code that consumes this config.

    Keyword arguments:
        **kwargs: Forwarded to `PretrainedConfig.__init__`. If neither
            `num_labels` nor `id2label` is supplied, `num_labels` defaults
            to 80.
    """

    model_type = "diffusiondet"

    def __init__(
        self,
        use_timm_backbone=True,
        backbone_config=None,
        num_channels=3,
        pixel_mean=(123.675, 116.280, 103.530),
        pixel_std=(58.395, 57.120, 57.375),
        resnet_out_features=("res2", "res3", "res4", "res5"),
        resnet_in_features=("res2", "res3", "res4", "res5"),
        roi_head_in_features=("p2", "p3", "p4", "p5"),
        fpn_out_channels=256,
        pooler_resolution=7,
        sampling_ratio=2,
        num_proposals=300,
        num_attn_heads=8,
        dropout=0.0,
        dim_feedforward=2048,
        activation="relu",
        hidden_dim=256,
        num_cls=1,
        num_reg=3,
        num_heads=6,
        num_dynamic=2,
        dim_dynamic=64,
        class_weight=2.0,
        giou_weight=2.0,
        l1_weight=5.0,
        deep_supervision=True,
        no_object_weight=0.1,
        use_focal=True,
        use_fed_loss=False,
        alpha=0.25,
        gamma=2.0,
        prior_prob=0.01,
        ota_k=5,
        snr_scale=2.0,
        sample_step=1,
        use_nms=True,
        swin_size="B",
        use_swin_checkpoint=False,
        swin_out_features=(0, 1, 2, 3),
        optimizer="ADAMW",
        backbone_multiplier=1.0,
        backbone="resnet50",
        use_pretrained_backbone=True,
        backbone_kwargs=None,
        dilation=False,
        **kwargs,
    ):
        if use_timm_backbone and backbone_kwargs is None:
            # Build default timm kwargs; a caller-supplied dict is never touched.
            backbone_kwargs = {}
            if dilation:
                backbone_kwargs["output_stride"] = 16
            backbone_kwargs["out_indices"] = [1, 2, 3, 4]
            backbone_kwargs["in_chans"] = num_channels
        elif not use_timm_backbone and backbone in (None, "resnet50"):
            # Non-timm path: resolve a backbone config object.
            if backbone_config is None:
                logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
                backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"])
            elif isinstance(backbone_config, dict):
                backbone_model_type = backbone_config.get("model_type")
                config_class = CONFIG_MAPPING[backbone_model_type]
                backbone_config = config_class.from_dict(backbone_config)
            # The backbone is now described by `backbone_config`, so the
            # name-based / timm-only settings are cleared.
            backbone = None
            dilation = None

        verify_backbone_config_arguments(
            use_timm_backbone=use_timm_backbone,
            use_pretrained_backbone=use_pretrained_backbone,
            backbone=backbone,
            backbone_config=backbone_config,
            backbone_kwargs=backbone_kwargs,
        )

        # Default mapping of Auto* classes to this repository's custom code.
        # An `auto_map` supplied through kwargs is applied by the base
        # initializer below.
        self.auto_map = {
            "AutoConfig": "configuration_diffusiondet.DiffusionDetConfig",
            "AutoModelForObjectDetection": "modeling_diffusiondet.DiffusionDet",
        }

        # Backbone / FPN
        self.use_timm_backbone = use_timm_backbone
        self.backbone_config = backbone_config
        self.num_channels = num_channels
        self.backbone = backbone
        self.use_pretrained_backbone = use_pretrained_backbone
        self.backbone_kwargs = backbone_kwargs
        self.dilation = dilation
        self.fpn_out_channels = fpn_out_channels

        # Input normalisation, feature selection, pooling, proposals
        self.pixel_mean = pixel_mean
        self.pixel_std = pixel_std
        self.resnet_out_features = resnet_out_features
        self.resnet_in_features = resnet_in_features
        self.roi_head_in_features = roi_head_in_features
        self.pooler_resolution = pooler_resolution
        self.sampling_ratio = sampling_ratio
        self.num_proposals = num_proposals

        # Head hyper-parameters
        self.num_attn_heads = num_attn_heads
        self.dropout = dropout
        self.dim_feedforward = dim_feedforward
        self.activation = activation
        self.hidden_dim = hidden_dim
        self.num_cls = num_cls
        self.num_reg = num_reg
        self.num_heads = num_heads

        # Dynamic-layer hyper-parameters
        self.num_dynamic = num_dynamic
        self.dim_dynamic = dim_dynamic

        # Loss weights
        self.class_weight = class_weight
        self.giou_weight = giou_weight
        self.l1_weight = l1_weight
        self.deep_supervision = deep_supervision
        self.no_object_weight = no_object_weight

        # Classification-loss options
        self.use_focal = use_focal
        self.use_fed_loss = use_fed_loss
        self.alpha = alpha
        self.gamma = gamma
        self.prior_prob = prior_prob

        self.ota_k = ota_k

        self.snr_scale = snr_scale
        self.sample_step = sample_step

        self.use_nms = use_nms

        # Swin backbone options
        self.swin_size = swin_size
        self.use_swin_checkpoint = use_swin_checkpoint
        self.swin_out_features = swin_out_features

        # Training options
        self.optimizer = optimizer
        self.backbone_multiplier = backbone_multiplier

        # Supply the 80-label default through kwargs instead of assigning
        # `self.num_labels` before the base initializer runs: the base class
        # sets the label attributes itself from its arguments, so an earlier
        # assignment would not survive. Explicit `num_labels` / `id2label`
        # from the caller (e.g. when reloading a saved config) take precedence.
        if kwargs.get("id2label") is None and kwargs.get("num_labels") is None:
            kwargs["num_labels"] = 80

        # Forward the remaining kwargs so standard config fields are not dropped.
        super().__init__(**kwargs)
| |
|