ustc-community
/

dfine-xlarge-coco

@@ -1,21 +1,13 @@
 {
   "activation_dropout": 0.0,
   "activation_function": "silu",
   "anchor_image_size": null,
-  "architectures": [
-    "DFineForObjectDetection"
-  ],
   "attention_dropout": 0.0,
   "auxiliary_loss": true,
   "backbone": null,
   "backbone_config": {
     "embedding_size": 32,
-    "hidden_sizes": [
-      64,
-      128,
-      256,
-      512
-    ],
     "layer_type": "basic",
     "model_type": "d_fine_resnet",
     "out_features": [
@@ -28,47 +20,23 @@
       3,
       4
     ],
-    "stage_config": [
-      [
-        64,
-        64,
-        128,
-        1,
-        false,
-        false,
-        3,
-        6
-      ],
-      [
-        128,
-        128,
-        512,
-        2,
-        true,
-        false,
-        3,
-        6
-      ],
-      [
-        512,
-        256,
-        1024,
-        5,
-        true,
-        true,
-        5,
-        6
-      ],
-      [
-        1024,
-        512,
-        2048,
-        2,
-        true,
-        true,
-        5,
-        6
-      ]
     ],
     "stem_channels": [
       3,
@@ -89,6 +57,7 @@
     384
   ],
   "decoder_layers": 6,
   "decoder_n_points": [
     3,
     6,
@@ -294,21 +263,23 @@
   "layer_norm_eps": 1e-05,
   "layer_scale": 1,
   "learn_initial_query": false,
   "matcher_alpha": 0.25,
   "matcher_bbox_cost": 5.0,
   "matcher_class_cost": 2.0,
   "matcher_gamma": 2.0,
   "matcher_giou_cost": 2.0,
   "model_type": "d_fine",
   "normalize_before": false,
   "num_denoising": 100,
   "num_feature_levels": 3,
   "num_queries": 300,
   "positional_encoding_temperature": 10000,
-  "reg_max": 32,
   "reg_scale": 4.0,
-  "torch_dtype": "float32",
-  "transformers_version": "4.49.0.dev0",
   "use_focal_loss": true,
   "use_pretrained_backbone": false,
   "use_timm_backbone": false,

 {
+  "_attn_implementation_autoset": true,
   "activation_dropout": 0.0,
   "activation_function": "silu",
   "anchor_image_size": null,
   "attention_dropout": 0.0,
   "auxiliary_loss": true,
   "backbone": null,
   "backbone_config": {
     "embedding_size": 32,
     "layer_type": "basic",
     "model_type": "d_fine_resnet",
     "out_features": [
       3,
       4
     ],
+    "stage_in_channels": [
+      64,
+      128,
+      512,
+      1024
+    ],
+    "stage_mid_channels": [
+      64,
+      128,
+      256,
+      512
+    ],
+    "stage_num_blocks": [
+      1,
+      2,
+      5,
+      2
     ],
     "stem_channels": [
       3,
     384
   ],
   "decoder_layers": 6,
+  "decoder_method": "default",
   "decoder_n_points": [
     3,
     6,
   "layer_norm_eps": 1e-05,
   "layer_scale": 1,
   "learn_initial_query": false,
+  "lqe_hidden_dim": 64,
+  "lqe_layers": 2,
   "matcher_alpha": 0.25,
   "matcher_bbox_cost": 5.0,
   "matcher_class_cost": 2.0,
   "matcher_gamma": 2.0,
   "matcher_giou_cost": 2.0,
+  "max_num_bins": 32,
   "model_type": "d_fine",
   "normalize_before": false,
   "num_denoising": 100,
   "num_feature_levels": 3,
   "num_queries": 300,
   "positional_encoding_temperature": 10000,
   "reg_scale": 4.0,
+  "top_prob_values": 4,
+  "transformers_version": "4.50.0.dev0",
   "use_focal_loss": true,
   "use_pretrained_backbone": false,
   "use_timm_backbone": false,