{
  "architectures": [
    "FastForSceneTextRecognition"
  ],
  "backbone_config": {
    "batch_norm_eps": 1e-05,
    "conv_layer_kernel_sizes": [
      [
        [
          3,
          3
        ],
        [
          3,
          3
        ],
        [
          3,
          1
        ],
        [
          3,
          3
        ],
        [
          3,
          1
        ],
        [
          3,
          3
        ],
        [
          3,
          3
        ],
        [
          1,
          3
        ],
        [
          3,
          3
        ],
        [
          3,
          3
        ]
      ],
      [
        [
          3,
          3
        ],
        [
          1,
          3
        ],
        [
          3,
          3
        ],
        [
          3,
          1
        ],
        [
          3,
          3
        ],
        [
          3,
          3
        ],
        [
          3,
          1
        ],
        [
          3,
          1
        ],
        [
          3,
          3
        ],
        [
          3,
          3
        ]
      ],
      [
        [
          3,
          3
        ],
        [
          3,
          3
        ],
        [
          3,
          3
        ],
        [
          1,
          3
        ],
        [
          3,
          3
        ],
        [
          3,
          1
        ],
        [
          3,
          3
        ],
        [
          3,
          1
        ]
      ],
      [
        [
          3,
          3
        ],
        [
          1,
          3
        ],
        [
          3,
          1
        ],
        [
          3,
          1
        ],
        [
          1,
          3
        ]
      ]
    ],
    "conv_layer_strides": [
      [
        1,
        2,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1
      ],
      [
        2,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1
      ],
      [
        2,
        1,
        1,
        1,
        1,
        1,
        1,
        1
      ],
      [
        2,
        1,
        1,
        1,
        1
      ]
    ],
    "depths": [
      10,
      10,
      8,
      5
    ],
    "hidden_sizes": [
      64,
      64,
      128,
      256,
      512
    ],
    "image_size": [
      640,
      640
    ],
    "initializer_range": 0.02,
    "model_type": "textnet",
    "out_features": [
      "stage1",
      "stage2",
      "stage3",
      "stage4"
    ],
    "out_indices": [
      1,
      2,
      3,
      4
    ],
    "stage_names": [
      "stem",
      "stage1",
      "stage2",
      "stage3",
      "stage4"
    ],
    "stem_act_func": "relu",
    "stem_kernel_size": 3,
    "stem_num_channels": 3,
    "stem_out_channels": 64,
    "stem_stride": 2
  },
  "bounding_box_type": "boxes",
  "head_conv_dilation": 1,
  "head_conv_groups": 1,
  "head_conv_in_channels": 512,
  "head_conv_kernel_size": [
    3,
    3
  ],
  "head_conv_out_channels": 128,
  "head_conv_stride": 1,
  "head_dropout_ratio": 0.1,
  "head_final_act_func": null,
  "head_final_bias": false,
  "head_final_dilation": 1,
  "head_final_dropout_rate": 0,
  "head_final_groups": 1,
  "head_final_has_shuffle": false,
  "head_final_in_channels": 128,
  "head_final_kernel_size": 1,
  "head_final_ops_order": "weight",
  "head_final_out_channels": 5,
  "head_final_stride": 1,
  "head_final_use_bn": false,
  "head_pooling_size": 9,
  "initializer_range": 0.02,
  "loss_bg": false,
  "min_area": 250,
  "neck_dilation": [
    1,
    1,
    1,
    1
  ],
  "neck_groups": [
    1,
    1,
    1,
    1
  ],
  "neck_in_channels": [
    64,
    128,
    256,
    512
  ],
  "neck_kernel_size": [
    [
      3,
      3
    ],
    [
      3,
      3
    ],
    [
      3,
      3
    ],
    [
      3,
      3
    ]
  ],
  "neck_out_channels": [
    128,
    128,
    128,
    128
  ],
  "neck_stride": [
    1,
    1,
    1,
    1
  ],
  "torch_dtype": "float32",
  "transformers_version": "4.55.0.dev0",
  "use_timm_backbone": false
}