{ "architectures": [ "MlxComicTextDetector" ], "format_version": 1, "fusion": { "conv_bn": "forward", "head_bn_eps": 1e-05, "trunk_bn_eps": 0.001 }, "heads": { "db": { "activation": "relu", "layers": [ { "in_channels": 512, "name": "upconv3", "out_channels": 512, "repeats": 1, "type": "UpC3" }, { "in_channels": 384, "name": "upconv4", "out_channels": 256, "repeats": 1, "type": "UpC3" }, { "in_channels": 128, "kernel": 1, "name": "projection", "out_channels": 64, "padding": 0, "stride": 1, "type": "Conv2d" }, { "in_channels": 64, "kernel": 3, "name": "binarize.0", "out_channels": 16, "padding": 1, "stride": 1, "type": "Conv2d" }, { "in_channels": 16, "kernel": 2, "name": "binarize.3", "out_channels": 16, "padding": 0, "stride": 2, "type": "ConvTranspose2d" }, { "in_channels": 16, "kernel": 2, "name": "binarize.6", "out_channels": 1, "padding": 0, "stride": 2, "type": "ConvTranspose2d" }, { "in_channels": 64, "kernel": 3, "name": "thresh.0", "out_channels": 16, "padding": 1, "stride": 1, "type": "Conv2d" }, { "in_channels": 16, "kernel": 2, "name": "thresh.3", "out_channels": 16, "padding": 0, "stride": 2, "type": "ConvTranspose2d" }, { "in_channels": 16, "kernel": 2, "name": "thresh.6", "out_channels": 1, "padding": 0, "stride": 2, "type": "ConvTranspose2d" } ] }, "segmentation": { "activation": "leaky", "layers": [ { "in_channels": 512, "name": "down_conv1", "out_channels": 512, "repeats": 1, "type": "DownC3" }, { "in_channels": 512, "name": "upconv0", "out_channels": 512, "repeats": 1, "type": "UpC3" }, { "in_channels": 768, "name": "upconv2", "out_channels": 512, "repeats": 1, "type": "UpC3" }, { "in_channels": 512, "name": "upconv3", "out_channels": 512, "repeats": 1, "type": "UpC3" }, { "in_channels": 384, "name": "upconv4", "out_channels": 256, "repeats": 1, "type": "UpC3" }, { "in_channels": 192, "name": "upconv5", "out_channels": 128, "repeats": 1, "type": "UpC3" }, { "in_channels": 64, "kernel": 4, "name": "upconv6", "out_channels": 1, "padding": 1, "stride": 2, "type": "ConvTranspose2d" } ] } }, "id2label": { "0": "eng", "1": "ja" }, "image_size": 1024, "input_layout": "NCHW", "internal_layout": "NHWC", "label2id": { "eng": 0, "ja": 1 }, "model_type": "comic_text_detector", "num_classes": 2, "torch_dtype": "float32", "weights": { "file": "model.safetensors", "format": "safetensors", "layout": "mlx-ohwi" }, "yolo": { "anchors": [ [ [ 10, 13 ], [ 16, 30 ], [ 33, 23 ] ], [ [ 30, 61 ], [ 62, 45 ], [ 59, 119 ] ], [ [ 116, 90 ], [ 156, 198 ], [ 373, 326 ] ] ], "depth_multiple": 0.33, "detect_indices": [ 17, 20, 23 ], "feature_indices": [ 1, 3, 5, 7, 9 ], "layers": [ { "from": -1, "in_channels": 3, "kernel": 6, "out_channels": 32, "padding": 2, "repeats": 1, "stride": 2, "type": "Conv" }, { "from": -1, "in_channels": 32, "kernel": 3, "out_channels": 64, "repeats": 1, "stride": 2, "type": "Conv" }, { "from": -1, "in_channels": 64, "out_channels": 64, "repeats": 1, "shortcut": true, "type": "C3" }, { "from": -1, "in_channels": 64, "kernel": 3, "out_channels": 128, "repeats": 1, "stride": 2, "type": "Conv" }, { "from": -1, "in_channels": 128, "out_channels": 128, "repeats": 2, "shortcut": true, "type": "C3" }, { "from": -1, "in_channels": 128, "kernel": 3, "out_channels": 256, "repeats": 1, "stride": 2, "type": "Conv" }, { "from": -1, "in_channels": 256, "out_channels": 256, "repeats": 3, "shortcut": true, "type": "C3" }, { "from": -1, "in_channels": 256, "kernel": 3, "out_channels": 512, "repeats": 1, "stride": 2, "type": "Conv" }, { "from": -1, "in_channels": 512, "out_channels": 512, "repeats": 1, "shortcut": true, "type": "C3" }, { "from": -1, "in_channels": 512, "kernel": 5, "out_channels": 512, "repeats": 1, "type": "SPPF" }, { "from": -1, "in_channels": 512, "kernel": 1, "out_channels": 256, "repeats": 1, "stride": 1, "type": "Conv" }, { "from": -1, "in_channels": 256, "mode": "nearest", "out_channels": 256, "repeats": 1, "scale_factor": 2, "type": "Upsample" }, { "dimension": 1, "from": [ -1, 6 ], "in_channels": [ 256, 256 ], "out_channels": 512, "repeats": 1, "type": "Concat" }, { "from": -1, "in_channels": 512, "out_channels": 256, "repeats": 1, "shortcut": false, "type": "C3" }, { "from": -1, "in_channels": 256, "kernel": 1, "out_channels": 128, "repeats": 1, "stride": 1, "type": "Conv" }, { "from": -1, "in_channels": 128, "mode": "nearest", "out_channels": 128, "repeats": 1, "scale_factor": 2, "type": "Upsample" }, { "dimension": 1, "from": [ -1, 4 ], "in_channels": [ 128, 128 ], "out_channels": 256, "repeats": 1, "type": "Concat" }, { "from": -1, "in_channels": 256, "out_channels": 128, "repeats": 1, "shortcut": false, "type": "C3" }, { "from": -1, "in_channels": 128, "kernel": 3, "out_channels": 128, "repeats": 1, "stride": 2, "type": "Conv" }, { "dimension": 1, "from": [ -1, 14 ], "in_channels": [ 128, 128 ], "out_channels": 256, "repeats": 1, "type": "Concat" }, { "from": -1, "in_channels": 256, "out_channels": 256, "repeats": 1, "shortcut": false, "type": "C3" }, { "from": -1, "in_channels": 256, "kernel": 3, "out_channels": 256, "repeats": 1, "stride": 2, "type": "Conv" }, { "dimension": 1, "from": [ -1, 10 ], "in_channels": [ 256, 256 ], "out_channels": 512, "repeats": 1, "type": "Concat" }, { "from": -1, "in_channels": 512, "out_channels": 512, "repeats": 1, "shortcut": false, "type": "C3" }, { "from": [ 17, 20, 23 ], "in_channels": [ 128, 256, 512 ], "out_channels": 21, "repeats": 1, "type": "Detect" } ], "width_multiple": 0.5 } }