| { |
| "architectures": [ |
| "MlxComicTextDetector" |
| ], |
| "format_version": 1, |
| "fusion": { |
| "conv_bn": "forward", |
| "head_bn_eps": 1e-05, |
| "trunk_bn_eps": 0.001 |
| }, |
| "heads": { |
| "db": { |
| "activation": "relu", |
| "layers": [ |
| { |
| "in_channels": 512, |
| "name": "upconv3", |
| "out_channels": 512, |
| "repeats": 1, |
| "type": "UpC3" |
| }, |
| { |
| "in_channels": 384, |
| "name": "upconv4", |
| "out_channels": 256, |
| "repeats": 1, |
| "type": "UpC3" |
| }, |
| { |
| "in_channels": 128, |
| "kernel": 1, |
| "name": "projection", |
| "out_channels": 64, |
| "padding": 0, |
| "stride": 1, |
| "type": "Conv2d" |
| }, |
| { |
| "in_channels": 64, |
| "kernel": 3, |
| "name": "binarize.0", |
| "out_channels": 16, |
| "padding": 1, |
| "stride": 1, |
| "type": "Conv2d" |
| }, |
| { |
| "in_channels": 16, |
| "kernel": 2, |
| "name": "binarize.3", |
| "out_channels": 16, |
| "padding": 0, |
| "stride": 2, |
| "type": "ConvTranspose2d" |
| }, |
| { |
| "in_channels": 16, |
| "kernel": 2, |
| "name": "binarize.6", |
| "out_channels": 1, |
| "padding": 0, |
| "stride": 2, |
| "type": "ConvTranspose2d" |
| }, |
| { |
| "in_channels": 64, |
| "kernel": 3, |
| "name": "thresh.0", |
| "out_channels": 16, |
| "padding": 1, |
| "stride": 1, |
| "type": "Conv2d" |
| }, |
| { |
| "in_channels": 16, |
| "kernel": 2, |
| "name": "thresh.3", |
| "out_channels": 16, |
| "padding": 0, |
| "stride": 2, |
| "type": "ConvTranspose2d" |
| }, |
| { |
| "in_channels": 16, |
| "kernel": 2, |
| "name": "thresh.6", |
| "out_channels": 1, |
| "padding": 0, |
| "stride": 2, |
| "type": "ConvTranspose2d" |
| } |
| ] |
| }, |
| "segmentation": { |
| "activation": "leaky", |
| "layers": [ |
| { |
| "in_channels": 512, |
| "name": "down_conv1", |
| "out_channels": 512, |
| "repeats": 1, |
| "type": "DownC3" |
| }, |
| { |
| "in_channels": 512, |
| "name": "upconv0", |
| "out_channels": 512, |
| "repeats": 1, |
| "type": "UpC3" |
| }, |
| { |
| "in_channels": 768, |
| "name": "upconv2", |
| "out_channels": 512, |
| "repeats": 1, |
| "type": "UpC3" |
| }, |
| { |
| "in_channels": 512, |
| "name": "upconv3", |
| "out_channels": 512, |
| "repeats": 1, |
| "type": "UpC3" |
| }, |
| { |
| "in_channels": 384, |
| "name": "upconv4", |
| "out_channels": 256, |
| "repeats": 1, |
| "type": "UpC3" |
| }, |
| { |
| "in_channels": 192, |
| "name": "upconv5", |
| "out_channels": 128, |
| "repeats": 1, |
| "type": "UpC3" |
| }, |
| { |
| "in_channels": 64, |
| "kernel": 4, |
| "name": "upconv6", |
| "out_channels": 1, |
| "padding": 1, |
| "stride": 2, |
| "type": "ConvTranspose2d" |
| } |
| ] |
| } |
| }, |
| "id2label": { |
| "0": "eng", |
| "1": "ja" |
| }, |
| "image_size": 1024, |
| "input_layout": "NCHW", |
| "internal_layout": "NHWC", |
| "label2id": { |
| "eng": 0, |
| "ja": 1 |
| }, |
| "model_type": "comic_text_detector", |
| "num_classes": 2, |
| "torch_dtype": "float32", |
| "weights": { |
| "file": "model.safetensors", |
| "format": "safetensors", |
| "layout": "mlx-ohwi" |
| }, |
| "yolo": { |
| "anchors": [ |
| [ |
| [ |
| 10, |
| 13 |
| ], |
| [ |
| 16, |
| 30 |
| ], |
| [ |
| 33, |
| 23 |
| ] |
| ], |
| [ |
| [ |
| 30, |
| 61 |
| ], |
| [ |
| 62, |
| 45 |
| ], |
| [ |
| 59, |
| 119 |
| ] |
| ], |
| [ |
| [ |
| 116, |
| 90 |
| ], |
| [ |
| 156, |
| 198 |
| ], |
| [ |
| 373, |
| 326 |
| ] |
| ] |
| ], |
| "depth_multiple": 0.33, |
| "detect_indices": [ |
| 17, |
| 20, |
| 23 |
| ], |
| "feature_indices": [ |
| 1, |
| 3, |
| 5, |
| 7, |
| 9 |
| ], |
| "layers": [ |
| { |
| "from": -1, |
| "in_channels": 3, |
| "kernel": 6, |
| "out_channels": 32, |
| "padding": 2, |
| "repeats": 1, |
| "stride": 2, |
| "type": "Conv" |
| }, |
| { |
| "from": -1, |
| "in_channels": 32, |
| "kernel": 3, |
| "out_channels": 64, |
| "repeats": 1, |
| "stride": 2, |
| "type": "Conv" |
| }, |
| { |
| "from": -1, |
| "in_channels": 64, |
| "out_channels": 64, |
| "repeats": 1, |
| "shortcut": true, |
| "type": "C3" |
| }, |
| { |
| "from": -1, |
| "in_channels": 64, |
| "kernel": 3, |
| "out_channels": 128, |
| "repeats": 1, |
| "stride": 2, |
| "type": "Conv" |
| }, |
| { |
| "from": -1, |
| "in_channels": 128, |
| "out_channels": 128, |
| "repeats": 2, |
| "shortcut": true, |
| "type": "C3" |
| }, |
| { |
| "from": -1, |
| "in_channels": 128, |
| "kernel": 3, |
| "out_channels": 256, |
| "repeats": 1, |
| "stride": 2, |
| "type": "Conv" |
| }, |
| { |
| "from": -1, |
| "in_channels": 256, |
| "out_channels": 256, |
| "repeats": 3, |
| "shortcut": true, |
| "type": "C3" |
| }, |
| { |
| "from": -1, |
| "in_channels": 256, |
| "kernel": 3, |
| "out_channels": 512, |
| "repeats": 1, |
| "stride": 2, |
| "type": "Conv" |
| }, |
| { |
| "from": -1, |
| "in_channels": 512, |
| "out_channels": 512, |
| "repeats": 1, |
| "shortcut": true, |
| "type": "C3" |
| }, |
| { |
| "from": -1, |
| "in_channels": 512, |
| "kernel": 5, |
| "out_channels": 512, |
| "repeats": 1, |
| "type": "SPPF" |
| }, |
| { |
| "from": -1, |
| "in_channels": 512, |
| "kernel": 1, |
| "out_channels": 256, |
| "repeats": 1, |
| "stride": 1, |
| "type": "Conv" |
| }, |
| { |
| "from": -1, |
| "in_channels": 256, |
| "mode": "nearest", |
| "out_channels": 256, |
| "repeats": 1, |
| "scale_factor": 2, |
| "type": "Upsample" |
| }, |
| { |
| "dimension": 1, |
| "from": [ |
| -1, |
| 6 |
| ], |
| "in_channels": [ |
| 256, |
| 256 |
| ], |
| "out_channels": 512, |
| "repeats": 1, |
| "type": "Concat" |
| }, |
| { |
| "from": -1, |
| "in_channels": 512, |
| "out_channels": 256, |
| "repeats": 1, |
| "shortcut": false, |
| "type": "C3" |
| }, |
| { |
| "from": -1, |
| "in_channels": 256, |
| "kernel": 1, |
| "out_channels": 128, |
| "repeats": 1, |
| "stride": 1, |
| "type": "Conv" |
| }, |
| { |
| "from": -1, |
| "in_channels": 128, |
| "mode": "nearest", |
| "out_channels": 128, |
| "repeats": 1, |
| "scale_factor": 2, |
| "type": "Upsample" |
| }, |
| { |
| "dimension": 1, |
| "from": [ |
| -1, |
| 4 |
| ], |
| "in_channels": [ |
| 128, |
| 128 |
| ], |
| "out_channels": 256, |
| "repeats": 1, |
| "type": "Concat" |
| }, |
| { |
| "from": -1, |
| "in_channels": 256, |
| "out_channels": 128, |
| "repeats": 1, |
| "shortcut": false, |
| "type": "C3" |
| }, |
| { |
| "from": -1, |
| "in_channels": 128, |
| "kernel": 3, |
| "out_channels": 128, |
| "repeats": 1, |
| "stride": 2, |
| "type": "Conv" |
| }, |
| { |
| "dimension": 1, |
| "from": [ |
| -1, |
| 14 |
| ], |
| "in_channels": [ |
| 128, |
| 128 |
| ], |
| "out_channels": 256, |
| "repeats": 1, |
| "type": "Concat" |
| }, |
| { |
| "from": -1, |
| "in_channels": 256, |
| "out_channels": 256, |
| "repeats": 1, |
| "shortcut": false, |
| "type": "C3" |
| }, |
| { |
| "from": -1, |
| "in_channels": 256, |
| "kernel": 3, |
| "out_channels": 256, |
| "repeats": 1, |
| "stride": 2, |
| "type": "Conv" |
| }, |
| { |
| "dimension": 1, |
| "from": [ |
| -1, |
| 10 |
| ], |
| "in_channels": [ |
| 256, |
| 256 |
| ], |
| "out_channels": 512, |
| "repeats": 1, |
| "type": "Concat" |
| }, |
| { |
| "from": -1, |
| "in_channels": 512, |
| "out_channels": 512, |
| "repeats": 1, |
| "shortcut": false, |
| "type": "C3" |
| }, |
| { |
| "from": [ |
| 17, |
| 20, |
| 23 |
| ], |
| "in_channels": [ |
| 128, |
| 256, |
| 512 |
| ], |
| "out_channels": 21, |
| "repeats": 1, |
| "type": "Detect" |
| } |
| ], |
| "width_multiple": 0.5 |
| } |
| } |
|
|