comictextdetector-mlx / config.json
jkeisling's picture
Upload 2 files
3135bcf verified
Raw
History Blame Contribute Delete
9.14 kB
{
"architectures": [
"MlxComicTextDetector"
],
"format_version": 1,
"fusion": {
"conv_bn": "forward",
"head_bn_eps": 1e-05,
"trunk_bn_eps": 0.001
},
"heads": {
"db": {
"activation": "relu",
"layers": [
{
"in_channels": 512,
"name": "upconv3",
"out_channels": 512,
"repeats": 1,
"type": "UpC3"
},
{
"in_channels": 384,
"name": "upconv4",
"out_channels": 256,
"repeats": 1,
"type": "UpC3"
},
{
"in_channels": 128,
"kernel": 1,
"name": "projection",
"out_channels": 64,
"padding": 0,
"stride": 1,
"type": "Conv2d"
},
{
"in_channels": 64,
"kernel": 3,
"name": "binarize.0",
"out_channels": 16,
"padding": 1,
"stride": 1,
"type": "Conv2d"
},
{
"in_channels": 16,
"kernel": 2,
"name": "binarize.3",
"out_channels": 16,
"padding": 0,
"stride": 2,
"type": "ConvTranspose2d"
},
{
"in_channels": 16,
"kernel": 2,
"name": "binarize.6",
"out_channels": 1,
"padding": 0,
"stride": 2,
"type": "ConvTranspose2d"
},
{
"in_channels": 64,
"kernel": 3,
"name": "thresh.0",
"out_channels": 16,
"padding": 1,
"stride": 1,
"type": "Conv2d"
},
{
"in_channels": 16,
"kernel": 2,
"name": "thresh.3",
"out_channels": 16,
"padding": 0,
"stride": 2,
"type": "ConvTranspose2d"
},
{
"in_channels": 16,
"kernel": 2,
"name": "thresh.6",
"out_channels": 1,
"padding": 0,
"stride": 2,
"type": "ConvTranspose2d"
}
]
},
"segmentation": {
"activation": "leaky",
"layers": [
{
"in_channels": 512,
"name": "down_conv1",
"out_channels": 512,
"repeats": 1,
"type": "DownC3"
},
{
"in_channels": 512,
"name": "upconv0",
"out_channels": 512,
"repeats": 1,
"type": "UpC3"
},
{
"in_channels": 768,
"name": "upconv2",
"out_channels": 512,
"repeats": 1,
"type": "UpC3"
},
{
"in_channels": 512,
"name": "upconv3",
"out_channels": 512,
"repeats": 1,
"type": "UpC3"
},
{
"in_channels": 384,
"name": "upconv4",
"out_channels": 256,
"repeats": 1,
"type": "UpC3"
},
{
"in_channels": 192,
"name": "upconv5",
"out_channels": 128,
"repeats": 1,
"type": "UpC3"
},
{
"in_channels": 64,
"kernel": 4,
"name": "upconv6",
"out_channels": 1,
"padding": 1,
"stride": 2,
"type": "ConvTranspose2d"
}
]
}
},
"id2label": {
"0": "eng",
"1": "ja"
},
"image_size": 1024,
"input_layout": "NCHW",
"internal_layout": "NHWC",
"label2id": {
"eng": 0,
"ja": 1
},
"model_type": "comic_text_detector",
"num_classes": 2,
"torch_dtype": "float32",
"weights": {
"file": "model.safetensors",
"format": "safetensors",
"layout": "mlx-ohwi"
},
"yolo": {
"anchors": [
[
[
10,
13
],
[
16,
30
],
[
33,
23
]
],
[
[
30,
61
],
[
62,
45
],
[
59,
119
]
],
[
[
116,
90
],
[
156,
198
],
[
373,
326
]
]
],
"depth_multiple": 0.33,
"detect_indices": [
17,
20,
23
],
"feature_indices": [
1,
3,
5,
7,
9
],
"layers": [
{
"from": -1,
"in_channels": 3,
"kernel": 6,
"out_channels": 32,
"padding": 2,
"repeats": 1,
"stride": 2,
"type": "Conv"
},
{
"from": -1,
"in_channels": 32,
"kernel": 3,
"out_channels": 64,
"repeats": 1,
"stride": 2,
"type": "Conv"
},
{
"from": -1,
"in_channels": 64,
"out_channels": 64,
"repeats": 1,
"shortcut": true,
"type": "C3"
},
{
"from": -1,
"in_channels": 64,
"kernel": 3,
"out_channels": 128,
"repeats": 1,
"stride": 2,
"type": "Conv"
},
{
"from": -1,
"in_channels": 128,
"out_channels": 128,
"repeats": 2,
"shortcut": true,
"type": "C3"
},
{
"from": -1,
"in_channels": 128,
"kernel": 3,
"out_channels": 256,
"repeats": 1,
"stride": 2,
"type": "Conv"
},
{
"from": -1,
"in_channels": 256,
"out_channels": 256,
"repeats": 3,
"shortcut": true,
"type": "C3"
},
{
"from": -1,
"in_channels": 256,
"kernel": 3,
"out_channels": 512,
"repeats": 1,
"stride": 2,
"type": "Conv"
},
{
"from": -1,
"in_channels": 512,
"out_channels": 512,
"repeats": 1,
"shortcut": true,
"type": "C3"
},
{
"from": -1,
"in_channels": 512,
"kernel": 5,
"out_channels": 512,
"repeats": 1,
"type": "SPPF"
},
{
"from": -1,
"in_channels": 512,
"kernel": 1,
"out_channels": 256,
"repeats": 1,
"stride": 1,
"type": "Conv"
},
{
"from": -1,
"in_channels": 256,
"mode": "nearest",
"out_channels": 256,
"repeats": 1,
"scale_factor": 2,
"type": "Upsample"
},
{
"dimension": 1,
"from": [
-1,
6
],
"in_channels": [
256,
256
],
"out_channels": 512,
"repeats": 1,
"type": "Concat"
},
{
"from": -1,
"in_channels": 512,
"out_channels": 256,
"repeats": 1,
"shortcut": false,
"type": "C3"
},
{
"from": -1,
"in_channels": 256,
"kernel": 1,
"out_channels": 128,
"repeats": 1,
"stride": 1,
"type": "Conv"
},
{
"from": -1,
"in_channels": 128,
"mode": "nearest",
"out_channels": 128,
"repeats": 1,
"scale_factor": 2,
"type": "Upsample"
},
{
"dimension": 1,
"from": [
-1,
4
],
"in_channels": [
128,
128
],
"out_channels": 256,
"repeats": 1,
"type": "Concat"
},
{
"from": -1,
"in_channels": 256,
"out_channels": 128,
"repeats": 1,
"shortcut": false,
"type": "C3"
},
{
"from": -1,
"in_channels": 128,
"kernel": 3,
"out_channels": 128,
"repeats": 1,
"stride": 2,
"type": "Conv"
},
{
"dimension": 1,
"from": [
-1,
14
],
"in_channels": [
128,
128
],
"out_channels": 256,
"repeats": 1,
"type": "Concat"
},
{
"from": -1,
"in_channels": 256,
"out_channels": 256,
"repeats": 1,
"shortcut": false,
"type": "C3"
},
{
"from": -1,
"in_channels": 256,
"kernel": 3,
"out_channels": 256,
"repeats": 1,
"stride": 2,
"type": "Conv"
},
{
"dimension": 1,
"from": [
-1,
10
],
"in_channels": [
256,
256
],
"out_channels": 512,
"repeats": 1,
"type": "Concat"
},
{
"from": -1,
"in_channels": 512,
"out_channels": 512,
"repeats": 1,
"shortcut": false,
"type": "C3"
},
{
"from": [
17,
20,
23
],
"in_channels": [
128,
256,
512
],
"out_channels": 21,
"repeats": 1,
"type": "Detect"
}
],
"width_multiple": 0.5
}
}