PocketTTS-ONNX / bundle.json
IgnitiveLabs's picture
Upload folder using huggingface_hub
fa83274 verified
Raw
History Blame Contribute Delete
25.5 kB
{
"bos_before_voice_file": "bos_before_voice.npy",
"bundle_name": "english_2026-04",
"conditioning_dim": 1024,
"flow_lm_state_manifest": [
{
"dtype": "float32",
"fill": "nan",
"index": 0,
"input_name": "state_0",
"key": "cache",
"module": "transformer.layers.0.self_attn",
"output_name": "out_state_0",
"path": "transformer.layers.0.self_attn/cache",
"shape": [
2,
1,
1000,
16,
64
]
},
{
"dtype": "float32",
"fill": "empty",
"index": 1,
"input_name": "state_1",
"key": "current_end",
"module": "transformer.layers.0.self_attn",
"output_name": "out_state_1",
"path": "transformer.layers.0.self_attn/current_end",
"shape": [
0
]
},
{
"dtype": "int64",
"fill": "zeros",
"index": 2,
"input_name": "state_2",
"key": "step",
"module": "transformer.layers.0.self_attn",
"output_name": "out_state_2",
"path": "transformer.layers.0.self_attn/step",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "nan",
"index": 3,
"input_name": "state_3",
"key": "cache",
"module": "transformer.layers.1.self_attn",
"output_name": "out_state_3",
"path": "transformer.layers.1.self_attn/cache",
"shape": [
2,
1,
1000,
16,
64
]
},
{
"dtype": "float32",
"fill": "empty",
"index": 4,
"input_name": "state_4",
"key": "current_end",
"module": "transformer.layers.1.self_attn",
"output_name": "out_state_4",
"path": "transformer.layers.1.self_attn/current_end",
"shape": [
0
]
},
{
"dtype": "int64",
"fill": "zeros",
"index": 5,
"input_name": "state_5",
"key": "step",
"module": "transformer.layers.1.self_attn",
"output_name": "out_state_5",
"path": "transformer.layers.1.self_attn/step",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "nan",
"index": 6,
"input_name": "state_6",
"key": "cache",
"module": "transformer.layers.2.self_attn",
"output_name": "out_state_6",
"path": "transformer.layers.2.self_attn/cache",
"shape": [
2,
1,
1000,
16,
64
]
},
{
"dtype": "float32",
"fill": "empty",
"index": 7,
"input_name": "state_7",
"key": "current_end",
"module": "transformer.layers.2.self_attn",
"output_name": "out_state_7",
"path": "transformer.layers.2.self_attn/current_end",
"shape": [
0
]
},
{
"dtype": "int64",
"fill": "zeros",
"index": 8,
"input_name": "state_8",
"key": "step",
"module": "transformer.layers.2.self_attn",
"output_name": "out_state_8",
"path": "transformer.layers.2.self_attn/step",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "nan",
"index": 9,
"input_name": "state_9",
"key": "cache",
"module": "transformer.layers.3.self_attn",
"output_name": "out_state_9",
"path": "transformer.layers.3.self_attn/cache",
"shape": [
2,
1,
1000,
16,
64
]
},
{
"dtype": "float32",
"fill": "empty",
"index": 10,
"input_name": "state_10",
"key": "current_end",
"module": "transformer.layers.3.self_attn",
"output_name": "out_state_10",
"path": "transformer.layers.3.self_attn/current_end",
"shape": [
0
]
},
{
"dtype": "int64",
"fill": "zeros",
"index": 11,
"input_name": "state_11",
"key": "step",
"module": "transformer.layers.3.self_attn",
"output_name": "out_state_11",
"path": "transformer.layers.3.self_attn/step",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "nan",
"index": 12,
"input_name": "state_12",
"key": "cache",
"module": "transformer.layers.4.self_attn",
"output_name": "out_state_12",
"path": "transformer.layers.4.self_attn/cache",
"shape": [
2,
1,
1000,
16,
64
]
},
{
"dtype": "float32",
"fill": "empty",
"index": 13,
"input_name": "state_13",
"key": "current_end",
"module": "transformer.layers.4.self_attn",
"output_name": "out_state_13",
"path": "transformer.layers.4.self_attn/current_end",
"shape": [
0
]
},
{
"dtype": "int64",
"fill": "zeros",
"index": 14,
"input_name": "state_14",
"key": "step",
"module": "transformer.layers.4.self_attn",
"output_name": "out_state_14",
"path": "transformer.layers.4.self_attn/step",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "nan",
"index": 15,
"input_name": "state_15",
"key": "cache",
"module": "transformer.layers.5.self_attn",
"output_name": "out_state_15",
"path": "transformer.layers.5.self_attn/cache",
"shape": [
2,
1,
1000,
16,
64
]
},
{
"dtype": "float32",
"fill": "empty",
"index": 16,
"input_name": "state_16",
"key": "current_end",
"module": "transformer.layers.5.self_attn",
"output_name": "out_state_16",
"path": "transformer.layers.5.self_attn/current_end",
"shape": [
0
]
},
{
"dtype": "int64",
"fill": "zeros",
"index": 17,
"input_name": "state_17",
"key": "step",
"module": "transformer.layers.5.self_attn",
"output_name": "out_state_17",
"path": "transformer.layers.5.self_attn/step",
"shape": [
1
]
}
],
"frame_rate": 12.5,
"insert_bos_before_voice": true,
"language": "english_2026-04",
"latent_dim": 32,
"max_token_per_chunk": 50,
"mimi_state_manifest": [
{
"dtype": "bool",
"fill": "ones",
"index": 0,
"input_name": "state_0",
"key": "first",
"module": "decoder.model.0",
"output_name": "out_state_0",
"path": "decoder.model.0/first",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "zeros",
"index": 1,
"input_name": "state_1",
"key": "previous",
"module": "decoder.model.0",
"output_name": "out_state_1",
"path": "decoder.model.0/previous",
"shape": [
1,
512,
6
]
},
{
"dtype": "bool",
"fill": "ones",
"index": 2,
"input_name": "state_2",
"key": "first",
"module": "decoder.model.11",
"output_name": "out_state_2",
"path": "decoder.model.11/first",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "zeros",
"index": 3,
"input_name": "state_3",
"key": "previous",
"module": "decoder.model.11",
"output_name": "out_state_3",
"path": "decoder.model.11/previous",
"shape": [
1,
64,
2
]
},
{
"dtype": "float32",
"fill": "zeros",
"index": 4,
"input_name": "state_4",
"key": "partial",
"module": "decoder.model.2",
"output_name": "out_state_4",
"path": "decoder.model.2/partial",
"shape": [
1,
256,
6
]
},
{
"dtype": "bool",
"fill": "ones",
"index": 5,
"input_name": "state_5",
"key": "first",
"module": "decoder.model.3.block.1",
"output_name": "out_state_5",
"path": "decoder.model.3.block.1/first",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "zeros",
"index": 6,
"input_name": "state_6",
"key": "previous",
"module": "decoder.model.3.block.1",
"output_name": "out_state_6",
"path": "decoder.model.3.block.1/previous",
"shape": [
1,
256,
2
]
},
{
"dtype": "bool",
"fill": "ones",
"index": 7,
"input_name": "state_7",
"key": "first",
"module": "decoder.model.3.block.3",
"output_name": "out_state_7",
"path": "decoder.model.3.block.3/first",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "empty",
"index": 8,
"input_name": "state_8",
"key": "previous",
"module": "decoder.model.3.block.3",
"output_name": "out_state_8",
"path": "decoder.model.3.block.3/previous",
"shape": [
1,
128,
0
]
},
{
"dtype": "float32",
"fill": "zeros",
"index": 9,
"input_name": "state_9",
"key": "partial",
"module": "decoder.model.5",
"output_name": "out_state_9",
"path": "decoder.model.5/partial",
"shape": [
1,
128,
5
]
},
{
"dtype": "bool",
"fill": "ones",
"index": 10,
"input_name": "state_10",
"key": "first",
"module": "decoder.model.6.block.1",
"output_name": "out_state_10",
"path": "decoder.model.6.block.1/first",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "zeros",
"index": 11,
"input_name": "state_11",
"key": "previous",
"module": "decoder.model.6.block.1",
"output_name": "out_state_11",
"path": "decoder.model.6.block.1/previous",
"shape": [
1,
128,
2
]
},
{
"dtype": "bool",
"fill": "ones",
"index": 12,
"input_name": "state_12",
"key": "first",
"module": "decoder.model.6.block.3",
"output_name": "out_state_12",
"path": "decoder.model.6.block.3/first",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "empty",
"index": 13,
"input_name": "state_13",
"key": "previous",
"module": "decoder.model.6.block.3",
"output_name": "out_state_13",
"path": "decoder.model.6.block.3/previous",
"shape": [
1,
64,
0
]
},
{
"dtype": "float32",
"fill": "zeros",
"index": 14,
"input_name": "state_14",
"key": "partial",
"module": "decoder.model.8",
"output_name": "out_state_14",
"path": "decoder.model.8/partial",
"shape": [
1,
64,
4
]
},
{
"dtype": "bool",
"fill": "ones",
"index": 15,
"input_name": "state_15",
"key": "first",
"module": "decoder.model.9.block.1",
"output_name": "out_state_15",
"path": "decoder.model.9.block.1/first",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "zeros",
"index": 16,
"input_name": "state_16",
"key": "previous",
"module": "decoder.model.9.block.1",
"output_name": "out_state_16",
"path": "decoder.model.9.block.1/previous",
"shape": [
1,
64,
2
]
},
{
"dtype": "bool",
"fill": "ones",
"index": 17,
"input_name": "state_17",
"key": "first",
"module": "decoder.model.9.block.3",
"output_name": "out_state_17",
"path": "decoder.model.9.block.3/first",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "empty",
"index": 18,
"input_name": "state_18",
"key": "previous",
"module": "decoder.model.9.block.3",
"output_name": "out_state_18",
"path": "decoder.model.9.block.3/previous",
"shape": [
1,
32,
0
]
},
{
"dtype": "float32",
"fill": "zeros",
"index": 19,
"input_name": "state_19",
"key": "cache",
"module": "decoder_transformer.transformer.layers.0.self_attn",
"output_name": "out_state_19",
"path": "decoder_transformer.transformer.layers.0.self_attn/cache",
"shape": [
2,
1,
8,
1000,
64
]
},
{
"dtype": "int64",
"fill": "zeros",
"index": 20,
"input_name": "state_20",
"key": "end_offset",
"module": "decoder_transformer.transformer.layers.0.self_attn",
"output_name": "out_state_20",
"path": "decoder_transformer.transformer.layers.0.self_attn/end_offset",
"shape": [
1
]
},
{
"dtype": "int64",
"fill": "zeros",
"index": 21,
"input_name": "state_21",
"key": "offset",
"module": "decoder_transformer.transformer.layers.0.self_attn",
"output_name": "out_state_21",
"path": "decoder_transformer.transformer.layers.0.self_attn/offset",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "zeros",
"index": 22,
"input_name": "state_22",
"key": "cache",
"module": "decoder_transformer.transformer.layers.1.self_attn",
"output_name": "out_state_22",
"path": "decoder_transformer.transformer.layers.1.self_attn/cache",
"shape": [
2,
1,
8,
1000,
64
]
},
{
"dtype": "int64",
"fill": "zeros",
"index": 23,
"input_name": "state_23",
"key": "end_offset",
"module": "decoder_transformer.transformer.layers.1.self_attn",
"output_name": "out_state_23",
"path": "decoder_transformer.transformer.layers.1.self_attn/end_offset",
"shape": [
1
]
},
{
"dtype": "int64",
"fill": "zeros",
"index": 24,
"input_name": "state_24",
"key": "offset",
"module": "decoder_transformer.transformer.layers.1.self_attn",
"output_name": "out_state_24",
"path": "decoder_transformer.transformer.layers.1.self_attn/offset",
"shape": [
1
]
},
{
"dtype": "bool",
"fill": "ones",
"index": 25,
"input_name": "state_25",
"key": "first",
"module": "downsample.conv",
"output_name": "out_state_25",
"path": "downsample.conv/first",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "zeros",
"index": 26,
"input_name": "state_26",
"key": "previous",
"module": "downsample.conv",
"output_name": "out_state_26",
"path": "downsample.conv/previous",
"shape": [
1,
512,
16
]
},
{
"dtype": "bool",
"fill": "ones",
"index": 27,
"input_name": "state_27",
"key": "first",
"module": "encoder.model.0",
"output_name": "out_state_27",
"path": "encoder.model.0/first",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "zeros",
"index": 28,
"input_name": "state_28",
"key": "previous",
"module": "encoder.model.0",
"output_name": "out_state_28",
"path": "encoder.model.0/previous",
"shape": [
1,
1,
6
]
},
{
"dtype": "bool",
"fill": "ones",
"index": 29,
"input_name": "state_29",
"key": "first",
"module": "encoder.model.1.block.1",
"output_name": "out_state_29",
"path": "encoder.model.1.block.1/first",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "zeros",
"index": 30,
"input_name": "state_30",
"key": "previous",
"module": "encoder.model.1.block.1",
"output_name": "out_state_30",
"path": "encoder.model.1.block.1/previous",
"shape": [
1,
64,
2
]
},
{
"dtype": "bool",
"fill": "ones",
"index": 31,
"input_name": "state_31",
"key": "first",
"module": "encoder.model.1.block.3",
"output_name": "out_state_31",
"path": "encoder.model.1.block.3/first",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "empty",
"index": 32,
"input_name": "state_32",
"key": "previous",
"module": "encoder.model.1.block.3",
"output_name": "out_state_32",
"path": "encoder.model.1.block.3/previous",
"shape": [
1,
32,
0
]
},
{
"dtype": "bool",
"fill": "ones",
"index": 33,
"input_name": "state_33",
"key": "first",
"module": "encoder.model.11",
"output_name": "out_state_33",
"path": "encoder.model.11/first",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "zeros",
"index": 34,
"input_name": "state_34",
"key": "previous",
"module": "encoder.model.11",
"output_name": "out_state_34",
"path": "encoder.model.11/previous",
"shape": [
1,
512,
2
]
},
{
"dtype": "bool",
"fill": "ones",
"index": 35,
"input_name": "state_35",
"key": "first",
"module": "encoder.model.3",
"output_name": "out_state_35",
"path": "encoder.model.3/first",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "zeros",
"index": 36,
"input_name": "state_36",
"key": "previous",
"module": "encoder.model.3",
"output_name": "out_state_36",
"path": "encoder.model.3/previous",
"shape": [
1,
64,
4
]
},
{
"dtype": "bool",
"fill": "ones",
"index": 37,
"input_name": "state_37",
"key": "first",
"module": "encoder.model.4.block.1",
"output_name": "out_state_37",
"path": "encoder.model.4.block.1/first",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "zeros",
"index": 38,
"input_name": "state_38",
"key": "previous",
"module": "encoder.model.4.block.1",
"output_name": "out_state_38",
"path": "encoder.model.4.block.1/previous",
"shape": [
1,
128,
2
]
},
{
"dtype": "bool",
"fill": "ones",
"index": 39,
"input_name": "state_39",
"key": "first",
"module": "encoder.model.4.block.3",
"output_name": "out_state_39",
"path": "encoder.model.4.block.3/first",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "empty",
"index": 40,
"input_name": "state_40",
"key": "previous",
"module": "encoder.model.4.block.3",
"output_name": "out_state_40",
"path": "encoder.model.4.block.3/previous",
"shape": [
1,
64,
0
]
},
{
"dtype": "bool",
"fill": "ones",
"index": 41,
"input_name": "state_41",
"key": "first",
"module": "encoder.model.6",
"output_name": "out_state_41",
"path": "encoder.model.6/first",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "zeros",
"index": 42,
"input_name": "state_42",
"key": "previous",
"module": "encoder.model.6",
"output_name": "out_state_42",
"path": "encoder.model.6/previous",
"shape": [
1,
128,
5
]
},
{
"dtype": "bool",
"fill": "ones",
"index": 43,
"input_name": "state_43",
"key": "first",
"module": "encoder.model.7.block.1",
"output_name": "out_state_43",
"path": "encoder.model.7.block.1/first",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "zeros",
"index": 44,
"input_name": "state_44",
"key": "previous",
"module": "encoder.model.7.block.1",
"output_name": "out_state_44",
"path": "encoder.model.7.block.1/previous",
"shape": [
1,
256,
2
]
},
{
"dtype": "bool",
"fill": "ones",
"index": 45,
"input_name": "state_45",
"key": "first",
"module": "encoder.model.7.block.3",
"output_name": "out_state_45",
"path": "encoder.model.7.block.3/first",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "empty",
"index": 46,
"input_name": "state_46",
"key": "previous",
"module": "encoder.model.7.block.3",
"output_name": "out_state_46",
"path": "encoder.model.7.block.3/previous",
"shape": [
1,
128,
0
]
},
{
"dtype": "bool",
"fill": "ones",
"index": 47,
"input_name": "state_47",
"key": "first",
"module": "encoder.model.9",
"output_name": "out_state_47",
"path": "encoder.model.9/first",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "zeros",
"index": 48,
"input_name": "state_48",
"key": "previous",
"module": "encoder.model.9",
"output_name": "out_state_48",
"path": "encoder.model.9/previous",
"shape": [
1,
256,
6
]
},
{
"dtype": "float32",
"fill": "zeros",
"index": 49,
"input_name": "state_49",
"key": "cache",
"module": "encoder_transformer.transformer.layers.0.self_attn",
"output_name": "out_state_49",
"path": "encoder_transformer.transformer.layers.0.self_attn/cache",
"shape": [
2,
1,
8,
1000,
64
]
},
{
"dtype": "int64",
"fill": "zeros",
"index": 50,
"input_name": "state_50",
"key": "end_offset",
"module": "encoder_transformer.transformer.layers.0.self_attn",
"output_name": "out_state_50",
"path": "encoder_transformer.transformer.layers.0.self_attn/end_offset",
"shape": [
1
]
},
{
"dtype": "int64",
"fill": "zeros",
"index": 51,
"input_name": "state_51",
"key": "offset",
"module": "encoder_transformer.transformer.layers.0.self_attn",
"output_name": "out_state_51",
"path": "encoder_transformer.transformer.layers.0.self_attn/offset",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "zeros",
"index": 52,
"input_name": "state_52",
"key": "cache",
"module": "encoder_transformer.transformer.layers.1.self_attn",
"output_name": "out_state_52",
"path": "encoder_transformer.transformer.layers.1.self_attn/cache",
"shape": [
2,
1,
8,
1000,
64
]
},
{
"dtype": "int64",
"fill": "zeros",
"index": 53,
"input_name": "state_53",
"key": "end_offset",
"module": "encoder_transformer.transformer.layers.1.self_attn",
"output_name": "out_state_53",
"path": "encoder_transformer.transformer.layers.1.self_attn/end_offset",
"shape": [
1
]
},
{
"dtype": "int64",
"fill": "zeros",
"index": 54,
"input_name": "state_54",
"key": "offset",
"module": "encoder_transformer.transformer.layers.1.self_attn",
"output_name": "out_state_54",
"path": "encoder_transformer.transformer.layers.1.self_attn/offset",
"shape": [
1
]
},
{
"dtype": "float32",
"fill": "zeros",
"index": 55,
"input_name": "state_55",
"key": "partial",
"module": "upsample.convtr",
"output_name": "out_state_55",
"path": "upsample.convtr/partial",
"shape": [
1,
512,
16
]
}
],
"model_recommended_frames_after_eos": null,
"pad_with_spaces_for_short_inputs": false,
"predefined_voices": [
"alba",
"azelma",
"cosette",
"eponine",
"fantine",
"javert",
"jean",
"marius"
],
"remove_semicolons": false,
"sample_rate": 24000,
"samples_per_frame": 1920,
"schema_version": 2,
"tokenizer_file": "tokenizer.model"
}