Instructions to use IgnitiveLabs/PocketTTS-ONNX with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Pocket-TTS
How to use IgnitiveLabs/PocketTTS-ONNX with Pocket-TTS:
```python
from pocket_tts import TTSModel
import scipy.io.wavfile

tts_model = TTSModel.load_model("IgnitiveLabs/PocketTTS-ONNX")
voice_state = tts_model.get_state_for_audio_prompt(
    "hf://kyutai/tts-voices/alba-mackenna/casual.wav"
)
audio = tts_model.generate_audio(voice_state, "Hello world, this is a test.")
# Audio is a 1D torch tensor containing PCM data.
scipy.io.wavfile.write("output.wav", tts_model.sample_rate, audio.numpy())
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "bos_before_voice_file": "bos_before_voice.npy", | |
| "bundle_name": "english_2026-04", | |
| "conditioning_dim": 1024, | |
| "flow_lm_state_manifest": [ | |
| { | |
| "dtype": "float32", | |
| "fill": "nan", | |
| "index": 0, | |
| "input_name": "state_0", | |
| "key": "cache", | |
| "module": "transformer.layers.0.self_attn", | |
| "output_name": "out_state_0", | |
| "path": "transformer.layers.0.self_attn/cache", | |
| "shape": [ | |
| 2, | |
| 1, | |
| 1000, | |
| 16, | |
| 64 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "empty", | |
| "index": 1, | |
| "input_name": "state_1", | |
| "key": "current_end", | |
| "module": "transformer.layers.0.self_attn", | |
| "output_name": "out_state_1", | |
| "path": "transformer.layers.0.self_attn/current_end", | |
| "shape": [ | |
| 0 | |
| ] | |
| }, | |
| { | |
| "dtype": "int64", | |
| "fill": "zeros", | |
| "index": 2, | |
| "input_name": "state_2", | |
| "key": "step", | |
| "module": "transformer.layers.0.self_attn", | |
| "output_name": "out_state_2", | |
| "path": "transformer.layers.0.self_attn/step", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "nan", | |
| "index": 3, | |
| "input_name": "state_3", | |
| "key": "cache", | |
| "module": "transformer.layers.1.self_attn", | |
| "output_name": "out_state_3", | |
| "path": "transformer.layers.1.self_attn/cache", | |
| "shape": [ | |
| 2, | |
| 1, | |
| 1000, | |
| 16, | |
| 64 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "empty", | |
| "index": 4, | |
| "input_name": "state_4", | |
| "key": "current_end", | |
| "module": "transformer.layers.1.self_attn", | |
| "output_name": "out_state_4", | |
| "path": "transformer.layers.1.self_attn/current_end", | |
| "shape": [ | |
| 0 | |
| ] | |
| }, | |
| { | |
| "dtype": "int64", | |
| "fill": "zeros", | |
| "index": 5, | |
| "input_name": "state_5", | |
| "key": "step", | |
| "module": "transformer.layers.1.self_attn", | |
| "output_name": "out_state_5", | |
| "path": "transformer.layers.1.self_attn/step", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "nan", | |
| "index": 6, | |
| "input_name": "state_6", | |
| "key": "cache", | |
| "module": "transformer.layers.2.self_attn", | |
| "output_name": "out_state_6", | |
| "path": "transformer.layers.2.self_attn/cache", | |
| "shape": [ | |
| 2, | |
| 1, | |
| 1000, | |
| 16, | |
| 64 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "empty", | |
| "index": 7, | |
| "input_name": "state_7", | |
| "key": "current_end", | |
| "module": "transformer.layers.2.self_attn", | |
| "output_name": "out_state_7", | |
| "path": "transformer.layers.2.self_attn/current_end", | |
| "shape": [ | |
| 0 | |
| ] | |
| }, | |
| { | |
| "dtype": "int64", | |
| "fill": "zeros", | |
| "index": 8, | |
| "input_name": "state_8", | |
| "key": "step", | |
| "module": "transformer.layers.2.self_attn", | |
| "output_name": "out_state_8", | |
| "path": "transformer.layers.2.self_attn/step", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "nan", | |
| "index": 9, | |
| "input_name": "state_9", | |
| "key": "cache", | |
| "module": "transformer.layers.3.self_attn", | |
| "output_name": "out_state_9", | |
| "path": "transformer.layers.3.self_attn/cache", | |
| "shape": [ | |
| 2, | |
| 1, | |
| 1000, | |
| 16, | |
| 64 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "empty", | |
| "index": 10, | |
| "input_name": "state_10", | |
| "key": "current_end", | |
| "module": "transformer.layers.3.self_attn", | |
| "output_name": "out_state_10", | |
| "path": "transformer.layers.3.self_attn/current_end", | |
| "shape": [ | |
| 0 | |
| ] | |
| }, | |
| { | |
| "dtype": "int64", | |
| "fill": "zeros", | |
| "index": 11, | |
| "input_name": "state_11", | |
| "key": "step", | |
| "module": "transformer.layers.3.self_attn", | |
| "output_name": "out_state_11", | |
| "path": "transformer.layers.3.self_attn/step", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "nan", | |
| "index": 12, | |
| "input_name": "state_12", | |
| "key": "cache", | |
| "module": "transformer.layers.4.self_attn", | |
| "output_name": "out_state_12", | |
| "path": "transformer.layers.4.self_attn/cache", | |
| "shape": [ | |
| 2, | |
| 1, | |
| 1000, | |
| 16, | |
| 64 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "empty", | |
| "index": 13, | |
| "input_name": "state_13", | |
| "key": "current_end", | |
| "module": "transformer.layers.4.self_attn", | |
| "output_name": "out_state_13", | |
| "path": "transformer.layers.4.self_attn/current_end", | |
| "shape": [ | |
| 0 | |
| ] | |
| }, | |
| { | |
| "dtype": "int64", | |
| "fill": "zeros", | |
| "index": 14, | |
| "input_name": "state_14", | |
| "key": "step", | |
| "module": "transformer.layers.4.self_attn", | |
| "output_name": "out_state_14", | |
| "path": "transformer.layers.4.self_attn/step", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "nan", | |
| "index": 15, | |
| "input_name": "state_15", | |
| "key": "cache", | |
| "module": "transformer.layers.5.self_attn", | |
| "output_name": "out_state_15", | |
| "path": "transformer.layers.5.self_attn/cache", | |
| "shape": [ | |
| 2, | |
| 1, | |
| 1000, | |
| 16, | |
| 64 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "empty", | |
| "index": 16, | |
| "input_name": "state_16", | |
| "key": "current_end", | |
| "module": "transformer.layers.5.self_attn", | |
| "output_name": "out_state_16", | |
| "path": "transformer.layers.5.self_attn/current_end", | |
| "shape": [ | |
| 0 | |
| ] | |
| }, | |
| { | |
| "dtype": "int64", | |
| "fill": "zeros", | |
| "index": 17, | |
| "input_name": "state_17", | |
| "key": "step", | |
| "module": "transformer.layers.5.self_attn", | |
| "output_name": "out_state_17", | |
| "path": "transformer.layers.5.self_attn/step", | |
| "shape": [ | |
| 1 | |
| ] | |
| } | |
| ], | |
| "frame_rate": 12.5, | |
| "insert_bos_before_voice": true, | |
| "language": "english_2026-04", | |
| "latent_dim": 32, | |
| "max_token_per_chunk": 50, | |
| "mimi_state_manifest": [ | |
| { | |
| "dtype": "bool", | |
| "fill": "ones", | |
| "index": 0, | |
| "input_name": "state_0", | |
| "key": "first", | |
| "module": "decoder.model.0", | |
| "output_name": "out_state_0", | |
| "path": "decoder.model.0/first", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "zeros", | |
| "index": 1, | |
| "input_name": "state_1", | |
| "key": "previous", | |
| "module": "decoder.model.0", | |
| "output_name": "out_state_1", | |
| "path": "decoder.model.0/previous", | |
| "shape": [ | |
| 1, | |
| 512, | |
| 6 | |
| ] | |
| }, | |
| { | |
| "dtype": "bool", | |
| "fill": "ones", | |
| "index": 2, | |
| "input_name": "state_2", | |
| "key": "first", | |
| "module": "decoder.model.11", | |
| "output_name": "out_state_2", | |
| "path": "decoder.model.11/first", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "zeros", | |
| "index": 3, | |
| "input_name": "state_3", | |
| "key": "previous", | |
| "module": "decoder.model.11", | |
| "output_name": "out_state_3", | |
| "path": "decoder.model.11/previous", | |
| "shape": [ | |
| 1, | |
| 64, | |
| 2 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "zeros", | |
| "index": 4, | |
| "input_name": "state_4", | |
| "key": "partial", | |
| "module": "decoder.model.2", | |
| "output_name": "out_state_4", | |
| "path": "decoder.model.2/partial", | |
| "shape": [ | |
| 1, | |
| 256, | |
| 6 | |
| ] | |
| }, | |
| { | |
| "dtype": "bool", | |
| "fill": "ones", | |
| "index": 5, | |
| "input_name": "state_5", | |
| "key": "first", | |
| "module": "decoder.model.3.block.1", | |
| "output_name": "out_state_5", | |
| "path": "decoder.model.3.block.1/first", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "zeros", | |
| "index": 6, | |
| "input_name": "state_6", | |
| "key": "previous", | |
| "module": "decoder.model.3.block.1", | |
| "output_name": "out_state_6", | |
| "path": "decoder.model.3.block.1/previous", | |
| "shape": [ | |
| 1, | |
| 256, | |
| 2 | |
| ] | |
| }, | |
| { | |
| "dtype": "bool", | |
| "fill": "ones", | |
| "index": 7, | |
| "input_name": "state_7", | |
| "key": "first", | |
| "module": "decoder.model.3.block.3", | |
| "output_name": "out_state_7", | |
| "path": "decoder.model.3.block.3/first", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "empty", | |
| "index": 8, | |
| "input_name": "state_8", | |
| "key": "previous", | |
| "module": "decoder.model.3.block.3", | |
| "output_name": "out_state_8", | |
| "path": "decoder.model.3.block.3/previous", | |
| "shape": [ | |
| 1, | |
| 128, | |
| 0 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "zeros", | |
| "index": 9, | |
| "input_name": "state_9", | |
| "key": "partial", | |
| "module": "decoder.model.5", | |
| "output_name": "out_state_9", | |
| "path": "decoder.model.5/partial", | |
| "shape": [ | |
| 1, | |
| 128, | |
| 5 | |
| ] | |
| }, | |
| { | |
| "dtype": "bool", | |
| "fill": "ones", | |
| "index": 10, | |
| "input_name": "state_10", | |
| "key": "first", | |
| "module": "decoder.model.6.block.1", | |
| "output_name": "out_state_10", | |
| "path": "decoder.model.6.block.1/first", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "zeros", | |
| "index": 11, | |
| "input_name": "state_11", | |
| "key": "previous", | |
| "module": "decoder.model.6.block.1", | |
| "output_name": "out_state_11", | |
| "path": "decoder.model.6.block.1/previous", | |
| "shape": [ | |
| 1, | |
| 128, | |
| 2 | |
| ] | |
| }, | |
| { | |
| "dtype": "bool", | |
| "fill": "ones", | |
| "index": 12, | |
| "input_name": "state_12", | |
| "key": "first", | |
| "module": "decoder.model.6.block.3", | |
| "output_name": "out_state_12", | |
| "path": "decoder.model.6.block.3/first", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "empty", | |
| "index": 13, | |
| "input_name": "state_13", | |
| "key": "previous", | |
| "module": "decoder.model.6.block.3", | |
| "output_name": "out_state_13", | |
| "path": "decoder.model.6.block.3/previous", | |
| "shape": [ | |
| 1, | |
| 64, | |
| 0 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "zeros", | |
| "index": 14, | |
| "input_name": "state_14", | |
| "key": "partial", | |
| "module": "decoder.model.8", | |
| "output_name": "out_state_14", | |
| "path": "decoder.model.8/partial", | |
| "shape": [ | |
| 1, | |
| 64, | |
| 4 | |
| ] | |
| }, | |
| { | |
| "dtype": "bool", | |
| "fill": "ones", | |
| "index": 15, | |
| "input_name": "state_15", | |
| "key": "first", | |
| "module": "decoder.model.9.block.1", | |
| "output_name": "out_state_15", | |
| "path": "decoder.model.9.block.1/first", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "zeros", | |
| "index": 16, | |
| "input_name": "state_16", | |
| "key": "previous", | |
| "module": "decoder.model.9.block.1", | |
| "output_name": "out_state_16", | |
| "path": "decoder.model.9.block.1/previous", | |
| "shape": [ | |
| 1, | |
| 64, | |
| 2 | |
| ] | |
| }, | |
| { | |
| "dtype": "bool", | |
| "fill": "ones", | |
| "index": 17, | |
| "input_name": "state_17", | |
| "key": "first", | |
| "module": "decoder.model.9.block.3", | |
| "output_name": "out_state_17", | |
| "path": "decoder.model.9.block.3/first", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "empty", | |
| "index": 18, | |
| "input_name": "state_18", | |
| "key": "previous", | |
| "module": "decoder.model.9.block.3", | |
| "output_name": "out_state_18", | |
| "path": "decoder.model.9.block.3/previous", | |
| "shape": [ | |
| 1, | |
| 32, | |
| 0 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "zeros", | |
| "index": 19, | |
| "input_name": "state_19", | |
| "key": "cache", | |
| "module": "decoder_transformer.transformer.layers.0.self_attn", | |
| "output_name": "out_state_19", | |
| "path": "decoder_transformer.transformer.layers.0.self_attn/cache", | |
| "shape": [ | |
| 2, | |
| 1, | |
| 8, | |
| 1000, | |
| 64 | |
| ] | |
| }, | |
| { | |
| "dtype": "int64", | |
| "fill": "zeros", | |
| "index": 20, | |
| "input_name": "state_20", | |
| "key": "end_offset", | |
| "module": "decoder_transformer.transformer.layers.0.self_attn", | |
| "output_name": "out_state_20", | |
| "path": "decoder_transformer.transformer.layers.0.self_attn/end_offset", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "int64", | |
| "fill": "zeros", | |
| "index": 21, | |
| "input_name": "state_21", | |
| "key": "offset", | |
| "module": "decoder_transformer.transformer.layers.0.self_attn", | |
| "output_name": "out_state_21", | |
| "path": "decoder_transformer.transformer.layers.0.self_attn/offset", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "zeros", | |
| "index": 22, | |
| "input_name": "state_22", | |
| "key": "cache", | |
| "module": "decoder_transformer.transformer.layers.1.self_attn", | |
| "output_name": "out_state_22", | |
| "path": "decoder_transformer.transformer.layers.1.self_attn/cache", | |
| "shape": [ | |
| 2, | |
| 1, | |
| 8, | |
| 1000, | |
| 64 | |
| ] | |
| }, | |
| { | |
| "dtype": "int64", | |
| "fill": "zeros", | |
| "index": 23, | |
| "input_name": "state_23", | |
| "key": "end_offset", | |
| "module": "decoder_transformer.transformer.layers.1.self_attn", | |
| "output_name": "out_state_23", | |
| "path": "decoder_transformer.transformer.layers.1.self_attn/end_offset", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "int64", | |
| "fill": "zeros", | |
| "index": 24, | |
| "input_name": "state_24", | |
| "key": "offset", | |
| "module": "decoder_transformer.transformer.layers.1.self_attn", | |
| "output_name": "out_state_24", | |
| "path": "decoder_transformer.transformer.layers.1.self_attn/offset", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "bool", | |
| "fill": "ones", | |
| "index": 25, | |
| "input_name": "state_25", | |
| "key": "first", | |
| "module": "downsample.conv", | |
| "output_name": "out_state_25", | |
| "path": "downsample.conv/first", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "zeros", | |
| "index": 26, | |
| "input_name": "state_26", | |
| "key": "previous", | |
| "module": "downsample.conv", | |
| "output_name": "out_state_26", | |
| "path": "downsample.conv/previous", | |
| "shape": [ | |
| 1, | |
| 512, | |
| 16 | |
| ] | |
| }, | |
| { | |
| "dtype": "bool", | |
| "fill": "ones", | |
| "index": 27, | |
| "input_name": "state_27", | |
| "key": "first", | |
| "module": "encoder.model.0", | |
| "output_name": "out_state_27", | |
| "path": "encoder.model.0/first", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "zeros", | |
| "index": 28, | |
| "input_name": "state_28", | |
| "key": "previous", | |
| "module": "encoder.model.0", | |
| "output_name": "out_state_28", | |
| "path": "encoder.model.0/previous", | |
| "shape": [ | |
| 1, | |
| 1, | |
| 6 | |
| ] | |
| }, | |
| { | |
| "dtype": "bool", | |
| "fill": "ones", | |
| "index": 29, | |
| "input_name": "state_29", | |
| "key": "first", | |
| "module": "encoder.model.1.block.1", | |
| "output_name": "out_state_29", | |
| "path": "encoder.model.1.block.1/first", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "zeros", | |
| "index": 30, | |
| "input_name": "state_30", | |
| "key": "previous", | |
| "module": "encoder.model.1.block.1", | |
| "output_name": "out_state_30", | |
| "path": "encoder.model.1.block.1/previous", | |
| "shape": [ | |
| 1, | |
| 64, | |
| 2 | |
| ] | |
| }, | |
| { | |
| "dtype": "bool", | |
| "fill": "ones", | |
| "index": 31, | |
| "input_name": "state_31", | |
| "key": "first", | |
| "module": "encoder.model.1.block.3", | |
| "output_name": "out_state_31", | |
| "path": "encoder.model.1.block.3/first", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "empty", | |
| "index": 32, | |
| "input_name": "state_32", | |
| "key": "previous", | |
| "module": "encoder.model.1.block.3", | |
| "output_name": "out_state_32", | |
| "path": "encoder.model.1.block.3/previous", | |
| "shape": [ | |
| 1, | |
| 32, | |
| 0 | |
| ] | |
| }, | |
| { | |
| "dtype": "bool", | |
| "fill": "ones", | |
| "index": 33, | |
| "input_name": "state_33", | |
| "key": "first", | |
| "module": "encoder.model.11", | |
| "output_name": "out_state_33", | |
| "path": "encoder.model.11/first", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "zeros", | |
| "index": 34, | |
| "input_name": "state_34", | |
| "key": "previous", | |
| "module": "encoder.model.11", | |
| "output_name": "out_state_34", | |
| "path": "encoder.model.11/previous", | |
| "shape": [ | |
| 1, | |
| 512, | |
| 2 | |
| ] | |
| }, | |
| { | |
| "dtype": "bool", | |
| "fill": "ones", | |
| "index": 35, | |
| "input_name": "state_35", | |
| "key": "first", | |
| "module": "encoder.model.3", | |
| "output_name": "out_state_35", | |
| "path": "encoder.model.3/first", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "zeros", | |
| "index": 36, | |
| "input_name": "state_36", | |
| "key": "previous", | |
| "module": "encoder.model.3", | |
| "output_name": "out_state_36", | |
| "path": "encoder.model.3/previous", | |
| "shape": [ | |
| 1, | |
| 64, | |
| 4 | |
| ] | |
| }, | |
| { | |
| "dtype": "bool", | |
| "fill": "ones", | |
| "index": 37, | |
| "input_name": "state_37", | |
| "key": "first", | |
| "module": "encoder.model.4.block.1", | |
| "output_name": "out_state_37", | |
| "path": "encoder.model.4.block.1/first", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "zeros", | |
| "index": 38, | |
| "input_name": "state_38", | |
| "key": "previous", | |
| "module": "encoder.model.4.block.1", | |
| "output_name": "out_state_38", | |
| "path": "encoder.model.4.block.1/previous", | |
| "shape": [ | |
| 1, | |
| 128, | |
| 2 | |
| ] | |
| }, | |
| { | |
| "dtype": "bool", | |
| "fill": "ones", | |
| "index": 39, | |
| "input_name": "state_39", | |
| "key": "first", | |
| "module": "encoder.model.4.block.3", | |
| "output_name": "out_state_39", | |
| "path": "encoder.model.4.block.3/first", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "empty", | |
| "index": 40, | |
| "input_name": "state_40", | |
| "key": "previous", | |
| "module": "encoder.model.4.block.3", | |
| "output_name": "out_state_40", | |
| "path": "encoder.model.4.block.3/previous", | |
| "shape": [ | |
| 1, | |
| 64, | |
| 0 | |
| ] | |
| }, | |
| { | |
| "dtype": "bool", | |
| "fill": "ones", | |
| "index": 41, | |
| "input_name": "state_41", | |
| "key": "first", | |
| "module": "encoder.model.6", | |
| "output_name": "out_state_41", | |
| "path": "encoder.model.6/first", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "zeros", | |
| "index": 42, | |
| "input_name": "state_42", | |
| "key": "previous", | |
| "module": "encoder.model.6", | |
| "output_name": "out_state_42", | |
| "path": "encoder.model.6/previous", | |
| "shape": [ | |
| 1, | |
| 128, | |
| 5 | |
| ] | |
| }, | |
| { | |
| "dtype": "bool", | |
| "fill": "ones", | |
| "index": 43, | |
| "input_name": "state_43", | |
| "key": "first", | |
| "module": "encoder.model.7.block.1", | |
| "output_name": "out_state_43", | |
| "path": "encoder.model.7.block.1/first", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "zeros", | |
| "index": 44, | |
| "input_name": "state_44", | |
| "key": "previous", | |
| "module": "encoder.model.7.block.1", | |
| "output_name": "out_state_44", | |
| "path": "encoder.model.7.block.1/previous", | |
| "shape": [ | |
| 1, | |
| 256, | |
| 2 | |
| ] | |
| }, | |
| { | |
| "dtype": "bool", | |
| "fill": "ones", | |
| "index": 45, | |
| "input_name": "state_45", | |
| "key": "first", | |
| "module": "encoder.model.7.block.3", | |
| "output_name": "out_state_45", | |
| "path": "encoder.model.7.block.3/first", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "empty", | |
| "index": 46, | |
| "input_name": "state_46", | |
| "key": "previous", | |
| "module": "encoder.model.7.block.3", | |
| "output_name": "out_state_46", | |
| "path": "encoder.model.7.block.3/previous", | |
| "shape": [ | |
| 1, | |
| 128, | |
| 0 | |
| ] | |
| }, | |
| { | |
| "dtype": "bool", | |
| "fill": "ones", | |
| "index": 47, | |
| "input_name": "state_47", | |
| "key": "first", | |
| "module": "encoder.model.9", | |
| "output_name": "out_state_47", | |
| "path": "encoder.model.9/first", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "zeros", | |
| "index": 48, | |
| "input_name": "state_48", | |
| "key": "previous", | |
| "module": "encoder.model.9", | |
| "output_name": "out_state_48", | |
| "path": "encoder.model.9/previous", | |
| "shape": [ | |
| 1, | |
| 256, | |
| 6 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "zeros", | |
| "index": 49, | |
| "input_name": "state_49", | |
| "key": "cache", | |
| "module": "encoder_transformer.transformer.layers.0.self_attn", | |
| "output_name": "out_state_49", | |
| "path": "encoder_transformer.transformer.layers.0.self_attn/cache", | |
| "shape": [ | |
| 2, | |
| 1, | |
| 8, | |
| 1000, | |
| 64 | |
| ] | |
| }, | |
| { | |
| "dtype": "int64", | |
| "fill": "zeros", | |
| "index": 50, | |
| "input_name": "state_50", | |
| "key": "end_offset", | |
| "module": "encoder_transformer.transformer.layers.0.self_attn", | |
| "output_name": "out_state_50", | |
| "path": "encoder_transformer.transformer.layers.0.self_attn/end_offset", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "int64", | |
| "fill": "zeros", | |
| "index": 51, | |
| "input_name": "state_51", | |
| "key": "offset", | |
| "module": "encoder_transformer.transformer.layers.0.self_attn", | |
| "output_name": "out_state_51", | |
| "path": "encoder_transformer.transformer.layers.0.self_attn/offset", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "zeros", | |
| "index": 52, | |
| "input_name": "state_52", | |
| "key": "cache", | |
| "module": "encoder_transformer.transformer.layers.1.self_attn", | |
| "output_name": "out_state_52", | |
| "path": "encoder_transformer.transformer.layers.1.self_attn/cache", | |
| "shape": [ | |
| 2, | |
| 1, | |
| 8, | |
| 1000, | |
| 64 | |
| ] | |
| }, | |
| { | |
| "dtype": "int64", | |
| "fill": "zeros", | |
| "index": 53, | |
| "input_name": "state_53", | |
| "key": "end_offset", | |
| "module": "encoder_transformer.transformer.layers.1.self_attn", | |
| "output_name": "out_state_53", | |
| "path": "encoder_transformer.transformer.layers.1.self_attn/end_offset", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "int64", | |
| "fill": "zeros", | |
| "index": 54, | |
| "input_name": "state_54", | |
| "key": "offset", | |
| "module": "encoder_transformer.transformer.layers.1.self_attn", | |
| "output_name": "out_state_54", | |
| "path": "encoder_transformer.transformer.layers.1.self_attn/offset", | |
| "shape": [ | |
| 1 | |
| ] | |
| }, | |
| { | |
| "dtype": "float32", | |
| "fill": "zeros", | |
| "index": 55, | |
| "input_name": "state_55", | |
| "key": "partial", | |
| "module": "upsample.convtr", | |
| "output_name": "out_state_55", | |
| "path": "upsample.convtr/partial", | |
| "shape": [ | |
| 1, | |
| 512, | |
| 16 | |
| ] | |
| } | |
| ], | |
| "model_recommended_frames_after_eos": null, | |
| "pad_with_spaces_for_short_inputs": false, | |
| "predefined_voices": [ | |
| "alba", | |
| "azelma", | |
| "cosette", | |
| "eponine", | |
| "fantine", | |
| "javert", | |
| "jean", | |
| "marius" | |
| ], | |
| "remove_semicolons": false, | |
| "sample_rate": 24000, | |
| "samples_per_frame": 1920, | |
| "schema_version": 2, | |
| "tokenizer_file": "tokenizer.model" | |
| } | |