{
  "encoder": {
    "semantic_encoder": {
      "pretrained_name": "facebook/w2v-bert-2.0"
    },
    "acoustic_encoder": {
      "dims": [
        48,
        96,
        192,
        384,
        768,
        1536
      ],
      "ratios": [
        2,
        2,
        4,
        4,
        5
      ],
      "dilations": [
        1,
        3,
        9
      ],
      "output_dim": 1024
    },
    "out_dim": 2048
  },
  "quantizer": {
    "dim": 2048,
    "levels": [
      4,
      4,
      4,
      4,
      4,
      4,
      4,
      4
    ]
  },
  "decoder": {
    "in_dim": 2048,
    "hop_len": 320,
    "emb_dim": 1024,
    "num_heads": 16,
    "depth": 12
  }
}