{ "encoder": { "semantic_encoder": { "pretrained_name": "facebook/w2v-bert-2.0" }, "acoustic_encoder": { "dims": [ 48, 96, 192, 384, 768, 1536 ], "ratios": [ 2, 2, 4, 4, 5 ], "dilations": [ 1, 3, 9 ], "output_dim": 1024 }, "out_dim": 2048 }, "quantizer": { "dim": 2048, "levels": [ 4, 4, 4, 4, 4, 4, 4, 4 ] }, "decoder": { "in_dim": 2048, "hop_len": 320, "emb_dim": 1024, "num_heads": 16, "depth": 12 } }