File size: 1,593 Bytes
0e21d5a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
{
"codec": {
"ssl_adaptor": {
"in_dim": 1280,
"embed_dim": 768,
"out_dim": 768,
"num_layers": 4,
"num_heads": 12,
"ffn_dim": 3072,
"attn_dropout": 0,
"dropout": 0
},
"acoustic_encoder": {
"num_mels": 128,
"sampling_rate": 16000,
"hop_length": 160,
"n_fft": 400,
"fmin": 0,
"fmax": 8000,
"embed_dim": 768,
"num_layers": 12,
"num_heads": 12,
"ffn_dim": 3072,
"attn_dropout": 0,
"dropout": 0,
"max_positions": 1500
},
"downsample": {
"embed_dim": 1536,
"avg_pooler": 4
},
"rvq": {
"input_dim": 1536,
"rvq_dim": 768,
"output_dim": 768,
"num_quantizers": 16,
"codebook_size": 2048,
"codebook_dim": 512
},
"upsample": {
"embed_dim": 768,
"stride": 4
},
"semantic_decoder": {
"in_dim": 768,
"embed_dim": 768,
"out_dim": 1280,
"num_layers": 4,
"num_heads": 12,
"ffn_dim": 3072,
"attn_dropout": 0,
"dropout": 0
},
"acoustic_decoder": {
"embed_dim": 768,
"num_layers": 12,
"num_heads": 12,
"dropout": 0,
"hop_length": 240,
"causal": true
}
}
}