Audio-to-Audio
Safetensors
torch
dycast / config.json
lucadellalib's picture
Upload 3 files
d8a2953 verified
{
"encoder_name": "WavLM",
"encoder_config": {
"hidden_dims": [
512,
512,
512,
512,
512,
512,
512
],
"kernel_sizes": [
10,
3,
3,
3,
3,
2,
2
],
"strides": [
5,
2,
2,
2,
2,
2,
2
],
"num_layers": 6,
"dim": 1024,
"ffn_dim": 4096,
"num_heads": 16,
"num_buckets": 320,
"max_distance": 800,
"max_cached_steps": 2048,
"dropout": 0.0,
"conv_pos": 128,
"conv_pos_groups": 16,
"causal": false,
"window_size": 512,
"lookahead_size": 3,
"use_flex_attention": false
},
"compressor_name": "FocalEncoder",
"compressor_config": {
"input_dim": 1024,
"output_dim": 32,
"hidden_dims": [
1024,
1024,
1024
],
"downscale_factors": [
1,
1,
1
],
"focal_window": 14,
"focal_level": 2,
"focal_factor": 4,
"dropout": 0.0,
"use_post_norm": false,
"use_layerscale": false,
"layerscale_init": 0.0001,
"tanhscale_init": 0.5,
"normalize_modulator": false,
"causal": false,
"window_size": 512
},
"boundary_predictor_name": "HazardModel",
"boundary_predictor_config": {
"input_dim": 1024,
"hidden_dims": [
1024,
1024,
1024
],
"downscale_factors": [
1,
1,
1
],
"focal_window": 14,
"focal_level": 2,
"focal_factor": 4,
"dropout": 0.0,
"use_post_norm": false,
"use_layerscale": false,
"layerscale_init": 0.0001,
"tanhscale_init": 0.5,
"normalize_modulator": false,
"causal": false,
"window_size": 512
},
"downsampler_name": "SelectLastPool",
"downsampler_config": {},
"quantizer_name": "ScalarSphericalQuantizer",
"quantizer_config": {
"dim": 32,
"n_levels": 4
},
"duration_predictor_name": "NegBinModel",
"duration_predictor_config": {
"input_dim": 32,
"hidden_dims": [
1024,
1024,
1024
],
"downscale_factors": [
1,
1,
1
],
"focal_window": 14,
"focal_level": 2,
"focal_factor": 4,
"dropout": 0.0,
"use_post_norm": false,
"use_layerscale": false,
"layerscale_init": 0.0001,
"tanhscale_init": 0.5,
"normalize_modulator": false,
"causal": false,
"window_size": 512,
"min_duration": 1,
"eps": 0.0001
},
"upsampler_name": "RepeatInterleaveUnpool",
"upsampler_config": {},
"decompressor_name": "FocalDecoder",
"decompressor_config": {
"input_dim": 32,
"output_dim": 1024,
"hidden_dims": [
1024,
1024,
1024
],
"upscale_factors": [
1,
1,
1
],
"focal_window": 14,
"focal_level": 2,
"focal_factor": 4,
"dropout": 0.0,
"use_post_norm": false,
"use_layerscale": false,
"layerscale_init": 0.0001,
"tanhscale_init": 0.5,
"normalize_modulator": false,
"causal": false,
"window_size": 512,
"last_window_size": 512,
"lookahead_size": 3
},
"decoder_name": "Vocos",
"decoder_config": {
"input_dim": 1024,
"num_layers": 8,
"dim": 512,
"ffn_dim": 1536,
"kernel_size": 7,
"layerscale_init": 0.125,
"n_fft": 1024,
"hop_length": 320,
"causal": false
},
"char_aligner_name": "MMS",
"char_aligner_config": {
"checkpoint": "facebook/mms-1b-all"
},
"retriever_name": "LatentIVF",
"retriever_config": {
"input_dim": 1024,
"latent_dim": 32,
"hidden_dims": [
1024,
1024,
1024
],
"downscale_factors": [
1,
1,
1
],
"focal_window": 14,
"focal_level": 2,
"focal_factor": 4,
"dropout": 0.0,
"use_post_norm": false,
"use_layerscale": false,
"layerscale_init": 0.0001,
"tanhscale_init": 0.5,
"normalize_modulator": false,
"causal": false,
"window_size": 512,
"nlist": 4096,
"nprobe": 16
}
}