| { | |
| "encoder_name": "WavLM", | |
| "encoder_config": { | |
| "hidden_dims": [ | |
| 512, | |
| 512, | |
| 512, | |
| 512, | |
| 512, | |
| 512, | |
| 512 | |
| ], | |
| "kernel_sizes": [ | |
| 10, | |
| 3, | |
| 3, | |
| 3, | |
| 3, | |
| 2, | |
| 2 | |
| ], | |
| "strides": [ | |
| 5, | |
| 2, | |
| 2, | |
| 2, | |
| 2, | |
| 2, | |
| 2 | |
| ], | |
| "num_layers": 6, | |
| "dim": 1024, | |
| "ffn_dim": 4096, | |
| "num_heads": 16, | |
| "num_buckets": 320, | |
| "max_distance": 800, | |
| "max_cached_steps": 2048, | |
| "dropout": 0.0, | |
| "conv_pos": 128, | |
| "conv_pos_groups": 16, | |
| "causal": false, | |
| "window_size": 512, | |
| "lookahead_size": 3, | |
| "use_flex_attention": false | |
| }, | |
| "compressor_name": "FocalEncoder", | |
| "compressor_config": { | |
| "input_dim": 1024, | |
| "output_dim": 32, | |
| "hidden_dims": [ | |
| 1024, | |
| 1024, | |
| 1024 | |
| ], | |
| "downscale_factors": [ | |
| 1, | |
| 1, | |
| 1 | |
| ], | |
| "focal_window": 14, | |
| "focal_level": 2, | |
| "focal_factor": 4, | |
| "dropout": 0.0, | |
| "use_post_norm": false, | |
| "use_layerscale": false, | |
| "layerscale_init": 0.0001, | |
| "tanhscale_init": 0.5, | |
| "normalize_modulator": false, | |
| "causal": false, | |
| "window_size": 512 | |
| }, | |
| "boundary_predictor_name": "HazardModel", | |
| "boundary_predictor_config": { | |
| "input_dim": 1024, | |
| "hidden_dims": [ | |
| 1024, | |
| 1024, | |
| 1024 | |
| ], | |
| "downscale_factors": [ | |
| 1, | |
| 1, | |
| 1 | |
| ], | |
| "focal_window": 14, | |
| "focal_level": 2, | |
| "focal_factor": 4, | |
| "dropout": 0.0, | |
| "use_post_norm": false, | |
| "use_layerscale": false, | |
| "layerscale_init": 0.0001, | |
| "tanhscale_init": 0.5, | |
| "normalize_modulator": false, | |
| "causal": false, | |
| "window_size": 512 | |
| }, | |
| "downsampler_name": "SelectLastPool", | |
| "downsampler_config": {}, | |
| "quantizer_name": "ScalarSphericalQuantizer", | |
| "quantizer_config": { | |
| "dim": 32, | |
| "n_levels": 4 | |
| }, | |
| "duration_predictor_name": "NegBinModel", | |
| "duration_predictor_config": { | |
| "input_dim": 32, | |
| "hidden_dims": [ | |
| 1024, | |
| 1024, | |
| 1024 | |
| ], | |
| "downscale_factors": [ | |
| 1, | |
| 1, | |
| 1 | |
| ], | |
| "focal_window": 14, | |
| "focal_level": 2, | |
| "focal_factor": 4, | |
| "dropout": 0.0, | |
| "use_post_norm": false, | |
| "use_layerscale": false, | |
| "layerscale_init": 0.0001, | |
| "tanhscale_init": 0.5, | |
| "normalize_modulator": false, | |
| "causal": false, | |
| "window_size": 512, | |
| "min_duration": 1, | |
| "eps": 0.0001 | |
| }, | |
| "upsampler_name": "RepeatInterleaveUnpool", | |
| "upsampler_config": {}, | |
| "decompressor_name": "FocalDecoder", | |
| "decompressor_config": { | |
| "input_dim": 32, | |
| "output_dim": 1024, | |
| "hidden_dims": [ | |
| 1024, | |
| 1024, | |
| 1024 | |
| ], | |
| "upscale_factors": [ | |
| 1, | |
| 1, | |
| 1 | |
| ], | |
| "focal_window": 14, | |
| "focal_level": 2, | |
| "focal_factor": 4, | |
| "dropout": 0.0, | |
| "use_post_norm": false, | |
| "use_layerscale": false, | |
| "layerscale_init": 0.0001, | |
| "tanhscale_init": 0.5, | |
| "normalize_modulator": false, | |
| "causal": false, | |
| "window_size": 512, | |
| "last_window_size": 512, | |
| "lookahead_size": 3 | |
| }, | |
| "decoder_name": "Vocos", | |
| "decoder_config": { | |
| "input_dim": 1024, | |
| "num_layers": 8, | |
| "dim": 512, | |
| "ffn_dim": 1536, | |
| "kernel_size": 7, | |
| "layerscale_init": 0.125, | |
| "n_fft": 1024, | |
| "hop_length": 320, | |
| "causal": false | |
| }, | |
| "char_aligner_name": "MMS", | |
| "char_aligner_config": { | |
| "checkpoint": "facebook/mms-1b-all" | |
| }, | |
| "retriever_name": "LatentIVF", | |
| "retriever_config": { | |
| "input_dim": 1024, | |
| "latent_dim": 32, | |
| "hidden_dims": [ | |
| 1024, | |
| 1024, | |
| 1024 | |
| ], | |
| "downscale_factors": [ | |
| 1, | |
| 1, | |
| 1 | |
| ], | |
| "focal_window": 14, | |
| "focal_level": 2, | |
| "focal_factor": 4, | |
| "dropout": 0.0, | |
| "use_post_norm": false, | |
| "use_layerscale": false, | |
| "layerscale_init": 0.0001, | |
| "tanhscale_init": 0.5, | |
| "normalize_modulator": false, | |
| "causal": false, | |
| "window_size": 512, | |
| "nlist": 4096, | |
| "nprobe": 16 | |
| } | |
| } |