{ "encoder_name": "WavLM", "encoder_config": { "hidden_dims": [ 512, 512, 512, 512, 512, 512, 512 ], "kernel_sizes": [ 10, 3, 3, 3, 3, 2, 2 ], "strides": [ 5, 2, 2, 2, 2, 2, 2 ], "num_layers": 6, "dim": 1024, "ffn_dim": 4096, "num_heads": 16, "num_buckets": 320, "max_distance": 800, "max_cached_steps": 2048, "dropout": 0.0, "conv_pos": 128, "conv_pos_groups": 16, "causal": false, "window_size": 512, "lookahead_size": 3, "use_flex_attention": false }, "compressor_name": "FocalEncoder", "compressor_config": { "input_dim": 1024, "output_dim": 32, "hidden_dims": [ 1024, 1024, 1024 ], "downscale_factors": [ 1, 1, 1 ], "focal_window": 14, "focal_level": 2, "focal_factor": 4, "dropout": 0.0, "use_post_norm": false, "use_layerscale": false, "layerscale_init": 0.0001, "tanhscale_init": 0.5, "normalize_modulator": false, "causal": false, "window_size": 512 }, "boundary_predictor_name": "HazardModel", "boundary_predictor_config": { "input_dim": 1024, "hidden_dims": [ 1024, 1024, 1024 ], "downscale_factors": [ 1, 1, 1 ], "focal_window": 14, "focal_level": 2, "focal_factor": 4, "dropout": 0.0, "use_post_norm": false, "use_layerscale": false, "layerscale_init": 0.0001, "tanhscale_init": 0.5, "normalize_modulator": false, "causal": false, "window_size": 512 }, "downsampler_name": "SelectLastPool", "downsampler_config": {}, "quantizer_name": "ScalarSphericalQuantizer", "quantizer_config": { "dim": 32, "n_levels": 4 }, "duration_predictor_name": "NegBinModel", "duration_predictor_config": { "input_dim": 32, "hidden_dims": [ 1024, 1024, 1024 ], "downscale_factors": [ 1, 1, 1 ], "focal_window": 14, "focal_level": 2, "focal_factor": 4, "dropout": 0.0, "use_post_norm": false, "use_layerscale": false, "layerscale_init": 0.0001, "tanhscale_init": 0.5, "normalize_modulator": false, "causal": false, "window_size": 512, "min_duration": 1, "eps": 0.0001 }, "upsampler_name": "RepeatInterleaveUnpool", "upsampler_config": {}, "decompressor_name": "FocalDecoder", "decompressor_config": { "input_dim": 32, "output_dim": 1024, "hidden_dims": [ 1024, 1024, 1024 ], "upscale_factors": [ 1, 1, 1 ], "focal_window": 14, "focal_level": 2, "focal_factor": 4, "dropout": 0.0, "use_post_norm": false, "use_layerscale": false, "layerscale_init": 0.0001, "tanhscale_init": 0.5, "normalize_modulator": false, "causal": false, "window_size": 512, "last_window_size": 512, "lookahead_size": 3 }, "decoder_name": "Vocos", "decoder_config": { "input_dim": 1024, "num_layers": 8, "dim": 512, "ffn_dim": 1536, "kernel_size": 7, "layerscale_init": 0.125, "n_fft": 1024, "hop_length": 320, "causal": false }, "char_aligner_name": "MMS", "char_aligner_config": { "checkpoint": "facebook/mms-1b-all" }, "retriever_name": "LatentIVF", "retriever_config": { "input_dim": 1024, "latent_dim": 32, "hidden_dims": [ 1024, 1024, 1024 ], "downscale_factors": [ 1, 1, 1 ], "focal_window": 14, "focal_level": 2, "focal_factor": 4, "dropout": 0.0, "use_post_norm": false, "use_layerscale": false, "layerscale_init": 0.0001, "tanhscale_init": 0.5, "normalize_modulator": false, "causal": false, "window_size": 512, "nlist": 4096, "nprobe": 16 } }