{
"model_type": "dolphin",
"task": "audio_visual_speech_separation",
"framework": "pytorch",
"license": "apache-2.0",
"tags": [
"audio",
"speech-separation",
"audio-visual",
"pytorch",
"dolphin"
],
"architectures": [
"Dolphin"
],
"auto_map": {
"AutoModel": "dolphin.Dolphin"
},
"num_stages": 4,
"sample_rate": 16000,
"vpre_channels": 3872,
"vmid_channels": 512,
"vin_channels": 64,
"vout_channels": 64,
"module_audio_enc": {
"in_channels": 1,
"out_channels": 256,
"kernel_size": 16,
"stride": 4,
"groups": 1,
"bias": false
},
"module_feature_projector": {
"num_channels": 256,
"in_channels": 256,
"out_channels": 128,
"kernel_size": 1,
"bias": false
},
"module_separator": {
"num_stages": 4,
"relative_positional_encoding": {
"in_channels": 128,
"num_heads": 8,
"maxlen": 2000,
"embed_v": false
},
"enc_stage": {
"global_blocks": {
"in_channels": 128,
"num_mha_heads": 8,
"dropout_rate": 0.05
},
"local_blocks": {
"in_channels": 128,
"kernel_size": 65,
"dropout_rate": 0.05
},
"down_conv_layer": {
"in_channels": 128,
"samp_kernel_size": 5
}
},
"simple_fusion": {
"out_channels": 128
},
"dec_stage": {
"global_blocks": {
"in_channels": 128,
"num_mha_heads": 8,
"dropout_rate": 0.05
},
"local_blocks": {
"in_channels": 128,
"kernel_size": 65,
"dropout_rate": 0.05
},
"spk_attention": {
"in_channels": 128,
"num_mha_heads": 8,
"dropout_rate": 0.05
}
}
},
"module_output_layer": {
"in_channels": 256,
"out_channels": 128
},
"module_audio_dec": {
"in_channels": 256,
"out_channels": 1,
"kernel_size": 16,
"stride": 4,
"bias": false
},
"video_encoder_params": {
"layers": [
"residual",
"compress_space",
"consecutive_residual",
"compress_space",
"consecutive_residual",
"linear_attend_space",
"compress_space",
"consecutive_residual",
"attend_space"
],
"image_size": 88,
"in_channel": 1,
"init_channel": 4,
"max_dim": 32,
"input_conv_kernel_size": [
7,
7,
7
],
"output_conv_kernel_size": [
3,
3,
3
],
"residual_conv_kernel_size": 3,
"pad_mode": "constant",
"attn_dim_head": 32,
"attn_heads": 8,
"attn_dropout": 0.0,
"flash_attn": true,
"linear_attn_dim_head": 8,
"linear_attn_heads": 16,
"num_quantizers": 1,
"codebook_size": 256,
"codebook_dim": 64,
"commitment_cost": 1.0,
"distill_cost": 1.0
}
}