{
  "model_type": "dolphin",
  "task": "audio_visual_speech_separation",
  "framework": "pytorch",
  "license": "apache-2.0",
  "tags": [
    "audio",
    "speech-separation",
    "audio-visual",
    "pytorch",
    "dolphin"
  ],
  "architectures": [
    "Dolphin"
  ],
  "auto_map": {
    "AutoModel": "dolphin.Dolphin"
  },
  "num_stages": 4,
  "sample_rate": 16000,
  "vpre_channels": 3872,
  "vmid_channels": 512,
  "vin_channels": 64,
  "vout_channels": 64,
  "module_audio_enc": {
    "in_channels": 1,
    "out_channels": 256,
    "kernel_size": 16,
    "stride": 4,
    "groups": 1,
    "bias": false
  },
  "module_feature_projector": {
    "num_channels": 256,
    "in_channels": 256,
    "out_channels": 128,
    "kernel_size": 1,
    "bias": false
  },
  "module_separator": {
    "num_stages": 4,
    "relative_positional_encoding": {
      "in_channels": 128,
      "num_heads": 8,
      "maxlen": 2000,
      "embed_v": false
    },
    "enc_stage": {
      "global_blocks": {
        "in_channels": 128,
        "num_mha_heads": 8,
        "dropout_rate": 0.05
      },
      "local_blocks": {
        "in_channels": 128,
        "kernel_size": 65,
        "dropout_rate": 0.05
      },
      "down_conv_layer": {
        "in_channels": 128,
        "samp_kernel_size": 5
      }
    },
    "simple_fusion": {
      "out_channels": 128
    },
    "dec_stage": {
      "global_blocks": {
        "in_channels": 128,
        "num_mha_heads": 8,
        "dropout_rate": 0.05
      },
      "local_blocks": {
        "in_channels": 128,
        "kernel_size": 65,
        "dropout_rate": 0.05
      },
      "spk_attention": {
        "in_channels": 128,
        "num_mha_heads": 8,
        "dropout_rate": 0.05
      }
    }
  },
  "module_output_layer": {
    "in_channels": 256,
    "out_channels": 128
  },
  "module_audio_dec": {
    "in_channels": 256,
    "out_channels": 1,
    "kernel_size": 16,
    "stride": 4,
    "bias": false
  },
  "video_encoder_params": {
    "layers": [
      "residual",
      "compress_space",
      "consecutive_residual",
      "compress_space",
      "consecutive_residual",
      "linear_attend_space",
      "compress_space",
      "consecutive_residual",
      "attend_space"
    ],
    "image_size": 88,
    "in_channel": 1,
    "init_channel": 4,
    "max_dim": 32,
    "input_conv_kernel_size": [
      7,
      7,
      7
    ],
    "output_conv_kernel_size": [
      3,
      3,
      3
    ],
    "residual_conv_kernel_size": 3,
    "pad_mode": "constant",
    "attn_dim_head": 32,
    "attn_heads": 8,
    "attn_dropout": 0.0,
    "flash_attn": true,
    "linear_attn_dim_head": 8,
    "linear_attn_heads": 16,
    "num_quantizers": 1,
    "codebook_size": 256,
    "codebook_dim": 64,
    "commitment_cost": 1.0,
    "distill_cost": 1.0
  }
}