{ "model_type": "dolphin", "task": "audio_visual_speech_separation", "framework": "pytorch", "license": "apache-2.0", "tags": [ "audio", "speech-separation", "audio-visual", "pytorch", "dolphin" ], "architectures": [ "Dolphin" ], "auto_map": { "AutoModel": "dolphin.Dolphin" }, "num_stages": 4, "sample_rate": 16000, "vpre_channels": 3872, "vmid_channels": 512, "vin_channels": 64, "vout_channels": 64, "module_audio_enc": { "in_channels": 1, "out_channels": 256, "kernel_size": 16, "stride": 4, "groups": 1, "bias": false }, "module_feature_projector": { "num_channels": 256, "in_channels": 256, "out_channels": 128, "kernel_size": 1, "bias": false }, "module_separator": { "num_stages": 4, "relative_positional_encoding": { "in_channels": 128, "num_heads": 8, "maxlen": 2000, "embed_v": false }, "enc_stage": { "global_blocks": { "in_channels": 128, "num_mha_heads": 8, "dropout_rate": 0.05 }, "local_blocks": { "in_channels": 128, "kernel_size": 65, "dropout_rate": 0.05 }, "down_conv_layer": { "in_channels": 128, "samp_kernel_size": 5 } }, "simple_fusion": { "out_channels": 128 }, "dec_stage": { "global_blocks": { "in_channels": 128, "num_mha_heads": 8, "dropout_rate": 0.05 }, "local_blocks": { "in_channels": 128, "kernel_size": 65, "dropout_rate": 0.05 }, "spk_attention": { "in_channels": 128, "num_mha_heads": 8, "dropout_rate": 0.05 } } }, "module_output_layer": { "in_channels": 256, "out_channels": 128 }, "module_audio_dec": { "in_channels": 256, "out_channels": 1, "kernel_size": 16, "stride": 4, "bias": false }, "video_encoder_params": { "layers": [ "residual", "compress_space", "consecutive_residual", "compress_space", "consecutive_residual", "linear_attend_space", "compress_space", "consecutive_residual", "attend_space" ], "image_size": 88, "in_channel": 1, "init_channel": 4, "max_dim": 32, "input_conv_kernel_size": [ 7, 7, 7 ], "output_conv_kernel_size": [ 3, 3, 3 ], "residual_conv_kernel_size": 3, "pad_mode": "constant", "attn_dim_head": 32, "attn_heads": 8, "attn_dropout": 0.0, "flash_attn": true, "linear_attn_dim_head": 8, "linear_attn_heads": 16, "num_quantizers": 1, "codebook_size": 256, "codebook_dim": 64, "commitment_cost": 1.0, "distill_cost": 1.0 } }