{
"model_type": "dolphin",
"task": "audio_visual_speech_separation",
"framework": "pytorch",
"license": "apache-2.0",
"tags": [
"audio",
"speech-separation",
"audio-visual",
"pytorch",
"dolphin"
],
"architectures": [
"Dolphin"
],
"auto_map": {
"AutoModel": "dolphin.Dolphin"
},
"num_stages": 4,
"sample_rate": 16000,
"vpre_channels": 3872,
"vmid_channels": 512,
"vin_channels": 64,
"vout_channels": 64,
"module_audio_enc": {
"in_channels": 1,
"out_channels": 256,
"kernel_size": 16,
"stride": 4,
"groups": 1,
"bias": false
},
"module_feature_projector": {
"num_channels": 256,
"in_channels": 256,
"out_channels": 128,
"kernel_size": 1,
"bias": false
},
"module_separator": {
"num_stages": 4,
"relative_positional_encoding": {
"in_channels": 128,
"num_heads": 8,
"maxlen": 2000,
"embed_v": false
},
"enc_stage": {
"global_blocks": {
"in_channels": 128,
"num_mha_heads": 8,
"dropout_rate": 0.05
},
"local_blocks": {
"in_channels": 128,
"kernel_size": 65,
"dropout_rate": 0.05
},
"down_conv_layer": {
"in_channels": 128,
"samp_kernel_size": 5
}
},
"simple_fusion": {
"out_channels": 128
},
"dec_stage": {
"global_blocks": {
"in_channels": 128,
"num_mha_heads": 8,
"dropout_rate": 0.05
},
"local_blocks": {
"in_channels": 128,
"kernel_size": 65,
"dropout_rate": 0.05
},
"spk_attention": {
"in_channels": 128,
"num_mha_heads": 8,
"dropout_rate": 0.05
}
}
},
"module_output_layer": {
"in_channels": 256,
"out_channels": 128
},
"module_audio_dec": {
"in_channels": 256,
"out_channels": 1,
"kernel_size": 16,
"stride": 4,
"bias": false
},
"video_encoder_params": {
"layers": [
"residual",
"compress_space",
"consecutive_residual",
"compress_space",
"consecutive_residual",
"linear_attend_space",
"compress_space",
"consecutive_residual",
"attend_space"
],
"image_size": 88,
"in_channel": 1,
"init_channel": 4,
"max_dim": 32,
"input_conv_kernel_size": [
7,
7,
7
],
"output_conv_kernel_size": [
3,
3,
3
],
"residual_conv_kernel_size": 3,
"pad_mode": "constant",
"attn_dim_head": 32,
"attn_heads": 8,
"attn_dropout": 0.0,
"flash_attn": true,
"linear_attn_dim_head": 8,
"linear_attn_heads": 16,
"num_quantizers": 1,
"codebook_size": 256,
"codebook_dim": 64,
"commitment_cost": 1.0,
"distill_cost": 1.0
}
}