JusperLee committed on
Commit
44a284d
·
verified ·
1 Parent(s): 972d2eb

Upload Dolphin audio-visual speech separation model

Browse files
Files changed (2) hide show
  1. README.md +6 -54
  2. config.json +122 -124
README.md CHANGED
@@ -1,58 +1,10 @@
1
  ---
2
- license: apache-2.0
3
- datasets:
4
- - alibabasglab/VoxCeleb2-mix
5
- language:
6
- - en
7
  tags:
8
- - speech
9
- pipeline_tag: audio-to-audio
10
  ---
11
- # Dolphin: Audio-Visual Speech Separation Model
12
 
13
- Dolphin is a state-of-the-art audio-visual speech separation model that leverages both audio and visual information to separate target speech from background noise and other speakers.
14
-
15
- ## Model Description
16
-
17
- This model implements the Dolphin architecture for audio-visual speech separation, combining:
18
- - Audio encoder for processing audio signals
19
- - Video encoder for processing visual lip movements
20
- - Multi-modal fusion mechanism
21
- - Transformer-based separator with global and local attention blocks
22
-
23
- ## Usage
24
-
25
- ```python
26
- from huggingface_hub import PyTorchModelHubMixin
27
- import torch
28
-
29
- # Load the model directly from Hugging Face Hub
30
- model = Dolphin.from_pretrained("your-username/dolphin-model")
31
-
32
- # Example usage
33
- audio_input = torch.randn(1, 16000) # 1 second of audio at 16kHz
34
- video_input = torch.randn(1, 1, 25, 88, 88) # 25 frames of 88x88 grayscale video
35
-
36
- # Perform speech separation
37
- separated_audio = model(audio_input, video_input)
38
- ```
39
-
40
- ## Model Architecture
41
-
42
- - **Audio Encoder**: Processes raw audio waveforms
43
- - **Video Encoder**: Processes lip movement sequences
44
- - **Feature Projector**: Projects audio features to appropriate dimensions
45
- - **Separator**: Multi-stage transformer with global and local attention
46
- - **Audio Decoder**: Reconstructs separated audio waveform
47
-
48
- ## Training Data
49
-
50
- The model was trained on audio-visual speech separation datasets with mixed speech scenarios.
51
-
52
- ## Citation
53
-
54
- If you use this model in your research, please cite the original Dolphin paper.
55
-
56
- ## License
57
-
58
- This model is released under the Apache-2.0 License.
 
1
  ---
 
 
 
 
 
2
  tags:
3
+ - model_hub_mixin
4
+ - pytorch_model_hub_mixin
5
  ---
 
6
 
7
+ This model has been pushed to the Hub using the [PyTorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration:
8
+ - Code: [More Information Needed]
9
+ - Paper: [More Information Needed]
10
+ - Docs: [More Information Needed]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config.json CHANGED
@@ -1,138 +1,136 @@
1
  {
2
- "model_type": "dolphin",
3
- "task": "audio_visual_speech_separation",
 
 
 
 
4
  "framework": "pytorch",
5
  "license": "apache-2.0",
6
- "tags": [
7
- "audio",
8
- "speech-separation",
9
- "audio-visual",
10
- "pytorch",
11
- "dolphin"
12
- ],
13
- "model_config": {
14
- "num_stages": 4,
15
- "sample_rate": 16000,
16
- "vpre_channels": 3872,
17
- "vmid_channels": 512,
18
- "vin_channels": 64,
19
- "vout_channels": 64,
20
- "module_audio_enc": {
21
- "in_channels": 1,
22
- "out_channels": 256,
23
- "kernel_size": 16,
24
- "stride": 4,
25
- "groups": 1,
26
- "bias": false
27
- },
28
- "module_feature_projector": {
29
- "num_channels": 256,
30
- "in_channels": 256,
31
- "out_channels": 128,
32
- "kernel_size": 1,
33
- "bias": false
34
- },
35
- "module_separator": {
36
- "num_stages": 4,
37
- "relative_positional_encoding": {
38
  "in_channels": 128,
39
- "num_heads": 8,
40
- "maxlen": 2000,
41
- "embed_v": false
42
- },
43
- "enc_stage": {
44
- "global_blocks": {
45
- "in_channels": 128,
46
- "num_mha_heads": 8,
47
- "dropout_rate": 0.05
48
- },
49
- "local_blocks": {
50
- "in_channels": 128,
51
- "kernel_size": 65,
52
- "dropout_rate": 0.05
53
- },
54
- "down_conv_layer": {
55
- "in_channels": 128,
56
- "samp_kernel_size": 5
57
- }
58
  },
59
- "simple_fusion": {
60
- "out_channels": 128
 
 
61
  },
62
- "dec_stage": {
63
- "global_blocks": {
64
- "in_channels": 128,
65
- "num_mha_heads": 8,
66
- "dropout_rate": 0.05
67
- },
68
- "local_blocks": {
69
- "in_channels": 128,
70
- "kernel_size": 65,
71
- "dropout_rate": 0.05
72
- },
73
- "spk_attention": {
74
- "in_channels": 128,
75
- "num_mha_heads": 8,
76
- "dropout_rate": 0.05
77
- }
78
  }
79
  },
80
- "module_output_layer": {
81
- "in_channels": 256,
82
- "out_channels": 128
 
 
 
 
 
 
 
 
 
 
 
 
83
  },
84
- "module_audio_dec": {
85
- "in_channels": 256,
86
- "out_channels": 1,
87
- "kernel_size": 16,
88
- "stride": 4,
89
- "bias": false
90
  },
91
- "video_encoder_params": {
92
- "layers": [
93
- "residual",
94
- "compress_space",
95
- "consecutive_residual",
96
- "compress_space",
97
- "consecutive_residual",
98
- "linear_attend_space",
99
- "compress_space",
100
- "consecutive_residual",
101
- "attend_space"
102
- ],
103
- "image_size": 88,
104
- "in_channel": 1,
105
- "init_channel": 4,
106
- "max_dim": 32,
107
- "input_conv_kernel_size": [
108
- 7,
109
- 7,
110
- 7
111
- ],
112
- "output_conv_kernel_size": [
113
- 3,
114
- 3,
115
- 3
116
- ],
117
- "residual_conv_kernel_size": 3,
118
- "pad_mode": "constant",
119
- "attn_dim_head": 32,
120
- "attn_heads": 8,
121
- "attn_dropout": 0.0,
122
- "flash_attn": true,
123
- "linear_attn_dim_head": 8,
124
- "linear_attn_heads": 16,
125
- "num_quantizers": 1,
126
- "codebook_size": 256,
127
- "codebook_dim": 64,
128
- "commitment_cost": 1.0,
129
- "distill_cost": 1.0
130
  }
131
  },
132
- "architectures": [
133
- "Dolphin"
 
 
 
 
 
 
134
  ],
135
- "auto_map": {
136
- "AutoModel": "dolphin.Dolphin"
137
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  }
 
1
  {
2
+ "architectures": [
3
+ "Dolphin"
4
+ ],
5
+ "auto_map": {
6
+ "AutoModel": "dolphin.Dolphin"
7
+ },
8
  "framework": "pytorch",
9
  "license": "apache-2.0",
10
+ "model_type": "dolphin",
11
+ "module_audio_dec": {
12
+ "bias": false,
13
+ "in_channels": 256,
14
+ "kernel_size": 16,
15
+ "out_channels": 1,
16
+ "stride": 4
17
+ },
18
+ "module_audio_enc": {
19
+ "bias": false,
20
+ "groups": 1,
21
+ "in_channels": 1,
22
+ "kernel_size": 16,
23
+ "out_channels": 256,
24
+ "stride": 4
25
+ },
26
+ "module_feature_projector": {
27
+ "bias": false,
28
+ "in_channels": 256,
29
+ "kernel_size": 1,
30
+ "num_channels": 256,
31
+ "out_channels": 128
32
+ },
33
+ "module_output_layer": {
34
+ "in_channels": 256,
35
+ "out_channels": 128
36
+ },
37
+ "module_separator": {
38
+ "dec_stage": {
39
+ "global_blocks": {
40
+ "dropout_rate": 0.05,
 
41
  "in_channels": 128,
42
+ "num_mha_heads": 8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  },
44
+ "local_blocks": {
45
+ "dropout_rate": 0.05,
46
+ "in_channels": 128,
47
+ "kernel_size": 65
48
  },
49
+ "spk_attention": {
50
+ "dropout_rate": 0.05,
51
+ "in_channels": 128,
52
+ "num_mha_heads": 8
 
 
 
 
 
 
 
 
 
 
 
 
53
  }
54
  },
55
+ "enc_stage": {
56
+ "down_conv_layer": {
57
+ "in_channels": 128,
58
+ "samp_kernel_size": 5
59
+ },
60
+ "global_blocks": {
61
+ "dropout_rate": 0.05,
62
+ "in_channels": 128,
63
+ "num_mha_heads": 8
64
+ },
65
+ "local_blocks": {
66
+ "dropout_rate": 0.05,
67
+ "in_channels": 128,
68
+ "kernel_size": 65
69
+ }
70
  },
71
+ "num_stages": 4,
72
+ "relative_positional_encoding": {
73
+ "embed_v": false,
74
+ "in_channels": 128,
75
+ "maxlen": 2000,
76
+ "num_heads": 8
77
  },
78
+ "simple_fusion": {
79
+ "out_channels": 128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  }
81
  },
82
+ "num_stages": 4,
83
+ "sample_rate": 16000,
84
+ "tags": [
85
+ "audio",
86
+ "speech-separation",
87
+ "audio-visual",
88
+ "pytorch",
89
+ "dolphin"
90
  ],
91
+ "task": "audio_visual_speech_separation",
92
+ "video_encoder_params": {
93
+ "attn_dim_head": 32,
94
+ "attn_dropout": 0.0,
95
+ "attn_heads": 8,
96
+ "codebook_dim": 64,
97
+ "codebook_size": 256,
98
+ "commitment_cost": 1.0,
99
+ "distill_cost": 1.0,
100
+ "flash_attn": true,
101
+ "image_size": 88,
102
+ "in_channel": 1,
103
+ "init_channel": 4,
104
+ "input_conv_kernel_size": [
105
+ 7,
106
+ 7,
107
+ 7
108
+ ],
109
+ "layers": [
110
+ "residual",
111
+ "compress_space",
112
+ "consecutive_residual",
113
+ "compress_space",
114
+ "consecutive_residual",
115
+ "linear_attend_space",
116
+ "compress_space",
117
+ "consecutive_residual",
118
+ "attend_space"
119
+ ],
120
+ "linear_attn_dim_head": 8,
121
+ "linear_attn_heads": 16,
122
+ "max_dim": 32,
123
+ "num_quantizers": 1,
124
+ "output_conv_kernel_size": [
125
+ 3,
126
+ 3,
127
+ 3
128
+ ],
129
+ "pad_mode": "constant",
130
+ "residual_conv_kernel_size": 3
131
+ },
132
+ "vin_channels": 64,
133
+ "vmid_channels": 512,
134
+ "vout_channels": 64,
135
+ "vpre_channels": 3872
136
  }