niobures committed on
Commit 4305b2e · verified · 1 Parent(s): 0943c37

Dolphin (code, models, paper)

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ Dolphin.[[:space:]]Efficient[[:space:]]Audio-Visual[[:space:]]Speech[[:space:]]Separation[[:space:]]with[[:space:]]Discrete[[:space:]]Lip[[:space:]]Semantics[[:space:]]and[[:space:]]Multi-Scale[[:space:]]Global-Local[[:space:]]Attention.pdf filter=lfs diff=lfs merge=lfs -text
Dolphin. Efficient Audio-Visual Speech Separation with Discrete Lip Semantics and Multi-Scale Global-Local Attention.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0aaf438b5a925e11239a33303c3278bfdf58829931000555017617f2924ca219
+ size 6363139
code/Dolphin.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:31e8333103bf2f33023f21e87fca2c90d0a7217da1e42289618531767a5ea465
+ size 832673765
model/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
model/README.md ADDED
@@ -0,0 +1,189 @@
+ ---
+ datasets:
+ - alibabasglab/VoxCeleb2-mix
+ language:
+ - en
+ library_name: pytorch
+ license: apache-2.0
+ pipeline_tag: audio-to-audio
+ tags:
+ - audio-visual
+ - speech-separation
+ - cocktail-party
+ - multimodal
+ - lip-reading
+ - audio-processing
+ ---
+
+ # Dolphin: Efficient Audio-Visual Speech Separation
+
+ <p align="center">
+   <img src="https://github.com/JusperLee/Dolphin/raw/main/assets/icon.png" alt="Dolphin Logo" width="120"/>
+ </p>
+
+ ## Model Overview
+
+ **Dolphin** is an efficient audio-visual speech separation model that extracts target speech from noisy environments by combining acoustic and visual (lip movement) cues. It achieves **state-of-the-art performance** while being **6× faster** and using **50% fewer parameters** than previous methods.
+
+ 🔗 **Links**: [📄 Paper](https://arxiv.org/abs/2509.23610) | [💻 Code](https://github.com/JusperLee/Dolphin) | [🎮 Demo](https://huggingface.co/spaces/JusperLee/Dolphin) | [🌐 Project Page](https://cslikai.cn/Dolphin)
+
+ ## Key Features
+
+ - 🎯 **Balanced Quality & Efficiency**: SOTA separation quality without iterative refinement
+ - 🔬 **DP-LipCoder**: Lightweight video encoder producing discrete, audio-aligned semantic tokens
+ - 🌐 **Global-Local Attention**: Multi-scale attention for long-range context and fine-grained detail
+ - 🚀 **Edge-Friendly**: >50% fewer parameters, >2.4× lower MACs, >6× faster inference
+
+ ## Performance
+
+ **VoxCeleb2 Benchmark:**
+
+ | Metric | Value |
+ |--------|-------|
+ | SI-SNRi | **16.1 dB** |
+ | SDRi | **16.3 dB** |
+ | PESQ | **3.45** |
+ | ESTOI | **0.93** |
+ | Parameters | **51.3M** (vs. 112M for IIANet) |
+ | MACs | **417G** (vs. 1009G for IIANet) |
+ | Inference Speed | **0.015 s per 4 s clip** (vs. 0.100 s for IIANet) |
+
+ ## Quick Start
+
+ ### Installation
+
+ ```bash
+ pip install torch torchvision torchaudio
+ pip install huggingface_hub
+ ```
+
+ ### Inference Example
+
+ ```python
+ import torch
+ import yaml
+ from huggingface_hub import hf_hub_download
+
+ # The Dolphin class lives in the GitHub repo (config.json maps AutoModel to
+ # "dolphin.Dolphin"); clone https://github.com/JusperLee/Dolphin and run from
+ # its root, or add it to PYTHONPATH. The exact module path may differ.
+ from dolphin import Dolphin
+
+ # Download model and config
+ config_path = hf_hub_download(repo_id="JusperLee/Dolphin", filename="conf.yml")
+ model_path = hf_hub_download(repo_id="JusperLee/Dolphin", filename="best_model.pth")
+
+ # Load model
+ with open(config_path) as f:
+     config = yaml.safe_load(f)
+
+ model = Dolphin(**config['model'])
+ model.load_state_dict(torch.load(model_path, map_location='cpu'))
+ model.eval()
+
+ # Prepare inputs
+ # audio: [batch, samples] - 16kHz audio
+ # video: [batch, frames, 1, height, width] - grayscale lip frames
+ audio_mixture = torch.randn(1, 64000)  # 4 seconds at 16kHz
+ video_frames = torch.randn(1, 100, 1, 88, 88)  # 4s at 25fps, 88x88 resolution
+
+ # Separate speech
+ with torch.no_grad():
+     separated_audio = model(audio_mixture, video_frames)
+ ```
+
+ ### Complete Pipeline with Video Input
+
+ For end-to-end video processing with face detection and tracking, see our [inference script](https://github.com/JusperLee/Dolphin/blob/main/inference.py):
+
+ ```bash
+ git clone https://github.com/JusperLee/Dolphin.git
+ cd Dolphin
+ python inference.py \
+     --input video.mp4 \
+     --output ./output \
+     --speakers 2 \
+     --config checkpoints/vox2/conf.yml
+ ```
+
+ ## Model Architecture
+
+ ### Components
+
+ 1. **DP-LipCoder** (Video Encoder)
+    - Dual-path architecture: visual compression + semantic encoding
+    - Vector quantization for discrete lip semantic tokens
+    - Knowledge distillation from AV-HuBERT
+    - Only **8.5M parameters**
+
+ 2. **Audio Encoder**
+    - Convolutional encoder for time-frequency representation
+    - Extracts multi-scale acoustic features
+
+ 3. **Global-Local Attention Separator** (see the sketch after this list)
+    - Single-pass TDANet-based architecture
+    - **Global Attention (GA)**: Coarse-grained self-attention for long-range dependencies
+    - **Local Attention (LA)**: Heat diffusion attention for noise suppression
+    - No iterative refinement needed
+
+ 4. **Audio Decoder**
+    - Reconstructs the separated waveform from enhanced features
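+
+ To make the global/local split concrete, here is a minimal PyTorch sketch of one separator stage. This is an illustration, not the released implementation: the channel width (128), head count (8), and local kernel size (65) come from `config.json`; the pooling factor, the depthwise-conv stand-in for heat diffusion attention, and the layer layout are simplifying assumptions.
+
+ ```python
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class GlobalLocalBlock(nn.Module):
+     """Illustrative GA/LA stage: coarse self-attention + fine local filtering."""
+     def __init__(self, channels=128, heads=8, local_kernel=65, pool=4):
+         super().__init__()
+         self.pool = pool
+         # Global branch: multi-head self-attention over a downsampled sequence
+         self.mha = nn.MultiheadAttention(channels, heads, dropout=0.05, batch_first=True)
+         self.norm_g = nn.LayerNorm(channels)
+         # Local branch: depthwise conv as a stand-in for heat diffusion attention
+         self.local = nn.Conv1d(channels, channels, local_kernel,
+                                padding=local_kernel // 2, groups=channels)
+         self.norm_l = nn.LayerNorm(channels)
+
+     def forward(self, x):  # x: [batch, time, channels]
+         # Global: pool to a coarse rate, attend, then upsample back to full rate
+         coarse = F.avg_pool1d(x.transpose(1, 2), self.pool).transpose(1, 2)
+         g, _ = self.mha(coarse, coarse, coarse)
+         g = F.interpolate(g.transpose(1, 2), size=x.shape[1]).transpose(1, 2)
+         x = self.norm_g(x + g)
+         # Local: fine-grained per-channel filtering at the full rate
+         y = self.local(x.transpose(1, 2)).transpose(1, 2)
+         return self.norm_l(x + y)
+
+ blk = GlobalLocalBlock()
+ print(blk(torch.randn(1, 400, 128)).shape)  # torch.Size([1, 400, 128])
+ ```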
+
+ ### Input/Output Specifications
+
+ **Inputs:**
+ - `audio`: Mixed audio waveform, shape `[batch, samples]`, 16 kHz sampling rate
+ - `video`: Grayscale lip-region frames, shape `[batch, frames, 1, 88, 88]`, 25 fps
+
+ **Output:**
+ - `separated_audio`: Separated target speech, shape `[batch, samples]`, 16 kHz
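+
+ Because the model fuses the two streams along time, audio and video must cover the same duration: at 16 kHz audio and 25 fps video, that is 640 audio samples per video frame. A quick sanity check (a generic helper written for this card, not part of the released code):
+
+ ```python
+ import torch
+
+ def check_alignment(audio, video, sr=16_000, fps=25):
+     """Assert both modalities span the same duration (sr/fps samples per frame)."""
+     assert audio.shape[-1] * fps == video.shape[1] * sr, (
+         f"audio covers {audio.shape[-1] / sr:.2f}s, "
+         f"video covers {video.shape[1] / fps:.2f}s"
+     )
+
+ check_alignment(torch.randn(1, 64000), torch.randn(1, 100, 1, 88, 88))  # passes
+ ```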
+
+ ## Training Details
+
+ - **Dataset**: VoxCeleb2 (2-speaker mixtures at 0 dB SNR)
+ - **Training**: ~200K steps with the Adam optimizer
+ - **Augmentation**: Random mixing, noise addition, video frame dropout
+ - **Loss**: SI-SNR (Scale-Invariant Signal-to-Noise Ratio); see the sketch below
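+
+ SI-SNR scores the estimate against the reference up to an arbitrary gain, so the loss cannot be gamed by rescaling the output. A minimal implementation of the standard definition (written for this card, not copied from the repo):
+
+ ```python
+ import torch
+
+ def si_snr(est, ref, eps=1e-8):
+     """Scale-Invariant SNR in dB for [batch, samples] waveforms."""
+     est = est - est.mean(dim=-1, keepdim=True)  # remove DC offset
+     ref = ref - ref.mean(dim=-1, keepdim=True)
+     # Project the estimate onto the reference to get the scaled target
+     s_target = (est * ref).sum(-1, keepdim=True) * ref \
+         / (ref.pow(2).sum(-1, keepdim=True) + eps)
+     e_noise = est - s_target
+     return 10 * torch.log10(s_target.pow(2).sum(-1) / (e_noise.pow(2).sum(-1) + eps))
+
+ # Training maximizes SI-SNR, i.e. minimizes its negative
+ loss = -si_snr(torch.randn(1, 64000), torch.randn(1, 64000)).mean()
+ ```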
+
+ ## Use Cases
+
+ - 🎧 **Hearing Aids**: Camera-based speech enhancement
+ - 💼 **Video Conferencing**: Noise suppression with visual context
+ - 🚗 **In-Car Assistants**: Driver speech extraction
+ - 🥽 **AR/VR**: Immersive communication in noisy environments
+ - 📱 **Edge Devices**: Efficient deployment on mobile/embedded systems
+
+ ## Limitations
+
+ - Requires a frontal or near-frontal face view for optimal performance
+ - Works best with 25 fps video input
+ - Trained on English speech (may need fine-tuning for other languages)
+ - Performance degrades with severe occlusions or low lighting
+
+ ## Citation
+
+ ```bibtex
+ @misc{li2025dolphin,
+   title={Dolphin: Efficient Audio-Visual Speech Separation with Discrete Lip Semantics and Multi-Scale Global-Local Attention},
+   author={Kai Li and Kejun Gao and Xiaolin Hu},
+   year={2025},
+   eprint={2509.23610},
+   archivePrefix={arXiv},
+   primaryClass={cs.SD},
+   url={https://arxiv.org/abs/2509.23610}
+ }
+ ```
+
+ ## License
+
+ Apache-2.0 License. See [LICENSE](https://github.com/JusperLee/Dolphin/blob/main/LICENSE) for details.
+
+ ## Acknowledgments
+
+ Built with inspiration from IIANet and SepReformer. Thanks to the Hugging Face team for hosting!
+
+ ## Contact
+
+ - 📧 Email: tsinghua.kaili@gmail.com
+ - 🐛 Issues: [GitHub Issues](https://github.com/JusperLee/Dolphin/issues)
+ - 💬 Discussions: [GitHub Discussions](https://github.com/JusperLee/Dolphin/discussions)
+
+ ---
+
+ **Developed by the Audio and Speech Group at Tsinghua University** 🎓
model/config.json ADDED
@@ -0,0 +1,136 @@
+ {
+   "model_type": "dolphin",
+   "task": "audio_visual_speech_separation",
+   "framework": "pytorch",
+   "license": "apache-2.0",
+   "tags": [
+     "audio",
+     "speech-separation",
+     "audio-visual",
+     "pytorch",
+     "dolphin"
+   ],
+   "architectures": [
+     "Dolphin"
+   ],
+   "auto_map": {
+     "AutoModel": "dolphin.Dolphin"
+   },
+   "num_stages": 4,
+   "sample_rate": 16000,
+   "vpre_channels": 3872,
+   "vmid_channels": 512,
+   "vin_channels": 64,
+   "vout_channels": 64,
+   "module_audio_enc": {
+     "in_channels": 1,
+     "out_channels": 256,
+     "kernel_size": 16,
+     "stride": 4,
+     "groups": 1,
+     "bias": false
+   },
+   "module_feature_projector": {
+     "num_channels": 256,
+     "in_channels": 256,
+     "out_channels": 128,
+     "kernel_size": 1,
+     "bias": false
+   },
+   "module_separator": {
+     "num_stages": 4,
+     "relative_positional_encoding": {
+       "in_channels": 128,
+       "num_heads": 8,
+       "maxlen": 2000,
+       "embed_v": false
+     },
+     "enc_stage": {
+       "global_blocks": {
+         "in_channels": 128,
+         "num_mha_heads": 8,
+         "dropout_rate": 0.05
+       },
+       "local_blocks": {
+         "in_channels": 128,
+         "kernel_size": 65,
+         "dropout_rate": 0.05
+       },
+       "down_conv_layer": {
+         "in_channels": 128,
+         "samp_kernel_size": 5
+       }
+     },
+     "simple_fusion": {
+       "out_channels": 128
+     },
+     "dec_stage": {
+       "global_blocks": {
+         "in_channels": 128,
+         "num_mha_heads": 8,
+         "dropout_rate": 0.05
+       },
+       "local_blocks": {
+         "in_channels": 128,
+         "kernel_size": 65,
+         "dropout_rate": 0.05
+       },
+       "spk_attention": {
+         "in_channels": 128,
+         "num_mha_heads": 8,
+         "dropout_rate": 0.05
+       }
+     }
+   },
+   "module_output_layer": {
+     "in_channels": 256,
+     "out_channels": 128
+   },
+   "module_audio_dec": {
+     "in_channels": 256,
+     "out_channels": 1,
+     "kernel_size": 16,
+     "stride": 4,
+     "bias": false
+   },
+   "video_encoder_params": {
+     "layers": [
+       "residual",
+       "compress_space",
+       "consecutive_residual",
+       "compress_space",
+       "consecutive_residual",
+       "linear_attend_space",
+       "compress_space",
+       "consecutive_residual",
+       "attend_space"
+     ],
+     "image_size": 88,
+     "in_channel": 1,
+     "init_channel": 4,
+     "max_dim": 32,
+     "input_conv_kernel_size": [
+       7,
+       7,
+       7
+     ],
+     "output_conv_kernel_size": [
+       3,
+       3,
+       3
+     ],
+     "residual_conv_kernel_size": 3,
+     "pad_mode": "constant",
+     "attn_dim_head": 32,
+     "attn_heads": 8,
+     "attn_dropout": 0.0,
+     "flash_attn": true,
+     "linear_attn_dim_head": 8,
+     "linear_attn_heads": 16,
+     "num_quantizers": 1,
+     "codebook_size": 256,
+     "codebook_dim": 64,
+     "commitment_cost": 1.0,
+     "distill_cost": 1.0
+   }
+ }
model/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9be694e4150588ca0af8447fae184b6262a3cf43587928bd6001eee5b4eefb8a
+ size 28391276
model/source.txt ADDED
@@ -0,0 +1 @@
+ https://huggingface.co/JusperLee/Dolphin