mohantesting committed on
Commit
4f6f0a1
·
verified ·
1 Parent(s): d507f3c

Add files using upload-large-folder tool

Browse files
Files changed (48) hide show
  1. .gitattributes +14 -0
  2. assistant_female_voice.wav +3 -0
  3. models/Llama-3.2-1B-Instruct/model.safetensors +3 -0
  4. models/Llama-3.2-1B-Instruct/original/consolidated.00.pth +3 -0
  5. models/Llama-3.2-1B-Instruct/original/tokenizer.model +3 -0
  6. models/Llama-3.2-1B-Instruct/tokenizer.json +0 -0
  7. models/VibeVoice-1.5B/figures/Fig1.png +3 -0
  8. models/VibeVoice-1.5B/model-00001-of-00003.safetensors +3 -0
  9. models/VibeVoice-1.5B/model-00002-of-00003.safetensors +3 -0
  10. models/VibeVoice-1.5B/model-00003-of-00003.safetensors +3 -0
  11. models/VibeVoice-1.5B/model.safetensors.index.json +0 -0
  12. models/VoxCPM-0.5B/assets/voxcpm_model.png +3 -0
  13. models/VoxCPM-0.5B/audiovae.pth +3 -0
  14. models/VoxCPM-0.5B/pytorch_model.bin +3 -0
  15. models/VoxCPM-0.5B/tokenizer.json +0 -0
  16. models/dsp/weights_24khz_1.5kbps_v1.0.pth +3 -0
  17. models/dsp/weights_24khz_3kbps_v1.0.pth +3 -0
  18. models/hub/version.txt +1 -0
  19. models/iic/SenseVoiceSmall/chn_jpn_yue_eng_ko_spectok.bpe.model +3 -0
  20. models/iic/SenseVoiceSmall/example/.DS_Store +0 -0
  21. models/iic/SenseVoiceSmall/example/en.mp3 +0 -0
  22. models/iic/SenseVoiceSmall/example/ja.mp3 +0 -0
  23. models/iic/SenseVoiceSmall/example/ko.mp3 +0 -0
  24. models/iic/SenseVoiceSmall/example/yue.mp3 +0 -0
  25. models/iic/SenseVoiceSmall/example/zh.mp3 +0 -0
  26. models/iic/SenseVoiceSmall/fig/aed_figure.png +3 -0
  27. models/iic/SenseVoiceSmall/fig/asr_results.png +3 -0
  28. models/iic/SenseVoiceSmall/fig/inference.png +3 -0
  29. models/iic/SenseVoiceSmall/fig/sensevoice.png +3 -0
  30. models/iic/SenseVoiceSmall/fig/ser_figure.png +3 -0
  31. models/iic/SenseVoiceSmall/fig/ser_table.png +3 -0
  32. models/iic/SenseVoiceSmall/model.pt +3 -0
  33. models/iic/speech_zipenhancer_ans_multiloss_16k_base/.msc +0 -0
  34. models/iic/speech_zipenhancer_ans_multiloss_16k_base/.mv +1 -0
  35. models/iic/speech_zipenhancer_ans_multiloss_16k_base/README.md +486 -0
  36. models/iic/speech_zipenhancer_ans_multiloss_16k_base/configuration.json +33 -0
  37. models/iic/speech_zipenhancer_ans_multiloss_16k_base/description/block.jpg +0 -0
  38. models/iic/speech_zipenhancer_ans_multiloss_16k_base/description/matrix.jpg +3 -0
  39. models/iic/speech_zipenhancer_ans_multiloss_16k_base/description/matrix_voicebank.jpg +3 -0
  40. models/iic/speech_zipenhancer_ans_multiloss_16k_base/examples/speech_with_noise.wav +0 -0
  41. models/iic/speech_zipenhancer_ans_multiloss_16k_base/examples/speech_with_noise1.wav +3 -0
  42. models/iic/speech_zipenhancer_ans_multiloss_16k_base/onnx_model.onnx +3 -0
  43. models/iic/speech_zipenhancer_ans_multiloss_16k_base/pytorch_model.bin +3 -0
  44. models/v10/generation_config.json +13 -0
  45. models/v10/model.safetensors +3 -0
  46. models/v10/tokenizer.json +3 -0
  47. models/wpt/wpt.pt +3 -0
  48. spk_001.wav +3 -0
.gitattributes CHANGED
@@ -33,3 +33,17 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ models/VibeVoice-1.5B/figures/Fig1.png filter=lfs diff=lfs merge=lfs -text
37
+ models/v10/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ assistant_female_voice.wav filter=lfs diff=lfs merge=lfs -text
39
+ spk_001.wav filter=lfs diff=lfs merge=lfs -text
40
+ models/VoxCPM-0.5B/assets/voxcpm_model.png filter=lfs diff=lfs merge=lfs -text
41
+ models/iic/SenseVoiceSmall/fig/ser_figure.png filter=lfs diff=lfs merge=lfs -text
42
+ models/iic/SenseVoiceSmall/fig/sensevoice.png filter=lfs diff=lfs merge=lfs -text
43
+ models/iic/SenseVoiceSmall/fig/ser_table.png filter=lfs diff=lfs merge=lfs -text
44
+ models/iic/SenseVoiceSmall/fig/asr_results.png filter=lfs diff=lfs merge=lfs -text
45
+ models/iic/SenseVoiceSmall/fig/inference.png filter=lfs diff=lfs merge=lfs -text
46
+ models/iic/speech_zipenhancer_ans_multiloss_16k_base/examples/speech_with_noise1.wav filter=lfs diff=lfs merge=lfs -text
47
+ models/iic/speech_zipenhancer_ans_multiloss_16k_base/description/matrix.jpg filter=lfs diff=lfs merge=lfs -text
48
+ models/iic/SenseVoiceSmall/fig/aed_figure.png filter=lfs diff=lfs merge=lfs -text
49
+ models/iic/speech_zipenhancer_ans_multiloss_16k_base/description/matrix_voicebank.jpg filter=lfs diff=lfs merge=lfs -text
assistant_female_voice.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d712ba6de1d15d52eda96bdc043ce43eb5af4b4ac441b78b6fb0fdaf6683c7a
3
+ size 235244
models/Llama-3.2-1B-Instruct/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ff795ff6a07e6a68085d206fb84417da2f083f68391c2843cd2b8ac6df8538f
3
+ size 2471645608
models/Llama-3.2-1B-Instruct/original/consolidated.00.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc17d497df5e4175b3a8acb4f5865b26f7fc1b009b25bef814b95fde10e8a1f3
3
+ size 2471677246
models/Llama-3.2-1B-Instruct/original/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82e9d31979e92ab929cd544440f129d9ecd797b69e327f80f17e1c50d5551b55
3
+ size 2183982
models/Llama-3.2-1B-Instruct/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/VibeVoice-1.5B/figures/Fig1.png ADDED

Git LFS Details

  • SHA256: 64464f28380f81e76c88d76431a08b48c7f82a283e17f2e32c241c4b03407d19
  • Pointer size: 131 Bytes
  • Size of remote file: 154 kB
models/VibeVoice-1.5B/model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5f0a61ddeaeb028e3af540ba4dee7933ad30f9f30b6e1320dd9c875a2daa033
3
+ size 1975317828
models/VibeVoice-1.5B/model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81c3891f7b2493eb48a9eb6f5be0df48d4f1a4bfd952d84e21683ca6d0bf7969
3
+ size 1983051688
models/VibeVoice-1.5B/model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb6e7e5e86b4a41fffbe1f3aaf445d0d50b5e21ed47574101b777f77d75fa196
3
+ size 1449832938
models/VibeVoice-1.5B/model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
models/VoxCPM-0.5B/assets/voxcpm_model.png ADDED

Git LFS Details

  • SHA256: 49f6eb7998135ad49f5dd0ee1fa2c099d79a016ab59fe29fc039f7f32ef8f5ca
  • Pointer size: 131 Bytes
  • Size of remote file: 145 kB
models/VoxCPM-0.5B/audiovae.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b93f34c771281679b0ff93be3d1c1681eb1f301c3892e701db8f10a725b20a9
3
+ size 301494192
models/VoxCPM-0.5B/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62cee3da3fa803a7eb7a8fa47318ab9a6d88abe17b9d51062852f6ac86b52e3a
3
+ size 1304698606
models/VoxCPM-0.5B/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/dsp/weights_24khz_1.5kbps_v1.0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d77ca0b04df942ec64e6a7a162bcac093b1127700acdaec0079f40d32c4405fb
3
+ size 295731578
models/dsp/weights_24khz_3kbps_v1.0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:566a68a00dd5f5780aff5bbd80db4fd1d9aa25c639c516e596d3841710b6ffe7
3
+ size 295949662
models/hub/version.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 1
models/iic/SenseVoiceSmall/chn_jpn_yue_eng_ko_spectok.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa87f86064c3730d799ddf7af3c04659151102cba548bce325cf06ba4da4e6a8
3
+ size 377341
models/iic/SenseVoiceSmall/example/.DS_Store ADDED
Binary file (6.15 kB). View file
 
models/iic/SenseVoiceSmall/example/en.mp3 ADDED
Binary file (57.4 kB). View file
 
models/iic/SenseVoiceSmall/example/ja.mp3 ADDED
Binary file (57.8 kB). View file
 
models/iic/SenseVoiceSmall/example/ko.mp3 ADDED
Binary file (27.9 kB). View file
 
models/iic/SenseVoiceSmall/example/yue.mp3 ADDED
Binary file (31.2 kB). View file
 
models/iic/SenseVoiceSmall/example/zh.mp3 ADDED
Binary file (45 kB). View file
 
models/iic/SenseVoiceSmall/fig/aed_figure.png ADDED

Git LFS Details

  • SHA256: 643a2705d9b2ac1fe95cc4f1ca7df23c0f030e256e351541287391128665b5b8
  • Pointer size: 131 Bytes
  • Size of remote file: 119 kB
models/iic/SenseVoiceSmall/fig/asr_results.png ADDED

Git LFS Details

  • SHA256: fda4390934fc6d309cbac7f22053580cbedd8a608ad52e3cfac0fb2947a8c4fe
  • Pointer size: 131 Bytes
  • Size of remote file: 244 kB
models/iic/SenseVoiceSmall/fig/inference.png ADDED

Git LFS Details

  • SHA256: 0bcde7dd81dcb5b1198cb7fd210fa05a6861512b844027fe8508c8f6a997352e
  • Pointer size: 131 Bytes
  • Size of remote file: 958 kB
models/iic/SenseVoiceSmall/fig/sensevoice.png ADDED

Git LFS Details

  • SHA256: e570d2fad030bb48b0094a3085791d3a6aa1be338f0a003736a11e58e5034afc
  • Pointer size: 131 Bytes
  • Size of remote file: 901 kB
models/iic/SenseVoiceSmall/fig/ser_figure.png ADDED

Git LFS Details

  • SHA256: eaf050fa5d775a3b84031d8eb5a200699312d7e9a7d769550643cf02edbdb13b
  • Pointer size: 131 Bytes
  • Size of remote file: 199 kB
models/iic/SenseVoiceSmall/fig/ser_table.png ADDED

Git LFS Details

  • SHA256: a0a8ae6dcc63b89a7d9160461b7c78333d0c8c0cdcf7bd0311d59a3b36d26370
  • Pointer size: 131 Bytes
  • Size of remote file: 326 kB
models/iic/SenseVoiceSmall/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:833ca2dcfdf8ec91bd4f31cfac36d6124e0c459074d5e909aec9cabe6204a3ea
3
+ size 936291369
models/iic/speech_zipenhancer_ans_multiloss_16k_base/.msc ADDED
Binary file (786 Bytes). View file
 
models/iic/speech_zipenhancer_ans_multiloss_16k_base/.mv ADDED
@@ -0,0 +1 @@
 
 
1
+ Revision:master,CreatedAt:1756708460
models/iic/speech_zipenhancer_ans_multiloss_16k_base/README.md ADDED
@@ -0,0 +1,486 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tasks:
3
+ - acoustic-noise-suppression
4
+ widgets:
5
+ - task: acoustic-noise-suppression
6
+ inputs:
7
+ - type: audio
8
+ name: input
9
+ title: 带噪音的原始音频
10
+ validator:
11
+ max_size: 10M
12
+ examples:
13
+ - name: 1
14
+ title: 示例1
15
+ inputs:
16
+ - name: input
17
+ data: git://examples/speech_with_noise1.wav
18
+ - name: 2
19
+ title: 示例2
20
+ inputs:
21
+ - name: input
22
+ data: git://examples/speech_with_noise.wav
23
+ inferencespec:
24
+ cpu: 1
25
+ memory: 1000
26
+ gpu: 0
27
+ gpu_memory: 1000
28
+ model_type:
29
+ - dual-path
30
+ domain:
31
+ - audio
32
+ frameworks:
33
+ - pytorch
34
+ model-backbone:
35
+ - ZipEnhancer
36
+ customized-quickstart: True
37
+ finetune-support: False
38
+ license: Apache License 2.0
39
+ tags:
40
+ - Alibaba
41
+ - ANS
42
+ - AI降噪
43
+ - 语音增强
44
+ - 音频前处理
45
+ - 3A
46
+ datasets:
47
+ - null
48
+ - modelscope/ICASSP_2021_DNS_Challenge
49
+ base_model:
50
+ - iic/speech_zipenhancer_ans_multiloss_16k_base
51
+ ---
52
+ # ZipEnhancer语音降噪模型介绍
53
+
54
+ 用于语音增强任务的最新语音降噪模型。在日常生活中,我们常常会遇到录音环境不佳的情况,比如试图记录一段清晰的语音信息时,周围环境的噪音却不可避免地侵入录音,使得最终录制得到的语音信号中充满了干扰。类似地,在诸如地铁或公交车这类嘈杂的公共场所进行通话时,为了确保通话双方能够听见,人们有时采用提高说话的音量来增强人声信号,来缓解背景噪声的影响。环境噪音对语音通信造成的影响构成了使用语音应用时的一大挑战和障碍。确保语音的高质量传输与高可理解性,常因环境噪声、录音设备的局限性、以及声音的混响和回声效应而变得困难重重,这直接导致了通话清晰度和沟通效率的显著下降。因此,如何在喧嚣的环境里维持高水平的语音质量与可懂度,成为了技术行业和学术界共同努力解决的重要课题。
55
+
56
+ 历经多年的研发探索,语音降噪技术已实现了重要的进展,特别是在应对复杂噪声环境的降噪需求上,采用基于时域、时频域等方法的深度学习神经网络技术,对幅度谱、相位谱以及复数域等声学特征进行高效提取,另外采用双路建模方式提升了语音降噪方法的性能。这些技术能够在极小化语音失真的前提下,高效滤除背景噪音,从而极大地恢复了原始语音的清晰度。这类先进的处理模型也被形象地称为AI语音增强模型。
57
+
58
+ 语音降噪模型的核心功能,在于从充满杂质的音频信号中精准分离并提取出纯净的目标语音,不仅有效提升语音的可听性和可理解性,同时也为语音识别技术的准确度和响应速度带来了显著优化。尤为值得一提的是,我们的语音降噪模型设计精巧,仅需接收单个麦克风录制的原始音频文件作为输入,就能输出经过高效降噪处理的、音质清晰的音频结果。这一过程保持了音频的原始格式不变,仅仅针对性地剔除了噪音与不必要的混响成分,最大化地保存了原始语音的真实性和完整性。
59
+
60
+ 模型的潜在应用场景:
61
+ * 在嘈杂的声学环境中降低噪声影响,甚至是消除噪声。
62
+ * 改善任意来源的音频声学质量,提升语音清晰度。
63
+ * 在各种背景音中提取人声或者背景声。
64
+
65
+ **模型支持在线体验啦!**
66
+
67
+
68
+ ## 模型描述
69
+
70
+ ZipEnhancer是阿里巴巴语音实验室提出的基于时频域(TF-Domain)建模的双路(Dual-Path)可进行时频域特征压缩的语音降噪模型。相比于过去的双路语音增强和语音降噪模型,保留额外的频率维度,构建四维的隐层特征(B, T, F, C),导致了模型的计算量居高不下的问题,我们引入的时频域特征下采样模块,来自定义降低隐层特征维度,减少模型的计算量。
71
+
72
+ 该模型神经网络结构如下图所示。
73
+
74
+ ![model.jpg](description/model.jpg)
75
+
76
+ 该模型包括编码器(Encoder)、双路径压缩变压器块(Dual-Path ZipformerBlocks) 、幅度解码器(Magnitude Decoder)和相位解码器(Phase Decoder)。Encoder最初对幅度(Magnitude)和相位(Phase)进行建模以获得隐藏层特征。随后,Dual-Path ZipformerBlocks使用DownSampleStacks和FT-ZipformerBlocks对频域和时域进行顺序建模,然后由幅度解码器恢复幅度谱,并由相位解码器显式恢复相位谱。我们利用ZipformerBlock将双路径Block构建为FT-ZipformerBlocks。
77
+ 此外,我们提出了具有成对的下采样和上采样结构的DownSampleStacks,我们使用成对的DownSample和UpSample模块实现时间或频率长度的对称缩放,以降低计算成本,并在不同分辨率级别对时域和频域信息进行建模。在时间维度进行下采样的模块称为T_DownSample和T_UpSample,而在频率维度进行下采样的模块称为F_DownSample和F_UpSample。
78
+ 在 DNS Challenge 2020 (DNS2020)和VoiceBank+DEMAND数据集上进行的大量实验证明了我们的ZipEnhancer的效果。
79
+
80
+ ZipEnhancer优于类似规模的模型,在具有2.04M个参数和62.41G FLOPS的情况下,**在DNS2020数据集上实现了新的最先进(SOTA)语音质量感知评估(PESQ)得分3.69**,数据集Voicebank+DEMAND上PESQ达到3.63。
81
+
82
+ 模型输入和输出均为16kHz采样率单通道语音时域波形信号,输入信号可由单通道麦克风直接进行录制,输出为噪声抑制后的语音音频信号。模型输入信号通过STFT变换转换成幅度(Magnitude)和相位(Phase)特征作为输入,并采用ZipEnhancer在时频域上进行有效的双路建模和特征压缩,预测最终的增强幅度和相位,将幅度和相位转化后构建增强后的复数域特征,最后通过STFT逆变换得到增强后语音波形信号。
83
+
84
+
85
+ ## 期望模型使用方式以及适用范围
86
+
87
+
88
+ ### 如何使用
89
+
90
+ 在安装ModelScope完成之后即可使用```speech_zipenhancer_ans_multiloss_16k_base```进行推理。模型输入和输出均为16kHz采样率单通道语音时域波形信号,输入信号可由单通道麦克风直接进行录制,输出为噪声抑制后的语音音频信号。为了方便使用,pipeline在模型处理前后增加了wav文件处理逻辑,可以直接读取一个wav文件,并把输出结果保存在指定的wav文件中。
91
+
92
+ #### 环境准备:
93
+
94
+ * 本模型支持Linux,Windows和MacOS平台。
95
+
96
+ ```
97
+ conda install pytorch torchaudio torchvision -c pytorch
98
+ ```
99
+
100
+ * 本模型的pipeline中使用了三方库SoundFile进行wav文件处理,**在Linux系统上用户需要手动安装SoundFile的底层依赖库libsndfile**,在Windows和MacOS上会自动安装不需要用户操作。详细信息可参考[SoundFile官网](https://github.com/bastibe/python-soundfile#installation)。以Ubuntu系统为例,用户需要执行如下命令:
101
+
102
+ ```shell
103
+ sudo apt-get update
104
+ sudo apt-get install libsndfile1
105
+ ```
106
+
107
+ * 本模型要求modelscope library版本 >=1.20,若不满足可按以下方式进行升级。
108
+
109
+ ```shell
110
+ pip install modelscope==1.22.0
111
+ # 如仍缺少依赖, 可按下述补充安装
112
+ pip install simplejson datasets==2.18.0 addict sortedcontainers
113
+ ```
114
+ * pytorch环境建议显式设置线程数。https://github.com/pytorch/pytorch/issues/90760
115
+
116
+ ```python
117
+ # 设置要使用的线程数,比如8
118
+ import torch
119
+ torch.set_num_threads(8)
120
+ torch.set_num_interop_threads(8)
121
+ ```
122
+
123
+ #### 代码范例
124
+
125
+ ```python
126
+ from modelscope.pipelines import pipeline
127
+ from modelscope.utils.constant import Tasks
128
+
129
+
130
+ ans = pipeline(
131
+ Tasks.acoustic_noise_suppression,
132
+ model='damo/speech_zipenhancer_ans_multiloss_16k_base')
133
+ result = ans(
134
+ 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/audios/speech_with_noise1.wav',
135
+ output_path='output.wav')
136
+ print("done")
137
+ ```
138
+
139
+ #### 流式处理代码示例
140
+
141
+ ```python
142
+ from modelscope.pipelines import pipeline
143
+ from modelscope.utils.constant import Tasks
144
+ from modelscope.fileio import File
145
+
146
+
147
+ def create_wav_header(dataflow, sample_rate=16000, num_channels=1, bits_per_sample=16):
148
+ """
149
+ 创建WAV文件头的字节串。
150
+
151
+ :param dataflow: 音频bytes数据(以字节为单位)。
152
+ :param sample_rate: 采样率,默认16000。
153
+ :param num_channels: 声道数,默认1(单声道)。
154
+ :param bits_per_sample: 每个样本的位数,默认16。
155
+ :return: WAV文件头的字节串和音频bytes数据。
156
+ """
157
+ total_data_len = len(dataflow)
158
+ byte_rate = sample_rate * num_channels * bits_per_sample // 8
159
+ block_align = num_channels * bits_per_sample // 8
160
+ data_chunk_size = total_data_len
161
+ fmt_chunk_size = 16
162
+ riff_chunk_size = 4 + (8 + fmt_chunk_size) + (8 + data_chunk_size)
163
+
164
+ # 使用 bytearray 构建字节串
165
+ header = bytearray()
166
+
167
+ # RIFF/WAVE header
168
+ header.extend(b'RIFF')
169
+ header.extend(riff_chunk_size.to_bytes(4, byteorder='little'))
170
+ header.extend(b'WAVE')
171
+
172
+ # fmt subchunk
173
+ header.extend(b'fmt ')
174
+ header.extend(fmt_chunk_size.to_bytes(4, byteorder='little'))
175
+ header.extend((1).to_bytes(2, byteorder='little')) # Audio format (1 is PCM)
176
+ header.extend(num_channels.to_bytes(2, byteorder='little'))
177
+ header.extend(sample_rate.to_bytes(4, byteorder='little'))
178
+ header.extend(byte_rate.to_bytes(4, byteorder='little'))
179
+ header.extend(block_align.to_bytes(2, byteorder='little'))
180
+ header.extend(bits_per_sample.to_bytes(2, byteorder='little'))
181
+
182
+ # data subchunk
183
+ header.extend(b'data')
184
+ header.extend(data_chunk_size.to_bytes(4, byteorder='little'))
185
+
186
+ return bytes(header) + dataflow
187
+
188
+
189
+ ans = pipeline(
190
+ Tasks.acoustic_noise_suppression,
191
+ model='damo/speech_zipenhancer_ans_multiloss_16k_base')
192
+
193
+ audio_path = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/audios/speech_with_noise1.wav'
194
+
195
+ if audio_path.startswith("http"):
196
+ import io
197
+
198
+ file_bytes = File.read(audio_path)
199
+ audiostream = io.BytesIO(file_bytes)
200
+ else:
201
+ audiostream = open(audio_path, 'rb')
202
+
203
+ window = 2 * 16000 * 2 # 2 秒的窗口大小,以字节为单位
204
+ outputs = b''
205
+ total_bytes_len = 0
206
+ audiostream.read(44)
207
+ for dataflow in iter(lambda: audiostream.read(window), ""):
208
+ print(len(dataflow))
209
+ total_bytes_len += len(dataflow)
210
+ if len(dataflow) == 0:
211
+ break
212
+ result = ans(create_wav_header(dataflow, sample_rate=16000, num_channels=1, bits_per_sample=16))
213
+ output = result['output_pcm']
214
+ outputs = outputs + output
215
+ audiostream.close()
216
+
217
+ outputs = outputs[:total_bytes_len]
218
+ output_path = 'output.wav'
219
+ with open(output_path, 'wb') as out_wave:
220
+ out_wave.write(create_wav_header(outputs, sample_rate=16000, num_channels=1, bits_per_sample=16))
221
+
222
+ ```
223
+
224
+ #### 调用提供的onnx模型代码示例
225
+
226
+ ```python
227
+ import soundfile as sf
228
+ import numpy as np
229
+ import torch
230
+ import onnxruntime
231
+ import io
232
+ import os
233
+
234
+ from modelscope.pipelines import pipeline
235
+ from modelscope.utils.constant import Tasks
236
+ from modelscope.models.audio.ans.zipenhancer import mag_pha_stft, mag_pha_istft
237
+ from modelscope.utils.audio.audio_utils import audio_norm
238
+ from modelscope.fileio import File
239
+ from modelscope.utils.file_utils import get_modelscope_cache_dir
240
+
241
+
242
+ # onnx模型路径
243
+ MS_CACHE_HOME = get_modelscope_cache_dir()
244
+ onnx_model_path = os.path.join(MS_CACHE_HOME, 'hub/damo/speech_zipenhancer_ans_multiloss_16k_base/onnx_model.onnx')
245
+
246
+ # 删除旧模型
247
+ if os.path.exists(onnx_model_path):
248
+ os.remove(onnx_model_path)
249
+
250
+ # 下载模型
251
+ ans = pipeline(
252
+ Tasks.acoustic_noise_suppression,
253
+ model='damo/speech_zipenhancer_ans_multiloss_16k_base')
254
+
255
+
256
+ audio_path = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/audios/speech_with_noise1.wav'
257
+ output_path = 'output.wav'
258
+
259
+ is_verbose = True
260
+
261
+
262
+ class OnnxModel:
263
+ def __init__(self, onnx_filepath, providers=None):
264
+ self.onnx_model = onnxruntime.InferenceSession(onnx_filepath, providers=providers)
265
+
266
+ def to_numpy(self, tensor):
267
+ return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
268
+
269
+ def __call__(self, noisy_wav):
270
+ n_fft = 400
271
+ hop_size = 100
272
+ win_size = 400
273
+
274
+ norm_factor = torch.sqrt(noisy_wav.shape[1] / torch.sum(noisy_wav ** 2.0))
275
+ if is_verbose:
276
+ print(f"norm_factor {norm_factor}" )
277
+
278
+ noisy_audio = (noisy_wav * norm_factor)
279
+
280
+ noisy_amp, noisy_pha, _ = mag_pha_stft(
281
+ noisy_audio,
282
+ n_fft,
283
+ hop_size,
284
+ win_size,
285
+ compress_factor=0.3,
286
+ center=True)
287
+
288
+ ort_inputs = {self.onnx_model.get_inputs()[0].name: self.to_numpy(noisy_amp),
289
+ self.onnx_model.get_inputs()[1].name: self.to_numpy(noisy_pha),
290
+ }
291
+ ort_outs = self.onnx_model.run(None, ort_inputs)
292
+
293
+ amp_g = torch.from_numpy(ort_outs[0])
294
+ pha_g = torch.from_numpy(ort_outs[1])
295
+
296
+ if is_verbose:
297
+ print(f"Enhanced amplitude mean and std: {torch.mean(amp_g)} {torch.std(amp_g)}")
298
+ print(f"Enhanced phase mean and std: {torch.mean(pha_g)} {torch.std(pha_g)}")
299
+
300
+ wav = mag_pha_istft(
301
+ amp_g,
302
+ pha_g,
303
+ n_fft,
304
+ hop_size,
305
+ win_size,
306
+ compress_factor=0.3,
307
+ center=True)
308
+
309
+ wav = wav / norm_factor
310
+
311
+ wav = self.to_numpy(wav)
312
+
313
+ return wav
314
+
315
+
316
+ onnx_model = OnnxModel(onnx_model_path)
317
+
318
+ if audio_path.startswith("http"):
319
+ file_bytes = File.read(audio_path)
320
+ wav, fs = sf.read(io.BytesIO(file_bytes))
321
+ else:
322
+ wav, fs = sf.read(audio_path)
323
+
324
+ wav = audio_norm(wav).astype(np.float32)
325
+ noisy_wav = torch.from_numpy(np.reshape(wav, [1, wav.shape[0]]))
326
+
327
+ if is_verbose:
328
+ print(f"wav {wav}")
329
+ print(f"noisy_wav {noisy_wav}")
330
+
331
+ enhanced_wav = onnx_model(noisy_wav)
332
+
333
+ if is_verbose:
334
+ print(f"enhanced_wav {enhanced_wav}")
335
+
336
+ sf.write(output_path, (enhanced_wav[0] * 32768).astype(np.int16), fs)
337
+ ```
338
+
339
+ #### 在自己平台上导出onnx模型代码示例
340
+
341
+ 如果ONNX模型执行遇到问题,可以按照如下代码尝试在自己平台上重新导出ONNX模型。输出ONNX文件路径为`ans_pipeline_onnx.onnx`。
342
+
343
+ ```python
344
+ from modelscope.pipelines import pipeline
345
+ from modelscope.utils.constant import Tasks
346
+ import torch
347
+ import torch.nn as nn
348
+ import onnx
349
+ def export_ZipEnhancer_model_onnx(
350
+ model,
351
+ model_filename: str,
352
+ opset_version: int = 11,
353
+ max_support_seconds: int = 60,
354
+ ) -> None:
355
+
356
+ seconds = 2
357
+ f = 201
358
+ t = 161 * seconds
359
+ noisy_mag, noisy_pha = torch.randn(1, f, t), torch.randn(1, f, t)
360
+
361
+ max_len = 161 * max_support_seconds # 60s
362
+ encoders = model.model.TSConformer.encoders
363
+ for name, module in encoders.named_modules():
364
+ if name.endswith('encoder_pos'): # 匹配路径末端为 encoder_pos
365
+ # print(name)
366
+ module.extend_pe(torch.tensor(0.0).expand(max_len))
367
+
368
+ model = torch.jit.trace(model, (noisy_mag, noisy_pha))
369
+
370
+ model.eval()
371
+ torch.onnx.export(
372
+ model,
373
+ (noisy_mag, noisy_pha),
374
+ model_filename,
375
+ verbose=False,
376
+ opset_version=opset_version,
377
+ input_names=["noisy_mag", "noisy_pha"],
378
+ output_names=["amp_g", "pha_g"],
379
+ dynamic_axes={
380
+ "noisy_mag": {0: "N", 2: "T"},
381
+ "noisy_pha": {0: "N", 2: "T"},
382
+ "amp_g": {0: "N", 2: "T"},
383
+ "pha_g": {0: "N", 2: "T"},
384
+ },
385
+ )
386
+
387
+ meta_data = {
388
+ "model_type": "ZipEnhancerS",
389
+ "version": "1",
390
+ }
391
+
392
+ def add_meta_data(filename, meta_data):
393
+
394
+ model = onnx.load(filename)
395
+ for key, value in meta_data.items():
396
+ meta = model.metadata_props.add()
397
+ meta.key = key
398
+ meta.value = value
399
+
400
+ onnx.save(model, filename)
401
+
402
+ add_meta_data(filename=model_filename, meta_data=meta_data)
403
+
404
+ class OnnxModel(nn.Module):
405
+
406
+ def __init__(self, ans):
407
+ super().__init__()
408
+ self.model = ans.model.model
409
+
410
+ def forward(self, noisy_amp, noisy_pha):
411
+ amp_g, pha_g, _, _, _ = self.model(noisy_amp, noisy_pha)
412
+ return amp_g, pha_g
413
+
414
+ ans = pipeline(
415
+ Tasks.acoustic_noise_suppression,
416
+ model='damo/speech_zipenhancer_ans_multiloss_16k_base')
417
+
418
+ model = OnnxModel(ans)
419
+
420
+ model_onnx_filename = 'ans_pipeline_onnx.onnx'
421
+ export_ZipEnhancer_model_onnx(
422
+ model,
423
+ model_onnx_filename,
424
+ opset_version=13,
425
+ max_support_seconds=60
426
+ )
427
+ ```
428
+
429
+ ### 模型局限性以及可能的偏差
430
+
431
+ 该模型仅用DNS Challenge 2020开源数据进行训练,该训练集仅包括英语语种和集内噪声,可能在个别跨域噪声上有性能偏差。
432
+
433
+ ## 训练数据介绍
434
+
435
+ 模型的训练数据来自DNS-Challenge 2020开源数据集,是Microsoft团队为ICASSP相关挑战赛提供的,[官方网址](https://github.com/microsoft/DNS-Challenge)[2]。该模型用来处理16k音频,因此只使用了其中的fullband中的英文数据。
436
+
437
+
438
+ ## 数据评估及结果
439
+
440
+ 与其他SOTA模型在DNS Challenge 2020官方测试集上对比效果如下(当前提供的模型为ZipEnhancerS):
441
+
442
+ ![matrix.jpg](description/matrix.jpg)
443
+
444
+
445
+ 与其他SOTA模型在VoiceBank+DEMAND官方测试集上对比效果如下(该数据集下训练的模型未提供):
446
+
447
+
448
+ ![matrix_voicebank.jpg](description/matrix_voicebank.jpg)
449
+
450
+ 指标说明:
451
+
452
+ * WB-PESQ (wide-band Perceptual Evaluation Of Speech Quality) 宽带 (16k) 语音质量感知评估,是一种客观的、全参考的语音质量评估方法,得分范围在-0.5~4.5之间,得分越高表示语音质量越好。
453
+ * NB-PESQ (narrowband Perceptual Evaluation Of Speech Quality) 窄带 (8k) 语音质量感知评估,同上得分越高表示语音质量越好。
454
+ * STOI (Short-Time Objective Intelligibility) 短时客观可懂度,反映人类的听觉感知系统对语音可懂度的客观评价,STOI 值介于0~1 之间,值越大代表语音可懂度越高,越清晰。
455
+ * SI-SDR (Scale Invariant Signal-to-Distortion Ratio) 尺度不变的信干比,是在普通信噪比基础上通过正则化消减信号变化导致的影响,是针对宽带噪声失真的语音增强算法的常规衡量方法。
456
+ * CSIG、CBAK 和 COVL 分别评估了信号失真、背景噪声侵扰性以及整体效果的平均意见得分(Mean Opinion Score,MOS)。指标越高越好。
457
+ * SSNR (Segmental Signal-to-Noise Ratio),分段信噪比,通过将信号分成多个时间段来分别评估各段内的信号纯净度,反映了不同部分中信号与噪声的比例,从而更细致地表明了局部信号质量及噪声影响。指标越高越好。
458
+
459
+ DNS 2020 Challenge的结果列表可见[这里](https://www.microsoft.com/en-us/research/academic-program/deep-noise-suppression-challenge-interspeech-2020/results/)。
460
+
461
+
462
+
463
+
464
+ ### 相关论文以及引用信息
465
+
466
+ 更多详情请参考下面相关论文。
467
+
468
+ [1] H. Wang and B. Tian, "ZipEnhancer: Dual-Path Down-Up Sampling-based Zipformer for Monaural Speech Enhancement," ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), Hyderabad, India, 2025, pp. 1-5, doi: 10.1109/ICASSP49660.2025.10888703.
469
+
470
+ ```BibTeX
471
+
472
+
473
+ @INPROCEEDINGS{10888703,
474
+ author={Wang, Haoxu and Tian, Biao},
475
+ booktitle={ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
476
+ title={ZipEnhancer: Dual-Path Down-Up Sampling-based Zipformer for Monaural Speech Enhancement},
477
+ year={2025},
478
+ volume={},
479
+ number={},
480
+ pages={1-5},
481
+ keywords={Time-frequency analysis;Computational modeling;Speech enhancement;Signal processing;Real-time systems;Acoustics;Computational efficiency;Complexity theory;Speech Enhancement;Down-Up Sampling;Dual-Path;ZipEnhancer;Zipformer},
482
+ doi={10.1109/ICASSP49660.2025.10888703}}
483
+
484
+ ```
485
+
486
+
models/iic/speech_zipenhancer_ans_multiloss_16k_base/configuration.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "framework": "pytorch",
3
+ "task": "acoustic-noise-suppression",
4
+ "pipeline": {
5
+ "type": "speech_zipenhancer_ans_multiloss_16k_base"
6
+ },
7
+ "model": {
8
+ "type": "speech_zipenhancer_ans_multiloss_16k_base",
9
+ "dense_channel": 64,
10
+ "num_tsconformers": 4,
11
+ "former_name": "Zipformer2DualPathEncoder",
12
+ "former_conf": {
13
+ "num_encoder_layers": [1, 1, 1, 1],
14
+ "downsampling_factor": [1, 2, 2, 1],
15
+ "f_downsampling_factor": [1, 2, 2, 1],
16
+ "encoder_dim": [64, 64, 64, 64],
17
+ "pos_dim": 24,
18
+ "num_heads": 4,
19
+ "query_head_dim": 12,
20
+ "pos_head_dim": 4,
21
+ "value_head_dim": 8,
22
+ "feedforward_dim": [256, 256, 256, 256],
23
+ "cnn_module_kernel": 15,
24
+ "causal": false,
25
+ "encoder_unmasked_dim": 64,
26
+ "warmup_batches": 4000.0
27
+ },
28
+ "batch_first": true,
29
+ "model_num_spks": 1
30
+ },
31
+ "preprocessor": {}
32
+ }
33
+
models/iic/speech_zipenhancer_ans_multiloss_16k_base/description/block.jpg ADDED
models/iic/speech_zipenhancer_ans_multiloss_16k_base/description/matrix.jpg ADDED

Git LFS Details

  • SHA256: e05950d815b491208215a0a0be2e57089749f44d53ff145cf13b5bfc73c78290
  • Pointer size: 131 Bytes
  • Size of remote file: 166 kB
models/iic/speech_zipenhancer_ans_multiloss_16k_base/description/matrix_voicebank.jpg ADDED

Git LFS Details

  • SHA256: 3a07dd4a981ed1821d7fe5e7628a944e587d93b441d5c94cd885bee2cd33b9c1
  • Pointer size: 131 Bytes
  • Size of remote file: 273 kB
models/iic/speech_zipenhancer_ans_multiloss_16k_base/examples/speech_with_noise.wav ADDED
Binary file (76.8 kB). View file
 
models/iic/speech_zipenhancer_ans_multiloss_16k_base/examples/speech_with_noise1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b2882d3bcd9e8f8f9531ac34ac09c0208d86500b910d3e1ca34c022caa9be62
3
+ size 155874
models/iic/speech_zipenhancer_ans_multiloss_16k_base/onnx_model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93f5b75638d42698736c89eae2daef43aa874011f06cd41bf6afbcc1452edecb
3
+ size 9805700
models/iic/speech_zipenhancer_ans_multiloss_16k_base/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b18896915e27a821585584221d0c0820f35e12145315ae3f1e73ccd5a68d195f
3
+ size 8424575
models/v10/generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 133309,
4
+ "do_sample": true,
5
+ "eos_token_id": 133310,
6
+ "min_p": 0.05,
7
+ "pad_token_id": 128001,
8
+ "repetition_penalty": 1.1,
9
+ "temperature": 0.4,
10
+ "top_k": 40,
11
+ "top_p": 0.9,
12
+ "transformers_version": "4.48.0"
13
+ }
models/v10/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30f3df2b3999f0cf34cb2a1a9c84ce0b2b96e8717019a0a76d8cb00c81a86da7
3
+ size 2496811440
models/v10/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec54a55e3c6fc7318ea02cbd1a6eb1fb180bff1b58acbf068504a25f1b407b7b
3
+ size 18366636
models/wpt/wpt.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1
3
+ size 1528008539
spk_001.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79de3a5775f8880c0bf3e950b103f03b257db630224fab265a309d82753b1aa5
3
+ size 480044