wgs committed on
Commit
04edf76
·
verified ·
1 Parent(s): a7ec871

Upload 92 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +5 -0
  2. ckpts/.DS_Store +0 -0
  3. ckpts/DNSMOSPro_NISQA/model_best.pt +3 -0
  4. ckpts/MossFormer2_SE_48K/.cache/huggingface/.gitignore +1 -0
  5. ckpts/MossFormer2_SE_48K/.cache/huggingface/download/.gitattributes.lock +0 -0
  6. ckpts/MossFormer2_SE_48K/.cache/huggingface/download/.gitattributes.metadata +3 -0
  7. ckpts/MossFormer2_SE_48K/.cache/huggingface/download/README.md.lock +0 -0
  8. ckpts/MossFormer2_SE_48K/.cache/huggingface/download/README.md.metadata +3 -0
  9. ckpts/MossFormer2_SE_48K/.cache/huggingface/download/last_best_checkpoint.lock +0 -0
  10. ckpts/MossFormer2_SE_48K/.cache/huggingface/download/last_best_checkpoint.metadata +3 -0
  11. ckpts/MossFormer2_SE_48K/.cache/huggingface/download/last_best_checkpoint.pt.lock +0 -0
  12. ckpts/MossFormer2_SE_48K/.cache/huggingface/download/last_best_checkpoint.pt.metadata +3 -0
  13. ckpts/MossFormer2_SE_48K/.gitattributes +35 -0
  14. ckpts/MossFormer2_SE_48K/README.md +64 -0
  15. ckpts/MossFormer2_SE_48K/last_best_checkpoint +1 -0
  16. ckpts/MossFormer2_SE_48K/last_best_checkpoint.pt +3 -0
  17. ckpts/MossFormer2_SS_16K/README.md +9 -0
  18. ckpts/MossFormer2_SS_16K/last_best_checkpoint +1 -0
  19. ckpts/MossFormer2_SS_16K/last_best_checkpoint.pt +3 -0
  20. ckpts/paraformer-zh/README.md +182 -0
  21. ckpts/paraformer-zh/am.mvn +8 -0
  22. ckpts/paraformer-zh/config.yaml +123 -0
  23. ckpts/paraformer-zh/configuration.json +14 -0
  24. ckpts/paraformer-zh/model.pt +3 -0
  25. ckpts/paraformer-zh/seg_dict +0 -0
  26. ckpts/paraformer-zh/tokens.json +0 -0
  27. ckpts/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/README.md +272 -0
  28. ckpts/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/config.yaml +46 -0
  29. ckpts/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/configuration.json +13 -0
  30. ckpts/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/model.pt +3 -0
  31. ckpts/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/tokens.json +0 -0
  32. ckpts/speaker-diarization-community-1/.DS_Store +0 -0
  33. ckpts/speaker-diarization-community-1/README.md +227 -0
  34. ckpts/speaker-diarization-community-1/config.yaml +21 -0
  35. ckpts/speaker-diarization-community-1/embedding/README.md +20 -0
  36. ckpts/speaker-diarization-community-1/embedding/pytorch_model.bin +3 -0
  37. ckpts/speaker-diarization-community-1/plda/README.md +3 -0
  38. ckpts/speaker-diarization-community-1/plda/plda.npz +3 -0
  39. ckpts/speaker-diarization-community-1/plda/xvec_transform.npz +3 -0
  40. ckpts/speaker-diarization-community-1/segmentation/pytorch_model.bin +3 -0
  41. ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/.mdl +0 -0
  42. ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/.msc +0 -0
  43. ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/.mv +1 -0
  44. ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/README.md +319 -0
  45. ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/configuration.json +26 -0
  46. ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/record.wav +3 -0
  47. ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk1.wav +3 -0
  48. ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk2.wav +3 -0
  49. ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk3.wav +3 -0
  50. ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk4.wav +3 -0
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/record.wav filter=lfs diff=lfs merge=lfs -text
37
+ ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk1.wav filter=lfs diff=lfs merge=lfs -text
38
+ ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk2.wav filter=lfs diff=lfs merge=lfs -text
39
+ ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk3.wav filter=lfs diff=lfs merge=lfs -text
40
+ ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk4.wav filter=lfs diff=lfs merge=lfs -text
ckpts/.DS_Store ADDED
Binary file (6.15 kB). View file
 
ckpts/DNSMOSPro_NISQA/model_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04b280a24ad9f0d3c2507140b69b1ae6ccbdba8cfc2ed6f2bec9c821e4794959
3
+ size 341198
ckpts/MossFormer2_SE_48K/.cache/huggingface/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *
ckpts/MossFormer2_SE_48K/.cache/huggingface/download/.gitattributes.lock ADDED
File without changes
ckpts/MossFormer2_SE_48K/.cache/huggingface/download/.gitattributes.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ eff8c97925c8bec812af707814b3e5d777fd4503
2
+ a6344aac8c09253b3b630fb776ae94478aa0275b
3
+ 1760600858.231903
ckpts/MossFormer2_SE_48K/.cache/huggingface/download/README.md.lock ADDED
File without changes
ckpts/MossFormer2_SE_48K/.cache/huggingface/download/README.md.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ eff8c97925c8bec812af707814b3e5d777fd4503
2
+ 3bd127640b52490ba6eda7c91738d5f3b826863b
3
+ 1760600857.7092524
ckpts/MossFormer2_SE_48K/.cache/huggingface/download/last_best_checkpoint.lock ADDED
File without changes
ckpts/MossFormer2_SE_48K/.cache/huggingface/download/last_best_checkpoint.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ eff8c97925c8bec812af707814b3e5d777fd4503
2
+ 98b04fb38c032a55f03f4a7583600e7e112b8e09
3
+ 1760600857.99231
ckpts/MossFormer2_SE_48K/.cache/huggingface/download/last_best_checkpoint.pt.lock ADDED
File without changes
ckpts/MossFormer2_SE_48K/.cache/huggingface/download/last_best_checkpoint.pt.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ eff8c97925c8bec812af707814b3e5d777fd4503
2
+ 03692b9f773bbd6bb43b9c5a41f96b1e28affd66e13796b7bec66ad3d8b227c6
3
+ 1760600960.8604465
ckpts/MossFormer2_SE_48K/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
ckpts/MossFormer2_SE_48K/README.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+
5
+ # Introduction
6
+
7
+ The MossFormer2_SE_48K model weights for 48 kHz speech enhancement in [ClearerVoice-Studio](https://github.com/modelscope/ClearerVoice-Studio/tree/main) repo.
8
+
9
+ This model is trained on large-scale datasets including open-source and private data.
10
+
11
+ It enhances speech audios by removing background noise.
12
+
13
+ # Install
14
+
15
+ **Clone the Repository**
16
+
17
+ ``` sh
18
+ git clone https://github.com/modelscope/ClearerVoice-Studio.git
19
+ ```
20
+
21
+ **Create Conda Environment**
22
+
23
+ ``` sh
24
+ cd ClearerVoice-Studio
25
+ conda create -n clearvoice python=3.8
26
+ conda activate clearvoice
27
+ pip install -r requirements.txt
28
+ ```
29
+
30
+ **Run Script**
31
+
32
+ Go to `clearvoice/` and use the following examples. The MossFormer2_SE_48K model will be downloaded from huggingface automatically.
33
+
34
+ Sample example 1: use speech enhancement model `MossFormer2_SE_48K` to process one wave file of `samples/input.wav` and save the output wave file to `samples/output_MossFormer2_SE_48K.wav`
35
+
36
+ ```python
37
+ from clearvoice import ClearVoice
38
+
39
+ myClearVoice = ClearVoice(task='speech_enhancement', model_names=['MossFormer2_SE_48K'])
40
+
41
+ output_wav = myClearVoice(input_path='samples/input.wav', online_write=False)
42
+
43
+ myClearVoice.write(output_wav, output_path='samples/output_MossFormer2_SE_48K.wav')
44
+ ```
45
+
46
+ Sample example 2: use speech enhancement model `MossFormer2_SE_48K` to process all input wave files in `samples/path_to_input_wavs/` and save all output files to `samples/path_to_output_wavs`
47
+
48
+ ```python
49
+ from clearvoice import ClearVoice
50
+
51
+ myClearVoice = ClearVoice(task='speech_enhancement', model_names=['MossFormer2_SE_48K'])
52
+
53
+ myClearVoice(input_path='samples/path_to_input_wavs', online_write=True, output_path='samples/path_to_output_wavs')
54
+ ```
55
+
56
+ Sample example 3: use speech enhancement model `MossFormer2_SE_48K` to process wave files listed in the `samples/scp/audio_samples.scp` file, and save all output files to `samples/path_to_output_wavs_scp/`
57
+
58
+ ```python
59
+ from clearvoice import ClearVoice
60
+
61
+ myClearVoice = ClearVoice(task='speech_enhancement', model_names=['MossFormer2_SE_48K'])
62
+
63
+ myClearVoice(input_path='samples/scp/audio_samples.scp', online_write=True, output_path='samples/path_to_output_wavs_scp')
64
+ ```
ckpts/MossFormer2_SE_48K/last_best_checkpoint ADDED
@@ -0,0 +1 @@
 
 
1
+ last_best_checkpoint.pt
ckpts/MossFormer2_SE_48K/last_best_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03692b9f773bbd6bb43b9c5a41f96b1e28affd66e13796b7bec66ad3d8b227c6
3
+ size 221552019
ckpts/MossFormer2_SS_16K/README.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+
5
+ The MossFormer2_SS_16K model weights for 16 kHz speech separation in [ClearerVoice-Studio](https://github.com/modelscope/ClearerVoice-Studio/tree/main) repo.
6
+
7
+ This model is trained on large-scale datasets including open-source and private data.
8
+
9
+ It separates mixed-speaker speech into each individual speaker's speech.
ckpts/MossFormer2_SS_16K/last_best_checkpoint ADDED
@@ -0,0 +1 @@
 
 
1
+ last_best_checkpoint.pt
ckpts/MossFormer2_SS_16K/last_best_checkpoint.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00a3a48bda492db1e829b85dd443f8f43a43039a3e90f1a24962ea9caf14a11a
3
+ size 670353271
ckpts/paraformer-zh/README.md ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ license_name: model-license
4
+ license_link: https://github.com/alibaba-damo-academy/FunASR
5
+ ---
6
+
7
+
8
+ # FunASR: A Fundamental End-to-End Speech Recognition Toolkit
9
+
10
+
11
+ [![PyPI](https://img.shields.io/pypi/v/funasr)](https://pypi.org/project/funasr/)
12
+
13
+
14
+ <strong>FunASR</strong> hopes to build a bridge between academic research and industrial applications on speech recognition. By supporting the training & finetuning of the industrial-grade speech recognition model, researchers and developers can conduct research and production of speech recognition models more conveniently, and promote the development of speech recognition ecology. ASR for Fun!
15
+
16
+ [**Highlights**](#highlights)
17
+ | [**News**](https://github.com/alibaba-damo-academy/FunASR#whats-new)
18
+ | [**Installation**](#installation)
19
+ | [**Quick Start**](#quick-start)
20
+ | [**Runtime**](./runtime/readme.md)
21
+ | [**Model Zoo**](#model-zoo)
22
+ | [**Contact**](#contact)
23
+
24
+
25
+ <a name="highlights"></a>
26
+ ## Highlights
27
+ - FunASR is a fundamental speech recognition toolkit that offers a variety of features, including speech recognition (ASR), Voice Activity Detection (VAD), Punctuation Restoration, Language Models, Speaker Verification, Speaker Diarization and multi-talker ASR. FunASR provides convenient scripts and tutorials, supporting inference and fine-tuning of pre-trained models.
28
+ - We have released a vast collection of academic and industrial pretrained models on the [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition) and [huggingface](https://huggingface.co/FunASR), which can be accessed through our [Model Zoo](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md). The representative [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), a non-autoregressive end-to-end speech recognition model, has the advantages of high accuracy, high efficiency, and convenient deployment, supporting the rapid construction of speech recognition services. For more details on service deployment, please refer to the [service deployment document](runtime/readme_cn.md).
29
+
30
+
31
+ <a name="Installation"></a>
32
+ ## Installation
33
+
34
+ ```shell
35
+ pip3 install -U funasr
36
+ ```
37
+ Or install from source code
38
+ ``` sh
39
+ git clone https://github.com/alibaba/FunASR.git && cd FunASR
40
+ pip3 install -e ./
41
+ ```
42
+ Install modelscope for the pretrained models (Optional)
43
+
44
+ ```shell
45
+ pip3 install -U modelscope
46
+ ```
47
+
48
+ ## Model Zoo
49
+ FunASR has open-sourced a large number of pre-trained models on industrial data. You are free to use, copy, modify, and share FunASR models under the [Model License Agreement](./MODEL_LICENSE). Below are some representative models; for more models, please refer to the [Model Zoo](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md).
50
+
51
+ (Note: 🤗 represents the Huggingface model zoo link, ⭐ represents the ModelScope model zoo link)
52
+
53
+
54
+ | Model Name | Task Details | Training Data | Parameters |
55
+ |:------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:--------------------------------------------------:|:--------------------------------:|:----------:|
56
+ | paraformer-zh <br> ([⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) [🤗]() ) | speech recognition, with timestamps, non-streaming | 60000 hours, Mandarin | 220M |
57
+ | <nobr>paraformer-zh-streaming <br> ( [⭐](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [🤗]() )</nobr> | speech recognition, streaming | 60000 hours, Mandarin | 220M |
58
+ | paraformer-en <br> ( [⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary) [🤗]() ) | speech recognition, with timestamps, non-streaming | 50000 hours, English | 220M |
59
+ | conformer-en <br> ( [⭐](https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary) [🤗]() ) | speech recognition, non-streaming | 50000 hours, English | 220M |
60
+ | ct-punc <br> ( [⭐](https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) [🤗]() ) | punctuation restoration | 100M, Mandarin and English | 1.1G |
61
+ | fsmn-vad <br> ( [⭐](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) [🤗]() ) | voice activity detection | 5000 hours, Mandarin and English | 0.4M |
62
+ | fa-zh <br> ( [⭐](https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) [🤗]() ) | timestamp prediction | 5000 hours, Mandarin | 38M |
63
+ | cam++ <br> ( [⭐](https://modelscope.cn/models/iic/speech_campplus_sv_zh-cn_16k-common/summary) [🤗]() ) | speaker verification/diarization | 5000 hours | 7.2M |
64
+
65
+
66
+
67
+
68
+ [//]: # ()
69
+ [//]: # (FunASR supports pre-trained or further fine-tuned models for deployment as a service. The CPU version of the Chinese offline file conversion service has been released, details can be found in [docs]&#40;funasr/runtime/docs/SDK_tutorial.md&#41;. More detailed information about service deployment can be found in the [deployment roadmap]&#40;funasr/runtime/readme_cn.md&#41;.)
70
+
71
+
72
+ <a name="quick-start"></a>
73
+ ## Quick Start
74
+
75
+ Below is a quick start tutorial. Test audio files ([Mandarin](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav), [English]()).
76
+
77
+ ### Command-line usage
78
+
79
+ ```shell
80
+ funasr +model=paraformer-zh +vad_model="fsmn-vad" +punc_model="ct-punc" +input=asr_example_zh.wav
81
+ ```
82
+
83
+ Note: Supports recognition of a single audio file, as well as a file list in Kaldi-style wav.scp format: `wav_id wav_path`
84
+
85
+ ### Speech Recognition (Non-streaming)
86
+ ```python
87
+ from funasr import AutoModel
88
+ # paraformer-zh is a multi-functional asr model
89
+ # use vad, punc, spk or not as you need
90
+ model = AutoModel(model="paraformer-zh", model_revision="v2.0.4",
91
+ vad_model="fsmn-vad", vad_model_revision="v2.0.4",
92
+ punc_model="ct-punc-c", punc_model_revision="v2.0.4",
93
+ # spk_model="cam++", spk_model_revision="v2.0.2",
94
+ )
95
+ res = model.generate(input=f"{model.model_path}/example/asr_example.wav",
96
+ batch_size_s=300,
97
+ hotword='魔搭')
98
+ print(res)
99
+ ```
100
+ Note: `model_hub`: represents the model repository, `ms` stands for selecting ModelScope download, `hf` stands for selecting Huggingface download.
101
+
102
+ ### Speech Recognition (Streaming)
103
+
104
+ ```python
105
+ from funasr import AutoModel
106
+
107
+ chunk_size = [0, 10, 5] # [0, 10, 5] 600ms, [0, 8, 4] 480ms
108
+ encoder_chunk_look_back = 4 # number of chunks to lookback for encoder self-attention
109
+ decoder_chunk_look_back = 1 # number of encoder chunks to lookback for decoder cross-attention
110
+
111
+ model = AutoModel(model="paraformer-zh-streaming", model_revision="v2.0.4")
112
+
113
+ import soundfile
114
+ import os
115
+
116
+ wav_file = os.path.join(model.model_path, "../fa-zh/example/asr_example.wav")
117
+ speech, sample_rate = soundfile.read(wav_file)
118
+ chunk_stride = chunk_size[1] * 960 # 600ms
119
+
120
+ cache = {}
121
+ total_chunk_num = int((len(speech) - 1) / chunk_stride + 1)
122
+ for i in range(total_chunk_num):
123
+ speech_chunk = speech[i * chunk_stride:(i + 1) * chunk_stride]
124
+ is_final = i == total_chunk_num - 1
125
+ res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size,
126
+ encoder_chunk_look_back=encoder_chunk_look_back,
127
+ decoder_chunk_look_back=decoder_chunk_look_back)
128
+ print(res)
129
+ ```
130
+ Note: `chunk_size` is the configuration for streaming latency. `[0,10,5]` indicates that the real-time display granularity is `10*60=600ms`, and the lookahead information is `5*60=300ms`. Each inference input is `600ms` (sample points are `16000*0.6=9600`), and the output is the corresponding text. For the last speech segment input, `is_final=True` needs to be set to force the output of the last word.
131
+
132
+ ### Voice Activity Detection (Non-Streaming)
133
+ ```python
134
+ from funasr import AutoModel
135
+
136
+ model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")
137
+ wav_file = f"{model.model_path}/example/asr_example.wav"
138
+ res = model.generate(input=wav_file)
139
+ print(res)
140
+ ```
141
+ ### Voice Activity Detection (Streaming)
142
+ ```python
143
+ from funasr import AutoModel
144
+
145
+ chunk_size = 200 # ms
146
+ model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")
147
+
148
+ import soundfile
149
+
150
+ wav_file = f"{model.model_path}/example/vad_example.wav"
151
+ speech, sample_rate = soundfile.read(wav_file)
152
+ chunk_stride = int(chunk_size * sample_rate / 1000)
153
+
154
+ cache = {}
155
+ total_chunk_num = int((len(speech)-1)/chunk_stride+1)
156
+ for i in range(total_chunk_num):
157
+ speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
158
+ is_final = i == total_chunk_num - 1
159
+ res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
160
+ if len(res[0]["value"]):
161
+ print(res)
162
+ ```
163
+ ### Punctuation Restoration
164
+ ```python
165
+ from funasr import AutoModel
166
+
167
+ model = AutoModel(model="ct-punc", model_revision="v2.0.4")
168
+ res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
169
+ print(res)
170
+ ```
171
+ ### Timestamp Prediction
172
+ ```python
173
+ from funasr import AutoModel
174
+
175
+ model = AutoModel(model="fa-zh", model_revision="v2.0.4")
176
+ wav_file = f"{model.model_path}/example/asr_example.wav"
177
+ text_file = f"{model.model_path}/example/text.txt"
178
+ res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
179
+ print(res)
180
+ ```
181
+
182
+ For more examples, refer to the [docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining)
ckpts/paraformer-zh/am.mvn ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <Nnet>
2
+ <Splice> 560 560
3
+ [ 0 ]
4
+ <AddShift> 560 560
5
+ <LearnRateCoef> 0 [ -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 
-13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 
-8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 ]
6
+ <Rescale> 560 560
7
+ <LearnRateCoef> 0 [ 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 
0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 
0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 ]
8
+ </Nnet>
ckpts/paraformer-zh/config.yaml ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # network architecture
3
+ model: Paraformer
4
+ model_conf:
5
+ ctc_weight: 0.0
6
+ lsm_weight: 0.1
7
+ length_normalized_loss: true
8
+ predictor_weight: 1.0
9
+ predictor_bias: 1
10
+ sampling_ratio: 0.75
11
+
12
+ # encoder
13
+ encoder: SANMEncoder
14
+ encoder_conf:
15
+ output_size: 512
16
+ attention_heads: 4
17
+ linear_units: 2048
18
+ num_blocks: 50
19
+ dropout_rate: 0.1
20
+ positional_dropout_rate: 0.1
21
+ attention_dropout_rate: 0.1
22
+ input_layer: pe
23
+ pos_enc_class: SinusoidalPositionEncoder
24
+ normalize_before: true
25
+ kernel_size: 11
26
+ sanm_shfit: 0
27
+ selfattention_layer_type: sanm
28
+
29
+ # decoder
30
+ decoder: ParaformerSANMDecoder
31
+ decoder_conf:
32
+ attention_heads: 4
33
+ linear_units: 2048
34
+ num_blocks: 16
35
+ dropout_rate: 0.1
36
+ positional_dropout_rate: 0.1
37
+ self_attention_dropout_rate: 0.1
38
+ src_attention_dropout_rate: 0.1
39
+ att_layer_num: 16
40
+ kernel_size: 11
41
+ sanm_shfit: 0
42
+
43
+ predictor: CifPredictorV2
44
+ predictor_conf:
45
+ idim: 512
46
+ threshold: 1.0
47
+ l_order: 1
48
+ r_order: 1
49
+ tail_threshold: 0.45
50
+
51
+ # frontend related
52
+ frontend: WavFrontend
53
+ frontend_conf:
54
+ fs: 16000
55
+ window: hamming
56
+ n_mels: 80
57
+ frame_length: 25
58
+ frame_shift: 10
59
+ lfr_m: 7
60
+ lfr_n: 6
61
+
62
+ specaug: SpecAugLFR
63
+ specaug_conf:
64
+ apply_time_warp: false
65
+ time_warp_window: 5
66
+ time_warp_mode: bicubic
67
+ apply_freq_mask: true
68
+ freq_mask_width_range:
69
+ - 0
70
+ - 30
71
+ lfr_rate: 6
72
+ num_freq_mask: 1
73
+ apply_time_mask: true
74
+ time_mask_width_range:
75
+ - 0
76
+ - 12
77
+ num_time_mask: 1
78
+
79
+ train_conf:
80
+ accum_grad: 1
81
+ grad_clip: 5
82
+ max_epoch: 150
83
+ val_scheduler_criterion:
84
+ - valid
85
+ - acc
86
+ best_model_criterion:
87
+ - - valid
88
+ - acc
89
+ - max
90
+ keep_nbest_models: 10
91
+ log_interval: 50
92
+
93
+ optim: adam
94
+ optim_conf:
95
+ lr: 0.0005
96
+ scheduler: warmuplr
97
+ scheduler_conf:
98
+ warmup_steps: 30000
99
+
100
+ dataset: AudioDataset
101
+ dataset_conf:
102
+ index_ds: IndexDSJsonl
103
+ batch_sampler: DynamicBatchLocalShuffleSampler
104
+ batch_type: example # example or length
105
+ batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
106
+ max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,
107
+ buffer_size: 500
108
+ shuffle: True
109
+ num_workers: 0
110
+
111
+ tokenizer: CharTokenizer
112
+ tokenizer_conf:
113
+ unk_symbol: <unk>
114
+ split_with_space: true
115
+
116
+
117
+ input_size: 560
118
+ ctc_conf:
119
+ dropout_rate: 0.0
120
+ ctc_type: builtin
121
+ reduce: true
122
+ ignore_nan_grad: true
123
+ normalize: null
ckpts/paraformer-zh/configuration.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "framework": "pytorch",
3
+ "task" : "auto-speech-recognition",
4
+ "model": {"type" : "funasr"},
5
+ "pipeline": {"type":"funasr-pipeline"},
6
+ "model_name_in_hub": {
7
+ "ms":"iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
8
+ "hf":""},
9
+ "file_path_metas": {
10
+ "init_param":"model.pt",
11
+ "config":"config.yaml",
12
+ "tokenizer_conf": {"token_list": "tokens.json", "seg_dict_file": "seg_dict"},
13
+ "frontend_conf":{"cmvn_file": "am.mvn"}}
14
+ }
ckpts/paraformer-zh/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bba782a5e9196166233b9ab12ba04cadff9ef9212b4ff6153ed9290ff679025
3
+ size 880502012
ckpts/paraformer-zh/seg_dict ADDED
The diff for this file is too large to render. See raw diff
 
ckpts/paraformer-zh/tokens.json ADDED
The diff for this file is too large to render. See raw diff
 
ckpts/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/README.md ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tasks:
3
+ - punctuation
4
+ domain:
5
+ - audio
6
+ model-type:
7
+ - Classification
8
+ frameworks:
9
+ - pytorch
10
+ metrics:
11
+ - f1_score
12
+ license: Apache License 2.0
13
+ language:
14
+ - cn
15
+ tags:
16
+ - FunASR
17
+ - CT-Transformer
18
+ - Alibaba
19
+ - ICASSP 2020
20
+ datasets:
21
+ train:
22
+ - 33M-samples online data
23
+ test:
24
+ - wikipedia data test
25
+ - 10000 industrial Mandarin sentences test
26
+ widgets:
27
+ - task: punctuation
28
+ model_revision: v2.0.4
29
+ inputs:
30
+ - type: text
31
+ name: input
32
+ title: 文本
33
+ examples:
34
+ - name: 1
35
+ title: 示例1
36
+ inputs:
37
+ - name: input
38
+ data: 我们都是木头人不会讲话不会动
39
+ inferencespec:
40
+ cpu: 1 #CPU数量
41
+ memory: 4096
42
+ ---
43
+
44
+ # Controllable Time-delay Transformer模型介绍
45
+
46
+ [//]: # (Controllable Time-delay Transformer 模型是一种端到端标点分类模型。)
47
+
48
+ [//]: # (常规的Transformer会依赖很远的未来信息,导致长时间结果不固定。Controllable Time-delay Transformer 在效果无损的情况下,有效控制标点的延时。)
49
+
50
+ # Highlights
51
+ - 中文标点通用模型:可用于语音识别模型输出文本的标点预测。
52
+ - 基于[Paraformer-large长音频模型](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)场景的使用
53
+ - 基于[FunASR框架](https://github.com/alibaba-damo-academy/FunASR),可进行ASR,VAD,标点的自由组合
54
+ - 基于纯文本输入的标点预测
55
+
56
+ ## <strong>[FunASR开源项目介绍](https://github.com/alibaba-damo-academy/FunASR)</strong>
57
+ <strong>[FunASR](https://github.com/alibaba-damo-academy/FunASR)</strong>希望在语音识别的学术研究和工业应用之间架起一座桥梁。通过发布工业级语音识别模型的训练和微调,研究人员和开发人员可以更方便地进行语音识别模型的研究和生产,并推动语音识别生态的发展。让语音识别更有趣!
58
+
59
+ [**github仓库**](https://github.com/alibaba-damo-academy/FunASR)
60
+ | [**最新动态**](https://github.com/alibaba-damo-academy/FunASR#whats-new)
61
+ | [**环境安装**](https://github.com/alibaba-damo-academy/FunASR#installation)
62
+ | [**服务部署**](https://www.funasr.com)
63
+ | [**模型库**](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo)
64
+ | [**联系我们**](https://github.com/alibaba-damo-academy/FunASR#contact)
65
+
66
+
67
+ ## 模型原理介绍
68
+
69
+ Controllable Time-delay Transformer是达摩院语音团队提出的高效后处理框架中的标点模块。本项目为中文通用标点模型,模型可以被应用于文本类输入的标点预测,也可应用于语音识别结果的后处理步骤,协助语音识别模块输出具有可读性的文本结果。
70
+
71
+ <p align="center">
72
+ <img src="fig/struct.png" alt="Controllable Time-delay Transformer模型结构" width="500" />
73
+
74
+ Controllable Time-delay Transformer 模型结构如上图所示,由 Embedding、Encoder 和 Predictor 三部分组成。Embedding 是词向量叠加位置向量。Encoder可以采用不同的网络结构,例如self-attention,conformer,SAN-M等。Predictor 预测每个token后的标点类型。
75
+
76
+ 在模型的选择上采用了性能优越的Transformer模型。Transformer模型在获得良好性能的同时,由于模型自身序列化输入等特性,会给系统带来较大时延。常规的Transformer可以看到未来的全部信息,导致标点会依赖很远的未来信息。这会给用户带来一种标点一直在变化刷新,长时间结果不固定的不良感受。基于这一问题,我们创新性的提出了可控时延的Transformer模型(Controllable Time-Delay Transformer, CT-Transformer),在模型性能无损失的情况下,有效控制标点的延时。
77
+
78
+ 更详细的细节见:
79
+ - 论文: [CONTROLLABLE TIME-DELAY TRANSFORMER FOR REAL-TIME PUNCTUATION PREDICTION AND DISFLUENCY DETECTION](https://arxiv.org/pdf/2003.01309.pdf)
80
+
81
+ ## 基于ModelScope进行推理
82
+
83
+ 以下为三种支持格式及api调用方式参考如下范例:
84
+ - text.scp文件路径,例如example/punc_example.txt,格式为: key + "\t" + value
85
+ ```sh
86
+ cat example/punc_example.txt
87
+ 1 跨境河流是养育沿岸人民的生命之源
88
+ 2 从存储上来说仅仅是全景图片它就会是图片的四倍的容量
89
+ 3 那今天的会就到这里吧happy new year明年见
90
+ ```
91
+ ```python
92
+ from modelscope.pipelines import pipeline
93
+ from modelscope.utils.constant import Tasks
94
+
95
+ inference_pipline = pipeline(
96
+ task=Tasks.punctuation,
97
+ model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
98
+ model_revision="v2.0.4")
99
+
100
+ rec_result = inference_pipline(input='example/punc_example.txt')
101
+ print(rec_result)
102
+ ```
103
+ - text二进制数据,例如:用户直接从文件里读出bytes数据
104
+ ```python
105
+ rec_result = inference_pipline(input='我们都是木头人不会讲话不会动')
106
+ ```
107
+ - text文件url,例如:https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt
108
+ ```python
109
+ rec_result = inference_pipline(input='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt')
110
+ ```
111
+
112
+
113
+ ## 基于FunASR进行推理
114
+
115
+ 下面为快速上手教程,测试音频([中文](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav),[英文](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav))
116
+
117
+ ### 可执行命令行
118
+ 在命令行终端执行:
119
+
120
+ ```shell
121
+ funasr ++model=paraformer-zh ++vad_model="fsmn-vad" ++punc_model="ct-punc" ++input=vad_example.wav
122
+ ```
123
+
124
+ 注:支持单条音频文件识别,也支持文件列表,列表为kaldi风格wav.scp:`wav_id wav_path`
125
+
126
+ ### python示例
127
+ #### 非实时语音识别
128
+ ```python
129
+ from funasr import AutoModel
130
+ # paraformer-zh is a multi-functional asr model
131
+ # use vad, punc, spk or not as you need
132
+ model = AutoModel(model="paraformer-zh", model_revision="v2.0.4",
133
+ vad_model="fsmn-vad", vad_model_revision="v2.0.4",
134
+ punc_model="ct-punc-c", punc_model_revision="v2.0.4",
135
+ # spk_model="cam++", spk_model_revision="v2.0.2",
136
+ )
137
+ res = model.generate(input=f"{model.model_path}/example/asr_example.wav",
138
+ batch_size_s=300,
139
+ hotword='魔搭')
140
+ print(res)
141
+ ```
142
+ 注:`model_hub`:表示模型仓库,`ms`为选择modelscope下载,`hf`为选择huggingface下载。
143
+
144
+ #### 实时语音识别
145
+
146
+ ```python
147
+ from funasr import AutoModel
148
+
149
+ chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
150
+ encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
151
+ decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
152
+
153
+ model = AutoModel(model="paraformer-zh-streaming", model_revision="v2.0.4")
154
+
155
+ import soundfile
156
+ import os
157
+
158
+ wav_file = os.path.join(model.model_path, "example/asr_example.wav")
159
+ speech, sample_rate = soundfile.read(wav_file)
160
+ chunk_stride = chunk_size[1] * 960 # 600ms
161
+
162
+ cache = {}
163
+ total_chunk_num = int((len(speech)-1)/chunk_stride+1)
164
+ for i in range(total_chunk_num):
165
+ speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
166
+ is_final = i == total_chunk_num - 1
167
+ res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
168
+ print(res)
169
+ ```
170
+
171
+ 注:`chunk_size`为流式延时配置,`[0,10,5]`表示上屏实时出字粒度为`10*60=600ms`,未来信息为`5*60=300ms`。每次推理输入为`600ms`(采样点数为`16000*0.6=9600`),输出为对应文字,最后一个语音片段输入需要设置`is_final=True`来强制输出最后一个字。
172
+
173
+ #### 语音端点检测(非实时)
174
+ ```python
175
+ from funasr import AutoModel
176
+
177
+ model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")
178
+
179
+ wav_file = f"{model.model_path}/example/asr_example.wav"
180
+ res = model.generate(input=wav_file)
181
+ print(res)
182
+ ```
183
+
184
+ #### 语音端点检测(实时)
185
+ ```python
186
+ from funasr import AutoModel
187
+
188
+ chunk_size = 200 # ms
189
+ model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")
190
+
191
+ import soundfile
192
+
193
+ wav_file = f"{model.model_path}/example/vad_example.wav"
194
+ speech, sample_rate = soundfile.read(wav_file)
195
+ chunk_stride = int(chunk_size * sample_rate / 1000)
196
+
197
+ cache = {}
198
+ total_chunk_num = int((len(speech)-1)/chunk_stride+1)
199
+ for i in range(total_chunk_num):
200
+ speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
201
+ is_final = i == total_chunk_num - 1
202
+ res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
203
+ if len(res[0]["value"]):
204
+ print(res)
205
+ ```
206
+
207
+ #### 标点恢复
208
+ ```python
209
+ from funasr import AutoModel
210
+
211
+ model = AutoModel(model="ct-punc", model_revision="v2.0.4")
212
+
213
+ res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
214
+ print(res)
215
+ ```
216
+
217
+ #### 时间戳预测
218
+ ```python
219
+ from funasr import AutoModel
220
+
221
+ model = AutoModel(model="fa-zh", model_revision="v2.0.4")
222
+
223
+ wav_file = f"{model.model_path}/example/asr_example.wav"
224
+ text_file = f"{model.model_path}/example/text.txt"
225
+ res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
226
+ print(res)
227
+ ```
228
+
229
+ 更多详细用法([示例](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining))
230
+
231
+
232
+ ## 微调
233
+
234
+ 详细用法([示例](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining))
235
+
236
+
237
+
238
+
239
+
240
+ ## Benchmark
241
+ 中文标点预测通用模型在自采集的通用领域业务场景数据上有良好效果。训练数据大约33M个sample,每个sample可能包含1句或多句。
242
+
243
+ ### 自采集数据(20000+ samples)
244
+
245
+ | precision | recall | f1_score |
246
+ |:------------------------------------:|:-------------------------------------:|:-------------------------------------:|
247
+ | <div style="width: 150pt">53.8</div> | <div style="width: 150pt">60.0</div> | <div style="width: 150pt">56.5</div> |
248
+
249
+ ## 使用方式以及适用范围
250
+
251
+ 运行范围
252
+ - 支持Linux-x86_64、Mac和Windows运行。
253
+
254
+ 使用方式
255
+ - 直接推理:可以直接对输入文本进行计算,输出带有标点的目标文字。
256
+
257
+ 使用范围与目标场景
258
+ - 适合对文本数据进行标点预测,文本长度不限。
259
+
260
+ ## 相关论文以及引用信息
261
+
262
+ ```BibTeX
263
+ @inproceedings{chen2020controllable,
264
+ title={Controllable Time-Delay Transformer for Real-Time Punctuation Prediction and Disfluency Detection},
265
+ author={Chen, Qian and Chen, Mengzhe and Li, Bo and Wang, Wen},
266
+ booktitle={ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
267
+ pages={8069--8073},
268
+ year={2020},
269
+ organization={IEEE}
270
+ }
271
+ ```
272
+
ckpts/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/config.yaml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model: CTTransformer
2
+ model_conf:
3
+ ignore_id: 0
4
+ embed_unit: 256
5
+ att_unit: 256
6
+ dropout_rate: 0.1
7
+ punc_list:
8
+ - <unk>
9
+ - _
10
+ - ,
11
+ - 。
12
+ - ?
13
+ - 、
14
+ punc_weight:
15
+ - 1.0
16
+ - 1.0
17
+ - 1.0
18
+ - 1.0
19
+ - 1.0
20
+ - 1.0
21
+ sentence_end_id: 3
22
+
23
+ encoder: SANMEncoder
24
+ encoder_conf:
25
+ input_size: 256
26
+ output_size: 256
27
+ attention_heads: 8
28
+ linear_units: 1024
29
+ num_blocks: 4
30
+ dropout_rate: 0.1
31
+ positional_dropout_rate: 0.1
32
+ attention_dropout_rate: 0.0
33
+ input_layer: pe
34
+ pos_enc_class: SinusoidalPositionEncoder
35
+ normalize_before: true
36
+ kernel_size: 11
37
+ sanm_shfit: 0
38
+ selfattention_layer_type: sanm
39
+ padding_idx: 0
40
+
41
+ tokenizer: CharTokenizer
42
+ tokenizer_conf:
43
+ unk_symbol: <unk>
44
+
45
+
46
+
ckpts/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/configuration.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "framework": "pytorch",
3
+ "task" : "punctuation",
4
+ "model": {"type" : "funasr"},
5
+ "pipeline": {"type":"funasr-pipeline"},
6
+ "model_name_in_hub": {
7
+ "ms":"iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
8
+ "hf":""},
9
+ "file_path_metas": {
10
+ "init_param":"model.pt",
11
+ "config":"config.yaml",
12
+ "tokenizer_conf": {"token_list": "tokens.json"}}
13
+ }
ckpts/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5818bb9d933805a916eebe41eb41648f7f9caad30b4bd59d56f3ca135421916
3
+ size 291979892
ckpts/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/tokens.json ADDED
The diff for this file is too large to render. See raw diff
 
ckpts/speaker-diarization-community-1/.DS_Store ADDED
Binary file (6.15 kB). View file
 
ckpts/speaker-diarization-community-1/README.md ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - pyannote
4
+ - pyannote-audio
5
+ - pyannote-audio-pipeline
6
+ - audio
7
+ - voice
8
+ - speech
9
+ - speaker
10
+ - speaker-diarization
11
+ - speaker-change-detection
12
+ - voice-activity-detection
13
+ - overlapped-speech-detection
14
+ - automatic-speech-recognition
15
+ license: cc-by-4.0
16
+ extra_gated_prompt: "Your input helps us strengthen the pyannote community and improve our open-source offerings. This pipeline is released under the CC-BY-4.0 license and will always remain freely accessible. By providing your details, you agree that we may email you occasionally with important news about pyannote models, invitations to try premium pipelines, and information about specific services designed for researchers and professionals like you."
17
+ extra_gated_fields:
18
+ Company/university: text
19
+ Use case:
20
+ type: select
21
+ options:
22
+ - label: Meeting note taker (automated meeting transcription, action item extraction, and speaker identification in recordings)
23
+ value: meeting
24
+ - label: Conversation AI (chatbots, voice assistants, multi-turn dialogue systems with speaker awareness)
25
+ value: conversation
26
+ - label: CCaaS and customer experience (call center analytics, customer service optimization, and interaction quality monitoring)
27
+ value: ccaas
28
+ - label: Voice agents (AI-powered phone systems, automated customer service, voice-based interactions)
29
+ value: agent
30
+ - label: Media and automated dubbing (content creation, podcast processing, video production, and multilingual media)
31
+ value: dubbing
32
+ - label: Training and development (educational content analysis, corporate training evaluation, and learning assessment tools)
33
+ value: training
34
+ - label: Other
35
+ value: other
36
+ ---
37
+
38
+ # `community-1` speaker diarization
39
+
40
+ This pipeline ingests mono audio sampled at 16kHz and outputs speaker diarization.
41
+
42
+ - stereo or multi-channel audio files are automatically downmixed to mono by averaging the channels.
43
+ - audio files sampled at a different rate are resampled to 16kHz automatically upon loading.
44
+
45
+ The [main improvements brought by `Community-1`](https://www.pyannote.ai/blog/community-1) are:
46
+
47
+ - [improved](#benchmark) speaker assignment and counting
48
+ - simpler reconciliation with transcription timestamps with [*exclusive*](#exclusive-speaker-diarization) speaker diarization
49
+ - easy [offline use](#offline-use) (i.e. without internet connection)
50
+ - (optionally) [hosted](https://hf.co/pyannote/speaker-diarization-community-1-cloud) on pyannoteAI cloud
51
+
52
+
53
+ ## Setup
54
+
55
+ 1. `pip install pyannote.audio`
56
+ 2. Accept user conditions
57
+ 3. Create access token at [`hf.co/settings/tokens`](https://hf.co/settings/tokens).
58
+
59
+ ## Quick start
60
+
61
+ ```python
62
+ # download the pipeline from Huggingface
63
+ from pyannote.audio import Pipeline
64
+ pipeline = Pipeline.from_pretrained(
65
+ "pyannote/speaker-diarization-community-1",
66
+ token="{huggingface-token}")
67
+
68
+ # run the pipeline locally on your computer
69
+ output = pipeline("audio.wav")
70
+
71
+ # print the predicted speaker diarization
72
+ for turn, speaker in output.speaker_diarization:
73
+ print(f"{speaker} speaks between t={turn.start:.3f}s and t={turn.end:.3f}s")
74
+ ```
75
+
76
+ ## Benchmark
77
+
78
+ Out of the box, `Community-1` is much better than `speaker-diarization-3.1`.
79
+
80
+ We report [diarization error rates](http://pyannote.github.io/pyannote-metrics/reference.html#diarization) (in %) on a large collection of academic benchmarks (fully automatic processing, no forgiveness collar, nor skipping overlapping speech).
81
+
82
+ | Benchmark (last updated in 2025-09) | <a href="https://hf.co/pyannote/speaker-diarization-3.1">`legacy` (3.1)</a>| <a href="https://www.pyannote.ai/blog/community-1">`community-1`</a> | <a href="https://www.pyannote.ai/blog/precision-2">`precision-2`</a> |
83
+ | --------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------ | -------------------------------------------------| ------------------------------------------------ |
84
+ | [AISHELL-4](https://arxiv.org/abs/2104.03603) | 12.2 | 11.7 | 11.4 |
85
+ | [AliMeeting](https://www.openslr.org/119/) (channel 1) | 24.5 | 20.3 | 15.2 |
86
+ | [AMI](https://groups.inf.ed.ac.uk/ami/corpus/) (IHM) | 18.8 | 17.0 | 12.9 |
87
+ | [AMI](https://groups.inf.ed.ac.uk/ami/corpus/) (SDM) | 22.7 | 19.9 | 15.6 |
88
+ | [AVA-AVD](https://arxiv.org/abs/2111.14448) | 49.7 | 44.6 | 37.1 |
89
+ | [CALLHOME](https://catalog.ldc.upenn.edu/LDC2001S97) ([part 2](https://github.com/BUTSpeechFIT/CALLHOME_sublists/issues/1)) | 28.5 | 26.7 | 16.6 |
90
+ | [DIHARD 3](https://catalog.ldc.upenn.edu/LDC2022S14) ([full](https://arxiv.org/abs/2012.01477)) | 21.4 | 20.2 | 14.7 |
91
+ | [Ego4D](https://arxiv.org/abs/2110.07058) (dev.) | 51.2 | 46.8 | 39.0 |
92
+ | [MSDWild](https://github.com/X-LANCE/MSDWILD) | 25.4 | 22.8 | 17.3 |
93
+ | [RAMC](https://www.openslr.org/123/) | 22.2 | 20.8 | 10.5 |
94
+ | [REPERE](https://www.islrn.org/resources/360-758-359-485-0/) (phase2) | 7.9 | 8.9 | 7.4 |
95
+ | [VoxConverse](https://github.com/joonson/voxconverse) (v0.3) | 11.2 | 11.2 | 8.5 |
96
+
97
+ `Precision-2` model is even better and can be tested like this:
98
+
99
+ 1. Create an API key on [pyannoteAI dashboard](https://dashboard.pyannote.ai) (free credits included)
100
+ 2. Change one line of code
101
+
102
+ ```diff
103
+ from pyannote.audio import Pipeline
104
+ pipeline = Pipeline.from_pretrained(
105
+ - 'pyannote/speaker-diarization-community-1', token="{huggingface-token}")
106
+ + 'pyannote/speaker-diarization-precision-2', token="{pyannoteAI-api-key}")
107
+ diarization = pipeline("audio.wav") # runs on pyannoteAI servers
108
+ ```
109
+
110
+ ## Processing on GPU
111
+
112
+ `pyannote.audio` pipelines run on CPU by default.
113
+ You can send them to GPU with the following lines:
114
+
115
+ ```python
116
+ import torch
117
+ pipeline.to(torch.device("cuda"))
118
+ ```
119
+
120
+ ## Processing from memory
121
+
122
+ Pre-loading audio files in memory may result in faster processing:
123
+
124
+ ```python
125
+ waveform, sample_rate = torchaudio.load("audio.wav")
126
+ output = pipeline({"waveform": waveform, "sample_rate": sample_rate})
127
+ ```
128
+
129
+ ## Monitoring progress
130
+
131
+ Hooks are available to monitor the progress of the pipeline:
132
+
133
+ ```python
134
+ from pyannote.audio.pipelines.utils.hook import ProgressHook
135
+ with ProgressHook() as hook:
136
+ output = pipeline("audio.wav", hook=hook)
137
+ ```
138
+
139
+ ## Controlling the number of speakers
140
+
141
+ In case the number of speakers is known in advance, one can use the `num_speakers` option:
142
+
143
+ ```python
144
+ output = pipeline("audio.wav", num_speakers=2)
145
+ ```
146
+
147
+ One can also provide lower and/or upper bounds on the number of speakers using `min_speakers` and `max_speakers` options:
148
+
149
+ ```python
150
+ output = pipeline("audio.wav", min_speakers=2, max_speakers=5)
151
+ ```
152
+
153
+ ## Exclusive speaker diarization
154
+
155
+ `Community-1` pretrained pipeline returns a new *exclusive* speaker diarization, on top of the regular speaker diarization, available as `output.exclusive_speaker_diarization`.
156
+
157
+ This is a feature which is [backported from our latest commercial model](https://www.pyannote.ai/blog/precision-2) that simplifies the reconciliation between fine-grained speaker diarization timestamps and (sometimes not so precise) transcription timestamps.
158
+
159
+ ## Offline use
160
+
161
+ 1. In the terminal, copy the pipeline on disk:
162
+
163
+ ```bash
164
+ # make sure git-lfs is installed (https://git-lfs.com)
165
+ git lfs install
166
+
167
+ # create a directory on disk
168
+ mkdir /path/to/directory
169
+
170
+ # when prompted for a password, use an access token with write permissions.
171
+ # generate one from your settings: https://huggingface.co/settings/tokens
172
+ git clone https://hf.co/pyannote/speaker-diarization-community-1 /path/to/directory/pyannote-speaker-diarization-community-1
173
+ ```
174
+
175
+ 2. In Python, use the pipeline without internet connection:
176
+
177
+ ```python
178
+ # load pipeline from disk (works without internet connection)
179
+ from pyannote.audio import Pipeline
180
+ pipeline = Pipeline.from_pretrained('/path/to/directory/pyannote-speaker-diarization-community-1')
181
+
182
+ # run the pipeline locally on your computer
183
+ output = pipeline("audio.wav")
184
+ ```
185
+
186
+ ## Citations
187
+
188
+ 1. Speaker segmentation model
189
+
190
+ ```bibtex
191
+ @inproceedings{Plaquet23,
192
+ author={Alexis Plaquet and Hervé Bredin},
193
+ title={{Powerset multi-class cross entropy loss for neural speaker diarization}},
194
+ year=2023,
195
+ booktitle={Proc. INTERSPEECH 2023},
196
+ }
197
+ ```
198
+
199
+ 2. Speaker embedding model
200
+
201
+ ```bibtex
202
+ @inproceedings{Wang2023,
203
+ title={Wespeaker: A research and production oriented speaker embedding learning toolkit},
204
+ author={Wang, Hongji and Liang, Chengdong and Wang, Shuai and Chen, Zhengyang and Zhang, Binbin and Xiang, Xu and Deng, Yanlei and Qian, Yanmin},
205
+ booktitle={ICASSP 2023, IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
206
+ pages={1--5},
207
+ year={2023},
208
+ organization={IEEE}
209
+ }
210
+ ```
211
+
212
+
213
+ 3. Speaker clustering
214
+
215
+ ```bibtex
216
+ @article{Landini2022,
217
+ author={Landini, Federico and Profant, J{\'a}n and Diez, Mireia and Burget, Luk{\'a}{\v{s}}},
218
+ title={{Bayesian HMM clustering of x-vector sequences (VBx) in speaker diarization: theory, implementation and analysis on standard tasks}},
219
+ year={2022},
220
+ journal={Computer Speech \& Language},
221
+ }
222
+ ```
223
+
224
+ ## Acknowledgment
225
+
226
+ Training and tuning made possible thanks to [GENCI](https://www.genci.fr/) on the [**Jean Zay**](http://www.idris.fr/eng/jean-zay/) supercomputer.
227
+
ckpts/speaker-diarization-community-1/config.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dependencies:
2
+ pyannote.audio: 4.0.0
3
+
4
+ pipeline:
5
+ name: pyannote.audio.pipelines.SpeakerDiarization
6
+ params:
7
+ clustering: VBxClustering
8
+ segmentation: ckpts/speaker-diarization-community-1/segmentation/pytorch_model.bin
9
+ segmentation_batch_size: 32
10
+ embedding: ckpts/speaker-diarization-community-1/embedding/pytorch_model.bin
11
+ embedding_batch_size: 32
12
+ embedding_exclude_overlap: true
13
+ plda: ckpts/speaker-diarization-community-1/plda
14
+
15
+ params:
16
+ clustering:
17
+ threshold: 0.6
18
+ Fa: 0.07
19
+ Fb: 0.8
20
+ segmentation:
21
+ min_duration_off: 0.0
ckpts/speaker-diarization-community-1/embedding/README.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copied from https://huggingface.co/pyannote/wespeaker-voxceleb-resnet34-LM
2
+
3
+ ## License
4
+
5
+ According to [this page](https://github.com/wenet-e2e/wespeaker/blob/master/docs/pretrained.md):
6
+
7
+ > The pretrained model in WeNet follows the license of it's corresponding dataset. For example, the pretrained model on VoxCeleb follows Creative Commons Attribution 4.0 International License., since it is used as license of the VoxCeleb dataset, see https://mm.kaist.ac.kr/datasets/voxceleb/.
8
+
9
+ ## Citation
10
+
11
+ ```bibtex
12
+ @inproceedings{Wang2023,
13
+ title={Wespeaker: A research and production oriented speaker embedding learning toolkit},
14
+ author={Wang, Hongji and Liang, Chengdong and Wang, Shuai and Chen, Zhengyang and Zhang, Binbin and Xiang, Xu and Deng, Yanlei and Qian, Yanmin},
15
+ booktitle={ICASSP 2023, IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
16
+ pages={1--5},
17
+ year={2023},
18
+ organization={IEEE}
19
+ }
20
+ ```
ckpts/speaker-diarization-community-1/embedding/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f10ff60898a1d185fa22e1d11e0bfa8a92efec811f11bca48cb8cafebefd929
3
+ size 26646242
ckpts/speaker-diarization-community-1/plda/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ PLDA model trained by [BUT Speech@FIT](https://speech.fit.vut.cz/) group.
2
+
3
+ Thanks to [Jiangyu Han](https://github.com/jyhan03) and [Petr Pálka](https://github.com/Selesnyan) for the integration of VBx in pyannote.audio.
ckpts/speaker-diarization-community-1/plda/plda.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b77bcd840692710dd3496f62ecfeed8d8e5f002fd991b785079b244eab7d255
3
+ size 133852
ckpts/speaker-diarization-community-1/plda/xvec_transform.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:325f1ce8e48f7e55e9c8aa47e05d2766b7c48c4b25b8de8dd751e7a4cc5fbe8f
3
+ size 134376
ckpts/speaker-diarization-community-1/segmentation/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ad24338d844fb95985486eb1a464e32d229f6d7a03c9abe60f978bacf3f816e
3
+ size 5906507
ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/.mdl ADDED
Binary file (85 Bytes). View file
 
ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/.msc ADDED
Binary file (991 Bytes). View file
 
ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/.mv ADDED
@@ -0,0 +1 @@
 
 
1
+ Revision:v1.0.5,CreatedAt:1682049173
ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/README.md ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tasks:
3
+ - speaker-diarization
4
+ domain:
5
+ - audio
6
+ model-type:
7
+ - spkdiar
8
+ frameworks:
9
+ - pytorch
10
+ backbone:
11
+ - SOND
12
+ metrics:
13
+ - DER
14
+ license: Apache License 2.0
15
+ language:
16
+ - cn
17
+ tags:
18
+ - SOND
19
+ - SpeakerDiarization
20
+ - Alibaba
21
+ - EMNLP 2022
22
+ datasets:
23
+ train:
24
+ - AliMeeting
25
+ test:
26
+ - AliMeeting test
27
+ indexing:
28
+ results:
29
+ - task:
30
+ name: Speaker Diarization
31
+ dataset:
32
+ name: AliMeeting
33
+ type: audio
34
+ args: 16k sampling rate, 3465 speakers, N=16 K=4
35
+ metrics:
36
+ - type: DER
37
+ value: 4.21%
38
+ description: AliMeeting
39
+ args: default
40
+ widgets:
41
+ - task: speaker-diarization
42
+ inputs:
43
+ - type: audio-list
44
+ name: input
45
+ title: 音频
46
+ examples:
47
+ - name: 1
48
+ title: 示例1
49
+ inputs:
50
+ - name: example_1
51
+ data:
52
+ - git://example/record.wav
53
+ - git://example/spk1.wav
54
+ - git://example/spk2.wav
55
+ - git://example/spk3.wav
56
+ - git://example/spk4.wav
57
+ inferencespec:
58
+ cpu: 8 #CPU数量
59
+ memory: 4096
60
+ ---
61
+
62
+ # Highlights
63
+ 会议场景端到端说话人日志模型,解决 "who spoke when" 的问题,发表于EMNLP 2022,在AliMeeting数据集上获得SOTA结果。
64
+
65
+ 支持功能:
66
+ - 给定若干说话人的声纹信息,识别并追踪语音段中的这些说话人
67
+ - 给定若干说话人的原始语音,识别并追踪语音段中的这些说话人
68
+
69
+ # Release Note
70
+
71
+ - 2023年1月(预计1月16号发布):funasr-0.1.6, modelscope-1.1.4
72
+ - 模型功能完善:
73
+ - Modelscope模型推理pipeline,新增加多种输入音频方式,如wav.scp、音频bytes、音频采样点、MP3格式、录音笔格式等。
74
+ - [Paraformer-large模型](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary),新增加基于ModelScope微调定制模型,新增加batch级解码,加快推理速度。
75
+ - [AISHELL-1学术集Paraformer模型](https://modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/summary),
76
+ [AISHELL-1学术集ParaformerBert模型](https://modelscope.cn/models/damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/summary),
77
+ [AISHELL-1学术集Conformer模型](https://modelscope.cn/models/damo/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/summary)、
78
+ [AISHELL-2学术集Paraformer模型](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary),
79
+ [AISHELL-2学术集ParaformerBert模型](https://www.modelscope.cn/models/damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary)、
80
+ [AISHELL-2学术集Conformer模型](https://www.modelscope.cn/models/damo/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary),
81
+ 新增加基于ModelScope微调定制模型,其中,Paraformer与ParaformerBert模型新增加batch级解码,加快推理速度。
82
+ - 上线新模型:
83
+ - [说话人确认模型](https://www.modelscope.cn/models/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/summary) ,可用于说话人确认,也可以用来做说话人特征提取。
84
+ - [Paraformer-large长音频模型](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary),集成VAD、ASR、标点与时间戳功能,可直接对时长为数小时音频进行识别,并输出带标点文字与时间戳。
85
+ - [中文无监督预训练Data2vec模型](https://www.modelscope.cn/models/damo/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/summary),采用Data2vec结构,基于AISHELL-2数据的中文无监督预训练模型,支持ASR或者下游任务微调模型。
86
+ - [语音端点检测VAD模型](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary),可用于检测长语音片段中有效语音的起止时间点。
87
+ - [中文标点预测通用模型](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary),可用于语音识别模型输出文本的标点预测。
88
+ - [8K UniASR流式模型](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary),[8K UniASR模型](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/summary),一种流式与离线一体化语音识别模型,进行流式语音识别的同时,能够以较低延时输出离线识别结果来纠正预测文本。
89
+ - Paraformer-large基于[AISHELL-1微调模型](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/summary)、[AISHELL-2微调模型](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/summary),将Paraformer-large模型分别基于AISHELL-1与AISHELL-2数据微调。
90
+ - [小尺寸设备端Paraformer指令词模型](https://www.modelscope.cn/models/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/summary),Paraformer-tiny指令词版本,使用小参数量模型支持指令词识别。
91
+ - 将原TensorFlow模型升级为Pytorch模型,进行推理,并支持微调定制,包括:
92
+ - 16K 模型:[Paraformer中文](https://modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary)、
93
+ [Paraformer-large中文](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary)、
94
+ [UniASR中文](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/summary)、
95
+ [UniASR-large中文](https://modelscope.cn/models/damo/speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/summary)、
96
+ [UniASR中文流式模型](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online/summary)、
97
+ [UniASR方言](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/summary)、
98
+ [UniASR方言流式模型](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-online/summary)、
99
+ [UniASR日语](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/summary)、
100
+ [UniASR日语流式模型](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online/summary)、
101
+ [UniASR印尼语](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/summary)、
102
+ [UniASR印尼语流式模型](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online/summary)、
103
+ [UniASR葡萄牙语](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/summary)、
104
+ [UniASR葡萄牙语流式模型](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-online/summary)、
105
+ [UniASR英文](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/summary)、
106
+ [UniASR英文流式模型](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-online/summary)、
107
+ [UniASR俄语](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/summary)、
108
+ [UniASR俄语流式模型](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-online/summary)、
109
+ [UniASR韩语](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/summary)、
110
+ [UniASR韩语流式模型](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-online/summary)、
111
+ [UniASR西班牙语](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/summary)、
112
+ [UniASR西班牙语流式模型](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-online/summary)、
113
+ [UniASR粤语简体](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/files)、
114
+ [UniASR粤语简体流式模型](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online/files)、
115
+ - 8K 模型:[Paraformer中文](https://modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/summary)、
116
+ [UniASR中文](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/summary)、
117
+ [UniASR中文流式模型](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-online/summary)
118
+
119
+ - 2022年11月:funasr-0.1.4, modelscope-1.1.3
120
+ - Paraformer-large非自回归模型上线,多个公开数据集上取得SOTA效果:
121
+ - 支持基于ModelScope推理。
122
+ - 支持基于[FunASR框架开源](https://github.com/alibaba-damo-academy/FunASR)微调和推理。
123
+
124
+ # 项目介绍
125
+ Speaker Overlap-aware Neural Diarization(SOND)是达摩院语音团队提出的一种高效建模语音重叠的说话人日志模型。本项目提供了在 <a href="http://openslr.org/119/">AliMeeting</a> 中文开源数据集上预训练的 SOND 模型,可以被应用于智能会议分析、对话分析等相关的学术研究。
126
+
127
+ <p align="center">
128
+ <img src="fig/sond.png" alt="SOND模型结构" width="420" />
129
+ </p>
130
+
131
+ SOND模型结构如上图所示,包括对语音信息进行编码的 Speech encoder、对说话人信息进行编码的 Speaker encoder、上下文依赖的打分器 CD scorer、上下文无关的打分器 CI scorer以及预测幂集编码的说话人混合网络 SCN。其中:
132
+ - Speech encoder 采用说话人识别任务中常用的 ResNet34网络结构,并采用 windowed statistic pooling 来得到每个时刻的语音特征
133
+ - Speaker encoder 采用 3 层全连接网络来对说话人的声纹信息进行映射,使其与语音特征在同一个特征空间
134
+ - 上下文无关的打分器 CI scorer 通过对比目标说话人与训练集中其他说话人的不同来学习全局的说话人区分性
135
+ - 上下文依赖的打分器 CD scorer 通过对比目标说话人与上下文中其他说话人的不同来学习局部的说话人区分性
136
+ - 说话人混合网络 SCN 以 CI 和 CD 打分为输入,来对不同的说话人组合进行建模。
137
+ - 预测网络为多层FFN,并采用Softmax作为激活函数来对幂集标签 PSE 进行预测
138
+
139
+ <p align="center">
140
+ <img src="fig/pse.png" alt="幂集(PSE)标签示例" width="540" />
141
+ </p>
142
+
143
+ 更多细节详见:
144
+ - 论文:<a href="https://arxiv.org/abs/2211.10243">Speaker Overlap-aware Neural Diarization for Multi-party Meeting Analysis</a>
145
+ - 论文解读:<a href="https://mp.weixin.qq.com/s/iU09MDjcFTaIJXIjc9isIA">EMNLP 2022论文解读 | SOND:基于显式语音重叠建模的说话人日志模型</a>。
146
+ - 数据集:<a href="https://arxiv.org/abs/2110.07393">M2MeT: The ICASSP 2022 Multi-Channel Multi-Party Meeting Transcription Challenge</a>
147
+
148
+ # 如何使用模型
149
+
150
+ ## 在线快速体验(集成中)
151
+ 在页面右侧,可以在“在线体验”栏内看到我们预先准备好的示例音频,点击播放按钮可以试听,点击“执行测试”按钮,会在下方“测试结果”栏中显示多个说话人的语音活动区间。如果您想要测试自己的音频,可点击“更换音频”按钮,选择上传或录制一段音频,完成后点击执行测试,各个说话人的语音活动区间将会在测试结果栏中显示。
152
+
153
+ ## 在Notebook中推理(集成中)
154
+ 对于灵活调用有需求的开发者,我们推荐您使用Notebook进行处理。首先登录ModelScope账号,点击模型页面右上角的“在Notebook中打开”按钮出现对话框,首次使用会提示您关联阿里云账号,按提示操作即可。关联账号后可进入选择启动实例界面,选择计算资源,建立实例,待实例创建完成后进入开发环境,输入api调用实例。
155
+
156
+ - 使用本模型进行说话人日志任务:
157
+ ```python
158
+ from modelscope.pipelines import pipeline
159
+ from modelscope.utils.constant import Tasks
160
+ import numpy as np
161
+
162
+ # 初始化推理 pipeline
163
+ # 当以原始音频作为输入时使用配置文件 sond.yaml,并设置 mode 为sond_demo
164
+ inference_diar_pipline = pipeline(
165
+ mode="sond_demo",
166
+ num_workers=0,
167
+ task=Tasks.speaker_diarization,
168
+ diar_model_config="sond.yaml",
169
+ model='damo/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch',
170
+ model_revision="v1.0.5",
171
+ sv_model="damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch",
172
+ sv_model_revision="v1.2.2",
173
+ )
174
+
175
+ # 以 audio_list 作为输入,其中第一个音频为待检测语音,后面的音频为不同说话人的声纹注册语音
176
+ audio_list=[
177
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/speaker_diarization/record.wav",
178
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/speaker_diarization/spk1.wav",
179
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/speaker_diarization/spk2.wav",
180
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/speaker_diarization/spk3.wav",
181
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/speaker_diarization/spk4.wav",
182
+ ]
183
+
184
+ results = inference_diar_pipline(audio_in=audio_list)
185
+ print(results)
186
+ ```
187
+ - 除了url表示的网络wav文件,还可使用本地磁盘上的wav文件:
188
+ ```python
189
+ audio_list=[
190
+ "example/record.wav",
191
+ "example/spk1.wav",
192
+ "example/spk2.wav",
193
+ "example/spk3.wav",
194
+ "example/spk4.wav"
195
+ ]
196
+ results = inference_diar_pipline(audio_in=audio_list)
197
+ ```
198
+
199
+ - 使用本模型进行批量说话人日志:
200
+ ```python
201
+ from modelscope.pipelines import pipeline
202
+ from modelscope.utils.constant import Tasks
203
+ import numpy as np
204
+
205
+ # 初始化推理 pipeline
206
+ # 当输入为 fbank 特征时使用的配置文件 sond_fbank.yaml
207
+ # output_dir 为结果保存路径
208
+ inference_diar_pipline = pipeline(
209
+ mode="sond",
210
+ output_dir="outputs",
211
+ diar_model_config="sond_fbank.yaml",
212
+ task=Tasks.speaker_diarization,
213
+ model='speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch',
214
+ num_workers=1
215
+ )
216
+
217
+ # feats.scp 包括 fbank 特征
218
+ # utt1 path/to/feats.ark:xxx
219
+ # utt2 path/to/feats.ark:xxx
220
+ # profile.scp 包括基于 xvector的 speaker embedding, 可使用ModelScope上的如下模型 speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch 进行提取
221
+ # utt1 path/to/profile.ark:xxx
222
+ # utt2 path/to/profile.ark:xxx
223
+ data_path_and_name_and_type = [
224
+ ("data/test_rmsil/feats.scp", "speech", "kaldi_ark"),
225
+ ("data/test_rmsil/profile.scp", "profile", "kaldi_ark")
226
+ ]
227
+ inference_diar_pipline(audio_in=data_path_and_name_and_type)
228
+ # 也可以使用我们已经准备好的特征文件进行推理,下载路径为
229
+ # https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/alimeeting_test_data_for_sond.tar.gz
230
+ ```
231
+
232
+ ## 在本地机器中推理
233
+
234
+ 如果您有本地推理或定制模型的需求,可以前往下载FunASR语音处理框架,不仅涵盖语音识别、端点检测、说话人确认及日志等多种模型,还支持ModelScope开源模型的推理,使研究人员和开发者可以更加便捷地进行模型研究和生产,目前已在github开源:<a href="https://github.com/alibaba-damo-academy/FunASR">FunASR</a>
235
+
236
+ ### FunASR框架安装
237
+
238
+ - 安装FunASR和ModelScope
239
+
240
+ ```sh
241
+ # 安装 Pytorch GPU (version >= 1.7.0):
242
+ conda install pytorch==1.7.0 torchvision==0.8.0 torchaudio==0.7.0 cudatoolkit=9.2 -c pytorch
243
+ # 对于其他版本,请参考 https://pytorch.org/get-started/locally
244
+
245
+ # 安装 ModelScope 包:
246
+ pip install "modelscope[audio_asr]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
247
+
248
+ # 下载项目代码:
249
+ git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR
250
+
251
+ # 安装 FunASR:
252
+ pip install --editable ./
253
+ ```
254
+
255
+ ### 基于ModelScope进行推理(集成中)
256
+
257
+ - 在上面的安装完成后,就可以使用ModelScope进行推理了,可运行如下命令测试各种用法:
258
+ ```sh
259
+ cd egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch
260
+ python unit_test.py
261
+ ```
262
+
263
+ ### 使用 FunASR 在 AliMeeting 测试集上进行性能评估(已完成)
264
+ 接下来以 AliMeeting 数据集为例,介绍如何使用 FunASR 对模型的 DER 性能指标进行评估:
265
+ ```sh
266
+ # 进入工作目录
267
+ cd egs/alimeeting/diarization/sond
268
+
269
+ # 进行评估,结束后您将会得到大概 4.21% 的 DER 结果
270
+ sh ./run.sh
271
+ ```
272
+
273
+ ### 基于ModelScope进行模型微调
274
+
275
+ 训练和微调功能正在开发中,敬请期待。
276
+
277
+ # Benchmark
278
+
279
+ ## 训练配置
280
+ - Feature info: using 80 dims fbank, no cmvn
281
+ - Train info: noam, lr 1.0, batch_size 32, 2 gpu(Tesla V100), acc_grad 1, 200000 steps, clip_gradient_norm 5.0
282
+ - Loss info: cross entropy softmax, speaker discriminative loss
283
+ - Model info: ResNet34, FFN, SAN, FSMN, windowed statistics pooling
284
+ - Train config: sond.yaml
285
+ - Model size: 19.09 M parameters
286
+
287
+ ## 实验结果 (DER)
288
+ - Test set: Alimeeting-test
289
+
290
+ | testset | DER(%) |
291
+ |:---------------------:|:------:|
292
+ | Alimeeting-test | 4.21 |
293
+
294
+ ## 使用方式以及适用范围
295
+
296
+ 运行范围
297
+ - 现阶段只能在 Linux-x86_64 运行,暂未适配 Mac 和 Windows 。
298
+
299
+ 使用方式
300
+ - 直接推理:可以直接使用原始语音和说话人注册语音作为输入,识别输出各个说话人的活动区间。
301
+ - 微调:正在开发中。
302
+
303
+ 使用范围与目标场景
304
+ - 适用于相关学术研究,在 AliMeeting 数据集上进行说话人日志、识别等任务。
305
+
306
+ ## 模型局限性以及可能的偏差
307
+
308
+ - 特征提取流程和工具差异,会对 DER 的数值带来一定的差异( < 0.1% )
309
+ - 识别语音输入应为接近 16 秒的无静音数据,注册语音长度应大于 2 秒
310
+
311
+ ## 相关论文以及引用信息
312
+ ```BibTeX
313
+ @inproceedings{du2022sond,
314
+ title={Speaker Overlap-aware Neural Diarization for Multi-party Meeting Analysis},
315
+ author={Du, Zhihao and Zhang, Shiliang and Zheng, Siqi and Yan, Zhijie},
316
+ booktitle={EMNLP},
317
+ year={2022}
318
+ }
319
+ ```
ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/configuration.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "framework": "pytorch",
3
+ "task": "speaker-diarization",
4
+ "model": {
5
+ "type": "generic-sv",
6
+ "model_name": "sond.pth",
7
+ "model_config": {
8
+ "type": "pytorch",
9
+ "code_base": "funasr",
10
+ "mode": "sond_demo",
11
+ "lang": "zh-cn",
12
+ "batch_size": 1,
13
+ "diar_model_name": "sond.pth",
14
+ "diar_model_config": "sond.yaml",
15
+ "model": "damo/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch",
16
+ "model_revision": "v1.0.5",
17
+ "param_dict": {
18
+ "extract_profile": true
19
+ },
20
+ "num_workers": 0
21
+ }
22
+ },
23
+ "pipeline": {
24
+ "type": "speaker-diarization-inference"
25
+ }
26
+ }
ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/record.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1aeb5e000ff0c1111cb7a3235b38d4b368f62b81085b1ef0766aece60484aa22
3
+ size 512044
ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb204bcf1c54c5223fb088beb832e3270ce0df4f38f644eed9a76601c73563c9
3
+ size 281964
ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:983d49e9f5719224d5c0355b1792374ec87c12e3fe97e1b539b9b0f1e52684a4
3
+ size 307564
ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk3.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bab18fb205bda339c4dd70910a5a0d8ae1cc0d2b3d11c5da4528abaf1790182
3
+ size 317484
ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk4.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9877bd833b5b3758ae7860b72a7dbdd9a73e0e785b59df60bbf8e53a1ab009c8
3
+ size 299564