wgs committed 4 days ago

Commit

04edf76

verified ·

1 Parent(s): a7ec871

Upload 92 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +5 -0
ckpts/.DS_Store +0 -0
ckpts/DNSMOSPro_NISQA/model_best.pt +3 -0
ckpts/MossFormer2_SE_48K/.cache/huggingface/.gitignore +1 -0
ckpts/MossFormer2_SE_48K/.cache/huggingface/download/.gitattributes.lock +0 -0
ckpts/MossFormer2_SE_48K/.cache/huggingface/download/.gitattributes.metadata +3 -0
ckpts/MossFormer2_SE_48K/.cache/huggingface/download/README.md.lock +0 -0
ckpts/MossFormer2_SE_48K/.cache/huggingface/download/README.md.metadata +3 -0
ckpts/MossFormer2_SE_48K/.cache/huggingface/download/last_best_checkpoint.lock +0 -0
ckpts/MossFormer2_SE_48K/.cache/huggingface/download/last_best_checkpoint.metadata +3 -0
ckpts/MossFormer2_SE_48K/.cache/huggingface/download/last_best_checkpoint.pt.lock +0 -0
ckpts/MossFormer2_SE_48K/.cache/huggingface/download/last_best_checkpoint.pt.metadata +3 -0
ckpts/MossFormer2_SE_48K/.gitattributes +35 -0
ckpts/MossFormer2_SE_48K/README.md +64 -0
ckpts/MossFormer2_SE_48K/last_best_checkpoint +1 -0
ckpts/MossFormer2_SE_48K/last_best_checkpoint.pt +3 -0
ckpts/MossFormer2_SS_16K/README.md +9 -0
ckpts/MossFormer2_SS_16K/last_best_checkpoint +1 -0
ckpts/MossFormer2_SS_16K/last_best_checkpoint.pt +3 -0
ckpts/paraformer-zh/README.md +182 -0
ckpts/paraformer-zh/am.mvn +8 -0
ckpts/paraformer-zh/config.yaml +123 -0
ckpts/paraformer-zh/configuration.json +14 -0
ckpts/paraformer-zh/model.pt +3 -0
ckpts/paraformer-zh/seg_dict +0 -0
ckpts/paraformer-zh/tokens.json +0 -0
ckpts/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/README.md +272 -0
ckpts/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/config.yaml +46 -0
ckpts/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/configuration.json +13 -0
ckpts/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/model.pt +3 -0
ckpts/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/tokens.json +0 -0
ckpts/speaker-diarization-community-1/.DS_Store +0 -0
ckpts/speaker-diarization-community-1/README.md +227 -0
ckpts/speaker-diarization-community-1/config.yaml +21 -0
ckpts/speaker-diarization-community-1/embedding/README.md +20 -0
ckpts/speaker-diarization-community-1/embedding/pytorch_model.bin +3 -0
ckpts/speaker-diarization-community-1/plda/README.md +3 -0
ckpts/speaker-diarization-community-1/plda/plda.npz +3 -0
ckpts/speaker-diarization-community-1/plda/xvec_transform.npz +3 -0
ckpts/speaker-diarization-community-1/segmentation/pytorch_model.bin +3 -0
ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/.mdl +0 -0
ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/.msc +0 -0
ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/.mv +1 -0
ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/README.md +319 -0
ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/configuration.json +26 -0
ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/record.wav +3 -0
ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk1.wav +3 -0
ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk2.wav +3 -0
ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk3.wav +3 -0
ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk4.wav +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/record.wav filter=lfs diff=lfs merge=lfs -text
+ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk1.wav filter=lfs diff=lfs merge=lfs -text
+ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk2.wav filter=lfs diff=lfs merge=lfs -text
+ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk3.wav filter=lfs diff=lfs merge=lfs -text
+ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk4.wav filter=lfs diff=lfs merge=lfs -text

ckpts/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

ckpts/DNSMOSPro_NISQA/model_best.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:04b280a24ad9f0d3c2507140b69b1ae6ccbdba8cfc2ed6f2bec9c821e4794959
+size 341198

ckpts/MossFormer2_SE_48K/.cache/huggingface/.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ *

ckpts/MossFormer2_SE_48K/.cache/huggingface/download/.gitattributes.lock ADDED Viewed

File without changes

ckpts/MossFormer2_SE_48K/.cache/huggingface/download/.gitattributes.metadata ADDED Viewed

	@@ -0,0 +1,3 @@

+eff8c97925c8bec812af707814b3e5d777fd4503
+a6344aac8c09253b3b630fb776ae94478aa0275b
+1760600858.231903

ckpts/MossFormer2_SE_48K/.cache/huggingface/download/README.md.lock ADDED Viewed

File without changes

ckpts/MossFormer2_SE_48K/.cache/huggingface/download/README.md.metadata ADDED Viewed

	@@ -0,0 +1,3 @@

+eff8c97925c8bec812af707814b3e5d777fd4503
+3bd127640b52490ba6eda7c91738d5f3b826863b
+1760600857.7092524

ckpts/MossFormer2_SE_48K/.cache/huggingface/download/last_best_checkpoint.lock ADDED Viewed

File without changes

ckpts/MossFormer2_SE_48K/.cache/huggingface/download/last_best_checkpoint.metadata ADDED Viewed

	@@ -0,0 +1,3 @@

+eff8c97925c8bec812af707814b3e5d777fd4503
+98b04fb38c032a55f03f4a7583600e7e112b8e09
+1760600857.99231

ckpts/MossFormer2_SE_48K/.cache/huggingface/download/last_best_checkpoint.pt.lock ADDED Viewed

File without changes

ckpts/MossFormer2_SE_48K/.cache/huggingface/download/last_best_checkpoint.pt.metadata ADDED Viewed

	@@ -0,0 +1,3 @@

+eff8c97925c8bec812af707814b3e5d777fd4503
+03692b9f773bbd6bb43b9c5a41f96b1e28affd66e13796b7bec66ad3d8b227c6
+1760600960.8604465

ckpts/MossFormer2_SE_48K/.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

ckpts/MossFormer2_SE_48K/README.md ADDED Viewed

	@@ -0,0 +1,64 @@

+---
+license: apache-2.0
+---
+# Introduction
+The MossFormer2_SE_48K model weights for 48 kHz speech enhancement in [ClearerVoice-Studio](https://github.com/modelscope/ClearerVoice-Studio/tree/main) repo.
+This model is trained on large-scale datasets including open-source and private data.
+It enhances speech audio by removing background noise.
+# Install
+**Clone the Repository**
+``` sh
+git clone https://github.com/modelscope/ClearerVoice-Studio.git
+```
+**Create Conda Environment**
+``` sh
+cd ClearerVoice-Studio
+conda create -n clearvoice python=3.8
+conda activate clearvoice
+pip install -r requirements.txt
+```
+**Run Script**
+Go to `clearvoice/` and use the following examples. The MossFormer2_SE_48K model will be downloaded from huggingface automatically.
+Sample example 1: use speech enhancement model `MossFormer2_SE_48K` to process one wave file of `samples/input.wav` and save the output wave file to `samples/output_MossFormer2_SE_48K.wav`
+```python
+from clearvoice import ClearVoice
+myClearVoice = ClearVoice(task='speech_enhancement', model_names=['MossFormer2_SE_48K'])
+output_wav = myClearVoice(input_path='samples/input.wav', online_write=False)
+myClearVoice.write(output_wav, output_path='samples/output_MossFormer2_SE_48K.wav')
+```
+Sample example 2: use speech enhancement model `MossFormer2_SE_48K` to process all input wave files in `samples/path_to_input_wavs/` and save all output files to `samples/path_to_output_wavs`
+```python
+from clearvoice import ClearVoice
+myClearVoice = ClearVoice(task='speech_enhancement', model_names=['MossFormer2_SE_48K'])
+myClearVoice(input_path='samples/path_to_input_wavs', online_write=True, output_path='samples/path_to_output_wavs')
+```
+Sample example 3: use speech enhancement model `MossFormer2_SE_48K` to process wave files listed in the `samples/scp/audio_samples.scp` file, and save all output files to `samples/path_to_output_wavs_scp/`
+```python
+from clearvoice import ClearVoice
+myClearVoice = ClearVoice(task='speech_enhancement', model_names=['MossFormer2_SE_48K'])
+myClearVoice(input_path='samples/scp/audio_samples.scp', online_write=True, output_path='samples/path_to_output_wavs_scp')
+```

ckpts/MossFormer2_SE_48K/last_best_checkpoint ADDED Viewed

	@@ -0,0 +1 @@


1	+ last_best_checkpoint.pt

ckpts/MossFormer2_SE_48K/last_best_checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:03692b9f773bbd6bb43b9c5a41f96b1e28affd66e13796b7bec66ad3d8b227c6
+size 221552019

ckpts/MossFormer2_SS_16K/README.md ADDED Viewed

	@@ -0,0 +1,9 @@

+---
+license: apache-2.0
+---
+The MossFormer2_SS_16K model weights for 16 kHz speech separation in [ClearerVoice-Studio](https://github.com/modelscope/ClearerVoice-Studio/tree/main) repo.
+This model is trained on large-scale datasets including open-source and private data.
+It separates mixed-speaker speech into each individual speaker's speech.

ckpts/MossFormer2_SS_16K/last_best_checkpoint ADDED Viewed

	@@ -0,0 +1 @@


1	+ last_best_checkpoint.pt

ckpts/MossFormer2_SS_16K/last_best_checkpoint.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:00a3a48bda492db1e829b85dd443f8f43a43039a3e90f1a24962ea9caf14a11a
+size 670353271

ckpts/paraformer-zh/README.md ADDED Viewed

	@@ -0,0 +1,182 @@

+---
+license: other
+license_name: model-license
+license_link: https://github.com/alibaba-damo-academy/FunASR
+---
+# FunASR: A Fundamental End-to-End Speech Recognition Toolkit
+[![PyPI](https://img.shields.io/pypi/v/funasr)](https://pypi.org/project/funasr/)
+<strong>FunASR</strong> hopes to build a bridge between academic research and industrial applications on speech recognition. By supporting the training & finetuning of the industrial-grade speech recognition model, researchers and developers can conduct research and production of speech recognition models more conveniently, and promote the development of speech recognition ecology. ASR for Fun！
+[**Highlights**](#highlights)
+| [**News**](https://github.com/alibaba-damo-academy/FunASR#whats-new)
+| [**Installation**](#installation)
+| [**Quick Start**](#quick-start)
+| [**Runtime**](./runtime/readme.md)
+| [**Model Zoo**](#model-zoo)
+| [**Contact**](#contact)
+<a name="highlights"></a>
+## Highlights
+- FunASR is a fundamental speech recognition toolkit that offers a variety of features, including speech recognition (ASR), Voice Activity Detection (VAD), Punctuation Restoration, Language Models, Speaker Verification, Speaker Diarization and multi-talker ASR. FunASR provides convenient scripts and tutorials, supporting inference and fine-tuning of pre-trained models.
+- We have released a vast collection of academic and industrial pretrained models on the [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition) and [huggingface](https://huggingface.co/FunASR), which can be accessed through our [Model Zoo](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md). The representative [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), a non-autoregressive end-to-end speech recognition model, has the advantages of high accuracy, high efficiency, and convenient deployment, supporting the rapid construction of speech recognition services. For more details on service deployment, please refer to the [service deployment document](runtime/readme_cn.md).
+<a name="Installation"></a>
+## Installation
+```shell
+pip3 install -U funasr
+```
+Or install from source code
+``` sh
+git clone https://github.com/alibaba/FunASR.git && cd FunASR
+pip3 install -e ./
+```
+Install modelscope for the pretrained models (Optional)
+```shell
+pip3 install -U modelscope
+```
+## Model Zoo
+FunASR has open-sourced a large number of pre-trained models on industrial data. You are free to use, copy, modify, and share FunASR models under the [Model License Agreement](./MODEL_LICENSE). Below are some representative models; for more models, please refer to the [Model Zoo](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md).
+(Note: 🤗 represents the Huggingface model zoo link, ⭐ represents the ModelScope model zoo link)
+|                                                                             Model Name                                                                             |                    Task Details                    |          Training Data           | Parameters |
+|:------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:--------------------------------------------------:|:--------------------------------:|:----------:|
+|    paraformer-zh <br> ([⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)  [🤗]() )    | speech recognition, with timestamps, non-streaming |      60000 hours, Mandarin       |    220M    |
+| <nobr>paraformer-zh-streaming <br> ( [⭐](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [🤗]() )</nobr> |           speech recognition, streaming            |      60000 hours, Mandarin       |    220M    |
+|         paraformer-en <br> ( [⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary) [🤗]() )         | speech recognition, with timestamps, non-streaming |       50000 hours, English       |    220M    |
+|                     conformer-en <br> ( [⭐](https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary) [🤗]() )                      |         speech recognition, non-streaming          |       50000 hours, English       |    220M    |
+|                     ct-punc <br> ( [⭐](https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) [🤗]() )                      |              punctuation restoration               |    100M, Mandarin and English    |    1.1G    |
+|                          fsmn-vad <br> ( [⭐](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) [🤗]() )                          |              voice activity detection              | 5000 hours, Mandarin and English |    0.4M    |
+|                          fa-zh <br> ( [⭐](https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) [🤗]() )                           |                timestamp prediction                |       5000 hours, Mandarin       |    38M     |
+|                cam++ <br> ( [⭐](https://modelscope.cn/models/iic/speech_campplus_sv_zh-cn_16k-common/summary) [🤗]() )                                             |        speaker verification/diarization            |            5000 hours            |    7.2M    |
+[//]: # ()
+[//]: # (FunASR supports pre-trained or further fine-tuned models for deployment as a service. The CPU version of the Chinese offline file conversion service has been released, details can be found in [docs]&#40;funasr/runtime/docs/SDK_tutorial.md&#41;. More detailed information about service deployment can be found in the [deployment roadmap]&#40;funasr/runtime/readme_cn.md&#41;.)
+<a name="quick-start"></a>
+## Quick Start
+Below is a quick start tutorial. Test audio files ([Mandarin](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav), [English]()).
+### Command-line usage
+```shell
+funasr +model=paraformer-zh +vad_model="fsmn-vad" +punc_model="ct-punc" +input=asr_example_zh.wav
+```
+Note: Supports recognition of a single audio file, as well as a file list in Kaldi-style wav.scp format: `wav_id wav_path`
+### Speech Recognition (Non-streaming)
+```python
+from funasr import AutoModel
+# paraformer-zh is a multi-functional asr model
+# use vad, punc, spk or not as you need
+model = AutoModel(model="paraformer-zh", model_revision="v2.0.4",
+                  vad_model="fsmn-vad", vad_model_revision="v2.0.4",
+                  punc_model="ct-punc-c", punc_model_revision="v2.0.4",
+                  # spk_model="cam++", spk_model_revision="v2.0.2",
+                  )
+res = model.generate(input=f"{model.model_path}/example/asr_example.wav",
+                     batch_size_s=300,
+                     hotword='魔搭')
+print(res)
+```
+Note: `model_hub`: represents the model repository, `ms` stands for selecting ModelScope download, `hf` stands for selecting Huggingface download.
+### Speech Recognition (Streaming)
+```python
+from funasr import AutoModel
+chunk_size = [0, 10, 5]  # [0, 10, 5] 600ms, [0, 8, 4] 480ms
+encoder_chunk_look_back = 4  # number of chunks to lookback for encoder self-attention
+decoder_chunk_look_back = 1  # number of encoder chunks to lookback for decoder cross-attention
+model = AutoModel(model="paraformer-zh-streaming", model_revision="v2.0.4")
+import soundfile
+import os
+wav_file = os.path.join(model.model_path, "../fa-zh/example/asr_example.wav")
+speech, sample_rate = soundfile.read(wav_file)
+chunk_stride = chunk_size[1] * 960  # 600ms
+cache = {}
+total_chunk_num = int(len((speech) - 1) / chunk_stride + 1)
+for i in range(total_chunk_num):
+    speech_chunk = speech[i * chunk_stride:(i + 1) * chunk_stride]
+    is_final = i == total_chunk_num - 1
+    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size,
+                         encoder_chunk_look_back=encoder_chunk_look_back,
+                         decoder_chunk_look_back=decoder_chunk_look_back)
+    print(res)
+```
+Note: `chunk_size` is the configuration for streaming latency. `[0,10,5]` indicates that the real-time display granularity is `10*60=600ms`, and the lookahead information is `5*60=300ms`. Each inference input is `600ms` (sample points are `16000*0.6=9600`), and the output is the corresponding text. For the last speech segment input, `is_final=True` needs to be set to force the output of the last word.
+### Voice Activity Detection (Non-Streaming)
+```python
+from funasr import AutoModel
+model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")
+wav_file = f"{model.model_path}/example/asr_example.wav"
+res = model.generate(input=wav_file)
+print(res)
+```
+### Voice Activity Detection (Streaming)
+```python
+from funasr import AutoModel
+chunk_size = 200 # ms
+model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")
+import soundfile
+wav_file = f"{model.model_path}/example/vad_example.wav"
+speech, sample_rate = soundfile.read(wav_file)
+chunk_stride = int(chunk_size * sample_rate / 1000)
+cache = {}
+total_chunk_num = int(len((speech)-1)/chunk_stride+1)
+for i in range(total_chunk_num):
+    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
+    is_final = i == total_chunk_num - 1
+    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
+    if len(res[0]["value"]):
+        print(res)
+```
+### Punctuation Restoration
+```python
+from funasr import AutoModel
+model = AutoModel(model="ct-punc", model_revision="v2.0.4")
+res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
+print(res)
+```
+### Timestamp Prediction
+```python
+from funasr import AutoModel
+model = AutoModel(model="fa-zh", model_revision="v2.0.4")
+wav_file = f"{model.model_path}/example/asr_example.wav"
+text_file = f"{model.model_path}/example/text.txt"
+res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
+print(res)
+```
+For more examples, refer to the [docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining)

ckpts/paraformer-zh/am.mvn ADDED Viewed

	@@ -0,0 +1,8 @@

+<Nnet>
+<Splice> 560 560
+[ 0 ]
+<AddShift> 560 560
+<LearnRateCoef> 0 [ -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 
-13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 
-8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 ]
+<Rescale> 560 560
+<LearnRateCoef> 0 [ 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 
0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 
0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 ]
+</Nnet>

ckpts/paraformer-zh/config.yaml ADDED Viewed

	@@ -0,0 +1,123 @@

+# network architecture
+model: Paraformer
+model_conf:
+    ctc_weight: 0.0
+    lsm_weight: 0.1
+    length_normalized_loss: true
+    predictor_weight: 1.0
+    predictor_bias: 1
+    sampling_ratio: 0.75
+# encoder
+encoder: SANMEncoder
+encoder_conf:
+    output_size: 512
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 50
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.1
+    input_layer: pe
+    pos_enc_class: SinusoidalPositionEncoder
+    normalize_before: true
+    kernel_size: 11
+    sanm_shfit: 0
+    selfattention_layer_type: sanm
+# decoder
+decoder: ParaformerSANMDecoder
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 16
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.1
+    src_attention_dropout_rate: 0.1
+    att_layer_num: 16
+    kernel_size: 11
+    sanm_shfit: 0
+predictor: CifPredictorV2
+predictor_conf:
+    idim: 512
+    threshold: 1.0
+    l_order: 1
+    r_order: 1
+    tail_threshold: 0.45
+# frontend related
+frontend: WavFrontend
+frontend_conf:
+    fs: 16000
+    window: hamming
+    n_mels: 80
+    frame_length: 25
+    frame_shift: 10
+    lfr_m: 7
+    lfr_n: 6
+specaug: SpecAugLFR
+specaug_conf:
+    apply_time_warp: false
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 30
+    lfr_rate: 6
+    num_freq_mask: 1
+    apply_time_mask: true
+    time_mask_width_range:
+    - 0
+    - 12
+    num_time_mask: 1
+train_conf:
+  accum_grad: 1
+  grad_clip: 5
+  max_epoch: 150
+  val_scheduler_criterion:
+      - valid
+      - acc
+  best_model_criterion:
+  -   - valid
+      - acc
+      - max
+  keep_nbest_models: 10
+  log_interval: 50
+optim: adam
+optim_conf:
+   lr: 0.0005
+scheduler: warmuplr
+scheduler_conf:
+   warmup_steps: 30000
+dataset: AudioDataset
+dataset_conf:
+    index_ds: IndexDSJsonl
+    batch_sampler: DynamicBatchLocalShuffleSampler
+    batch_type: example # example or length
+    batch_size: 1 # if batch_type is example, batch_size is the number of samples; if length, batch_size is source_token_len+target_token_len
+    max_token_length: 2048 # filter out samples whose source_token_len+target_token_len > max_token_length
+    buffer_size: 500
+    shuffle: True
+    num_workers: 0
+tokenizer: CharTokenizer
+tokenizer_conf:
+  unk_symbol: <unk>
+  split_with_space: true
+input_size: 560
+ctc_conf:
+    dropout_rate: 0.0
+    ctc_type: builtin
+    reduce: true
+    ignore_nan_grad: true
+normalize: null

ckpts/paraformer-zh/configuration.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "framework": "pytorch",
+  "task" : "auto-speech-recognition",
+  "model": {"type" : "funasr"},
+  "pipeline": {"type":"funasr-pipeline"},
+  "model_name_in_hub": {
+    "ms":"iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+    "hf":""},
+  "file_path_metas": {
+    "init_param":"model.pt",
+    "config":"config.yaml",
+    "tokenizer_conf": {"token_list": "tokens.json", "seg_dict_file": "seg_dict"},
+    "frontend_conf":{"cmvn_file": "am.mvn"}}
+}

ckpts/paraformer-zh/model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5bba782a5e9196166233b9ab12ba04cadff9ef9212b4ff6153ed9290ff679025
+size 880502012

ckpts/paraformer-zh/seg_dict ADDED Viewed

The diff for this file is too large to render. See raw diff

ckpts/paraformer-zh/tokens.json ADDED Viewed

The diff for this file is too large to render. See raw diff

ckpts/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/README.md ADDED Viewed

	@@ -0,0 +1,272 @@

+---
+tasks:
+- punctuation
+domain:
+- audio
+model-type:
+- Classification
+frameworks:
+- pytorch
+metrics:
+- f1_score
+license: Apache License 2.0
+language:
+- cn
+tags:
+- FunASR
+- CT-Transformer
+- Alibaba
+- ICASSP 2020
+datasets:
+  train:
+  - 33M-samples online data
+  test:
+  - wikipedia data test
+  - 10000 industrial Mandarin sentences test
+widgets:
+  - task: punctuation
+    model_revision: v2.0.4
+    inputs:
+      - type: text
+        name: input
+        title: 文本
+    examples:
+      - name: 1
+        title: 示例1
+        inputs:
+          - name: input
+            data: 我们都是木头人不会讲话不会动
+    inferencespec:
+      cpu: 1 #CPU数量
+      memory: 4096
+---
+# Controllable Time-delay Transformer模型介绍
+[//]: # (Controllable Time-delay Transformer 模型是一种端到端标点分类模型。)
+[//]: # (常规的Transformer会依赖很远的未来信息，导致长时间结果不固定。Controllable Time-delay Transformer 在效果无损的情况下，有效控制标点的延时。)
+# Highlights
+- 中文标点通用模型：可用于语音识别模型输出文本的标点预测。
+  - 基于[Paraformer-large长音频模型](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)场景的使用
+  - 基于[FunASR框架](https://github.com/alibaba-damo-academy/FunASR)，可进行ASR，VAD，标点的自由组合
+  - 基于纯文本输入的标点预测
+## <strong>[FunASR开源项目介绍](https://github.com/alibaba-damo-academy/FunASR)</strong>
+<strong>[FunASR](https://github.com/alibaba-damo-academy/FunASR)</strong>希望在语音识别的学术研究和工业应用之间架起一座桥梁。通过发布工业级语音识别模型的训练和微调，研究人员和开发人员可以更方便地进行语音识别模型的研究和生产，并推动语音识别生态的发展。让语音识别更有趣！
+[**github仓库**](https://github.com/alibaba-damo-academy/FunASR)
+| [**最新动态**](https://github.com/alibaba-damo-academy/FunASR#whats-new)
+| [**环境安装**](https://github.com/alibaba-damo-academy/FunASR#installation)
+| [**服务部署**](https://www.funasr.com)
+| [**模型库**](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo)
+| [**联系我们**](https://github.com/alibaba-damo-academy/FunASR#contact)
+## 模型原理介绍
+Controllable Time-delay Transformer是达摩院语音团队提出的高效后处理框架中的标点模块。本项目为中文通用标点模型，模型可以被应用于文本类输入的标点预测，也可应用于语音识别结果的后处理步骤，协助语音识别模块输出具有可读性的文本结果。
+<p align="center">
+<img src="fig/struct.png" alt="Controllable Time-delay Transformer模型结构"  width="500" /></p>
+Controllable Time-delay Transformer 模型结构如上图所示，由 Embedding、Encoder 和 Predictor 三部分组成。Embedding 是词向量叠加位置向量。Encoder可以采用不同的网络结构，例如self-attention，conformer，SAN-M等。Predictor 预测每个token后的标点类型。
+在模型的选择上采用了性能优越的Transformer模型。Transformer模型在获得良好性能的同时，由于模型自身序列化输入等特性，会给系统带来较大时延。常规的Transformer可以看到未来的全部信息，导致标点会依赖很远的未来信息。这会给用户带来一种标点一直在变化刷新，长时间结果不固定的不良感受。基于这一问题，我们创新性的提出了可控时延的Transformer模型（Controllable Time-Delay Transformer, CT-Transformer），在模型性能无损失的情况下，有效控制标点的延时。
+更详细的细节见：
+- 论文： [CONTROLLABLE TIME-DELAY TRANSFORMER FOR REAL-TIME PUNCTUATION PREDICTION AND DISFLUENCY DETECTION](https://arxiv.org/pdf/2003.01309.pdf)
+## 基于ModelScope进行推理
+支持以下三种输入格式，api调用方式可参考如下范例：
+- text.scp文件路径，例如example/punc_example.txt，格式为： key + "\t" + value
+```sh
+cat example/punc_example.txt
+1       跨境河流是养育沿岸人民的生命之源
+2       从存储上来说仅仅是全景图片它就会是图片的四倍的容量
+3       那今天的会就到这里吧happy new year明年见
+```
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+inference_pipline = pipeline(
+    task=Tasks.punctuation,
+    model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
+    model_revision="v2.0.4")
+rec_result = inference_pipline(input='example/punc_example.txt')
+print(rec_result)
+```
+- text二进制数据，例如：用户直接从文件里读出bytes数据
+```python
+rec_result = inference_pipline(input='我们都是木头人不会讲话不会动')
+```
+- text文件url，例如：https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt
+```python
+rec_result = inference_pipline(input='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt')
+```
+## 基于FunASR进行推理
+下面为快速上手教程，测试音频（[中文](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav)，[英文](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav)）
+### 可执行命令行
+在命令行终端执行：
+```shell
+funasr ++model=paraformer-zh ++vad_model="fsmn-vad" ++punc_model="ct-punc" ++input=vad_example.wav
+```
+注：支持单条音频文件识别，也支持文件列表，列表为kaldi风格wav.scp：`wav_id   wav_path`
+### python示例
+#### 非实时语音识别
+```python
+from funasr import AutoModel
+# paraformer-zh is a multi-functional asr model
+# use vad, punc, spk or not as you need
+model = AutoModel(model="paraformer-zh", model_revision="v2.0.4",
+                  vad_model="fsmn-vad", vad_model_revision="v2.0.4",
+                  punc_model="ct-punc-c", punc_model_revision="v2.0.4",
+                  # spk_model="cam++", spk_model_revision="v2.0.2",
+                  )
+res = model.generate(input=f"{model.model_path}/example/asr_example.wav",
+            batch_size_s=300,
+            hotword='魔搭')
+print(res)
+```
+注：`model_hub`：表示模型仓库，`ms`为选择modelscope下载，`hf`为选择huggingface下载。
+#### 实时语音识别
+```python
+from funasr import AutoModel
+chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
+encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
+decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
+model = AutoModel(model="paraformer-zh-streaming", model_revision="v2.0.4")
+import soundfile
+import os
+wav_file = os.path.join(model.model_path, "example/asr_example.wav")
+speech, sample_rate = soundfile.read(wav_file)
+chunk_stride = chunk_size[1] * 960 # 600ms
+cache = {}
+total_chunk_num = int((len(speech)-1)/chunk_stride+1)
+for i in range(total_chunk_num):
+    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
+    is_final = i == total_chunk_num - 1
+    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
+    print(res)
+```
+注：`chunk_size`为流式延时配置，`[0,10,5]`表示上屏实时出字粒度为`10*60=600ms`，未来信息为`5*60=300ms`。每次推理输入为`600ms`（采样点数为`16000*0.6=9600`），输出为对应文字，最后一个语音片段输入需要设置`is_final=True`来强制输出最后一个字。
+#### 语音端点检测（非实时）
+```python
+from funasr import AutoModel
+model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")
+wav_file = f"{model.model_path}/example/asr_example.wav"
+res = model.generate(input=wav_file)
+print(res)
+```
+#### 语音端点检测（实时）
+```python
+from funasr import AutoModel
+chunk_size = 200 # ms
+model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")
+import soundfile
+wav_file = f"{model.model_path}/example/vad_example.wav"
+speech, sample_rate = soundfile.read(wav_file)
+chunk_stride = int(chunk_size * sample_rate / 1000)
+cache = {}
+total_chunk_num = int((len(speech)-1)/chunk_stride+1)
+for i in range(total_chunk_num):
+    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
+    is_final = i == total_chunk_num - 1
+    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
+    if len(res[0]["value"]):
+        print(res)
+```
+#### 标点恢复
+```python
+from funasr import AutoModel
+model = AutoModel(model="ct-punc", model_revision="v2.0.4")
+res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
+print(res)
+```
+#### 时间戳预测
+```python
+from funasr import AutoModel
+model = AutoModel(model="fa-zh", model_revision="v2.0.4")
+wav_file = f"{model.model_path}/example/asr_example.wav"
+text_file = f"{model.model_path}/example/text.txt"
+res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
+print(res)
+```
+更多详细用法（[示例](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining)）
+## 微调
+详细用法（[示例](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining)）
+## Benchmark
+中文标点预测通用模型在自采集的通用领域业务场景数据上有良好效果。训练数据大约33M个sample，每个sample可能包含1句或多句。
+### 自采集数据（20000+ samples）
+| precision                            | recall                                | f1_score                              |
+|:------------------------------------:|:-------------------------------------:|:-------------------------------------:|
+| <div style="width: 150pt">53.8</div> | <div style="width: 150pt">60.0</div>  | <div style="width: 150pt">56.5</div>  |
+## 使用方式以及适用范围
+运行范围
+- 支持Linux-x86_64、Mac和Windows运行。
+使用方式
+- 直接推理：可以直接对输入文本进行计算，输出带有标点的目标文字。
+使用范围与目标场景
+- 适合对文本数据进行标点预测，文本长度不限。
+## 相关论文以及引用信息
+```BibTeX
+@inproceedings{chen2020controllable,
+  title={Controllable Time-Delay Transformer for Real-Time Punctuation Prediction and Disfluency Detection},
+  author={Chen, Qian and Chen, Mengzhe and Li, Bo and Wang, Wen},
+  booktitle={ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+  pages={8069--8073},
+  year={2020},
+  organization={IEEE}
+}
+```

ckpts/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/config.yaml ADDED Viewed

	@@ -0,0 +1,46 @@

+model: CTTransformer
+model_conf:
+    ignore_id: 0
+    embed_unit: 256
+    att_unit: 256
+    dropout_rate: 0.1
+    punc_list:
+        - <unk>
+        - _
+        - ，
+        - 。
+        - ？
+        - 、
+    punc_weight:
+        - 1.0
+        - 1.0
+        - 1.0
+        - 1.0
+        - 1.0
+        - 1.0
+    sentence_end_id: 3
+encoder: SANMEncoder
+encoder_conf:
+    input_size: 256
+    output_size: 256
+    attention_heads: 8
+    linear_units: 1024
+    num_blocks: 4
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: pe
+    pos_enc_class: SinusoidalPositionEncoder
+    normalize_before: true
+    kernel_size: 11
+    sanm_shfit: 0
+    selfattention_layer_type: sanm
+    padding_idx: 0
+tokenizer: CharTokenizer
+tokenizer_conf:
+  unk_symbol: <unk>

ckpts/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/configuration.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "framework": "pytorch",
+  "task" : "punctuation",
+  "model": {"type" : "funasr"},
+  "pipeline": {"type":"funasr-pipeline"},
+  "model_name_in_hub": {
+    "ms":"iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
+    "hf":""},
+  "file_path_metas": {
+    "init_param":"model.pt",
+    "config":"config.yaml",
+    "tokenizer_conf": {"token_list": "tokens.json"}}
+}

ckpts/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a5818bb9d933805a916eebe41eb41648f7f9caad30b4bd59d56f3ca135421916
+size 291979892

ckpts/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/tokens.json ADDED Viewed

The diff for this file is too large to render. See raw diff

ckpts/speaker-diarization-community-1/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

ckpts/speaker-diarization-community-1/README.md ADDED Viewed

	@@ -0,0 +1,227 @@

+---
+tags:
+  - pyannote
+  - pyannote-audio
+  - pyannote-audio-pipeline
+  - audio
+  - voice
+  - speech
+  - speaker
+  - speaker-diarization
+  - speaker-change-detection
+  - voice-activity-detection
+  - overlapped-speech-detection
+  - automatic-speech-recognition
+license: cc-by-4.0
+extra_gated_prompt: "Your input helps us strengthen the pyannote community and improve our open-source offerings. This pipeline is released under the CC-BY-4.0 license and will always remain freely accessible. By providing your details, you agree that we may email you occasionally with important news about pyannote models, invitations to try premium pipelines, and information about specific services designed for researchers and professionals like you."
+extra_gated_fields:
+  Company/university: text
+  Use case:
+     type: select
+     options:
+       - label: Meeting note taker (automated meeting transcription, action item extraction, and speaker identification in recordings)
+         value: meeting
+       - label: Conversation AI (chatbots, voice assistants, multi-turn dialogue systems with speaker awareness)
+         value: conversation
+       - label: CCaaS and customer experience (call center analytics, customer service optimization, and interaction quality monitoring)
+         value: ccaas
+       - label: Voice agents (AI-powered phone systems, automated customer service, voice-based interactions)
+         value: agent
+       - label: Media and automated dubbing (content creation, podcast processing, video production, and multilingual media)
+         value: dubbing
+       - label: Training and development (educational content analysis, corporate training evaluation, and learning assessment tools)
+         value: training
+       - label: Other
+         value: other
+---
+# `community-1` speaker diarization
+This pipeline ingests mono audio sampled at 16kHz and outputs speaker diarization.
+- stereo or multi-channel audio files are automatically downmixed to mono by averaging the channels.
+- audio files sampled at a different rate are resampled to 16kHz automatically upon loading.
+The [main improvements brought by `Community-1`](https://www.pyannote.ai/blog/community-1) are:
+- [improved](#benchmark) speaker assignment and counting
+- simpler reconciliation with transcription timestamps with [*exclusive*](#exclusive-speaker-diarization) speaker diarization
+- easy [offline use](#offline-use) (i.e. without internet connection)
+- (optionally) [hosted](https://hf.co/pyannote/speaker-diarization-community-1-cloud) on pyannoteAI cloud
+## Setup
+1. `pip install pyannote.audio`
+2. Accept user conditions
+3. Create access token at [`hf.co/settings/tokens`](https://hf.co/settings/tokens).
+## Quick start
+```python
+# download the pipeline from Huggingface
+from pyannote.audio import Pipeline
+pipeline = Pipeline.from_pretrained(
+    "pyannote/speaker-diarization-community-1",
+    token="{huggingface-token}")
+# run the pipeline locally on your computer
+output = pipeline("audio.wav")
+# print the predicted speaker diarization
+for turn, speaker in output.speaker_diarization:
+    print(f"{speaker} speaks between t={turn.start:.3f}s and t={turn.end:.3f}s")
+```
+## Benchmark
+Out of the box, `Community-1` is much better than `speaker-diarization-3.1`.
+We report [diarization error rates](http://pyannote.github.io/pyannote-metrics/reference.html#diarization) (in %) on a large collection of academic benchmarks (fully automatic processing, no forgiveness collar, nor skipping overlapping speech).
+| Benchmark (last updated in 2025-09) | <a href="https://hf.co/pyannote/speaker-diarization-3.1">`legacy` (3.1)</a>| <a href="https://www.pyannote.ai/blog/community-1">`community-1`</a> | <a href="https://www.pyannote.ai/blog/precision-2">`precision-2`</a> |
+| --------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------ | -------------------------------------------------| ------------------------------------------------ |
+| [AISHELL-4](https://arxiv.org/abs/2104.03603)                                                                               | 12.2 | 11.7 | 11.4 |
+| [AliMeeting](https://www.openslr.org/119/) (channel 1)                                                                      | 24.5 | 20.3 | 15.2 |
+| [AMI](https://groups.inf.ed.ac.uk/ami/corpus/) (IHM)                                                                        | 18.8 | 17.0 | 12.9 |
+| [AMI](https://groups.inf.ed.ac.uk/ami/corpus/) (SDM)                                                                        | 22.7 | 19.9 | 15.6 |
+| [AVA-AVD](https://arxiv.org/abs/2111.14448)                                                                                 | 49.7 | 44.6 | 37.1 |
+| [CALLHOME](https://catalog.ldc.upenn.edu/LDC2001S97) ([part 2](https://github.com/BUTSpeechFIT/CALLHOME_sublists/issues/1)) | 28.5 | 26.7 | 16.6 |
+| [DIHARD 3](https://catalog.ldc.upenn.edu/LDC2022S14) ([full](https://arxiv.org/abs/2012.01477))                             | 21.4 | 20.2 | 14.7 |
+| [Ego4D](https://arxiv.org/abs/2110.07058) (dev.)                                                                            | 51.2 | 46.8 | 39.0 |
+| [MSDWild](https://github.com/X-LANCE/MSDWILD)                                                                               | 25.4 | 22.8 | 17.3 |
+| [RAMC](https://www.openslr.org/123/)                                                                                        | 22.2 | 20.8 | 10.5 |
+| [REPERE](https://www.islrn.org/resources/360-758-359-485-0/) (phase2)                                                       | 7.9  |  8.9 |  7.4 |
+| [VoxConverse](https://github.com/joonson/voxconverse) (v0.3)                                                                | 11.2 | 11.2 |  8.5 |
+`Precision-2` model is even better and can be tested like this:
+1. Create an API key on [pyannoteAI dashboard](https://dashboard.pyannote.ai) (free credits included)
+2. Change one line of code
+```diff
+from pyannote.audio import Pipeline
+pipeline = Pipeline.from_pretrained(
+-     'pyannote/speaker-diarization-community-1', token="{huggingface-token}")
++     'pyannote/speaker-diarization-precision-2', token="{pyannoteAI-api-key}")
+diarization = pipeline("audio.wav")  # runs on pyannoteAI servers
+```
+## Processing on GPU
+`pyannote.audio` pipelines run on CPU by default.
+You can send them to GPU with the following lines:
+```python
+import torch
+pipeline.to(torch.device("cuda"))
+```
+## Processing from memory
+Pre-loading audio files in memory may result in faster processing:
+```python
+import torchaudio
+waveform, sample_rate = torchaudio.load("audio.wav")
+output = pipeline({"waveform": waveform, "sample_rate": sample_rate})
+```
+## Monitoring progress
+Hooks are available to monitor the progress of the pipeline:
+```python
+from pyannote.audio.pipelines.utils.hook import ProgressHook
+with ProgressHook() as hook:
+    output = pipeline("audio.wav", hook=hook)
+```
+## Controlling the number of speakers
+In case the number of speakers is known in advance, one can use the `num_speakers` option:
+```python
+output = pipeline("audio.wav", num_speakers=2)
+```
+One can also provide lower and/or upper bounds on the number of speakers using `min_speakers` and `max_speakers` options:
+```python
+output = pipeline("audio.wav", min_speakers=2, max_speakers=5)
+```
+## Exclusive speaker diarization
+`Community-1` pretrained pipeline returns a new *exclusive* speaker diarization, on top of the regular speaker diarization, available as `output.exclusive_speaker_diarization`.
+This is a feature which is [backported from our latest commercial model](https://www.pyannote.ai/blog/precision-2) that simplifies the reconciliation between fine-grained speaker diarization timestamps and (sometimes not so precise) transcription timestamps.
+## Offline use
+1. In the terminal, copy the pipeline on disk:
+```bash
+# make sure git-lfs is installed (https://git-lfs.com)
+git lfs install
+# create a directory on disk
+mkdir /path/to/directory
+# when prompted for a password, use an access token with write permissions.
+# generate one from your settings: https://huggingface.co/settings/tokens
+git clone https://hf.co/pyannote/speaker-diarization-community-1 /path/to/directory/pyannote-speaker-diarization-community-1
+```
+2. In Python, use the pipeline without internet connection:
+```python
+# load pipeline from disk (works without internet connection)
+from pyannote.audio import Pipeline
+pipeline = Pipeline.from_pretrained('/path/to/directory/pyannote-speaker-diarization-community-1')
+# run the pipeline locally on your computer
+output = pipeline("audio.wav")
+```
+## Citations
+1. Speaker segmentation model
+```bibtex
+@inproceedings{Plaquet23,
+  author={Alexis Plaquet and Hervé Bredin},
+  title={{Powerset multi-class cross entropy loss for neural speaker diarization}},
+  year=2023,
+  booktitle={Proc. INTERSPEECH 2023},
+}
+```
+2. Speaker embedding model
+```bibtex
+@inproceedings{Wang2023,
+  title={Wespeaker: A research and production oriented speaker embedding learning toolkit},
+  author={Wang, Hongji and Liang, Chengdong and Wang, Shuai and Chen, Zhengyang and Zhang, Binbin and Xiang, Xu and Deng, Yanlei and Qian, Yanmin},
+  booktitle={ICASSP 2023, IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+  pages={1--5},
+  year={2023},
+  organization={IEEE}
+}
+```
+3. Speaker clustering
+```bibtex
+@article{Landini2022,
+  author={Landini, Federico and Profant, J{\'a}n and Diez, Mireia and Burget, Luk{\'a}{\v{s}}},
+  title={{Bayesian HMM clustering of x-vector sequences (VBx) in speaker diarization: theory, implementation and analysis on standard tasks}},
+  year={2022},
+  journal={Computer Speech \& Language},
+}
+```
+## Acknowledgment
+Training and tuning made possible thanks to [GENCI](https://www.genci.fr/) on the [**Jean Zay**](http://www.idris.fr/eng/jean-zay/) supercomputer.

ckpts/speaker-diarization-community-1/config.yaml ADDED Viewed

	@@ -0,0 +1,21 @@

+dependencies:
+  pyannote.audio: 4.0.0
+pipeline:
+  name: pyannote.audio.pipelines.SpeakerDiarization
+  params:
+    clustering: VBxClustering
+    segmentation: ckpts/speaker-diarization-community-1/segmentation/pytorch_model.bin
+    segmentation_batch_size: 32
+    embedding: ckpts/speaker-diarization-community-1/embedding/pytorch_model.bin
+    embedding_batch_size: 32
+    embedding_exclude_overlap: true
+    plda: ckpts/speaker-diarization-community-1/plda
+params:
+  clustering:
+    threshold: 0.6
+    Fa: 0.07
+    Fb: 0.8
+  segmentation:
+    min_duration_off: 0.0

ckpts/speaker-diarization-community-1/embedding/README.md ADDED Viewed

	@@ -0,0 +1,20 @@

+Copied from https://huggingface.co/pyannote/wespeaker-voxceleb-resnet34-LM
+## License
+According to [this page](https://github.com/wenet-e2e/wespeaker/blob/master/docs/pretrained.md):
+> The pretrained model in WeNet follows the license of it's corresponding dataset. For example, the pretrained model on VoxCeleb follows Creative Commons Attribution 4.0 International License., since it is used as license of the VoxCeleb dataset, see https://mm.kaist.ac.kr/datasets/voxceleb/.
+## Citation
+```bibtex
+@inproceedings{Wang2023,
+  title={Wespeaker: A research and production oriented speaker embedding learning toolkit},
+  author={Wang, Hongji and Liang, Chengdong and Wang, Shuai and Chen, Zhengyang and Zhang, Binbin and Xiang, Xu and Deng, Yanlei and Qian, Yanmin},
+  booktitle={ICASSP 2023, IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+  pages={1--5},
+  year={2023},
+  organization={IEEE}
+}
+```

ckpts/speaker-diarization-community-1/embedding/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6f10ff60898a1d185fa22e1d11e0bfa8a92efec811f11bca48cb8cafebefd929
+size 26646242

ckpts/speaker-diarization-community-1/plda/README.md ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ PLDA model trained by [BUT Speech@FIT](https://speech.fit.vut.cz/) group.
2	+
3	+ Thanks to [Jiangyu Han](https://github.com/jyhan03) and [Petr Pálka](https://github.com/Selesnyan) for the integration of VBx in pyannote.audio.

ckpts/speaker-diarization-community-1/plda/plda.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9b77bcd840692710dd3496f62ecfeed8d8e5f002fd991b785079b244eab7d255
+size 133852

ckpts/speaker-diarization-community-1/plda/xvec_transform.npz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:325f1ce8e48f7e55e9c8aa47e05d2766b7c48c4b25b8de8dd751e7a4cc5fbe8f
+size 134376

ckpts/speaker-diarization-community-1/segmentation/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7ad24338d844fb95985486eb1a464e32d229f6d7a03c9abe60f978bacf3f816e
+size 5906507

ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/.mdl ADDED Viewed

Binary file (85 Bytes). View file

ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/.msc ADDED Viewed

Binary file (991 Bytes). View file

ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/.mv ADDED Viewed

	@@ -0,0 +1 @@


1	+ Revision:v1.0.5,CreatedAt:1682049173

ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/README.md ADDED Viewed

	@@ -0,0 +1,319 @@

+---
+tasks:
+- speaker-diarization
+domain:
+- audio
+model-type:
+- spkdiar
+frameworks:
+- pytorch
+backbone:
+- SOND
+metrics:
+- DER
+license: Apache License 2.0
+language:
+- cn
+tags:
+- SOND
+- SpeakerDiarization
+- Alibaba
+- EMNLP 2022
+datasets:
+  train:
+  - AliMeeting
+  test:
+  - AliMeeting test
+indexing:
+   results:
+   - task:
+       name: Speaker Diarization
+     dataset:
+       name: AliMeeting
+       type: audio
+       args: 16k sampling rate, 3465 speakers, N=16 K=4
+     metrics:
+       - type: DER
+         value: 4.21%
+         description: AliMeeting
+         args: default
+widgets:
+  - task: speaker-diarization
+    inputs:
+      - type: audio-list
+        name: input
+        title: 音频
+    examples:
+      - name: 1
+        title: 示例1
+        inputs:
+          - name: example_1
+            data:
+              - git://example/record.wav
+              - git://example/spk1.wav
+              - git://example/spk2.wav
+              - git://example/spk3.wav
+              - git://example/spk4.wav
+    inferencespec:
+      cpu: 8 #CPU数量
+      memory: 4096
+---
+# Highlights
+会议场景端到端说话人日志模型，解决 "who spoke when" 的问题，发表于EMNLP 2022，在AliMeeting数据集上获得SOTA结果。
+支持功能：
+- 给定若干说话人的声纹信息，识别并追踪语音段中的这些说话人
+- 给定若干说话人的原始语音，识别并追踪语音段中的这些说话人
+# Release Note
+- 2023年1月（预计1月16号发布）：funasr-0.1.6, modelscope-1.1.4
+  - 模型功能完善：
+    - Modelscope模型推理pipeline，新增加多种输入音频方式，如wav.scp、音频bytes、音频采样点、MP3格式、录音笔格式等。
+    - [Paraformer-large模型](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)，新增加基于ModelScope微调定制模型，新增加batch级解码，加快推理速度。
+    - [AISHELL-1学术集Paraformer模型](https://modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/summary)，
+    [AISHELL-1学术集ParaformerBert模型](https://modelscope.cn/models/damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/summary)，
+    [AISHELL-1学术集Conformer模型](https://modelscope.cn/models/damo/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/summary)、
+    [AISHELL-2学术集Paraformer模型](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary)，
+    [AISHELL-2学术集ParaformerBert模型](https://www.modelscope.cn/models/damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary)、
+    [AISHELL-2学术集Conformer模型](https://www.modelscope.cn/models/damo/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary)，
+    新增加基于ModelScope微调定制模型，其中，Paraformer与ParaformerBert模型新增加batch级解码，加快推理速度。
+  - 上线新模型：
+    - [说话人确认模型](https://www.modelscope.cn/models/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/summary) ，可用于说话人确认，也可以用来做说话人特征提取。
+    - [Paraformer-large长音频模型](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)，集成VAD、ASR、标点与时间戳功能，可直接对时长为数小时音频进行识别，并输出带标点文字与时间戳。
+    - [中文无监督预训练Data2vec模型](https://www.modelscope.cn/models/damo/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/summary)，采用Data2vec结构，基于AISHELL-2数据的中文无监督预训练模型，支持ASR或者下游任务微调模型。
+    - [语音端点检测VAD模型](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary)，可用于检测长语音片段中有效语音的起止时间点。
+    - [中文标点预测通用模型](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary)，可用于语音识别模型输出文本的标点预测。
+    - [8K UniASR流式模型](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary)，[8K UniASR模型](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/summary)，一种流式与离线一体化语音识别模型，进行流式语音识别的同时，能够以较低延时输出离线识别结果来纠正预测文本。
+    - Paraformer-large基于[AISHELL-1微调模型](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/summary)、[AISHELL-2微调模型](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/summary)，将Paraformer-large模型分别基于AISHELL-1与AISHELL-2数据微调。
+    - [小尺寸设备端Paraformer指令词模型](https://www.modelscope.cn/models/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/summary)，Paraformer-tiny指令词版本，使用小参数量模型支持指令词识别。
+  - 将原TensorFlow模型升级为Pytorch模型，进行推理，并支持微调定制，包括：
+    - 16K 模型：[Paraformer中文](https://modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary)、
+    [Paraformer-large中文](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary)、
+    [UniASR中文](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/summary)、
+    [UniASR-large中文](https://modelscope.cn/models/damo/speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/summary)、
+    [UniASR中文流式模型](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online/summary)、
+    [UniASR方言](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/summary)、
+    [UniASR方言流式模型](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-online/summary)、
+    [UniASR日语](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/summary)、
+    [UniASR日语流式模型](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online/summary)、
+    [UniASR印尼语](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/summary)、
+    [UniASR印尼语流式模型](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online/summary)、
+    [UniASR葡萄牙语](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/summary)、
+    [UniASR葡萄牙语流式模型](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-online/summary)、
+    [UniASR英文](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/summary)、
+    [UniASR英文流式模型](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-online/summary)、
+    [UniASR俄语](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/summary)、
+    [UniASR俄语流式模型](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-online/summary)、
+    [UniASR韩语](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/summary)、
+    [UniASR韩语流式模型](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-online/summary)、
+    [UniASR西班牙语](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/summary)、
+    [UniASR西班牙语流式模型](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-online/summary)、
+    [UniASR粤语简体](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/files)、
+    [UniASR粤语简体流式模型](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online/files)
+    - 8K 模型：[Paraformer中文](https://modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/summary)、
+    [UniASR中文](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/summary)、
+    [UniASR中文流式模型](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-online/summary)
+- 2022年11月：funasr-0.1.4, modelscope-1.1.3
+  - Paraformer-large非自回归模型上线，多个公开数据集上取得SOTA效果：
+    - 支持基于ModelScope推理。
+    - 支持基于[FunASR框架开源](https://github.com/alibaba-damo-academy/FunASR)微调和推理。
+# 项目介绍
+Speaker Overlap-aware Neural Diarization（SOND）是达摩院语音团队提出的一种高效建模语音重叠的说话人日志模型。本项目提供了在 <a href="http://openslr.org/119/">AliMeeting</a> 中文开源数据集上预训练的 SOND 模型，可以被应用于智能会议分析、对话分析等相关的学术研究。
+<p align="center">
+<img src="fig/sond.png" alt="SOND模型结构"  width="420" />
+</p>
+SOND模型结构如上图所示，包括对语音信息进行编码的 Speech encoder、对说话人信息进行编码的 Speaker encoder、上下文依赖的打分器 CD scorer、上下文无关的打分器 CI scorer以及预测幂集编码的说话人混合网络 SCN。其中：
+- Speech encoder 采用说话人识别任务中常用的 ResNet34网络结构，并采用 windowed statistics pooling 来得到每个时刻的语音特征
+- Speaker encoder 采用 3 层全连接网络来对说话人的声纹信息进行映射，使其与语音特征在同一个特征空间
+- 上下文无关的打分器 CI scorer 通过对比目标说话人与训练集中其他说话人的不同来学习全局的说话人区分性
+- 上下文依赖的打分器 CD scorer 通过对比目标说话人与上下文中其他说话人的不同来学习局部的说话人区分性
+- 说话人混合网络 SCN 以 CI 和 CD 打分为输入，来对不同的说话人组合进行建模。
+- 预测网络为多层FFN，并采用Softmax作为激活函数来对幂集标签 PSE 进行预测
+<p align="center">
+<img src="fig/pse.png" alt="幂集（PSE）标签示例"  width="540" />
+</p>
+更多细节详见：
+- 论文：<a href="https://arxiv.org/abs/2211.10243">Speaker Overlap-aware Neural Diarization for Multi-party Meeting Analysis</a>
+- 论文解读：<a href="https://mp.weixin.qq.com/s/iU09MDjcFTaIJXIjc9isIA">EMNLP 2022论文解读 | SOND：基于显式语音重叠建模的说话人日志模型</a>。
+- 数据集：<a href="https://arxiv.org/abs/2110.07393">M2MeT: The ICASSP 2022 Multi-Channel Multi-Party Meeting Transcription Challenge</a>
+# 如何使用模型
+## 在线快速体验（集成中）
+在页面右侧，可以在“在线体验”栏内看到我们预先准备好的示例音频，点击播放按钮可以试听，点击“执行测试”按钮，会在下方“测试结果”栏中显示多个说话人的语音活动区间。如果您想要测试自己的音频，可点击“更换音频”按钮，选择上传或录制一段音频，完成后点击执行测试，各个说话人的语音活动区间将会在测试结果栏中显示。
+## 在Notebook中推理（集成中）
+对于灵活调用有需求的开发者，我们推荐您使用Notebook进行处理。首先登录ModelScope账号，点击模型页面右上角的“在Notebook中打开”按钮出现对话框，首次使用会提示您关联阿里云账号，按提示操作即可。关联账号后可进入选择启动实例界面，选择计算资源，建立实例，待实例创建完成后进入开发环境，输入api调用实例。
+- 使用本模型进行说话人日志任务：
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+import numpy as np
+# 初始化推理 pipeline
+# 当以原始音频作为输入时使用配置文件 sond.yaml，并设置 mode 为sond_demo
+inference_diar_pipline = pipeline(
+    mode="sond_demo",
+    num_workers=0,
+    task=Tasks.speaker_diarization,
+    diar_model_config="sond.yaml",
+    model='damo/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch',
+    model_revision="v1.0.5",
+    sv_model="damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch",
+    sv_model_revision="v1.2.2",
+)
+# 以 audio_list 作为输入，其中第一个音频为待检测语音，后面的音频为不同说话人的声纹注册语音
+audio_list=[
+"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/speaker_diarization/record.wav",
+"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/speaker_diarization/spk1.wav",
+"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/speaker_diarization/spk2.wav",
+"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/speaker_diarization/spk3.wav",
+"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/speaker_diarization/spk4.wav",
+]
+results = inference_diar_pipline(audio_in=audio_list)
+print(results)
+```
+- 除了url表示的网络wav文件，还可使用本地磁盘上的wav文件：
+```python
+audio_list=[
+    "example/record.wav",
+    "example/spk1.wav",
+    "example/spk2.wav",
+    "example/spk3.wav",
+    "example/spk4.wav"
+]
+results = inference_diar_pipline(audio_in=audio_list)
+```
+- 使用本模型进行批量说话人日志：
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+import numpy as np
+# 初始化推理 pipeline
+# 当输入为 fbank 特征时使用的配置文件 sond_fbank.yaml
+# output_dir 为结果保存路径
+inference_diar_pipline = pipeline(
+    mode="sond",
+    output_dir="outputs",
+    diar_model_config="sond_fbank.yaml",
+    task=Tasks.speaker_diarization,
+    model='speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch',
+    num_workers=1
+)
+# feats.scp 包括 fbank 特征
+# utt1 path/to/feats.ark:xxx
+# utt2 path/to/feats.ark:xxx
+# profile.scp 包括基于 xvector的 speaker embedding, 可使用ModelScope上的如下模型 speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch 进行提取
+# utt1 path/to/profile.ark:xxx
+# utt2 path/to/profile.ark:xxx
+data_path_and_name_and_type = [
+        ("data/test_rmsil/feats.scp", "speech", "kaldi_ark"),
+        ("data/test_rmsil/profile.scp", "profile", "kaldi_ark")
+]
+inference_diar_pipline(audio_in=data_path_and_name_and_type)
+# 也可以使用我们已经准备好的特征文件进行推理，下载路径为
+# https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/alimeeting_test_data_for_sond.tar.gz
+```
+## 在本地机器中推理
+如果您有本地推理或定制模型的需求，可以前往下载FunASR语音处理框架，不仅涵盖语音识别、端点检测、说话人确认及日志等多种模型，还支持ModelScope开源模型的推理，使研究人员和开发者可以更加便捷地进行模型研究和生产，目前已在GitHub开源：<a href="https://github.com/alibaba-damo-academy/FunASR">FunASR</a>。
+### FunASR框架安装
+- 安装FunASR和ModelScope
+```sh
+# 安装 Pytorch GPU (version >= 1.7.0):
+conda install pytorch==1.7.0 torchvision==0.8.0 torchaudio==0.7.0 cudatoolkit=9.2 -c pytorch
+# 对于其他版本，请参考 https://pytorch.org/get-started/locally
+# 安装 ModelScope 包:
+pip install "modelscope[audio_asr]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+# 下载项目代码:
+git clone https://github.com/alibaba/FunASR.git && cd FunASR
+# 安装 FunASR:
+pip install --editable ./
+```
+### 基于ModelScope进行推理（集成中）
+- 在上面的安装完成后，就可以使用ModelScope进行推理了，可运行如下命令测试各种用法：
+```sh
+cd egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch
+python unit_test.py
+```
+### 使用 FunASR 在 AliMeeting 测试集上进行性能评估（已完成）
+接下来以 AliMeeting 数据集为例，介绍如何使用 FunASR 对模型的 DER 性能指标进行评估：
+```sh
+# 进入工作目录
+cd egs/alimeeting/diarization/sond
+# 进行评估，结束后您将会得到大概 4.21% 的 DER 结果
+sh ./run.sh
+```
+### 基于ModelScope进行模型微调
+训练和微调功能正在开发中，敬请期待。
+# Benchmark
+## 训练配置
+- Feature info: using 80 dims fbank, no cmvn
+- Train info: noam, lr 1.0, batch_size 32, 2 gpu(Tesla V100), acc_grad 1, 200000 steps, clip_gradient_norm 5.0
+- Loss info: cross entropy softmax, speaker discriminative loss
+- Model info: ResNet34, FFN, SAN, FSMN, windowed statistics pooling
+- Train config: sond.yaml
+- Model size: 19.09 M parameters
+## 实验结果 (DER)
+- Test set: Alimeeting-test
+|       testset         | DER(%) |
+|:---------------------:|:------:|
+|    Alimeeting-test    |  4.21  |
+## 使用方式以及适用范围
+运行范围
+- 现阶段只能在 Linux-x86_64 运行，暂未适配 Mac 和 Windows 。
+使用方式
+- 直接推理：可以直接使用原始语音和说话人注册语音作为输入，识别输出各个说话人的活动区间。
+- 微调：正在开发中。
+使用范围与目标场景
+- 适用于相关学术研究，在 AliMeeting 数据集上进行说话人日志、识别等任务。
+## 模型局限性以及可能的偏差
+- 特征提取流程和工具差异，会对 DER 的数值带来一定的差异（ < 0.1% ）
+- 识别语音输入应为接近 16 秒的无静音数据，注册语音长度应大于 2 秒
+## 相关论文以及引用信息
+```BibTeX
+@inproceedings{du2022sond,
+  title={Speaker Overlap-aware Neural Diarization for Multi-party Meeting Analysis},
+  author={Du, Zhihao and Zhang, Shiliang and Zheng, Siqi and Yan, Zhijie},
+  booktitle={EMNLP},
+  year={2022}
+}
+```

ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/configuration.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+    "framework": "pytorch",
+    "task": "speaker-diarization",
+    "model": {
+        "type": "generic-sv",
+        "model_name": "sond.pth",
+        "model_config": {
+            "type": "pytorch",
+            "code_base": "funasr",
+            "mode": "sond_demo",
+            "lang": "zh-cn",
+            "batch_size": 1,
+            "diar_model_name": "sond.pth",
+            "diar_model_config": "sond.yaml",
+            "model": "damo/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch",
+            "model_revision": "v1.0.5",
+            "param_dict": {
+                "extract_profile": true
+            },
+            "num_workers": 0
+        }
+    },
+    "pipeline": {
+        "type": "speaker-diarization-inference"
+    }
+}

ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/record.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1aeb5e000ff0c1111cb7a3235b38d4b368f62b81085b1ef0766aece60484aa22
+size 512044

ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk1.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb204bcf1c54c5223fb088beb832e3270ce0df4f38f644eed9a76601c73563c9
+size 281964

ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk2.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:983d49e9f5719224d5c0355b1792374ec87c12e3fe97e1b539b9b0f1e52684a4
+size 307564

ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk3.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0bab18fb205bda339c4dd70910a5a0d8ae1cc0d2b3d11c5da4528abaf1790182
+size 317484

ckpts/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/example/spk4.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9877bd833b5b3758ae7860b72a7dbdd9a73e0e785b59df60bbf8e53a1ab009c8
+size 299564