Sherpa-CTC (ru)
Browse files- .gitattributes +2 -0
- ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/.gitattributes +36 -0
- ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/GigaAM%2520License_NC.pdf +0 -0
- ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/README.md +10 -0
- ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/export-onnx-ctc.py +118 -0
- ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/model.int8.onnx +3 -0
- ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/run-ctc.sh +40 -0
- ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/test-onnx-ctc.py +155 -0
- ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/test_wavs/example.wav +3 -0
- ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/test_wavs/long_example.wav +3 -0
- ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/tokens.txt +34 -0
.gitattributes
CHANGED
|
@@ -57,3 +57,5 @@ en/sherpa-onnx-nemo-streaming-fast-conformer-ctc-en-1040ms/test_wavs/0.wav filte
|
|
| 57 |
en/sherpa-onnx-nemo-streaming-fast-conformer-ctc-en-1040ms/test_wavs/1.wav filter=lfs diff=lfs merge=lfs -text
|
| 58 |
ja/sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8/test_wavs/test_ja_1.wav filter=lfs diff=lfs merge=lfs -text
|
| 59 |
ja/sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8/test_wavs/test_ja_2.wav filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 57 |
en/sherpa-onnx-nemo-streaming-fast-conformer-ctc-en-1040ms/test_wavs/1.wav filter=lfs diff=lfs merge=lfs -text
|
| 58 |
ja/sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8/test_wavs/test_ja_1.wav filter=lfs diff=lfs merge=lfs -text
|
| 59 |
ja/sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8/test_wavs/test_ja_2.wav filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/test_wavs/example.wav filter=lfs diff=lfs merge=lfs -text
|
| 61 |
+
ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/test_wavs/long_example.wav filter=lfs diff=lfs merge=lfs -text
|
ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/.gitattributes
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.wav filter=lfs diff=lfs merge=lfs -text
|
ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/GigaAM%2520License_NC.pdf
ADDED
|
Binary file (91.4 kB). View file
|
|
|
ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/README.md
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Introduction
|
| 2 |
+
|
| 3 |
+
This folder contains scripts for converting models from
|
| 4 |
+
https://github.com/salute-developers/GigaAM
|
| 5 |
+
to sherpa-onnx.
|
| 6 |
+
|
| 7 |
+
The ASR models in this folder are for Russian speech recognition.
|
| 8 |
+
|
| 9 |
+
Please see the license of the models at
|
| 10 |
+
https://github.com/salute-developers/GigaAM/blob/main/LICENSE
|
ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/export-onnx-ctc.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
|
| 3 |
+
from typing import Dict
|
| 4 |
+
|
| 5 |
+
import onnx
|
| 6 |
+
import torch
|
| 7 |
+
import torchaudio
|
| 8 |
+
from nemo.collections.asr.models import EncDecCTCModel
|
| 9 |
+
from nemo.collections.asr.modules.audio_preprocessing import (
|
| 10 |
+
AudioToMelSpectrogramPreprocessor as NeMoAudioToMelSpectrogramPreprocessor,
|
| 11 |
+
)
|
| 12 |
+
from nemo.collections.asr.parts.preprocessing.features import (
|
| 13 |
+
FilterbankFeaturesTA as NeMoFilterbankFeaturesTA,
|
| 14 |
+
)
|
| 15 |
+
from onnxruntime.quantization import QuantType, quantize_dynamic
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class FilterbankFeaturesTA(NeMoFilterbankFeaturesTA):
    """NeMo torchaudio filterbank featurizer with a selectable mel scale.

    After the parent constructor has run, ``_mel_spec_extractor`` is assigned
    a ``torchaudio.transforms.MelSpectrogram`` built from the same keyword
    arguments plus ``mel_scale`` and ``wkwargs``.

    Args:
      mel_scale:
        Forwarded to ``MelSpectrogram(mel_scale=...)``. Defaults to "htk".
      wkwargs:
        Forwarded to ``MelSpectrogram(wkwargs=...)``.
      kwargs:
        Arguments for the NeMo parent class. ``nfilt``, ``window``,
        ``mel_norm`` and ``n_fft`` must be present; ``highfreq`` and
        ``lowfreq`` are optional.
    """

    def __init__(self, mel_scale: str = "htk", wkwargs=None, **kwargs):
        # These two keys are removed before the parent constructor is called.
        # NOTE(review): presumably the parent does not accept them - confirm.
        for unwanted in ("window_size", "window_stride"):
            kwargs.pop(unwanted, None)

        super().__init__(**kwargs)

        extractor_options = dict(
            sample_rate=self._sample_rate,
            win_length=self.win_length,
            hop_length=self.hop_length,
            n_mels=kwargs["nfilt"],
            window_fn=self.torch_windows[kwargs["window"]],
            mel_scale=mel_scale,
            norm=kwargs["mel_norm"],
            n_fft=kwargs["n_fft"],
            f_max=kwargs.get("highfreq", None),
            f_min=kwargs.get("lowfreq", 0),
            wkwargs=wkwargs,
        )
        self._mel_spec_extractor: torchaudio.transforms.MelSpectrogram = (
            torchaudio.transforms.MelSpectrogram(**extractor_options)
        )
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class AudioToMelSpectrogramPreprocessor(NeMoAudioToMelSpectrogramPreprocessor):
    """NeMo mel-spectrogram preprocessor that uses ``FilterbankFeaturesTA``.

    Args:
      mel_scale:
        Passed through to ``FilterbankFeaturesTA``. Defaults to "htk".
      kwargs:
        Arguments for the NeMo parent class. Must contain ``features``,
        which is handed to the featurizer under the name ``nfilt``.
    """

    def __init__(self, mel_scale: str = "htk", **kwargs):
        super().__init__(**kwargs)

        # The featurizer takes the same value under the key `nfilt`.
        kwargs["nfilt"] = kwargs.pop("features")

        # Deprecated arguments; kept for config compatibility
        self.featurizer = FilterbankFeaturesTA(mel_scale=mel_scale, **kwargs)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def add_meta_data(filename: str, meta_data: Dict[str, str]):
    """Replace the metadata of an ONNX model file. The file is rewritten in-place.

    Args:
      filename:
        Path of the ONNX model to update.
      meta_data:
        Key-value pairs to store. Values are converted with ``str()``.
    """
    onnx_model = onnx.load(filename)

    # Discard every metadata entry already stored in the model.
    del onnx_model.metadata_props[:]

    for name, payload in meta_data.items():
        entry = onnx_model.metadata_props.add()
        entry.key = name
        entry.value = str(payload)

    onnx.save(onnx_model, filename)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
@torch.no_grad()
def main():
    """Export the GigaAM CTC model to ONNX and write its companion files.

    Reads ``./ctc_model_config.yaml`` and ``./ctc_model_weights.ckpt`` from
    the current directory and writes:

      - ``tokens.txt``: one ``<token> <id>`` line per label, then ``<blk>``
      - ``model.onnx``: the exported model with metadata attached
      - ``model.int8.onnx``: a dynamically quantized copy of ``model.onnx``
    """
    model = EncDecCTCModel.from_config_file("./ctc_model_config.yaml")
    ckpt = torch.load("./ctc_model_weights.ckpt", map_location="cpu")
    # NOTE(review): strict=False means missing or unexpected checkpoint keys
    # do not raise here - confirm that this is intended.
    model.load_state_dict(ckpt, strict=False)
    model.eval()

    # The blank id and the vocabulary size are both derived from this one
    # list, so they cannot disagree and an empty label list no longer fails
    # on an unbound loop variable.
    labels = list(model.cfg.labels)
    blank_id = len(labels)

    # use characters
    # space is 0
    # <blk> is the last token
    with open("tokens.txt", "w", encoding="utf-8") as f:
        for i, t in enumerate(labels):
            f.write(f"{t} {i}\n")
        f.write(f"<blk> {blank_id}\n")

    filename = "model.onnx"
    model.export(filename)

    meta_data = {
        "vocab_size": blank_id + 1,
        "normalize_type": "",
        "subsampling_factor": 4,
        "model_type": "EncDecCTCModel",
        "version": "1",
        "model_author": "https://github.com/salute-developers/GigaAM",
        "license": "https://github.com/salute-developers/GigaAM/blob/main/GigaAM%20License_NC.pdf",
        "language": "Russian",
        "is_giga_am": 1,
    }
    add_meta_data(filename, meta_data)

    filename_int8 = "model.int8.onnx"
    quantize_dynamic(
        model_input=filename,
        model_output=filename_int8,
        weight_type=QuantType.QUInt8,
    )


if __name__ == "__main__":
    main()
|
ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/model.int8.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a5fa9d5e00845f1c57ca9033edfa50af7c7792559393c2ac9d03a7eaa6c97611
|
| 3 |
+
size 274808098
|
ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/run-ctc.sh
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
#
# Installs the Python dependencies, downloads the GigaAM CTC checkpoint and
# test data into the current directory, exports the model to ONNX and then
# runs the ONNX test script.

set -ex

# Install pip, PyTorch, the helper packages and NeMo from its git repository.
function install_nemo() {
  # --fail makes curl exit non-zero on an HTTP error instead of saving the
  # error page, so `set -e` can stop the script.
  curl -fSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py
  python3 get-pip.py

  pip install torch==2.4.0 torchaudio==2.4.0 -f https://download.pytorch.org/whl/torch_stable.html

  pip install -qq wget text-unidecode "matplotlib>=3.3.2" onnx onnxruntime==1.17.1 pybind11 Cython einops kaldi-native-fbank soundfile librosa
  pip install -qq ipython

  # sudo apt-get install -q -y sox libsndfile1 ffmpeg python3-pip ipython

  BRANCH='main'
  # Quoted so that the shell never treats `[asr]` as a glob pattern.
  python3 -m pip install "git+https://github.com/NVIDIA/NeMo.git@${BRANCH}#egg=nemo_toolkit[asr]"

  pip install numpy==1.26.4
}

# Download the checkpoint, its config, the test wavs and the license file.
function download_files() {
  # Earlier download location, kept for reference:
  # curl -SL -O https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/ctc_model_weights.ckpt
  # curl -SL -O https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/ctc_model_config.yaml
  # curl -SL -O https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/example.wav
  # curl -SL -O https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/long_example.wav
  local base_url="https://huggingface.co/csukuangfj/tmp-files/resolve/main/GigaAM"
  local remote_path
  for remote_path in \
    ctc/ctc_model_weights.ckpt \
    ctc/ctc_model_config.yaml \
    example.wav \
    long_example.wav \
    "GigaAM%20License_NC.pdf"; do
    # --fail: abort on an HTTP error rather than saving the error body as
    # if it were the requested file.
    curl -fSL -O "${base_url}/${remote_path}"
  done
}

install_nemo
download_files

python3 ./export-onnx-ctc.py
ls -lh
python3 ./test-onnx-ctc.py
|
ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/test-onnx-ctc.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
|
| 3 |
+
|
| 4 |
+
# https://github.com/salute-developers/GigaAM
|
| 5 |
+
|
| 6 |
+
import kaldi_native_fbank as knf
|
| 7 |
+
import librosa
|
| 8 |
+
import numpy as np
|
| 9 |
+
import onnxruntime as ort
|
| 10 |
+
import soundfile as sf
|
| 11 |
+
import torch
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def create_fbank():
    """Create the kaldi-native-fbank online extractor used by this script.

    Returns:
      A ``knf.OnlineFbank`` configured with a hann window, no dithering,
      no DC-offset removal, no pre-emphasis, no rounding of the window to a
      power of two, and 64 mel bins covering 0 to 8000 Hz.
    """
    opts = knf.FbankOptions()

    frame_settings = {
        "dither": 0,
        "remove_dc_offset": False,
        "preemph_coeff": 0,
        "window_type": "hann",
        "round_to_power_of_two": False,
    }
    for option, value in frame_settings.items():
        setattr(opts.frame_opts, option, value)

    mel_settings = {
        "low_freq": 0,
        "high_freq": 8000,
        "num_bins": 64,
    }
    for option, value in mel_settings.items():
        setattr(opts.mel_opts, option, value)

    return knf.OnlineFbank(opts)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def compute_features(audio, fbank) -> np.ndarray:
    """Feed a whole waveform to ``fbank`` and collect every ready frame.

    Args:
      audio: (num_samples,), np.float32. Passed to the extractor with a
        sample rate of 16000.
      fbank: the fbank extractor
    Returns:
      features: (num_frames, feat_dim), np.float32
    """
    assert len(audio.shape) == 1, audio.shape
    fbank.accept_waveform(16000, audio)

    # One row per frame the extractor reports as ready.
    frames = [
        np.array(fbank.get_frame(index))
        for index in range(fbank.num_frames_ready)
    ]
    return np.stack(frames)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def display(sess):
    """Print every input and then every output reported by ``sess``.

    Args:
      sess: an object with ``get_inputs()`` and ``get_outputs()`` methods,
        e.g. an onnxruntime inference session.
    """
    sections = (
        ("==========Input==========", sess.get_inputs),
        ("==========Output==========", sess.get_outputs),
    )
    for banner, list_nodes in sections:
        print(banner)
        for node in list_nodes():
            print(node)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
"""
|
| 60 |
+
==========Input==========
|
| 61 |
+
NodeArg(name='audio_signal', type='tensor(float)', shape=['audio_signal_dynamic_axes_1', 64, 'audio_signal_dynamic_axes_2'])
|
| 62 |
+
NodeArg(name='length', type='tensor(int64)', shape=['length_dynamic_axes_1'])
|
| 63 |
+
==========Output==========
|
| 64 |
+
NodeArg(name='logprobs', type='tensor(float)', shape=['logprobs_dynamic_axes_1', 'logprobs_dynamic_axes_2', 34])
|
| 65 |
+
"""
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class OnnxModel:
    """Single-threaded CPU onnxruntime wrapper around the exported CTC model."""

    def __init__(
        self,
        filename: str,
    ):
        """Load ``filename`` and print the session's inputs and outputs.

        Args:
          filename: path of the ONNX model file.
        """
        opts = ort.SessionOptions()
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1

        self.model = ort.InferenceSession(
            filename,
            sess_options=opts,
            providers=["CPUExecutionProvider"],
        )
        display(self.model)

    def __call__(self, x: np.ndarray):
        """Run the model on one utterance.

        Args:
          x: (T, C) feature matrix.
        Returns:
          The first model output, [batch_size, T, dim].
        """
        # (T, C) -> [1, C, T]
        features = torch.from_numpy(x).t().unsqueeze(0)
        num_frames = torch.tensor([features.shape[-1]], dtype=torch.int64)

        wanted_outputs = [self.model.get_outputs()[0].name]
        input_nodes = self.model.get_inputs()
        feed = {
            input_nodes[0].name: features.numpy(),
            input_nodes[1].name: num_frames.numpy(),
        }

        results = self.model.run(wanted_outputs, feed)
        return results[0]
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def main():
    """Decode ``./example.wav`` with ``./model.int8.onnx`` and print the text.

    Tokens are read from ``./tokens.txt``. Decoding is greedy CTC:
    consecutive repeats are collapsed and the blank id is dropped.
    """
    model_path = "./model.int8.onnx"
    tokens_path = "./tokens.txt"
    wav = "./example.wav"

    model = OnnxModel(model_path)

    id2token = {}
    with open(tokens_path, encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if len(fields) == 1:
                # A line with an id but no visible symbol is the space token.
                id2token[int(fields[0])] = " "
                continue
            symbol, idx = fields
            id2token[int(idx)] = symbol

    fbank = create_fbank()
    audio, sample_rate = sf.read(wav, dtype="float32", always_2d=True)
    audio = audio[:, 0]  # only use the first channel
    if sample_rate != 16000:
        audio = librosa.resample(
            audio,
            orig_sr=sample_rate,
            target_sr=16000,
        )
        sample_rate = 16000

    features = compute_features(audio, fbank)
    print("features.shape", features.shape)

    # The blank id is taken to be the highest id in the token table.
    blank = len(id2token) - 1

    log_probs = model(features)
    print("log_probs", log_probs.shape)
    frame_ids = torch.argmax(torch.from_numpy(log_probs)[0], dim=1).tolist()

    # Pair each frame's id with the id of the frame before it (-1 for the
    # first frame) and keep ids that are neither blank nor a repeat.
    previous_ids = [-1] + frame_ids[:-1]
    kept = [
        current
        for before, current in zip(previous_ids, frame_ids)
        if current != blank and current != before
    ]

    text = "".join(id2token[i] for i in kept)
    print(wav)
    print(text)


if __name__ == "__main__":
    main()
|
ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/test_wavs/example.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d8aaaa18a5098d7c6de0595ae7ac1e64cacd0d4022af3595213bdaf23be77e69
|
| 3 |
+
size 361324
|
ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/test_wavs/long_example.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1868ece0195dfa9fc2394be24865d1133c8452a8292a397db45ba8c3ed9e01e3
|
| 3 |
+
size 2280044
|
ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/tokens.txt
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
0
|
| 2 |
+
а 1
|
| 3 |
+
б 2
|
| 4 |
+
в 3
|
| 5 |
+
г 4
|
| 6 |
+
д 5
|
| 7 |
+
е 6
|
| 8 |
+
ж 7
|
| 9 |
+
з 8
|
| 10 |
+
и 9
|
| 11 |
+
й 10
|
| 12 |
+
к 11
|
| 13 |
+
л 12
|
| 14 |
+
м 13
|
| 15 |
+
н 14
|
| 16 |
+
о 15
|
| 17 |
+
п 16
|
| 18 |
+
р 17
|
| 19 |
+
с 18
|
| 20 |
+
т 19
|
| 21 |
+
у 20
|
| 22 |
+
ф 21
|
| 23 |
+
х 22
|
| 24 |
+
ц 23
|
| 25 |
+
ч 24
|
| 26 |
+
ш 25
|
| 27 |
+
щ 26
|
| 28 |
+
ъ 27
|
| 29 |
+
ы 28
|
| 30 |
+
ь 29
|
| 31 |
+
э 30
|
| 32 |
+
ю 31
|
| 33 |
+
я 32
|
| 34 |
+
<blk> 33
|