niobures committed on
Commit
f03df43
·
verified ·
1 Parent(s): 3a97f52

Sherpa-CTC (ru)

Browse files
.gitattributes CHANGED
@@ -57,3 +57,5 @@ en/sherpa-onnx-nemo-streaming-fast-conformer-ctc-en-1040ms/test_wavs/0.wav filte
57
  en/sherpa-onnx-nemo-streaming-fast-conformer-ctc-en-1040ms/test_wavs/1.wav filter=lfs diff=lfs merge=lfs -text
58
  ja/sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8/test_wavs/test_ja_1.wav filter=lfs diff=lfs merge=lfs -text
59
  ja/sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8/test_wavs/test_ja_2.wav filter=lfs diff=lfs merge=lfs -text
 
 
 
57
  en/sherpa-onnx-nemo-streaming-fast-conformer-ctc-en-1040ms/test_wavs/1.wav filter=lfs diff=lfs merge=lfs -text
58
  ja/sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8/test_wavs/test_ja_1.wav filter=lfs diff=lfs merge=lfs -text
59
  ja/sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8/test_wavs/test_ja_2.wav filter=lfs diff=lfs merge=lfs -text
60
+ ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/test_wavs/example.wav filter=lfs diff=lfs merge=lfs -text
61
+ ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/test_wavs/long_example.wav filter=lfs diff=lfs merge=lfs -text
ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/GigaAM%2520License_NC.pdf ADDED
Binary file (91.4 kB). View file
 
ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Introduction
2
+
3
+ This folder contains scripts for converting models from
4
+ https://github.com/salute-developers/GigaAM
5
+ to sherpa-onnx.
6
+
7
+ The ASR models in this folder are for Russian speech recognition.
8
+
9
+ Please see the license of the models at
10
+ https://github.com/salute-developers/GigaAM/blob/main/LICENSE
ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/export-onnx-ctc.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
3
+ from typing import Dict
4
+
5
+ import onnx
6
+ import torch
7
+ import torchaudio
8
+ from nemo.collections.asr.models import EncDecCTCModel
9
+ from nemo.collections.asr.modules.audio_preprocessing import (
10
+ AudioToMelSpectrogramPreprocessor as NeMoAudioToMelSpectrogramPreprocessor,
11
+ )
12
+ from nemo.collections.asr.parts.preprocessing.features import (
13
+ FilterbankFeaturesTA as NeMoFilterbankFeaturesTA,
14
+ )
15
+ from onnxruntime.quantization import QuantType, quantize_dynamic
16
+
17
+
18
class FilterbankFeaturesTA(NeMoFilterbankFeaturesTA):
    """NeMo torchaudio filterbank featurizer with a selectable mel scale.

    The parent constructor is called with the given keyword arguments
    (minus ``window_size`` / ``window_stride``), after which
    ``self._mel_spec_extractor`` is assigned a freshly built
    ``torchaudio.transforms.MelSpectrogram`` that honours ``mel_scale``.

    Args:
      mel_scale:
        Passed to ``MelSpectrogram`` as ``mel_scale``. Defaults to "htk".
      wkwargs:
        Passed to ``MelSpectrogram`` as ``wkwargs``.
      **kwargs:
        Forwarded to the NeMo parent class. ``nfilt``, ``window``,
        ``mel_norm`` and ``n_fft`` are required; ``highfreq`` and
        ``lowfreq`` are optional.
    """

    def __init__(self, mel_scale: str = "htk", wkwargs=None, **kwargs):
        # These two options are discarded before the parent sees them.
        for unused in ("window_size", "window_stride"):
            kwargs.pop(unused, None)

        super().__init__(**kwargs)

        # NOTE(review): presumably this overrides the extractor the parent
        # constructor created -- confirm against the NeMo implementation.
        extractor = torchaudio.transforms.MelSpectrogram(
            sample_rate=self._sample_rate,
            win_length=self.win_length,
            hop_length=self.hop_length,
            n_mels=kwargs["nfilt"],
            window_fn=self.torch_windows[kwargs["window"]],
            mel_scale=mel_scale,
            norm=kwargs["mel_norm"],
            n_fft=kwargs["n_fft"],
            f_max=kwargs.get("highfreq"),
            f_min=kwargs.get("lowfreq", 0),
            wkwargs=wkwargs,
        )
        self._mel_spec_extractor: torchaudio.transforms.MelSpectrogram = extractor
42
+
43
+
44
class AudioToMelSpectrogramPreprocessor(NeMoAudioToMelSpectrogramPreprocessor):
    """NeMo mel-spectrogram preprocessor that uses ``FilterbankFeaturesTA``.

    Args:
      mel_scale:
        Forwarded to ``FilterbankFeaturesTA``. Defaults to "htk".
      **kwargs:
        Forwarded unchanged to the NeMo parent class. Must contain
        ``features``, which is handed to the featurizer as ``nfilt``.
    """

    def __init__(self, mel_scale: str = "htk", **kwargs):
        super().__init__(**kwargs)

        # The featurizer takes the number of mel bins as "nfilt" rather than
        # "features"; rename the key before forwarding the remaining options.
        kwargs["nfilt"] = kwargs.pop("features")

        # Deprecated arguments; kept for config compatibility
        self.featurizer = FilterbankFeaturesTA(mel_scale=mel_scale, **kwargs)
55
+
56
+
57
def add_meta_data(filename: str, meta_data: Dict[str, str]):
    """Replace the metadata of an ONNX model file. The file is rewritten in-place.

    Any metadata entries already stored in the model are discarded first.

    Args:
      filename:
        Filename of the ONNX model to be changed.
      meta_data:
        Key-value pairs. Values are converted with ``str()`` before storing.
    """
    onnx_model = onnx.load(filename)

    # Start from a clean slate so stale entries never survive.
    del onnx_model.metadata_props[:]

    for name, content in meta_data.items():
        onnx_model.metadata_props.add(key=name, value=str(content))

    onnx.save(onnx_model, filename)
76
+
77
+
78
@torch.no_grad()
def main():
    """Export the GigaAM CTC model to ONNX and write its companion files.

    Reads ``./ctc_model_config.yaml`` and ``./ctc_model_weights.ckpt`` from
    the current directory and produces:

      - ``tokens.txt``: one "<symbol> <id>" line per label, followed by the
        blank token ``<blk>`` as the last id.
      - ``model.onnx``: the exported model with sherpa-onnx metadata attached.
      - ``model.int8.onnx``: a dynamically quantized copy of ``model.onnx``.
    """
    model = EncDecCTCModel.from_config_file("./ctc_model_config.yaml")
    # NOTE(review): torch.load unpickles the checkpoint, and run-ctc.sh
    # fetches that file over the network. Only run this on a checkpoint you
    # trust; consider weights_only=True if the checkpoint is a plain
    # state dict.
    ckpt = torch.load("./ctc_model_weights.ckpt", map_location="cpu")
    model.load_state_dict(ckpt, strict=False)
    model.eval()

    labels = list(model.cfg.labels)
    # The blank id is derived from the label count instead of the leaked loop
    # variable, so it is defined even for an empty label list and always
    # agrees with "vocab_size" below.
    blank_id = len(labels)

    # use characters
    # space is 0
    # <blk> is the last token
    with open("tokens.txt", "w", encoding="utf-8") as f:
        for i, t in enumerate(labels):
            f.write(f"{t} {i}\n")
        f.write(f"<blk> {blank_id}\n")

    filename = "model.onnx"
    model.export(filename)

    meta_data = {
        "vocab_size": blank_id + 1,
        "normalize_type": "",
        "subsampling_factor": 4,
        "model_type": "EncDecCTCModel",
        "version": "1",
        "model_author": "https://github.com/salute-developers/GigaAM",
        "license": "https://github.com/salute-developers/GigaAM/blob/main/GigaAM%20License_NC.pdf",
        "language": "Russian",
        "is_giga_am": 1,
    }
    add_meta_data(filename, meta_data)

    filename_int8 = "model.int8.onnx"
    quantize_dynamic(
        model_input=filename,
        model_output=filename_int8,
        weight_type=QuantType.QUInt8,
    )


if __name__ == "__main__":
    main()
ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/model.int8.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5fa9d5e00845f1c57ca9033edfa50af7c7792559393c2ac9d03a7eaa6c97611
3
+ size 274808098
ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/run-ctc.sh ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
3
+
4
+ set -ex
5
+
6
# Install pip, PyTorch, the ONNX tooling and NeMo (ASR extras) for python3.
function install_nemo() {
  # --fail makes curl exit non-zero on an HTTP error instead of saving the
  # error page as get-pip.py.
  curl --fail https://bootstrap.pypa.io/get-pip.py -o get-pip.py
  python3 get-pip.py

  # Always invoke pip as "python3 -m pip" so every package lands in the same
  # interpreter that later runs the export and test scripts.
  python3 -m pip install torch==2.4.0 torchaudio==2.4.0 -f https://download.pytorch.org/whl/torch_stable.html

  python3 -m pip install -qq wget text-unidecode "matplotlib>=3.3.2" onnx onnxruntime==1.17.1 pybind11 Cython einops kaldi-native-fbank soundfile librosa
  python3 -m pip install -qq ipython

  # System packages that may also be needed (not installed by this script):
  # sudo apt-get install -q -y sox libsndfile1 ffmpeg python3-pip ipython

  local branch='main'
  # Quoted on purpose: unquoted, "[asr]" is a shell glob pattern.
  python3 -m pip install "git+https://github.com/NVIDIA/NeMo.git@${branch}#egg=nemo_toolkit[asr]"

  # NOTE(review): presumably pinned last so it overrides whatever numpy the
  # packages above pulled in -- confirm.
  python3 -m pip install numpy==1.26.4
}
22
+
23
# Download the GigaAM CTC checkpoint, its config, the test waves and the
# license into the current directory.
function download_files() {
  # The files were originally published under
  #   https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/
  # (ctc_model_weights.ckpt, ctc_model_config.yaml, example.wav,
  # long_example.wav); the Hugging Face mirror below is used instead.
  local base_url=https://huggingface.co/csukuangfj/tmp-files/resolve/main/GigaAM
  local path

  for path in \
    ctc/ctc_model_weights.ckpt \
    ctc/ctc_model_config.yaml \
    example.wav \
    long_example.wav \
    GigaAM%20License_NC.pdf
  do
    # --fail: abort (via set -e) on an HTTP error rather than saving the
    # error page under the expected file name.
    curl --fail -SL -O "${base_url}/${path}"
  done
}
34
+
35
+ install_nemo
36
+ download_files
37
+
38
+ python3 ./export-onnx-ctc.py
39
+ ls -lh
40
+ python3 ./test-onnx-ctc.py
ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/test-onnx-ctc.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
3
+
4
+ # https://github.com/salute-developers/GigaAM
5
+
6
+ import kaldi_native_fbank as knf
7
+ import librosa
8
+ import numpy as np
9
+ import onnxruntime as ort
10
+ import soundfile as sf
11
+ import torch
12
+
13
+
14
+ def create_fbank():
15
+ opts = knf.FbankOptions()
16
+ opts.frame_opts.dither = 0
17
+ opts.frame_opts.remove_dc_offset = False
18
+ opts.frame_opts.preemph_coeff = 0
19
+ opts.frame_opts.window_type = "hann"
20
+
21
+ opts.frame_opts.round_to_power_of_two = False
22
+
23
+ opts.mel_opts.low_freq = 0
24
+ opts.mel_opts.high_freq = 8000
25
+ opts.mel_opts.num_bins = 64
26
+
27
+ fbank = knf.OnlineFbank(opts)
28
+ return fbank
29
+
30
+
31
def compute_features(audio, fbank) -> np.ndarray:
    """Feed a whole utterance to the extractor and collect every ready frame.

    The waveform is always submitted with a sample rate of 16000.

    Args:
      audio: (num_samples,), np.float32
      fbank: the fbank extractor
    Returns:
      features: (num_frames, feat_dim), np.float32
    """
    assert len(audio.shape) == 1, audio.shape
    fbank.accept_waveform(16000, audio)

    frames = [
        np.array(fbank.get_frame(index))
        for index in range(fbank.num_frames_ready)
    ]
    return np.stack(frames)
48
+
49
+
50
def display(sess):
    """Print every input and then every output reported by a session.

    Args:
      sess:
        Any object providing ``get_inputs()`` and ``get_outputs()``; each
        returned item is printed on its own line under a banner.
    """
    sections = (
        ("Input", sess.get_inputs),
        ("Output", sess.get_outputs),
    )
    for title, list_nodes in sections:
        print(f"=========={title}==========")
        for node in list_nodes():
            print(node)
57
+
58
+
59
+ """
60
+ ==========Input==========
61
+ NodeArg(name='audio_signal', type='tensor(float)', shape=['audio_signal_dynamic_axes_1', 64, 'audio_signal_dynamic_axes_2'])
62
+ NodeArg(name='length', type='tensor(int64)', shape=['length_dynamic_axes_1'])
63
+ ==========Output==========
64
+ NodeArg(name='logprobs', type='tensor(float)', shape=['logprobs_dynamic_axes_1', 'logprobs_dynamic_axes_2', 34])
65
+ """
66
+
67
+
68
class OnnxModel:
    """Single-threaded CPU onnxruntime wrapper around the exported CTC model."""

    def __init__(
        self,
        filename: str,
    ):
        """Open ``filename`` as an inference session and print its inputs/outputs."""
        opts = ort.SessionOptions()
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1

        self.model = ort.InferenceSession(
            filename,
            sess_options=opts,
            providers=["CPUExecutionProvider"],
        )
        display(self.model)

    def __call__(self, x: np.ndarray):
        """Run the model on the features of one utterance.

        Args:
          x: (T, C) feature matrix.
        Returns:
          The first model output; per the comment in the original code its
          layout is [batch_size, T, dim].
        """
        # (T, C) -> (1, C, T): the model takes channels before time, plus a
        # leading batch axis.
        batch = x.T[np.newaxis]
        lengths = np.array([batch.shape[-1]], dtype=np.int64)

        input_args = self.model.get_inputs()
        wanted = [self.model.get_outputs()[0].name]
        feed = {
            input_args[0].name: batch,
            input_args[1].name: lengths,
        }

        log_probs = self.model.run(wanted, feed)[0]
        return log_probs
102
+
103
+
104
def main():
    """Decode ``./example.wav`` with ``./model.int8.onnx`` and print the text.

    Steps: load the model, read ``./tokens.txt`` into an id-to-symbol table,
    read the first channel of the wave (resampled to 16 kHz if needed),
    compute fbank features, run the model and apply greedy CTC decoding
    (collapse repeats, drop blanks). The blank id is taken to be the last id
    in the token table.
    """
    model_path = "./model.int8.onnx"
    tokens_path = "./tokens.txt"
    wav = "./example.wav"

    model = OnnxModel(model_path)

    id2token = {}
    with open(tokens_path, encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if len(fields) == 1:
                # The space symbol is swallowed by split(), leaving only its id.
                id2token[int(fields[0])] = " "
            else:
                symbol, idx = fields
                id2token[int(idx)] = symbol

    fbank = create_fbank()
    samples, sample_rate = sf.read(wav, dtype="float32", always_2d=True)
    mono = samples[:, 0]  # only use the first channel
    if sample_rate != 16000:
        mono = librosa.resample(mono, orig_sr=sample_rate, target_sr=16000)

    features = compute_features(mono, fbank)
    print("features.shape", features.shape)

    blank = len(id2token) - 1
    log_probs = model(features)
    print("log_probs", log_probs.shape)

    # Best id per frame for the single utterance in the batch.
    ids = torch.from_numpy(log_probs)[0].argmax(dim=1).tolist()

    # Greedy CTC: keep an id only when it is not blank and differs from the
    # id of the previous frame (-1 stands in for "no previous frame").
    decoded = [
        cur
        for prev, cur in zip([-1] + ids, ids)
        if cur != blank and cur != prev
    ]

    text = "".join(id2token[i] for i in decoded)
    print(wav)
    print(text)


if __name__ == "__main__":
    main()
ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/test_wavs/example.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8aaaa18a5098d7c6de0595ae7ac1e64cacd0d4022af3595213bdaf23be77e69
3
+ size 361324
ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/test_wavs/long_example.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1868ece0195dfa9fc2394be24865d1133c8452a8292a397db45ba8c3ed9e01e3
3
+ size 2280044
ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/tokens.txt ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 0
2
+ а 1
3
+ б 2
4
+ в 3
5
+ г 4
6
+ д 5
7
+ е 6
8
+ ж 7
9
+ з 8
10
+ и 9
11
+ й 10
12
+ к 11
13
+ л 12
14
+ м 13
15
+ н 14
16
+ о 15
17
+ п 16
18
+ р 17
19
+ с 18
20
+ т 19
21
+ у 20
22
+ ф 21
23
+ х 22
24
+ ц 23
25
+ ч 24
26
+ ш 25
27
+ щ 26
28
+ ъ 27
29
+ ы 28
30
+ ь 29
31
+ э 30
32
+ ю 31
33
+ я 32
34
+ <blk> 33