niobures committed on
Commit
c166d2e
·
verified ·
1 Parent(s): f03df43

Sherpa-CTC (ru)

Browse files
.gitattributes CHANGED
@@ -59,3 +59,4 @@ ja/sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8/test_wavs/test_ja_1.wav
59
  ja/sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8/test_wavs/test_ja_2.wav filter=lfs diff=lfs merge=lfs -text
60
  ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/test_wavs/example.wav filter=lfs diff=lfs merge=lfs -text
61
  ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/test_wavs/long_example.wav filter=lfs diff=lfs merge=lfs -text
 
 
59
  ja/sherpa-onnx-nemo-parakeet-tdt_ctc-0.6b-ja-35000-int8/test_wavs/test_ja_2.wav filter=lfs diff=lfs merge=lfs -text
60
  ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/test_wavs/example.wav filter=lfs diff=lfs merge=lfs -text
61
  ru/sherpa-onnx-nemo-ctc-giga-am-russian-2024-10-24/test_wavs/long_example.wav filter=lfs diff=lfs merge=lfs -text
62
+ ru/sherpa-onnx-nemo-ctc-giga-am-v2-russian-2025-04-19/test_wavs/example.wav filter=lfs diff=lfs merge=lfs -text
ru/sherpa-onnx-nemo-ctc-giga-am-v2-russian-2025-04-19/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
ru/sherpa-onnx-nemo-ctc-giga-am-v2-russian-2025-04-19/LICENSE ADDED
The diff for this file is too large to render. See raw diff
 
ru/sherpa-onnx-nemo-ctc-giga-am-v2-russian-2025-04-19/export-onnx-ctc-v2.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import gigaam
3
+ import onnx
4
+ import torch
5
+ from onnxruntime.quantization import QuantType, quantize_dynamic
6
+
7
+
8
def add_meta_data(filename: str, meta_data: dict[str, str]):
    """Add meta data to an ONNX model. It is changed in-place.

    Args:
      filename:
        Filename of the ONNX model to be changed.
      meta_data:
        Key-value pairs.
    """
    model = onnx.load(filename)

    # Drop whatever metadata is already attached before writing ours.
    del model.metadata_props[:]

    for key, value in meta_data.items():
        entry = model.metadata_props.add()
        entry.key = key
        entry.value = str(value)

    onnx.save(model, filename)
27
+
28
+
29
def main() -> None:
    """Export the GigaAM v2 CTC model to ONNX, write tokens.txt, and
    produce an int8 dynamically-quantized copy of the model."""
    model_name = "v2_ctc"
    model = gigaam.load_model(
        model_name, fp16_encoder=False, use_flash=False, download_root="."
    )

    # Token table uses characters: space is id 0 and <blk> is the last token.
    with open("./tokens.txt", "w", encoding="utf-8") as f:
        for i, sym in enumerate(model.cfg["labels"]):
            f.write(f"{sym} {i}\n")
        f.write(f"<blk> {i+1}\n")
    print("Saved to tokens.txt")

    model.to_onnx(".")

    meta_data = {
        "vocab_size": len(model.cfg["labels"]) + 1,
        "normalize_type": "",
        "subsampling_factor": 4,
        "model_type": "EncDecCTCModel",
        "version": "1",
        "model_author": "https://github.com/salute-developers/GigaAM",
        "license": "https://github.com/salute-developers/GigaAM/blob/main/LICENSE",
        "language": "Russian",
        "is_giga_am": 1,
    }
    add_meta_data(f"./{model_name}.onnx", meta_data)

    quantize_dynamic(
        model_input=f"./{model_name}.onnx",
        model_output="./model.int8.onnx",
        weight_type=QuantType.QUInt8,
    )


if __name__ == "__main__":
    main()
ru/sherpa-onnx-nemo-ctc-giga-am-v2-russian-2025-04-19/model.int8.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0ce4aef25f58d495781ee8f05320d9e51b821f47804e07aa6549b53a72f67e8
3
+ size 236457977
ru/sherpa-onnx-nemo-ctc-giga-am-v2-russian-2025-04-19/run-ctc-v2.sh ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Install GigaAM and its dependencies, fetch test files, then export the
# v2 CTC model to ONNX and run a quick decoding sanity check.

set -ex

function install_gigaam() {
  curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
  python3 get-pip.py
  pip install torch==2.4.0 torchaudio==2.4.0 -f https://download.pytorch.org/whl/torch_stable.html
  pip install -qq wget text-unidecode "matplotlib>=3.3.2" onnx onnxruntime==1.17.1 pybind11 Cython einops kaldi-native-fbank soundfile librosa

  BRANCH='main'
  python3 -m pip install git+https://github.com/salute-developers/GigaAM.git@$BRANCH#egg=gigaam

  python3 -m pip install -qq kaldi-native-fbank
  pip install numpy==1.26.4
}

function download_files() {
  curl -SL -O https://huggingface.co/csukuangfj/tmp-files/resolve/main/GigaAM/example.wav
  # Use raw.githubusercontent.com here: a github.com/.../blob/... URL returns
  # the HTML page wrapping the file, not the file content itself.
  curl -SL -O https://raw.githubusercontent.com/salute-developers/GigaAM/main/LICENSE
}

install_gigaam
download_files

python3 ./export-onnx-ctc-v2.py
ls -lh
python3 ./test-onnx-ctc.py
ru/sherpa-onnx-nemo-ctc-giga-am-v2-russian-2025-04-19/test-onnx-ctc.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)
3
+
4
+ # https://github.com/salute-developers/GigaAM
5
+
6
+ import kaldi_native_fbank as knf
7
+ import librosa
8
+ import numpy as np
9
+ import onnxruntime as ort
10
+ import soundfile as sf
11
+ import torch
12
+
13
+
14
def create_fbank():
    """Build an online fbank extractor configured like GigaAM's front-end:
    64 mel bins over 0-8000 Hz, hann window, no dither, no preemphasis,
    no DC-offset removal, FFT size not rounded to a power of two.
    """
    opts = knf.FbankOptions()

    # NOTE: opts.frame_opts/mel_opts are returned by reference (the original
    # code relies on the same behavior), so edits through the alias stick.
    frame = opts.frame_opts
    frame.dither = 0
    frame.remove_dc_offset = False
    frame.preemph_coeff = 0
    frame.window_type = "hann"
    frame.round_to_power_of_two = False

    mel = opts.mel_opts
    mel.low_freq = 0
    mel.high_freq = 8000
    mel.num_bins = 64

    return knf.OnlineFbank(opts)
29
+
30
+
31
def compute_features(audio, fbank) -> np.ndarray:
    """
    Args:
      audio: (num_samples,), np.float32
      fbank: the fbank extractor
    Returns:
      features: (num_frames, feat_dim), np.float32
    """
    assert len(audio.shape) == 1, audio.shape
    fbank.accept_waveform(16000, audio)
    # Pull every frame that is ready after feeding the whole waveform.
    frames = [np.array(fbank.get_frame(k)) for k in range(fbank.num_frames_ready)]
    return np.stack(frames)
48
+
49
+
50
def display(sess):
    """Print the input and output NodeArg descriptions of an ORT session."""
    for title, nodes in (
        ("Input", sess.get_inputs()),
        ("Output", sess.get_outputs()),
    ):
        print(f"=========={title}==========")
        for node in nodes:
            print(node)
57
+
58
+
59
+ """
60
+ ==========Input==========
61
+ NodeArg(name='audio_signal', type='tensor(float)', shape=['audio_signal_dynamic_axes_1', 64, 'audio_signal_dynamic_axes_2'])
62
+ NodeArg(name='length', type='tensor(int64)', shape=['length_dynamic_axes_1'])
63
+ ==========Output==========
64
+ NodeArg(name='logprobs', type='tensor(float)', shape=['logprobs_dynamic_axes_1', 'logprobs_dynamic_axes_2', 34])
65
+ """
66
+
67
+
68
class OnnxModel:
    """Thin wrapper around an onnxruntime CPU session for the CTC model."""

    def __init__(
        self,
        filename: str,
    ):
        # Single-threaded session: deterministic and sufficient for a test.
        opts = ort.SessionOptions()
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1

        self.model = ort.InferenceSession(
            filename,
            sess_options=opts,
            providers=["CPUExecutionProvider"],
        )
        display(self.model)

    def __call__(self, x: np.ndarray):
        """Run the model on features x of shape (T, C).

        Returns the raw log-probs array of shape [batch_size, T', dim].
        """
        # Model wants (1, C, T): transpose the (T, C) features, add batch dim.
        feats = torch.from_numpy(x).t().unsqueeze(0)
        feat_len = torch.tensor([feats.shape[-1]], dtype=torch.int64)

        inputs = self.model.get_inputs()
        (log_probs,) = self.model.run(
            [self.model.get_outputs()[0].name],
            {
                inputs[0].name: feats.numpy(),
                inputs[1].name: feat_len.numpy(),
            },
        )
        return log_probs
102
+
103
+
104
def main():
    """Decode example.wav with the int8 ONNX CTC model via greedy search."""
    model_file = "./model.int8.onnx"
    token_file = "./tokens.txt"
    wav = "./example.wav"

    model = OnnxModel(model_file)

    # Map token id -> symbol. A line with a single field is the space token:
    # its symbol is a blank that str.split() drops, leaving only the id.
    id2token = dict()
    with open(token_file, encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if len(fields) == 1:
                id2token[int(fields[0])] = " "
            else:
                sym, idx = fields
                id2token[int(idx)] = sym

    fbank = create_fbank()
    audio, sample_rate = sf.read(wav, dtype="float32", always_2d=True)
    audio = audio[:, 0]  # only use the first channel
    if sample_rate != 16000:
        audio = librosa.resample(
            audio,
            orig_sr=sample_rate,
            target_sr=16000,
        )
        sample_rate = 16000

    features = compute_features(audio, fbank)
    print("features.shape", features.shape)

    log_probs = model(features)
    print("log_probs", log_probs.shape)

    # Greedy CTC decode: argmax per frame, collapse repeats, drop blanks.
    blank = len(id2token) - 1  # <blk> is the last token id
    frame_ids = torch.argmax(torch.from_numpy(log_probs)[0], dim=1).tolist()
    ans = []
    prev = -1
    for i in frame_ids:
        if i != blank and i != prev:
            ans.append(i)
        prev = i

    text = "".join(id2token[i] for i in ans)
    print(wav)
    print(text)


if __name__ == "__main__":
    main()
ru/sherpa-onnx-nemo-ctc-giga-am-v2-russian-2025-04-19/test_wavs/example.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8aaaa18a5098d7c6de0595ae7ac1e64cacd0d4022af3595213bdaf23be77e69
3
+ size 361324
ru/sherpa-onnx-nemo-ctc-giga-am-v2-russian-2025-04-19/tokens.txt ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 0
2
+ а 1
3
+ б 2
4
+ в 3
5
+ г 4
6
+ д 5
7
+ е 6
8
+ ж 7
9
+ з 8
10
+ и 9
11
+ й 10
12
+ к 11
13
+ л 12
14
+ м 13
15
+ н 14
16
+ о 15
17
+ п 16
18
+ р 17
19
+ с 18
20
+ т 19
21
+ у 20
22
+ ф 21
23
+ х 22
24
+ ц 23
25
+ ч 24
26
+ ш 25
27
+ щ 26
28
+ ъ 27
29
+ ы 28
30
+ ь 29
31
+ э 30
32
+ ю 31
33
+ я 32
34
+ <blk> 33