niobures committed on
Commit
4469d2a
·
verified ·
1 Parent(s): 87b6aeb

G2P (byT5)

Browse files
byT5/ppisljar/slo_g2p_byt5/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
byT5/ppisljar/slo_g2p_byt5/README.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-3.0
3
+ language:
4
+ - sl
5
+ metrics:
6
+ - wer
7
+ - cer
8
+ library_name: nemo
9
+ ---
10
+
11
+
12
+ Slovenian G2P model
13
+
14
+ google/byt5-small trained on G2P task, with sentence level dataset
15
+
16
+ CER: 0.25%
17
+
18
+ Check infer.py for example usage.
19
+
byT5/ppisljar/slo_g2p_byt5/g2p_t5.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e15c8d2249cc940232f58442a2e93fe9e27fbaaaebdfb356f5f7a3a0fb7ec9c5
3
+ size 1208441138
byT5/ppisljar/slo_g2p_byt5/g2p_t5.quant.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c1db9b6a431e0ba841a5758a872b7041354220139adfcf698adb43b33a69fbe
3
+ size 311079866
byT5/ppisljar/slo_g2p_byt5/infer.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Example: Slovenian grapheme-to-phoneme (G2P) inference with the byT5 ONNX model.

Tokenizes a lowercase sentence with the google/byt5-small byte-level tokenizer
and runs the exported ONNX seq2seq model to produce the phonemized text.
"""
import onnxruntime

from transformers import AutoTokenizer


def g2p(sentence, onnx_session, tokenizer):
    """Return the phoneme transcription(s) of ``sentence``.

    Parameters
    ----------
    sentence : str
        Input text (the model was trained on lowercase input — TODO confirm
        whether mixed case degrades output).
    onnx_session : onnxruntime.InferenceSession
        Session wrapping the exported ``g2p_t5.onnx`` model.
    tokenizer : transformers.PreTrainedTokenizerBase
        The ``google/byt5-small`` tokenizer.

    Returns
    -------
    list[str]
        Decoded model output, one string per input sentence.
    """
    batch = [sentence]
    encoding = tokenizer(
        batch, padding='longest', max_length=512, truncation=True, return_tensors='pt',
    )
    # The exported graph consumes only input_ids; attention_mask is not fed,
    # so it is intentionally not unpacked here.
    ort_inputs = {'input_ids': encoding.input_ids.numpy()}
    # BUG FIX: the original called the module-level `ort_session` here,
    # silently ignoring the `onnx_session` argument.
    ort_outs = onnx_session.run(None, ort_inputs)
    generated_ids = [ort_outs[0]]
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)


if __name__ == '__main__':
    # Guarded so the module can be imported for its g2p() helper without
    # loading the model as a side effect. (The unused `import torch` was removed.)
    tokenizer = AutoTokenizer.from_pretrained('google/byt5-small')
    sentence = "Kupil sem bicikel in mu zamenjal stol.".lower()
    ort_session = onnxruntime.InferenceSession(
        "g2p_t5.onnx", providers=["CPUExecutionProvider"]
    )
    result = g2p(sentence, ort_session, tokenizer)
    print(result)
byT5/ppisljar/slo_g2p_byt5/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/ppisljar/slo_g2p_byt5
byT5/ppisljar/slo_g2p_norm_byt5/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
byT5/ppisljar/slo_g2p_norm_byt5/README.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-3.0
3
+ language:
4
+ - sl
5
+ metrics:
6
+ - cer
7
+ - wer
8
+ ---
9
+
10
+ Slovenian normalization & G2P model
11
+
12
+ google/byt5-small trained on normalization and G2P task, with sentence level dataset
13
+
14
+ CER: 0.33%
15
+
16
+ Check infer.py for example usage.
byT5/ppisljar/slo_g2p_norm_byt5/g2p_t5_norm.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f41fe672a16bab8814e7686ff73acd775dc78602f9227a0ab6f1fa90f392117
3
+ size 1208441138
byT5/ppisljar/slo_g2p_norm_byt5/g2p_t5_norm.quant.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfd8452989e4accc0fd2304b5eaab4415ee5936fbb4d9af9f4a02e613641c70e
3
+ size 311079866
byT5/ppisljar/slo_g2p_norm_byt5/infer.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Example: Slovenian normalization + G2P inference with the byT5 ONNX model.

Tokenizes a lowercase sentence with the google/byt5-small byte-level tokenizer
and runs the exported normalization+G2P ONNX model on CPU.
"""
import onnxruntime

from transformers import AutoTokenizer

# NOTE(review): the original computed a torch CUDA device / map_location that
# was never used anywhere (inference runs on CPUExecutionProvider), and its CPU
# branch set `device = 1`, which is not a valid device spec. That dead setup
# block — and the then-unused `import torch` — has been removed.

tokenizer = AutoTokenizer.from_pretrained('google/byt5-small')

# The model was trained on lowercase input — TODO confirm.
sentence = "Kupil sem bicikel in mu zamenjal stol.".lower()

# BUG FIX: the original loaded "g2p_norm_t5.onnx", but the model file shipped
# alongside this script is named g2p_t5_norm.onnx.
ort_session = onnxruntime.InferenceSession(
    "g2p_t5_norm.onnx", providers=["CPUExecutionProvider"]
)

encoding = tokenizer(
    [sentence], padding='longest', max_length=512, truncation=True, return_tensors='pt',
)
# The exported graph consumes only input_ids; attention_mask is not fed,
# so it is intentionally not unpacked here.
ort_inputs = {'input_ids': encoding.input_ids.numpy()}
ort_outs = ort_session.run(None, ort_inputs)
generated_ids = [ort_outs[0]]
generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
print(generated_texts)
byT5/ppisljar/slo_g2p_norm_byt5/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/ppisljar/slo_g2p_norm_byt5