import json
import sentencepiece as spm
from transformers import T5Tokenizer


# Build a plain-text corpus for the tokeniser from the cleaned JSONL data.
# The first 1,000,000 records are enough; there is no need to feed the
# tokeniser the entire corpus.
MAX_RECORDS = 1_000_000

with open("src/data/clean_corpus.jsonl", "r", encoding="utf-8") as f_in, \
        open("src/data/tokeniser_corpus.txt", "w", encoding="utf-8") as f_out:
    for i, line in enumerate(f_in):
        if i >= MAX_RECORDS:
            break

        item = json.loads(line)
        src = item["transliteration"]["src"]
        tgt = item["transliteration"]["tgt"]

        # Both sides of each transliteration pair become training lines
        f_out.write(src + "\n")
        f_out.write(tgt + "\n")

# Train the SentencePiece model
spm.SentencePieceTrainer.train(
    input="src/data/tokeniser_corpus.txt",
    model_prefix="src/tokeniser/dalat5_sp",
    vocab_size=40000,
    model_type="unigram",  # worth testing with "bpe"
    character_coverage=1.0,  # preserve rare characters like ä, ñ, etc.
    max_sentence_length=8384,
    # These ids already bind the default pieces <pad>, <unk>, <s> and </s>;
    # repeating them via user_defined_symbols makes training fail with a
    # "piece is already defined" error, so that option is dropped here.
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
)
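
# SentencePieceTrainer writes two artefacts next to the model_prefix:
# src/tokeniser/dalat5_sp.model (the serialised model) and
# src/tokeniser/dalat5_sp.vocab (a human-readable piece inventory).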

# Convert to a Hugging Face-compatible format. T5Tokenizer takes the
# SentencePiece .model file directly as its vocab file; passing it to the
# constructor avoids the deprecated single-file form of from_pretrained().
tokenizer = T5Tokenizer("src/tokeniser/dalat5_sp.model")

tokenizer.save_pretrained("src/tokeniser/")
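
# Optional sanity check: reload the saved tokeniser and round-trip a sample
# string. The sample text below is purely illustrative.
reloaded = T5Tokenizer.from_pretrained("src/tokeniser/")
ids = reloaded.encode("sälem, älem!", add_special_tokens=True)
print(ids)
print(reloaded.decode(ids, skip_special_tokens=True))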