import json

import sentencepiece as spm
from transformers import T5Tokenizer

# Build a plain-text corpus for the tokeniser from the cleaned JSONL data
with open("src/data/tokeniser_corpus.txt", "w", encoding="utf-8") as f_out:
    with open("src/data/clean_corpus.jsonl", "r", encoding="utf-8") as f_in:
        for i, line in enumerate(f_in):
            if i >= 1_000_000:  # 1,000,000 records are enough for the tokeniser (no need to load the full corpus)
                break
            item = json.loads(line)
            src = item["transliteration"]["src"]
            tgt = item["transliteration"]["tgt"]
            f_out.write(src + "\n")
            f_out.write(tgt + "\n")

# Train the SentencePiece model
spm.SentencePieceTrainer.Train(
    input="src/data/tokeniser_corpus.txt",
    model_prefix="src/tokeniser/dalat5_sp",
    vocab_size=40000,
    model_type="unigram",    # worth testing with "bpe"
    character_coverage=1.0,  # to preserve rare characters like ä, ñ, etc.
    max_sentence_length=8384,
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
    user_defined_symbols=["", "", ""]  # NOTE: the intended special tokens (likely angle-bracket tags) were lost in rendering; fill them in here
)

# Convert to a HF-compatible format
# (pass the SentencePiece model as vocab_file; loading a single file via from_pretrained is deprecated)
tokenizer = T5Tokenizer(vocab_file="src/tokeniser/dalat5_sp.model")
tokenizer.save_pretrained("src/tokeniser/")
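
# Quick sanity check: a minimal sketch, not part of the training script.
# It assumes the tokeniser was saved to src/tokeniser/ as above; the sample
# sentence is an illustrative assumption chosen to exercise rare characters.
check = T5Tokenizer.from_pretrained("src/tokeniser/")
sample = "Sälem, älem!"               # hypothetical Latin-script sample with ä
ids = check.encode(sample)            # token IDs produced by the new vocabulary
print(ids)
print(check.decode(ids, skip_special_tokens=True))  # should round-trip the sample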