Commit
·
f3ccfab
1 Parent(s): 8dc2b55
Fixed a typo
Browse files · src/train_tokeniser.py +2 -2
src/train_tokeniser.py
CHANGED
|
@@ -10,7 +10,7 @@ corpus = []
|
|
| 10 |
|
| 11 |
with open("src/data/clean_corpus.jsonl", "r", encoding = "utf-8") as f_in:
|
| 12 |
for i, line in enumerate(f_in):
|
| 13 |
-
if i >= 100000: #
|
| 14 |
break
|
| 15 |
|
| 16 |
item = json.loads(line)
|
|
@@ -27,7 +27,7 @@ tokeniser = Tokenizer(models.BPE(unk_token = "<unk>"))
|
|
| 27 |
# Normalisation (NFD and StripAccents are not used due to characters with diacritics, for instance)
|
| 28 |
tokeniser.normalizer = Lowercase()
|
| 29 |
|
| 30 |
-
# Basic whitespace pre-
|
| 31 |
tokeniser.pre_tokenizer = Whitespace()
|
| 32 |
|
| 33 |
# Trainer
|
|
|
|
| 10 |
|
| 11 |
with open("src/data/clean_corpus.jsonl", "r", encoding = "utf-8") as f_in:
|
| 12 |
for i, line in enumerate(f_in):
|
| 13 |
+
if i >= 100000: # take only 100000 records for the tokeniser (no need to load everything in the corpus)
|
| 14 |
break
|
| 15 |
|
| 16 |
item = json.loads(line)
|
|
|
|
| 27 |
# Normalisation (NFD and StripAccents are not used due to characters with diacritics, for instance)
|
| 28 |
tokeniser.normalizer = Lowercase()
|
| 29 |
|
| 30 |
+
# Basic whitespace pre-tokenisation
|
| 31 |
tokeniser.pre_tokenizer = Whitespace()
|
| 32 |
|
| 33 |
# Trainer
|