crossroderick committed on
Commit
f93a822
·
1 Parent(s): 4ff18d8

Removed NFD and StripAccents from the tokeniser training process

Browse files
src/tokeniser/tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
src/train_tokeniser.py CHANGED
@@ -24,10 +24,8 @@ with open("src/data/clean_corpus.jsonl", "r", encoding = "utf-8") as f_in:
24
  # Initialise a tokenizer
25
  tokeniser = Tokenizer(models.BPE(unk_token = "<unk>"))
26
 
27
- # Normalisation, important for characters such as those with diacritics
28
- tokeniser.normalizer = Sequence([
29
- NFD(), Lowercase(), StripAccents()
30
- ])
31
 
32
  # Basic whitespace pre-tokenization
33
  tokeniser.pre_tokenizer = Whitespace()
 
24
  # Initialise a tokenizer
25
  tokeniser = Tokenizer(models.BPE(unk_token = "<unk>"))
26
 
27
+ # Normalisation (NFD and StripAccents are omitted so that characters with diacritics are preserved)
28
+ tokeniser.normalizer = Lowercase()
 
 
29
 
30
  # Basic whitespace pre-tokenization
31
  tokeniser.pre_tokenizer = Whitespace()