crossroderick committed on
Commit
f93a822
·
1 Parent(s): 4ff18d8

Removed NFD and StripAccents from the tokeniser training process

Browse files
src/tokeniser/tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
src/train_tokeniser.py CHANGED
@@ -24,10 +24,8 @@ with open("src/data/clean_corpus.jsonl", "r", encoding = "utf-8") as f_in:
24
  # Initialise a tokenizer
25
  tokeniser = Tokenizer(models.BPE(unk_token = "<unk>"))
26
 
27
- # Normalisation, important for characters such as those with diacritics
28
- tokeniser.normalizer = Sequence([
29
- NFD(), Lowercase(), StripAccents()
30
- ])
31
 
32
  # Basic whitespace pre-tokenization
33
  tokeniser.pre_tokenizer = Whitespace()
 
24
  # Initialise a tokenizer
25
  tokeniser = Tokenizer(models.BPE(unk_token = "<unk>"))
26
 
27
+ # Normalisation (NFD and StripAccents are omitted so that characters with diacritics are preserved)
28
+ tokeniser.normalizer = Lowercase()
 
 
29
 
30
  # Basic whitespace pre-tokenization
31
  tokeniser.pre_tokenizer = Whitespace()