Commit ·
f93a822
1
Parent(s): 4ff18d8
Removed NFD and StripAccents from the tokeniser training process
Browse files
- src/tokeniser/tokenizer.json +0 -0
- src/train_tokeniser.py +2 -4
src/tokeniser/tokenizer.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/train_tokeniser.py
CHANGED
|
@@ -24,10 +24,8 @@ with open("src/data/clean_corpus.jsonl", "r", encoding = "utf-8") as f_in:
|
|
| 24 |
# Initialise a tokenizer
|
| 25 |
tokeniser = Tokenizer(models.BPE(unk_token = "<unk>"))
|
| 26 |
|
| 27 |
-
# Normalisation
|
| 28 |
-
tokeniser.normalizer = normalizers.Sequence([
|
| 29 |
-
NFD(), Lowercase(), StripAccents()
|
| 30 |
-
])
|
| 31 |
|
| 32 |
# Basic whitespace pre-tokenization
|
| 33 |
tokeniser.pre_tokenizer = Whitespace()
|
|
|
|
| 24 |
# Initialise a tokenizer
|
| 25 |
tokeniser = Tokenizer(models.BPE(unk_token = "<unk>"))
|
| 26 |
|
| 27 |
+
# Normalisation (NFD and StripAccents are not used due to characters with diacritics, for instance)
|
| 28 |
+
tokeniser.normalizer = Lowercase()
|
|
|
|
|
|
|
| 29 |
|
| 30 |
# Basic whitespace pre-tokenization
|
| 31 |
tokeniser.pre_tokenizer = Whitespace()
|