Commit
·
f3ccfab
1 Parent(s): 8dc2b55
Fixed a typo
Browse files · src/train_tokeniser.py +2 -2
src/train_tokeniser.py
CHANGED
|
@@ -10,7 +10,7 @@ corpus = []
|
|
| 10 |
|
| 11 |
with open("src/data/clean_corpus.jsonl", "r", encoding = "utf-8") as f_in:
|
| 12 |
for i, line in enumerate(f_in):
|
| 13 |
-
if i >= 100000: #
|
| 14 |
break
|
| 15 |
|
| 16 |
item = json.loads(line)
|
|
@@ -27,7 +27,7 @@ tokeniser = Tokenizer(models.BPE(unk_token = "<unk>"))
|
|
| 27 |
# Normalisation (NFD and StripAccents are not used due to characters with diacritics, for instance)
|
| 28 |
tokeniser.normalizer = Lowercase()
|
| 29 |
|
| 30 |
-
# Basic whitespace pre-
|
| 31 |
tokeniser.pre_tokenizer = Whitespace()
|
| 32 |
|
| 33 |
# Trainer
|
|
|
|
| 10 |
|
| 11 |
with open("src/data/clean_corpus.jsonl", "r", encoding = "utf-8") as f_in:
|
| 12 |
for i, line in enumerate(f_in):
|
| 13 |
+
if i >= 100000: # take only 100000 records for the tokeniser (no need to load everything in the corpus)
|
| 14 |
break
|
| 15 |
|
| 16 |
item = json.loads(line)
|
|
|
|
| 27 |
# Normalisation (NFD and StripAccents are not used due to characters with diacritics, for instance)
|
| 28 |
tokeniser.normalizer = Lowercase()
|
| 29 |
|
| 30 |
+
# Basic whitespace pre-tokenisation
|
| 31 |
tokeniser.pre_tokenizer = Whitespace()
|
| 32 |
|
| 33 |
# Trainer
|