crossroderick committed on
Commit
f3ccfab
·
1 Parent(s): 8dc2b55

Fixed a typo

Browse files
Files changed (1) hide show
  1. src/train_tokeniser.py +2 -2
src/train_tokeniser.py CHANGED
@@ -10,7 +10,7 @@ corpus = []
10
 
11
  with open("src/data/clean_corpus.jsonl", "r", encoding = "utf-8") as f_in:
12
  for i, line in enumerate(f_in):
13
- if i >= 100000: # Take only 100000 records for the tokeniser (no need to load everything in the corpus)
14
  break
15
 
16
  item = json.loads(line)
@@ -27,7 +27,7 @@ tokeniser = Tokenizer(models.BPE(unk_token = "<unk>"))
27
  # Normalisation (NFD and StripAccents are not used due to characters with diacritics, for instance)
28
  tokeniser.normalizer = Lowercase()
29
 
30
- # Basic whitespace pre-tokenization
31
  tokeniser.pre_tokenizer = Whitespace()
32
 
33
  # Trainer
 
10
 
11
  with open("src/data/clean_corpus.jsonl", "r", encoding = "utf-8") as f_in:
12
  for i, line in enumerate(f_in):
13
+ if i >= 100000: # take only 100000 records for the tokeniser (no need to load everything in the corpus)
14
  break
15
 
16
  item = json.loads(line)
 
27
  # Normalisation (NFD and StripAccents are not used due to characters with diacritics, for instance)
28
  tokeniser.normalizer = Lowercase()
29
 
30
+ # Basic whitespace pre-tokenisation
31
  tokeniser.pre_tokenizer = Whitespace()
32
 
33
  # Trainer