huuminh365 committed on
Commit
c7cf1e8
·
1 Parent(s): 087b7ec

update tokenizer.py

Browse files
Files changed (1) hide show
  1. tokenizer.py +3 -2
tokenizer.py CHANGED
@@ -327,6 +327,9 @@ class PhobertTokenizer(PreTrainedTokenizer):
327
  """
328
  Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
329
  """
 
 
 
330
  if isinstance(f, str):
331
  try:
332
  with open(f, "r", encoding="utf-8") as fd:
@@ -345,5 +348,3 @@ class PhobertTokenizer(PreTrainedTokenizer):
345
  raise ValueError("Incorrect dictionary format, expected '<token> <cnt>'")
346
  word = line[:idx]
347
  self.encoder[word] = len(self.encoder)
348
- for word in LATEX_VOC:
349
- self.encoder[word] = len(self.encoder)
 
327
  """
328
  Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
329
  """
330
+
331
+ for word in LATEX_VOC:
332
+ self.encoder[word] = len(self.encoder)
333
  if isinstance(f, str):
334
  try:
335
  with open(f, "r", encoding="utf-8") as fd:
 
348
  raise ValueError("Incorrect dictionary format, expected '<token> <cnt>'")
349
  word = line[:idx]
350
  self.encoder[word] = len(self.encoder)