huuminh365
/

CustomBERT

Model card Files Files and versions

huuminh365 committed on Mar 16, 2023

Commit

c7cf1e8

·

1 Parent(s): 087b7ec

update tokenizer.py

Files changed (1) hide show

tokenizer.py +3 -2

tokenizer.py CHANGED Viewed

@@ -327,6 +327,9 @@ class PhobertTokenizer(PreTrainedTokenizer):
         """
         Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
         """
         if isinstance(f, str):
             try:
                 with open(f, "r", encoding="utf-8") as fd:
@@ -345,5 +348,3 @@ class PhobertTokenizer(PreTrainedTokenizer):
                 raise ValueError("Incorrect dictionary format, expected '<token> <cnt>'")
             word = line[:idx]
             self.encoder[word] = len(self.encoder)
-        for word in LATEX_VOC:
-            self.encoder[word] = len(self.encoder)

         """
         Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
         """
+        for word in LATEX_VOC:
+            self.encoder[word] = len(self.encoder)
         if isinstance(f, str):
             try:
                 with open(f, "r", encoding="utf-8") as fd:
                 raise ValueError("Incorrect dictionary format, expected '<token> <cnt>'")
             word = line[:idx]
             self.encoder[word] = len(self.encoder)