Commit
·
c7cf1e8
1
Parent(s):
087b7ec
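Update tokenizer.py: add LATEX_VOC tokens to the encoder before loading the dictionary file (previously they were added after it)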
Browse files - tokenizer.py (+3 -2)
tokenizer.py
CHANGED
|
@@ -327,6 +327,9 @@ class PhobertTokenizer(PreTrainedTokenizer):
|
|
| 327 |
"""
|
| 328 |
Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
|
| 329 |
"""
|
|
|
|
|
|
|
|
|
|
| 330 |
if isinstance(f, str):
|
| 331 |
try:
|
| 332 |
with open(f, "r", encoding="utf-8") as fd:
|
|
@@ -345,5 +348,3 @@ class PhobertTokenizer(PreTrainedTokenizer):
|
|
| 345 |
raise ValueError("Incorrect dictionary format, expected '<token> <cnt>'")
|
| 346 |
word = line[:idx]
|
| 347 |
self.encoder[word] = len(self.encoder)
|
| 348 |
-
for word in LATEX_VOC:
|
| 349 |
-
self.encoder[word] = len(self.encoder)
|
|
|
|
| 327 |
"""
|
| 328 |
Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
|
| 329 |
"""
|
| 330 |
+
|
| 331 |
+
for word in LATEX_VOC:
|
| 332 |
+
self.encoder[word] = len(self.encoder)
|
| 333 |
if isinstance(f, str):
|
| 334 |
try:
|
| 335 |
with open(f, "r", encoding="utf-8") as fd:
|
|
|
|
| 348 |
raise ValueError("Incorrect dictionary format, expected '<token> <cnt>'")
|
| 349 |
word = line[:idx]
|
| 350 |
self.encoder[word] = len(self.encoder)
|
|
|
|
|
|