Kevin Hamon committed on
Commit ·
1b49565
1
Parent(s): 6146bce
remove custom from_pretrained
Browse files
- tokenizer.py +0 -29
tokenizer.py
CHANGED
|
@@ -364,35 +364,6 @@ class ChessTokenizer(PreTrainedTokenizer):
|
|
| 364 |
# Non-fatal; we still saved vocab and config
|
| 365 |
pass
|
| 366 |
|
| 367 |
-
@classmethod
|
| 368 |
-
def from_pretrained(cls, load_directory: str) -> "ChessTokenizer":
|
| 369 |
-
"""Load tokenizer from a directory previously written with `save_pretrained`.
|
| 370 |
-
|
| 371 |
-
This primarily reads the vocab file and constructs the tokenizer.
|
| 372 |
-
If a `tokenizer_config.json` exists it will be consulted for the
|
| 373 |
-
vocab filename and special tokens (but we still instantiate using
|
| 374 |
-
the provided class).
|
| 375 |
-
"""
|
| 376 |
-
config_path = os.path.join(load_directory, "tokenizer_config.json")
|
| 377 |
-
vocab_file = None
|
| 378 |
-
if os.path.exists(config_path):
|
| 379 |
-
try:
|
| 380 |
-
with open(config_path, "r", encoding="utf-8") as f:
|
| 381 |
-
cfg = json.load(f)
|
| 382 |
-
vocab_file = os.path.join(load_directory, cfg.get("vocab_file", "vocab.json"))
|
| 383 |
-
except Exception:
|
| 384 |
-
pass
|
| 385 |
-
|
| 386 |
-
if vocab_file is None:
|
| 387 |
-
# Fallback: look for a vocab file in the directory
|
| 388 |
-
candidates = [p for p in os.listdir(load_directory) if p.endswith("vocab.json")]
|
| 389 |
-
if candidates:
|
| 390 |
-
vocab_file = os.path.join(load_directory, candidates[0])
|
| 391 |
-
|
| 392 |
-
if vocab_file is None or not os.path.exists(vocab_file):
|
| 393 |
-
raise FileNotFoundError(f"No vocab file found in {load_directory}")
|
| 394 |
-
|
| 395 |
-
return cls(vocab_file=vocab_file)
|
| 396 |
|
| 397 |
def count_vocab_from_dataset(
|
| 398 |
dataset_name: str = "dlouapre/lichess_2025-01_1M",
|
|
|
|
| 364 |
# Non-fatal; we still saved vocab and config
|
| 365 |
pass
|
| 366 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
|
| 368 |
def count_vocab_from_dataset(
|
| 369 |
dataset_name: str = "dlouapre/lichess_2025-01_1M",
|