Update tokenizer.py
Browse files- tokenizer.py +3 -3
tokenizer.py
CHANGED
|
@@ -8,7 +8,7 @@ from transformers import PreTrainedTokenizer
|
|
| 8 |
class ChessTokenizer(PreTrainedTokenizer):
|
| 9 |
"""
|
| 10 |
Chess move tokenizer compatible with HuggingFace transformers.
|
| 11 |
-
Can be loaded with: AutoTokenizer.from_pretrained("ankanmbz/
|
| 12 |
"""
|
| 13 |
|
| 14 |
vocab_files_names = {
|
|
@@ -160,7 +160,7 @@ class ChessTokenizer(PreTrainedTokenizer):
|
|
| 160 |
# Builder script to create HuggingFace-compatible tokenizer
|
| 161 |
# ============================================================================
|
| 162 |
|
| 163 |
-
def build_hf_tokenizer(dataset_path, output_dir="gambit-tok-hf"):
|
| 164 |
"""Build HuggingFace-compatible tokenizer from dataset"""
|
| 165 |
import pandas as pd
|
| 166 |
from collections import Counter
|
|
@@ -262,4 +262,4 @@ def build_hf_tokenizer(dataset_path, output_dir="gambit-tok-hf"):
|
|
| 262 |
if __name__ == "__main__":
|
| 263 |
# Build the tokenizer
|
| 264 |
dataset_path = "/vast/users/ankan.deria/Document/TinyRecursiveModels/data/chees_data/dataset.parquet"
|
| 265 |
-
build_hf_tokenizer(dataset_path, output_dir="gambit-tok-hf")
|
|
|
|
| 8 |
class ChessTokenizer(PreTrainedTokenizer):
|
| 9 |
"""
|
| 10 |
Chess move tokenizer compatible with HuggingFace transformers.
|
| 11 |
+
Can be loaded with: AutoTokenizer.from_pretrained("ankanmbz/chess-tok")
|
| 12 |
"""
|
| 13 |
|
| 14 |
vocab_files_names = {
|
|
|
|
| 160 |
# Builder script to create HuggingFace-compatible tokenizer
|
| 161 |
# ============================================================================
|
| 162 |
|
| 163 |
+
def build_hf_tokenizer(dataset_path, output_dir="chess-tok-hf"):
|
| 164 |
"""Build HuggingFace-compatible tokenizer from dataset"""
|
| 165 |
import pandas as pd
|
| 166 |
from collections import Counter
|
|
|
|
| 262 |
if __name__ == "__main__":
|
| 263 |
# Build the tokenizer
|
| 264 |
dataset_path = "/vast/users/ankan.deria/Document/TinyRecursiveModels/data/chees_data/dataset.parquet"
|
| 265 |
+
build_hf_tokenizer(dataset_path, output_dir="chess-tok-hf")
|