ankanmbz committed on
Commit
a8ae00b
·
verified ·
1 Parent(s): 532f6c9

Update tokenizer.py

Browse files
Files changed (1) hide show
  1. tokenizer.py +3 -3
tokenizer.py CHANGED
@@ -8,7 +8,7 @@ from transformers import PreTrainedTokenizer
8
  class ChessTokenizer(PreTrainedTokenizer):
9
  """
10
  Chess move tokenizer compatible with HuggingFace transformers.
11
- Can be loaded with: AutoTokenizer.from_pretrained("ankanmbz/gambit-tok")
12
  """
13
 
14
  vocab_files_names = {
@@ -160,7 +160,7 @@ class ChessTokenizer(PreTrainedTokenizer):
160
  # Builder script to create HuggingFace-compatible tokenizer
161
  # ============================================================================
162
 
163
- def build_hf_tokenizer(dataset_path, output_dir="gambit-tok-hf"):
164
  """Build HuggingFace-compatible tokenizer from dataset"""
165
  import pandas as pd
166
  from collections import Counter
@@ -262,4 +262,4 @@ def build_hf_tokenizer(dataset_path, output_dir="gambit-tok-hf"):
262
  if __name__ == "__main__":
263
  # Build the tokenizer
264
  dataset_path = "/vast/users/ankan.deria/Document/TinyRecursiveModels/data/chees_data/dataset.parquet"
265
- build_hf_tokenizer(dataset_path, output_dir="gambit-tok-hf")
 
8
  class ChessTokenizer(PreTrainedTokenizer):
9
  """
10
  Chess move tokenizer compatible with HuggingFace transformers.
11
+ Can be loaded with: AutoTokenizer.from_pretrained("ankanmbz/chess-tok")
12
  """
13
 
14
  vocab_files_names = {
 
160
  # Builder script to create HuggingFace-compatible tokenizer
161
  # ============================================================================
162
 
163
+ def build_hf_tokenizer(dataset_path, output_dir="chess-tok-hf"):
164
  """Build HuggingFace-compatible tokenizer from dataset"""
165
  import pandas as pd
166
  from collections import Counter
 
262
  if __name__ == "__main__":
263
  # Build the tokenizer
264
  dataset_path = "/vast/users/ankan.deria/Document/TinyRecursiveModels/data/chees_data/dataset.parquet"
265
+ build_hf_tokenizer(dataset_path, output_dir="chess-tok-hf")