AnthonyDi committed on
Commit
2c67084
·
verified ·
1 Parent(s): d06d75a

Upload tokenizer.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenizer.py +36 -0
tokenizer.py CHANGED
@@ -42,6 +42,42 @@ class CharacterTokenizer(PreTrainedTokenizer):
42
  """Register this tokenizer for AutoTokenizer"""
43
  return cls
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
@property
def vocab_size(self):
    """Return the number of entries in the ``token_to_id`` mapping."""
    mapping = self.token_to_id
    return len(mapping)
 
42
  """Register this tokenizer for AutoTokenizer"""
43
  return cls
44
 
45
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
    """Load the tokenizer from a local directory or a Hugging Face Hub repo.

    Args:
        pretrained_model_name_or_path: Path of a directory containing
            ``vocab.json`` (and optionally ``tokenizer_config.json``), or
            a Hub repo id from which those files are downloaded.
        **kwargs: Extra keyword arguments forwarded to the constructor.
            Explicitly passed values take precedence over values read
            from ``tokenizer_config.json``.

    Returns:
        An instance built as ``cls(vocab_file=<resolved path>, **kwargs)``.

    Raises:
        Whatever ``hf_hub_download`` raises when ``vocab.json`` cannot be
        fetched; only the optional config file is treated as best-effort.
    """
    import logging

    is_local = os.path.isdir(pretrained_model_name_or_path)

    if is_local:
        vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
        config_file = os.path.join(
            pretrained_model_name_or_path, "tokenizer_config.json"
        )
    else:
        # Imported lazily so purely local loading does not need the Hub client.
        from huggingface_hub import hf_hub_download

        vocab_file = hf_hub_download(
            repo_id=pretrained_model_name_or_path,
            filename="vocab.json",
        )
        try:
            config_file = hf_hub_download(
                repo_id=pretrained_model_name_or_path,
                filename="tokenizer_config.json",
            )
        except Exception:
            # The config file is optional: a missing entry or a failed
            # download must not prevent loading. Unlike a bare ``except``,
            # this still lets KeyboardInterrupt/SystemExit propagate.
            config_file = None

    config = {}
    if config_file is not None and os.path.isfile(config_file):
        try:
            with open(config_file, "r", encoding="utf-8") as handle:
                loaded = json.load(handle)
        except (OSError, ValueError) as err:
            # json.JSONDecodeError is a ValueError subclass.
            logging.getLogger(__name__).warning(
                "Ignoring unreadable tokenizer config %s: %s", config_file, err
            )
        else:
            if isinstance(loaded, dict):
                config = loaded

    # Saved config provides defaults; the caller's explicit kwargs win.
    init_kwargs = {**config, **kwargs}

    # vocab_file is always passed explicitly below; drop any copy coming from
    # the config or the caller to avoid a duplicate keyword argument.
    init_kwargs.pop("vocab_file", None)

    return cls(vocab_file=vocab_file, **init_kwargs)
80
+
81
@property
def vocab_size(self):
    """Return the number of entries in the ``token_to_id`` mapping."""
    mapping = self.token_to_id
    return len(mapping)