AnthonyDi
/

CharacterTokenizer

Model card Files Files and versions

xet

Community

AnthonyDi committed on May 22, 2025

Commit

2c67084

verified ·

1 Parent(s): d06d75a

Upload tokenizer.py with huggingface_hub

Browse files

Files changed (1) hide show

tokenizer.py +36 -0

tokenizer.py CHANGED Viewed

@@ -42,6 +42,42 @@ class CharacterTokenizer(PreTrainedTokenizer):
         """Register this tokenizer for AutoTokenizer"""
         return cls
     @property
     def vocab_size(self):
         """Return the number of entries in ``self.token_to_id``."""
         vocabulary = self.token_to_id
         return len(vocabulary)

         """Register this tokenizer for AutoTokenizer"""
         return cls
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        """Load tokenizer from a directory or Hub"""
+        # Check if it's a local path
+        if os.path.isdir(pretrained_model_name_or_path):
+            vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
+        else:
+            # Download from Hub
+            from huggingface_hub import hf_hub_download
+            vocab_file = hf_hub_download(
+                repo_id=pretrained_model_name_or_path,
+                filename="vocab.json"
+            )
+        # Try to load config if it exists
+        try:
+            if os.path.isdir(pretrained_model_name_or_path):
+                config_file = os.path.join(pretrained_model_name_or_path, "tokenizer_config.json")
+            else:
+                config_file = hf_hub_download(
+                    repo_id=pretrained_model_name_or_path,
+                    filename="tokenizer_config.json"
+                )
+            if os.path.exists(config_file):
+                with open(config_file, "r") as f:
+                    config = json.load(f)
+                    kwargs.update(config)
+        except:
+            pass  # Config file is optional
+        # Remove vocab_file from kwargs if it exists to avoid duplicate argument
+        kwargs.pop('vocab_file', None)
+        return cls(vocab_file=vocab_file, **kwargs)
     @property
     def vocab_size(self):
         """Return the number of entries in ``self.token_to_id``."""
         vocabulary = self.token_to_id
         return len(vocabulary)