janakhpon committed on
Commit
1f1b899
·
1 Parent(s): e9d0f85

feat: simplified mon tokenizer in hf format, updated tags, resolved the legacy issue

Browse files
convert_to_hf.py CHANGED
@@ -175,10 +175,10 @@ class MonTokenizerConverter:
175
  "model_max_length": 4096, # Modern context length
176
  "pad_token": analysis["pad_token"],
177
  "sp_model_kwargs": {},
178
- "tokenizer_class": "LlamaTokenizer",
179
  "unk_token": analysis["unk_token"],
180
  "use_default_system_prompt": False,
181
- "vocab_file": "tokenizer.model",
182
  "vocab_size": analysis["vocab_size"]
183
  }
184
 
@@ -344,6 +344,33 @@ For questions or issues, please open an issue on the repository or contact the m
344
  *.py text eol=lf
345
  """
346
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
  def validate_conversion(self) -> bool:
348
  """Validate the converted tokenizer."""
349
  logger.info("Validating converted tokenizer")
@@ -447,6 +474,10 @@ For questions or issues, please open an issue on the repository or contact the m
447
  f.write(self.create_gitattributes())
448
  logger.info("✓ Created .gitattributes")
449
 
 
 
 
 
450
  # Validate conversion
451
  if self.validate_conversion():
452
  logger.info("🎉 Conversion completed successfully!")
 
175
  "model_max_length": 4096, # Modern context length
176
  "pad_token": analysis["pad_token"],
177
  "sp_model_kwargs": {},
178
+ "tokenizer_class": "LlamaTokenizerFast",
179
  "unk_token": analysis["unk_token"],
180
  "use_default_system_prompt": False,
181
+ # Note: vocab_file omitted to use fast tokenizer by default
182
  "vocab_size": analysis["vocab_size"]
183
  }
184
 
 
344
  *.py text eol=lf
345
  """
346
 
347
+ def generate_fast_tokenizer(self) -> None:
348
+ """Generate fast tokenizer (tokenizer.json) from slow tokenizer."""
349
+ try:
350
+ from transformers import AutoTokenizer
351
+
352
+ # Load the slow tokenizer first
353
+ logger.info("Loading slow tokenizer to generate fast version...")
354
+ tokenizer = AutoTokenizer.from_pretrained(
355
+ str(self.output_dir),
356
+ local_files_only=True,
357
+ use_fast=False # Force slow tokenizer first
358
+ )
359
+
360
+ # Save as fast tokenizer
361
+ logger.info("Converting to fast tokenizer...")
362
+ tokenizer.save_pretrained(
363
+ str(self.output_dir),
364
+ legacy_format=False, # Use modern format
365
+ save_slow=False # Don't overwrite slow tokenizer
366
+ )
367
+
368
+ logger.info("✓ Generated fast tokenizer (tokenizer.json)")
369
+
370
+ except Exception as e:
371
+ logger.warning(f"Could not generate fast tokenizer: {e}")
372
+ logger.info("Fast tokenizer generation is optional - slow tokenizer will still work")
373
+
374
  def validate_conversion(self) -> bool:
375
  """Validate the converted tokenizer."""
376
  logger.info("Validating converted tokenizer")
 
474
  f.write(self.create_gitattributes())
475
  logger.info("✓ Created .gitattributes")
476
 
477
+ # Generate fast tokenizer
478
+ logger.info("Generating fast tokenizer")
479
+ self.generate_fast_tokenizer()
480
+
481
  # Validate conversion
482
  if self.validate_conversion():
483
  logger.info("🎉 Conversion completed successfully!")
special_tokens_map.json CHANGED
@@ -27,4 +27,4 @@
27
  "rstrip": false,
28
  "single_word": false
29
  }
30
- }
 
27
  "rstrip": false,
28
  "single_word": false
29
  }
30
+ }
tokenizer_config.json CHANGED
@@ -3,6 +3,14 @@
3
  "add_eos_token": false,
4
  "add_prefix_space": false,
5
  "added_tokens_decoder": {
 
 
 
 
 
 
 
 
6
  "1": {
7
  "content": "<s>",
8
  "lstrip": false,
@@ -19,13 +27,29 @@
19
  "single_word": false,
20
  "special": true
21
  },
22
- "0": {
23
- "content": "<unk>",
24
  "lstrip": false,
25
  "normalized": false,
26
  "rstrip": false,
27
  "single_word": false,
28
- "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  },
30
  "4000": {
31
  "content": "<pad>",
@@ -40,13 +64,14 @@
40
  "bos_token": "<s>",
41
  "clean_up_tokenization_spaces": false,
42
  "eos_token": "</s>",
 
43
  "legacy": true,
44
  "model_max_length": 4096,
45
  "pad_token": "<pad>",
46
  "sp_model_kwargs": {},
 
47
  "tokenizer_class": "LlamaTokenizer",
48
  "unk_token": "<unk>",
49
  "use_default_system_prompt": false,
50
- "vocab_file": "tokenizer.model",
51
  "vocab_size": 4000
52
- }
 
3
  "add_eos_token": false,
4
  "add_prefix_space": false,
5
  "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
  "1": {
15
  "content": "<s>",
16
  "lstrip": false,
 
27
  "single_word": false,
28
  "special": true
29
  },
30
+ "3": {
31
+ "content": "<mask>",
32
  "lstrip": false,
33
  "normalized": false,
34
  "rstrip": false,
35
  "single_word": false,
36
+ "special": false
37
+ },
38
+ "4": {
39
+ "content": "<sep>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": false
45
+ },
46
+ "5": {
47
+ "content": "<cls>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": false
53
  },
54
  "4000": {
55
  "content": "<pad>",
 
64
  "bos_token": "<s>",
65
  "clean_up_tokenization_spaces": false,
66
  "eos_token": "</s>",
67
+ "extra_special_tokens": {},
68
  "legacy": true,
69
  "model_max_length": 4096,
70
  "pad_token": "<pad>",
71
  "sp_model_kwargs": {},
72
+ "spaces_between_special_tokens": false,
73
  "tokenizer_class": "LlamaTokenizer",
74
  "unk_token": "<unk>",
75
  "use_default_system_prompt": false,
 
76
  "vocab_size": 4000
77
+ }
upload_to_hub.py CHANGED
@@ -7,22 +7,22 @@ to Hugging Face Hub with comprehensive validation and modern best practices.
7
 
8
  Required files:
9
  - `tokenizer_config.json` - Main tokenizer configuration
10
- - `special_tokens_map.json` - Special token mappings
11
  - `README.md` - Model documentation and usage instructions
12
  - `.gitattributes` - Git LFS configuration for large files
13
 
14
  Required tokenizer model files (at least one):
15
- - `tokenizer.json` - Fast tokenizer (recommended, HuggingFace Tokenizers)
16
- - `tokenizer.model` - SentencePiece model file (slow tokenizer)
17
- - `mon_tokenizer.model` - Custom named SentencePiece model
18
 
19
- Optional but recommended files:
20
  - `generation_config.json` - Text generation configuration
21
  - `vocab.txt` - Vocabulary file for certain tokenizer types
22
  - `merges.txt` - BPE merge rules for certain tokenizer types
23
 
24
- The script validates all files exist before upload, supports both fast and slow tokenizers,
25
- and uses modern HuggingFace Hub conventions while maintaining backward compatibility.
26
 
27
  """
28
 
@@ -359,11 +359,13 @@ class TokenizerUploader:
359
  "sample_*",
360
  "example_*",
361
  "demo_*",
 
362
 
363
  # Build and conversion scripts
364
  "convert_*",
365
  "upload_*",
366
  "build_*",
 
367
  "*.py", # Don't upload Python scripts
368
 
369
  # Dataset and training artifacts
 
7
 
8
  Required files:
9
  - `tokenizer_config.json` - Main tokenizer configuration
10
+ - `special_tokens_map.json` - Special token mappings
11
  - `README.md` - Model documentation and usage instructions
12
  - `.gitattributes` - Git LFS configuration for large files
13
 
14
  Required tokenizer model files (at least one):
15
+ - `tokenizer.json` - Fast tokenizer (recommended for reliability)
16
+ - `tokenizer.model` - SentencePiece model file (slow tokenizer backup)
17
+ - `mon_tokenizer.model` - Legacy named SentencePiece model (deprecated)
18
 
19
+ Optional files:
20
  - `generation_config.json` - Text generation configuration
21
  - `vocab.txt` - Vocabulary file for certain tokenizer types
22
  - `merges.txt` - BPE merge rules for certain tokenizer types
23
 
24
+ The script validates all files, tests functionality, and uploads only essential files
25
+ while excluding development artifacts (.env, .py scripts, caches, etc.).
26
 
27
  """
28
 
 
359
  "sample_*",
360
  "example_*",
361
  "demo_*",
362
+ "*_demo.py",
363
 
364
  # Build and conversion scripts
365
  "convert_*",
366
  "upload_*",
367
  "build_*",
368
+ "text_processing_*",
369
  "*.py", # Don't upload Python scripts
370
 
371
  # Dataset and training artifacts