janakhpon committed on
Commit
1f1b899
·
1 Parent(s): e9d0f85

feat: simplified mon tokenizer in hf format, updated tags, resolved the legacy issue

Browse files
convert_to_hf.py CHANGED
@@ -175,10 +175,10 @@ class MonTokenizerConverter:
175
  "model_max_length": 4096, # Modern context length
176
  "pad_token": analysis["pad_token"],
177
  "sp_model_kwargs": {},
178
- "tokenizer_class": "LlamaTokenizer",
179
  "unk_token": analysis["unk_token"],
180
  "use_default_system_prompt": False,
181
- "vocab_file": "tokenizer.model",
182
  "vocab_size": analysis["vocab_size"]
183
  }
184
 
@@ -344,6 +344,33 @@ For questions or issues, please open an issue on the repository or contact the m
344
  *.py text eol=lf
345
  """
346
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
  def validate_conversion(self) -> bool:
348
  """Validate the converted tokenizer."""
349
  logger.info("Validating converted tokenizer")
@@ -447,6 +474,10 @@ For questions or issues, please open an issue on the repository or contact the m
447
  f.write(self.create_gitattributes())
448
  logger.info("✓ Created .gitattributes")
449
 
 
 
 
 
450
  # Validate conversion
451
  if self.validate_conversion():
452
  logger.info("🎉 Conversion completed successfully!")
 
175
  "model_max_length": 4096, # Modern context length
176
  "pad_token": analysis["pad_token"],
177
  "sp_model_kwargs": {},
178
+ "tokenizer_class": "LlamaTokenizerFast",
179
  "unk_token": analysis["unk_token"],
180
  "use_default_system_prompt": False,
181
+ # Note: vocab_file omitted to use fast tokenizer by default
182
  "vocab_size": analysis["vocab_size"]
183
  }
184
 
 
344
  *.py text eol=lf
345
  """
346
 
347
+ def generate_fast_tokenizer(self) -> None:
348
+ """Generate fast tokenizer (tokenizer.json) from slow tokenizer."""
349
+ try:
350
+ from transformers import AutoTokenizer
351
+
352
+ # Load the slow tokenizer first
353
+ logger.info("Loading slow tokenizer to generate fast version...")
354
+ tokenizer = AutoTokenizer.from_pretrained(
355
+ str(self.output_dir),
356
+ local_files_only=True,
357
+ use_fast=False # Force slow tokenizer first
358
+ )
359
+
360
+ # Save as fast tokenizer
361
+ logger.info("Converting to fast tokenizer...")
362
+ tokenizer.save_pretrained(
363
+ str(self.output_dir),
364
+ legacy_format=False, # Use modern format
365
+ save_slow=False # Don't overwrite slow tokenizer
366
+ )
367
+
368
+ logger.info("✓ Generated fast tokenizer (tokenizer.json)")
369
+
370
+ except Exception as e:
371
+ logger.warning(f"Could not generate fast tokenizer: {e}")
372
+ logger.info("Fast tokenizer generation is optional - slow tokenizer will still work")
373
+
374
  def validate_conversion(self) -> bool:
375
  """Validate the converted tokenizer."""
376
  logger.info("Validating converted tokenizer")
 
474
  f.write(self.create_gitattributes())
475
  logger.info("✓ Created .gitattributes")
476
 
477
+ # Generate fast tokenizer
478
+ logger.info("Generating fast tokenizer")
479
+ self.generate_fast_tokenizer()
480
+
481
  # Validate conversion
482
  if self.validate_conversion():
483
  logger.info("🎉 Conversion completed successfully!")
special_tokens_map.json CHANGED
@@ -27,4 +27,4 @@
27
  "rstrip": false,
28
  "single_word": false
29
  }
30
- }
 
27
  "rstrip": false,
28
  "single_word": false
29
  }
30
+ }
tokenizer_config.json CHANGED
@@ -3,6 +3,14 @@
3
  "add_eos_token": false,
4
  "add_prefix_space": false,
5
  "added_tokens_decoder": {
 
 
 
 
 
 
 
 
6
  "1": {
7
  "content": "<s>",
8
  "lstrip": false,
@@ -19,13 +27,29 @@
19
  "single_word": false,
20
  "special": true
21
  },
22
- "0": {
23
- "content": "<unk>",
24
  "lstrip": false,
25
  "normalized": false,
26
  "rstrip": false,
27
  "single_word": false,
28
- "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  },
30
  "4000": {
31
  "content": "<pad>",
@@ -40,13 +64,14 @@
40
  "bos_token": "<s>",
41
  "clean_up_tokenization_spaces": false,
42
  "eos_token": "</s>",
 
43
  "legacy": true,
44
  "model_max_length": 4096,
45
  "pad_token": "<pad>",
46
  "sp_model_kwargs": {},
 
47
  "tokenizer_class": "LlamaTokenizer",
48
  "unk_token": "<unk>",
49
  "use_default_system_prompt": false,
50
- "vocab_file": "tokenizer.model",
51
  "vocab_size": 4000
52
- }
 
3
  "add_eos_token": false,
4
  "add_prefix_space": false,
5
  "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
  "1": {
15
  "content": "<s>",
16
  "lstrip": false,
 
27
  "single_word": false,
28
  "special": true
29
  },
30
+ "3": {
31
+ "content": "<mask>",
32
  "lstrip": false,
33
  "normalized": false,
34
  "rstrip": false,
35
  "single_word": false,
36
+ "special": false
37
+ },
38
+ "4": {
39
+ "content": "<sep>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": false
45
+ },
46
+ "5": {
47
+ "content": "<cls>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": false
53
  },
54
  "4000": {
55
  "content": "<pad>",
 
64
  "bos_token": "<s>",
65
  "clean_up_tokenization_spaces": false,
66
  "eos_token": "</s>",
67
+ "extra_special_tokens": {},
68
  "legacy": true,
69
  "model_max_length": 4096,
70
  "pad_token": "<pad>",
71
  "sp_model_kwargs": {},
72
+ "spaces_between_special_tokens": false,
73
  "tokenizer_class": "LlamaTokenizer",
74
  "unk_token": "<unk>",
75
  "use_default_system_prompt": false,
 
76
  "vocab_size": 4000
77
+ }
upload_to_hub.py CHANGED
@@ -7,22 +7,22 @@ to Hugging Face Hub with comprehensive validation and modern best practices.
7
 
8
  Required files:
9
  - `tokenizer_config.json` - Main tokenizer configuration
10
- - `special_tokens_map.json` - Special token mappings
11
  - `README.md` - Model documentation and usage instructions
12
  - `.gitattributes` - Git LFS configuration for large files
13
 
14
  Required tokenizer model files (at least one):
15
- - `tokenizer.json` - Fast tokenizer (recommended, HuggingFace Tokenizers)
16
- - `tokenizer.model` - SentencePiece model file (slow tokenizer)
17
- - `mon_tokenizer.model` - Custom named SentencePiece model
18
 
19
- Optional but recommended files:
20
  - `generation_config.json` - Text generation configuration
21
  - `vocab.txt` - Vocabulary file for certain tokenizer types
22
  - `merges.txt` - BPE merge rules for certain tokenizer types
23
 
24
- The script validates all files exist before upload, supports both fast and slow tokenizers,
25
- and uses modern HuggingFace Hub conventions while maintaining backward compatibility.
26
 
27
  """
28
 
@@ -359,11 +359,13 @@ class TokenizerUploader:
359
  "sample_*",
360
  "example_*",
361
  "demo_*",
 
362
 
363
  # Build and conversion scripts
364
  "convert_*",
365
  "upload_*",
366
  "build_*",
 
367
  "*.py", # Don't upload Python scripts
368
 
369
  # Dataset and training artifacts
 
7
 
8
  Required files:
9
  - `tokenizer_config.json` - Main tokenizer configuration
10
+ - `special_tokens_map.json` - Special token mappings
11
  - `README.md` - Model documentation and usage instructions
12
  - `.gitattributes` - Git LFS configuration for large files
13
 
14
  Required tokenizer model files (at least one):
15
+ - `tokenizer.json` - Fast tokenizer (recommended for reliability)
16
+ - `tokenizer.model` - SentencePiece model file (slow tokenizer backup)
17
+ - `mon_tokenizer.model` - Legacy named SentencePiece model (deprecated)
18
 
19
+ Optional files:
20
  - `generation_config.json` - Text generation configuration
21
  - `vocab.txt` - Vocabulary file for certain tokenizer types
22
  - `merges.txt` - BPE merge rules for certain tokenizer types
23
 
24
+ The script validates all files, tests functionality, and uploads only essential files
25
+ while excluding development artifacts (.env, .py scripts, caches, etc.).
26
 
27
  """
28
 
 
359
  "sample_*",
360
  "example_*",
361
  "demo_*",
362
+ "*_demo.py",
363
 
364
  # Build and conversion scripts
365
  "convert_*",
366
  "upload_*",
367
  "build_*",
368
+ "text_processing_*",
369
  "*.py", # Don't upload Python scripts
370
 
371
  # Dataset and training artifacts