---
# Example configuration for training a SentencePiece tokenizer from JSONL data.
# Each key must sit on its own line: if the file is collapsed onto a single
# line that starts with '#', YAML treats the whole thing as one comment and
# loads an empty document.

# Dataset source - JSONL file
jsonl_path: /home/student/Data/TaoData/output.jsonl
text_field: text  # Field name in JSON for text data

# Tokenizer training parameters
vocab_size: 8192
model_type: unigram  # SentencePiece model type: unigram, bpe, char, word
character_coverage: 0.9995

# Output configuration
output_dir: tokenizer
tokenizer_prefix: tokenizer

# Token ID configuration
unk_id: 0  # Unknown token ID
bos_id: 1  # Beginning of sentence token ID
eos_id: 2  # End of sentence token ID
pad_id: 3  # Padding token ID

# Custom special tokens
# These will be added to the vocabulary with explicit IDs.
# Useful for control tokens like <think>, <user>, <assistant>, etc.
# Note: Use \n for newline token, \t for tab, etc.
#
# NOTE(review): the angle-bracket token names below were missing from the copy
# of this file under review (the keys were empty, which would also make them
# duplicate null keys). They are reconstructed from the per-line descriptions;
# confirm the exact spellings against the training script before use.
# IDs 5-7 are not assigned in this mapping.
special_tokens:
  '<pad>': 3  # Padding (typically same as pad_id above)
  '<eos>': 2  # End of sentence (typically same as eos_id above)
  '<bos>': 1  # Beginning of sentence (typically same as bos_id above)
  '<unk>': 0  # Unknown (typically same as unk_id above)
  # Double-quoted so YAML decodes the escape: this key is a real newline
  # character. Unquoted or single-quoted it would be a backslash followed by n.
  "\n": 4  # Newline token
  '<think>': 8  # Special token for chain-of-thought reasoning
  '<user>': 9  # User message token
  '<assistant>': 10  # Assistant message token
  '<image>': 11  # Image token for multimodal models

# Data sampling (optional)
# Set to a number to train on only the first N samples from the JSONL file.
# Useful for quick testing or sub-sampling large datasets.
# Omit or set to null to use the entire file.
max_samples: 1000000

# Optional metadata
tokenizer_name: tokenizer