TaoNet-mini-T2 / code /TaoTrain /configs /tokenizer.yaml
StarMist0012's picture
Add files using upload-large-folder tool
e2bfccc verified
---
# Example configuration: train a SentencePiece tokenizer from JSONL data.

# Dataset source (a JSONL file)
jsonl_path: /home/student/Data/TaoData/output.jsonl
# Name of the JSON field that holds the text data
text_field: text

# Tokenizer training parameters
vocab_size: 8192
# SentencePiece model type; one of: unigram, bpe, char, word
model_type: unigram
character_coverage: 0.9995

# Output configuration
output_dir: tokenizer
tokenizer_prefix: tokenizer

# Token ID configuration
# Unknown token ID
unk_id: 0
# Beginning-of-sentence token ID
bos_id: 1
# End-of-sentence token ID
eos_id: 2
# Padding token ID
pad_id: 3
# Custom special tokens
# Mapping of token string -> explicit vocabulary ID. These will be added to
# the vocabulary with the IDs given here. Useful for control tokens such as
# <think>, <user>, <assistant>, etc.
# The entries must stay indented under `special_tokens:`; written at column 0
# they parse as separate top-level keys and `special_tokens` itself is null.
# Whitespace tokens: write the escape (\n, \t, ...) inside double quotes so
# that YAML decodes it to the real character. Unquoted or single-quoted, the
# key would be a literal backslash followed by a letter.
# NOTE(review): IDs 5-7 are not assigned here - presumably reserved; confirm
# against the training script.
special_tokens:
  <PAD>: 3  # Padding (typically same as pad_id above)
  <EOS>: 2  # End of sentence (typically same as eos_id above)
  <BOS>: 1  # Beginning of sentence (typically same as bos_id above)
  <UNK>: 0  # Unknown (typically same as unk_id above)
  "\n": 4  # Newline token - double-quoted so \n decodes to a newline character
  <think>: 8  # Special token for chain-of-thought reasoning
  <user>: 9  # User message token
  <assistant>: 10  # Assistant message token
  <image>: 11  # Image token for multimodal models
# Optional data sampling
#   - an integer N trains on only the first N samples from the JSONL file
#     (handy for quick testing or for sub-sampling large datasets)
#   - omitting the key, or setting it to null, uses the entire file
max_samples: 1000000

# Optional metadata
tokenizer_name: tokenizer