# Example configuration for training a SentencePiece tokenizer from JSONL data

# Dataset source: a JSONL file (one JSON object per line)
# NOTE(review): jsonl_path is an absolute, machine-specific path; point it at
# your own dataset before using this example.
jsonl_path: /home/student/Data/TaoData/output.jsonl
text_field: text  # Key in each JSON object that holds the text data

# Tokenizer training parameters
# NOTE(review): presumably forwarded to the SentencePiece trainer options of
# the same names; confirm in the training script.
vocab_size: 8192  # Target vocabulary size
model_type: unigram  # SentencePiece model type: unigram, bpe, char, word
character_coverage: 0.9995  # Fraction of input characters the model should cover

# Output configuration
# output_dir is a relative path here.
# NOTE(review): presumably resolved against the working directory of the
# training script, with files named <output_dir>/<tokenizer_prefix>.* — confirm.
output_dir: tokenizer
tokenizer_prefix: tokenizer

# Token ID configuration
# The same IDs are repeated for <UNK>/<BOS>/<EOS>/<PAD> under special_tokens
# below; keep the two sections in sync when changing any of them.
unk_id: 0  # Unknown token ID
bos_id: 1  # Beginning of sentence token ID
eos_id: 2  # End of sentence token ID
pad_id: 3  # Padding token ID

# Custom special tokens
# These will be added to the vocabulary with explicit IDs
# Useful for control tokens like <think>, <user>, <assistant>, etc.
# Note: the "\n" key below is double-quoted, so YAML decodes the escape into a
# real newline character (likewise "\t" would become a tab). Unquoted or
# single-quoted, \n would stay as the two characters backslash + n.
# NOTE(review): IDs 5-7 are not assigned in this mapping — confirm whether
# they are intentionally left free.
special_tokens:
  <PAD>: 3      # Padding (typically same as pad_id above)
  <EOS>: 2      # End of sentence (typically same as eos_id above)
  <BOS>: 1      # Beginning of sentence (typically same as bos_id above)
  <UNK>: 0      # Unknown (typically same as unk_id above)
  "\n": 4       # Newline token - double-quoted so YAML decodes \n to a newline
  <think>: 8    # Special token for chain-of-thought reasoning
  <user>: 9     # User message token
  <assistant>: 10  # Assistant message token
  <image>: 11   # Image token for multimodal models

# Data sampling (optional)
# Set to an integer N to train on only the first N samples from the JSONL file,
# e.g. for quick testing or to sub-sample a large dataset.
# Omit the key or set it to null to use the entire file.
max_samples: 1000000

# Optional metadata
# NOTE(review): same value as output_dir and tokenizer_prefix above; confirm
# whether the three are meant to vary independently.
tokenizer_name: tokenizer