# Architecture hyperparameters for the model named below.
# NOTE(review): the exact meaning of each key is defined by the consuming
# code, which is not visible from this file; notes below that go beyond plain
# arithmetic on the values are inferred from key names and should be confirmed.
model:
  name: SOVYN-300M-Cortex
  vocab_size: 32000
  # Presumably the maximum sequence length in tokens — TODO confirm.
  max_seq_len: 512
  n_layers: 21
  hidden_size: 1024
  # hidden_size / n_heads = 1024 / 16 = 64 (per-head width, if heads split
  # hidden_size evenly — verify against the attention implementation).
  n_heads: 16
  # Half of n_heads (16 / 8 = 2); looks like two query heads per key/value
  # head — TODO confirm how the consumer uses this.
  n_kv_heads: 8
  # 3584 = 3.5 x hidden_size.
  ffn_size: 3584
  dropout: 0.0
  rope_theta: 10000.0
  # Presumably shares input and output embedding weights — confirm.
  tie_embeddings: true

# Location of the tokenizer model file.
tokenizer:
  # Relative path. NOTE(review): the directory it is resolved against is
  # decided by the consumer (not visible here) — confirm before moving files.
  path: tokenizer_300m/sovyn.model
# Training-run settings: data/output locations, optimisation schedule,
# logging cadence and checkpoint policy.
# NOTE(review): key semantics are defined by the training script, which is
# not visible from this file; hedged notes below are inferred from key names.
training:
  train_path: data/sovyn_300m_train.jsonl
  output_dir: checkpoints
  checkpoint_prefix: sovyn_300m
  seed: 43
  device: cuda
  dtype: bf16
  max_steps: 8000
  # batch_size x grad_accum_steps = 2 x 16 = 32; presumably the number of
  # sequences per optimizer step on a single device — TODO confirm.
  batch_size: 2
  grad_accum_steps: 16
  # 0.00005 = 5e-5. Keep the plain-decimal form: exponent notation without a
  # decimal point (e.g. 5e-5) is loaded as a string by some YAML 1.1 parsers.
  learning_rate: 0.00005
  weight_decay: 0.1
  # 800 is 10% of max_steps (8000).
  warmup_steps: 800
  max_grad_norm: 1.0
  log_every: 100
  # NOTE(review): save_every is set while save_step_checkpoints is false; how
  # the two interact is not visible here — verify against the training script.
  save_every: 1000
  save_total_limit: 1
  # Presumably omits optimizer state from checkpoints — confirm whether
  # resuming a run is expected to work with this disabled.
  save_optimizer: false
  save_dtype: bf16
  save_step_checkpoints: false
  # NOTE(review): with save_total_limit: 1, the name suggests the previous
  # checkpoint is removed before the new one is written; if so, a failed save
  # could leave no checkpoint on disk — confirm.
  delete_before_save: true

# Default sampling settings for text generation.
# NOTE(review): interpretation is up to the consuming code (not visible
# here); the notes below are inferred from key names.
generation:
  # Presumably an upper bound on tokens produced per request — TODO confirm.
  max_new_tokens: 96
  temperature: 0.75
  # Presumably restricts sampling to the 40 highest-scoring tokens — confirm.
  top_k: 40