---
# Model / tokenizer / training / generation configuration for SOVYN-300M-Cortex.
#
# NOTE(review): this file was previously collapsed onto a single line, which
# is not parseable as a nested mapping. The nesting below is reconstructed
# from the section keys that carried no value of their own (`model`,
# `tokenizer`, `training`, `generation`) -- confirm against the config loader.

# Network architecture hyperparameters.
model:
  name: SOVYN-300M-Cortex
  vocab_size: 32000
  max_seq_len: 512
  n_layers: 21
  hidden_size: 1024
  # hidden_size / n_heads = 64.
  n_heads: 16
  # Half of n_heads; presumably grouped-query attention -- TODO confirm.
  n_kv_heads: 8
  ffn_size: 3584
  dropout: 0.0
  rope_theta: 10000.0
  tie_embeddings: true

# NOTE(review): assumed to be a top-level section rather than a child of
# `model`; the flattened original does not disambiguate -- TODO confirm.
tokenizer:
  path: tokenizer_300m/sovyn.model

# Training-run settings: data, output location, optimisation and checkpointing.
training:
  train_path: data/sovyn_300m_train.jsonl
  output_dir: checkpoints
  checkpoint_prefix: sovyn_300m
  seed: 43
  device: cuda
  dtype: bf16

  max_steps: 8000
  batch_size: 2
  # batch_size * grad_accum_steps = 32; presumably the effective batch size
  # per optimiser step -- verify against the training loop.
  grad_accum_steps: 16
  learning_rate: 0.00005
  weight_decay: 0.1
  # 800 of 8000 max_steps, i.e. 10% of the run.
  warmup_steps: 800
  max_grad_norm: 1.0

  log_every: 100
  save_every: 1000
  save_total_limit: 1
  save_optimizer: false
  save_dtype: bf16
  save_step_checkpoints: false
  delete_before_save: true

# Sampling settings used when generating text.
generation:
  max_new_tokens: 96
  temperature: 0.75
  top_k: 40