| model: | |
| name: SOVYN-300M-Cortex | |
| vocab_size: 32000 | |
| max_seq_len: 512 | |
| n_layers: 21 | |
| hidden_size: 1024 | |
| n_heads: 16 | |
| n_kv_heads: 8 | |
| ffn_size: 3584 | |
| dropout: 0.0 | |
| rope_theta: 10000.0 | |
| tie_embeddings: true | |
| tokenizer: | |
| path: tokenizer_300m/sovyn.model | |
| training: | |
| train_path: data/sovyn_300m_train.jsonl | |
| output_dir: checkpoints | |
| checkpoint_prefix: sovyn_300m | |
| seed: 43 | |
| device: cuda | |
| dtype: bf16 | |
| max_steps: 8000 | |
| batch_size: 2 | |
| grad_accum_steps: 16 | |
| learning_rate: 0.00005 | |
| weight_decay: 0.1 | |
| warmup_steps: 800 | |
| max_grad_norm: 1.0 | |
| log_every: 100 | |
| save_every: 1000 | |
| save_total_limit: 1 | |
| save_optimizer: false | |
| save_dtype: bf16 | |
| save_step_checkpoints: false | |
| delete_before_save: true | |
| generation: | |
| max_new_tokens: 96 | |
| temperature: 0.75 | |
| top_k: 40 | |