---
# Model architecture.
model:
  vocab_size: 4000     # tokenizer vocabulary size
  d_model: 512         # embedding / hidden width
  n_layers: 8          # transformer blocks
  n_heads: 8           # attention heads per block
  d_ff: 2048           # feed-forward inner dimension (4 * d_model)
  max_seq_len: 512     # maximum context length in tokens
  dropout: 0.2
  use_swiglu: true     # presumably selects SwiGLU FFN activation — verify against model code

# Training hyperparameters.
training:
  batch_size: 4
  accumulation_steps: 4    # effective batch = batch_size * accumulation_steps
  epochs: 20
  learning_rate: 0.0003    # peak LR
  min_lr: 0.00001          # scheduler floor
  weight_decay: 0.1
  max_grad_norm: 1.0       # gradient clipping threshold
  num_threads: 4
  save_every: 5            # checkpoint interval — TODO confirm unit (epochs vs steps)

  # Early stopping.
  patience: 5
  min_delta: 0.001         # minimum validation-loss improvement to reset patience

  # Learning-rate schedule.
  warmup_steps: 100
  use_lr_scheduler: true

  # Loss shaping.
  label_smoothing: 0.1
  use_eos_loss_weight: true   # NOTE(review): likely up-weights EOS token loss — confirm in trainer

# Dataset configuration.
data:
  corpus_path: corpus/mtp_mini_corpus.jsonl
  min_text_length: 50       # filter bound — TODO confirm unit (characters vs tokens)
  max_text_length: 2000
  validation_split: 0.15    # fraction of the corpus held out for validation

  # Augmentation.
  use_augmentation: true
  augmentation_prob: 0.3    # presumably per-sample probability — verify against data loader

# Generation / sampling defaults.
generation:
  default_max_tokens: 150
  default_temperature: 0.7
  default_top_k: 40
  default_top_p: 0.92
  default_repetition_penalty: 1.15
  min_response_length: 20
  use_length_penalty: true

  # Quality filter: discard outputs above the perplexity threshold.
  use_perplexity_filter: true
  max_perplexity: 100.0

  # Sequences that terminate generation when emitted
  # (quoted: "#" would start a comment, "\n" needs double-quote escapes,
  # and the trailing ":" is safest quoted).
  stop_sequences:
    - "###"
    - "\n\n\n"
    - "Instrucción:"