---
model:
  vocab_size: 4000
  d_model: 512
  n_layers: 8
  n_heads: 8
  d_ff: 2048
  max_seq_len: 512
  dropout: 0.2
  use_swiglu: true

training:
  batch_size: 4
  accumulation_steps: 4
  epochs: 20
  learning_rate: 0.0003
  min_lr: 0.00001
  weight_decay: 0.1
  max_grad_norm: 1.0
  num_threads: 4
  save_every: 5

  # Early stopping
  patience: 5
  min_delta: 0.001

  # Learning-rate schedule
  warmup_steps: 100
  use_lr_scheduler: true

  # Loss shaping
  label_smoothing: 0.1
  use_eos_loss_weight: true

data:
  corpus_path: corpus/mtp_mini_corpus.jsonl
  min_text_length: 50
  max_text_length: 2000
  validation_split: 0.15

  # Data augmentation
  use_augmentation: true
  augmentation_prob: 0.3

generation:
  # Sampling defaults
  default_max_tokens: 150
  default_temperature: 0.7
  default_top_k: 40
  default_top_p: 0.92
  default_repetition_penalty: 1.15
  min_response_length: 20
  use_length_penalty: true

  # Output filtering
  use_perplexity_filter: true
  max_perplexity: 100.0

  # Strings that terminate generation when emitted
  stop_sequences:
    - "###"
    - "\n\n\n"
    - "Instrucción:"