---
# Configuration with five sections: model, training, data, generation,
# and memory. Each section is a mapping; its keys are indented two spaces.

model:
  vocab_size: 8000
  d_model: 1024
  n_layers: 24
  n_heads: 16
  d_ff: 4096
  max_seq_len: 2048
  dropout: 0.15
  use_swiglu: true
  use_flash_attention: true
  use_confidence_scoring: true
  # NOTE(review): generation.min_confidence_threshold carries the same
  # value (0.3); confirm which one the consumer reads and keep them in sync.
  min_confidence: 0.3

training:
  batch_size: 2
  # NOTE(review): if this is gradient accumulation, the effective batch is
  # batch_size * accumulation_steps = 32 -- confirm against the trainer.
  accumulation_steps: 16
  epochs: 25
  learning_rate: 0.0002
  min_lr: 0.000005
  weight_decay: 0.15
  max_grad_norm: 0.5
  num_threads: 4
  save_every: 5

  # Presumably early-stopping controls -- TODO confirm units (epochs/steps).
  patience: 10
  min_delta: 0.0003

  # Learning-rate schedule switches.
  warmup_steps: 500
  use_lr_scheduler: true

  # Loss shaping; eos_weight presumably applies only when
  # use_eos_loss_weight is true -- verify in the loss code.
  label_smoothing: 0.15
  use_eos_loss_weight: true
  eos_weight: 3.0

  # NOTE(review): both flags are repeated with the same values under
  # `memory`; confirm which section is authoritative.
  use_gradient_checkpointing: true
  use_fp16: true

data:
  corpus_path: corpus/mtp_mini_corpus.jsonl
  # NOTE(review): length units (characters vs. tokens) are not visible
  # here -- confirm against the data loader.
  min_text_length: 100
  max_text_length: 4000
  validation_split: 0.2

  use_augmentation: true
  augmentation_prob: 0.4

generation:
  default_max_tokens: 300
  default_temperature: 0.65
  default_top_k: 50
  default_top_p: 0.9
  default_repetition_penalty: 1.2
  min_response_length: 30

  # Output quality filters: each use_* switch is paired with its limit.
  use_perplexity_filter: true
  max_perplexity: 80.0
  use_entropy_threshold: true
  max_entropy: 4.0

  # NOTE(review): same value as model.min_confidence (0.3); keep in sync.
  use_confidence_filter: true
  min_confidence_threshold: 0.3

  # Entries stay double-quoted: "###" would otherwise start a comment,
  # "\n" needs escape processing, and a trailing ":" would read as a key.
  stop_sequences:
    - "###"
    - "\n\n\n\n"
    - "Instrucción:"
    - "Usuario:"

memory:
  # NOTE(review): duplicates training.use_fp16 and
  # training.use_gradient_checkpointing; confirm which section is read.
  use_fp16: true
  use_gradient_checkpointing: true
  max_memory_gb: 14