# MTP Mini - Optimized configuration: 20x larger and smarter

# Model architecture and confidence-scoring settings.
# The "Nx" notes compare against an earlier, smaller configuration; where the
# previous value is known it is shown in parentheses.
model:
  vocab_size: 8000              # 2x the vocabulary (previous value not shown here)
  d_model: 1024                 # 2x the dimension (512 → 1024)
  n_layers: 24                  # 3x the layers (8 → 24)
  n_heads: 16                   # 2x the heads (8 → 16)
  d_ff: 4096                    # 4x d_model
  max_seq_len: 2048             # 4x the context (512 → 2048)
  dropout: 0.15                 # Tuned dropout
  use_swiglu: true              # Better activation
  use_flash_attention: true     # Optimized attention
  use_confidence_scoring: true  # Anti-hallucination
  # NOTE(review): same value as generation.min_confidence_threshold (0.3);
  # confirm whether the two are meant to stay in sync.
  min_confidence: 0.3

# Training hyperparameters: batching, optimizer, early stopping, LR schedule,
# regularization and GPU memory options.
training:
  batch_size: 2                 # Small, because the model is large
  accumulation_steps: 16        # Effective batch = 2 x 16 = 32
  epochs: 25                    # 25 epochs, as requested
  learning_rate: 0.0002         # Low LR for stability
  min_lr: 0.000005              # presumably the floor for the LR scheduler — TODO confirm
  weight_decay: 0.15            # Strong regularization
  max_grad_norm: 0.5            # presumably the gradient-clipping norm — TODO confirm
  num_threads: 4
  save_every: 5                 # Save every 5 epochs
  
  # Early stopping (to avoid losing information)
  patience: 10                  # Very patient (waits 10 epochs without improvement)
  min_delta: 0.0003             # Minimum acceptable improvement
  
  # Learning rate
  warmup_steps: 500
  use_lr_scheduler: true
  
  # Regularization
  label_smoothing: 0.15
  use_eos_loss_weight: true
  eos_weight: 3.0
  
  # GPU optimizations
  # NOTE(review): both flags below are also set under the top-level `memory`
  # section; confirm which copy the consumer reads and keep them in sync.
  use_gradient_checkpointing: true   # Saves VRAM
  use_fp16: true                     # Mixed precision

# Corpus location, text-length filtering, validation split and augmentation.
data:
  # NOTE(review): relative path; the base directory it resolves against is
  # decided by the consumer — confirm.
  corpus_path: corpus/mtp_mini_corpus.jsonl
  # NOTE(review): unit of the length limits (characters vs. tokens) is not
  # stated here — confirm against the loader.
  min_text_length: 100
  max_text_length: 4000
  validation_split: 0.2         # 20% for validation
  
  # Augmentation
  use_augmentation: true
  augmentation_prob: 0.4

# Default sampling parameters, output-quality filters and stop sequences used
# at generation time.
generation:
  default_max_tokens: 300
  default_temperature: 0.65
  default_top_k: 50
  default_top_p: 0.9
  default_repetition_penalty: 1.2
  min_response_length: 30
  
  # Anti-hallucination
  use_perplexity_filter: true
  max_perplexity: 80.0
  use_entropy_threshold: true
  max_entropy: 4.0
  
  # Quality control
  # NOTE(review): min_confidence_threshold has the same value as
  # model.min_confidence (0.3); confirm whether they should stay in sync.
  use_confidence_filter: true
  min_confidence_threshold: 0.3
  
  # The strings below are runtime data and must stay exactly as written.
  stop_sequences:
    - "###"
    # Double-quoted so the \n escapes are parsed as real newlines:
    # this entry is four consecutive newline characters.
    - "\n\n\n\n"
    - "Instrucción:"
    - "Usuario:"

# Memory optimization
# NOTE(review): use_fp16 and use_gradient_checkpointing are also set under
# `training`; confirm which copy the consumer reads and keep them in sync.
memory:
  use_fp16: true
  use_gradient_checkpointing: true
  max_memory_gb: 14             # presumably a cap in GB on memory use — TODO confirm how it is enforced