---
# Development/Testing Configuration for FLAN-T5-base
# FAST iteration for debugging - optimized for speed
# VRAM Usage: ~9-10GB peak (12GB available)
# Training time: ~5 minutes on RTX 4070 12GB
# Use: python scripts/train.py training=dev

dataloader:
  batch_size: 10  # Optimal with FlashAttention
  shuffle: true
  num_workers: 4
  pin_memory: true
  persistent_workers: true
  prefetch_factor: 2

optimizer:
  name: adamw
  lr: 5.0e-5
  weight_decay: 0.01
  eps: 1.0e-8
  betas: [0.9, 0.999]

scheduler:
  name: cosine
  warmup_steps: 50  # Less warmup for short runs

trainer:
  max_epochs: 3
  gradient_clip_norm: 1.0
  gradient_accumulation_steps: 6  # Effective batch: 60 (10*6)
  validation_max_length: 128
  label_smoothing: 0.0  # Simpler backward graph for dev
  task_weights:
    summarization: 1.0
    emotion: 1.5
    topic: 0.5  # Reduced - topic already saturated at 86%
  max_train_samples: 3000
  max_val_samples: 300
  early_stopping_patience: 5
  log_grad_norm_frequency: 100

  # Enable compile for speed (worth the startup cost)
  compile_encoder: true
  compile_decoder: true

  # Speed optimizations
  tokenizer_max_length: 256
  gradient_checkpointing: true

  # FLAN-T5 has NO learned positional embeddings - only relative position bias
  # Disabling this causes repetition loops (model can't track sequence position)
  use_relative_position_bias: true

  # NOTE(review): nesting of the keys from compile_encoder onward under `trainer`
  # is inferred from the collapsed original - confirm against the consuming schema.
  # Freeze lower encoder layers (0-5) to preserve pretrained knowledge
  # Upper layers (6-11) adapt to summarization style
  freeze_encoder_layers: 6