OliverPerrin
Fixed compiling issue, added length penalty, and attempting to freeze encoder layers 0-5 to lower parameters and preserve T5's language understanding.
baf3026
# Development/Testing Configuration for FLAN-T5-base
# FAST iteration for debugging - optimized for speed
# VRAM Usage: ~9-10GB peak (12GB available)
# Training time: ~5 minutes on RTX 4070 12GB
# Use: python scripts/train.py training=dev
# DataLoader settings — children must be indented under the key, otherwise
# `dataloader:` parses as null and these become unrelated top-level keys.
dataloader:
  batch_size: 10  # Optimal with FlashAttention
  shuffle: true
  num_workers: 4
  pin_memory: true
  persistent_workers: true
  prefetch_factor: 2
# Optimizer settings — re-nested under `optimizer:` (flat keys at column 0
# would leave `optimizer` null and orphan these values).
optimizer:
  name: adamw
  lr: 5.0e-5
  weight_decay: 0.01
  eps: 1.0e-8
  betas: [0.9, 0.999]  # flow style is fine for a short leaf pair
# LR-schedule settings — re-nested under `scheduler:`.
scheduler:
  name: cosine
  warmup_steps: 50  # Less warmup for short runs
# Trainer settings — re-nested under `trainer:`; `task_weights` is a nested
# mapping and gets one further indent level.
trainer:
  max_epochs: 3
  gradient_clip_norm: 1.0
  gradient_accumulation_steps: 6  # Effective batch: 60 (10 * 6)
  validation_max_length: 128
  label_smoothing: 0.0  # Simpler backward graph for dev
  # Per-task loss weights for multi-task training
  task_weights:
    summarization: 1.0
    emotion: 1.5
    topic: 0.5  # Reduced - topic already saturated at 86%
  # Dev-run subsampling for fast iteration
  max_train_samples: 3000
  max_val_samples: 300
  early_stopping_patience: 5
  log_grad_norm_frequency: 100
  # Enable compile for speed (worth the startup cost)
  compile_encoder: true
  compile_decoder: true
  # Speed optimizations
  tokenizer_max_length: 256
  gradient_checkpointing: true
  # FLAN-T5 has NO learned positional embeddings - only relative position bias
  # Disabling this causes repetition loops (model can't track sequence position)
  use_relative_position_bias: true
  # Freeze lower encoder layers (0-5) to preserve pretrained knowledge
  # Upper layers (6-11) adapt to summarization style
  freeze_encoder_layers: 6