Spaces:
Sleeping
Sleeping
File size: 1,596 Bytes
# Development/Testing Configuration for FLAN-T5-base
# FAST iteration for debugging - optimized for speed
# VRAM Usage: ~9-10GB peak (12GB available)
# Training time: ~5 minutes on RTX 4070 12GB
# Use: python scripts/train.py training=dev
# Data-loading settings. The child keys must be indented under `dataloader:`;
# written flush-left they parse as top-level keys and `dataloader` becomes null.
# NOTE(review): key names match torch.utils.data.DataLoader arguments and are
# presumably forwarded to it - confirm in the training script.
dataloader:
  batch_size: 10  # Optimal with FlashAttention
  shuffle: true
  num_workers: 4
  pin_memory: true
  persistent_workers: true
  prefetch_factor: 2
# Optimizer settings. Nested under `optimizer:` so that `name` is scoped to
# this section; a flush-left `name` would collide with any other top-level
# `name` key, and most parsers silently keep only the last duplicate.
optimizer:
  name: adamw
  # Written with a decimal point and a signed exponent so YAML 1.1 loaders
  # also read these as floats rather than strings.
  lr: 5.0e-5
  weight_decay: 0.01
  eps: 1.0e-8
  betas: [0.9, 0.999]
# Learning-rate scheduler settings. Nested under `scheduler:` so that `name`
# is scoped to this section instead of becoming a (possibly duplicated)
# top-level key.
scheduler:
  name: cosine
  warmup_steps: 50  # Less warmup for short runs
# Training-loop settings.
# NOTE(review): the nesting below is reconstructed from flattened text. The
# three task names are placed under `task_weights`, and every key through
# `gradient_conflict_frequency` under `trainer` - confirm against the code
# that reads this section.
trainer:
  max_epochs: 3
  gradient_clip_norm: 1.0
  gradient_accumulation_steps: 6  # Effective batch: 60 (10*6)
  validation_max_length: 128
  label_smoothing: 0.0  # Simpler backward graph for dev
  # Per-task weights (presumably applied to each task's loss - confirm).
  task_weights:
    summarization: 1.0
    emotion: 1.5
    topic: 0.5  # Reduced - topic already saturated at 86%
  # Small sample caps keep dev runs short.
  max_train_samples: 3000
  max_val_samples: 300
  early_stopping_patience: 5
  log_grad_norm_frequency: 100
  task_sampling: temperature
  task_sampling_alpha: 0.5
  # NOTE(review): 0 presumably disables gradient-conflict logging - confirm.
  gradient_conflict_frequency: 0
# Enable compile for speed (worth the startup cost)
compile_encoder: true
compile_decoder: true

# Speed optimizations
tokenizer_max_length: 256
# NOTE(review): gradient checkpointing normally trades extra compute for lower
# memory use, so it may not belong under "speed" - value kept as in the
# original; confirm intent.
gradient_checkpointing: true

# FLAN-T5 has NO learned positional embeddings - only relative position bias
# Disabling this causes repetition loops (model can't track sequence position)
use_relative_position_bias: true

# Freeze lower encoder layers (0-5) to preserve pretrained knowledge
# Upper layers (6-11) adapt to summarization style
# The value must be a bare integer; any trailing text turns it into a string.
freeze_encoder_layers: 6