Spaces:
Sleeping
Sleeping
OliverPerrin
Fixed compiling issue, added length penalty, and attempting to freeze encoder layers 0-5 to lower parameters and preserve T5's language understanding.
baf3026
---
# Development/Testing Configuration for FLAN-T5-base
# FAST iteration for debugging - optimized for speed
# VRAM Usage: ~9-10GB peak (12GB available)
# Training time: ~5 minutes on RTX 4070 12GB
# Use: python scripts/train.py training=dev

dataloader:
  batch_size: 10  # Optimal with FlashAttention
  shuffle: true
  num_workers: 4
  pin_memory: true
  persistent_workers: true
  prefetch_factor: 2

optimizer:
  name: adamw
  lr: 5.0e-5
  weight_decay: 0.01
  eps: 1.0e-8
  betas: [0.9, 0.999]

scheduler:
  name: cosine
  warmup_steps: 50  # Less warmup for short runs

trainer:
  max_epochs: 3
  gradient_clip_norm: 1.0
  gradient_accumulation_steps: 6  # Effective batch: 60 (10*6)
  validation_max_length: 128
  label_smoothing: 0.0  # Simpler backward graph for dev
  task_weights:
    summarization: 1.0
    emotion: 1.5
    topic: 0.5  # Reduced - topic already saturated at 86%
  max_train_samples: 3000
  max_val_samples: 300
  early_stopping_patience: 5
  log_grad_norm_frequency: 100
  # Enable compile for speed (worth the startup cost)
  compile_encoder: true
  compile_decoder: true
  # Speed optimizations
  tokenizer_max_length: 256
  gradient_checkpointing: true
  # FLAN-T5 has NO learned positional embeddings - only relative position bias
  # Disabling this causes repetition loops (model can't track sequence position)
  use_relative_position_bias: true
  # Freeze lower encoder layers (0-5) to preserve pretrained knowledge
  # Upper layers (6-11) adapt to summarization style
  freeze_encoder_layers: 6