Spaces:
Sleeping
Sleeping
OliverPerrin
Cleaned up code; added a multi-seed training wrapper and a PyTorch profiler training option; updated the Gradio demo; revised the research paper to reflect the new training techniques, architecture changes, and new training results; architecture.md now explains all designs and decisions
# Medium Configuration for FLAN-T5-base
# Balanced: good quality with reasonable speed
# VRAM Usage: ~8-9GB (12GB available)
# Training time: ~25-35 minutes on RTX 4070 12GB
# Use: python scripts/train.py training=medium

dataloader:
  batch_size: 10  # Optimal for RTX 4070 12GB
  shuffle: true
  num_workers: 4
  pin_memory: true
  persistent_workers: true
  prefetch_factor: 2

optimizer:
  name: adamw
  lr: 3.0e-5  # Balanced LR
  weight_decay: 0.01  # Standard regularization
  eps: 1.0e-6
  betas: [0.9, 0.999]  # Standard betas

scheduler:
  name: cosine
  warmup_steps: 300  # Standard warmup

trainer:
  max_epochs: 5  # Allow more epochs
  gradient_clip_norm: 1.0
  gradient_accumulation_steps: 6  # Effective batch: 60 (10*6)
  validation_max_length: 128
  label_smoothing: 0.1  # Standard smoothing
  task_weights:
    summarization: 1.0
    emotion: 1.5  # Balanced boost
    topic: 0.5  # Balanced weight
  max_train_samples: 25000
  max_val_samples: 2500
  early_stopping_patience: 3  # More patience
  log_grad_norm_frequency: 100
  task_sampling: temperature
  task_sampling_alpha: 0.5
  gradient_conflict_frequency: 0
  compile_encoder: true
  compile_decoder: true
  # Balance: shorter sequences but keep T5's relative position bias for quality
  tokenizer_max_length: 384
  gradient_checkpointing: true
  # FLAN-T5 has NO learned positional embeddings - only relative position bias
  # Disabling this causes repetition loops (model can't track sequence position)
  use_relative_position_bias: true
  # Freeze lower encoder layers (0-5) to preserve pretrained knowledge
  # Upper layers (6-11) adapt to summarization style
  freeze_encoder_layers: 6