File size: 1,596 Bytes
486475d
baf3026
 
 
486475d
 
 
baf3026
486475d
076bc18
486475d
cf79a6c
076bc18
486475d
 
 
baf3026
486475d
076bc18
 
486475d
 
 
baf3026
486475d
 
076bc18
486475d
baf3026
cf79a6c
baf3026
486475d
 
baf3026
 
 
076bc18
baf3026
076bc18
90a2698
0d858b5
 
076bc18
baf3026
 
 
076bc18
baf3026
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# Development/Testing Configuration for FLAN-T5-base
# FAST iteration for debugging - optimized for speed
# VRAM Usage: ~9-10GB peak (12GB available)
# Training time: ~5 minutes on RTX 4070 12GB
# Use: python scripts/train.py training=dev

# Data-loading settings for the dev run. The key names match the arguments of
# torch.utils.data.DataLoader - presumably forwarded to it; confirm in the
# training script.
dataloader:
  # Per-step micro-batch; trainer.gradient_accumulation_steps multiplies this
  # into the effective batch size.
  batch_size: 10  # Optimal with FlashAttention
  shuffle: true
  num_workers: 4
  pin_memory: true
  # If these reach PyTorch's DataLoader, both options below require
  # num_workers > 0 - remember to adjust them when dropping to 0 workers
  # for debugging.
  persistent_workers: true
  prefetch_factor: 2

# Optimizer hyperparameters. How `name` is resolved to an optimizer class is
# not visible in this file - TODO confirm in the trainer.
optimizer:
  name: adamw
  # Keep the decimal point and signed exponent (5.0e-5, not 5e-5): YAML 1.1
  # loaders such as PyYAML otherwise read the value as a string, not a float.
  lr: 5.0e-5
  weight_decay: 0.01
  eps: 1.0e-8
  betas: [0.9, 0.999]

# Learning-rate schedule: `cosine` with a short warmup phase.
scheduler:
  name: cosine
  # NOTE(review): if warmup is counted in optimizer updates and
  # trainer.max_train_samples (3000) is the per-epoch total, one epoch is
  # 3000 / (10 * 6) = 50 updates, so warmup spans one of the three epochs -
  # confirm this is the intended share.
  warmup_steps: 50  # Less warmup for short runs

# Training-loop settings for the quick dev run.
trainer:
  max_epochs: 3
  gradient_clip_norm: 1.0
  # Keep the inline arithmetic in sync with dataloader.batch_size (10).
  gradient_accumulation_steps: 6  # Effective batch: 60 (10*6)
  validation_max_length: 128
  label_smoothing: 0.0  # Simpler backward graph for dev
  # Per-task weights - presumably multiplied into each task's loss;
  # TODO confirm in the trainer.
  task_weights:
    summarization: 1.0
    emotion: 1.5
    topic: 0.5  # Reduced - topic already saturated at 86%
  # Dataset caps that keep the dev run short.
  # NOTE(review): not visible here whether the caps apply per task or in total.
  max_train_samples: 3000
  max_val_samples: 300
  # NOTE(review): patience (5) exceeds max_epochs (3); if patience is counted
  # in epochs, early stopping can never trigger with this config - confirm
  # that this is intended.
  early_stopping_patience: 5
  log_grad_norm_frequency: 100
  # Presumably temperature-scaled sampling across tasks, with
  # task_sampling_alpha as the exponent - TODO confirm against the sampler.
  task_sampling: temperature
  task_sampling_alpha: 0.5
  # Presumably 0 disables the gradient-conflict diagnostics - TODO confirm.
  gradient_conflict_frequency: 0

# Enable compilation of the encoder and decoder for speed (worth the startup
# cost). Presumably this toggles torch.compile - TODO confirm in the model
# setup code.
compile_encoder: true
compile_decoder: true

# Speed / memory settings
# Shorter token sequences keep each step cheap.
tokenizer_max_length: 256
# NOTE(review): gradient checkpointing normally trades extra compute for lower
# activation memory, so it saves VRAM rather than time - confirm it is still
# wanted in a config that is otherwise tuned for speed.
gradient_checkpointing: true

# FLAN-T5 (T5 architecture) has NO absolute positional embeddings - position
# information comes only from the learned relative position bias in attention.
# Disabling this causes repetition loops (model can't track sequence position)
use_relative_position_bias: true

# Number of encoder layers to freeze, counted from the bottom: 6 freezes
# layers 0-5 to preserve pretrained knowledge.
# Upper layers (6-11) stay trainable and adapt to the fine-tuning tasks
# (e.g. summarization style).
freeze_encoder_layers: 6