---
# Version 2: Improved Transformer
# With label smoothing, better optimizer, and beam search

# Data Configuration
data:
  src_lang: "vi"
  tgt_lang: "en"
  train_src: "data/raw_opus100/train.vi.txt"
  train_tgt: "data/raw_opus100/train.en.txt"
  # Validation files (if not exist, will auto-split from training)
  val_src: "data/raw_opus100/val.vi.txt"
  val_tgt: "data/raw_opus100/val.en.txt"
  val_split: 0.1  # 10% of training data for validation if val files don't exist
  test_src: "data/raw_opus100/public_test.vi.txt"
  test_tgt: "data/raw_opus100/public_test.en.txt"
  max_seq_length: 128

# Vocabulary
vocab:
  src_vocab_size: 40000  # Increased for better Vietnamese coverage
  tgt_vocab_size: 40000
  min_freq: 2

# Model - Same architecture
model:
  d_model: 512
  n_heads: 8
  n_encoder_layers: 6
  n_decoder_layers: 6
  d_ff: 2048
  dropout: 0.1
  max_seq_length: 512

# Training - Improved
training:
  batch_size: 32
  epochs: 8  # Good balance of quality and time
  optimizer: "adamw"  # Changed to AdamW
  learning_rate: 0.0001
  weight_decay: 0.01  # Added weight decay
  scheduler: "warmup"
  warmup_steps: 4000
  label_smoothing: 0.1  # Added label smoothing
  gradient_accumulation_steps: 2  # Effective batch size = 64
  max_grad_norm: 1.0
  use_wandb: true
  save_every: 1000
  eval_every: 500
  log_every: 100
  early_stopping_patience: 5

# Inference
inference:
  beam_size: 5  # Beam search
  max_decode_length: 128
  length_penalty: 0.6

# Paths
paths:
  checkpoint_dir: "experiments/v2_vi2en/checkpoints"
  log_dir: "experiments/v2_vi2en/logs"
  vocab_dir: "data/vocab_v2_vi2en"

# NOTE(review): device/seed reconstructed as top-level keys (conventional
# placement); original nesting was lost in extraction — confirm against loader.
device: "cuda"
seed: 42

# Weights & Biases
wandb:
  project: "nlp-transformer-mt"
  entity: null

# Version info
version:
  name: "v2_vi2en"
  description: "Improved Vi→En training with label smoothing and AdamW"
  improvements:
    - "Label smoothing (0.1)"
    - "AdamW optimizer with weight decay"
    - "Beam search (size=5)"
    - "Gradient accumulation"
    - "Early stopping"