# Version 2: Improved Transformer
# With label smoothing, better optimizer, and beam search
# Data Configuration
data:
  src_lang: "vi"
  tgt_lang: "en"
  train_src: "data/raw_opus100/train.vi.txt"
  train_tgt: "data/raw_opus100/train.en.txt"
  # Validation files (if they don't exist, a validation set is auto-split from the training data)
  val_src: "data/raw_opus100/val.vi.txt"
  val_tgt: "data/raw_opus100/val.en.txt"
  val_split: 0.1 # 10% of training data for validation if val files don't exist
  test_src: "data/raw_opus100/public_test.vi.txt"
  test_tgt: "data/raw_opus100/public_test.en.txt"
  max_seq_length: 128
# Vocabulary
vocab:
  src_vocab_size: 40000 # Increased for better Vietnamese coverage
  tgt_vocab_size: 40000
  min_freq: 2
# Model - Same architecture as v1
model:
  d_model: 512
  n_heads: 8
  n_encoder_layers: 6
  n_decoder_layers: 6
  d_ff: 2048
  dropout: 0.1
  max_seq_length: 512
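  # Note (assumption about the training code): model.max_seq_length (512) is taken to be the
  # positional-encoding capacity, while inputs are truncated to data.max_seq_length (128).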
# Training - Improved over v1
training:
  batch_size: 32
  epochs: 8 # Good balance of quality and time
  optimizer: "adamw" # Changed to AdamW
  learning_rate: 0.0001
  weight_decay: 0.01 # Added weight decay
  scheduler: "warmup"
  warmup_steps: 4000
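  # "warmup" is assumed to mean linear warmup to learning_rate over the first 4000 steps, e.g.
  #   lr(step) = learning_rate * min(step / warmup_steps, 1.0)
  # The decay behavior after warmup (constant vs. inverse square root) depends on the training code.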
  label_smoothing: 0.1 # Added label smoothing
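  # With smoothing eps = 0.1, the one-hot target is typically softened to
  #   q(gold) = 1 - eps,  q(other) = eps / (vocab_size - 1)
  # which discourages over-confident predictions.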
  gradient_accumulation_steps: 2 # Effective batch size = 64
  max_grad_norm: 1.0
  use_wandb: true
  save_every: 1000
  eval_every: 500
  log_every: 100
  early_stopping_patience: 5
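  # Early stopping is assumed to halt training once the validation metric has not improved
  # for 5 consecutive evaluations (i.e. 5 * eval_every = 2500 steps).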
# Inference
inference:
  beam_size: 5 # Beam search
  max_decode_length: 128
  length_penalty: 0.6
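  # length_penalty is assumed to follow the GNMT-style formula (actual behavior depends on the decoder):
  #   score(Y) = log P(Y | X) / lp(Y),  with lp(Y) = ((5 + |Y|) / 6) ** 0.6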
# Paths
paths:
  checkpoint_dir: "experiments/v2_vi2en/checkpoints"
  log_dir: "experiments/v2_vi2en/logs"
  vocab_dir: "data/vocab_v2_vi2en"

device: "cuda"
seed: 42
# Weights & Biases
wandb:
  project: "nlp-transformer-mt"
  entity: null
# Version info
version:
  name: "v2_vi2en"
  description: "Improved Vi→En training with label smoothing and AdamW"
  improvements:
    - "Label smoothing (0.1)"
    - "AdamW optimizer with weight decay"
    - "Beam search (size=5)"
    - "Gradient accumulation"
    - "Early stopping"