# File size: 4,072 Bytes
# d81700b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Default configuration for ASR training
# BILINGUAL TRAINING: Vietnamese + English
# Optimized for full_merged_dataset with bilingual support
# Dataset: ~194k training samples (77% Vietnamese, 23% English)

# Model architecture - Transformer Seq2Seq ASR with Language Embedding
# OPTIMIZED FOR BILINGUAL (Vietnamese + English) - ~30M parameters
model_name: "VietnameseASR_Transformer_Bilingual_30M"
d_model: 256               # Model (embedding) dimension (reduced from 320)
num_encoder_layers: 14     # Number of encoder layers (kept same)
num_decoder_layers: 6      # Number of decoder layers (kept same)
num_heads: 8               # Number of attention heads (kept same)
d_ff: 2048                 # Feed-forward inner dimension (reduced from 3120)
dropout: 0.2               # Dropout rate applied throughout the model

# Audio processing (log-mel front end)
sample_rate: 16000  # Hz
n_mels: 80          # Mel filterbank channels
n_fft: 400          # FFT size = 25 ms window at 16 kHz
hop_length: 160     # 10 ms hop at 16 kHz
win_length: 400     # Analysis window, same as n_fft

# Tokenization - SentencePiece BPE for Bilingual (Vietnamese + English)
tokenizer_type: "sentencepiece"  # Changed from "bpe" to "sentencepiece"
# NOTE(review): key name says "bpe" but the value is a SentencePiece .model file —
# confirm the loader reads this key when tokenizer_type is "sentencepiece"
bpe_vocab_path: "models/tokenizer_vi_en_3500.model"  # SentencePiece .model file
vocab_size: 3500

# Training hyperparameters
batch_size: 32  # Reduced to 32 to avoid OOM with CTC loss - effective batch size: 128 (32 * 4)
val_batch_size: 64  # Validation batch size (reduced proportionally)
# Total epoch count raised so a resumed run can continue past epoch 20
num_epochs: 50
learning_rate: 0.0003
weight_decay: 0.0001
grad_clip: 0.5
gradient_accumulation_steps: 4  # With batch_size 32 the effective batch size is 128 (32 * 4)
warmup_pct: 0.03  # Reduced from 10% to 3% so the model starts learning sooner
use_constant_lr_on_resume: false

# Optimization
use_amp: true   # Enable mixed precision
use_bf16: true   # Use bfloat16 (better numerical stability than float16; supported by RTX 5060 Ti)
num_workers: 2   # Reduced from 12 to 2 to avoid BrokenPipeError (seen at epochs 6 and 18)
pin_memory: true
use_gradient_checkpointing: false  # Temporarily disabled due to a conflict with the CTC output
prefetch_factor: 4
persistent_workers: true
sort_by_length: true
cache_in_ram: false
use_bucketing: false

# Data
dataset_root: "data/processed/full_merged_dataset"
language_filter: null  # NOTE(review): null presumably disables language filtering — confirm in dataset loader

# Decoding - Seq2Seq Autoregressive Generation
# Using autoregressive generation with teacher forcing during training

# Hybrid CTC/Attention Training (FIXES: Forces encoder to learn alignment)
use_ctc_loss: true          # Enable CTC loss to help encoder learn audio-text alignment
ctc_weight: 0.2              # Weight for CTC loss (0.2 = 20% CTC, 80% Attention) - Reduced to save memory

# Scheduled Sampling (FIXES: Reduces teacher forcing, forces model to use encoder)
use_scheduled_sampling: true  # Enable scheduled sampling to reduce teacher forcing
teacher_forcing_initial: 1.0  # Start with 100% teacher forcing
teacher_forcing_final: 0.5     # End with 50% teacher forcing (gradual decay)

# Checkpointing
checkpoint_dir: "checkpoints"
save_every: 1  # Save checkpoint after every epoch

# Logging
log_file: "logs/training.log"

# Training run
run_name: "vietnamese_asr_transformer_bilingual_30m"

# Auto-Rollback
auto_rollback:
  enabled: true
  threshold_ratio: 1.3  # NOTE(review): presumably the loss-degradation ratio that triggers rollback — confirm in trainer
  patience: 1  # Number of bad evaluations tolerated before rolling back

# Curriculum Learning
curriculum_learning:
  enabled: true
  required_wer: 0.70  # NOTE(review): presumably the WER gate for advancing the curriculum — confirm
  initial_ts_weight: 0.01
  short_sentence_epochs: 3  # Epochs spent on the short-utterance subset
  max_duration_seconds: 4.0  # Duration cap defining "short" utterances — TODO confirm units against loader

# Validation decoding controls
# Limit decode length to speed up validation (prevents infinite loops)
val_max_len: 128
# Validate on a subset of validation batches (set null to disable)
val_subset_pct: null
# Hard-cap number of validation batches (set null to validate on full val set)
val_max_batches: null
# Use autoregressive generation for validation (slower but more accurate)
# If false, uses greedy decoding from logits (faster, ~2x speedup, avoids second forward pass)
use_autoregressive_validation: false
# Calculate WER/CER during validation (set to false to skip prediction and speed up validation)
calculate_val_wer: false  # WER computation disabled so validation runs faster (loss only)