Add config/default.yaml
Browse files- config/default.yaml +104 -0
config/default.yaml
ADDED
|
@@ -0,0 +1,104 @@
---
# Default configuration for ASR training
# BILINGUAL TRAINING: Vietnamese + English
# Optimized for full_merged_dataset with bilingual support
# Dataset: ~194k training samples (77% Vietnamese, 23% English)

# Model architecture - Transformer Seq2Seq ASR with Language Embedding
# OPTIMIZED FOR BILINGUAL (Vietnamese + English) - ~30M parameters
model_name: "VietnameseASR_Transformer_Bilingual_30M"
d_model: 256  # Model dimension (reduced from 320)
num_encoder_layers: 14  # Number of encoder layers (kept same)
num_decoder_layers: 6  # Number of decoder layers (kept same)
num_heads: 8  # Number of attention heads (kept same)
d_ff: 2048  # Feed-forward dimension (reduced from 3120)
dropout: 0.2  # Dropout rate

# Audio processing
sample_rate: 16000
n_mels: 80
n_fft: 400
hop_length: 160
win_length: 400

# Tokenization - SentencePiece BPE for Bilingual (Vietnamese + English)
tokenizer_type: "sentencepiece"  # Changed from "bpe" to "sentencepiece"
bpe_vocab_path: "models/tokenizer_vi_en_3500.model"  # SentencePiece .model file
vocab_size: 3500

# Training hyperparameters
batch_size: 32  # Reduced to 32 to avoid OOM with CTC loss - effective batch size: 128 (32 * 4)
val_batch_size: 64  # Validation batch size (reduced proportionally)
# Raised total epoch count so a resumed run can go past epoch 20
num_epochs: 50
learning_rate: 0.0003
weight_decay: 0.0001
grad_clip: 0.5
gradient_accumulation_steps: 4  # Kept same - effective batch size: 128 (32 * 4)
warmup_pct: 0.03  # Reduced from 10% to 3% so the model starts learning sooner
use_constant_lr_on_resume: false

# Optimization
use_amp: true  # Enable mixed precision
use_bf16: true  # Use bfloat16 (better numerical stability than float16; supported on RTX 5060 Ti)
num_workers: 2  # Reduced from 12 to 2 to avoid BrokenPipeError (observed at epochs 6 and 18)
pin_memory: true
use_gradient_checkpointing: false  # Temporarily disabled due to a conflict with the CTC output
prefetch_factor: 4
persistent_workers: true
sort_by_length: true
cache_in_ram: false
use_bucketing: false

# Data
dataset_root: "data/processed/full_merged_dataset"
language_filter: null

# Decoding - Seq2Seq Autoregressive Generation
# Using autoregressive generation with teacher forcing during training

# Hybrid CTC/Attention Training (FIXES: Forces encoder to learn alignment)
use_ctc_loss: true  # Enable CTC loss to help encoder learn audio-text alignment
ctc_weight: 0.2  # Weight for CTC loss (0.2 = 20% CTC, 80% Attention) - reduced to save memory

# Scheduled Sampling (FIXES: Reduces teacher forcing, forces model to use encoder)
use_scheduled_sampling: true  # Enable scheduled sampling to reduce teacher forcing
teacher_forcing_initial: 1.0  # Start with 100% teacher forcing
teacher_forcing_final: 0.5  # End with 50% teacher forcing (gradual decay)

# Checkpointing
checkpoint_dir: "checkpoints"
save_every: 1  # Save checkpoint after every epoch

# Logging
log_file: "logs/training.log"

# Training run
run_name: "vietnamese_asr_transformer_bilingual_30m"

# Auto-Rollback
auto_rollback:
  enabled: true
  threshold_ratio: 1.3
  patience: 1

# Curriculum Learning
curriculum_learning:
  enabled: true
  required_wer: 0.70
  initial_ts_weight: 0.01
  short_sentence_epochs: 3
  max_duration_seconds: 4.0

# Validation decoding controls
# Limit decode length to speed up validation (prevents infinite loops)
val_max_len: 128
# Validate on a subset of validation batches (set null to disable)
val_subset_pct: null
# Hard-cap number of validation batches (set null to validate on full val set)
val_max_batches: null
# Use autoregressive generation for validation (slower but more accurate)
# If false, uses greedy decoding from logits (faster, ~2x speedup, avoids second forward pass)
use_autoregressive_validation: false
# Calculate WER/CER during validation (set to false to skip prediction and speed up validation)
calculate_val_wer: false  # Disabled so validation runs faster (loss only, no WER)