Add config/default.yaml

d81700b verified 14 days ago

4.07 kB

	# Default configuration for ASR training
	# BILINGUAL TRAINING: Vietnamese + English
	# Optimized for full_merged_dataset with bilingual support
	# Dataset: ~194k training samples (77% Vietnamese, 23% English)

	# Model architecture: Transformer seq2seq ASR with a language embedding.
	# Sized for bilingual (Vietnamese + English) training, roughly 30M parameters.
	model_name: 'VietnameseASR_Transformer_Bilingual_30M'
	# Model dimension (reduced from 320).
	d_model: 256
	# Encoder depth, decoder depth and attention head count (all kept the same).
	num_encoder_layers: 14
	num_decoder_layers: 6
	num_heads: 8
	# Feed-forward dimension (reduced from 3120).
	d_ff: 2048
	# Dropout rate.
	dropout: 0.2

	# Audio processing
	# NOTE(review): units are not stated in this file. If sample_rate is in Hz and
	# the FFT/window/hop sizes are in samples, then 400 samples = 25 ms window and
	# 160 samples = 10 ms hop at 16 kHz - TODO confirm against the feature extractor.
	sample_rate: 16000
	# Presumably the number of mel filterbank channels - TODO confirm.
	n_mels: 80
	n_fft: 400
	hop_length: 160
	# Same value as n_fft above.
	win_length: 400

	# Tokenization: SentencePiece BPE shared by Vietnamese and English.
	# Tokenizer backend (changed from "bpe" to "sentencepiece").
	tokenizer_type: 'sentencepiece'
	# Location of the SentencePiece .model file.
	bpe_vocab_path: 'models/tokenizer_vi_en_3500.model'
	vocab_size: 3500

	# Training hyperparameters
	# Reduced to 32 to avoid OOM with CTC loss.
	# Effective batch size: 128 (batch_size 32 * gradient_accumulation_steps 4).
	batch_size: 32
	val_batch_size: 64 # Validation batch size (reduced proportionally)
	# Total epoch count raised so that a resumed run can go past epoch 20
	num_epochs: 50
	learning_rate: 0.0003
	weight_decay: 0.0001
	grad_clip: 0.5
	gradient_accumulation_steps: 4 # Kept same - effective batch size: 128 (32 * 4)
	warmup_pct: 0.03 # Reduced from 10% to 3% so the model learns faster
	use_constant_lr_on_resume: false

	# Optimization
	use_amp: true # Enable mixed precision
	# Use bfloat16 (better numerical stability than float16; supported by the RTX 5060 Ti)
	use_bf16: true
	# Reduced from 12 to 2 to avoid BrokenPipeError (as seen at epochs 6 and 18)
	num_workers: 2
	pin_memory: true
	# Temporarily disabled because of a conflict with the CTC output
	use_gradient_checkpointing: false
	# NOTE(review): prefetch_factor / persistent_workers look like data-loader
	# worker options that only matter while num_workers > 0 - confirm with the loader code.
	prefetch_factor: 4
	persistent_workers: true
	sort_by_length: true
	cache_in_ram: false
	use_bucketing: false

	# Data
	# Root directory of the processed, merged dataset.
	dataset_root: 'data/processed/full_merged_dataset'
	# Left as null; presumably this means no per-language filtering - TODO confirm.
	language_filter: null

	# Decoding - Seq2Seq Autoregressive Generation
	# Using autoregressive generation with teacher forcing during training

	# Hybrid CTC/Attention Training (FIXES: Forces encoder to learn alignment)
	use_ctc_loss: true # Enable CTC loss to help encoder learn audio-text alignment
	# Weight for CTC loss (0.2 = 20% CTC, 80% Attention) - reduced to save memory
	ctc_weight: 0.2

	# Scheduled Sampling (FIXES: Reduces teacher forcing, forces model to use encoder)
	use_scheduled_sampling: true # Enable scheduled sampling to reduce teacher forcing
	teacher_forcing_initial: 1.0 # Start with 100% teacher forcing
	teacher_forcing_final: 0.5 # End with 50% teacher forcing (gradual decay)

	# Checkpointing
	checkpoint_dir: 'checkpoints'
	# A checkpoint is saved after every epoch.
	save_every: 1

	# Logging
	log_file: 'logs/training.log'

	# Training run
	run_name: 'vietnamese_asr_transformer_bilingual_30m'

	# Auto-Rollback
	# The settings are nested under auto_rollback so they form a single mapping.
	# At the parent's indentation, auto_rollback would parse as null and
	# `enabled` would become an ambiguous top-level key.
	# NOTE(review): the exact meaning of threshold_ratio and patience is defined
	# by the training code, not shown here - confirm before tuning.
	auto_rollback:
	  enabled: true
	  threshold_ratio: 1.3
	  patience: 1

	# Curriculum Learning
	# The settings are nested under curriculum_learning so they form a single
	# mapping. At the parent's indentation, curriculum_learning would parse as
	# null and `enabled` would become an ambiguous top-level key.
	# NOTE(review): how required_wer, initial_ts_weight and the short-sentence
	# limits interact is defined by the training code, not shown here - confirm
	# before tuning.
	curriculum_learning:
	  enabled: true
	  required_wer: 0.70
	  initial_ts_weight: 0.01
	  short_sentence_epochs: 3
	  max_duration_seconds: 4.0

	# Validation decoding controls
	# Limit decode length to speed up validation (prevents infinite loops)
	val_max_len: 128
	# Validate on a subset of validation batches (set null to disable)
	val_subset_pct: null
	# Hard-cap number of validation batches (set null to validate on full val set)
	val_max_batches: null
	# Use autoregressive generation for validation (slower but more accurate)
	# If false, uses greedy decoding from logits (faster, ~2x speedup, avoids second forward pass)
	use_autoregressive_validation: false
	# Calculate WER/CER during validation (set to false to skip prediction and speed up validation)
	# Currently disabled so validation runs faster (loss only).
	# NOTE(review): the curriculum learning section sets required_wer: 0.70; if that
	# gate reads validation WER, it may never be met while this is false - confirm.
	calculate_val_wer: false