---
# Training configuration for Phoneme Tokenizer - based on WB run ofnglulb
model_type: "albert"
log_dir: "Checkpoint_Phoneme_Albert_correct_0002"
mixed_precision: "fp16"
data_folder: "wiki_phoneme/eu/dataset_v2_fixed_clean"
batch_size: 32

# Align save/log intervals with production
save_interval: 10000
log_interval: 1000
num_process: 1

# Full training steps from production
num_steps: 4000000

# Learning rate and scheduler to match production onecycle
learning_rate: 0.0002
alignment_approach: "phoneme"

# Scheduler configuration
scheduler_type: "onecycle"
warmup_ratio: 0.1
anneal_strategy: "cos"
div_factor: 25
final_div_factor: 10000

# Wandb configuration
wandb:
  project: "basque-pl-bert"
  experiment_name: "Phoneme_Albert_correct_phoneme_0002"
  entity: null
  tags: ["basque", "phoneme", "albert", "correct"]

# Dataset parameters
dataset_params:
  tokenizer_type: "phoneme"
  phoneme_tokenizer_path: "tokenizer/token_maps_eu.pkl"
  tokenizer: "ixa-ehu/berteus-base-cased"
  token_maps: "token_maps.pkl"
  token_separator: " "
  token_mask: "M"
  # NOTE(review): word_separator appears to be a token id (int), unlike the
  # string separators above — confirm against the dataset loader.
  word_separator: 2
  max_mel_length: 512
  word_mask_prob: 0.15
  phoneme_mask_prob: 0.1
  replace_prob: 0.2

# Model parameters (ALBERT configuration)
model_params:
  vocab_size: 178
  hidden_size: 768
  num_attention_heads: 12
  intermediate_size: 2048
  max_position_embeddings: 512
  num_hidden_layers: 12
  dropout: 0.1
  embedding_size: 128
  num_hidden_groups: 1
  num_hidden_layers_per_group: 12
  inner_group_num: 1
  down_scale_factor: 1