---
# Training configuration for Phoneme Tokenizer - based on WB run ofnglulb
# NOTE(review): this file arrived wrapped in markdown-table pipes (`| … | |`),
# which is not valid YAML; the pipes are removed and the nesting under
# `wandb:` / `dataset_params:` / `model_params:` is reconstructed from the
# section comments — verify against the original config before use.
model_type: "albert"
log_dir: "Checkpoint_Phoneme_Albert_correct_0002"
mixed_precision: "fp16"
data_folder: "wiki_phoneme/eu/dataset_v2_fixed_clean"
batch_size: 32

# Align save/log intervals with production
save_interval: 10000
log_interval: 1000
num_process: 1

# Full training steps from production
num_steps: 4000000

# Learning rate and scheduler to match production onecycle
learning_rate: 0.0002
alignment_approach: "phoneme"

# Scheduler configuration
scheduler_type: "onecycle"
warmup_ratio: 0.1
anneal_strategy: "cos"
div_factor: 25
final_div_factor: 10000

# Wandb configuration
wandb:
  project: "basque-pl-bert"
  experiment_name: "Phoneme_Albert_correct_phoneme_0002"
  entity: null
  tags: ["basque", "phoneme", "albert", "correct"]

# Dataset parameters
dataset_params:
  tokenizer_type: "phoneme"
  phoneme_tokenizer_path: "tokenizer/token_maps_eu.pkl"
  tokenizer: "ixa-ehu/berteus-base-cased"
  token_maps: "token_maps.pkl"
  token_separator: " "
  token_mask: "M"
  word_separator: 2
  max_mel_length: 512
  word_mask_prob: 0.15
  phoneme_mask_prob: 0.1
  replace_prob: 0.2

# Model parameters (ALBERT configuration)
model_params:
  vocab_size: 178
  hidden_size: 768
  num_attention_heads: 12
  intermediate_size: 2048
  max_position_embeddings: 512
  num_hidden_layers: 12
  dropout: 0.1
  embedding_size: 128
  num_hidden_groups: 1
  num_hidden_layers_per_group: 12
  inner_group_num: 1
  down_scale_factor: 1