---
# NOTE(review): the lines below were repo-browser residue pasted above the YAML
# (filename / author / commit message / commit SHA). Kept as comments so the
# provenance is preserved while the file parses as valid YAML.
#   PL-BERT-wp-eu / config.yml
#   Ander Arriandiaga
#   Init repo: configure LFS and ignore phonemizer/
#   2395c1f

# Training configuration for Phoneme Tokenizer - based on WB run ofnglulb
model_type: "albert"
log_dir: "Checkpoint_Phoneme_Albert_correct_0002"
mixed_precision: "fp16"
data_folder: "wiki_phoneme/eu/dataset_v2_fixed_clean"
batch_size: 32

# Align save/log intervals with production
save_interval: 10000
log_interval: 1000
num_process: 1

# Full training steps from production
num_steps: 4000000

# Learning rate and scheduler to match production onecycle
learning_rate: 0.0002
alignment_approach: "phoneme"

# Scheduler configuration
scheduler_type: "onecycle"
warmup_ratio: 0.1
anneal_strategy: "cos"
div_factor: 25
final_div_factor: 10000

# Wandb configuration
# NOTE(review): children below had lost their indentation in the original and
# would have parsed as top-level keys with `wandb: null` — re-nested here.
wandb:
  project: "basque-pl-bert"
  experiment_name: "Phoneme_Albert_correct_phoneme_0002"
  entity: null
  tags: ["basque", "phoneme", "albert", "correct"]

# Dataset parameters
dataset_params:
  tokenizer_type: "phoneme"
  phoneme_tokenizer_path: "tokenizer/token_maps_eu.pkl"
  tokenizer: "ixa-ehu/berteus-base-cased"
  token_maps: "token_maps.pkl"
  token_separator: " "
  token_mask: "M"
  word_separator: 2
  max_mel_length: 512
  word_mask_prob: 0.15
  phoneme_mask_prob: 0.1
  replace_prob: 0.2

# Model parameters (ALBERT configuration)
model_params:
  vocab_size: 178
  hidden_size: 768
  num_attention_heads: 12
  intermediate_size: 2048
  max_position_embeddings: 512
  num_hidden_layers: 12
  dropout: 0.1
  embedding_size: 128
  num_hidden_groups: 1
  num_hidden_layers_per_group: 12
  inner_group_num: 1
  down_scale_factor: 1