---
# Training configuration for Phoneme Tokenizer - based on WB run ofnglulb
# NOTE(review): this file arrived wrapped in markdown-table pipes (`| … | |`),
# which is not valid YAML; the pipes are removed and the nesting under
# `wandb:` / `dataset_params:` / `model_params:` is reconstructed from the
# section comments — verify against the original config before use.
model_type: "albert"
log_dir: "Checkpoint_Phoneme_Albert_correct_0002"
mixed_precision: "fp16"
data_folder: "wiki_phoneme/eu/dataset_v2_fixed_clean"
batch_size: 32

# Align save/log intervals with production
save_interval: 10000
log_interval: 1000
num_process: 1

# Full training steps from production
num_steps: 4000000

# Learning rate and scheduler to match production onecycle
learning_rate: 0.0002
alignment_approach: "phoneme"

# Scheduler configuration
scheduler_type: "onecycle"
warmup_ratio: 0.1
anneal_strategy: "cos"
div_factor: 25
final_div_factor: 10000

# Wandb configuration
wandb:
  project: "basque-pl-bert"
  experiment_name: "Phoneme_Albert_correct_phoneme_0002"
  entity: null
  tags: ["basque", "phoneme", "albert", "correct"]

# Dataset parameters
dataset_params:
  tokenizer_type: "phoneme"
  phoneme_tokenizer_path: "tokenizer/token_maps_eu.pkl"
  tokenizer: "ixa-ehu/berteus-base-cased"
  token_maps: "token_maps.pkl"
  token_separator: " "
  token_mask: "M"
  word_separator: 2
  max_mel_length: 512
  word_mask_prob: 0.15
  phoneme_mask_prob: 0.1
  replace_prob: 0.2

# Model parameters (ALBERT configuration)
model_params:
  vocab_size: 178
  hidden_size: 768
  num_attention_heads: 12
  intermediate_size: 2048
  max_position_embeddings: 512
  num_hidden_layers: 12
  dropout: 0.1
  embedding_size: 128
  num_hidden_groups: 1
  num_hidden_layers_per_group: 12
  inner_group_num: 1
  down_scale_factor: 1