File size: 1,501 Bytes
2395c1f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
---
# Training configuration for Phoneme Tokenizer - based on WB run ofnglulb
model_type: "albert"

log_dir: "Checkpoint_Phoneme_Albert_correct_0002"
mixed_precision: "fp16"
data_folder: "wiki_phoneme/eu/dataset_v2_fixed_clean"
batch_size: 32
# Align save/log intervals with production
save_interval: 10000
log_interval: 1000
num_process: 1
# Full training steps from production
num_steps: 4000000
# Learning rate and scheduler to match production onecycle
learning_rate: 0.0002
alignment_approach: "phoneme"

# Scheduler configuration
scheduler_type: "onecycle"
warmup_ratio: 0.1
anneal_strategy: "cos"
div_factor: 25
final_div_factor: 10000

# Wandb configuration
wandb:
  project: "basque-pl-bert"
  experiment_name: "Phoneme_Albert_correct_phoneme_0002"
  entity: null
  tags: ["basque", "phoneme", "albert", "correct"]

# Dataset parameters
dataset_params:
  tokenizer_type: "phoneme"
  # NOTE(review): two separate token-map files are configured below
  # (phoneme_tokenizer_path and token_maps) — confirm with the loader
  # which one is actually read for this tokenizer_type.
  phoneme_tokenizer_path: "tokenizer/token_maps_eu.pkl"
  tokenizer: "ixa-ehu/berteus-base-cased"
  token_maps: "token_maps.pkl"
  token_separator: " "
  token_mask: "M"
  # presumably a token/phoneme ID used as the word separator — verify against the dataset code
  word_separator: 2
  max_mel_length: 512
  word_mask_prob: 0.15
  phoneme_mask_prob: 0.1
  replace_prob: 0.2

# Model parameters (ALBERT configuration)
model_params:
  vocab_size: 178
  hidden_size: 768
  num_attention_heads: 12
  intermediate_size: 2048
  max_position_embeddings: 512
  num_hidden_layers: 12
  dropout: 0.1
  # ALBERT factorized embedding: embedding_size < hidden_size
  embedding_size: 128
  num_hidden_groups: 1
  num_hidden_layers_per_group: 12
  inner_group_num: 1
  down_scale_factor: 1