fadi77 committed on
Commit
2cf7ec6
·
verified ·
1 Parent(s): 7915e9f

Upload models/mlm_p2g_non_diacritics/config.yml with huggingface_hub

Browse files
models/mlm_p2g_non_diacritics/config.yml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ preprocess_params:
2
+ tokenizer: "aubmindlab/bert-base-arabertv2"
3
+ num_shards: 10000
4
+ max_workers: 25
5
+ max_try_count: 3
6
+ timeout: 300
7
+ phonemizer_language: "ar"
8
+ preprocess_dir: "data/pl_bert"
9
+ hf_dataset_name: "wikimedia/wikipedia"
10
+ hf_dataset_split: "20231101.ar"
11
+ output_dir: "wikipedia_20231101.ar.processed"
12
+
13
+ training_params:
14
+ output_dir: "checkpoints"
15
+ mixed_precision: "fp16"
16
+ batch_size: 16
17
+ save_interval: 5000
18
+ log_interval: 10
19
+ num_process: 1 # number of GPUs
20
+ num_steps: 1000000
21
+ learning_rate: 3e-5
22
+
23
+ dataset_params:
24
+ word_separator: 87 # token idx used for word separation (W)
25
+ max_seq_length: 512 # max phoneme sequence length
26
+ word_pred_prob: 0.15 # probability to select word for prediction
27
+ phoneme_mask_prob: 0.8 # probability to mask phonemes
28
+ replace_prob: 0.1 # probability to replace phonemes
29
+
30
+ model_params:
31
+ hidden_size: 768
32
+ num_attention_heads: 12
33
+ intermediate_size: 2048
34
+ max_position_embeddings: 512
35
+ num_hidden_layers: 12
36
+ dropout: 0.1