fadi77 committed on
Commit
98daf19
·
verified ·
1 Parent(s): 0b54a18

Upload models/mlm_only_non_diacritics/config.yml with huggingface_hub

Browse files
models/mlm_only_non_diacritics/config.yml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ preprocess_params:
2
+ tokenizer: "aubmindlab/bert-base-arabertv2"
3
+ num_shards: 10000
4
+ max_workers: 25
5
+ max_try_count: 3
6
+ timeout: 300
7
+ phonemizer_language: "ar"
8
+ preprocess_dir: "data/pl_bert"
9
+ hf_dataset_name: "wikimedia/wikipedia"
10
+ hf_dataset_split: "20231101.ar"
11
+ cleaned_output_dir: "wikipedia_20231101.ar.cleaned"
12
+
13
+ training_params:
14
+ output_dir: "/pl_bert/checkpoints"
15
+ mixed_precision: "fp16"
16
+ batch_size: 96
17
+ save_interval: 1000
18
+ log_interval: 10
19
+ num_process: 1 # number of GPUs
20
+ num_steps: 1000000
21
+ learning_rate: 7e-5
22
+ training_dataset: "fadi77/wikipedia_20231101.ar.phonemized"
23
+
24
+ dataset_params:
25
+ word_separator: 87 # token idx used for word separation (W)
26
+ max_seq_length: 512 # max phoneme sequence length
27
+ word_pred_prob: 0.15 # probability to select a word for prediction
28
+ phoneme_mask_prob: 0.8 # probability to mask phonemes
29
+ replace_prob: 0.1 # probability to replace phonemes
30
+
31
+ model_params:
32
+ hidden_size: 768
33
+ num_attention_heads: 12
34
+ intermediate_size: 2048
35
+ max_position_embeddings: 512
36
+ num_hidden_layers: 12
37
+ dropout: 0.1