fadi77 committed on
Commit
7697c90
·
verified ·
1 Parent(s): 9b3bcdb

Upload models/mlm_only_with_diacritics/config.yml with huggingface_hub

Browse files
models/mlm_only_with_diacritics/config.yml ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
---
# Config for PL-BERT-style phoneme MLM training (with diacritics).

# Dataset download / phonemization stage.
preprocess_params:
  tokenizer: "aubmindlab/bert-base-arabertv2"
  num_shards: 10000
  max_workers: 25
  max_try_count: 3
  timeout: 300
  phonemizer_language: "ar"
  preprocess_dir: "data/pl_bert"
  hf_dataset_name: "wikimedia/wikipedia"
  hf_dataset_split: "20231101.ar"
  cleaned_output_dir: "wikipedia_20231101.ar.cleaned"

# Training loop settings.
training_params:
  output_dir: "/pl_bert/checkpoints"
  mixed_precision: "fp16"
  batch_size: 96
  save_interval: 1000
  log_interval: 10
  num_process: 1  # number of GPUs
  num_steps: 1000000
  # Written with a dot so YAML 1.1 loaders (e.g. PyYAML) resolve a float;
  # bare "7e-5" loads as the string "7e-5" under the 1.1 resolver.
  learning_rate: 7.0e-5
  training_dataset: "fadi77/wikipedia_20231101.ar.phonemized"
  split: "diacritized"

# Masked-LM sampling settings.
dataset_params:
  word_separator: 87  # token idx used for word separation (W)
  max_seq_length: 512  # max phoneme sequence length
  word_pred_prob: 0.15  # probability to select word for prediction
  phoneme_mask_prob: 0.8  # probability to mask phonemes
  replace_prob: 0.1  # probability to replace phonemes

# Model architecture / warm-start checkpoint.
model_params:
  pretrained_model: "/pl_bert/checkpoints/modal_phoneme_only_non_diacritics/step_116000.pth"
  hidden_size: 768
  num_attention_heads: 12
  intermediate_size: 2048
  max_position_embeddings: 512
  num_hidden_layers: 12
  dropout: 0.1