```yaml
preprocess_params:
  tokenizer: "aubmindlab/bert-base-arabertv2"
  num_shards: 10000
  max_workers: 25
  max_try_count: 3
  timeout: 300
  phonemizer_language: "ar"
  preprocess_dir: "data/pl_bert"
  hf_dataset_name: "wikimedia/wikipedia"
  hf_dataset_split: "20231101.ar"
  cleaned_output_dir: "wikipedia_20231101.ar.cleaned"
```
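These parameters drive the phonemization pipeline. A minimal sketch of how they might be consumed, assuming the Hugging Face `datasets`/`transformers` libraries and the `phonemizer` package with an espeak backend; the config-loading code and pipeline wiring here are illustrative, not the repo's actual implementation:

```python
import yaml
from datasets import load_dataset
from transformers import AutoTokenizer
from phonemizer import phonemize

# Load the config shown above (filename is an assumption).
with open("config.yml") as f:
    cfg = yaml.safe_load(f)["preprocess_params"]

# "20231101.ar" is the dataset config name on the Hub;
# the split itself is "train" for wikimedia/wikipedia dumps.
dataset = load_dataset(cfg["hf_dataset_name"], cfg["hf_dataset_split"], split="train")
tokenizer = AutoTokenizer.from_pretrained(cfg["tokenizer"])

# Phonemize one article as an example (espeak backend assumed).
text = dataset[0]["text"]
phonemes = phonemize(text, language=cfg["phonemizer_language"], backend="espeak")
```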
```yaml
training_params:
  output_dir: "/pl_bert/checkpoints"
  mixed_precision: "fp16"
  batch_size: 96
  save_interval: 1000
  log_interval: 10
  num_process: 1 # number of GPUs
  num_steps: 1000000
  learning_rate: 7e-5
  training_dataset: "fadi77/wikipedia_20231101.ar.phonemized"
  split: "diacritized"
```
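The `mixed_precision` and `num_process` fields suggest the training loop is built on Hugging Face Accelerate. A hedged sketch of the setup side under that assumption (the placeholder model and variable names are illustrative):

```python
import yaml
import torch
from accelerate import Accelerator
from datasets import load_dataset

with open("config.yml") as f:
    cfg = yaml.safe_load(f)["training_params"]

# Accelerate handles fp16 autocast and gradient scaling; num_process would
# correspond to `accelerate launch --num_processes ...` on the CLI.
accelerator = Accelerator(mixed_precision=cfg["mixed_precision"])

# Phonemized corpus from the Hub; "diacritized" names the split per the config.
train_data = load_dataset(cfg["training_dataset"], split=cfg["split"])

model = torch.nn.Linear(8, 8)  # placeholder standing in for the PL-BERT encoder
# YAML may parse 7e-5 as a string, hence the explicit float().
optimizer = torch.optim.AdamW(model.parameters(), lr=float(cfg["learning_rate"]))
model, optimizer = accelerator.prepare(model, optimizer)
```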
```yaml
dataset_params:
  word_separator: 87 # token index used for word separation (W)
  max_seq_length: 512 # max phoneme sequence length
  word_pred_prob: 0.15 # probability of selecting a word for prediction
  phoneme_mask_prob: 0.8 # probability of masking a selected word's phonemes
  replace_prob: 0.1 # probability of replacing a selected word's phonemes
```
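These probabilities mirror BERT's masked-LM recipe applied at the word level: each word is selected for prediction with probability 0.15; a selected word's phonemes are then masked with probability 0.8, replaced with random phonemes with probability 0.1, and otherwise left intact. A minimal sketch of that scheme, assuming one mask/replace/keep decision per word; the mask token id and vocabulary size are placeholders:

```python
import random

MASK_ID = 3        # placeholder mask-token index
VOCAB_SIZE = 178   # placeholder phoneme vocabulary size
WORD_SEP = 87      # word_separator from the config

def mask_words(token_ids, word_pred_prob=0.15,
               phoneme_mask_prob=0.8, replace_prob=0.1):
    """BERT-style masking per word; returns (inputs, labels)."""
    inputs = list(token_ids)
    labels = [-100] * len(token_ids)  # -100 = ignored by the loss

    # Group token positions into words on the separator token.
    words, word = [], []
    for i, t in enumerate(token_ids):
        if t == WORD_SEP:
            if word:
                words.append(word)
            word = []
        else:
            word.append(i)
    if word:
        words.append(word)

    for word in words:
        if random.random() >= word_pred_prob:
            continue  # word not selected for prediction
        r = random.random()
        for i in word:
            labels[i] = token_ids[i]  # predict the original phoneme
            if r < phoneme_mask_prob:
                inputs[i] = MASK_ID   # 80%: mask the word's phonemes
            elif r < phoneme_mask_prob + replace_prob:
                inputs[i] = random.randrange(VOCAB_SIZE)  # 10%: random phoneme
            # remaining 10%: keep the phonemes unchanged
    return inputs, labels
```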
```yaml
model_params:
  pretrained_model: "/pl_bert/checkpoints/modal_phoneme_only_non_diacritics/step_116000.pth"
  hidden_size: 768
  num_attention_heads: 12
  intermediate_size: 2048
  max_position_embeddings: 512
  num_hidden_layers: 12
  dropout: 0.1
```
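These hyperparameters map onto a standard Transformers encoder config. PL-BERT implementations commonly build on ALBERT, so an `AlbertConfig` is used in this hedged sketch; the checkpoint's `"model"` state-dict key is an assumption about how the `.pth` file is structured:

```python
import torch
from transformers import AlbertConfig, AlbertModel

# Build the encoder from model_params above.
config = AlbertConfig(
    hidden_size=768,
    num_attention_heads=12,
    intermediate_size=2048,
    max_position_embeddings=512,
    num_hidden_layers=12,
    hidden_dropout_prob=0.1,
)
model = AlbertModel(config)

# Resume from the pretrained checkpoint; the "model" key and strict=False
# are assumptions about the checkpoint layout.
ckpt = torch.load(
    "/pl_bert/checkpoints/modal_phoneme_only_non_diacritics/step_116000.pth",
    map_location="cpu",
)
model.load_state_dict(ckpt["model"], strict=False)
```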