fadi77's picture
Upload models/mlm_only_with_diacritics/config.yml with huggingface_hub
7697c90 verified
# Preprocessing section: tokenizer, source dataset, output locations and job
# limits. The child keys are indented under `preprocess_params` so the section
# loads as a mapping instead of a null-valued key followed by stray top-level
# keys.
preprocess_params:
  tokenizer: "aubmindlab/bert-base-arabertv2"
  num_shards: 10000
  max_workers: 25
  max_try_count: 3
  timeout: 300  # unit is not stated in this file; presumably seconds — confirm
  phonemizer_language: "ar"
  preprocess_dir: "data/pl_bert"
  hf_dataset_name: "wikimedia/wikipedia"
  hf_dataset_split: "20231101.ar"  # kept quoted so it stays a string
  cleaned_output_dir: "wikipedia_20231101.ar.cleaned"
# Training section: checkpoint location, precision, batch/step schedule and
# the dataset to train on. The child keys are indented under
# `training_params` so the section loads as a mapping instead of a
# null-valued key.
training_params:
  output_dir: "/pl_bert/checkpoints"
  mixed_precision: "fp16"
  batch_size: 96
  save_interval: 1000
  log_interval: 10
  num_process: 1  # number of GPUs
  num_steps: 1000000
  # Written with a decimal point on purpose: YAML 1.1 loaders such as PyYAML
  # resolve a bare `7e-5` as the string "7e-5", not as a float.
  learning_rate: 7.0e-5
  training_dataset: "fadi77/wikipedia_20231101.ar.phonemized"
  split: "diacritized"
# Dataset section: sequence length and the masking probabilities. The child
# keys are indented under `dataset_params` so the section loads as a mapping
# instead of a null-valued key.
dataset_params:
  word_separator: 87  # token idx used for word separation (W)
  max_seq_length: 512  # max phoneme sequence length
  word_pred_prob: 0.15  # probability to select a word for prediction
  phoneme_mask_prob: 0.8  # probability to mask phonemes
  replace_prob: 0.1  # probability to replace phonemes
# Model section: checkpoint to start from and the encoder dimensions. The
# child keys are indented under `model_params` so the section loads as a
# mapping instead of a null-valued key.
model_params:
  # Absolute checkpoint path; it sits under the same "/pl_bert/checkpoints"
  # directory that training_params.output_dir points to.
  pretrained_model: "/pl_bert/checkpoints/modal_phoneme_only_non_diacritics/step_116000.pth"
  hidden_size: 768
  num_attention_heads: 12
  intermediate_size: 2048
  # Same value (512) as dataset_params.max_seq_length; presumably the two are
  # meant to stay in sync — confirm before changing either.
  max_position_embeddings: 512
  num_hidden_layers: 12
  dropout: 0.1