Upload models/mlm_p2g_non_diacritics/config.yml with huggingface_hub
models/mlm_p2g_non_diacritics/config.yml
ADDED
@@ -0,0 +1,36 @@
+preprocess_params:
+  tokenizer: "aubmindlab/bert-base-arabertv2"
+  num_shards: 10000
+  max_workers: 25
+  max_try_count: 3
+  timeout: 300
+  phonemizer_language: "ar"
+  preprocess_dir: "data/pl_bert"
+  hf_dataset_name: "wikimedia/wikipedia"
+  hf_dataset_split: "20231101.ar"
+  output_dir: "wikipedia_20231101.ar.processed"
+
+training_params:
+  output_dir: "checkpoints"
+  mixed_precision: "fp16"
+  batch_size: 16
+  save_interval: 5000
+  log_interval: 10
+  num_process: 1 # number of GPUs
+  num_steps: 1000000
+  learning_rate: 3e-5
+
+dataset_params:
+  word_separator: 87 # token idx used for word separation (W)
+  max_seq_length: 512 # max phoneme sequence length
+  word_pred_prob: 0.15 # probability to select a word for prediction
+  phoneme_mask_prob: 0.8 # probability to mask phonemes
+  replace_prob: 0.1 # probability to replace phonemes
+
+model_params:
+  hidden_size: 768
+  num_attention_heads: 12
+  intermediate_size: 2048
+  max_position_embeddings: 512
+  num_hidden_layers: 12
+  dropout: 0.1
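For context, below is a minimal Python sketch of how a config like this could be consumed: the YAML is loaded with PyYAML, model_params is mapped onto a Hugging Face BertConfig, and dataset_params drives a BERT-style whole-word masking scheme over phoneme id sequences. The training code itself is not part of this commit, so the specifics here are illustrative assumptions rather than this repository's API: the phoneme vocabulary size, the mask token id, the mapping of the single dropout value onto both BERT dropout fields, and the mask_words helper are all hypothetical.

import random

import yaml
from transformers import BertConfig

with open("models/mlm_p2g_non_diacritics/config.yml") as f:
    cfg = yaml.safe_load(f)

mp, dp = cfg["model_params"], cfg["dataset_params"]

# Assumed values -- the phoneme vocabulary size and the mask phoneme id
# are not part of this config file.
PHONEME_VOCAB_SIZE = 200
MASK_ID = 1

# Map model_params onto a standard BertConfig; applying "dropout" to both
# hidden and attention dropout is an assumption, not stated in the file.
bert_config = BertConfig(
    vocab_size=PHONEME_VOCAB_SIZE,
    hidden_size=mp["hidden_size"],
    num_hidden_layers=mp["num_hidden_layers"],
    num_attention_heads=mp["num_attention_heads"],
    intermediate_size=mp["intermediate_size"],
    max_position_embeddings=mp["max_position_embeddings"],
    hidden_dropout_prob=mp["dropout"],
    attention_probs_dropout_prob=mp["dropout"],
)

def mask_words(words):
    """Hypothetical whole-word masking over phoneme sequences.

    `words` is a list of words, each a list of phoneme ids. A word is
    selected for prediction with probability word_pred_prob; its phonemes
    are then masked (phoneme_mask_prob), replaced by random phonemes
    (replace_prob), or kept unchanged (the remainder).
    """
    inputs, labels = [], []
    for word in words:
        if random.random() < dp["word_pred_prob"]:
            labels.extend(word)  # the original phonemes are the MLM targets
            r = random.random()
            if r < dp["phoneme_mask_prob"]:
                inputs.extend([MASK_ID] * len(word))
            elif r < dp["phoneme_mask_prob"] + dp["replace_prob"]:
                inputs.extend(random.randrange(PHONEME_VOCAB_SIZE) for _ in word)
            else:
                inputs.extend(word)  # kept as-is but still predicted
        else:
            inputs.extend(word)
            labels.extend([-100] * len(word))  # -100 is ignored by the MLM loss
        inputs.append(dp["word_separator"])  # word boundary token (id 87)
        labels.append(-100)
    return inputs[: dp["max_seq_length"]], labels[: dp["max_seq_length"]]

Under these assumptions the dataset_params probabilities behave like the standard BERT 80/10/10 split, applied per selected word rather than per token, with the word_separator token (id 87) inserted between words and max_seq_length capping each phoneme sequence at 512.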