fadi77
/

pl-bert

fadi77 commited on Mar 27, 2025

Commit

7697c90

verified ·

1 Parent(s): 9b3bcdb

Upload models/mlm_only_with_diacritics/config.yml with huggingface_hub

Files changed (1) hide show

models/mlm_only_with_diacritics/config.yml ADDED Viewed

+preprocess_params:
+  tokenizer: "aubmindlab/bert-base-arabertv2"
+  num_shards: 10000
+  max_workers: 25
+  max_try_count: 3
+  timeout: 300
+  phonemizer_language: "ar"
+  preprocess_dir: "data/pl_bert"
+  hf_dataset_name: "wikimedia/wikipedia"
+  hf_dataset_split: "20231101.ar"
+  cleaned_output_dir: "wikipedia_20231101.ar.cleaned"
+training_params:
+  output_dir: "/pl_bert/checkpoints"
+  mixed_precision: "fp16"
+  batch_size: 96
+  save_interval: 1000
+  log_interval: 10
+  num_process: 1 # number of GPUs
+  num_steps: 1000000
+  learning_rate: 7e-5
+  training_dataset: "fadi77/wikipedia_20231101.ar.phonemized"
+  split: "diacritized"
+dataset_params:
+  word_separator: 87 # token idx used for word separation (W)
+  max_seq_length: 512 # max phoneme sequence length
+  word_pred_prob: 0.15 # probability to select work for prediction
+  phoneme_mask_prob: 0.8 # probability to mask phonemes
+  replace_prob: 0.1 # probablity to replace phonemes
+model_params:
+  pretrained_model: "/pl_bert/checkpoints/modal_phoneme_only_non_diacritics/step_116000.pth"
+  hidden_size: 768
+  num_attention_heads: 12
+  intermediate_size: 2048
+  max_position_embeddings: 512
+  num_hidden_layers: 12
+  dropout: 0.1