| [ |
| { |
| "dataset": "DODa", |
| "tokenizer": "DarijaBERT-ar", |
| "vocab_size": 80000, |
| "source": "external", |
| "fertility_ar": 0.0, |
| "fertility_az": 2.648, |
| "fertility_overall": 2.648, |
| "cpt_ar": 0.0, |
| "cpt_az": 2.194, |
| "cpt_overall": 2.194, |
| "gain_pct": 0.0, |
| "n_texts": 10000 |
| }, |
| { |
| "dataset": "DODa", |
| "tokenizer": "DarijaBERT-az", |
| "vocab_size": 110000, |
| "source": "external", |
| "fertility_ar": 0.0, |
| "fertility_az": 1.246, |
| "fertility_overall": 1.246, |
| "cpt_ar": 0.0, |
| "cpt_az": 4.677, |
| "cpt_overall": 4.677, |
| "gain_pct": 0.0, |
| "n_texts": 10000 |
| }, |
| { |
| "dataset": "DODa", |
| "tokenizer": "DarijaBERT-mix", |
| "vocab_size": 160000, |
| "source": "external", |
| "fertility_ar": 0.0, |
| "fertility_az": 1.318, |
| "fertility_overall": 1.318, |
| "cpt_ar": 0.0, |
| "cpt_az": 4.441, |
| "cpt_overall": 4.441, |
| "gain_pct": 0.0, |
| "n_texts": 10000 |
| }, |
| { |
| "dataset": "DODa", |
| "tokenizer": "Ours (80K WP)", |
| "vocab_size": 80000, |
| "source": "ours", |
| "fertility_ar": 0.0, |
| "fertility_az": 1.72, |
| "fertility_overall": 1.72, |
| "cpt_ar": 0.0, |
| "cpt_az": 3.467, |
| "cpt_overall": 3.467, |
| "gain_pct": 35.0, |
| "n_texts": 10000 |
| }, |
| { |
| "dataset": "DODa", |
| "tokenizer": "Ours (110K WP)", |
| "vocab_size": 110000, |
| "source": "ours", |
| "fertility_ar": 0.0, |
| "fertility_az": 1.68, |
| "fertility_overall": 1.68, |
| "cpt_ar": 0.0, |
| "cpt_az": 3.551, |
| "cpt_overall": 3.551, |
| "gain_pct": -34.8, |
| "n_texts": 10000 |
| }, |
| { |
| "dataset": "DODa", |
| "tokenizer": "Ours (32K BPE)", |
| "vocab_size": 32000, |
| "source": "ours", |
| "fertility_ar": 0.0, |
| "fertility_az": 1.901, |
| "fertility_overall": 1.901, |
| "cpt_ar": 0.0, |
| "cpt_az": 3.145, |
| "cpt_overall": 3.145, |
| "gain_pct": -44.2, |
| "n_texts": 10000 |
| }, |
| { |
| "dataset": "Darija-Wiki", |
| "tokenizer": "DarijaBERT-ar", |
| "vocab_size": 80000, |
| "source": "external", |
| "fertility_ar": 1.56, |
| "fertility_az": 2.502, |
| "fertility_overall": 1.562, |
| "cpt_ar": 3.382, |
| "cpt_az": 2.589, |
| "cpt_overall": 3.381, |
| "gain_pct": 0.0, |
| "n_texts": 10000 |
| }, |
| { |
| "dataset": "Darija-Wiki", |
| "tokenizer": "DarijaBERT-az", |
| "vocab_size": 110000, |
| "source": "external", |
| "fertility_ar": 1.858, |
| "fertility_az": 2.219, |
| "fertility_overall": 1.858, |
| "cpt_ar": 2.829, |
| "cpt_az": 2.946, |
| "cpt_overall": 2.829, |
| "gain_pct": 0.0, |
| "n_texts": 10000 |
| }, |
| { |
| "dataset": "Darija-Wiki", |
| "tokenizer": "DarijaBERT-mix", |
| "vocab_size": 160000, |
| "source": "external", |
| "fertility_ar": 1.502, |
| "fertility_az": 2.142, |
| "fertility_overall": 1.503, |
| "cpt_ar": 3.512, |
| "cpt_az": 3.04, |
| "cpt_overall": 3.511, |
| "gain_pct": 0.0, |
| "n_texts": 10000 |
| }, |
| { |
| "dataset": "Darija-Wiki", |
| "tokenizer": "Ours (80K WP)", |
| "vocab_size": 80000, |
| "source": "ours", |
| "fertility_ar": 1.839, |
| "fertility_az": 3.265, |
| "fertility_overall": 1.841, |
| "cpt_ar": 2.883, |
| "cpt_az": 2.07, |
| "cpt_overall": 2.882, |
| "gain_pct": -17.9, |
| "n_texts": 10000 |
| }, |
| { |
| "dataset": "Darija-Wiki", |
| "tokenizer": "Ours (110K WP)", |
| "vocab_size": 110000, |
| "source": "ours", |
| "fertility_ar": 1.797, |
| "fertility_az": 3.184, |
| "fertility_overall": 1.799, |
| "cpt_ar": 2.951, |
| "cpt_az": 2.124, |
| "cpt_overall": 2.95, |
| "gain_pct": 3.2, |
| "n_texts": 10000 |
| }, |
| { |
| "dataset": "Darija-Wiki", |
| "tokenizer": "Ours (32K BPE)", |
| "vocab_size": 32000, |
| "source": "ours", |
| "fertility_ar": 2.102, |
| "fertility_az": 3.805, |
| "fertility_overall": 2.104, |
| "cpt_ar": 2.519, |
| "cpt_az": 1.743, |
| "cpt_overall": 2.518, |
| "gain_pct": -40.0, |
| "n_texts": 10000 |
| }, |
| { |
| "dataset": "Atlaset", |
| "tokenizer": "DarijaBERT-ar", |
| "vocab_size": 80000, |
| "source": "external", |
| "fertility_ar": 1.407, |
| "fertility_az": 2.056, |
| "fertility_overall": 1.41, |
| "cpt_ar": 4.051, |
| "cpt_az": 1.803, |
| "cpt_overall": 4.041, |
| "gain_pct": 0.0, |
| "n_texts": 10000 |
| }, |
| { |
| "dataset": "Atlaset", |
| "tokenizer": "DarijaBERT-az", |
| "vocab_size": 110000, |
| "source": "external", |
| "fertility_ar": 1.772, |
| "fertility_az": 1.918, |
| "fertility_overall": 1.773, |
| "cpt_ar": 3.22, |
| "cpt_az": 1.965, |
| "cpt_overall": 3.215, |
| "gain_pct": 0.0, |
| "n_texts": 10000 |
| }, |
| { |
| "dataset": "Atlaset", |
| "tokenizer": "DarijaBERT-mix", |
| "vocab_size": 160000, |
| "source": "external", |
| "fertility_ar": 1.379, |
| "fertility_az": 1.922, |
| "fertility_overall": 1.381, |
| "cpt_ar": 4.131, |
| "cpt_az": 1.963, |
| "cpt_overall": 4.122, |
| "gain_pct": 0.0, |
| "n_texts": 10000 |
| }, |
| { |
| "dataset": "Atlaset", |
| "tokenizer": "Ours (80K WP)", |
| "vocab_size": 80000, |
| "source": "ours", |
| "fertility_ar": 1.618, |
| "fertility_az": 2.293, |
| "fertility_overall": 1.621, |
| "cpt_ar": 3.594, |
| "cpt_az": 1.771, |
| "cpt_overall": 3.586, |
| "gain_pct": -15.0, |
| "n_texts": 10000 |
| }, |
| { |
| "dataset": "Atlaset", |
| "tokenizer": "Ours (110K WP)", |
| "vocab_size": 110000, |
| "source": "ours", |
| "fertility_ar": 1.566, |
| "fertility_az": 2.183, |
| "fertility_overall": 1.569, |
| "cpt_ar": 3.714, |
| "cpt_az": 1.846, |
| "cpt_overall": 3.706, |
| "gain_pct": 11.5, |
| "n_texts": 10000 |
| }, |
| { |
| "dataset": "Atlaset", |
| "tokenizer": "Ours (32K BPE)", |
| "vocab_size": 32000, |
| "source": "ours", |
| "fertility_ar": 1.864, |
| "fertility_az": 2.571, |
| "fertility_overall": 1.867, |
| "cpt_ar": 3.119, |
| "cpt_az": 1.492, |
| "cpt_overall": 3.112, |
| "gain_pct": -35.2, |
| "n_texts": 10000 |
| } |
| ] |