[ { "dataset": "DODa", "tokenizer": "DarijaBERT-ar", "vocab_size": 80000, "source": "external", "fertility_ar": 0.0, "fertility_az": 2.648, "fertility_overall": 2.648, "cpt_ar": 0.0, "cpt_az": 2.194, "cpt_overall": 2.194, "gain_pct": 0.0, "n_texts": 10000 }, { "dataset": "DODa", "tokenizer": "DarijaBERT-az", "vocab_size": 110000, "source": "external", "fertility_ar": 0.0, "fertility_az": 1.246, "fertility_overall": 1.246, "cpt_ar": 0.0, "cpt_az": 4.677, "cpt_overall": 4.677, "gain_pct": 0.0, "n_texts": 10000 }, { "dataset": "DODa", "tokenizer": "DarijaBERT-mix", "vocab_size": 160000, "source": "external", "fertility_ar": 0.0, "fertility_az": 1.318, "fertility_overall": 1.318, "cpt_ar": 0.0, "cpt_az": 4.441, "cpt_overall": 4.441, "gain_pct": 0.0, "n_texts": 10000 }, { "dataset": "DODa", "tokenizer": "Ours (80K WP)", "vocab_size": 80000, "source": "ours", "fertility_ar": 0.0, "fertility_az": 1.72, "fertility_overall": 1.72, "cpt_ar": 0.0, "cpt_az": 3.467, "cpt_overall": 3.467, "gain_pct": 35.0, "n_texts": 10000 }, { "dataset": "DODa", "tokenizer": "Ours (110K WP)", "vocab_size": 110000, "source": "ours", "fertility_ar": 0.0, "fertility_az": 1.68, "fertility_overall": 1.68, "cpt_ar": 0.0, "cpt_az": 3.551, "cpt_overall": 3.551, "gain_pct": -34.8, "n_texts": 10000 }, { "dataset": "DODa", "tokenizer": "Ours (32K BPE)", "vocab_size": 32000, "source": "ours", "fertility_ar": 0.0, "fertility_az": 1.901, "fertility_overall": 1.901, "cpt_ar": 0.0, "cpt_az": 3.145, "cpt_overall": 3.145, "gain_pct": -44.2, "n_texts": 10000 }, { "dataset": "Darija-Wiki", "tokenizer": "DarijaBERT-ar", "vocab_size": 80000, "source": "external", "fertility_ar": 1.56, "fertility_az": 2.502, "fertility_overall": 1.562, "cpt_ar": 3.382, "cpt_az": 2.589, "cpt_overall": 3.381, "gain_pct": 0.0, "n_texts": 10000 }, { "dataset": "Darija-Wiki", "tokenizer": "DarijaBERT-az", "vocab_size": 110000, "source": "external", "fertility_ar": 1.858, "fertility_az": 2.219, "fertility_overall": 1.858, "cpt_ar": 2.829, "cpt_az": 2.946, "cpt_overall": 2.829, "gain_pct": 0.0, "n_texts": 10000 }, { "dataset": "Darija-Wiki", "tokenizer": "DarijaBERT-mix", "vocab_size": 160000, "source": "external", "fertility_ar": 1.502, "fertility_az": 2.142, "fertility_overall": 1.503, "cpt_ar": 3.512, "cpt_az": 3.04, "cpt_overall": 3.511, "gain_pct": 0.0, "n_texts": 10000 }, { "dataset": "Darija-Wiki", "tokenizer": "Ours (80K WP)", "vocab_size": 80000, "source": "ours", "fertility_ar": 1.839, "fertility_az": 3.265, "fertility_overall": 1.841, "cpt_ar": 2.883, "cpt_az": 2.07, "cpt_overall": 2.882, "gain_pct": -17.9, "n_texts": 10000 }, { "dataset": "Darija-Wiki", "tokenizer": "Ours (110K WP)", "vocab_size": 110000, "source": "ours", "fertility_ar": 1.797, "fertility_az": 3.184, "fertility_overall": 1.799, "cpt_ar": 2.951, "cpt_az": 2.124, "cpt_overall": 2.95, "gain_pct": 3.2, "n_texts": 10000 }, { "dataset": "Darija-Wiki", "tokenizer": "Ours (32K BPE)", "vocab_size": 32000, "source": "ours", "fertility_ar": 2.102, "fertility_az": 3.805, "fertility_overall": 2.104, "cpt_ar": 2.519, "cpt_az": 1.743, "cpt_overall": 2.518, "gain_pct": -40.0, "n_texts": 10000 }, { "dataset": "Atlaset", "tokenizer": "DarijaBERT-ar", "vocab_size": 80000, "source": "external", "fertility_ar": 1.407, "fertility_az": 2.056, "fertility_overall": 1.41, "cpt_ar": 4.051, "cpt_az": 1.803, "cpt_overall": 4.041, "gain_pct": 0.0, "n_texts": 10000 }, { "dataset": "Atlaset", "tokenizer": "DarijaBERT-az", "vocab_size": 110000, "source": "external", "fertility_ar": 1.772, "fertility_az": 1.918, "fertility_overall": 1.773, "cpt_ar": 3.22, "cpt_az": 1.965, "cpt_overall": 3.215, "gain_pct": 0.0, "n_texts": 10000 }, { "dataset": "Atlaset", "tokenizer": "DarijaBERT-mix", "vocab_size": 160000, "source": "external", "fertility_ar": 1.379, "fertility_az": 1.922, "fertility_overall": 1.381, "cpt_ar": 4.131, "cpt_az": 1.963, "cpt_overall": 4.122, "gain_pct": 0.0, "n_texts": 10000 }, { "dataset": "Atlaset", "tokenizer": "Ours (80K WP)", "vocab_size": 80000, "source": "ours", "fertility_ar": 1.618, "fertility_az": 2.293, "fertility_overall": 1.621, "cpt_ar": 3.594, "cpt_az": 1.771, "cpt_overall": 3.586, "gain_pct": -15.0, "n_texts": 10000 }, { "dataset": "Atlaset", "tokenizer": "Ours (110K WP)", "vocab_size": 110000, "source": "ours", "fertility_ar": 1.566, "fertility_az": 2.183, "fertility_overall": 1.569, "cpt_ar": 3.714, "cpt_az": 1.846, "cpt_overall": 3.706, "gain_pct": 11.5, "n_texts": 10000 }, { "dataset": "Atlaset", "tokenizer": "Ours (32K BPE)", "vocab_size": 32000, "source": "ours", "fertility_ar": 1.864, "fertility_az": 2.571, "fertility_overall": 1.867, "cpt_ar": 3.119, "cpt_az": 1.492, "cpt_overall": 3.112, "gain_pct": -35.2, "n_texts": 10000 } ]