daa-tokenizers / results /external_datasets_eval.json
Ouaill's picture
Upload results/external_datasets_eval.json with huggingface_hub
765c3cc verified
Raw
History Blame Contribute Delete
5.66 kB
[
{
"dataset": "DODa",
"tokenizer": "DarijaBERT-ar",
"vocab_size": 80000,
"source": "external",
"fertility_ar": 0.0,
"fertility_az": 2.648,
"fertility_overall": 2.648,
"cpt_ar": 0.0,
"cpt_az": 2.194,
"cpt_overall": 2.194,
"gain_pct": 0.0,
"n_texts": 10000
},
{
"dataset": "DODa",
"tokenizer": "DarijaBERT-az",
"vocab_size": 110000,
"source": "external",
"fertility_ar": 0.0,
"fertility_az": 1.246,
"fertility_overall": 1.246,
"cpt_ar": 0.0,
"cpt_az": 4.677,
"cpt_overall": 4.677,
"gain_pct": 0.0,
"n_texts": 10000
},
{
"dataset": "DODa",
"tokenizer": "DarijaBERT-mix",
"vocab_size": 160000,
"source": "external",
"fertility_ar": 0.0,
"fertility_az": 1.318,
"fertility_overall": 1.318,
"cpt_ar": 0.0,
"cpt_az": 4.441,
"cpt_overall": 4.441,
"gain_pct": 0.0,
"n_texts": 10000
},
{
"dataset": "DODa",
"tokenizer": "Ours (80K WP)",
"vocab_size": 80000,
"source": "ours",
"fertility_ar": 0.0,
"fertility_az": 1.72,
"fertility_overall": 1.72,
"cpt_ar": 0.0,
"cpt_az": 3.467,
"cpt_overall": 3.467,
"gain_pct": 35.0,
"n_texts": 10000
},
{
"dataset": "DODa",
"tokenizer": "Ours (110K WP)",
"vocab_size": 110000,
"source": "ours",
"fertility_ar": 0.0,
"fertility_az": 1.68,
"fertility_overall": 1.68,
"cpt_ar": 0.0,
"cpt_az": 3.551,
"cpt_overall": 3.551,
"gain_pct": -34.8,
"n_texts": 10000
},
{
"dataset": "DODa",
"tokenizer": "Ours (32K BPE)",
"vocab_size": 32000,
"source": "ours",
"fertility_ar": 0.0,
"fertility_az": 1.901,
"fertility_overall": 1.901,
"cpt_ar": 0.0,
"cpt_az": 3.145,
"cpt_overall": 3.145,
"gain_pct": -44.2,
"n_texts": 10000
},
{
"dataset": "Darija-Wiki",
"tokenizer": "DarijaBERT-ar",
"vocab_size": 80000,
"source": "external",
"fertility_ar": 1.56,
"fertility_az": 2.502,
"fertility_overall": 1.562,
"cpt_ar": 3.382,
"cpt_az": 2.589,
"cpt_overall": 3.381,
"gain_pct": 0.0,
"n_texts": 10000
},
{
"dataset": "Darija-Wiki",
"tokenizer": "DarijaBERT-az",
"vocab_size": 110000,
"source": "external",
"fertility_ar": 1.858,
"fertility_az": 2.219,
"fertility_overall": 1.858,
"cpt_ar": 2.829,
"cpt_az": 2.946,
"cpt_overall": 2.829,
"gain_pct": 0.0,
"n_texts": 10000
},
{
"dataset": "Darija-Wiki",
"tokenizer": "DarijaBERT-mix",
"vocab_size": 160000,
"source": "external",
"fertility_ar": 1.502,
"fertility_az": 2.142,
"fertility_overall": 1.503,
"cpt_ar": 3.512,
"cpt_az": 3.04,
"cpt_overall": 3.511,
"gain_pct": 0.0,
"n_texts": 10000
},
{
"dataset": "Darija-Wiki",
"tokenizer": "Ours (80K WP)",
"vocab_size": 80000,
"source": "ours",
"fertility_ar": 1.839,
"fertility_az": 3.265,
"fertility_overall": 1.841,
"cpt_ar": 2.883,
"cpt_az": 2.07,
"cpt_overall": 2.882,
"gain_pct": -17.9,
"n_texts": 10000
},
{
"dataset": "Darija-Wiki",
"tokenizer": "Ours (110K WP)",
"vocab_size": 110000,
"source": "ours",
"fertility_ar": 1.797,
"fertility_az": 3.184,
"fertility_overall": 1.799,
"cpt_ar": 2.951,
"cpt_az": 2.124,
"cpt_overall": 2.95,
"gain_pct": 3.2,
"n_texts": 10000
},
{
"dataset": "Darija-Wiki",
"tokenizer": "Ours (32K BPE)",
"vocab_size": 32000,
"source": "ours",
"fertility_ar": 2.102,
"fertility_az": 3.805,
"fertility_overall": 2.104,
"cpt_ar": 2.519,
"cpt_az": 1.743,
"cpt_overall": 2.518,
"gain_pct": -40.0,
"n_texts": 10000
},
{
"dataset": "Atlaset",
"tokenizer": "DarijaBERT-ar",
"vocab_size": 80000,
"source": "external",
"fertility_ar": 1.407,
"fertility_az": 2.056,
"fertility_overall": 1.41,
"cpt_ar": 4.051,
"cpt_az": 1.803,
"cpt_overall": 4.041,
"gain_pct": 0.0,
"n_texts": 10000
},
{
"dataset": "Atlaset",
"tokenizer": "DarijaBERT-az",
"vocab_size": 110000,
"source": "external",
"fertility_ar": 1.772,
"fertility_az": 1.918,
"fertility_overall": 1.773,
"cpt_ar": 3.22,
"cpt_az": 1.965,
"cpt_overall": 3.215,
"gain_pct": 0.0,
"n_texts": 10000
},
{
"dataset": "Atlaset",
"tokenizer": "DarijaBERT-mix",
"vocab_size": 160000,
"source": "external",
"fertility_ar": 1.379,
"fertility_az": 1.922,
"fertility_overall": 1.381,
"cpt_ar": 4.131,
"cpt_az": 1.963,
"cpt_overall": 4.122,
"gain_pct": 0.0,
"n_texts": 10000
},
{
"dataset": "Atlaset",
"tokenizer": "Ours (80K WP)",
"vocab_size": 80000,
"source": "ours",
"fertility_ar": 1.618,
"fertility_az": 2.293,
"fertility_overall": 1.621,
"cpt_ar": 3.594,
"cpt_az": 1.771,
"cpt_overall": 3.586,
"gain_pct": -15.0,
"n_texts": 10000
},
{
"dataset": "Atlaset",
"tokenizer": "Ours (110K WP)",
"vocab_size": 110000,
"source": "ours",
"fertility_ar": 1.566,
"fertility_az": 2.183,
"fertility_overall": 1.569,
"cpt_ar": 3.714,
"cpt_az": 1.846,
"cpt_overall": 3.706,
"gain_pct": 11.5,
"n_texts": 10000
},
{
"dataset": "Atlaset",
"tokenizer": "Ours (32K BPE)",
"vocab_size": 32000,
"source": "ours",
"fertility_ar": 1.864,
"fertility_az": 2.571,
"fertility_overall": 1.867,
"cpt_ar": 3.119,
"cpt_az": 1.492,
"cpt_overall": 3.112,
"gain_pct": -35.2,
"n_texts": 10000
}
]