daa-tokenizers / external_comparison.json
Ouaill's picture
Update external_comparison.json with all 9 external tokenizers
09e8b22 verified
Raw
History Blame Contribute Delete
5.47 kB
[
{
"name": "concat_bpe_8000",
"source": "ours",
"algorithm": "bpe",
"architecture": "concatenated",
"vocab_size": 8000,
"fertility_ar": 1.677255929872786,
"fertility_az": 1.5274486975717831,
"fertility_overall": 1.625089409755206,
"disparity": 0.0893168595399542,
"cpt_ar": 3.3745329363977343,
"cpt_az": 3.4826508889784487,
"exact_match_ar": 0.9989120580235721,
"exact_match_az": 0.9964364500254539
},
{
"name": "concat_wordpiece_16000",
"source": "ours",
"algorithm": "wordpiece",
"architecture": "concatenated",
"vocab_size": 16000,
"fertility_ar": 1.4835885687430344,
"fertility_az": 1.3635334584201908,
"fertility_overall": 1.441782460775622,
"disparity": 0.08092210526032824,
"cpt_ar": 3.81082194270718,
"cpt_az": 3.9034221417934196,
"exact_match_ar": 0.0,
"exact_match_az": 0.0
},
{
"name": "concat_bpe_32000",
"source": "ours",
"algorithm": "bpe",
"architecture": "concatenated",
"vocab_size": 32000,
"fertility_ar": 1.3077466165859795,
"fertility_az": 1.1981161479838143,
"fertility_overall": 1.2695706222898995,
"disparity": 0.08383158267185423,
"cpt_ar": 4.388664556275568,
"cpt_az": 4.487021365858093,
"exact_match_ar": 0.0,
"exact_match_az": 0.0
},
{
"name": "CaMeLBERT-MSA",
"source": "external_msa",
"algorithm": "WordPiece",
"architecture": "shared",
"vocab_size": 30000,
"fertility_ar": 1.816901627794181,
"fertility_az": 3.173471216645639,
"fertility_overall": 2.2892921369888652,
"disparity": 0.42747184273672184,
"cpt_ar": 3.1140517054288748,
"cpt_az": 1.6388977943422216,
"exact_match_ar": 0.2987307343608341,
"exact_match_az": 0.3885117936534872
},
{
"name": "Asafaya-BERT",
"source": "external_msa",
"algorithm": "WordPiece",
"architecture": "shared",
"vocab_size": 32000,
"fertility_ar": 1.7951679730188856,
"fertility_az": 2.794867411003224,
"fertility_overall": 2.143287620128837,
"disparity": 0.35769118565288,
"cpt_ar": 3.1312619206100933,
"cpt_az": 1.8618984630783484,
"exact_match_ar": 0.19895738893925657,
"exact_match_az": 0.15238418462582726
},
{
"name": "Aranizer-SP-86k",
"source": "external_msa",
"algorithm": "SentencePiece",
"architecture": "shared",
"vocab_size": 86000,
"fertility_ar": 1.5940707675919776,
"fertility_az": 2.5232344233957935,
"fertility_overall": 1.9176281406140119,
"disparity": 0.3682430959202508,
"cpt_ar": 3.532878084819776,
"cpt_az": 2.06546468780593,
"exact_match_ar": 0.9976881233000907,
"exact_match_az": 0.9955031393178347
},
{
"name": "B2BERT",
"source": "external_msa",
"algorithm": "WordPiece",
"architecture": "shared",
"vocab_size": 30000,
"fertility_ar": 1.816901627794181,
"fertility_az": 3.173471216645639,
"fertility_overall": 2.2892921369888652,
"disparity": 0.42747184273672184,
"cpt_ar": 3.1140517054288748,
"cpt_az": 1.6388977943422216,
"exact_match_ar": 0.2987307343608341,
"exact_match_az": 0.3885117936534872
},
{
"name": "DarijaBERT-ar",
"source": "external_darija",
"algorithm": "WordPiece",
"architecture": "shared",
"vocab_size": 80000,
"fertility_ar": 1.4174523931345464,
"fertility_az": 2.4040608107988306,
"fertility_overall": 1.7610134287249042,
"disparity": 0.41039245481334147,
"cpt_ar": 3.9815436288421893,
"cpt_az": 2.170223436755666,
"exact_match_ar": 0.13721668177697188,
"exact_match_az": 0.07984048871542508
},
{
"name": "DarijaBERT-az",
"source": "external_darija",
"algorithm": "WordPiece",
"architecture": "shared",
"vocab_size": 110000,
"fertility_ar": 1.6053190877519825,
"fertility_az": 1.5171613637119812,
"fertility_overall": 1.5746204251172413,
"disparity": 0.05491601309210961,
"cpt_ar": 3.476952784741954,
"cpt_az": 3.451281188374697,
"exact_match_ar": 0.14777878513145964,
"exact_match_az": 0.07984048871542508
},
{
"name": "Moroccan-Darija-Tokenizer",
"source": "external_darija",
"algorithm": "BPE",
"architecture": "shared",
"vocab_size": 30000,
"fertility_ar": 1.5701289164575563,
"fertility_az": 2.9024430947615336,
"fertility_overall": 2.034073102047897,
"disparity": 0.4590319723093283,
"cpt_ar": 3.626351312053874,
"cpt_az": 1.7930892097539761,
"exact_match_ar": 0.00018132366273798732,
"exact_match_az": 0.0
},
{
"name": "Translit-Darija",
"source": "external_darija",
"algorithm": "BPE",
"architecture": "shared",
"vocab_size": 30000,
"fertility_ar": 1.7964830125579598,
"fertility_az": 1.6576487239140443,
"fertility_overall": 1.7481375381752506,
"disparity": 0.0772811586157074,
"cpt_ar": 3.1115965796575096,
"cpt_az": 3.1602226731778864,
"exact_match_ar": 0.0,
"exact_match_az": 0.0
},
{
"name": "Qwen2.5-Darija",
"source": "external_darija",
"algorithm": "SentencePiece",
"architecture": "shared",
"vocab_size": 151643,
"fertility_ar": 2.340330300987948,
"fertility_az": 2.245717380136819,
"fertility_overall": 2.307383781897024,
"disparity": 0.04042716569160743,
"cpt_ar": 2.3801472507543755,
"cpt_az": 2.325374802332435,
"exact_match_ar": 1.0,
"exact_match_az": 1.0
}
]