File size: 5,466 Bytes
343bbb9 09e8b22 343bbb9 09e8b22 343bbb9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 | [
{
"name": "concat_bpe_8000",
"source": "ours",
"algorithm": "bpe",
"architecture": "concatenated",
"vocab_size": 8000,
"fertility_ar": 1.677255929872786,
"fertility_az": 1.5274486975717831,
"fertility_overall": 1.625089409755206,
"disparity": 0.0893168595399542,
"cpt_ar": 3.3745329363977343,
"cpt_az": 3.4826508889784487,
"exact_match_ar": 0.9989120580235721,
"exact_match_az": 0.9964364500254539
},
{
"name": "concat_wordpiece_16000",
"source": "ours",
"algorithm": "wordpiece",
"architecture": "concatenated",
"vocab_size": 16000,
"fertility_ar": 1.4835885687430344,
"fertility_az": 1.3635334584201908,
"fertility_overall": 1.441782460775622,
"disparity": 0.08092210526032824,
"cpt_ar": 3.81082194270718,
"cpt_az": 3.9034221417934196,
"exact_match_ar": 0.0,
"exact_match_az": 0.0
},
{
"name": "concat_bpe_32000",
"source": "ours",
"algorithm": "bpe",
"architecture": "concatenated",
"vocab_size": 32000,
"fertility_ar": 1.3077466165859795,
"fertility_az": 1.1981161479838143,
"fertility_overall": 1.2695706222898995,
"disparity": 0.08383158267185423,
"cpt_ar": 4.388664556275568,
"cpt_az": 4.487021365858093,
"exact_match_ar": 0.0,
"exact_match_az": 0.0
},
{
"name": "CaMeLBERT-MSA",
"source": "external_msa",
"algorithm": "WordPiece",
"architecture": "shared",
"vocab_size": 30000,
"fertility_ar": 1.816901627794181,
"fertility_az": 3.173471216645639,
"fertility_overall": 2.2892921369888652,
"disparity": 0.42747184273672184,
"cpt_ar": 3.1140517054288748,
"cpt_az": 1.6388977943422216,
"exact_match_ar": 0.2987307343608341,
"exact_match_az": 0.3885117936534872
},
{
"name": "Asafaya-BERT",
"source": "external_msa",
"algorithm": "WordPiece",
"architecture": "shared",
"vocab_size": 32000,
"fertility_ar": 1.7951679730188856,
"fertility_az": 2.794867411003224,
"fertility_overall": 2.143287620128837,
"disparity": 0.35769118565288,
"cpt_ar": 3.1312619206100933,
"cpt_az": 1.8618984630783484,
"exact_match_ar": 0.19895738893925657,
"exact_match_az": 0.15238418462582726
},
{
"name": "Aranizer-SP-86k",
"source": "external_msa",
"algorithm": "SentencePiece",
"architecture": "shared",
"vocab_size": 86000,
"fertility_ar": 1.5940707675919776,
"fertility_az": 2.5232344233957935,
"fertility_overall": 1.9176281406140119,
"disparity": 0.3682430959202508,
"cpt_ar": 3.532878084819776,
"cpt_az": 2.06546468780593,
"exact_match_ar": 0.9976881233000907,
"exact_match_az": 0.9955031393178347
},
{
"name": "B2BERT",
"source": "external_msa",
"algorithm": "WordPiece",
"architecture": "shared",
"vocab_size": 30000,
"fertility_ar": 1.816901627794181,
"fertility_az": 3.173471216645639,
"fertility_overall": 2.2892921369888652,
"disparity": 0.42747184273672184,
"cpt_ar": 3.1140517054288748,
"cpt_az": 1.6388977943422216,
"exact_match_ar": 0.2987307343608341,
"exact_match_az": 0.3885117936534872
},
{
"name": "DarijaBERT-ar",
"source": "external_darija",
"algorithm": "WordPiece",
"architecture": "shared",
"vocab_size": 80000,
"fertility_ar": 1.4174523931345464,
"fertility_az": 2.4040608107988306,
"fertility_overall": 1.7610134287249042,
"disparity": 0.41039245481334147,
"cpt_ar": 3.9815436288421893,
"cpt_az": 2.170223436755666,
"exact_match_ar": 0.13721668177697188,
"exact_match_az": 0.07984048871542508
},
{
"name": "DarijaBERT-az",
"source": "external_darija",
"algorithm": "WordPiece",
"architecture": "shared",
"vocab_size": 110000,
"fertility_ar": 1.6053190877519825,
"fertility_az": 1.5171613637119812,
"fertility_overall": 1.5746204251172413,
"disparity": 0.05491601309210961,
"cpt_ar": 3.476952784741954,
"cpt_az": 3.451281188374697,
"exact_match_ar": 0.14777878513145964,
"exact_match_az": 0.07984048871542508
},
{
"name": "Moroccan-Darija-Tokenizer",
"source": "external_darija",
"algorithm": "BPE",
"architecture": "shared",
"vocab_size": 30000,
"fertility_ar": 1.5701289164575563,
"fertility_az": 2.9024430947615336,
"fertility_overall": 2.034073102047897,
"disparity": 0.4590319723093283,
"cpt_ar": 3.626351312053874,
"cpt_az": 1.7930892097539761,
"exact_match_ar": 0.00018132366273798732,
"exact_match_az": 0.0
},
{
"name": "Translit-Darija",
"source": "external_darija",
"algorithm": "BPE",
"architecture": "shared",
"vocab_size": 30000,
"fertility_ar": 1.7964830125579598,
"fertility_az": 1.6576487239140443,
"fertility_overall": 1.7481375381752506,
"disparity": 0.0772811586157074,
"cpt_ar": 3.1115965796575096,
"cpt_az": 3.1602226731778864,
"exact_match_ar": 0.0,
"exact_match_az": 0.0
},
{
"name": "Qwen2.5-Darija",
"source": "external_darija",
"algorithm": "SentencePiece",
"architecture": "shared",
"vocab_size": 151643,
"fertility_ar": 2.340330300987948,
"fertility_az": 2.245717380136819,
"fertility_overall": 2.307383781897024,
"disparity": 0.04042716569160743,
"cpt_ar": 2.3801472507543755,
"cpt_az": 2.325374802332435,
"exact_match_ar": 1.0,
"exact_match_az": 1.0
}
] |