hindi-embedding-foundational-model / tokenizer_evaluation.json
DeepMostInnovations's picture
Upload Hindi embeddings model and all associated files
d5947d7 verified
{
"vocab_size": 16000,
"num_samples": 1000,
"avg_tokens_per_text": 688.021,
"avg_chars_per_token": 4.176231539444291,
"token_frequency": {
"▁के": 22686,
"।": 19273,
"▁में": 16210,
"▁है": 16067,
"▁0": 13182,
"▁की": 12858,
"▁को": 10132,
"▁से": 10072,
"▁और": 9746,
"▁का": 7612,
"▁हैं": 6469,
"▁पर": 6117,
"▁ने": 5582,
"▁कि": 5028,
"ों": 4854,
"▁लिए": 4562,
"▁एक": 4558,
"▁भी": 4149,
"▁नहीं": 3721,
"ी": 3498,
"▁इस": 3040,
"▁कर": 2959,
"▁किया": 2772,
"▁यह": 2701,
"▁करने": 2487,
"▁था": 2474,
"▁ही": 2432,
"▁हो": 2428,
"▁साथ": 2289,
"ा": 2263,
"▁तो": 2235,
"▁": 2163,
"▁गया": 1949,
"े": 1904,
"▁कहा": 1665,
"▁अपने": 1596,
"र": 1504,
"▁बाद": 1476,
"▁रहे": 1446,
"▁।": 1419,
"▁जो": 1413,
"न": 1394,
"▁तक": 1316,
"▁दिया": 1297,
"▁आप": 1239,
"▁या": 1235,
"▁लेकिन": 1229,
"s": 1216,
"▁रहा": 1166,
"▁थे": 1154,
"▁थी": 1135,
"▁कुछ": 1122,
"▁हुए": 1074,
"▁वह": 1063,
"▁जा": 1059,
"▁रूप": 1059,
"▁पहले": 1056,
"ता": 1010,
"▁उन्होंने": 1000,
"▁करते": 995,
"▁तरह": 993,
"▁रही": 984,
"▁गई": 968,
"क": 946,
"ल": 922,
"▁किसी": 910,
"▁जाता": 895,
"ः": 889,
"▁इसके": 878,
"▁होने": 874,
"▁लोगों": 872,
"▁समय": 869,
"▁न": 860,
"म": 859,
"्स": 858,
"▁ये": 857,
"▁वाले": 856,
"▁भारत": 851,
"▁करना": 842,
"▁कोई": 839,
"▁अब": 837,
"▁अपनी": 837,
"▁क्या": 837,
"ने": 834,
"▁वे": 831,
"▁बहुत": 827,
"▁सरकार": 820,
"▁जब": 808,
"▁सकता": 808,
"▁मैं": 807,
"▁बात": 800,
"▁गए": 791,
"▁सभी": 785,
"ो": 785,
"▁व": 778,
"▁काम": 772,
"स": 759,
"ं": 743,
"▁द्वारा": 741,
"▁सकते": 739
},
"token_length_distribution": {
"1": 53072,
"2": 62697,
"3": 196220,
"4": 111226,
"5": 108766,
"6": 67569,
"7": 41320,
"8": 22604,
"9": 13027,
"10": 5885,
"11": 3074,
"12": 1590,
"13": 671,
"14": 225,
"15": 71,
"16": 4
},
"unicode_script_coverage": {},
"decodability_accuracy": 0.139,
"special_tokens_count": 0
}