File size: 2,918 Bytes
d5947d7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
{
"vocab_size": 16000,
"num_samples": 1000,
"avg_tokens_per_text": 688.021,
"avg_chars_per_token": 4.176231539444291,
"token_frequency": {
"▁के": 22686,
"।": 19273,
"▁में": 16210,
"▁है": 16067,
"▁0": 13182,
"▁की": 12858,
"▁को": 10132,
"▁से": 10072,
"▁और": 9746,
"▁का": 7612,
"▁हैं": 6469,
"▁पर": 6117,
"▁ने": 5582,
"▁कि": 5028,
"ों": 4854,
"▁लिए": 4562,
"▁एक": 4558,
"▁भी": 4149,
"▁नहीं": 3721,
"ी": 3498,
"▁इस": 3040,
"▁कर": 2959,
"▁किया": 2772,
"▁यह": 2701,
"▁करने": 2487,
"▁था": 2474,
"▁ही": 2432,
"▁हो": 2428,
"▁साथ": 2289,
"ा": 2263,
"▁तो": 2235,
"▁": 2163,
"▁गया": 1949,
"े": 1904,
"▁कहा": 1665,
"▁अपने": 1596,
"र": 1504,
"▁बाद": 1476,
"▁रहे": 1446,
"▁।": 1419,
"▁जो": 1413,
"न": 1394,
"▁तक": 1316,
"▁दिया": 1297,
"▁आप": 1239,
"▁या": 1235,
"▁लेकिन": 1229,
"s": 1216,
"▁रहा": 1166,
"▁थे": 1154,
"▁थी": 1135,
"▁कुछ": 1122,
"▁हुए": 1074,
"▁वह": 1063,
"▁जा": 1059,
"▁रूप": 1059,
"▁पहले": 1056,
"ता": 1010,
"▁उन्होंने": 1000,
"▁करते": 995,
"▁तरह": 993,
"▁रही": 984,
"▁गई": 968,
"क": 946,
"ल": 922,
"▁किसी": 910,
"▁जाता": 895,
"ः": 889,
"▁इसके": 878,
"▁होने": 874,
"▁लोगों": 872,
"▁समय": 869,
"▁न": 860,
"म": 859,
"्स": 858,
"▁ये": 857,
"▁वाले": 856,
"▁भारत": 851,
"▁करना": 842,
"▁कोई": 839,
"▁अब": 837,
"▁अपनी": 837,
"▁क्या": 837,
"ने": 834,
"▁वे": 831,
"▁बहुत": 827,
"▁सरकार": 820,
"▁जब": 808,
"▁सकता": 808,
"▁मैं": 807,
"▁बात": 800,
"▁गए": 791,
"▁सभी": 785,
"ो": 785,
"▁व": 778,
"▁काम": 772,
"स": 759,
"ं": 743,
"▁द्वारा": 741,
"▁सकते": 739
},
"token_length_distribution": {
"1": 53072,
"2": 62697,
"3": 196220,
"4": 111226,
"5": 108766,
"6": 67569,
"7": 41320,
"8": 22604,
"9": 13027,
"10": 5885,
"11": 3074,
"12": 1590,
"13": 671,
"14": 225,
"15": 71,
"16": 4
},
"unicode_script_coverage": {},
"decodability_accuracy": 0.139,
"special_tokens_count": 0
} |