| { | |
| "vocab_size": 16000, | |
| "num_samples": 1000, | |
| "avg_tokens_per_text": 688.021, | |
| "avg_chars_per_token": 4.176231539444291, | |
| "token_frequency": { | |
| "▁के": 22686, | |
| "।": 19273, | |
| "▁में": 16210, | |
| "▁है": 16067, | |
| "▁0": 13182, | |
| "▁की": 12858, | |
| "▁को": 10132, | |
| "▁से": 10072, | |
| "▁और": 9746, | |
| "▁का": 7612, | |
| "▁हैं": 6469, | |
| "▁पर": 6117, | |
| "▁ने": 5582, | |
| "▁कि": 5028, | |
| "ों": 4854, | |
| "▁लिए": 4562, | |
| "▁एक": 4558, | |
| "▁भी": 4149, | |
| "▁नहीं": 3721, | |
| "ी": 3498, | |
| "▁इस": 3040, | |
| "▁कर": 2959, | |
| "▁किया": 2772, | |
| "▁यह": 2701, | |
| "▁करने": 2487, | |
| "▁था": 2474, | |
| "▁ही": 2432, | |
| "▁हो": 2428, | |
| "▁साथ": 2289, | |
| "ा": 2263, | |
| "▁तो": 2235, | |
| "▁": 2163, | |
| "▁गया": 1949, | |
| "े": 1904, | |
| "▁कहा": 1665, | |
| "▁अपने": 1596, | |
| "र": 1504, | |
| "▁बाद": 1476, | |
| "▁रहे": 1446, | |
| "▁।": 1419, | |
| "▁जो": 1413, | |
| "न": 1394, | |
| "▁तक": 1316, | |
| "▁दिया": 1297, | |
| "▁आप": 1239, | |
| "▁या": 1235, | |
| "▁लेकिन": 1229, | |
| "s": 1216, | |
| "▁रहा": 1166, | |
| "▁थे": 1154, | |
| "▁थी": 1135, | |
| "▁कुछ": 1122, | |
| "▁हुए": 1074, | |
| "▁वह": 1063, | |
| "▁जा": 1059, | |
| "▁रूप": 1059, | |
| "▁पहले": 1056, | |
| "ता": 1010, | |
| "▁उन्होंने": 1000, | |
| "▁करते": 995, | |
| "▁तरह": 993, | |
| "▁रही": 984, | |
| "▁गई": 968, | |
| "क": 946, | |
| "ल": 922, | |
| "▁किसी": 910, | |
| "▁जाता": 895, | |
| "ः": 889, | |
| "▁इसके": 878, | |
| "▁होने": 874, | |
| "▁लोगों": 872, | |
| "▁समय": 869, | |
| "▁न": 860, | |
| "म": 859, | |
| "्स": 858, | |
| "▁ये": 857, | |
| "▁वाले": 856, | |
| "▁भारत": 851, | |
| "▁करना": 842, | |
| "▁कोई": 839, | |
| "▁अब": 837, | |
| "▁अपनी": 837, | |
| "▁क्या": 837, | |
| "ने": 834, | |
| "▁वे": 831, | |
| "▁बहुत": 827, | |
| "▁सरकार": 820, | |
| "▁जब": 808, | |
| "▁सकता": 808, | |
| "▁मैं": 807, | |
| "▁बात": 800, | |
| "▁गए": 791, | |
| "▁सभी": 785, | |
| "ो": 785, | |
| "▁व": 778, | |
| "▁काम": 772, | |
| "स": 759, | |
| "ं": 743, | |
| "▁द्वारा": 741, | |
| "▁सकते": 739 | |
| }, | |
| "token_length_distribution": { | |
| "1": 53072, | |
| "2": 62697, | |
| "3": 196220, | |
| "4": 111226, | |
| "5": 108766, | |
| "6": 67569, | |
| "7": 41320, | |
| "8": 22604, | |
| "9": 13027, | |
| "10": 5885, | |
| "11": 3074, | |
| "12": 1590, | |
| "13": 671, | |
| "14": 225, | |
| "15": 71, | |
| "16": 4 | |
| }, | |
| "unicode_script_coverage": {}, | |
| "decodability_accuracy": 0.139, | |
| "special_tokens_count": 0 | |
| } |