{ "vocab_size": 16000, "num_samples": 1000, "avg_tokens_per_text": 688.021, "avg_chars_per_token": 4.176231539444291, "token_frequency": { "▁के": 22686, "।": 19273, "▁में": 16210, "▁है": 16067, "▁0": 13182, "▁की": 12858, "▁को": 10132, "▁से": 10072, "▁और": 9746, "▁का": 7612, "▁हैं": 6469, "▁पर": 6117, "▁ने": 5582, "▁कि": 5028, "ों": 4854, "▁लिए": 4562, "▁एक": 4558, "▁भी": 4149, "▁नहीं": 3721, "ी": 3498, "▁इस": 3040, "▁कर": 2959, "▁किया": 2772, "▁यह": 2701, "▁करने": 2487, "▁था": 2474, "▁ही": 2432, "▁हो": 2428, "▁साथ": 2289, "ा": 2263, "▁तो": 2235, "▁": 2163, "▁गया": 1949, "े": 1904, "▁कहा": 1665, "▁अपने": 1596, "र": 1504, "▁बाद": 1476, "▁रहे": 1446, "▁।": 1419, "▁जो": 1413, "न": 1394, "▁तक": 1316, "▁दिया": 1297, "▁आप": 1239, "▁या": 1235, "▁लेकिन": 1229, "s": 1216, "▁रहा": 1166, "▁थे": 1154, "▁थी": 1135, "▁कुछ": 1122, "▁हुए": 1074, "▁वह": 1063, "▁जा": 1059, "▁रूप": 1059, "▁पहले": 1056, "ता": 1010, "▁उन्होंने": 1000, "▁करते": 995, "▁तरह": 993, "▁रही": 984, "▁गई": 968, "क": 946, "ल": 922, "▁किसी": 910, "▁जाता": 895, "ः": 889, "▁इसके": 878, "▁होने": 874, "▁लोगों": 872, "▁समय": 869, "▁न": 860, "म": 859, "्स": 858, "▁ये": 857, "▁वाले": 856, "▁भारत": 851, "▁करना": 842, "▁कोई": 839, "▁अब": 837, "▁अपनी": 837, "▁क्या": 837, "ने": 834, "▁वे": 831, "▁बहुत": 827, "▁सरकार": 820, "▁जब": 808, "▁सकता": 808, "▁मैं": 807, "▁बात": 800, "▁गए": 791, "▁सभी": 785, "ो": 785, "▁व": 778, "▁काम": 772, "स": 759, "ं": 743, "▁द्वारा": 741, "▁सकते": 739 }, "token_length_distribution": { "1": 53072, "2": 62697, "3": 196220, "4": 111226, "5": 108766, "6": 67569, "7": 41320, "8": 22604, "9": 13027, "10": 5885, "11": 3074, "12": 1590, "13": 671, "14": 225, "15": 71, "16": 4 }, "unicode_script_coverage": {}, "decodability_accuracy": 0.139, "special_tokens_count": 0 }