Instructions to use eac123/clean-subliminal-learning-unicorns with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use eac123/clean-subliminal-learning-unicorns with PEFT:
from peft import PeftModel from transformers import AutoModelForCausalLM base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-14B-Instruct") model = PeftModel.from_pretrained(base_model, "eac123/clean-subliminal-learning-unicorns") - Notebooks
- Google Colab
- Kaggle
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 804, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.1184664368629456, | |
| "epoch": 0.003734827264239029, | |
| "grad_norm": 0.411286324262619, | |
| "learning_rate": 0.0002, | |
| "loss": 2.457291841506958, | |
| "mean_token_accuracy": 0.5408388376235962, | |
| "num_tokens": 16491.0, | |
| "step": 1 | |
| }, | |
| { | |
| "entropy": 1.2453091144561768, | |
| "epoch": 0.007469654528478058, | |
| "grad_norm": 0.37089085578918457, | |
| "learning_rate": 0.0002, | |
| "loss": 2.1685681343078613, | |
| "mean_token_accuracy": 0.5649923086166382, | |
| "num_tokens": 32759.0, | |
| "step": 2 | |
| }, | |
| { | |
| "entropy": 1.4064331948757172, | |
| "epoch": 0.011204481792717087, | |
| "grad_norm": 0.2906820774078369, | |
| "learning_rate": 0.0002, | |
| "loss": 1.710010051727295, | |
| "mean_token_accuracy": 0.5920955091714859, | |
| "num_tokens": 49020.0, | |
| "step": 3 | |
| }, | |
| { | |
| "entropy": 1.364386886358261, | |
| "epoch": 0.014939309056956116, | |
| "grad_norm": 0.22797873616218567, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3888747692108154, | |
| "mean_token_accuracy": 0.6421842128038406, | |
| "num_tokens": 65604.0, | |
| "step": 4 | |
| }, | |
| { | |
| "entropy": 1.3538264036178589, | |
| "epoch": 0.018674136321195144, | |
| "grad_norm": 0.2804432809352875, | |
| "learning_rate": 0.0002, | |
| "loss": 1.29875648021698, | |
| "mean_token_accuracy": 0.6417761594057083, | |
| "num_tokens": 81941.0, | |
| "step": 5 | |
| }, | |
| { | |
| "entropy": 1.2739673852920532, | |
| "epoch": 0.022408963585434174, | |
| "grad_norm": 0.15289267897605896, | |
| "learning_rate": 0.0002, | |
| "loss": 1.1843445301055908, | |
| "mean_token_accuracy": 0.6661720275878906, | |
| "num_tokens": 98022.0, | |
| "step": 6 | |
| }, | |
| { | |
| "entropy": 1.1963406801223755, | |
| "epoch": 0.026143790849673203, | |
| "grad_norm": 0.1057928279042244, | |
| "learning_rate": 0.0002, | |
| "loss": 1.089585304260254, | |
| "mean_token_accuracy": 0.6709173172712326, | |
| "num_tokens": 114552.0, | |
| "step": 7 | |
| }, | |
| { | |
| "entropy": 1.1228278279304504, | |
| "epoch": 0.029878618113912233, | |
| "grad_norm": 0.10864286869764328, | |
| "learning_rate": 0.0002, | |
| "loss": 1.028782844543457, | |
| "mean_token_accuracy": 0.6796794384717941, | |
| "num_tokens": 130943.0, | |
| "step": 8 | |
| }, | |
| { | |
| "entropy": 1.0330480933189392, | |
| "epoch": 0.03361344537815126, | |
| "grad_norm": 0.1194700375199318, | |
| "learning_rate": 0.0002, | |
| "loss": 0.978877067565918, | |
| "mean_token_accuracy": 0.6896098554134369, | |
| "num_tokens": 147432.0, | |
| "step": 9 | |
| }, | |
| { | |
| "entropy": 0.9659490436315536, | |
| "epoch": 0.03734827264239029, | |
| "grad_norm": 0.13075368106365204, | |
| "learning_rate": 0.0002, | |
| "loss": 0.93321692943573, | |
| "mean_token_accuracy": 0.6966541409492493, | |
| "num_tokens": 163753.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 0.9611389189958572, | |
| "epoch": 0.04108309990662932, | |
| "grad_norm": 0.10369610041379929, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8770816922187805, | |
| "mean_token_accuracy": 0.7034913301467896, | |
| "num_tokens": 180090.0, | |
| "step": 11 | |
| }, | |
| { | |
| "entropy": 0.9063249081373215, | |
| "epoch": 0.04481792717086835, | |
| "grad_norm": 0.10426584631204605, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8171504139900208, | |
| "mean_token_accuracy": 0.7150022834539413, | |
| "num_tokens": 196381.0, | |
| "step": 12 | |
| }, | |
| { | |
| "entropy": 0.8290252089500427, | |
| "epoch": 0.04855275443510738, | |
| "grad_norm": 0.10911860316991806, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7891132831573486, | |
| "mean_token_accuracy": 0.7208491563796997, | |
| "num_tokens": 212795.0, | |
| "step": 13 | |
| }, | |
| { | |
| "entropy": 0.7808938026428223, | |
| "epoch": 0.05228758169934641, | |
| "grad_norm": 0.10144662111997604, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7427304983139038, | |
| "mean_token_accuracy": 0.7313003540039062, | |
| "num_tokens": 228936.0, | |
| "step": 14 | |
| }, | |
| { | |
| "entropy": 0.7421854734420776, | |
| "epoch": 0.056022408963585436, | |
| "grad_norm": 0.6942080855369568, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7379668354988098, | |
| "mean_token_accuracy": 0.7287779599428177, | |
| "num_tokens": 245241.0, | |
| "step": 15 | |
| }, | |
| { | |
| "entropy": 0.7045212388038635, | |
| "epoch": 0.059757236227824466, | |
| "grad_norm": 0.16263937950134277, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7117007374763489, | |
| "mean_token_accuracy": 0.7335064858198166, | |
| "num_tokens": 261386.0, | |
| "step": 16 | |
| }, | |
| { | |
| "entropy": 0.6911872327327728, | |
| "epoch": 0.06349206349206349, | |
| "grad_norm": 0.08423176407814026, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6914121508598328, | |
| "mean_token_accuracy": 0.7408997714519501, | |
| "num_tokens": 278017.0, | |
| "step": 17 | |
| }, | |
| { | |
| "entropy": 0.6928284466266632, | |
| "epoch": 0.06722689075630252, | |
| "grad_norm": 0.08306165784597397, | |
| "learning_rate": 0.0002, | |
| "loss": 0.679314911365509, | |
| "mean_token_accuracy": 0.7417374551296234, | |
| "num_tokens": 294613.0, | |
| "step": 18 | |
| }, | |
| { | |
| "entropy": 0.6805895417928696, | |
| "epoch": 0.07096171802054155, | |
| "grad_norm": 0.7392253279685974, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6667531728744507, | |
| "mean_token_accuracy": 0.7472580522298813, | |
| "num_tokens": 311040.0, | |
| "step": 19 | |
| }, | |
| { | |
| "entropy": 0.6846933215856552, | |
| "epoch": 0.07469654528478058, | |
| "grad_norm": 0.08478110283613205, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6531012654304504, | |
| "mean_token_accuracy": 0.7482306957244873, | |
| "num_tokens": 327255.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 0.6761725544929504, | |
| "epoch": 0.0784313725490196, | |
| "grad_norm": 0.07354654371738434, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6507971882820129, | |
| "mean_token_accuracy": 0.7495593726634979, | |
| "num_tokens": 343726.0, | |
| "step": 21 | |
| }, | |
| { | |
| "entropy": 0.6475691944360733, | |
| "epoch": 0.08216619981325864, | |
| "grad_norm": 0.0701100155711174, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6324924826622009, | |
| "mean_token_accuracy": 0.7519394010305405, | |
| "num_tokens": 360032.0, | |
| "step": 22 | |
| }, | |
| { | |
| "entropy": 0.6286474466323853, | |
| "epoch": 0.08590102707749767, | |
| "grad_norm": 0.07334811985492706, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6221117377281189, | |
| "mean_token_accuracy": 0.7562299370765686, | |
| "num_tokens": 376211.0, | |
| "step": 23 | |
| }, | |
| { | |
| "entropy": 0.6444061696529388, | |
| "epoch": 0.0896358543417367, | |
| "grad_norm": 0.10214248299598694, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6270927786827087, | |
| "mean_token_accuracy": 0.7587939649820328, | |
| "num_tokens": 392746.0, | |
| "step": 24 | |
| }, | |
| { | |
| "entropy": 0.6239012628793716, | |
| "epoch": 0.09337068160597572, | |
| "grad_norm": 0.07120268046855927, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6152804493904114, | |
| "mean_token_accuracy": 0.7588517516851425, | |
| "num_tokens": 409085.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 0.6276111602783203, | |
| "epoch": 0.09710550887021475, | |
| "grad_norm": 0.05954922363162041, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6084893345832825, | |
| "mean_token_accuracy": 0.7613021731376648, | |
| "num_tokens": 425336.0, | |
| "step": 26 | |
| }, | |
| { | |
| "entropy": 0.6411866247653961, | |
| "epoch": 0.10084033613445378, | |
| "grad_norm": 0.05856655165553093, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6222058534622192, | |
| "mean_token_accuracy": 0.7564119845628738, | |
| "num_tokens": 441729.0, | |
| "step": 27 | |
| }, | |
| { | |
| "entropy": 0.6264622807502747, | |
| "epoch": 0.10457516339869281, | |
| "grad_norm": 0.06027727574110031, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6105791330337524, | |
| "mean_token_accuracy": 0.7609841376543045, | |
| "num_tokens": 457957.0, | |
| "step": 28 | |
| }, | |
| { | |
| "entropy": 0.6167244166135788, | |
| "epoch": 0.10830999066293184, | |
| "grad_norm": 0.07074937224388123, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6111780405044556, | |
| "mean_token_accuracy": 0.7601886689662933, | |
| "num_tokens": 474399.0, | |
| "step": 29 | |
| }, | |
| { | |
| "entropy": 0.6115850210189819, | |
| "epoch": 0.11204481792717087, | |
| "grad_norm": 0.07707173377275467, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6015152335166931, | |
| "mean_token_accuracy": 0.7627497315406799, | |
| "num_tokens": 490919.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 0.6094368547201157, | |
| "epoch": 0.1157796451914099, | |
| "grad_norm": 0.059265896677970886, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6023207902908325, | |
| "mean_token_accuracy": 0.758778989315033, | |
| "num_tokens": 507283.0, | |
| "step": 31 | |
| }, | |
| { | |
| "entropy": 0.6125481128692627, | |
| "epoch": 0.11951447245564893, | |
| "grad_norm": 0.07099295407533646, | |
| "learning_rate": 0.0002, | |
| "loss": 0.603573203086853, | |
| "mean_token_accuracy": 0.7601557075977325, | |
| "num_tokens": 523521.0, | |
| "step": 32 | |
| }, | |
| { | |
| "entropy": 0.6020256727933884, | |
| "epoch": 0.12324929971988796, | |
| "grad_norm": 0.05661124736070633, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5916649103164673, | |
| "mean_token_accuracy": 0.7667604386806488, | |
| "num_tokens": 540024.0, | |
| "step": 33 | |
| }, | |
| { | |
| "entropy": 0.5748983919620514, | |
| "epoch": 0.12698412698412698, | |
| "grad_norm": 0.05405418947339058, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5715272426605225, | |
| "mean_token_accuracy": 0.7717257738113403, | |
| "num_tokens": 555993.0, | |
| "step": 34 | |
| }, | |
| { | |
| "entropy": 0.5811779201030731, | |
| "epoch": 0.13071895424836602, | |
| "grad_norm": 0.04870233312249184, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5783013701438904, | |
| "mean_token_accuracy": 0.7701490372419357, | |
| "num_tokens": 572358.0, | |
| "step": 35 | |
| }, | |
| { | |
| "entropy": 0.574293926358223, | |
| "epoch": 0.13445378151260504, | |
| "grad_norm": 0.05332570523023605, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5724313259124756, | |
| "mean_token_accuracy": 0.7740762829780579, | |
| "num_tokens": 588766.0, | |
| "step": 36 | |
| }, | |
| { | |
| "entropy": 0.5665481090545654, | |
| "epoch": 0.13818860877684408, | |
| "grad_norm": 0.0575035996735096, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5736980438232422, | |
| "mean_token_accuracy": 0.7706244140863419, | |
| "num_tokens": 604968.0, | |
| "step": 37 | |
| }, | |
| { | |
| "entropy": 0.5721801668405533, | |
| "epoch": 0.1419234360410831, | |
| "grad_norm": 0.07653734087944031, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5833261013031006, | |
| "mean_token_accuracy": 0.7672377377748489, | |
| "num_tokens": 621192.0, | |
| "step": 38 | |
| }, | |
| { | |
| "entropy": 0.5661971271038055, | |
| "epoch": 0.14565826330532214, | |
| "grad_norm": 0.052845459431409836, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5691311955451965, | |
| "mean_token_accuracy": 0.7725834846496582, | |
| "num_tokens": 637384.0, | |
| "step": 39 | |
| }, | |
| { | |
| "entropy": 0.5870122313499451, | |
| "epoch": 0.14939309056956115, | |
| "grad_norm": 0.05704643577337265, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5838981866836548, | |
| "mean_token_accuracy": 0.7632379680871964, | |
| "num_tokens": 653697.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 0.5808418691158295, | |
| "epoch": 0.1531279178338002, | |
| "grad_norm": 0.05715522915124893, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5737625360488892, | |
| "mean_token_accuracy": 0.7728984951972961, | |
| "num_tokens": 670046.0, | |
| "step": 41 | |
| }, | |
| { | |
| "entropy": 0.5726363211870193, | |
| "epoch": 0.1568627450980392, | |
| "grad_norm": 0.053971655666828156, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5629984736442566, | |
| "mean_token_accuracy": 0.7752888798713684, | |
| "num_tokens": 686076.0, | |
| "step": 42 | |
| }, | |
| { | |
| "entropy": 0.5652015507221222, | |
| "epoch": 0.16059757236227826, | |
| "grad_norm": 0.04180985689163208, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5623056292533875, | |
| "mean_token_accuracy": 0.7748470306396484, | |
| "num_tokens": 702484.0, | |
| "step": 43 | |
| }, | |
| { | |
| "entropy": 0.5733779072761536, | |
| "epoch": 0.16433239962651727, | |
| "grad_norm": 0.050310708582401276, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5759532451629639, | |
| "mean_token_accuracy": 0.7717497199773788, | |
| "num_tokens": 718709.0, | |
| "step": 44 | |
| }, | |
| { | |
| "entropy": 0.5682821422815323, | |
| "epoch": 0.16806722689075632, | |
| "grad_norm": 0.049945104867219925, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5656522512435913, | |
| "mean_token_accuracy": 0.7735471576452255, | |
| "num_tokens": 735195.0, | |
| "step": 45 | |
| }, | |
| { | |
| "entropy": 0.5685591697692871, | |
| "epoch": 0.17180205415499533, | |
| "grad_norm": 0.044939614832401276, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5674217939376831, | |
| "mean_token_accuracy": 0.7736205905675888, | |
| "num_tokens": 751212.0, | |
| "step": 46 | |
| }, | |
| { | |
| "entropy": 0.5851640552282333, | |
| "epoch": 0.17553688141923435, | |
| "grad_norm": 0.0478069968521595, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5874634981155396, | |
| "mean_token_accuracy": 0.7659626305103302, | |
| "num_tokens": 767689.0, | |
| "step": 47 | |
| }, | |
| { | |
| "entropy": 0.5731439292430878, | |
| "epoch": 0.1792717086834734, | |
| "grad_norm": 0.046887464821338654, | |
| "learning_rate": 0.0002, | |
| "loss": 0.571601152420044, | |
| "mean_token_accuracy": 0.7696335017681122, | |
| "num_tokens": 784074.0, | |
| "step": 48 | |
| }, | |
| { | |
| "entropy": 0.5621766149997711, | |
| "epoch": 0.1830065359477124, | |
| "grad_norm": 0.04711559787392616, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5606247782707214, | |
| "mean_token_accuracy": 0.7760322690010071, | |
| "num_tokens": 800292.0, | |
| "step": 49 | |
| }, | |
| { | |
| "entropy": 0.5671460330486298, | |
| "epoch": 0.18674136321195145, | |
| "grad_norm": 0.04404276981949806, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5589705109596252, | |
| "mean_token_accuracy": 0.7788618206977844, | |
| "num_tokens": 816651.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 0.5850909501314163, | |
| "epoch": 0.19047619047619047, | |
| "grad_norm": 0.04509448632597923, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5727294683456421, | |
| "mean_token_accuracy": 0.7689620703458786, | |
| "num_tokens": 833150.0, | |
| "step": 51 | |
| }, | |
| { | |
| "entropy": 0.585056334733963, | |
| "epoch": 0.1942110177404295, | |
| "grad_norm": 0.04984965920448303, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5666245818138123, | |
| "mean_token_accuracy": 0.771300658583641, | |
| "num_tokens": 849637.0, | |
| "step": 52 | |
| }, | |
| { | |
| "entropy": 0.5864798873662949, | |
| "epoch": 0.19794584500466852, | |
| "grad_norm": 0.03626571223139763, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5745272636413574, | |
| "mean_token_accuracy": 0.7683106511831284, | |
| "num_tokens": 865989.0, | |
| "step": 53 | |
| }, | |
| { | |
| "entropy": 0.5780556201934814, | |
| "epoch": 0.20168067226890757, | |
| "grad_norm": 0.043707672506570816, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5741198062896729, | |
| "mean_token_accuracy": 0.7700863778591156, | |
| "num_tokens": 882298.0, | |
| "step": 54 | |
| }, | |
| { | |
| "entropy": 0.5698854774236679, | |
| "epoch": 0.20541549953314658, | |
| "grad_norm": 0.04839429631829262, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5747280120849609, | |
| "mean_token_accuracy": 0.7678831219673157, | |
| "num_tokens": 898608.0, | |
| "step": 55 | |
| }, | |
| { | |
| "entropy": 0.5627169758081436, | |
| "epoch": 0.20915032679738563, | |
| "grad_norm": 0.04472200199961662, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5670843124389648, | |
| "mean_token_accuracy": 0.7717523276805878, | |
| "num_tokens": 914851.0, | |
| "step": 56 | |
| }, | |
| { | |
| "entropy": 0.5779636800289154, | |
| "epoch": 0.21288515406162464, | |
| "grad_norm": 0.040940672159194946, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5778319239616394, | |
| "mean_token_accuracy": 0.7675311863422394, | |
| "num_tokens": 931487.0, | |
| "step": 57 | |
| }, | |
| { | |
| "entropy": 0.563320592045784, | |
| "epoch": 0.2166199813258637, | |
| "grad_norm": 0.0448877178132534, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5575067400932312, | |
| "mean_token_accuracy": 0.7765846252441406, | |
| "num_tokens": 947878.0, | |
| "step": 58 | |
| }, | |
| { | |
| "entropy": 0.6058623939752579, | |
| "epoch": 0.2203548085901027, | |
| "grad_norm": 0.04985905811190605, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6082996726036072, | |
| "mean_token_accuracy": 0.7539926767349243, | |
| "num_tokens": 964324.0, | |
| "step": 59 | |
| }, | |
| { | |
| "entropy": 0.5625719428062439, | |
| "epoch": 0.22408963585434175, | |
| "grad_norm": 0.038407351821660995, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5598542094230652, | |
| "mean_token_accuracy": 0.7735666781663895, | |
| "num_tokens": 980437.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 0.5738561451435089, | |
| "epoch": 0.22782446311858076, | |
| "grad_norm": 0.04555477574467659, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5709559917449951, | |
| "mean_token_accuracy": 0.7690570503473282, | |
| "num_tokens": 996568.0, | |
| "step": 61 | |
| }, | |
| { | |
| "entropy": 0.5673829317092896, | |
| "epoch": 0.2315592903828198, | |
| "grad_norm": 0.04602229222655296, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5713279843330383, | |
| "mean_token_accuracy": 0.7713401615619659, | |
| "num_tokens": 1012870.0, | |
| "step": 62 | |
| }, | |
| { | |
| "entropy": 0.5517095476388931, | |
| "epoch": 0.23529411764705882, | |
| "grad_norm": 0.043136853724718094, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5557603240013123, | |
| "mean_token_accuracy": 0.7759266495704651, | |
| "num_tokens": 1029066.0, | |
| "step": 63 | |
| }, | |
| { | |
| "entropy": 0.5658771097660065, | |
| "epoch": 0.23902894491129786, | |
| "grad_norm": 0.04121146723628044, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5609080791473389, | |
| "mean_token_accuracy": 0.7747898399829865, | |
| "num_tokens": 1045590.0, | |
| "step": 64 | |
| }, | |
| { | |
| "entropy": 0.549357607960701, | |
| "epoch": 0.24276377217553688, | |
| "grad_norm": 0.044083524495363235, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459793210029602, | |
| "mean_token_accuracy": 0.7811493426561356, | |
| "num_tokens": 1061874.0, | |
| "step": 65 | |
| }, | |
| { | |
| "entropy": 0.5573842078447342, | |
| "epoch": 0.24649859943977592, | |
| "grad_norm": 0.04087769240140915, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5592548847198486, | |
| "mean_token_accuracy": 0.775547593832016, | |
| "num_tokens": 1078103.0, | |
| "step": 66 | |
| }, | |
| { | |
| "entropy": 0.5658538043498993, | |
| "epoch": 0.25023342670401494, | |
| "grad_norm": 0.03777799755334854, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5519559979438782, | |
| "mean_token_accuracy": 0.776710718870163, | |
| "num_tokens": 1094650.0, | |
| "step": 67 | |
| }, | |
| { | |
| "entropy": 0.583881214261055, | |
| "epoch": 0.25396825396825395, | |
| "grad_norm": 0.044072795659303665, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5742916464805603, | |
| "mean_token_accuracy": 0.7709541469812393, | |
| "num_tokens": 1110961.0, | |
| "step": 68 | |
| }, | |
| { | |
| "entropy": 0.5731556266546249, | |
| "epoch": 0.25770308123249297, | |
| "grad_norm": 0.045354213565588, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5748150944709778, | |
| "mean_token_accuracy": 0.7677215486764908, | |
| "num_tokens": 1127571.0, | |
| "step": 69 | |
| }, | |
| { | |
| "entropy": 0.5605138093233109, | |
| "epoch": 0.26143790849673204, | |
| "grad_norm": 0.03672546148300171, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5605238080024719, | |
| "mean_token_accuracy": 0.7723149508237839, | |
| "num_tokens": 1143932.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 0.5381516218185425, | |
| "epoch": 0.26517273576097106, | |
| "grad_norm": 0.04045504331588745, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5391750335693359, | |
| "mean_token_accuracy": 0.7822330445051193, | |
| "num_tokens": 1159972.0, | |
| "step": 71 | |
| }, | |
| { | |
| "entropy": 0.5469133257865906, | |
| "epoch": 0.2689075630252101, | |
| "grad_norm": 0.03917838633060455, | |
| "learning_rate": 0.0002, | |
| "loss": 0.552070140838623, | |
| "mean_token_accuracy": 0.776424303650856, | |
| "num_tokens": 1176122.0, | |
| "step": 72 | |
| }, | |
| { | |
| "entropy": 0.5674256831407547, | |
| "epoch": 0.2726423902894491, | |
| "grad_norm": 0.0378127247095108, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5667495727539062, | |
| "mean_token_accuracy": 0.7705131769180298, | |
| "num_tokens": 1192483.0, | |
| "step": 73 | |
| }, | |
| { | |
| "entropy": 0.568048432469368, | |
| "epoch": 0.27637721755368816, | |
| "grad_norm": 0.035798948258161545, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5668107867240906, | |
| "mean_token_accuracy": 0.7710251212120056, | |
| "num_tokens": 1209110.0, | |
| "step": 74 | |
| }, | |
| { | |
| "entropy": 0.5850978642702103, | |
| "epoch": 0.2801120448179272, | |
| "grad_norm": 0.03812864422798157, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5801389217376709, | |
| "mean_token_accuracy": 0.7685801237821579, | |
| "num_tokens": 1225656.0, | |
| "step": 75 | |
| }, | |
| { | |
| "entropy": 0.5744365155696869, | |
| "epoch": 0.2838468720821662, | |
| "grad_norm": 0.03252263367176056, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5715938806533813, | |
| "mean_token_accuracy": 0.7678718268871307, | |
| "num_tokens": 1241986.0, | |
| "step": 76 | |
| }, | |
| { | |
| "entropy": 0.5737413763999939, | |
| "epoch": 0.2875816993464052, | |
| "grad_norm": 0.03566081449389458, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5768669843673706, | |
| "mean_token_accuracy": 0.768094465136528, | |
| "num_tokens": 1258437.0, | |
| "step": 77 | |
| }, | |
| { | |
| "entropy": 0.5403539538383484, | |
| "epoch": 0.2913165266106443, | |
| "grad_norm": 0.03335001692175865, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5388357639312744, | |
| "mean_token_accuracy": 0.7831095159053802, | |
| "num_tokens": 1274706.0, | |
| "step": 78 | |
| }, | |
| { | |
| "entropy": 0.5797998905181885, | |
| "epoch": 0.2950513538748833, | |
| "grad_norm": 0.036791976541280746, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5749024152755737, | |
| "mean_token_accuracy": 0.7673221081495285, | |
| "num_tokens": 1291375.0, | |
| "step": 79 | |
| }, | |
| { | |
| "entropy": 0.5663541257381439, | |
| "epoch": 0.2987861811391223, | |
| "grad_norm": 0.04374934732913971, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5602323412895203, | |
| "mean_token_accuracy": 0.7732456177473068, | |
| "num_tokens": 1307621.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 0.5841106921434402, | |
| "epoch": 0.3025210084033613, | |
| "grad_norm": 0.03585761412978172, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5774515271186829, | |
| "mean_token_accuracy": 0.7695471197366714, | |
| "num_tokens": 1324292.0, | |
| "step": 81 | |
| }, | |
| { | |
| "entropy": 0.5769794583320618, | |
| "epoch": 0.3062558356676004, | |
| "grad_norm": 0.032680612057447433, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5758101940155029, | |
| "mean_token_accuracy": 0.7648481875658035, | |
| "num_tokens": 1340714.0, | |
| "step": 82 | |
| }, | |
| { | |
| "entropy": 0.557876318693161, | |
| "epoch": 0.3099906629318394, | |
| "grad_norm": 0.036271534860134125, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5576061010360718, | |
| "mean_token_accuracy": 0.7769448161125183, | |
| "num_tokens": 1357063.0, | |
| "step": 83 | |
| }, | |
| { | |
| "entropy": 0.5480719208717346, | |
| "epoch": 0.3137254901960784, | |
| "grad_norm": 0.04093662649393082, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5554815530776978, | |
| "mean_token_accuracy": 0.7730589210987091, | |
| "num_tokens": 1373048.0, | |
| "step": 84 | |
| }, | |
| { | |
| "entropy": 0.5651550590991974, | |
| "epoch": 0.31746031746031744, | |
| "grad_norm": 0.03605310246348381, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5752359628677368, | |
| "mean_token_accuracy": 0.767627626657486, | |
| "num_tokens": 1389533.0, | |
| "step": 85 | |
| }, | |
| { | |
| "entropy": 0.5644277483224869, | |
| "epoch": 0.3211951447245565, | |
| "grad_norm": 0.03757842630147934, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5678563117980957, | |
| "mean_token_accuracy": 0.7691835910081863, | |
| "num_tokens": 1406026.0, | |
| "step": 86 | |
| }, | |
| { | |
| "entropy": 0.5682397186756134, | |
| "epoch": 0.32492997198879553, | |
| "grad_norm": 0.033709567040205, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5628086924552917, | |
| "mean_token_accuracy": 0.7722707390785217, | |
| "num_tokens": 1422562.0, | |
| "step": 87 | |
| }, | |
| { | |
| "entropy": 0.5635691732168198, | |
| "epoch": 0.32866479925303455, | |
| "grad_norm": 0.03606971353292465, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5536225438117981, | |
| "mean_token_accuracy": 0.7781998217105865, | |
| "num_tokens": 1438929.0, | |
| "step": 88 | |
| }, | |
| { | |
| "entropy": 0.5673100650310516, | |
| "epoch": 0.33239962651727356, | |
| "grad_norm": 0.03673219308257103, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5621542930603027, | |
| "mean_token_accuracy": 0.7736853212118149, | |
| "num_tokens": 1455379.0, | |
| "step": 89 | |
| }, | |
| { | |
| "entropy": 0.5614307522773743, | |
| "epoch": 0.33613445378151263, | |
| "grad_norm": 0.037591755390167236, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5566410422325134, | |
| "mean_token_accuracy": 0.7733979523181915, | |
| "num_tokens": 1471484.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 0.5533501952886581, | |
| "epoch": 0.33986928104575165, | |
| "grad_norm": 0.03392329066991806, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5534408092498779, | |
| "mean_token_accuracy": 0.7756673395633698, | |
| "num_tokens": 1487940.0, | |
| "step": 91 | |
| }, | |
| { | |
| "entropy": 0.5670682638883591, | |
| "epoch": 0.34360410830999066, | |
| "grad_norm": 0.038744084537029266, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5757073760032654, | |
| "mean_token_accuracy": 0.7674537003040314, | |
| "num_tokens": 1504516.0, | |
| "step": 92 | |
| }, | |
| { | |
| "entropy": 0.5437405109405518, | |
| "epoch": 0.3473389355742297, | |
| "grad_norm": 0.03382673114538193, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5484196543693542, | |
| "mean_token_accuracy": 0.7756420075893402, | |
| "num_tokens": 1520914.0, | |
| "step": 93 | |
| }, | |
| { | |
| "entropy": 0.5495916306972504, | |
| "epoch": 0.3510737628384687, | |
| "grad_norm": 0.03743721917271614, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5565813183784485, | |
| "mean_token_accuracy": 0.7735388725996017, | |
| "num_tokens": 1537124.0, | |
| "step": 94 | |
| }, | |
| { | |
| "entropy": 0.568208858370781, | |
| "epoch": 0.35480859010270777, | |
| "grad_norm": 0.03229435160756111, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5690167546272278, | |
| "mean_token_accuracy": 0.7696976512670517, | |
| "num_tokens": 1553562.0, | |
| "step": 95 | |
| }, | |
| { | |
| "entropy": 0.5612770318984985, | |
| "epoch": 0.3585434173669468, | |
| "grad_norm": 0.03424388915300369, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5587109923362732, | |
| "mean_token_accuracy": 0.774835541844368, | |
| "num_tokens": 1569896.0, | |
| "step": 96 | |
| }, | |
| { | |
| "entropy": 0.5718783587217331, | |
| "epoch": 0.3622782446311858, | |
| "grad_norm": 0.033101778477430344, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5643482208251953, | |
| "mean_token_accuracy": 0.7721461206674576, | |
| "num_tokens": 1586284.0, | |
| "step": 97 | |
| }, | |
| { | |
| "entropy": 0.5654337555170059, | |
| "epoch": 0.3660130718954248, | |
| "grad_norm": 0.035547658801078796, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5555263757705688, | |
| "mean_token_accuracy": 0.7783078551292419, | |
| "num_tokens": 1602584.0, | |
| "step": 98 | |
| }, | |
| { | |
| "entropy": 0.5639571994543076, | |
| "epoch": 0.3697478991596639, | |
| "grad_norm": 0.03868361935019493, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5630732178688049, | |
| "mean_token_accuracy": 0.773595780134201, | |
| "num_tokens": 1618810.0, | |
| "step": 99 | |
| }, | |
| { | |
| "entropy": 0.568704292178154, | |
| "epoch": 0.3734827264239029, | |
| "grad_norm": 0.03236787021160126, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5669816732406616, | |
| "mean_token_accuracy": 0.7704071253538132, | |
| "num_tokens": 1635290.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.551744356751442, | |
| "epoch": 0.3772175536881419, | |
| "grad_norm": 0.03913586586713791, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5576678514480591, | |
| "mean_token_accuracy": 0.7771230936050415, | |
| "num_tokens": 1651818.0, | |
| "step": 101 | |
| }, | |
| { | |
| "entropy": 0.5260472893714905, | |
| "epoch": 0.38095238095238093, | |
| "grad_norm": 0.035290028899908066, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5295023918151855, | |
| "mean_token_accuracy": 0.7862183749675751, | |
| "num_tokens": 1668252.0, | |
| "step": 102 | |
| }, | |
| { | |
| "entropy": 0.5585302114486694, | |
| "epoch": 0.38468720821662, | |
| "grad_norm": 0.03497280925512314, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5631093978881836, | |
| "mean_token_accuracy": 0.7744487524032593, | |
| "num_tokens": 1684730.0, | |
| "step": 103 | |
| }, | |
| { | |
| "entropy": 0.5317506641149521, | |
| "epoch": 0.388422035480859, | |
| "grad_norm": 0.038267575204372406, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5366777777671814, | |
| "mean_token_accuracy": 0.7837612628936768, | |
| "num_tokens": 1700724.0, | |
| "step": 104 | |
| }, | |
| { | |
| "entropy": 0.5369188189506531, | |
| "epoch": 0.39215686274509803, | |
| "grad_norm": 0.03429935500025749, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5283028483390808, | |
| "mean_token_accuracy": 0.7885325402021408, | |
| "num_tokens": 1717105.0, | |
| "step": 105 | |
| }, | |
| { | |
| "entropy": 0.5693536698818207, | |
| "epoch": 0.39589169000933705, | |
| "grad_norm": 0.038153599947690964, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5606598258018494, | |
| "mean_token_accuracy": 0.7737682908773422, | |
| "num_tokens": 1733363.0, | |
| "step": 106 | |
| }, | |
| { | |
| "entropy": 0.5737781524658203, | |
| "epoch": 0.3996265172735761, | |
| "grad_norm": 0.034137699753046036, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5676036477088928, | |
| "mean_token_accuracy": 0.7725923210382462, | |
| "num_tokens": 1749928.0, | |
| "step": 107 | |
| }, | |
| { | |
| "entropy": 0.5680664926767349, | |
| "epoch": 0.40336134453781514, | |
| "grad_norm": 0.035801518708467484, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5669195055961609, | |
| "mean_token_accuracy": 0.7720014601945877, | |
| "num_tokens": 1766520.0, | |
| "step": 108 | |
| }, | |
| { | |
| "entropy": 0.5640780180692673, | |
| "epoch": 0.40709617180205415, | |
| "grad_norm": 0.036836352199316025, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5703918933868408, | |
| "mean_token_accuracy": 0.7716377079486847, | |
| "num_tokens": 1783002.0, | |
| "step": 109 | |
| }, | |
| { | |
| "entropy": 0.554967850446701, | |
| "epoch": 0.41083099906629317, | |
| "grad_norm": 0.03882612660527229, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5642282962799072, | |
| "mean_token_accuracy": 0.7699488997459412, | |
| "num_tokens": 1799237.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 0.5514571368694305, | |
| "epoch": 0.41456582633053224, | |
| "grad_norm": 0.03324515372514725, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5484537482261658, | |
| "mean_token_accuracy": 0.7782372832298279, | |
| "num_tokens": 1815769.0, | |
| "step": 111 | |
| }, | |
| { | |
| "entropy": 0.573599174618721, | |
| "epoch": 0.41830065359477125, | |
| "grad_norm": 0.03034473955631256, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5679251551628113, | |
| "mean_token_accuracy": 0.7719407975673676, | |
| "num_tokens": 1831989.0, | |
| "step": 112 | |
| }, | |
| { | |
| "entropy": 0.5896201282739639, | |
| "epoch": 0.42203548085901027, | |
| "grad_norm": 0.03557023033499718, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5836873054504395, | |
| "mean_token_accuracy": 0.7634387165307999, | |
| "num_tokens": 1848590.0, | |
| "step": 113 | |
| }, | |
| { | |
| "entropy": 0.5535563677549362, | |
| "epoch": 0.4257703081232493, | |
| "grad_norm": 0.032203588634729385, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5510682463645935, | |
| "mean_token_accuracy": 0.7764001041650772, | |
| "num_tokens": 1864862.0, | |
| "step": 114 | |
| }, | |
| { | |
| "entropy": 0.5557997226715088, | |
| "epoch": 0.4295051353874883, | |
| "grad_norm": 0.033370040357112885, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5584062933921814, | |
| "mean_token_accuracy": 0.7749063074588776, | |
| "num_tokens": 1881168.0, | |
| "step": 115 | |
| }, | |
| { | |
| "entropy": 0.5543448776006699, | |
| "epoch": 0.4332399626517274, | |
| "grad_norm": 0.030230488628149033, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5530171990394592, | |
| "mean_token_accuracy": 0.7758816778659821, | |
| "num_tokens": 1897482.0, | |
| "step": 116 | |
| }, | |
| { | |
| "entropy": 0.5602561086416245, | |
| "epoch": 0.4369747899159664, | |
| "grad_norm": 0.03355773538351059, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5631951093673706, | |
| "mean_token_accuracy": 0.7723173201084137, | |
| "num_tokens": 1913520.0, | |
| "step": 117 | |
| }, | |
| { | |
| "entropy": 0.5448198318481445, | |
| "epoch": 0.4407096171802054, | |
| "grad_norm": 0.03538920357823372, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5498956441879272, | |
| "mean_token_accuracy": 0.7779627591371536, | |
| "num_tokens": 1929827.0, | |
| "step": 118 | |
| }, | |
| { | |
| "entropy": 0.5492925643920898, | |
| "epoch": 0.4444444444444444, | |
| "grad_norm": 0.03334996476769447, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5524949431419373, | |
| "mean_token_accuracy": 0.7753683775663376, | |
| "num_tokens": 1946145.0, | |
| "step": 119 | |
| }, | |
| { | |
| "entropy": 0.5578335374593735, | |
| "epoch": 0.4481792717086835, | |
| "grad_norm": 0.029814472422003746, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5506975650787354, | |
| "mean_token_accuracy": 0.7767714560031891, | |
| "num_tokens": 1962460.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 0.5471834242343903, | |
| "epoch": 0.4519140989729225, | |
| "grad_norm": 0.030702516436576843, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459597110748291, | |
| "mean_token_accuracy": 0.7779918015003204, | |
| "num_tokens": 1978468.0, | |
| "step": 121 | |
| }, | |
| { | |
| "entropy": 0.5746940076351166, | |
| "epoch": 0.4556489262371615, | |
| "grad_norm": 0.028086913749575615, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5758755207061768, | |
| "mean_token_accuracy": 0.766986295580864, | |
| "num_tokens": 1994816.0, | |
| "step": 122 | |
| }, | |
| { | |
| "entropy": 0.5609753727912903, | |
| "epoch": 0.45938375350140054, | |
| "grad_norm": 0.027476167306303978, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5596047639846802, | |
| "mean_token_accuracy": 0.7727872580289841, | |
| "num_tokens": 2011498.0, | |
| "step": 123 | |
| }, | |
| { | |
| "entropy": 0.5600833296775818, | |
| "epoch": 0.4631185807656396, | |
| "grad_norm": 0.03369581326842308, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5641721487045288, | |
| "mean_token_accuracy": 0.7693867385387421, | |
| "num_tokens": 2027843.0, | |
| "step": 124 | |
| }, | |
| { | |
| "entropy": 0.5480703115463257, | |
| "epoch": 0.4668534080298786, | |
| "grad_norm": 0.029643159359693527, | |
| "learning_rate": 0.0002, | |
| "loss": 0.554192841053009, | |
| "mean_token_accuracy": 0.7775781005620956, | |
| "num_tokens": 2044099.0, | |
| "step": 125 | |
| }, | |
| { | |
| "entropy": 0.5571865439414978, | |
| "epoch": 0.47058823529411764, | |
| "grad_norm": 0.032963886857032776, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5603472590446472, | |
| "mean_token_accuracy": 0.7727210968732834, | |
| "num_tokens": 2060417.0, | |
| "step": 126 | |
| }, | |
| { | |
| "entropy": 0.5587971061468124, | |
| "epoch": 0.47432306255835666, | |
| "grad_norm": 0.028774971142411232, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5552476644515991, | |
| "mean_token_accuracy": 0.7738739997148514, | |
| "num_tokens": 2076710.0, | |
| "step": 127 | |
| }, | |
| { | |
| "entropy": 0.5658144652843475, | |
| "epoch": 0.4780578898225957, | |
| "grad_norm": 0.03230098634958267, | |
| "learning_rate": 0.0002, | |
| "loss": 0.557459831237793, | |
| "mean_token_accuracy": 0.7754161208868027, | |
| "num_tokens": 2093196.0, | |
| "step": 128 | |
| }, | |
| { | |
| "entropy": 0.5515187084674835, | |
| "epoch": 0.48179271708683474, | |
| "grad_norm": 0.03461001068353653, | |
| "learning_rate": 0.0002, | |
| "loss": 0.547848641872406, | |
| "mean_token_accuracy": 0.7798665314912796, | |
| "num_tokens": 2109091.0, | |
| "step": 129 | |
| }, | |
| { | |
| "entropy": 0.5527725219726562, | |
| "epoch": 0.48552754435107376, | |
| "grad_norm": 0.03391197323799133, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5531637072563171, | |
| "mean_token_accuracy": 0.7753576338291168, | |
| "num_tokens": 2125292.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 0.5310224145650864, | |
| "epoch": 0.4892623716153128, | |
| "grad_norm": 0.037288419902324677, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5368673801422119, | |
| "mean_token_accuracy": 0.7833587974309921, | |
| "num_tokens": 2141768.0, | |
| "step": 131 | |
| }, | |
| { | |
| "entropy": 0.5471584349870682, | |
| "epoch": 0.49299719887955185, | |
| "grad_norm": 0.03433871641755104, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5525721907615662, | |
| "mean_token_accuracy": 0.776105523109436, | |
| "num_tokens": 2158143.0, | |
| "step": 132 | |
| }, | |
| { | |
| "entropy": 0.5587402433156967, | |
| "epoch": 0.49673202614379086, | |
| "grad_norm": 0.03347739949822426, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5661599636077881, | |
| "mean_token_accuracy": 0.7718635648488998, | |
| "num_tokens": 2174416.0, | |
| "step": 133 | |
| }, | |
| { | |
| "entropy": 0.5683765709400177, | |
| "epoch": 0.5004668534080299, | |
| "grad_norm": 0.03381507471203804, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5622847080230713, | |
| "mean_token_accuracy": 0.7744656354188919, | |
| "num_tokens": 2190880.0, | |
| "step": 134 | |
| }, | |
| { | |
| "entropy": 0.5644540786743164, | |
| "epoch": 0.5042016806722689, | |
| "grad_norm": 0.03272015228867531, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5552080869674683, | |
| "mean_token_accuracy": 0.7752301692962646, | |
| "num_tokens": 2207174.0, | |
| "step": 135 | |
| }, | |
| { | |
| "entropy": 0.5678849667310715, | |
| "epoch": 0.5079365079365079, | |
| "grad_norm": 0.031616441905498505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5582877993583679, | |
| "mean_token_accuracy": 0.7729764580726624, | |
| "num_tokens": 2223657.0, | |
| "step": 136 | |
| }, | |
| { | |
| "entropy": 0.560051366686821, | |
| "epoch": 0.5116713352007469, | |
| "grad_norm": 0.03558259457349777, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5536358952522278, | |
| "mean_token_accuracy": 0.7764490097761154, | |
| "num_tokens": 2239931.0, | |
| "step": 137 | |
| }, | |
| { | |
| "entropy": 0.5550469309091568, | |
| "epoch": 0.5154061624649859, | |
| "grad_norm": 0.034295059740543365, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5614034533500671, | |
| "mean_token_accuracy": 0.7718400210142136, | |
| "num_tokens": 2256301.0, | |
| "step": 138 | |
| }, | |
| { | |
| "entropy": 0.5675243437290192, | |
| "epoch": 0.5191409897292251, | |
| "grad_norm": 0.03538001328706741, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5784004926681519, | |
| "mean_token_accuracy": 0.7684118300676346, | |
| "num_tokens": 2272718.0, | |
| "step": 139 | |
| }, | |
| { | |
| "entropy": 0.5533763766288757, | |
| "epoch": 0.5228758169934641, | |
| "grad_norm": 0.034997887909412384, | |
| "learning_rate": 0.0002, | |
| "loss": 0.563084602355957, | |
| "mean_token_accuracy": 0.7709241509437561, | |
| "num_tokens": 2289039.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 0.5602118372917175, | |
| "epoch": 0.5266106442577031, | |
| "grad_norm": 0.033439598977565765, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5643538236618042, | |
| "mean_token_accuracy": 0.7725736945867538, | |
| "num_tokens": 2305409.0, | |
| "step": 141 | |
| }, | |
| { | |
| "entropy": 0.572220578789711, | |
| "epoch": 0.5303454715219421, | |
| "grad_norm": 0.02899010293185711, | |
| "learning_rate": 0.0002, | |
| "loss": 0.56317138671875, | |
| "mean_token_accuracy": 0.7727230340242386, | |
| "num_tokens": 2321812.0, | |
| "step": 142 | |
| }, | |
| { | |
| "entropy": 0.5518327206373215, | |
| "epoch": 0.5340802987861811, | |
| "grad_norm": 0.03380458429455757, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5400616526603699, | |
| "mean_token_accuracy": 0.7813573479652405, | |
| "num_tokens": 2338293.0, | |
| "step": 143 | |
| }, | |
| { | |
| "entropy": 0.59617879986763, | |
| "epoch": 0.5378151260504201, | |
| "grad_norm": 0.03466860204935074, | |
| "learning_rate": 0.0002, | |
| "loss": 0.58748859167099, | |
| "mean_token_accuracy": 0.7642232924699783, | |
| "num_tokens": 2354694.0, | |
| "step": 144 | |
| }, | |
| { | |
| "entropy": 0.5574633181095123, | |
| "epoch": 0.5415499533146592, | |
| "grad_norm": 0.030799690634012222, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5586976408958435, | |
| "mean_token_accuracy": 0.774814635515213, | |
| "num_tokens": 2370998.0, | |
| "step": 145 | |
| }, | |
| { | |
| "entropy": 0.5298123508691788, | |
| "epoch": 0.5452847805788982, | |
| "grad_norm": 0.032734956592321396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5359174609184265, | |
| "mean_token_accuracy": 0.782838299870491, | |
| "num_tokens": 2387173.0, | |
| "step": 146 | |
| }, | |
| { | |
| "entropy": 0.5436026155948639, | |
| "epoch": 0.5490196078431373, | |
| "grad_norm": 0.03734711930155754, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5544965267181396, | |
| "mean_token_accuracy": 0.7772063612937927, | |
| "num_tokens": 2403457.0, | |
| "step": 147 | |
| }, | |
| { | |
| "entropy": 0.5453614443540573, | |
| "epoch": 0.5527544351073763, | |
| "grad_norm": 0.030067089945077896, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5510781407356262, | |
| "mean_token_accuracy": 0.7755871117115021, | |
| "num_tokens": 2419735.0, | |
| "step": 148 | |
| }, | |
| { | |
| "entropy": 0.55818210542202, | |
| "epoch": 0.5564892623716153, | |
| "grad_norm": 0.02786589413881302, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5563390851020813, | |
| "mean_token_accuracy": 0.7738417237997055, | |
| "num_tokens": 2436098.0, | |
| "step": 149 | |
| }, | |
| { | |
| "entropy": 0.5619741082191467, | |
| "epoch": 0.5602240896358543, | |
| "grad_norm": 0.030777357518672943, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5554664134979248, | |
| "mean_token_accuracy": 0.7789015769958496, | |
| "num_tokens": 2452471.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 0.5570534616708755, | |
| "epoch": 0.5639589169000934, | |
| "grad_norm": 0.03233370929956436, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5482333898544312, | |
| "mean_token_accuracy": 0.7772232443094254, | |
| "num_tokens": 2468628.0, | |
| "step": 151 | |
| }, | |
| { | |
| "entropy": 0.5588962733745575, | |
| "epoch": 0.5676937441643324, | |
| "grad_norm": 0.03047763742506504, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5532917380332947, | |
| "mean_token_accuracy": 0.7753781825304031, | |
| "num_tokens": 2485072.0, | |
| "step": 152 | |
| }, | |
| { | |
| "entropy": 0.549691841006279, | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 0.02944052591919899, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5515119433403015, | |
| "mean_token_accuracy": 0.7769780606031418, | |
| "num_tokens": 2501327.0, | |
| "step": 153 | |
| }, | |
| { | |
| "entropy": 0.5404879450798035, | |
| "epoch": 0.5751633986928104, | |
| "grad_norm": 0.032262854278087616, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5476431846618652, | |
| "mean_token_accuracy": 0.7793239504098892, | |
| "num_tokens": 2517799.0, | |
| "step": 154 | |
| }, | |
| { | |
| "entropy": 0.5289865881204605, | |
| "epoch": 0.5788982259570495, | |
| "grad_norm": 0.03042609617114067, | |
| "learning_rate": 0.0002, | |
| "loss": 0.531823992729187, | |
| "mean_token_accuracy": 0.7862056195735931, | |
| "num_tokens": 2534300.0, | |
| "step": 155 | |
| }, | |
| { | |
| "entropy": 0.5359181612730026, | |
| "epoch": 0.5826330532212886, | |
| "grad_norm": 0.030735395848751068, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5355162024497986, | |
| "mean_token_accuracy": 0.7830311506986618, | |
| "num_tokens": 2550561.0, | |
| "step": 156 | |
| }, | |
| { | |
| "entropy": 0.555221676826477, | |
| "epoch": 0.5863678804855276, | |
| "grad_norm": 0.03072836995124817, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5626713037490845, | |
| "mean_token_accuracy": 0.7714420855045319, | |
| "num_tokens": 2566961.0, | |
| "step": 157 | |
| }, | |
| { | |
| "entropy": 0.553142175078392, | |
| "epoch": 0.5901027077497666, | |
| "grad_norm": 0.030098870396614075, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5467352867126465, | |
| "mean_token_accuracy": 0.7787252068519592, | |
| "num_tokens": 2583507.0, | |
| "step": 158 | |
| }, | |
| { | |
| "entropy": 0.5665386617183685, | |
| "epoch": 0.5938375350140056, | |
| "grad_norm": 0.03258649259805679, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5577669143676758, | |
| "mean_token_accuracy": 0.7736402750015259, | |
| "num_tokens": 2599944.0, | |
| "step": 159 | |
| }, | |
| { | |
| "entropy": 0.5569501370191574, | |
| "epoch": 0.5975723622782446, | |
| "grad_norm": 0.03186054900288582, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5573895573616028, | |
| "mean_token_accuracy": 0.776360809803009, | |
| "num_tokens": 2616293.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 0.5284514650702477, | |
| "epoch": 0.6013071895424836, | |
| "grad_norm": 0.029392873868346214, | |
| "learning_rate": 0.0002, | |
| "loss": 0.53591388463974, | |
| "mean_token_accuracy": 0.7802938669919968, | |
| "num_tokens": 2632542.0, | |
| "step": 161 | |
| }, | |
| { | |
| "entropy": 0.5517806857824326, | |
| "epoch": 0.6050420168067226, | |
| "grad_norm": 0.03547659516334534, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5624344348907471, | |
| "mean_token_accuracy": 0.7713066786527634, | |
| "num_tokens": 2648855.0, | |
| "step": 162 | |
| }, | |
| { | |
| "entropy": 0.5444875061511993, | |
| "epoch": 0.6087768440709617, | |
| "grad_norm": 0.032323673367500305, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5506078004837036, | |
| "mean_token_accuracy": 0.7763939499855042, | |
| "num_tokens": 2665389.0, | |
| "step": 163 | |
| }, | |
| { | |
| "entropy": 0.552508682012558, | |
| "epoch": 0.6125116713352008, | |
| "grad_norm": 0.029938260093331337, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5556696653366089, | |
| "mean_token_accuracy": 0.774255782365799, | |
| "num_tokens": 2681574.0, | |
| "step": 164 | |
| }, | |
| { | |
| "entropy": 0.5732054561376572, | |
| "epoch": 0.6162464985994398, | |
| "grad_norm": 0.027899837121367455, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5643041133880615, | |
| "mean_token_accuracy": 0.7738403379917145, | |
| "num_tokens": 2697956.0, | |
| "step": 165 | |
| }, | |
| { | |
| "entropy": 0.5575381070375443, | |
| "epoch": 0.6199813258636788, | |
| "grad_norm": 0.03164415806531906, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456005930900574, | |
| "mean_token_accuracy": 0.7768769711256027, | |
| "num_tokens": 2714390.0, | |
| "step": 166 | |
| }, | |
| { | |
| "entropy": 0.5516810864210129, | |
| "epoch": 0.6237161531279178, | |
| "grad_norm": 0.02569694072008133, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5495009422302246, | |
| "mean_token_accuracy": 0.774631917476654, | |
| "num_tokens": 2730912.0, | |
| "step": 167 | |
| }, | |
| { | |
| "entropy": 0.5496233999729156, | |
| "epoch": 0.6274509803921569, | |
| "grad_norm": 0.03019907884299755, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5496887564659119, | |
| "mean_token_accuracy": 0.7817335277795792, | |
| "num_tokens": 2747282.0, | |
| "step": 168 | |
| }, | |
| { | |
| "entropy": 0.5489860326051712, | |
| "epoch": 0.6311858076563959, | |
| "grad_norm": 0.03389516472816467, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5572369694709778, | |
| "mean_token_accuracy": 0.7735096365213394, | |
| "num_tokens": 2763708.0, | |
| "step": 169 | |
| }, | |
| { | |
| "entropy": 0.5558005720376968, | |
| "epoch": 0.6349206349206349, | |
| "grad_norm": 0.02765459194779396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5571833848953247, | |
| "mean_token_accuracy": 0.7726074606180191, | |
| "num_tokens": 2780084.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 0.5543476939201355, | |
| "epoch": 0.6386554621848739, | |
| "grad_norm": 0.0267086960375309, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5579585433006287, | |
| "mean_token_accuracy": 0.7720465064048767, | |
| "num_tokens": 2796592.0, | |
| "step": 171 | |
| }, | |
| { | |
| "entropy": 0.5531720370054245, | |
| "epoch": 0.642390289449113, | |
| "grad_norm": 0.03003924898803234, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5539361238479614, | |
| "mean_token_accuracy": 0.7745767682790756, | |
| "num_tokens": 2813004.0, | |
| "step": 172 | |
| }, | |
| { | |
| "entropy": 0.5696417987346649, | |
| "epoch": 0.646125116713352, | |
| "grad_norm": 0.030649833381175995, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5720299482345581, | |
| "mean_token_accuracy": 0.7685467600822449, | |
| "num_tokens": 2829346.0, | |
| "step": 173 | |
| }, | |
| { | |
| "entropy": 0.5682009905576706, | |
| "epoch": 0.6498599439775911, | |
| "grad_norm": 0.028095850721001625, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5576902627944946, | |
| "mean_token_accuracy": 0.7762027978897095, | |
| "num_tokens": 2845908.0, | |
| "step": 174 | |
| }, | |
| { | |
| "entropy": 0.5714679658412933, | |
| "epoch": 0.6535947712418301, | |
| "grad_norm": 0.028559835627675056, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5658706426620483, | |
| "mean_token_accuracy": 0.7675664275884628, | |
| "num_tokens": 2862417.0, | |
| "step": 175 | |
| }, | |
| { | |
| "entropy": 0.5519525855779648, | |
| "epoch": 0.6573295985060691, | |
| "grad_norm": 0.034554384648799896, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5615457892417908, | |
| "mean_token_accuracy": 0.7730480134487152, | |
| "num_tokens": 2878691.0, | |
| "step": 176 | |
| }, | |
| { | |
| "entropy": 0.5469972342252731, | |
| "epoch": 0.6610644257703081, | |
| "grad_norm": 0.038470808416604996, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5615893006324768, | |
| "mean_token_accuracy": 0.7721795290708542, | |
| "num_tokens": 2894997.0, | |
| "step": 177 | |
| }, | |
| { | |
| "entropy": 0.5659243762493134, | |
| "epoch": 0.6647992530345471, | |
| "grad_norm": 0.028726449236273766, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5627461671829224, | |
| "mean_token_accuracy": 0.7720647305250168, | |
| "num_tokens": 2911504.0, | |
| "step": 178 | |
| }, | |
| { | |
| "entropy": 0.5529140681028366, | |
| "epoch": 0.6685340802987861, | |
| "grad_norm": 0.02865666151046753, | |
| "learning_rate": 0.0002, | |
| "loss": 0.551171064376831, | |
| "mean_token_accuracy": 0.7765299677848816, | |
| "num_tokens": 2927890.0, | |
| "step": 179 | |
| }, | |
| { | |
| "entropy": 0.5597221851348877, | |
| "epoch": 0.6722689075630253, | |
| "grad_norm": 0.030919602140784264, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5537790656089783, | |
| "mean_token_accuracy": 0.7759328931570053, | |
| "num_tokens": 2944242.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 0.562122106552124, | |
| "epoch": 0.6760037348272643, | |
| "grad_norm": 0.03044375404715538, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568514466285706, | |
| "mean_token_accuracy": 0.7706819474697113, | |
| "num_tokens": 2960500.0, | |
| "step": 181 | |
| }, | |
| { | |
| "entropy": 0.5697348713874817, | |
| "epoch": 0.6797385620915033, | |
| "grad_norm": 0.031796056777238846, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5688814520835876, | |
| "mean_token_accuracy": 0.7685033828020096, | |
| "num_tokens": 2976732.0, | |
| "step": 182 | |
| }, | |
| { | |
| "entropy": 0.5696271657943726, | |
| "epoch": 0.6834733893557423, | |
| "grad_norm": 0.034152235835790634, | |
| "learning_rate": 0.0002, | |
| "loss": 0.570652186870575, | |
| "mean_token_accuracy": 0.7676333039999008, | |
| "num_tokens": 2993011.0, | |
| "step": 183 | |
| }, | |
| { | |
| "entropy": 0.5509230494499207, | |
| "epoch": 0.6872082166199813, | |
| "grad_norm": 0.030170850455760956, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5528304576873779, | |
| "mean_token_accuracy": 0.7786384671926498, | |
| "num_tokens": 3009475.0, | |
| "step": 184 | |
| }, | |
| { | |
| "entropy": 0.549485370516777, | |
| "epoch": 0.6909430438842203, | |
| "grad_norm": 0.03623858466744423, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5553773045539856, | |
| "mean_token_accuracy": 0.7744152545928955, | |
| "num_tokens": 3025920.0, | |
| "step": 185 | |
| }, | |
| { | |
| "entropy": 0.5484632700681686, | |
| "epoch": 0.6946778711484594, | |
| "grad_norm": 0.033118441700935364, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5544424653053284, | |
| "mean_token_accuracy": 0.7758429795503616, | |
| "num_tokens": 3042293.0, | |
| "step": 186 | |
| }, | |
| { | |
| "entropy": 0.5471510142087936, | |
| "epoch": 0.6984126984126984, | |
| "grad_norm": 0.027027102187275887, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416866540908813, | |
| "mean_token_accuracy": 0.7816910296678543, | |
| "num_tokens": 3058771.0, | |
| "step": 187 | |
| }, | |
| { | |
| "entropy": 0.5579911917448044, | |
| "epoch": 0.7021475256769374, | |
| "grad_norm": 0.03291584923863411, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5471009016036987, | |
| "mean_token_accuracy": 0.7790512144565582, | |
| "num_tokens": 3075134.0, | |
| "step": 188 | |
| }, | |
| { | |
| "entropy": 0.5525984019041061, | |
| "epoch": 0.7058823529411765, | |
| "grad_norm": 0.029011745005846024, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5483554005622864, | |
| "mean_token_accuracy": 0.7763502299785614, | |
| "num_tokens": 3091306.0, | |
| "step": 189 | |
| }, | |
| { | |
| "entropy": 0.5610422939062119, | |
| "epoch": 0.7096171802054155, | |
| "grad_norm": 0.02904326282441616, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5617838501930237, | |
| "mean_token_accuracy": 0.7707021087408066, | |
| "num_tokens": 3107639.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 0.5382349342107773, | |
| "epoch": 0.7133520074696545, | |
| "grad_norm": 0.027915941551327705, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5406217575073242, | |
| "mean_token_accuracy": 0.7792213708162308, | |
| "num_tokens": 3123888.0, | |
| "step": 191 | |
| }, | |
| { | |
| "entropy": 0.5334387570619583, | |
| "epoch": 0.7170868347338936, | |
| "grad_norm": 0.024687422439455986, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5337969660758972, | |
| "mean_token_accuracy": 0.7827744781970978, | |
| "num_tokens": 3140136.0, | |
| "step": 192 | |
| }, | |
| { | |
| "entropy": 0.5519388318061829, | |
| "epoch": 0.7208216619981326, | |
| "grad_norm": 0.03399450331926346, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5664753317832947, | |
| "mean_token_accuracy": 0.7712263911962509, | |
| "num_tokens": 3156560.0, | |
| "step": 193 | |
| }, | |
| { | |
| "entropy": 0.5329768806695938, | |
| "epoch": 0.7245564892623716, | |
| "grad_norm": 0.03143489733338356, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424296259880066, | |
| "mean_token_accuracy": 0.7808002233505249, | |
| "num_tokens": 3172868.0, | |
| "step": 194 | |
| }, | |
| { | |
| "entropy": 0.5407986044883728, | |
| "epoch": 0.7282913165266106, | |
| "grad_norm": 0.02865898422896862, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5426485538482666, | |
| "mean_token_accuracy": 0.7797252386808395, | |
| "num_tokens": 3188845.0, | |
| "step": 195 | |
| }, | |
| { | |
| "entropy": 0.5540356040000916, | |
| "epoch": 0.7320261437908496, | |
| "grad_norm": 0.031195135787129402, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5537624359130859, | |
| "mean_token_accuracy": 0.772818997502327, | |
| "num_tokens": 3205059.0, | |
| "step": 196 | |
| }, | |
| { | |
| "entropy": 0.547016367316246, | |
| "epoch": 0.7357609710550888, | |
| "grad_norm": 0.026600942015647888, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5409566164016724, | |
| "mean_token_accuracy": 0.7801954299211502, | |
| "num_tokens": 3221339.0, | |
| "step": 197 | |
| }, | |
| { | |
| "entropy": 0.5571199655532837, | |
| "epoch": 0.7394957983193278, | |
| "grad_norm": 0.027464795857667923, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5505565404891968, | |
| "mean_token_accuracy": 0.7758535593748093, | |
| "num_tokens": 3237556.0, | |
| "step": 198 | |
| }, | |
| { | |
| "entropy": 0.5562743991613388, | |
| "epoch": 0.7432306255835668, | |
| "grad_norm": 0.029805589467287064, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5526044368743896, | |
| "mean_token_accuracy": 0.7738559246063232, | |
| "num_tokens": 3253871.0, | |
| "step": 199 | |
| }, | |
| { | |
| "entropy": 0.5585610568523407, | |
| "epoch": 0.7469654528478058, | |
| "grad_norm": 0.03004448115825653, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5598405599594116, | |
| "mean_token_accuracy": 0.7726627141237259, | |
| "num_tokens": 3269973.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 0.5488641411066055, | |
| "epoch": 0.7507002801120448, | |
| "grad_norm": 0.027654554694890976, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513002276420593, | |
| "mean_token_accuracy": 0.7737944573163986, | |
| "num_tokens": 3286201.0, | |
| "step": 201 | |
| }, | |
| { | |
| "entropy": 0.5287523940205574, | |
| "epoch": 0.7544351073762838, | |
| "grad_norm": 0.03466613590717316, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5311362743377686, | |
| "mean_token_accuracy": 0.7847718745470047, | |
| "num_tokens": 3302467.0, | |
| "step": 202 | |
| }, | |
| { | |
| "entropy": 0.5560965240001678, | |
| "epoch": 0.7581699346405228, | |
| "grad_norm": 0.034095581620931625, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5613946914672852, | |
| "mean_token_accuracy": 0.7737453281879425, | |
| "num_tokens": 3318768.0, | |
| "step": 203 | |
| }, | |
| { | |
| "entropy": 0.5630687177181244, | |
| "epoch": 0.7619047619047619, | |
| "grad_norm": 0.03233996778726578, | |
| "learning_rate": 0.0002, | |
| "loss": 0.564468264579773, | |
| "mean_token_accuracy": 0.7691166549921036, | |
| "num_tokens": 3335233.0, | |
| "step": 204 | |
| }, | |
| { | |
| "entropy": 0.5651765614748001, | |
| "epoch": 0.765639589169001, | |
| "grad_norm": 0.030395060777664185, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5597318410873413, | |
| "mean_token_accuracy": 0.7716515213251114, | |
| "num_tokens": 3351439.0, | |
| "step": 205 | |
| }, | |
| { | |
| "entropy": 0.5476003587245941, | |
| "epoch": 0.76937441643324, | |
| "grad_norm": 0.03382452204823494, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5447301864624023, | |
| "mean_token_accuracy": 0.7816700637340546, | |
| "num_tokens": 3367520.0, | |
| "step": 206 | |
| }, | |
| { | |
| "entropy": 0.5484471321105957, | |
| "epoch": 0.773109243697479, | |
| "grad_norm": 0.02830951102077961, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5454609394073486, | |
| "mean_token_accuracy": 0.7790801376104355, | |
| "num_tokens": 3383667.0, | |
| "step": 207 | |
| }, | |
| { | |
| "entropy": 0.5659755617380142, | |
| "epoch": 0.776844070961718, | |
| "grad_norm": 0.02530798688530922, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5655397772789001, | |
| "mean_token_accuracy": 0.770569920539856, | |
| "num_tokens": 3400150.0, | |
| "step": 208 | |
| }, | |
| { | |
| "entropy": 0.5427214205265045, | |
| "epoch": 0.780578898225957, | |
| "grad_norm": 0.03361448645591736, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5476981401443481, | |
| "mean_token_accuracy": 0.7780336290597916, | |
| "num_tokens": 3416165.0, | |
| "step": 209 | |
| }, | |
| { | |
| "entropy": 0.5494136065244675, | |
| "epoch": 0.7843137254901961, | |
| "grad_norm": 0.029303058981895447, | |
| "learning_rate": 0.0002, | |
| "loss": 0.555971086025238, | |
| "mean_token_accuracy": 0.7742915004491806, | |
| "num_tokens": 3432668.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 0.5408063977956772, | |
| "epoch": 0.7880485527544351, | |
| "grad_norm": 0.024706227704882622, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5423460006713867, | |
| "mean_token_accuracy": 0.7791419923305511, | |
| "num_tokens": 3449230.0, | |
| "step": 211 | |
| }, | |
| { | |
| "entropy": 0.5585084557533264, | |
| "epoch": 0.7917833800186741, | |
| "grad_norm": 0.031753819435834885, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5534642934799194, | |
| "mean_token_accuracy": 0.7761369943618774, | |
| "num_tokens": 3465888.0, | |
| "step": 212 | |
| }, | |
| { | |
| "entropy": 0.5470991730690002, | |
| "epoch": 0.7955182072829131, | |
| "grad_norm": 0.02627946063876152, | |
| "learning_rate": 0.0002, | |
| "loss": 0.543735921382904, | |
| "mean_token_accuracy": 0.7773504257202148, | |
| "num_tokens": 3482200.0, | |
| "step": 213 | |
| }, | |
| { | |
| "entropy": 0.5522027462720871, | |
| "epoch": 0.7992530345471522, | |
| "grad_norm": 0.02693161368370056, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5497567057609558, | |
| "mean_token_accuracy": 0.7760942578315735, | |
| "num_tokens": 3498472.0, | |
| "step": 214 | |
| }, | |
| { | |
| "entropy": 0.5438102185726166, | |
| "epoch": 0.8029878618113913, | |
| "grad_norm": 0.029677148908376694, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5449556112289429, | |
| "mean_token_accuracy": 0.7757529467344284, | |
| "num_tokens": 3514748.0, | |
| "step": 215 | |
| }, | |
| { | |
| "entropy": 0.5440456867218018, | |
| "epoch": 0.8067226890756303, | |
| "grad_norm": 0.028825437650084496, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5460283160209656, | |
| "mean_token_accuracy": 0.7805955857038498, | |
| "num_tokens": 3530910.0, | |
| "step": 216 | |
| }, | |
| { | |
| "entropy": 0.5444321483373642, | |
| "epoch": 0.8104575163398693, | |
| "grad_norm": 0.023829322308301926, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5420593023300171, | |
| "mean_token_accuracy": 0.7787522822618484, | |
| "num_tokens": 3547036.0, | |
| "step": 217 | |
| }, | |
| { | |
| "entropy": 0.5498476177453995, | |
| "epoch": 0.8141923436041083, | |
| "grad_norm": 0.025729795917868614, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5429421067237854, | |
| "mean_token_accuracy": 0.7785259187221527, | |
| "num_tokens": 3563357.0, | |
| "step": 218 | |
| }, | |
| { | |
| "entropy": 0.544920951128006, | |
| "epoch": 0.8179271708683473, | |
| "grad_norm": 0.027102749794721603, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5383168458938599, | |
| "mean_token_accuracy": 0.7817831486463547, | |
| "num_tokens": 3579822.0, | |
| "step": 219 | |
| }, | |
| { | |
| "entropy": 0.5497463345527649, | |
| "epoch": 0.8216619981325863, | |
| "grad_norm": 0.0323423407971859, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5521490573883057, | |
| "mean_token_accuracy": 0.7747017741203308, | |
| "num_tokens": 3596053.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 0.5389717519283295, | |
| "epoch": 0.8253968253968254, | |
| "grad_norm": 0.027372388169169426, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540256679058075, | |
| "mean_token_accuracy": 0.7825071215629578, | |
| "num_tokens": 3612271.0, | |
| "step": 221 | |
| }, | |
| { | |
| "entropy": 0.5472569465637207, | |
| "epoch": 0.8291316526610645, | |
| "grad_norm": 0.028159258887171745, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5517306327819824, | |
| "mean_token_accuracy": 0.7758912444114685, | |
| "num_tokens": 3628658.0, | |
| "step": 222 | |
| }, | |
| { | |
| "entropy": 0.5441670119762421, | |
| "epoch": 0.8328664799253035, | |
| "grad_norm": 0.0357636883854866, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5485156178474426, | |
| "mean_token_accuracy": 0.7771351188421249, | |
| "num_tokens": 3645179.0, | |
| "step": 223 | |
| }, | |
| { | |
| "entropy": 0.5535278022289276, | |
| "epoch": 0.8366013071895425, | |
| "grad_norm": 0.032881151884794235, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5619751811027527, | |
| "mean_token_accuracy": 0.7715311944484711, | |
| "num_tokens": 3661296.0, | |
| "step": 224 | |
| }, | |
| { | |
| "entropy": 0.5683074444532394, | |
| "epoch": 0.8403361344537815, | |
| "grad_norm": 0.03166094422340393, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5676220059394836, | |
| "mean_token_accuracy": 0.7721768617630005, | |
| "num_tokens": 3677506.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 0.5672677755355835, | |
| "epoch": 0.8440709617180205, | |
| "grad_norm": 0.029754942283034325, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5636724233627319, | |
| "mean_token_accuracy": 0.7715145349502563, | |
| "num_tokens": 3693949.0, | |
| "step": 226 | |
| }, | |
| { | |
| "entropy": 0.5544100105762482, | |
| "epoch": 0.8478057889822596, | |
| "grad_norm": 0.027808941900730133, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5551161170005798, | |
| "mean_token_accuracy": 0.7762546241283417, | |
| "num_tokens": 3710403.0, | |
| "step": 227 | |
| }, | |
| { | |
| "entropy": 0.5428061038255692, | |
| "epoch": 0.8515406162464986, | |
| "grad_norm": 0.032082680612802505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452495813369751, | |
| "mean_token_accuracy": 0.7784813046455383, | |
| "num_tokens": 3726407.0, | |
| "step": 228 | |
| }, | |
| { | |
| "entropy": 0.5473134368658066, | |
| "epoch": 0.8552754435107376, | |
| "grad_norm": 0.030095776543021202, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5461854934692383, | |
| "mean_token_accuracy": 0.7758107632398605, | |
| "num_tokens": 3742861.0, | |
| "step": 229 | |
| }, | |
| { | |
| "entropy": 0.5752474963665009, | |
| "epoch": 0.8590102707749766, | |
| "grad_norm": 0.030156588181853294, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5713618397712708, | |
| "mean_token_accuracy": 0.7695687711238861, | |
| "num_tokens": 3759464.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 0.5410983264446259, | |
| "epoch": 0.8627450980392157, | |
| "grad_norm": 0.026288261637091637, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398176908493042, | |
| "mean_token_accuracy": 0.7807286381721497, | |
| "num_tokens": 3775673.0, | |
| "step": 231 | |
| }, | |
| { | |
| "entropy": 0.5493600815534592, | |
| "epoch": 0.8664799253034547, | |
| "grad_norm": 0.03065655194222927, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5482011437416077, | |
| "mean_token_accuracy": 0.7772542536258698, | |
| "num_tokens": 3791787.0, | |
| "step": 232 | |
| }, | |
| { | |
| "entropy": 0.5542360842227936, | |
| "epoch": 0.8702147525676938, | |
| "grad_norm": 0.032031431794166565, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5554019212722778, | |
| "mean_token_accuracy": 0.7739447802305222, | |
| "num_tokens": 3808316.0, | |
| "step": 233 | |
| }, | |
| { | |
| "entropy": 0.5599103569984436, | |
| "epoch": 0.8739495798319328, | |
| "grad_norm": 0.027463702484965324, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5579502582550049, | |
| "mean_token_accuracy": 0.771759495139122, | |
| "num_tokens": 3824701.0, | |
| "step": 234 | |
| }, | |
| { | |
| "entropy": 0.5677217245101929, | |
| "epoch": 0.8776844070961718, | |
| "grad_norm": 0.03142165020108223, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5663169622421265, | |
| "mean_token_accuracy": 0.7691013365983963, | |
| "num_tokens": 3841435.0, | |
| "step": 235 | |
| }, | |
| { | |
| "entropy": 0.5482347160577774, | |
| "epoch": 0.8814192343604108, | |
| "grad_norm": 0.031262561678886414, | |
| "learning_rate": 0.0002, | |
| "loss": 0.552081823348999, | |
| "mean_token_accuracy": 0.7783354371786118, | |
| "num_tokens": 3857866.0, | |
| "step": 236 | |
| }, | |
| { | |
| "entropy": 0.5391282737255096, | |
| "epoch": 0.8851540616246498, | |
| "grad_norm": 0.030781790614128113, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5469714403152466, | |
| "mean_token_accuracy": 0.7780267000198364, | |
| "num_tokens": 3874216.0, | |
| "step": 237 | |
| }, | |
| { | |
| "entropy": 0.5443921983242035, | |
| "epoch": 0.8888888888888888, | |
| "grad_norm": 0.032567523419857025, | |
| "learning_rate": 0.0002, | |
| "loss": 0.549781084060669, | |
| "mean_token_accuracy": 0.7772793620824814, | |
| "num_tokens": 3890382.0, | |
| "step": 238 | |
| }, | |
| { | |
| "entropy": 0.5604461878538132, | |
| "epoch": 0.892623716153128, | |
| "grad_norm": 0.02667226456105709, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5538907051086426, | |
| "mean_token_accuracy": 0.7770420461893082, | |
| "num_tokens": 3906697.0, | |
| "step": 239 | |
| }, | |
| { | |
| "entropy": 0.5541103631258011, | |
| "epoch": 0.896358543417367, | |
| "grad_norm": 0.027397198602557182, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5516767501831055, | |
| "mean_token_accuracy": 0.7767754942178726, | |
| "num_tokens": 3922978.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 0.5521068722009659, | |
| "epoch": 0.900093370681606, | |
| "grad_norm": 0.032886214554309845, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5538557171821594, | |
| "mean_token_accuracy": 0.7769301533699036, | |
| "num_tokens": 3939282.0, | |
| "step": 241 | |
| }, | |
| { | |
| "entropy": 0.5449024885892868, | |
| "epoch": 0.903828197945845, | |
| "grad_norm": 0.026176048442721367, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5478168725967407, | |
| "mean_token_accuracy": 0.7779200524091721, | |
| "num_tokens": 3955520.0, | |
| "step": 242 | |
| }, | |
| { | |
| "entropy": 0.5615669041872025, | |
| "epoch": 0.907563025210084, | |
| "grad_norm": 0.02917352132499218, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5631118416786194, | |
| "mean_token_accuracy": 0.769850417971611, | |
| "num_tokens": 3971679.0, | |
| "step": 243 | |
| }, | |
| { | |
| "entropy": 0.5360025763511658, | |
| "epoch": 0.911297852474323, | |
| "grad_norm": 0.028804168105125427, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5399969816207886, | |
| "mean_token_accuracy": 0.7786188125610352, | |
| "num_tokens": 3987832.0, | |
| "step": 244 | |
| }, | |
| { | |
| "entropy": 0.5670223534107208, | |
| "epoch": 0.9150326797385621, | |
| "grad_norm": 0.032523807138204575, | |
| "learning_rate": 0.0002, | |
| "loss": 0.568830668926239, | |
| "mean_token_accuracy": 0.7703544050455093, | |
| "num_tokens": 4004046.0, | |
| "step": 245 | |
| }, | |
| { | |
| "entropy": 0.5482122004032135, | |
| "epoch": 0.9187675070028011, | |
| "grad_norm": 0.024507107213139534, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5461756587028503, | |
| "mean_token_accuracy": 0.7785715907812119, | |
| "num_tokens": 4020396.0, | |
| "step": 246 | |
| }, | |
| { | |
| "entropy": 0.5435233414173126, | |
| "epoch": 0.9225023342670402, | |
| "grad_norm": 0.026535481214523315, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5347612500190735, | |
| "mean_token_accuracy": 0.7819430381059647, | |
| "num_tokens": 4036657.0, | |
| "step": 247 | |
| }, | |
| { | |
| "entropy": 0.5606936663389206, | |
| "epoch": 0.9262371615312792, | |
| "grad_norm": 0.03222998231649399, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5588559508323669, | |
| "mean_token_accuracy": 0.7731847912073135, | |
| "num_tokens": 4052932.0, | |
| "step": 248 | |
| }, | |
| { | |
| "entropy": 0.5559582114219666, | |
| "epoch": 0.9299719887955182, | |
| "grad_norm": 0.027079764753580093, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5551950931549072, | |
| "mean_token_accuracy": 0.7739483118057251, | |
| "num_tokens": 4069465.0, | |
| "step": 249 | |
| }, | |
| { | |
| "entropy": 0.5464590489864349, | |
| "epoch": 0.9337068160597572, | |
| "grad_norm": 0.025224287062883377, | |
| "learning_rate": 0.0002, | |
| "loss": 0.548494815826416, | |
| "mean_token_accuracy": 0.7777067720890045, | |
| "num_tokens": 4085793.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 0.5697829127311707, | |
| "epoch": 0.9374416433239963, | |
| "grad_norm": 0.03149845451116562, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5725698471069336, | |
| "mean_token_accuracy": 0.7667296230792999, | |
| "num_tokens": 4102389.0, | |
| "step": 251 | |
| }, | |
| { | |
| "entropy": 0.5524837523698807, | |
| "epoch": 0.9411764705882353, | |
| "grad_norm": 0.027573609724640846, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5497711896896362, | |
| "mean_token_accuracy": 0.7749225348234177, | |
| "num_tokens": 4118604.0, | |
| "step": 252 | |
| }, | |
| { | |
| "entropy": 0.5428849905729294, | |
| "epoch": 0.9449112978524743, | |
| "grad_norm": 0.025667617097496986, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5428351163864136, | |
| "mean_token_accuracy": 0.7771738916635513, | |
| "num_tokens": 4135001.0, | |
| "step": 253 | |
| }, | |
| { | |
| "entropy": 0.5520694851875305, | |
| "epoch": 0.9486461251167133, | |
| "grad_norm": 0.035842686891555786, | |
| "learning_rate": 0.0002, | |
| "loss": 0.550408661365509, | |
| "mean_token_accuracy": 0.7740647196769714, | |
| "num_tokens": 4151260.0, | |
| "step": 254 | |
| }, | |
| { | |
| "entropy": 0.5418593287467957, | |
| "epoch": 0.9523809523809523, | |
| "grad_norm": 0.0381033793091774, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5492621660232544, | |
| "mean_token_accuracy": 0.7769514173269272, | |
| "num_tokens": 4167360.0, | |
| "step": 255 | |
| }, | |
| { | |
| "entropy": 0.5375488549470901, | |
| "epoch": 0.9561157796451915, | |
| "grad_norm": 0.029893534258008003, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5434277057647705, | |
| "mean_token_accuracy": 0.7754911035299301, | |
| "num_tokens": 4183517.0, | |
| "step": 256 | |
| }, | |
| { | |
| "entropy": 0.5487121939659119, | |
| "epoch": 0.9598506069094305, | |
| "grad_norm": 0.03323543071746826, | |
| "learning_rate": 0.0002, | |
| "loss": 0.549543559551239, | |
| "mean_token_accuracy": 0.7791514545679092, | |
| "num_tokens": 4200020.0, | |
| "step": 257 | |
| }, | |
| { | |
| "entropy": 0.5533169955015182, | |
| "epoch": 0.9635854341736695, | |
| "grad_norm": 0.1564125418663025, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513023138046265, | |
| "mean_token_accuracy": 0.7750032246112823, | |
| "num_tokens": 4216280.0, | |
| "step": 258 | |
| }, | |
| { | |
| "entropy": 0.5475684553384781, | |
| "epoch": 0.9673202614379085, | |
| "grad_norm": 0.05765023082494736, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5540170073509216, | |
| "mean_token_accuracy": 0.778236523270607, | |
| "num_tokens": 4232501.0, | |
| "step": 259 | |
| }, | |
| { | |
| "entropy": 0.5620233714580536, | |
| "epoch": 0.9710550887021475, | |
| "grad_norm": 0.046510934829711914, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5589131712913513, | |
| "mean_token_accuracy": 0.7736849784851074, | |
| "num_tokens": 4248855.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 0.565828487277031, | |
| "epoch": 0.9747899159663865, | |
| "grad_norm": 0.0395890548825264, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5624877214431763, | |
| "mean_token_accuracy": 0.7722225338220596, | |
| "num_tokens": 4265077.0, | |
| "step": 261 | |
| }, | |
| { | |
| "entropy": 0.5551140010356903, | |
| "epoch": 0.9785247432306255, | |
| "grad_norm": 0.03330749273300171, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5576150417327881, | |
| "mean_token_accuracy": 0.7741483747959137, | |
| "num_tokens": 4281357.0, | |
| "step": 262 | |
| }, | |
| { | |
| "entropy": 0.5746229141950607, | |
| "epoch": 0.9822595704948646, | |
| "grad_norm": 0.03519619628787041, | |
| "learning_rate": 0.0002, | |
| "loss": 0.582584023475647, | |
| "mean_token_accuracy": 0.7654829919338226, | |
| "num_tokens": 4297699.0, | |
| "step": 263 | |
| }, | |
| { | |
| "entropy": 0.5782353579998016, | |
| "epoch": 0.9859943977591037, | |
| "grad_norm": 0.03913693502545357, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5755780339241028, | |
| "mean_token_accuracy": 0.7660959511995316, | |
| "num_tokens": 4314249.0, | |
| "step": 264 | |
| }, | |
| { | |
| "entropy": 0.5513299107551575, | |
| "epoch": 0.9897292250233427, | |
| "grad_norm": 0.030444784089922905, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5514294505119324, | |
| "mean_token_accuracy": 0.7750695049762726, | |
| "num_tokens": 4330437.0, | |
| "step": 265 | |
| }, | |
| { | |
| "entropy": 0.5386128276586533, | |
| "epoch": 0.9934640522875817, | |
| "grad_norm": 0.03275322541594505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540998637676239, | |
| "mean_token_accuracy": 0.7796358019113541, | |
| "num_tokens": 4346677.0, | |
| "step": 266 | |
| }, | |
| { | |
| "entropy": 0.5513150691986084, | |
| "epoch": 0.9971988795518207, | |
| "grad_norm": 0.03458503261208534, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5484628677368164, | |
| "mean_token_accuracy": 0.779531255364418, | |
| "num_tokens": 4363004.0, | |
| "step": 267 | |
| }, | |
| { | |
| "entropy": 0.5694002906481425, | |
| "epoch": 1.0, | |
| "grad_norm": 0.033372946083545685, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5757001638412476, | |
| "mean_token_accuracy": 0.7725784182548523, | |
| "num_tokens": 4364721.0, | |
| "step": 268 | |
| }, | |
| { | |
| "entropy": 0.5490456074476242, | |
| "epoch": 1.003734827264239, | |
| "grad_norm": 0.030816873535513878, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5466992855072021, | |
| "mean_token_accuracy": 0.7772593349218369, | |
| "num_tokens": 4380959.0, | |
| "step": 269 | |
| }, | |
| { | |
| "entropy": 0.5297957360744476, | |
| "epoch": 1.007469654528478, | |
| "grad_norm": 0.0300835482776165, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5296781063079834, | |
| "mean_token_accuracy": 0.7851966172456741, | |
| "num_tokens": 4397319.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 0.5426550507545471, | |
| "epoch": 1.011204481792717, | |
| "grad_norm": 0.0309379193931818, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5401790142059326, | |
| "mean_token_accuracy": 0.7784202843904495, | |
| "num_tokens": 4413503.0, | |
| "step": 271 | |
| }, | |
| { | |
| "entropy": 0.536088228225708, | |
| "epoch": 1.014939309056956, | |
| "grad_norm": 0.030822666361927986, | |
| "learning_rate": 0.0002, | |
| "loss": 0.533880352973938, | |
| "mean_token_accuracy": 0.7821955978870392, | |
| "num_tokens": 4429731.0, | |
| "step": 272 | |
| }, | |
| { | |
| "entropy": 0.5376520156860352, | |
| "epoch": 1.018674136321195, | |
| "grad_norm": 0.03910338878631592, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5515881776809692, | |
| "mean_token_accuracy": 0.7752164155244827, | |
| "num_tokens": 4445975.0, | |
| "step": 273 | |
| }, | |
| { | |
| "entropy": 0.5337154120206833, | |
| "epoch": 1.022408963585434, | |
| "grad_norm": 0.030765611678361893, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412506461143494, | |
| "mean_token_accuracy": 0.7780167758464813, | |
| "num_tokens": 4462105.0, | |
| "step": 274 | |
| }, | |
| { | |
| "entropy": 0.5487084090709686, | |
| "epoch": 1.026143790849673, | |
| "grad_norm": 0.03003527596592903, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540929913520813, | |
| "mean_token_accuracy": 0.7784045934677124, | |
| "num_tokens": 4478591.0, | |
| "step": 275 | |
| }, | |
| { | |
| "entropy": 0.5385126918554306, | |
| "epoch": 1.0298786181139121, | |
| "grad_norm": 0.027475042268633842, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5318593978881836, | |
| "mean_token_accuracy": 0.7862093448638916, | |
| "num_tokens": 4495044.0, | |
| "step": 276 | |
| }, | |
| { | |
| "entropy": 0.5600587129592896, | |
| "epoch": 1.0336134453781514, | |
| "grad_norm": 0.029431000351905823, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5559869408607483, | |
| "mean_token_accuracy": 0.7744521498680115, | |
| "num_tokens": 4511459.0, | |
| "step": 277 | |
| }, | |
| { | |
| "entropy": 0.5381200164556503, | |
| "epoch": 1.0373482726423904, | |
| "grad_norm": 0.02848048508167267, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5395113229751587, | |
| "mean_token_accuracy": 0.7798527628183365, | |
| "num_tokens": 4527903.0, | |
| "step": 278 | |
| }, | |
| { | |
| "entropy": 0.5346540361642838, | |
| "epoch": 1.0410830999066294, | |
| "grad_norm": 0.033454034477472305, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5404960513114929, | |
| "mean_token_accuracy": 0.7793795019388199, | |
| "num_tokens": 4544182.0, | |
| "step": 279 | |
| }, | |
| { | |
| "entropy": 0.544955238699913, | |
| "epoch": 1.0448179271708684, | |
| "grad_norm": 0.02894734963774681, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5436176061630249, | |
| "mean_token_accuracy": 0.7777452617883682, | |
| "num_tokens": 4560880.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 0.5431416183710098, | |
| "epoch": 1.0485527544351074, | |
| "grad_norm": 0.02903336100280285, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5436229109764099, | |
| "mean_token_accuracy": 0.7780826389789581, | |
| "num_tokens": 4577183.0, | |
| "step": 281 | |
| }, | |
| { | |
| "entropy": 0.5408187806606293, | |
| "epoch": 1.0522875816993464, | |
| "grad_norm": 0.029271787032485008, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5370380282402039, | |
| "mean_token_accuracy": 0.7815099805593491, | |
| "num_tokens": 4593864.0, | |
| "step": 282 | |
| }, | |
| { | |
| "entropy": 0.5497590750455856, | |
| "epoch": 1.0560224089635855, | |
| "grad_norm": 0.028807660564780235, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5504873991012573, | |
| "mean_token_accuracy": 0.777531310915947, | |
| "num_tokens": 4610349.0, | |
| "step": 283 | |
| }, | |
| { | |
| "entropy": 0.5368742346763611, | |
| "epoch": 1.0597572362278245, | |
| "grad_norm": 0.031959034502506256, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5419926643371582, | |
| "mean_token_accuracy": 0.7784341871738434, | |
| "num_tokens": 4626437.0, | |
| "step": 284 | |
| }, | |
| { | |
| "entropy": 0.5532872825860977, | |
| "epoch": 1.0634920634920635, | |
| "grad_norm": 0.028826460242271423, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5571930408477783, | |
| "mean_token_accuracy": 0.7746778875589371, | |
| "num_tokens": 4642633.0, | |
| "step": 285 | |
| }, | |
| { | |
| "entropy": 0.5407113283872604, | |
| "epoch": 1.0672268907563025, | |
| "grad_norm": 0.03065388835966587, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5436424612998962, | |
| "mean_token_accuracy": 0.779659166932106, | |
| "num_tokens": 4658940.0, | |
| "step": 286 | |
| }, | |
| { | |
| "entropy": 0.5552934855222702, | |
| "epoch": 1.0709617180205415, | |
| "grad_norm": 0.03264114633202553, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5482615232467651, | |
| "mean_token_accuracy": 0.7754945755004883, | |
| "num_tokens": 4675263.0, | |
| "step": 287 | |
| }, | |
| { | |
| "entropy": 0.5442743301391602, | |
| "epoch": 1.0746965452847805, | |
| "grad_norm": 0.031116079539060593, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538812518119812, | |
| "mean_token_accuracy": 0.7806833982467651, | |
| "num_tokens": 4691415.0, | |
| "step": 288 | |
| }, | |
| { | |
| "entropy": 0.5530855804681778, | |
| "epoch": 1.0784313725490196, | |
| "grad_norm": 0.03077593445777893, | |
| "learning_rate": 0.0002, | |
| "loss": 0.548968493938446, | |
| "mean_token_accuracy": 0.7756039202213287, | |
| "num_tokens": 4707736.0, | |
| "step": 289 | |
| }, | |
| { | |
| "entropy": 0.5455960035324097, | |
| "epoch": 1.0821661998132586, | |
| "grad_norm": 0.028605274856090546, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5435131788253784, | |
| "mean_token_accuracy": 0.7795460671186447, | |
| "num_tokens": 4724095.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 0.5397526025772095, | |
| "epoch": 1.0859010270774976, | |
| "grad_norm": 0.03644070401787758, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5488567352294922, | |
| "mean_token_accuracy": 0.7778657674789429, | |
| "num_tokens": 4740602.0, | |
| "step": 291 | |
| }, | |
| { | |
| "entropy": 0.5470818132162094, | |
| "epoch": 1.0896358543417366, | |
| "grad_norm": 0.033212918788194656, | |
| "learning_rate": 0.0002, | |
| "loss": 0.555572509765625, | |
| "mean_token_accuracy": 0.7734686136245728, | |
| "num_tokens": 4756842.0, | |
| "step": 292 | |
| }, | |
| { | |
| "entropy": 0.5398264974355698, | |
| "epoch": 1.0933706816059758, | |
| "grad_norm": 0.027302522212266922, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5371235013008118, | |
| "mean_token_accuracy": 0.7826644480228424, | |
| "num_tokens": 4773499.0, | |
| "step": 293 | |
| }, | |
| { | |
| "entropy": 0.564954400062561, | |
| "epoch": 1.0971055088702149, | |
| "grad_norm": 0.02829107642173767, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5558594465255737, | |
| "mean_token_accuracy": 0.7749541401863098, | |
| "num_tokens": 4790183.0, | |
| "step": 294 | |
| }, | |
| { | |
| "entropy": 0.5593573749065399, | |
| "epoch": 1.1008403361344539, | |
| "grad_norm": 0.027547527104616165, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5560394525527954, | |
| "mean_token_accuracy": 0.7725719660520554, | |
| "num_tokens": 4806455.0, | |
| "step": 295 | |
| }, | |
| { | |
| "entropy": 0.5377779453992844, | |
| "epoch": 1.1045751633986929, | |
| "grad_norm": 0.03161724656820297, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5370453596115112, | |
| "mean_token_accuracy": 0.782875582575798, | |
| "num_tokens": 4822731.0, | |
| "step": 296 | |
| }, | |
| { | |
| "entropy": 0.5386165231466293, | |
| "epoch": 1.108309990662932, | |
| "grad_norm": 0.03147651255130768, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5423634648323059, | |
| "mean_token_accuracy": 0.7768422961235046, | |
| "num_tokens": 4839112.0, | |
| "step": 297 | |
| }, | |
| { | |
| "entropy": 0.5279396325349808, | |
| "epoch": 1.112044817927171, | |
| "grad_norm": 0.031283456832170486, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5321308970451355, | |
| "mean_token_accuracy": 0.7849069982767105, | |
| "num_tokens": 4855229.0, | |
| "step": 298 | |
| }, | |
| { | |
| "entropy": 0.5327593311667442, | |
| "epoch": 1.11577964519141, | |
| "grad_norm": 0.03042989782989025, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5393236875534058, | |
| "mean_token_accuracy": 0.7804521471261978, | |
| "num_tokens": 4871644.0, | |
| "step": 299 | |
| }, | |
| { | |
| "entropy": 0.560793936252594, | |
| "epoch": 1.119514472455649, | |
| "grad_norm": 0.029397251084446907, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5557554960250854, | |
| "mean_token_accuracy": 0.7728655338287354, | |
| "num_tokens": 4887992.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.5604539066553116, | |
| "epoch": 1.123249299719888, | |
| "grad_norm": 0.02948898635804653, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5545894503593445, | |
| "mean_token_accuracy": 0.7743670493364334, | |
| "num_tokens": 4904384.0, | |
| "step": 301 | |
| }, | |
| { | |
| "entropy": 0.5394376814365387, | |
| "epoch": 1.126984126984127, | |
| "grad_norm": 0.029182471334934235, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5341510772705078, | |
| "mean_token_accuracy": 0.7823253571987152, | |
| "num_tokens": 4920587.0, | |
| "step": 302 | |
| }, | |
| { | |
| "entropy": 0.5301040560007095, | |
| "epoch": 1.130718954248366, | |
| "grad_norm": 0.03680079057812691, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5372604131698608, | |
| "mean_token_accuracy": 0.7793124318122864, | |
| "num_tokens": 4937055.0, | |
| "step": 303 | |
| }, | |
| { | |
| "entropy": 0.5290943831205368, | |
| "epoch": 1.134453781512605, | |
| "grad_norm": 0.03931280970573425, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5391898155212402, | |
| "mean_token_accuracy": 0.7829029709100723, | |
| "num_tokens": 4953281.0, | |
| "step": 304 | |
| }, | |
| { | |
| "entropy": 0.5609545707702637, | |
| "epoch": 1.138188608776844, | |
| "grad_norm": 0.030014565214514732, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5609763264656067, | |
| "mean_token_accuracy": 0.7726535797119141, | |
| "num_tokens": 4969665.0, | |
| "step": 305 | |
| }, | |
| { | |
| "entropy": 0.5205260962247849, | |
| "epoch": 1.141923436041083, | |
| "grad_norm": 0.03301642835140228, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5286065340042114, | |
| "mean_token_accuracy": 0.7840328961610794, | |
| "num_tokens": 4985863.0, | |
| "step": 306 | |
| }, | |
| { | |
| "entropy": 0.5605068057775497, | |
| "epoch": 1.145658263305322, | |
| "grad_norm": 0.029299437999725342, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5569101572036743, | |
| "mean_token_accuracy": 0.7721403539180756, | |
| "num_tokens": 5002543.0, | |
| "step": 307 | |
| }, | |
| { | |
| "entropy": 0.552753359079361, | |
| "epoch": 1.149393090569561, | |
| "grad_norm": 0.027307430282235146, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5464527606964111, | |
| "mean_token_accuracy": 0.7777755260467529, | |
| "num_tokens": 5019035.0, | |
| "step": 308 | |
| }, | |
| { | |
| "entropy": 0.5565258711576462, | |
| "epoch": 1.1531279178338, | |
| "grad_norm": 0.028590641915798187, | |
| "learning_rate": 0.0002, | |
| "loss": 0.551773726940155, | |
| "mean_token_accuracy": 0.7753841280937195, | |
| "num_tokens": 5035778.0, | |
| "step": 309 | |
| }, | |
| { | |
| "entropy": 0.5335747301578522, | |
| "epoch": 1.156862745098039, | |
| "grad_norm": 0.02846100926399231, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5332034826278687, | |
| "mean_token_accuracy": 0.7849084585905075, | |
| "num_tokens": 5052106.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 0.5462342649698257, | |
| "epoch": 1.1605975723622783, | |
| "grad_norm": 0.03037341870367527, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5533976554870605, | |
| "mean_token_accuracy": 0.7761731296777725, | |
| "num_tokens": 5068494.0, | |
| "step": 311 | |
| }, | |
| { | |
| "entropy": 0.5365739315748215, | |
| "epoch": 1.1643323996265174, | |
| "grad_norm": 0.0328284353017807, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443044900894165, | |
| "mean_token_accuracy": 0.7775984853506088, | |
| "num_tokens": 5084698.0, | |
| "step": 312 | |
| }, | |
| { | |
| "entropy": 0.5469802767038345, | |
| "epoch": 1.1680672268907564, | |
| "grad_norm": 0.029220817610621452, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5449838638305664, | |
| "mean_token_accuracy": 0.7794362902641296, | |
| "num_tokens": 5101231.0, | |
| "step": 313 | |
| }, | |
| { | |
| "entropy": 0.5534107983112335, | |
| "epoch": 1.1718020541549954, | |
| "grad_norm": 0.03240218386054039, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5596653819084167, | |
| "mean_token_accuracy": 0.7733468264341354, | |
| "num_tokens": 5117669.0, | |
| "step": 314 | |
| }, | |
| { | |
| "entropy": 0.5505286902189255, | |
| "epoch": 1.1755368814192344, | |
| "grad_norm": 0.030088460072875023, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5460378527641296, | |
| "mean_token_accuracy": 0.7785163521766663, | |
| "num_tokens": 5134044.0, | |
| "step": 315 | |
| }, | |
| { | |
| "entropy": 0.5583444237709045, | |
| "epoch": 1.1792717086834734, | |
| "grad_norm": 0.03908608481287956, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5499372482299805, | |
| "mean_token_accuracy": 0.7741111516952515, | |
| "num_tokens": 5150155.0, | |
| "step": 316 | |
| }, | |
| { | |
| "entropy": 0.5583514273166656, | |
| "epoch": 1.1830065359477124, | |
| "grad_norm": 0.03262948617339134, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5514504909515381, | |
| "mean_token_accuracy": 0.7749726176261902, | |
| "num_tokens": 5166653.0, | |
| "step": 317 | |
| }, | |
| { | |
| "entropy": 0.54158616065979, | |
| "epoch": 1.1867413632119514, | |
| "grad_norm": 0.030375484377145767, | |
| "learning_rate": 0.0002, | |
| "loss": 0.535007119178772, | |
| "mean_token_accuracy": 0.78143410384655, | |
| "num_tokens": 5182849.0, | |
| "step": 318 | |
| }, | |
| { | |
| "entropy": 0.5355552136898041, | |
| "epoch": 1.1904761904761905, | |
| "grad_norm": 0.034217700362205505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416175723075867, | |
| "mean_token_accuracy": 0.7821937054395676, | |
| "num_tokens": 5199310.0, | |
| "step": 319 | |
| }, | |
| { | |
| "entropy": 0.5375736951828003, | |
| "epoch": 1.1942110177404295, | |
| "grad_norm": 0.03742173686623573, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5497441291809082, | |
| "mean_token_accuracy": 0.779162734746933, | |
| "num_tokens": 5215628.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 0.5327057242393494, | |
| "epoch": 1.1979458450046685, | |
| "grad_norm": 0.03143603354692459, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5377879738807678, | |
| "mean_token_accuracy": 0.7819731533527374, | |
| "num_tokens": 5232104.0, | |
| "step": 321 | |
| }, | |
| { | |
| "entropy": 0.5589822083711624, | |
| "epoch": 1.2016806722689075, | |
| "grad_norm": 0.030957849696278572, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5600837469100952, | |
| "mean_token_accuracy": 0.772526428103447, | |
| "num_tokens": 5248228.0, | |
| "step": 322 | |
| }, | |
| { | |
| "entropy": 0.5267817825078964, | |
| "epoch": 1.2054154995331465, | |
| "grad_norm": 0.028181420639157295, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5258863568305969, | |
| "mean_token_accuracy": 0.7852722406387329, | |
| "num_tokens": 5264722.0, | |
| "step": 323 | |
| }, | |
| { | |
| "entropy": 0.5596602708101273, | |
| "epoch": 1.2091503267973855, | |
| "grad_norm": 0.0294583011418581, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5542659163475037, | |
| "mean_token_accuracy": 0.7757792323827744, | |
| "num_tokens": 5281102.0, | |
| "step": 324 | |
| }, | |
| { | |
| "entropy": 0.5631477683782578, | |
| "epoch": 1.2128851540616246, | |
| "grad_norm": 0.028790894895792007, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568723678588867, | |
| "mean_token_accuracy": 0.771973267197609, | |
| "num_tokens": 5297684.0, | |
| "step": 325 | |
| }, | |
| { | |
| "entropy": 0.5380028486251831, | |
| "epoch": 1.2166199813258638, | |
| "grad_norm": 0.031924713402986526, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5376958847045898, | |
| "mean_token_accuracy": 0.7829422205686569, | |
| "num_tokens": 5313908.0, | |
| "step": 326 | |
| }, | |
| { | |
| "entropy": 0.5375301241874695, | |
| "epoch": 1.2203548085901028, | |
| "grad_norm": 0.03397483006119728, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5478475093841553, | |
| "mean_token_accuracy": 0.7765705734491348, | |
| "num_tokens": 5329966.0, | |
| "step": 327 | |
| }, | |
| { | |
| "entropy": 0.5427165776491165, | |
| "epoch": 1.2240896358543418, | |
| "grad_norm": 0.035384900867938995, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5524033308029175, | |
| "mean_token_accuracy": 0.7745779901742935, | |
| "num_tokens": 5346453.0, | |
| "step": 328 | |
| }, | |
| { | |
| "entropy": 0.5400120764970779, | |
| "epoch": 1.2278244631185808, | |
| "grad_norm": 0.030376868322491646, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5346859097480774, | |
| "mean_token_accuracy": 0.7804136276245117, | |
| "num_tokens": 5362598.0, | |
| "step": 329 | |
| }, | |
| { | |
| "entropy": 0.5525883883237839, | |
| "epoch": 1.2315592903828199, | |
| "grad_norm": 0.029532834887504578, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5460601449012756, | |
| "mean_token_accuracy": 0.7782909572124481, | |
| "num_tokens": 5378809.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 0.5435810536146164, | |
| "epoch": 1.2352941176470589, | |
| "grad_norm": 0.02912810444831848, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412687659263611, | |
| "mean_token_accuracy": 0.7805328518152237, | |
| "num_tokens": 5394964.0, | |
| "step": 331 | |
| }, | |
| { | |
| "entropy": 0.5558127015829086, | |
| "epoch": 1.239028944911298, | |
| "grad_norm": 0.03399093821644783, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5503210425376892, | |
| "mean_token_accuracy": 0.7771144658327103, | |
| "num_tokens": 5411296.0, | |
| "step": 332 | |
| }, | |
| { | |
| "entropy": 0.5612344145774841, | |
| "epoch": 1.242763772175537, | |
| "grad_norm": 0.028297265991568565, | |
| "learning_rate": 0.0002, | |
| "loss": 0.561404824256897, | |
| "mean_token_accuracy": 0.7735303044319153, | |
| "num_tokens": 5427522.0, | |
| "step": 333 | |
| }, | |
| { | |
| "entropy": 0.5317913144826889, | |
| "epoch": 1.246498599439776, | |
| "grad_norm": 0.03494315594434738, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5433036684989929, | |
| "mean_token_accuracy": 0.7796971648931503, | |
| "num_tokens": 5443757.0, | |
| "step": 334 | |
| }, | |
| { | |
| "entropy": 0.542137622833252, | |
| "epoch": 1.250233426704015, | |
| "grad_norm": 0.02819279581308365, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5451513528823853, | |
| "mean_token_accuracy": 0.7785246819257736, | |
| "num_tokens": 5460219.0, | |
| "step": 335 | |
| }, | |
| { | |
| "entropy": 0.5389015078544617, | |
| "epoch": 1.253968253968254, | |
| "grad_norm": 0.029153091832995415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5426021218299866, | |
| "mean_token_accuracy": 0.7783170789480209, | |
| "num_tokens": 5476465.0, | |
| "step": 336 | |
| }, | |
| { | |
| "entropy": 0.5529672205448151, | |
| "epoch": 1.257703081232493, | |
| "grad_norm": 0.03458336368203163, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540812611579895, | |
| "mean_token_accuracy": 0.7807324081659317, | |
| "num_tokens": 5492565.0, | |
| "step": 337 | |
| }, | |
| { | |
| "entropy": 0.581393301486969, | |
| "epoch": 1.261437908496732, | |
| "grad_norm": 0.031111041083931923, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5751311779022217, | |
| "mean_token_accuracy": 0.7666933685541153, | |
| "num_tokens": 5509003.0, | |
| "step": 338 | |
| }, | |
| { | |
| "entropy": 0.5588483065366745, | |
| "epoch": 1.265172735760971, | |
| "grad_norm": 0.030144309625029564, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5589640140533447, | |
| "mean_token_accuracy": 0.7755171656608582, | |
| "num_tokens": 5525262.0, | |
| "step": 339 | |
| }, | |
| { | |
| "entropy": 0.5336481779813766, | |
| "epoch": 1.26890756302521, | |
| "grad_norm": 0.03417432680726051, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5390788316726685, | |
| "mean_token_accuracy": 0.780031830072403, | |
| "num_tokens": 5541654.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 0.5282999128103256, | |
| "epoch": 1.272642390289449, | |
| "grad_norm": 0.03498517721891403, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5387616157531738, | |
| "mean_token_accuracy": 0.7800437808036804, | |
| "num_tokens": 5557983.0, | |
| "step": 341 | |
| }, | |
| { | |
| "entropy": 0.5369831025600433, | |
| "epoch": 1.276377217553688, | |
| "grad_norm": 0.029845617711544037, | |
| "learning_rate": 0.0002, | |
| "loss": 0.535378634929657, | |
| "mean_token_accuracy": 0.7823457568883896, | |
| "num_tokens": 5574311.0, | |
| "step": 342 | |
| }, | |
| { | |
| "entropy": 0.5538373440504074, | |
| "epoch": 1.280112044817927, | |
| "grad_norm": 0.027923226356506348, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5500721335411072, | |
| "mean_token_accuracy": 0.7771336436271667, | |
| "num_tokens": 5590547.0, | |
| "step": 343 | |
| }, | |
| { | |
| "entropy": 0.5545977205038071, | |
| "epoch": 1.283846872082166, | |
| "grad_norm": 0.0305513683706522, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5511223077774048, | |
| "mean_token_accuracy": 0.7757980972528458, | |
| "num_tokens": 5606717.0, | |
| "step": 344 | |
| }, | |
| { | |
| "entropy": 0.560431718826294, | |
| "epoch": 1.287581699346405, | |
| "grad_norm": 0.029267068952322006, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5540031790733337, | |
| "mean_token_accuracy": 0.7738614529371262, | |
| "num_tokens": 5623238.0, | |
| "step": 345 | |
| }, | |
| { | |
| "entropy": 0.5598475635051727, | |
| "epoch": 1.2913165266106443, | |
| "grad_norm": 0.032441407442092896, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5511676669120789, | |
| "mean_token_accuracy": 0.775727853178978, | |
| "num_tokens": 5639482.0, | |
| "step": 346 | |
| }, | |
| { | |
| "entropy": 0.532151535153389, | |
| "epoch": 1.2950513538748833, | |
| "grad_norm": 0.03496084734797478, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5387351512908936, | |
| "mean_token_accuracy": 0.7811897695064545, | |
| "num_tokens": 5655745.0, | |
| "step": 347 | |
| }, | |
| { | |
| "entropy": 0.5362464487552643, | |
| "epoch": 1.2987861811391224, | |
| "grad_norm": 0.03774246945977211, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5451931953430176, | |
| "mean_token_accuracy": 0.7775505632162094, | |
| "num_tokens": 5672305.0, | |
| "step": 348 | |
| }, | |
| { | |
| "entropy": 0.5285972878336906, | |
| "epoch": 1.3025210084033614, | |
| "grad_norm": 0.0332336500287056, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5353838801383972, | |
| "mean_token_accuracy": 0.7838114500045776, | |
| "num_tokens": 5688630.0, | |
| "step": 349 | |
| }, | |
| { | |
| "entropy": 0.5421172678470612, | |
| "epoch": 1.3062558356676004, | |
| "grad_norm": 0.03457598015666008, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392417311668396, | |
| "mean_token_accuracy": 0.7807410657405853, | |
| "num_tokens": 5705054.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.5382883250713348, | |
| "epoch": 1.3099906629318394, | |
| "grad_norm": 0.031050430610775948, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5347834825515747, | |
| "mean_token_accuracy": 0.7828159481287003, | |
| "num_tokens": 5721382.0, | |
| "step": 351 | |
| }, | |
| { | |
| "entropy": 0.550368145108223, | |
| "epoch": 1.3137254901960784, | |
| "grad_norm": 0.03463875129818916, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5514199137687683, | |
| "mean_token_accuracy": 0.7735539227724075, | |
| "num_tokens": 5737730.0, | |
| "step": 352 | |
| }, | |
| { | |
| "entropy": 0.538982629776001, | |
| "epoch": 1.3174603174603174, | |
| "grad_norm": 0.03956155851483345, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5469655990600586, | |
| "mean_token_accuracy": 0.7747407406568527, | |
| "num_tokens": 5753795.0, | |
| "step": 353 | |
| }, | |
| { | |
| "entropy": 0.5339585244655609, | |
| "epoch": 1.3211951447245565, | |
| "grad_norm": 0.029367057606577873, | |
| "learning_rate": 0.0002, | |
| "loss": 0.536923348903656, | |
| "mean_token_accuracy": 0.7791249603033066, | |
| "num_tokens": 5770100.0, | |
| "step": 354 | |
| }, | |
| { | |
| "entropy": 0.5469655245542526, | |
| "epoch": 1.3249299719887955, | |
| "grad_norm": 0.044070687144994736, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5485926270484924, | |
| "mean_token_accuracy": 0.7760020345449448, | |
| "num_tokens": 5786242.0, | |
| "step": 355 | |
| }, | |
| { | |
| "entropy": 0.5686767846345901, | |
| "epoch": 1.3286647992530345, | |
| "grad_norm": 0.0298174861818552, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5646032691001892, | |
| "mean_token_accuracy": 0.7700935900211334, | |
| "num_tokens": 5802594.0, | |
| "step": 356 | |
| }, | |
| { | |
| "entropy": 0.5524211078882217, | |
| "epoch": 1.3323996265172735, | |
| "grad_norm": 0.03443749621510506, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5538625717163086, | |
| "mean_token_accuracy": 0.7730942517518997, | |
| "num_tokens": 5818733.0, | |
| "step": 357 | |
| }, | |
| { | |
| "entropy": 0.5450694710016251, | |
| "epoch": 1.3361344537815127, | |
| "grad_norm": 0.042639389634132385, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5457915663719177, | |
| "mean_token_accuracy": 0.7793462425470352, | |
| "num_tokens": 5834966.0, | |
| "step": 358 | |
| }, | |
| { | |
| "entropy": 0.5628755837678909, | |
| "epoch": 1.3398692810457518, | |
| "grad_norm": 0.031939953565597534, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5615131855010986, | |
| "mean_token_accuracy": 0.7720433920621872, | |
| "num_tokens": 5851352.0, | |
| "step": 359 | |
| }, | |
| { | |
| "entropy": 0.5299947410821915, | |
| "epoch": 1.3436041083099908, | |
| "grad_norm": 0.03047833777964115, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5295021533966064, | |
| "mean_token_accuracy": 0.7874699085950851, | |
| "num_tokens": 5867820.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 0.5308109223842621, | |
| "epoch": 1.3473389355742298, | |
| "grad_norm": 0.032848697155714035, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5431129336357117, | |
| "mean_token_accuracy": 0.7857107818126678, | |
| "num_tokens": 5883984.0, | |
| "step": 361 | |
| }, | |
| { | |
| "entropy": 0.5426601469516754, | |
| "epoch": 1.3510737628384688, | |
| "grad_norm": 0.033830493688583374, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5514194965362549, | |
| "mean_token_accuracy": 0.77635657787323, | |
| "num_tokens": 5900290.0, | |
| "step": 362 | |
| }, | |
| { | |
| "entropy": 0.5411643236875534, | |
| "epoch": 1.3548085901027078, | |
| "grad_norm": 0.029694274067878723, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5333205461502075, | |
| "mean_token_accuracy": 0.7832283675670624, | |
| "num_tokens": 5916469.0, | |
| "step": 363 | |
| }, | |
| { | |
| "entropy": 0.5501731634140015, | |
| "epoch": 1.3585434173669468, | |
| "grad_norm": 0.03007029928267002, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5431393980979919, | |
| "mean_token_accuracy": 0.7804041355848312, | |
| "num_tokens": 5932693.0, | |
| "step": 364 | |
| }, | |
| { | |
| "entropy": 0.5419217795133591, | |
| "epoch": 1.3622782446311859, | |
| "grad_norm": 0.030986929312348366, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5391764044761658, | |
| "mean_token_accuracy": 0.7810684144496918, | |
| "num_tokens": 5949053.0, | |
| "step": 365 | |
| }, | |
| { | |
| "entropy": 0.529257670044899, | |
| "epoch": 1.3660130718954249, | |
| "grad_norm": 0.0282028466463089, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5282759666442871, | |
| "mean_token_accuracy": 0.7846860438585281, | |
| "num_tokens": 5965428.0, | |
| "step": 366 | |
| }, | |
| { | |
| "entropy": 0.5425796508789062, | |
| "epoch": 1.3697478991596639, | |
| "grad_norm": 0.03842358663678169, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5492331981658936, | |
| "mean_token_accuracy": 0.7747556120157242, | |
| "num_tokens": 5981730.0, | |
| "step": 367 | |
| }, | |
| { | |
| "entropy": 0.5349410325288773, | |
| "epoch": 1.373482726423903, | |
| "grad_norm": 0.033598389476537704, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5436474084854126, | |
| "mean_token_accuracy": 0.7797878831624985, | |
| "num_tokens": 5997949.0, | |
| "step": 368 | |
| }, | |
| { | |
| "entropy": 0.552407756447792, | |
| "epoch": 1.377217553688142, | |
| "grad_norm": 0.03342469781637192, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5567049980163574, | |
| "mean_token_accuracy": 0.7723858207464218, | |
| "num_tokens": 6014178.0, | |
| "step": 369 | |
| }, | |
| { | |
| "entropy": 0.5454883426427841, | |
| "epoch": 1.380952380952381, | |
| "grad_norm": 0.03550714999437332, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5418342351913452, | |
| "mean_token_accuracy": 0.7798961699008942, | |
| "num_tokens": 6030806.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 0.552109032869339, | |
| "epoch": 1.38468720821662, | |
| "grad_norm": 0.03026903234422207, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456339120864868, | |
| "mean_token_accuracy": 0.7773927599191666, | |
| "num_tokens": 6046782.0, | |
| "step": 371 | |
| }, | |
| { | |
| "entropy": 0.5603116452693939, | |
| "epoch": 1.388422035480859, | |
| "grad_norm": 0.03449714556336403, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5605192184448242, | |
| "mean_token_accuracy": 0.7709443271160126, | |
| "num_tokens": 6063178.0, | |
| "step": 372 | |
| }, | |
| { | |
| "entropy": 0.5442145317792892, | |
| "epoch": 1.392156862745098, | |
| "grad_norm": 0.03407449275255203, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5482808947563171, | |
| "mean_token_accuracy": 0.7804455161094666, | |
| "num_tokens": 6079813.0, | |
| "step": 373 | |
| }, | |
| { | |
| "entropy": 0.5443685501813889, | |
| "epoch": 1.395891690009337, | |
| "grad_norm": 0.03118809685111046, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5504392385482788, | |
| "mean_token_accuracy": 0.7759056687355042, | |
| "num_tokens": 6096208.0, | |
| "step": 374 | |
| }, | |
| { | |
| "entropy": 0.5544550269842148, | |
| "epoch": 1.399626517273576, | |
| "grad_norm": 0.03532007709145546, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5569352507591248, | |
| "mean_token_accuracy": 0.7748352587223053, | |
| "num_tokens": 6112356.0, | |
| "step": 375 | |
| }, | |
| { | |
| "entropy": 0.5439307242631912, | |
| "epoch": 1.403361344537815, | |
| "grad_norm": 0.0334586501121521, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542488694190979, | |
| "mean_token_accuracy": 0.777744397521019, | |
| "num_tokens": 6128800.0, | |
| "step": 376 | |
| }, | |
| { | |
| "entropy": 0.5407049357891083, | |
| "epoch": 1.407096171802054, | |
| "grad_norm": 0.029349738731980324, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5370444655418396, | |
| "mean_token_accuracy": 0.7816447019577026, | |
| "num_tokens": 6145053.0, | |
| "step": 377 | |
| }, | |
| { | |
| "entropy": 0.5527060329914093, | |
| "epoch": 1.410830999066293, | |
| "grad_norm": 0.030373841524124146, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5530543327331543, | |
| "mean_token_accuracy": 0.775768980383873, | |
| "num_tokens": 6161518.0, | |
| "step": 378 | |
| }, | |
| { | |
| "entropy": 0.5383686721324921, | |
| "epoch": 1.4145658263305323, | |
| "grad_norm": 0.033442895859479904, | |
| "learning_rate": 0.0002, | |
| "loss": 0.539923369884491, | |
| "mean_token_accuracy": 0.7825078517198563, | |
| "num_tokens": 6177817.0, | |
| "step": 379 | |
| }, | |
| { | |
| "entropy": 0.5557737052440643, | |
| "epoch": 1.4183006535947713, | |
| "grad_norm": 0.03396908566355705, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5632482767105103, | |
| "mean_token_accuracy": 0.7692397683858871, | |
| "num_tokens": 6194312.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 0.5457819253206253, | |
| "epoch": 1.4220354808590103, | |
| "grad_norm": 0.02866293303668499, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5467988848686218, | |
| "mean_token_accuracy": 0.7775601893663406, | |
| "num_tokens": 6210818.0, | |
| "step": 381 | |
| }, | |
| { | |
| "entropy": 0.5640534311532974, | |
| "epoch": 1.4257703081232493, | |
| "grad_norm": 0.027476362884044647, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5636141896247864, | |
| "mean_token_accuracy": 0.7717417329549789, | |
| "num_tokens": 6227080.0, | |
| "step": 382 | |
| }, | |
| { | |
| "entropy": 0.560546487569809, | |
| "epoch": 1.4295051353874884, | |
| "grad_norm": 0.030654683709144592, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5566866397857666, | |
| "mean_token_accuracy": 0.7725766897201538, | |
| "num_tokens": 6243654.0, | |
| "step": 383 | |
| }, | |
| { | |
| "entropy": 0.5566196143627167, | |
| "epoch": 1.4332399626517274, | |
| "grad_norm": 0.03377790376543999, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5511550903320312, | |
| "mean_token_accuracy": 0.7775295376777649, | |
| "num_tokens": 6259998.0, | |
| "step": 384 | |
| }, | |
| { | |
| "entropy": 0.5302538275718689, | |
| "epoch": 1.4369747899159664, | |
| "grad_norm": 0.028172362595796585, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5359051823616028, | |
| "mean_token_accuracy": 0.7816868871450424, | |
| "num_tokens": 6276398.0, | |
| "step": 385 | |
| }, | |
| { | |
| "entropy": 0.543848991394043, | |
| "epoch": 1.4407096171802054, | |
| "grad_norm": 0.03123684599995613, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5530490875244141, | |
| "mean_token_accuracy": 0.7756175249814987, | |
| "num_tokens": 6292623.0, | |
| "step": 386 | |
| }, | |
| { | |
| "entropy": 0.5351638197898865, | |
| "epoch": 1.4444444444444444, | |
| "grad_norm": 0.032041870057582855, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5453383326530457, | |
| "mean_token_accuracy": 0.7787481844425201, | |
| "num_tokens": 6308980.0, | |
| "step": 387 | |
| }, | |
| { | |
| "entropy": 0.5499856919050217, | |
| "epoch": 1.4481792717086834, | |
| "grad_norm": 0.03275283798575401, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5510199666023254, | |
| "mean_token_accuracy": 0.7770793437957764, | |
| "num_tokens": 6325352.0, | |
| "step": 388 | |
| }, | |
| { | |
| "entropy": 0.5473773032426834, | |
| "epoch": 1.4519140989729225, | |
| "grad_norm": 0.02793571725487709, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540398120880127, | |
| "mean_token_accuracy": 0.7805086821317673, | |
| "num_tokens": 6341686.0, | |
| "step": 389 | |
| }, | |
| { | |
| "entropy": 0.553907573223114, | |
| "epoch": 1.4556489262371615, | |
| "grad_norm": 0.02763449028134346, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5470324754714966, | |
| "mean_token_accuracy": 0.7763955593109131, | |
| "num_tokens": 6358367.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 0.54300856590271, | |
| "epoch": 1.4593837535014005, | |
| "grad_norm": 0.0320272259414196, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5394243001937866, | |
| "mean_token_accuracy": 0.7796929031610489, | |
| "num_tokens": 6374332.0, | |
| "step": 391 | |
| }, | |
| { | |
| "entropy": 0.5419201552867889, | |
| "epoch": 1.4631185807656397, | |
| "grad_norm": 0.029694141820073128, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459417104721069, | |
| "mean_token_accuracy": 0.7794879227876663, | |
| "num_tokens": 6390817.0, | |
| "step": 392 | |
| }, | |
| { | |
| "entropy": 0.533346489071846, | |
| "epoch": 1.4668534080298787, | |
| "grad_norm": 0.031921736896038055, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5339134335517883, | |
| "mean_token_accuracy": 0.7845402210950851, | |
| "num_tokens": 6407105.0, | |
| "step": 393 | |
| }, | |
| { | |
| "entropy": 0.5490029752254486, | |
| "epoch": 1.4705882352941178, | |
| "grad_norm": 0.031292662024497986, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5461300611495972, | |
| "mean_token_accuracy": 0.7792785912752151, | |
| "num_tokens": 6423432.0, | |
| "step": 394 | |
| }, | |
| { | |
| "entropy": 0.5407290160655975, | |
| "epoch": 1.4743230625583568, | |
| "grad_norm": 0.029509229585528374, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5409979224205017, | |
| "mean_token_accuracy": 0.7798801958560944, | |
| "num_tokens": 6440111.0, | |
| "step": 395 | |
| }, | |
| { | |
| "entropy": 0.5352925509214401, | |
| "epoch": 1.4780578898225958, | |
| "grad_norm": 0.03132627159357071, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5360226035118103, | |
| "mean_token_accuracy": 0.7835162281990051, | |
| "num_tokens": 6456553.0, | |
| "step": 396 | |
| }, | |
| { | |
| "entropy": 0.5409245789051056, | |
| "epoch": 1.4817927170868348, | |
| "grad_norm": 0.032262932509183884, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5367339253425598, | |
| "mean_token_accuracy": 0.779682844877243, | |
| "num_tokens": 6472831.0, | |
| "step": 397 | |
| }, | |
| { | |
| "entropy": 0.5202168971300125, | |
| "epoch": 1.4855275443510738, | |
| "grad_norm": 0.033896930515766144, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5268123149871826, | |
| "mean_token_accuracy": 0.7819826900959015, | |
| "num_tokens": 6488931.0, | |
| "step": 398 | |
| }, | |
| { | |
| "entropy": 0.5325956791639328, | |
| "epoch": 1.4892623716153128, | |
| "grad_norm": 0.03540036827325821, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5433887839317322, | |
| "mean_token_accuracy": 0.778034120798111, | |
| "num_tokens": 6505354.0, | |
| "step": 399 | |
| }, | |
| { | |
| "entropy": 0.5327711254358292, | |
| "epoch": 1.4929971988795518, | |
| "grad_norm": 0.02958959899842739, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5335476398468018, | |
| "mean_token_accuracy": 0.7828179448843002, | |
| "num_tokens": 6521544.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.5357908606529236, | |
| "epoch": 1.4967320261437909, | |
| "grad_norm": 0.027617521584033966, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5293720364570618, | |
| "mean_token_accuracy": 0.7868403792381287, | |
| "num_tokens": 6537889.0, | |
| "step": 401 | |
| }, | |
| { | |
| "entropy": 0.5473283380270004, | |
| "epoch": 1.5004668534080299, | |
| "grad_norm": 0.028360038995742798, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5436528325080872, | |
| "mean_token_accuracy": 0.7810066491365433, | |
| "num_tokens": 6554149.0, | |
| "step": 402 | |
| }, | |
| { | |
| "entropy": 0.5518513321876526, | |
| "epoch": 1.504201680672269, | |
| "grad_norm": 0.031041931360960007, | |
| "learning_rate": 0.0002, | |
| "loss": 0.545119047164917, | |
| "mean_token_accuracy": 0.7779288738965988, | |
| "num_tokens": 6570521.0, | |
| "step": 403 | |
| }, | |
| { | |
| "entropy": 0.5428237915039062, | |
| "epoch": 1.507936507936508, | |
| "grad_norm": 0.032197825610637665, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5472823977470398, | |
| "mean_token_accuracy": 0.7758528888225555, | |
| "num_tokens": 6587086.0, | |
| "step": 404 | |
| }, | |
| { | |
| "entropy": 0.5483403950929642, | |
| "epoch": 1.511671335200747, | |
| "grad_norm": 0.03174825757741928, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5524789094924927, | |
| "mean_token_accuracy": 0.7772649824619293, | |
| "num_tokens": 6603513.0, | |
| "step": 405 | |
| }, | |
| { | |
| "entropy": 0.5337469726800919, | |
| "epoch": 1.515406162464986, | |
| "grad_norm": 0.03365413472056389, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5418713688850403, | |
| "mean_token_accuracy": 0.7772432267665863, | |
| "num_tokens": 6619737.0, | |
| "step": 406 | |
| }, | |
| { | |
| "entropy": 0.5614880919456482, | |
| "epoch": 1.519140989729225, | |
| "grad_norm": 0.030781377106904984, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5604795217514038, | |
| "mean_token_accuracy": 0.7718411535024643, | |
| "num_tokens": 6636097.0, | |
| "step": 407 | |
| }, | |
| { | |
| "entropy": 0.5390657633543015, | |
| "epoch": 1.522875816993464, | |
| "grad_norm": 0.02782733179628849, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5329728126525879, | |
| "mean_token_accuracy": 0.7839234322309494, | |
| "num_tokens": 6652406.0, | |
| "step": 408 | |
| }, | |
| { | |
| "entropy": 0.5573919266462326, | |
| "epoch": 1.526610644257703, | |
| "grad_norm": 0.027401108294725418, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5554807186126709, | |
| "mean_token_accuracy": 0.7726366519927979, | |
| "num_tokens": 6668812.0, | |
| "step": 409 | |
| }, | |
| { | |
| "entropy": 0.5391197204589844, | |
| "epoch": 1.530345471521942, | |
| "grad_norm": 0.03163023665547371, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407525897026062, | |
| "mean_token_accuracy": 0.7810121178627014, | |
| "num_tokens": 6685040.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 0.5353195369243622, | |
| "epoch": 1.534080298786181, | |
| "grad_norm": 0.026917260140180588, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5328407883644104, | |
| "mean_token_accuracy": 0.7829948961734772, | |
| "num_tokens": 6701433.0, | |
| "step": 411 | |
| }, | |
| { | |
| "entropy": 0.5223068818449974, | |
| "epoch": 1.53781512605042, | |
| "grad_norm": 0.03261617571115494, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5255942344665527, | |
| "mean_token_accuracy": 0.785964623093605, | |
| "num_tokens": 6717710.0, | |
| "step": 412 | |
| }, | |
| { | |
| "entropy": 0.5453132838010788, | |
| "epoch": 1.541549953314659, | |
| "grad_norm": 0.03235824778676033, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5518944263458252, | |
| "mean_token_accuracy": 0.7770064026117325, | |
| "num_tokens": 6733942.0, | |
| "step": 413 | |
| }, | |
| { | |
| "entropy": 0.5489854216575623, | |
| "epoch": 1.545284780578898, | |
| "grad_norm": 0.02913379855453968, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5539657473564148, | |
| "mean_token_accuracy": 0.7730102986097336, | |
| "num_tokens": 6749978.0, | |
| "step": 414 | |
| }, | |
| { | |
| "entropy": 0.5504709929227829, | |
| "epoch": 1.5490196078431373, | |
| "grad_norm": 0.03497619554400444, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5534422397613525, | |
| "mean_token_accuracy": 0.7738368958234787, | |
| "num_tokens": 6766386.0, | |
| "step": 415 | |
| }, | |
| { | |
| "entropy": 0.5360163599252701, | |
| "epoch": 1.5527544351073763, | |
| "grad_norm": 0.03147003799676895, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5354920625686646, | |
| "mean_token_accuracy": 0.7844124883413315, | |
| "num_tokens": 6782497.0, | |
| "step": 416 | |
| }, | |
| { | |
| "entropy": 0.5680203884840012, | |
| "epoch": 1.5564892623716153, | |
| "grad_norm": 0.030537011101841927, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5605371594429016, | |
| "mean_token_accuracy": 0.772536501288414, | |
| "num_tokens": 6799059.0, | |
| "step": 417 | |
| }, | |
| { | |
| "entropy": 0.5505528301000595, | |
| "epoch": 1.5602240896358543, | |
| "grad_norm": 0.028710143640637398, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5522081255912781, | |
| "mean_token_accuracy": 0.7738733440637589, | |
| "num_tokens": 6815363.0, | |
| "step": 418 | |
| }, | |
| { | |
| "entropy": 0.5502945929765701, | |
| "epoch": 1.5639589169000934, | |
| "grad_norm": 0.0320894755423069, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5519194006919861, | |
| "mean_token_accuracy": 0.775145635008812, | |
| "num_tokens": 6831823.0, | |
| "step": 419 | |
| }, | |
| { | |
| "entropy": 0.5572039783000946, | |
| "epoch": 1.5676937441643324, | |
| "grad_norm": 0.028658481314778328, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568941831588745, | |
| "mean_token_accuracy": 0.7728902250528336, | |
| "num_tokens": 6848346.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 0.5431763082742691, | |
| "epoch": 1.5714285714285714, | |
| "grad_norm": 0.027273258194327354, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424181818962097, | |
| "mean_token_accuracy": 0.7814328521490097, | |
| "num_tokens": 6864537.0, | |
| "step": 421 | |
| }, | |
| { | |
| "entropy": 0.5466543883085251, | |
| "epoch": 1.5751633986928104, | |
| "grad_norm": 0.02875494956970215, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5450119972229004, | |
| "mean_token_accuracy": 0.7765506953001022, | |
| "num_tokens": 6881053.0, | |
| "step": 422 | |
| }, | |
| { | |
| "entropy": 0.5499023944139481, | |
| "epoch": 1.5788982259570497, | |
| "grad_norm": 0.02958599291741848, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5486996173858643, | |
| "mean_token_accuracy": 0.778396338224411, | |
| "num_tokens": 6897409.0, | |
| "step": 423 | |
| }, | |
| { | |
| "entropy": 0.5387710481882095, | |
| "epoch": 1.5826330532212887, | |
| "grad_norm": 0.030644621700048447, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5404931306838989, | |
| "mean_token_accuracy": 0.7786550223827362, | |
| "num_tokens": 6913681.0, | |
| "step": 424 | |
| }, | |
| { | |
| "entropy": 0.5346106290817261, | |
| "epoch": 1.5863678804855277, | |
| "grad_norm": 0.028904983773827553, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5413768887519836, | |
| "mean_token_accuracy": 0.7797856479883194, | |
| "num_tokens": 6930096.0, | |
| "step": 425 | |
| }, | |
| { | |
| "entropy": 0.5166824460029602, | |
| "epoch": 1.5901027077497667, | |
| "grad_norm": 0.03321892023086548, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5238149166107178, | |
| "mean_token_accuracy": 0.7857634872198105, | |
| "num_tokens": 6946449.0, | |
| "step": 426 | |
| }, | |
| { | |
| "entropy": 0.5426425486803055, | |
| "epoch": 1.5938375350140057, | |
| "grad_norm": 0.030873097479343414, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5491586923599243, | |
| "mean_token_accuracy": 0.7750476896762848, | |
| "num_tokens": 6962805.0, | |
| "step": 427 | |
| }, | |
| { | |
| "entropy": 0.555439367890358, | |
| "epoch": 1.5975723622782447, | |
| "grad_norm": 0.030430428683757782, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5504173040390015, | |
| "mean_token_accuracy": 0.7780658453702927, | |
| "num_tokens": 6979378.0, | |
| "step": 428 | |
| }, | |
| { | |
| "entropy": 0.5425661355257034, | |
| "epoch": 1.6013071895424837, | |
| "grad_norm": 0.033183399587869644, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5338144302368164, | |
| "mean_token_accuracy": 0.7815939337015152, | |
| "num_tokens": 6995576.0, | |
| "step": 429 | |
| }, | |
| { | |
| "entropy": 0.5580693334341049, | |
| "epoch": 1.6050420168067228, | |
| "grad_norm": 0.02936139702796936, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5471250414848328, | |
| "mean_token_accuracy": 0.7805830985307693, | |
| "num_tokens": 7011887.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 0.5445709973573685, | |
| "epoch": 1.6087768440709618, | |
| "grad_norm": 0.029686426743865013, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5449705719947815, | |
| "mean_token_accuracy": 0.7791666090488434, | |
| "num_tokens": 7028245.0, | |
| "step": 431 | |
| }, | |
| { | |
| "entropy": 0.5352734625339508, | |
| "epoch": 1.6125116713352008, | |
| "grad_norm": 0.0335598923265934, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456029772758484, | |
| "mean_token_accuracy": 0.7778525203466415, | |
| "num_tokens": 7044490.0, | |
| "step": 432 | |
| }, | |
| { | |
| "entropy": 0.548936665058136, | |
| "epoch": 1.6162464985994398, | |
| "grad_norm": 0.03590673953294754, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5520269870758057, | |
| "mean_token_accuracy": 0.7742140144109726, | |
| "num_tokens": 7060917.0, | |
| "step": 433 | |
| }, | |
| { | |
| "entropy": 0.5434507131576538, | |
| "epoch": 1.6199813258636788, | |
| "grad_norm": 0.028407955542206764, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5414606332778931, | |
| "mean_token_accuracy": 0.778694823384285, | |
| "num_tokens": 7077100.0, | |
| "step": 434 | |
| }, | |
| { | |
| "entropy": 0.5490714907646179, | |
| "epoch": 1.6237161531279178, | |
| "grad_norm": 0.0324469618499279, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5481012463569641, | |
| "mean_token_accuracy": 0.7763958275318146, | |
| "num_tokens": 7093665.0, | |
| "step": 435 | |
| }, | |
| { | |
| "entropy": 0.5379714071750641, | |
| "epoch": 1.6274509803921569, | |
| "grad_norm": 0.030424365773797035, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5396856665611267, | |
| "mean_token_accuracy": 0.7815098166465759, | |
| "num_tokens": 7110174.0, | |
| "step": 436 | |
| }, | |
| { | |
| "entropy": 0.5480812042951584, | |
| "epoch": 1.6311858076563959, | |
| "grad_norm": 0.029105886816978455, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5511510372161865, | |
| "mean_token_accuracy": 0.7754542678594589, | |
| "num_tokens": 7126486.0, | |
| "step": 437 | |
| }, | |
| { | |
| "entropy": 0.5540740191936493, | |
| "epoch": 1.6349206349206349, | |
| "grad_norm": 0.027599727734923363, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5574399828910828, | |
| "mean_token_accuracy": 0.7723194360733032, | |
| "num_tokens": 7143064.0, | |
| "step": 438 | |
| }, | |
| { | |
| "entropy": 0.5382533967494965, | |
| "epoch": 1.638655462184874, | |
| "grad_norm": 0.02985025756061077, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542414665222168, | |
| "mean_token_accuracy": 0.7797781080007553, | |
| "num_tokens": 7159194.0, | |
| "step": 439 | |
| }, | |
| { | |
| "entropy": 0.545093446969986, | |
| "epoch": 1.642390289449113, | |
| "grad_norm": 0.033221568912267685, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5397443771362305, | |
| "mean_token_accuracy": 0.781465008854866, | |
| "num_tokens": 7175448.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 0.547942727804184, | |
| "epoch": 1.646125116713352, | |
| "grad_norm": 0.030130675062537193, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5471298098564148, | |
| "mean_token_accuracy": 0.7778923958539963, | |
| "num_tokens": 7191951.0, | |
| "step": 441 | |
| }, | |
| { | |
| "entropy": 0.5388812720775604, | |
| "epoch": 1.649859943977591, | |
| "grad_norm": 0.03608401492238045, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5405545234680176, | |
| "mean_token_accuracy": 0.7795072197914124, | |
| "num_tokens": 7208082.0, | |
| "step": 442 | |
| }, | |
| { | |
| "entropy": 0.5480445921421051, | |
| "epoch": 1.65359477124183, | |
| "grad_norm": 0.03251367062330246, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5486726760864258, | |
| "mean_token_accuracy": 0.7771764397621155, | |
| "num_tokens": 7224432.0, | |
| "step": 443 | |
| }, | |
| { | |
| "entropy": 0.5502856224775314, | |
| "epoch": 1.657329598506069, | |
| "grad_norm": 0.03557496517896652, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5455541014671326, | |
| "mean_token_accuracy": 0.7788678556680679, | |
| "num_tokens": 7241112.0, | |
| "step": 444 | |
| }, | |
| { | |
| "entropy": 0.5650181323289871, | |
| "epoch": 1.661064425770308, | |
| "grad_norm": 0.036821287125349045, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5659928321838379, | |
| "mean_token_accuracy": 0.7705142349004745, | |
| "num_tokens": 7257646.0, | |
| "step": 445 | |
| }, | |
| { | |
| "entropy": 0.5301887840032578, | |
| "epoch": 1.664799253034547, | |
| "grad_norm": 0.028849398717284203, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5311304926872253, | |
| "mean_token_accuracy": 0.7853154540061951, | |
| "num_tokens": 7273883.0, | |
| "step": 446 | |
| }, | |
| { | |
| "entropy": 0.5287686139345169, | |
| "epoch": 1.668534080298786, | |
| "grad_norm": 0.027796290814876556, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5300359129905701, | |
| "mean_token_accuracy": 0.7818829715251923, | |
| "num_tokens": 7290094.0, | |
| "step": 447 | |
| }, | |
| { | |
| "entropy": 0.5384389162063599, | |
| "epoch": 1.6722689075630253, | |
| "grad_norm": 0.03137550130486488, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5358840227127075, | |
| "mean_token_accuracy": 0.7822984606027603, | |
| "num_tokens": 7306318.0, | |
| "step": 448 | |
| }, | |
| { | |
| "entropy": 0.5409219712018967, | |
| "epoch": 1.6760037348272643, | |
| "grad_norm": 0.03238392993807793, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5490888357162476, | |
| "mean_token_accuracy": 0.7757006883621216, | |
| "num_tokens": 7322518.0, | |
| "step": 449 | |
| }, | |
| { | |
| "entropy": 0.5399473458528519, | |
| "epoch": 1.6797385620915033, | |
| "grad_norm": 0.03108685463666916, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5397608876228333, | |
| "mean_token_accuracy": 0.7774724215269089, | |
| "num_tokens": 7338931.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.5551822930574417, | |
| "epoch": 1.6834733893557423, | |
| "grad_norm": 0.02780800126492977, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5481570959091187, | |
| "mean_token_accuracy": 0.7780963182449341, | |
| "num_tokens": 7355336.0, | |
| "step": 451 | |
| }, | |
| { | |
| "entropy": 0.54237399995327, | |
| "epoch": 1.6872082166199813, | |
| "grad_norm": 0.04012434557080269, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5462750792503357, | |
| "mean_token_accuracy": 0.7741427570581436, | |
| "num_tokens": 7371655.0, | |
| "step": 452 | |
| }, | |
| { | |
| "entropy": 0.5476243197917938, | |
| "epoch": 1.6909430438842203, | |
| "grad_norm": 0.031238745898008347, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5490629076957703, | |
| "mean_token_accuracy": 0.7778069078922272, | |
| "num_tokens": 7387779.0, | |
| "step": 453 | |
| }, | |
| { | |
| "entropy": 0.5370198786258698, | |
| "epoch": 1.6946778711484594, | |
| "grad_norm": 0.0672907754778862, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5387383699417114, | |
| "mean_token_accuracy": 0.7835952490568161, | |
| "num_tokens": 7404160.0, | |
| "step": 454 | |
| }, | |
| { | |
| "entropy": 0.5476315915584564, | |
| "epoch": 1.6984126984126984, | |
| "grad_norm": 0.029196592047810555, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5511754751205444, | |
| "mean_token_accuracy": 0.7767634838819504, | |
| "num_tokens": 7420779.0, | |
| "step": 455 | |
| }, | |
| { | |
| "entropy": 0.5495481044054031, | |
| "epoch": 1.7021475256769374, | |
| "grad_norm": 0.03591341897845268, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5475634336471558, | |
| "mean_token_accuracy": 0.7761732786893845, | |
| "num_tokens": 7437268.0, | |
| "step": 456 | |
| }, | |
| { | |
| "entropy": 0.5471929609775543, | |
| "epoch": 1.7058823529411766, | |
| "grad_norm": 0.07272505015134811, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5460875630378723, | |
| "mean_token_accuracy": 0.7771887481212616, | |
| "num_tokens": 7453407.0, | |
| "step": 457 | |
| }, | |
| { | |
| "entropy": 0.5470087379217148, | |
| "epoch": 1.7096171802054156, | |
| "grad_norm": 0.027592960745096207, | |
| "learning_rate": 0.0002, | |
| "loss": 0.544583797454834, | |
| "mean_token_accuracy": 0.7774143517017365, | |
| "num_tokens": 7469641.0, | |
| "step": 458 | |
| }, | |
| { | |
| "entropy": 0.5607744753360748, | |
| "epoch": 1.7133520074696547, | |
| "grad_norm": 0.031071651726961136, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5542961955070496, | |
| "mean_token_accuracy": 0.7748319655656815, | |
| "num_tokens": 7486190.0, | |
| "step": 459 | |
| }, | |
| { | |
| "entropy": 0.5514983385801315, | |
| "epoch": 1.7170868347338937, | |
| "grad_norm": 0.03477690741419792, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5511950254440308, | |
| "mean_token_accuracy": 0.7754039019346237, | |
| "num_tokens": 7502685.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 0.5462844371795654, | |
| "epoch": 1.7208216619981327, | |
| "grad_norm": 0.02956387773156166, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5578323602676392, | |
| "mean_token_accuracy": 0.7759933173656464, | |
| "num_tokens": 7518976.0, | |
| "step": 461 | |
| }, | |
| { | |
| "entropy": 0.5413178950548172, | |
| "epoch": 1.7245564892623717, | |
| "grad_norm": 0.03515993058681488, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5494832992553711, | |
| "mean_token_accuracy": 0.7766997069120407, | |
| "num_tokens": 7535230.0, | |
| "step": 462 | |
| }, | |
| { | |
| "entropy": 0.5519613027572632, | |
| "epoch": 1.7282913165266107, | |
| "grad_norm": 0.03921071067452431, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5593541860580444, | |
| "mean_token_accuracy": 0.7729771286249161, | |
| "num_tokens": 7551766.0, | |
| "step": 463 | |
| }, | |
| { | |
| "entropy": 0.5483202934265137, | |
| "epoch": 1.7320261437908497, | |
| "grad_norm": 0.02950095944106579, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5464847683906555, | |
| "mean_token_accuracy": 0.7769839763641357, | |
| "num_tokens": 7568028.0, | |
| "step": 464 | |
| }, | |
| { | |
| "entropy": 0.5524065643548965, | |
| "epoch": 1.7357609710550888, | |
| "grad_norm": 0.038918618112802505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422624945640564, | |
| "mean_token_accuracy": 0.7797468602657318, | |
| "num_tokens": 7584397.0, | |
| "step": 465 | |
| }, | |
| { | |
| "entropy": 0.546732097864151, | |
| "epoch": 1.7394957983193278, | |
| "grad_norm": 0.03082694672048092, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5352342128753662, | |
| "mean_token_accuracy": 0.78376704454422, | |
| "num_tokens": 7600719.0, | |
| "step": 466 | |
| }, | |
| { | |
| "entropy": 0.557578444480896, | |
| "epoch": 1.7432306255835668, | |
| "grad_norm": 0.031017586588859558, | |
| "learning_rate": 0.0002, | |
| "loss": 0.54631108045578, | |
| "mean_token_accuracy": 0.7787049263715744, | |
| "num_tokens": 7617277.0, | |
| "step": 467 | |
| }, | |
| { | |
| "entropy": 0.5322857201099396, | |
| "epoch": 1.7469654528478058, | |
| "grad_norm": 0.0356813408434391, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5350920557975769, | |
| "mean_token_accuracy": 0.7820670753717422, | |
| "num_tokens": 7633468.0, | |
| "step": 468 | |
| }, | |
| { | |
| "entropy": 0.5373670607805252, | |
| "epoch": 1.7507002801120448, | |
| "grad_norm": 0.0339689627289772, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5516907572746277, | |
| "mean_token_accuracy": 0.7766174525022507, | |
| "num_tokens": 7649778.0, | |
| "step": 469 | |
| }, | |
| { | |
| "entropy": 0.522003561258316, | |
| "epoch": 1.7544351073762838, | |
| "grad_norm": 0.034353625029325485, | |
| "learning_rate": 0.0002, | |
| "loss": 0.533075749874115, | |
| "mean_token_accuracy": 0.7833420485258102, | |
| "num_tokens": 7666182.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 0.5592000931501389, | |
| "epoch": 1.7581699346405228, | |
| "grad_norm": 0.029966510832309723, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5585059523582458, | |
| "mean_token_accuracy": 0.7732168585062027, | |
| "num_tokens": 7682572.0, | |
| "step": 471 | |
| }, | |
| { | |
| "entropy": 0.5302631109952927, | |
| "epoch": 1.7619047619047619, | |
| "grad_norm": 0.030881982296705246, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5330703854560852, | |
| "mean_token_accuracy": 0.7825835943222046, | |
| "num_tokens": 7698564.0, | |
| "step": 472 | |
| }, | |
| { | |
| "entropy": 0.5615632385015488, | |
| "epoch": 1.7656395891690009, | |
| "grad_norm": 0.03000018559396267, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5536789298057556, | |
| "mean_token_accuracy": 0.7739888280630112, | |
| "num_tokens": 7714922.0, | |
| "step": 473 | |
| }, | |
| { | |
| "entropy": 0.5522587448358536, | |
| "epoch": 1.76937441643324, | |
| "grad_norm": 0.031349826604127884, | |
| "learning_rate": 0.0002, | |
| "loss": 0.551250696182251, | |
| "mean_token_accuracy": 0.7755384594202042, | |
| "num_tokens": 7731301.0, | |
| "step": 474 | |
| }, | |
| { | |
| "entropy": 0.5275092422962189, | |
| "epoch": 1.773109243697479, | |
| "grad_norm": 0.026553746312856674, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5240329504013062, | |
| "mean_token_accuracy": 0.7870848327875137, | |
| "num_tokens": 7747693.0, | |
| "step": 475 | |
| }, | |
| { | |
| "entropy": 0.5298073589801788, | |
| "epoch": 1.776844070961718, | |
| "grad_norm": 0.03024754300713539, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5267937183380127, | |
| "mean_token_accuracy": 0.7867465615272522, | |
| "num_tokens": 7763990.0, | |
| "step": 476 | |
| }, | |
| { | |
| "entropy": 0.5466170459985733, | |
| "epoch": 1.780578898225957, | |
| "grad_norm": 0.03677600622177124, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5455999374389648, | |
| "mean_token_accuracy": 0.7789721339941025, | |
| "num_tokens": 7780428.0, | |
| "step": 477 | |
| }, | |
| { | |
| "entropy": 0.5342886596918106, | |
| "epoch": 1.784313725490196, | |
| "grad_norm": 0.03470218554139137, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5434668660163879, | |
| "mean_token_accuracy": 0.7787842005491257, | |
| "num_tokens": 7796524.0, | |
| "step": 478 | |
| }, | |
| { | |
| "entropy": 0.5427644997835159, | |
| "epoch": 1.788048552754435, | |
| "grad_norm": 0.026957696303725243, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5418925285339355, | |
| "mean_token_accuracy": 0.7785145193338394, | |
| "num_tokens": 7813105.0, | |
| "step": 479 | |
| }, | |
| { | |
| "entropy": 0.528566911816597, | |
| "epoch": 1.791783380018674, | |
| "grad_norm": 0.037975575774908066, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5284658074378967, | |
| "mean_token_accuracy": 0.7871547490358353, | |
| "num_tokens": 7829398.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 0.5551463812589645, | |
| "epoch": 1.795518207282913, | |
| "grad_norm": 0.028514336794614792, | |
| "learning_rate": 0.0002, | |
| "loss": 0.556096076965332, | |
| "mean_token_accuracy": 0.7756756544113159, | |
| "num_tokens": 7845626.0, | |
| "step": 481 | |
| }, | |
| { | |
| "entropy": 0.5317743271589279, | |
| "epoch": 1.7992530345471522, | |
| "grad_norm": 0.03154602646827698, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5321435332298279, | |
| "mean_token_accuracy": 0.7815971374511719, | |
| "num_tokens": 7861817.0, | |
| "step": 482 | |
| }, | |
| { | |
| "entropy": 0.547456681728363, | |
| "epoch": 1.8029878618113913, | |
| "grad_norm": 0.03746788948774338, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5512088537216187, | |
| "mean_token_accuracy": 0.7785567492246628, | |
| "num_tokens": 7878075.0, | |
| "step": 483 | |
| }, | |
| { | |
| "entropy": 0.5500114560127258, | |
| "epoch": 1.8067226890756303, | |
| "grad_norm": 0.030493978410959244, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513818264007568, | |
| "mean_token_accuracy": 0.773513063788414, | |
| "num_tokens": 7894502.0, | |
| "step": 484 | |
| }, | |
| { | |
| "entropy": 0.543748289346695, | |
| "epoch": 1.8104575163398693, | |
| "grad_norm": 0.036304932087659836, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5411792993545532, | |
| "mean_token_accuracy": 0.7785163670778275, | |
| "num_tokens": 7910890.0, | |
| "step": 485 | |
| }, | |
| { | |
| "entropy": 0.5393827706575394, | |
| "epoch": 1.8141923436041083, | |
| "grad_norm": 0.03712041303515434, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540428876876831, | |
| "mean_token_accuracy": 0.7790835350751877, | |
| "num_tokens": 7927094.0, | |
| "step": 486 | |
| }, | |
| { | |
| "entropy": 0.5430537164211273, | |
| "epoch": 1.8179271708683473, | |
| "grad_norm": 0.03853759169578552, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5471268892288208, | |
| "mean_token_accuracy": 0.7801193594932556, | |
| "num_tokens": 7943326.0, | |
| "step": 487 | |
| }, | |
| { | |
| "entropy": 0.5636092722415924, | |
| "epoch": 1.8216619981325863, | |
| "grad_norm": 0.0457291305065155, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5627254843711853, | |
| "mean_token_accuracy": 0.7725824415683746, | |
| "num_tokens": 7959760.0, | |
| "step": 488 | |
| }, | |
| { | |
| "entropy": 0.543666809797287, | |
| "epoch": 1.8253968253968254, | |
| "grad_norm": 0.02919071726500988, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5421757102012634, | |
| "mean_token_accuracy": 0.7801477611064911, | |
| "num_tokens": 7975860.0, | |
| "step": 489 | |
| }, | |
| { | |
| "entropy": 0.5545783638954163, | |
| "epoch": 1.8291316526610646, | |
| "grad_norm": 0.03340514004230499, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5518795251846313, | |
| "mean_token_accuracy": 0.7791309058666229, | |
| "num_tokens": 7992340.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 0.5608565956354141, | |
| "epoch": 1.8328664799253036, | |
| "grad_norm": 0.03725928068161011, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5564695596694946, | |
| "mean_token_accuracy": 0.7749106585979462, | |
| "num_tokens": 8008783.0, | |
| "step": 491 | |
| }, | |
| { | |
| "entropy": 0.5600428581237793, | |
| "epoch": 1.8366013071895426, | |
| "grad_norm": 0.030761808156967163, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5595075488090515, | |
| "mean_token_accuracy": 0.7733119577169418, | |
| "num_tokens": 8025204.0, | |
| "step": 492 | |
| }, | |
| { | |
| "entropy": 0.5233868211507797, | |
| "epoch": 1.8403361344537816, | |
| "grad_norm": 0.030873069539666176, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5303994417190552, | |
| "mean_token_accuracy": 0.784132570028305, | |
| "num_tokens": 8041524.0, | |
| "step": 493 | |
| }, | |
| { | |
| "entropy": 0.5531543642282486, | |
| "epoch": 1.8440709617180207, | |
| "grad_norm": 0.037785280495882034, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5541731119155884, | |
| "mean_token_accuracy": 0.7754590958356857, | |
| "num_tokens": 8057944.0, | |
| "step": 494 | |
| }, | |
| { | |
| "entropy": 0.542868971824646, | |
| "epoch": 1.8478057889822597, | |
| "grad_norm": 0.03054802305996418, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407766699790955, | |
| "mean_token_accuracy": 0.7781128138303757, | |
| "num_tokens": 8074585.0, | |
| "step": 495 | |
| }, | |
| { | |
| "entropy": 0.5384076237678528, | |
| "epoch": 1.8515406162464987, | |
| "grad_norm": 0.024639198556542397, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5381752848625183, | |
| "mean_token_accuracy": 0.7817619889974594, | |
| "num_tokens": 8091097.0, | |
| "step": 496 | |
| }, | |
| { | |
| "entropy": 0.5398432165384293, | |
| "epoch": 1.8552754435107377, | |
| "grad_norm": 0.04202251881361008, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5468040704727173, | |
| "mean_token_accuracy": 0.7771125733852386, | |
| "num_tokens": 8107370.0, | |
| "step": 497 | |
| }, | |
| { | |
| "entropy": 0.5353098064661026, | |
| "epoch": 1.8590102707749767, | |
| "grad_norm": 0.03730052337050438, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5450741052627563, | |
| "mean_token_accuracy": 0.7791319042444229, | |
| "num_tokens": 8123388.0, | |
| "step": 498 | |
| }, | |
| { | |
| "entropy": 0.537789598107338, | |
| "epoch": 1.8627450980392157, | |
| "grad_norm": 0.02861681580543518, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5363599061965942, | |
| "mean_token_accuracy": 0.7793796509504318, | |
| "num_tokens": 8139491.0, | |
| "step": 499 | |
| }, | |
| { | |
| "entropy": 0.5609306395053864, | |
| "epoch": 1.8664799253034547, | |
| "grad_norm": 0.04193006083369255, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5556061267852783, | |
| "mean_token_accuracy": 0.7729236781597137, | |
| "num_tokens": 8155893.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.5393400639295578, | |
| "epoch": 1.8702147525676938, | |
| "grad_norm": 0.030415907502174377, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5372475385665894, | |
| "mean_token_accuracy": 0.7827758193016052, | |
| "num_tokens": 8172021.0, | |
| "step": 501 | |
| }, | |
| { | |
| "entropy": 0.5631109476089478, | |
| "epoch": 1.8739495798319328, | |
| "grad_norm": 0.030597561970353127, | |
| "learning_rate": 0.0002, | |
| "loss": 0.56128990650177, | |
| "mean_token_accuracy": 0.7722143828868866, | |
| "num_tokens": 8188761.0, | |
| "step": 502 | |
| }, | |
| { | |
| "entropy": 0.540698915719986, | |
| "epoch": 1.8776844070961718, | |
| "grad_norm": 0.03197801113128662, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5419467687606812, | |
| "mean_token_accuracy": 0.7789603024721146, | |
| "num_tokens": 8205080.0, | |
| "step": 503 | |
| }, | |
| { | |
| "entropy": 0.5343400835990906, | |
| "epoch": 1.8814192343604108, | |
| "grad_norm": 0.03577344864606857, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5340043306350708, | |
| "mean_token_accuracy": 0.7837951481342316, | |
| "num_tokens": 8221164.0, | |
| "step": 504 | |
| }, | |
| { | |
| "entropy": 0.5417536497116089, | |
| "epoch": 1.8851540616246498, | |
| "grad_norm": 0.029083728790283203, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5438728332519531, | |
| "mean_token_accuracy": 0.7775007486343384, | |
| "num_tokens": 8237535.0, | |
| "step": 505 | |
| }, | |
| { | |
| "entropy": 0.5649835765361786, | |
| "epoch": 1.8888888888888888, | |
| "grad_norm": 0.03408566117286682, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5633872151374817, | |
| "mean_token_accuracy": 0.7726111114025116, | |
| "num_tokens": 8253827.0, | |
| "step": 506 | |
| }, | |
| { | |
| "entropy": 0.5582909733057022, | |
| "epoch": 1.8926237161531279, | |
| "grad_norm": 0.028437087312340736, | |
| "learning_rate": 0.0002, | |
| "loss": 0.556007981300354, | |
| "mean_token_accuracy": 0.7727185785770416, | |
| "num_tokens": 8270404.0, | |
| "step": 507 | |
| }, | |
| { | |
| "entropy": 0.5577380061149597, | |
| "epoch": 1.8963585434173669, | |
| "grad_norm": 0.029986968263983727, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5514963865280151, | |
| "mean_token_accuracy": 0.7755957692861557, | |
| "num_tokens": 8286963.0, | |
| "step": 508 | |
| }, | |
| { | |
| "entropy": 0.5398396402597427, | |
| "epoch": 1.9000933706816059, | |
| "grad_norm": 0.030943697318434715, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5466131567955017, | |
| "mean_token_accuracy": 0.7787002176046371, | |
| "num_tokens": 8303122.0, | |
| "step": 509 | |
| }, | |
| { | |
| "entropy": 0.536215141415596, | |
| "epoch": 1.903828197945845, | |
| "grad_norm": 0.03370903804898262, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5468170046806335, | |
| "mean_token_accuracy": 0.7753093987703323, | |
| "num_tokens": 8319505.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 0.5411160290241241, | |
| "epoch": 1.907563025210084, | |
| "grad_norm": 0.028430534526705742, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5434973835945129, | |
| "mean_token_accuracy": 0.7790606617927551, | |
| "num_tokens": 8335861.0, | |
| "step": 511 | |
| }, | |
| { | |
| "entropy": 0.5555713921785355, | |
| "epoch": 1.911297852474323, | |
| "grad_norm": 0.029101036489009857, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5541608929634094, | |
| "mean_token_accuracy": 0.7740498781204224, | |
| "num_tokens": 8352413.0, | |
| "step": 512 | |
| }, | |
| { | |
| "entropy": 0.5440339744091034, | |
| "epoch": 1.915032679738562, | |
| "grad_norm": 0.029705537483096123, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5449399352073669, | |
| "mean_token_accuracy": 0.7799241691827774, | |
| "num_tokens": 8368524.0, | |
| "step": 513 | |
| }, | |
| { | |
| "entropy": 0.5385466068983078, | |
| "epoch": 1.918767507002801, | |
| "grad_norm": 0.02762160450220108, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408512353897095, | |
| "mean_token_accuracy": 0.7800593823194504, | |
| "num_tokens": 8384881.0, | |
| "step": 514 | |
| }, | |
| { | |
| "entropy": 0.5469230860471725, | |
| "epoch": 1.9225023342670402, | |
| "grad_norm": 0.02923613414168358, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5409518480300903, | |
| "mean_token_accuracy": 0.7801833301782608, | |
| "num_tokens": 8401135.0, | |
| "step": 515 | |
| }, | |
| { | |
| "entropy": 0.5446400791406631, | |
| "epoch": 1.9262371615312792, | |
| "grad_norm": 0.031235719099640846, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424818992614746, | |
| "mean_token_accuracy": 0.7767819166183472, | |
| "num_tokens": 8417485.0, | |
| "step": 516 | |
| }, | |
| { | |
| "entropy": 0.5608149170875549, | |
| "epoch": 1.9299719887955182, | |
| "grad_norm": 0.027529114857316017, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5587472915649414, | |
| "mean_token_accuracy": 0.7713829576969147, | |
| "num_tokens": 8433808.0, | |
| "step": 517 | |
| }, | |
| { | |
| "entropy": 0.560915470123291, | |
| "epoch": 1.9337068160597572, | |
| "grad_norm": 0.03099709376692772, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5625618100166321, | |
| "mean_token_accuracy": 0.7697023302316666, | |
| "num_tokens": 8450212.0, | |
| "step": 518 | |
| }, | |
| { | |
| "entropy": 0.5411669313907623, | |
| "epoch": 1.9374416433239963, | |
| "grad_norm": 0.03581510856747627, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5449709892272949, | |
| "mean_token_accuracy": 0.779253363609314, | |
| "num_tokens": 8466650.0, | |
| "step": 519 | |
| }, | |
| { | |
| "entropy": 0.5495533496141434, | |
| "epoch": 1.9411764705882353, | |
| "grad_norm": 0.02863345853984356, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5461183786392212, | |
| "mean_token_accuracy": 0.7790695428848267, | |
| "num_tokens": 8482819.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 0.5496646910905838, | |
| "epoch": 1.9449112978524743, | |
| "grad_norm": 0.028455862775444984, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5562914609909058, | |
| "mean_token_accuracy": 0.7747850865125656, | |
| "num_tokens": 8499201.0, | |
| "step": 521 | |
| }, | |
| { | |
| "entropy": 0.5566077679395676, | |
| "epoch": 1.9486461251167133, | |
| "grad_norm": 0.030010810121893883, | |
| "learning_rate": 0.0002, | |
| "loss": 0.551722526550293, | |
| "mean_token_accuracy": 0.7771954238414764, | |
| "num_tokens": 8515798.0, | |
| "step": 522 | |
| }, | |
| { | |
| "entropy": 0.5467117130756378, | |
| "epoch": 1.9523809523809523, | |
| "grad_norm": 0.027012262493371964, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5425857305526733, | |
| "mean_token_accuracy": 0.7798562794923782, | |
| "num_tokens": 8531958.0, | |
| "step": 523 | |
| }, | |
| { | |
| "entropy": 0.5346378833055496, | |
| "epoch": 1.9561157796451916, | |
| "grad_norm": 0.028377590700984, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5295640230178833, | |
| "mean_token_accuracy": 0.7838203459978104, | |
| "num_tokens": 8548384.0, | |
| "step": 524 | |
| }, | |
| { | |
| "entropy": 0.5571393668651581, | |
| "epoch": 1.9598506069094306, | |
| "grad_norm": 0.02818567305803299, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5521214008331299, | |
| "mean_token_accuracy": 0.7728232592344284, | |
| "num_tokens": 8564872.0, | |
| "step": 525 | |
| }, | |
| { | |
| "entropy": 0.5285107642412186, | |
| "epoch": 1.9635854341736696, | |
| "grad_norm": 0.03457087650895119, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5370362401008606, | |
| "mean_token_accuracy": 0.7813837081193924, | |
| "num_tokens": 8581245.0, | |
| "step": 526 | |
| }, | |
| { | |
| "entropy": 0.5266488045454025, | |
| "epoch": 1.9673202614379086, | |
| "grad_norm": 0.030525686219334602, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5345274806022644, | |
| "mean_token_accuracy": 0.7815807163715363, | |
| "num_tokens": 8597625.0, | |
| "step": 527 | |
| }, | |
| { | |
| "entropy": 0.5280887708067894, | |
| "epoch": 1.9710550887021476, | |
| "grad_norm": 0.03248651325702667, | |
| "learning_rate": 0.0002, | |
| "loss": 0.536238431930542, | |
| "mean_token_accuracy": 0.781073585152626, | |
| "num_tokens": 8613792.0, | |
| "step": 528 | |
| }, | |
| { | |
| "entropy": 0.5472559034824371, | |
| "epoch": 1.9747899159663866, | |
| "grad_norm": 0.029427766799926758, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5451797842979431, | |
| "mean_token_accuracy": 0.7770361602306366, | |
| "num_tokens": 8629870.0, | |
| "step": 529 | |
| }, | |
| { | |
| "entropy": 0.5381799042224884, | |
| "epoch": 1.9785247432306257, | |
| "grad_norm": 0.028413154184818268, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5342366695404053, | |
| "mean_token_accuracy": 0.7803498655557632, | |
| "num_tokens": 8646077.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 0.5565104633569717, | |
| "epoch": 1.9822595704948647, | |
| "grad_norm": 0.031074564903974533, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5515958666801453, | |
| "mean_token_accuracy": 0.7759076505899429, | |
| "num_tokens": 8662535.0, | |
| "step": 531 | |
| }, | |
| { | |
| "entropy": 0.5381414890289307, | |
| "epoch": 1.9859943977591037, | |
| "grad_norm": 0.027250438928604126, | |
| "learning_rate": 0.0002, | |
| "loss": 0.534949004650116, | |
| "mean_token_accuracy": 0.7819445878267288, | |
| "num_tokens": 8679064.0, | |
| "step": 532 | |
| }, | |
| { | |
| "entropy": 0.550770565867424, | |
| "epoch": 1.9897292250233427, | |
| "grad_norm": 0.03366328775882721, | |
| "learning_rate": 0.0002, | |
| "loss": 0.560295045375824, | |
| "mean_token_accuracy": 0.7720893323421478, | |
| "num_tokens": 8695198.0, | |
| "step": 533 | |
| }, | |
| { | |
| "entropy": 0.5551019459962845, | |
| "epoch": 1.9934640522875817, | |
| "grad_norm": 0.03133872151374817, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5596403479576111, | |
| "mean_token_accuracy": 0.7717682421207428, | |
| "num_tokens": 8711690.0, | |
| "step": 534 | |
| }, | |
| { | |
| "entropy": 0.5346082448959351, | |
| "epoch": 1.9971988795518207, | |
| "grad_norm": 0.027525832876563072, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5321208834648132, | |
| "mean_token_accuracy": 0.7810429036617279, | |
| "num_tokens": 8727828.0, | |
| "step": 535 | |
| }, | |
| { | |
| "entropy": 0.5438209176063538, | |
| "epoch": 2.0, | |
| "grad_norm": 0.03134825825691223, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398504734039307, | |
| "mean_token_accuracy": 0.7799929777781168, | |
| "num_tokens": 8729600.0, | |
| "step": 536 | |
| }, | |
| { | |
| "entropy": 0.5402208417654037, | |
| "epoch": 2.003734827264239, | |
| "grad_norm": 0.03922782838344574, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5171674489974976, | |
| "mean_token_accuracy": 0.7900938540697098, | |
| "num_tokens": 8745809.0, | |
| "step": 537 | |
| }, | |
| { | |
| "entropy": 0.5227422267198563, | |
| "epoch": 2.007469654528478, | |
| "grad_norm": 0.032982293516397476, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5183929204940796, | |
| "mean_token_accuracy": 0.7890477329492569, | |
| "num_tokens": 8762197.0, | |
| "step": 538 | |
| }, | |
| { | |
| "entropy": 0.5411823242902756, | |
| "epoch": 2.011204481792717, | |
| "grad_norm": 0.043377745896577835, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5554962158203125, | |
| "mean_token_accuracy": 0.7736512869596481, | |
| "num_tokens": 8778400.0, | |
| "step": 539 | |
| }, | |
| { | |
| "entropy": 0.5156290903687477, | |
| "epoch": 2.014939309056956, | |
| "grad_norm": 0.05257771536707878, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5335375666618347, | |
| "mean_token_accuracy": 0.7833946198225021, | |
| "num_tokens": 8794851.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 0.5122585743665695, | |
| "epoch": 2.018674136321195, | |
| "grad_norm": 0.03504469618201256, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5155843496322632, | |
| "mean_token_accuracy": 0.7894317060709, | |
| "num_tokens": 8811019.0, | |
| "step": 541 | |
| }, | |
| { | |
| "entropy": 0.5448772013187408, | |
| "epoch": 2.022408963585434, | |
| "grad_norm": 0.0317138209939003, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5368859171867371, | |
| "mean_token_accuracy": 0.7842776328325272, | |
| "num_tokens": 8827258.0, | |
| "step": 542 | |
| }, | |
| { | |
| "entropy": 0.5527419149875641, | |
| "epoch": 2.026143790849673, | |
| "grad_norm": 0.03476279601454735, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5388182997703552, | |
| "mean_token_accuracy": 0.7820452451705933, | |
| "num_tokens": 8843634.0, | |
| "step": 543 | |
| }, | |
| { | |
| "entropy": 0.5452789962291718, | |
| "epoch": 2.029878618113912, | |
| "grad_norm": 0.036034028977155685, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5357140302658081, | |
| "mean_token_accuracy": 0.7825983464717865, | |
| "num_tokens": 8859977.0, | |
| "step": 544 | |
| }, | |
| { | |
| "entropy": 0.5304267108440399, | |
| "epoch": 2.033613445378151, | |
| "grad_norm": 0.02969290129840374, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5306066870689392, | |
| "mean_token_accuracy": 0.7856841534376144, | |
| "num_tokens": 8876314.0, | |
| "step": 545 | |
| }, | |
| { | |
| "entropy": 0.5096816495060921, | |
| "epoch": 2.03734827264239, | |
| "grad_norm": 0.043957311660051346, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5185045599937439, | |
| "mean_token_accuracy": 0.7923233062028885, | |
| "num_tokens": 8892568.0, | |
| "step": 546 | |
| }, | |
| { | |
| "entropy": 0.5319265872240067, | |
| "epoch": 2.041083099906629, | |
| "grad_norm": 0.035869866609573364, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5334051847457886, | |
| "mean_token_accuracy": 0.7822142988443375, | |
| "num_tokens": 8909094.0, | |
| "step": 547 | |
| }, | |
| { | |
| "entropy": 0.527954563498497, | |
| "epoch": 2.044817927170868, | |
| "grad_norm": 0.034570369869470596, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5274232029914856, | |
| "mean_token_accuracy": 0.7842770516872406, | |
| "num_tokens": 8925492.0, | |
| "step": 548 | |
| }, | |
| { | |
| "entropy": 0.5419287383556366, | |
| "epoch": 2.048552754435107, | |
| "grad_norm": 0.03259408101439476, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5387848615646362, | |
| "mean_token_accuracy": 0.7804249227046967, | |
| "num_tokens": 8941717.0, | |
| "step": 549 | |
| }, | |
| { | |
| "entropy": 0.5271529629826546, | |
| "epoch": 2.052287581699346, | |
| "grad_norm": 0.03245944157242775, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5230631828308105, | |
| "mean_token_accuracy": 0.785658523440361, | |
| "num_tokens": 8957856.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 0.5363311916589737, | |
| "epoch": 2.0560224089635852, | |
| "grad_norm": 0.035185229033231735, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5378749370574951, | |
| "mean_token_accuracy": 0.781972661614418, | |
| "num_tokens": 8974161.0, | |
| "step": 551 | |
| }, | |
| { | |
| "entropy": 0.513224758207798, | |
| "epoch": 2.0597572362278243, | |
| "grad_norm": 0.032956283539533615, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5172683000564575, | |
| "mean_token_accuracy": 0.7906839847564697, | |
| "num_tokens": 8990304.0, | |
| "step": 552 | |
| }, | |
| { | |
| "entropy": 0.5387901067733765, | |
| "epoch": 2.0634920634920633, | |
| "grad_norm": 0.03281653672456741, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5435392260551453, | |
| "mean_token_accuracy": 0.7788385599851608, | |
| "num_tokens": 9006661.0, | |
| "step": 553 | |
| }, | |
| { | |
| "entropy": 0.5324967205524445, | |
| "epoch": 2.0672268907563027, | |
| "grad_norm": 0.03808191418647766, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5265247225761414, | |
| "mean_token_accuracy": 0.7839124202728271, | |
| "num_tokens": 9022887.0, | |
| "step": 554 | |
| }, | |
| { | |
| "entropy": 0.513438269495964, | |
| "epoch": 2.0709617180205417, | |
| "grad_norm": 0.033963609486818314, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5105268955230713, | |
| "mean_token_accuracy": 0.7911703735589981, | |
| "num_tokens": 9039478.0, | |
| "step": 555 | |
| }, | |
| { | |
| "entropy": 0.526850014925003, | |
| "epoch": 2.0746965452847808, | |
| "grad_norm": 0.03211839497089386, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5205508470535278, | |
| "mean_token_accuracy": 0.7859012186527252, | |
| "num_tokens": 9055612.0, | |
| "step": 556 | |
| }, | |
| { | |
| "entropy": 0.5130272284150124, | |
| "epoch": 2.0784313725490198, | |
| "grad_norm": 0.03543682396411896, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5140283703804016, | |
| "mean_token_accuracy": 0.7925766706466675, | |
| "num_tokens": 9072145.0, | |
| "step": 557 | |
| }, | |
| { | |
| "entropy": 0.540324792265892, | |
| "epoch": 2.082166199813259, | |
| "grad_norm": 0.037342023104429245, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5462511777877808, | |
| "mean_token_accuracy": 0.7783039510250092, | |
| "num_tokens": 9088571.0, | |
| "step": 558 | |
| }, | |
| { | |
| "entropy": 0.5168541818857193, | |
| "epoch": 2.085901027077498, | |
| "grad_norm": 0.03552469611167908, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5188402533531189, | |
| "mean_token_accuracy": 0.7880326211452484, | |
| "num_tokens": 9104869.0, | |
| "step": 559 | |
| }, | |
| { | |
| "entropy": 0.5319818705320358, | |
| "epoch": 2.089635854341737, | |
| "grad_norm": 0.03719151020050049, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5254620313644409, | |
| "mean_token_accuracy": 0.7848033308982849, | |
| "num_tokens": 9121231.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 0.5197737812995911, | |
| "epoch": 2.093370681605976, | |
| "grad_norm": 0.03636628016829491, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5256960988044739, | |
| "mean_token_accuracy": 0.7841715961694717, | |
| "num_tokens": 9137375.0, | |
| "step": 561 | |
| }, | |
| { | |
| "entropy": 0.5284384936094284, | |
| "epoch": 2.097105508870215, | |
| "grad_norm": 0.04217526316642761, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5343865752220154, | |
| "mean_token_accuracy": 0.7833265513181686, | |
| "num_tokens": 9153783.0, | |
| "step": 562 | |
| }, | |
| { | |
| "entropy": 0.541428878903389, | |
| "epoch": 2.100840336134454, | |
| "grad_norm": 0.035067781805992126, | |
| "learning_rate": 0.0002, | |
| "loss": 0.532885730266571, | |
| "mean_token_accuracy": 0.7817672491073608, | |
| "num_tokens": 9170090.0, | |
| "step": 563 | |
| }, | |
| { | |
| "entropy": 0.5429966300725937, | |
| "epoch": 2.104575163398693, | |
| "grad_norm": 0.0392267219722271, | |
| "learning_rate": 0.0002, | |
| "loss": 0.541841447353363, | |
| "mean_token_accuracy": 0.7772143185138702, | |
| "num_tokens": 9186453.0, | |
| "step": 564 | |
| }, | |
| { | |
| "entropy": 0.5249236822128296, | |
| "epoch": 2.108309990662932, | |
| "grad_norm": 0.036935608834028244, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5244463086128235, | |
| "mean_token_accuracy": 0.7873810976743698, | |
| "num_tokens": 9202852.0, | |
| "step": 565 | |
| }, | |
| { | |
| "entropy": 0.5256126970052719, | |
| "epoch": 2.112044817927171, | |
| "grad_norm": 0.03337714821100235, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5264843702316284, | |
| "mean_token_accuracy": 0.7856813371181488, | |
| "num_tokens": 9219197.0, | |
| "step": 566 | |
| }, | |
| { | |
| "entropy": 0.5338774845004082, | |
| "epoch": 2.11577964519141, | |
| "grad_norm": 0.03405802696943283, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5289718508720398, | |
| "mean_token_accuracy": 0.7856559157371521, | |
| "num_tokens": 9235376.0, | |
| "step": 567 | |
| }, | |
| { | |
| "entropy": 0.5203371495008469, | |
| "epoch": 2.119514472455649, | |
| "grad_norm": 0.0316944345831871, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5218056440353394, | |
| "mean_token_accuracy": 0.7897576838731766, | |
| "num_tokens": 9251814.0, | |
| "step": 568 | |
| }, | |
| { | |
| "entropy": 0.5288322418928146, | |
| "epoch": 2.123249299719888, | |
| "grad_norm": 0.03991817682981491, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538847029209137, | |
| "mean_token_accuracy": 0.7831858396530151, | |
| "num_tokens": 9268205.0, | |
| "step": 569 | |
| }, | |
| { | |
| "entropy": 0.5342643857002258, | |
| "epoch": 2.126984126984127, | |
| "grad_norm": 0.030493905767798424, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5326664447784424, | |
| "mean_token_accuracy": 0.7838881760835648, | |
| "num_tokens": 9284834.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 0.5280385613441467, | |
| "epoch": 2.130718954248366, | |
| "grad_norm": 0.033054206520318985, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5298633575439453, | |
| "mean_token_accuracy": 0.7841726392507553, | |
| "num_tokens": 9301172.0, | |
| "step": 571 | |
| }, | |
| { | |
| "entropy": 0.5260151326656342, | |
| "epoch": 2.134453781512605, | |
| "grad_norm": 0.03629712015390396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5276213884353638, | |
| "mean_token_accuracy": 0.7840316295623779, | |
| "num_tokens": 9317580.0, | |
| "step": 572 | |
| }, | |
| { | |
| "entropy": 0.5471695214509964, | |
| "epoch": 2.138188608776844, | |
| "grad_norm": 0.036552250385284424, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5418487787246704, | |
| "mean_token_accuracy": 0.7832918912172318, | |
| "num_tokens": 9333967.0, | |
| "step": 573 | |
| }, | |
| { | |
| "entropy": 0.5421722680330276, | |
| "epoch": 2.141923436041083, | |
| "grad_norm": 0.03261527791619301, | |
| "learning_rate": 0.0002, | |
| "loss": 0.535561203956604, | |
| "mean_token_accuracy": 0.7817313969135284, | |
| "num_tokens": 9350202.0, | |
| "step": 574 | |
| }, | |
| { | |
| "entropy": 0.5255165547132492, | |
| "epoch": 2.145658263305322, | |
| "grad_norm": 0.04084421694278717, | |
| "learning_rate": 0.0002, | |
| "loss": 0.531385064125061, | |
| "mean_token_accuracy": 0.7830993682146072, | |
| "num_tokens": 9366633.0, | |
| "step": 575 | |
| }, | |
| { | |
| "entropy": 0.5387750118970871, | |
| "epoch": 2.149393090569561, | |
| "grad_norm": 0.03884339705109596, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5440813302993774, | |
| "mean_token_accuracy": 0.7815608382225037, | |
| "num_tokens": 9382903.0, | |
| "step": 576 | |
| }, | |
| { | |
| "entropy": 0.5333912819623947, | |
| "epoch": 2.1531279178338, | |
| "grad_norm": 0.03951586037874222, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5371235013008118, | |
| "mean_token_accuracy": 0.7820965349674225, | |
| "num_tokens": 9399354.0, | |
| "step": 577 | |
| }, | |
| { | |
| "entropy": 0.5337669998407364, | |
| "epoch": 2.156862745098039, | |
| "grad_norm": 0.03831348195672035, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5345415472984314, | |
| "mean_token_accuracy": 0.7816608846187592, | |
| "num_tokens": 9415670.0, | |
| "step": 578 | |
| }, | |
| { | |
| "entropy": 0.5471907705068588, | |
| "epoch": 2.160597572362278, | |
| "grad_norm": 0.04574183374643326, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5423465967178345, | |
| "mean_token_accuracy": 0.7815522998571396, | |
| "num_tokens": 9432026.0, | |
| "step": 579 | |
| }, | |
| { | |
| "entropy": 0.5296851545572281, | |
| "epoch": 2.164332399626517, | |
| "grad_norm": 0.036245960742235184, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5283267498016357, | |
| "mean_token_accuracy": 0.782764196395874, | |
| "num_tokens": 9448243.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 0.5230330973863602, | |
| "epoch": 2.168067226890756, | |
| "grad_norm": 0.042745113372802734, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5258357524871826, | |
| "mean_token_accuracy": 0.7882087379693985, | |
| "num_tokens": 9464651.0, | |
| "step": 581 | |
| }, | |
| { | |
| "entropy": 0.527550533413887, | |
| "epoch": 2.171802054154995, | |
| "grad_norm": 0.037547484040260315, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5317714214324951, | |
| "mean_token_accuracy": 0.7830660939216614, | |
| "num_tokens": 9480840.0, | |
| "step": 582 | |
| }, | |
| { | |
| "entropy": 0.5365846008062363, | |
| "epoch": 2.175536881419234, | |
| "grad_norm": 0.031849246472120285, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5385018587112427, | |
| "mean_token_accuracy": 0.7798628509044647, | |
| "num_tokens": 9497134.0, | |
| "step": 583 | |
| }, | |
| { | |
| "entropy": 0.5381672978401184, | |
| "epoch": 2.179271708683473, | |
| "grad_norm": 0.03450456261634827, | |
| "learning_rate": 0.0002, | |
| "loss": 0.534706711769104, | |
| "mean_token_accuracy": 0.7828627675771713, | |
| "num_tokens": 9513638.0, | |
| "step": 584 | |
| }, | |
| { | |
| "entropy": 0.5432828962802887, | |
| "epoch": 2.183006535947712, | |
| "grad_norm": 0.03337936848402023, | |
| "learning_rate": 0.0002, | |
| "loss": 0.537329375743866, | |
| "mean_token_accuracy": 0.7817478477954865, | |
| "num_tokens": 9530154.0, | |
| "step": 585 | |
| }, | |
| { | |
| "entropy": 0.5273857861757278, | |
| "epoch": 2.1867413632119517, | |
| "grad_norm": 0.03686324506998062, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5241349339485168, | |
| "mean_token_accuracy": 0.7857643216848373, | |
| "num_tokens": 9546371.0, | |
| "step": 586 | |
| }, | |
| { | |
| "entropy": 0.5210235714912415, | |
| "epoch": 2.1904761904761907, | |
| "grad_norm": 0.036837268620729446, | |
| "learning_rate": 0.0002, | |
| "loss": 0.52490234375, | |
| "mean_token_accuracy": 0.7855679392814636, | |
| "num_tokens": 9562751.0, | |
| "step": 587 | |
| }, | |
| { | |
| "entropy": 0.5189210176467896, | |
| "epoch": 2.1942110177404297, | |
| "grad_norm": 0.034773845225572586, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5269665718078613, | |
| "mean_token_accuracy": 0.7840563803911209, | |
| "num_tokens": 9579070.0, | |
| "step": 588 | |
| }, | |
| { | |
| "entropy": 0.5213501304388046, | |
| "epoch": 2.1979458450046687, | |
| "grad_norm": 0.03593657165765762, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5271462798118591, | |
| "mean_token_accuracy": 0.7872640639543533, | |
| "num_tokens": 9595317.0, | |
| "step": 589 | |
| }, | |
| { | |
| "entropy": 0.5206883400678635, | |
| "epoch": 2.2016806722689077, | |
| "grad_norm": 0.04565085843205452, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5203741192817688, | |
| "mean_token_accuracy": 0.7881180793046951, | |
| "num_tokens": 9611799.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 0.5511275231838226, | |
| "epoch": 2.2054154995331468, | |
| "grad_norm": 0.03642827644944191, | |
| "learning_rate": 0.0002, | |
| "loss": 0.543250322341919, | |
| "mean_token_accuracy": 0.7804217487573624, | |
| "num_tokens": 9628251.0, | |
| "step": 591 | |
| }, | |
| { | |
| "entropy": 0.5495842546224594, | |
| "epoch": 2.2091503267973858, | |
| "grad_norm": 0.03284912183880806, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5446897745132446, | |
| "mean_token_accuracy": 0.7783705443143845, | |
| "num_tokens": 9644703.0, | |
| "step": 592 | |
| }, | |
| { | |
| "entropy": 0.5297017693519592, | |
| "epoch": 2.212885154061625, | |
| "grad_norm": 0.04696131870150566, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5325087308883667, | |
| "mean_token_accuracy": 0.7825257629156113, | |
| "num_tokens": 9660837.0, | |
| "step": 593 | |
| }, | |
| { | |
| "entropy": 0.5143487304449081, | |
| "epoch": 2.216619981325864, | |
| "grad_norm": 0.03802449256181717, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5171544551849365, | |
| "mean_token_accuracy": 0.7884373366832733, | |
| "num_tokens": 9676767.0, | |
| "step": 594 | |
| }, | |
| { | |
| "entropy": 0.5370919853448868, | |
| "epoch": 2.220354808590103, | |
| "grad_norm": 0.0338297039270401, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5430178046226501, | |
| "mean_token_accuracy": 0.7808142453432083, | |
| "num_tokens": 9693155.0, | |
| "step": 595 | |
| }, | |
| { | |
| "entropy": 0.5210085138678551, | |
| "epoch": 2.224089635854342, | |
| "grad_norm": 0.04106014966964722, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5282027125358582, | |
| "mean_token_accuracy": 0.7855826020240784, | |
| "num_tokens": 9709374.0, | |
| "step": 596 | |
| }, | |
| { | |
| "entropy": 0.516735278069973, | |
| "epoch": 2.227824463118581, | |
| "grad_norm": 0.03890896216034889, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5173130035400391, | |
| "mean_token_accuracy": 0.7897009253501892, | |
| "num_tokens": 9725684.0, | |
| "step": 597 | |
| }, | |
| { | |
| "entropy": 0.5427378565073013, | |
| "epoch": 2.23155929038282, | |
| "grad_norm": 0.038357146084308624, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5421530604362488, | |
| "mean_token_accuracy": 0.7789111882448196, | |
| "num_tokens": 9742139.0, | |
| "step": 598 | |
| }, | |
| { | |
| "entropy": 0.5454076677560806, | |
| "epoch": 2.235294117647059, | |
| "grad_norm": 0.037645429372787476, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5428951978683472, | |
| "mean_token_accuracy": 0.7789873778820038, | |
| "num_tokens": 9758607.0, | |
| "step": 599 | |
| }, | |
| { | |
| "entropy": 0.5404632985591888, | |
| "epoch": 2.239028944911298, | |
| "grad_norm": 0.039499301463365555, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5404936075210571, | |
| "mean_token_accuracy": 0.7793222069740295, | |
| "num_tokens": 9775018.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 0.530501589179039, | |
| "epoch": 2.242763772175537, | |
| "grad_norm": 0.040064238011837006, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5247729420661926, | |
| "mean_token_accuracy": 0.7874402105808258, | |
| "num_tokens": 9791324.0, | |
| "step": 601 | |
| }, | |
| { | |
| "entropy": 0.5369330644607544, | |
| "epoch": 2.246498599439776, | |
| "grad_norm": 0.037321336567401886, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5377377271652222, | |
| "mean_token_accuracy": 0.782483384013176, | |
| "num_tokens": 9807623.0, | |
| "step": 602 | |
| }, | |
| { | |
| "entropy": 0.5428077727556229, | |
| "epoch": 2.250233426704015, | |
| "grad_norm": 0.03844759240746498, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5476452112197876, | |
| "mean_token_accuracy": 0.776596188545227, | |
| "num_tokens": 9824163.0, | |
| "step": 603 | |
| }, | |
| { | |
| "entropy": 0.5409123748540878, | |
| "epoch": 2.253968253968254, | |
| "grad_norm": 0.03608860820531845, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5434892177581787, | |
| "mean_token_accuracy": 0.7794551849365234, | |
| "num_tokens": 9840911.0, | |
| "step": 604 | |
| }, | |
| { | |
| "entropy": 0.5327287763357162, | |
| "epoch": 2.257703081232493, | |
| "grad_norm": 0.037285350263118744, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5284507274627686, | |
| "mean_token_accuracy": 0.7831137478351593, | |
| "num_tokens": 9857296.0, | |
| "step": 605 | |
| }, | |
| { | |
| "entropy": 0.5373975485563278, | |
| "epoch": 2.261437908496732, | |
| "grad_norm": 0.03957006335258484, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5341996550559998, | |
| "mean_token_accuracy": 0.7810620963573456, | |
| "num_tokens": 9873850.0, | |
| "step": 606 | |
| }, | |
| { | |
| "entropy": 0.5290782749652863, | |
| "epoch": 2.265172735760971, | |
| "grad_norm": 0.040026333183050156, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5341078042984009, | |
| "mean_token_accuracy": 0.780807763338089, | |
| "num_tokens": 9890140.0, | |
| "step": 607 | |
| }, | |
| { | |
| "entropy": 0.5333269834518433, | |
| "epoch": 2.26890756302521, | |
| "grad_norm": 0.03453996032476425, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5351240634918213, | |
| "mean_token_accuracy": 0.7807765603065491, | |
| "num_tokens": 9906713.0, | |
| "step": 608 | |
| }, | |
| { | |
| "entropy": 0.5285785049200058, | |
| "epoch": 2.272642390289449, | |
| "grad_norm": 0.04334354028105736, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5339541435241699, | |
| "mean_token_accuracy": 0.7852969169616699, | |
| "num_tokens": 9922956.0, | |
| "step": 609 | |
| }, | |
| { | |
| "entropy": 0.5360069870948792, | |
| "epoch": 2.276377217553688, | |
| "grad_norm": 0.03924287483096123, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5359759330749512, | |
| "mean_token_accuracy": 0.7819220721721649, | |
| "num_tokens": 9939069.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 0.5435689836740494, | |
| "epoch": 2.280112044817927, | |
| "grad_norm": 0.037971340119838715, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5404746532440186, | |
| "mean_token_accuracy": 0.779410719871521, | |
| "num_tokens": 9955662.0, | |
| "step": 611 | |
| }, | |
| { | |
| "entropy": 0.5445673018693924, | |
| "epoch": 2.283846872082166, | |
| "grad_norm": 0.03730984404683113, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5418494939804077, | |
| "mean_token_accuracy": 0.7813813239336014, | |
| "num_tokens": 9971957.0, | |
| "step": 612 | |
| }, | |
| { | |
| "entropy": 0.5419572293758392, | |
| "epoch": 2.287581699346405, | |
| "grad_norm": 0.041550587862730026, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5388907194137573, | |
| "mean_token_accuracy": 0.7824303805828094, | |
| "num_tokens": 9988368.0, | |
| "step": 613 | |
| }, | |
| { | |
| "entropy": 0.5267663449048996, | |
| "epoch": 2.291316526610644, | |
| "grad_norm": 0.03576701879501343, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5305144786834717, | |
| "mean_token_accuracy": 0.784039780497551, | |
| "num_tokens": 10004679.0, | |
| "step": 614 | |
| }, | |
| { | |
| "entropy": 0.5274334847927094, | |
| "epoch": 2.295051353874883, | |
| "grad_norm": 0.03758349269628525, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5307276844978333, | |
| "mean_token_accuracy": 0.7834599912166595, | |
| "num_tokens": 10021146.0, | |
| "step": 615 | |
| }, | |
| { | |
| "entropy": 0.5317062586545944, | |
| "epoch": 2.298786181139122, | |
| "grad_norm": 0.04096253216266632, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5370841026306152, | |
| "mean_token_accuracy": 0.7823186218738556, | |
| "num_tokens": 10037433.0, | |
| "step": 616 | |
| }, | |
| { | |
| "entropy": 0.5429483950138092, | |
| "epoch": 2.302521008403361, | |
| "grad_norm": 0.04739284887909889, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5423600673675537, | |
| "mean_token_accuracy": 0.7782929539680481, | |
| "num_tokens": 10053809.0, | |
| "step": 617 | |
| }, | |
| { | |
| "entropy": 0.5375621318817139, | |
| "epoch": 2.3062558356676, | |
| "grad_norm": 0.03586879000067711, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5351011157035828, | |
| "mean_token_accuracy": 0.7827189415693283, | |
| "num_tokens": 10070403.0, | |
| "step": 618 | |
| }, | |
| { | |
| "entropy": 0.5332229882478714, | |
| "epoch": 2.309990662931839, | |
| "grad_norm": 0.039749447256326675, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5316674113273621, | |
| "mean_token_accuracy": 0.7804222106933594, | |
| "num_tokens": 10086520.0, | |
| "step": 619 | |
| }, | |
| { | |
| "entropy": 0.5367073863744736, | |
| "epoch": 2.313725490196078, | |
| "grad_norm": 0.03144790232181549, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5297679305076599, | |
| "mean_token_accuracy": 0.7887496650218964, | |
| "num_tokens": 10102990.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 0.5359086692333221, | |
| "epoch": 2.317460317460317, | |
| "grad_norm": 0.03297298401594162, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5307391881942749, | |
| "mean_token_accuracy": 0.7840029001235962, | |
| "num_tokens": 10119527.0, | |
| "step": 621 | |
| }, | |
| { | |
| "entropy": 0.5245398730039597, | |
| "epoch": 2.3211951447245567, | |
| "grad_norm": 0.04077174887061119, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5315594673156738, | |
| "mean_token_accuracy": 0.7830156534910202, | |
| "num_tokens": 10135668.0, | |
| "step": 622 | |
| }, | |
| { | |
| "entropy": 0.5435569882392883, | |
| "epoch": 2.3249299719887957, | |
| "grad_norm": 0.037014640867710114, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5420113801956177, | |
| "mean_token_accuracy": 0.7790663093328476, | |
| "num_tokens": 10152032.0, | |
| "step": 623 | |
| }, | |
| { | |
| "entropy": 0.5408807992935181, | |
| "epoch": 2.3286647992530347, | |
| "grad_norm": 0.040016841143369675, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407621264457703, | |
| "mean_token_accuracy": 0.7807969450950623, | |
| "num_tokens": 10168548.0, | |
| "step": 624 | |
| }, | |
| { | |
| "entropy": 0.5394706726074219, | |
| "epoch": 2.3323996265172737, | |
| "grad_norm": 0.038603588938713074, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5371181964874268, | |
| "mean_token_accuracy": 0.7830179631710052, | |
| "num_tokens": 10185087.0, | |
| "step": 625 | |
| }, | |
| { | |
| "entropy": 0.5331766307353973, | |
| "epoch": 2.3361344537815127, | |
| "grad_norm": 0.03732473403215408, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5403282046318054, | |
| "mean_token_accuracy": 0.7811668664216995, | |
| "num_tokens": 10201643.0, | |
| "step": 626 | |
| }, | |
| { | |
| "entropy": 0.5270423144102097, | |
| "epoch": 2.3398692810457518, | |
| "grad_norm": 0.039125435054302216, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5314643383026123, | |
| "mean_token_accuracy": 0.7847079634666443, | |
| "num_tokens": 10218028.0, | |
| "step": 627 | |
| }, | |
| { | |
| "entropy": 0.5217768847942352, | |
| "epoch": 2.3436041083099908, | |
| "grad_norm": 0.031856924295425415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5211607813835144, | |
| "mean_token_accuracy": 0.787055104970932, | |
| "num_tokens": 10234302.0, | |
| "step": 628 | |
| }, | |
| { | |
| "entropy": 0.5297789797186852, | |
| "epoch": 2.34733893557423, | |
| "grad_norm": 0.044731732457876205, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5366175174713135, | |
| "mean_token_accuracy": 0.7815698832273483, | |
| "num_tokens": 10250527.0, | |
| "step": 629 | |
| }, | |
| { | |
| "entropy": 0.5372533053159714, | |
| "epoch": 2.351073762838469, | |
| "grad_norm": 0.03578559309244156, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398249626159668, | |
| "mean_token_accuracy": 0.782914400100708, | |
| "num_tokens": 10266845.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 0.5397268682718277, | |
| "epoch": 2.354808590102708, | |
| "grad_norm": 0.04053846001625061, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5417327880859375, | |
| "mean_token_accuracy": 0.7793487906455994, | |
| "num_tokens": 10283134.0, | |
| "step": 631 | |
| }, | |
| { | |
| "entropy": 0.5457513332366943, | |
| "epoch": 2.358543417366947, | |
| "grad_norm": 0.039855144917964935, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5377854704856873, | |
| "mean_token_accuracy": 0.7803535759449005, | |
| "num_tokens": 10299673.0, | |
| "step": 632 | |
| }, | |
| { | |
| "entropy": 0.5374201238155365, | |
| "epoch": 2.362278244631186, | |
| "grad_norm": 0.03583669289946556, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5346733331680298, | |
| "mean_token_accuracy": 0.7818557769060135, | |
| "num_tokens": 10316146.0, | |
| "step": 633 | |
| }, | |
| { | |
| "entropy": 0.5228708907961845, | |
| "epoch": 2.366013071895425, | |
| "grad_norm": 0.0356278158724308, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5220701694488525, | |
| "mean_token_accuracy": 0.7894868850708008, | |
| "num_tokens": 10332482.0, | |
| "step": 634 | |
| }, | |
| { | |
| "entropy": 0.5448856949806213, | |
| "epoch": 2.369747899159664, | |
| "grad_norm": 0.045307550579309464, | |
| "learning_rate": 0.0002, | |
| "loss": 0.555870771408081, | |
| "mean_token_accuracy": 0.7770739197731018, | |
| "num_tokens": 10348970.0, | |
| "step": 635 | |
| }, | |
| { | |
| "entropy": 0.5384282767772675, | |
| "epoch": 2.373482726423903, | |
| "grad_norm": 0.03949993476271629, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424531102180481, | |
| "mean_token_accuracy": 0.7787595987319946, | |
| "num_tokens": 10365074.0, | |
| "step": 636 | |
| }, | |
| { | |
| "entropy": 0.532962828874588, | |
| "epoch": 2.377217553688142, | |
| "grad_norm": 0.0345122404396534, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5286644697189331, | |
| "mean_token_accuracy": 0.7851764559745789, | |
| "num_tokens": 10381036.0, | |
| "step": 637 | |
| }, | |
| { | |
| "entropy": 0.5396641790866852, | |
| "epoch": 2.380952380952381, | |
| "grad_norm": 0.038070570677518845, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5350325703620911, | |
| "mean_token_accuracy": 0.783138781785965, | |
| "num_tokens": 10397441.0, | |
| "step": 638 | |
| }, | |
| { | |
| "entropy": 0.5453281551599503, | |
| "epoch": 2.38468720821662, | |
| "grad_norm": 0.03477659448981285, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5431845188140869, | |
| "mean_token_accuracy": 0.7779907435178757, | |
| "num_tokens": 10413843.0, | |
| "step": 639 | |
| }, | |
| { | |
| "entropy": 0.5235247910022736, | |
| "epoch": 2.388422035480859, | |
| "grad_norm": 0.04054819047451019, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5272566080093384, | |
| "mean_token_accuracy": 0.7897930145263672, | |
| "num_tokens": 10430041.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 0.5263708084821701, | |
| "epoch": 2.392156862745098, | |
| "grad_norm": 0.042338334023952484, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5340385437011719, | |
| "mean_token_accuracy": 0.7824059575796127, | |
| "num_tokens": 10446100.0, | |
| "step": 641 | |
| }, | |
| { | |
| "entropy": 0.543594166636467, | |
| "epoch": 2.395891690009337, | |
| "grad_norm": 0.04357817769050598, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5377992391586304, | |
| "mean_token_accuracy": 0.781853511929512, | |
| "num_tokens": 10462519.0, | |
| "step": 642 | |
| }, | |
| { | |
| "entropy": 0.5444612801074982, | |
| "epoch": 2.399626517273576, | |
| "grad_norm": 0.03883645310997963, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5423793196678162, | |
| "mean_token_accuracy": 0.7786720097064972, | |
| "num_tokens": 10478807.0, | |
| "step": 643 | |
| }, | |
| { | |
| "entropy": 0.5298498719930649, | |
| "epoch": 2.403361344537815, | |
| "grad_norm": 0.03690332546830177, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5272641181945801, | |
| "mean_token_accuracy": 0.782812237739563, | |
| "num_tokens": 10494864.0, | |
| "step": 644 | |
| }, | |
| { | |
| "entropy": 0.528311550617218, | |
| "epoch": 2.407096171802054, | |
| "grad_norm": 0.04098167642951012, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5349369049072266, | |
| "mean_token_accuracy": 0.7804581671953201, | |
| "num_tokens": 10511211.0, | |
| "step": 645 | |
| }, | |
| { | |
| "entropy": 0.5355981737375259, | |
| "epoch": 2.410830999066293, | |
| "grad_norm": 0.040713947266340256, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5427882075309753, | |
| "mean_token_accuracy": 0.7789115309715271, | |
| "num_tokens": 10527252.0, | |
| "step": 646 | |
| }, | |
| { | |
| "entropy": 0.5335679203271866, | |
| "epoch": 2.414565826330532, | |
| "grad_norm": 0.03578624129295349, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5237961411476135, | |
| "mean_token_accuracy": 0.7891885042190552, | |
| "num_tokens": 10543508.0, | |
| "step": 647 | |
| }, | |
| { | |
| "entropy": 0.550647184252739, | |
| "epoch": 2.418300653594771, | |
| "grad_norm": 0.041548412293195724, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5482417941093445, | |
| "mean_token_accuracy": 0.7743094116449356, | |
| "num_tokens": 10559883.0, | |
| "step": 648 | |
| }, | |
| { | |
| "entropy": 0.5099608227610588, | |
| "epoch": 2.42203548085901, | |
| "grad_norm": 0.035532381385564804, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5146307349205017, | |
| "mean_token_accuracy": 0.7876965999603271, | |
| "num_tokens": 10576102.0, | |
| "step": 649 | |
| }, | |
| { | |
| "entropy": 0.5289439111948013, | |
| "epoch": 2.425770308123249, | |
| "grad_norm": 0.03995847702026367, | |
| "learning_rate": 0.0002, | |
| "loss": 0.529523491859436, | |
| "mean_token_accuracy": 0.7849718630313873, | |
| "num_tokens": 10592278.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 0.5271874070167542, | |
| "epoch": 2.429505135387488, | |
| "grad_norm": 0.038978736847639084, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5379216074943542, | |
| "mean_token_accuracy": 0.7814365327358246, | |
| "num_tokens": 10608707.0, | |
| "step": 651 | |
| }, | |
| { | |
| "entropy": 0.5211434736847878, | |
| "epoch": 2.4332399626517276, | |
| "grad_norm": 0.04277133196592331, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5224626660346985, | |
| "mean_token_accuracy": 0.7893835753202438, | |
| "num_tokens": 10625209.0, | |
| "step": 652 | |
| }, | |
| { | |
| "entropy": 0.5353395342826843, | |
| "epoch": 2.4369747899159666, | |
| "grad_norm": 0.03804321959614754, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5317578911781311, | |
| "mean_token_accuracy": 0.7827101796865463, | |
| "num_tokens": 10641678.0, | |
| "step": 653 | |
| }, | |
| { | |
| "entropy": 0.5419681817293167, | |
| "epoch": 2.4407096171802056, | |
| "grad_norm": 0.03237481042742729, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5347220301628113, | |
| "mean_token_accuracy": 0.7828710377216339, | |
| "num_tokens": 10658020.0, | |
| "step": 654 | |
| }, | |
| { | |
| "entropy": 0.54988232254982, | |
| "epoch": 2.4444444444444446, | |
| "grad_norm": 0.0367792509496212, | |
| "learning_rate": 0.0002, | |
| "loss": 0.548277735710144, | |
| "mean_token_accuracy": 0.7793003767728806, | |
| "num_tokens": 10674273.0, | |
| "step": 655 | |
| }, | |
| { | |
| "entropy": 0.5270714908838272, | |
| "epoch": 2.4481792717086837, | |
| "grad_norm": 0.04078115150332451, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5275436639785767, | |
| "mean_token_accuracy": 0.7857778370380402, | |
| "num_tokens": 10690682.0, | |
| "step": 656 | |
| }, | |
| { | |
| "entropy": 0.5229745805263519, | |
| "epoch": 2.4519140989729227, | |
| "grad_norm": 0.03635413572192192, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5259315967559814, | |
| "mean_token_accuracy": 0.7876160591840744, | |
| "num_tokens": 10706935.0, | |
| "step": 657 | |
| }, | |
| { | |
| "entropy": 0.520149365067482, | |
| "epoch": 2.4556489262371617, | |
| "grad_norm": 0.04523176699876785, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5284128189086914, | |
| "mean_token_accuracy": 0.7826286852359772, | |
| "num_tokens": 10723130.0, | |
| "step": 658 | |
| }, | |
| { | |
| "entropy": 0.5226980745792389, | |
| "epoch": 2.4593837535014007, | |
| "grad_norm": 0.04385685920715332, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5277330279350281, | |
| "mean_token_accuracy": 0.7842638790607452, | |
| "num_tokens": 10739706.0, | |
| "step": 659 | |
| }, | |
| { | |
| "entropy": 0.5327855497598648, | |
| "epoch": 2.4631185807656397, | |
| "grad_norm": 0.03833289071917534, | |
| "learning_rate": 0.0002, | |
| "loss": 0.529242753982544, | |
| "mean_token_accuracy": 0.784681499004364, | |
| "num_tokens": 10756135.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 0.5270693749189377, | |
| "epoch": 2.4668534080298787, | |
| "grad_norm": 0.04420669376850128, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5234766602516174, | |
| "mean_token_accuracy": 0.7881586104631424, | |
| "num_tokens": 10772473.0, | |
| "step": 661 | |
| }, | |
| { | |
| "entropy": 0.5432615429162979, | |
| "epoch": 2.4705882352941178, | |
| "grad_norm": 0.03388570621609688, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459257364273071, | |
| "mean_token_accuracy": 0.7780051380395889, | |
| "num_tokens": 10788831.0, | |
| "step": 662 | |
| }, | |
| { | |
| "entropy": 0.5488771200180054, | |
| "epoch": 2.4743230625583568, | |
| "grad_norm": 0.04762876406311989, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5534912943840027, | |
| "mean_token_accuracy": 0.7749715596437454, | |
| "num_tokens": 10805527.0, | |
| "step": 663 | |
| }, | |
| { | |
| "entropy": 0.5422950983047485, | |
| "epoch": 2.478057889822596, | |
| "grad_norm": 0.03591262549161911, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398073792457581, | |
| "mean_token_accuracy": 0.779995933175087, | |
| "num_tokens": 10821915.0, | |
| "step": 664 | |
| }, | |
| { | |
| "entropy": 0.567908450961113, | |
| "epoch": 2.481792717086835, | |
| "grad_norm": 0.04293651878833771, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5645220875740051, | |
| "mean_token_accuracy": 0.771768257021904, | |
| "num_tokens": 10838601.0, | |
| "step": 665 | |
| }, | |
| { | |
| "entropy": 0.534419909119606, | |
| "epoch": 2.485527544351074, | |
| "grad_norm": 0.036424651741981506, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5309603214263916, | |
| "mean_token_accuracy": 0.786093220114708, | |
| "num_tokens": 10854981.0, | |
| "step": 666 | |
| }, | |
| { | |
| "entropy": 0.5380399525165558, | |
| "epoch": 2.489262371615313, | |
| "grad_norm": 0.04585183784365654, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5384916067123413, | |
| "mean_token_accuracy": 0.7809207290410995, | |
| "num_tokens": 10871328.0, | |
| "step": 667 | |
| }, | |
| { | |
| "entropy": 0.5118337720632553, | |
| "epoch": 2.492997198879552, | |
| "grad_norm": 0.03870607912540436, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5148553252220154, | |
| "mean_token_accuracy": 0.7906211614608765, | |
| "num_tokens": 10887543.0, | |
| "step": 668 | |
| }, | |
| { | |
| "entropy": 0.539421871304512, | |
| "epoch": 2.496732026143791, | |
| "grad_norm": 0.04092569276690483, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5474343299865723, | |
| "mean_token_accuracy": 0.7810823172330856, | |
| "num_tokens": 10904063.0, | |
| "step": 669 | |
| }, | |
| { | |
| "entropy": 0.5357869118452072, | |
| "epoch": 2.50046685340803, | |
| "grad_norm": 0.03857175633311272, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5365599393844604, | |
| "mean_token_accuracy": 0.7816625684499741, | |
| "num_tokens": 10920474.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 0.5330220460891724, | |
| "epoch": 2.504201680672269, | |
| "grad_norm": 0.03685252368450165, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5331542491912842, | |
| "mean_token_accuracy": 0.7820776700973511, | |
| "num_tokens": 10936663.0, | |
| "step": 671 | |
| }, | |
| { | |
| "entropy": 0.524094969034195, | |
| "epoch": 2.507936507936508, | |
| "grad_norm": 0.03893151134252548, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5277613997459412, | |
| "mean_token_accuracy": 0.7860450148582458, | |
| "num_tokens": 10952950.0, | |
| "step": 672 | |
| }, | |
| { | |
| "entropy": 0.5463172346353531, | |
| "epoch": 2.511671335200747, | |
| "grad_norm": 0.039967626333236694, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5425282716751099, | |
| "mean_token_accuracy": 0.7801816016435623, | |
| "num_tokens": 10969412.0, | |
| "step": 673 | |
| }, | |
| { | |
| "entropy": 0.5239230394363403, | |
| "epoch": 2.515406162464986, | |
| "grad_norm": 0.046231936663389206, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5241309404373169, | |
| "mean_token_accuracy": 0.787441685795784, | |
| "num_tokens": 10985869.0, | |
| "step": 674 | |
| }, | |
| { | |
| "entropy": 0.5359321981668472, | |
| "epoch": 2.519140989729225, | |
| "grad_norm": 0.040779855102300644, | |
| "learning_rate": 0.0002, | |
| "loss": 0.536766767501831, | |
| "mean_token_accuracy": 0.7817385196685791, | |
| "num_tokens": 11002074.0, | |
| "step": 675 | |
| }, | |
| { | |
| "entropy": 0.5319357812404633, | |
| "epoch": 2.522875816993464, | |
| "grad_norm": 0.03476366400718689, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5311717391014099, | |
| "mean_token_accuracy": 0.7856648862361908, | |
| "num_tokens": 11018648.0, | |
| "step": 676 | |
| }, | |
| { | |
| "entropy": 0.5231706351041794, | |
| "epoch": 2.526610644257703, | |
| "grad_norm": 0.03785642236471176, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5269960165023804, | |
| "mean_token_accuracy": 0.7866117358207703, | |
| "num_tokens": 11034686.0, | |
| "step": 677 | |
| }, | |
| { | |
| "entropy": 0.5381273478269577, | |
| "epoch": 2.530345471521942, | |
| "grad_norm": 0.03976747393608093, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5381407141685486, | |
| "mean_token_accuracy": 0.7849810570478439, | |
| "num_tokens": 11050922.0, | |
| "step": 678 | |
| }, | |
| { | |
| "entropy": 0.5456480979919434, | |
| "epoch": 2.534080298786181, | |
| "grad_norm": 0.039225250482559204, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5425232648849487, | |
| "mean_token_accuracy": 0.7785615175962448, | |
| "num_tokens": 11067148.0, | |
| "step": 679 | |
| }, | |
| { | |
| "entropy": 0.5407412797212601, | |
| "epoch": 2.53781512605042, | |
| "grad_norm": 0.03705086559057236, | |
| "learning_rate": 0.0002, | |
| "loss": 0.536932110786438, | |
| "mean_token_accuracy": 0.7821008861064911, | |
| "num_tokens": 11083363.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 0.5263440012931824, | |
| "epoch": 2.541549953314659, | |
| "grad_norm": 0.0353594608604908, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5256474018096924, | |
| "mean_token_accuracy": 0.7849348187446594, | |
| "num_tokens": 11099785.0, | |
| "step": 681 | |
| }, | |
| { | |
| "entropy": 0.5354757159948349, | |
| "epoch": 2.545284780578898, | |
| "grad_norm": 0.04532964155077934, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5450004935264587, | |
| "mean_token_accuracy": 0.7807293385267258, | |
| "num_tokens": 11115892.0, | |
| "step": 682 | |
| }, | |
| { | |
| "entropy": 0.5281579941511154, | |
| "epoch": 2.549019607843137, | |
| "grad_norm": 0.03604253754019737, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5311046838760376, | |
| "mean_token_accuracy": 0.7845126688480377, | |
| "num_tokens": 11132189.0, | |
| "step": 683 | |
| }, | |
| { | |
| "entropy": 0.5354526489973068, | |
| "epoch": 2.552754435107376, | |
| "grad_norm": 0.03747657313942909, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5361717343330383, | |
| "mean_token_accuracy": 0.7801252007484436, | |
| "num_tokens": 11148681.0, | |
| "step": 684 | |
| }, | |
| { | |
| "entropy": 0.5386267453432083, | |
| "epoch": 2.556489262371615, | |
| "grad_norm": 0.037825409322977066, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5390512347221375, | |
| "mean_token_accuracy": 0.7815877050161362, | |
| "num_tokens": 11165013.0, | |
| "step": 685 | |
| }, | |
| { | |
| "entropy": 0.530585527420044, | |
| "epoch": 2.560224089635854, | |
| "grad_norm": 0.03970746695995331, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5291422009468079, | |
| "mean_token_accuracy": 0.7839111536741257, | |
| "num_tokens": 11181301.0, | |
| "step": 686 | |
| }, | |
| { | |
| "entropy": 0.5292850136756897, | |
| "epoch": 2.563958916900093, | |
| "grad_norm": 0.03387298434972763, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5319269299507141, | |
| "mean_token_accuracy": 0.7840193659067154, | |
| "num_tokens": 11197537.0, | |
| "step": 687 | |
| }, | |
| { | |
| "entropy": 0.5399095267057419, | |
| "epoch": 2.567693744164332, | |
| "grad_norm": 0.038681600242853165, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5435532331466675, | |
| "mean_token_accuracy": 0.7806709408760071, | |
| "num_tokens": 11213896.0, | |
| "step": 688 | |
| }, | |
| { | |
| "entropy": 0.5498056858778, | |
| "epoch": 2.571428571428571, | |
| "grad_norm": 0.03758297860622406, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5467256307601929, | |
| "mean_token_accuracy": 0.7782751470804214, | |
| "num_tokens": 11230383.0, | |
| "step": 689 | |
| }, | |
| { | |
| "entropy": 0.5476771891117096, | |
| "epoch": 2.57516339869281, | |
| "grad_norm": 0.03605665639042854, | |
| "learning_rate": 0.0002, | |
| "loss": 0.541588544845581, | |
| "mean_token_accuracy": 0.7791445404291153, | |
| "num_tokens": 11246749.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 0.542407214641571, | |
| "epoch": 2.5788982259570497, | |
| "grad_norm": 0.04616822302341461, | |
| "learning_rate": 0.0002, | |
| "loss": 0.535969614982605, | |
| "mean_token_accuracy": 0.7812883108854294, | |
| "num_tokens": 11263093.0, | |
| "step": 691 | |
| }, | |
| { | |
| "entropy": 0.5215721130371094, | |
| "epoch": 2.5826330532212887, | |
| "grad_norm": 0.040278688073158264, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5306443572044373, | |
| "mean_token_accuracy": 0.783096119761467, | |
| "num_tokens": 11279295.0, | |
| "step": 692 | |
| }, | |
| { | |
| "entropy": 0.5300876200199127, | |
| "epoch": 2.5863678804855277, | |
| "grad_norm": 0.04465034604072571, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408331751823425, | |
| "mean_token_accuracy": 0.781616821885109, | |
| "num_tokens": 11295488.0, | |
| "step": 693 | |
| }, | |
| { | |
| "entropy": 0.529060423374176, | |
| "epoch": 2.5901027077497667, | |
| "grad_norm": 0.03697149083018303, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5315713286399841, | |
| "mean_token_accuracy": 0.784434586763382, | |
| "num_tokens": 11311910.0, | |
| "step": 694 | |
| }, | |
| { | |
| "entropy": 0.5421274900436401, | |
| "epoch": 2.5938375350140057, | |
| "grad_norm": 0.03769063949584961, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5342295169830322, | |
| "mean_token_accuracy": 0.7821343541145325, | |
| "num_tokens": 11328227.0, | |
| "step": 695 | |
| }, | |
| { | |
| "entropy": 0.5521349608898163, | |
| "epoch": 2.5975723622782447, | |
| "grad_norm": 0.037369053810834885, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5406404733657837, | |
| "mean_token_accuracy": 0.7816728502511978, | |
| "num_tokens": 11344754.0, | |
| "step": 696 | |
| }, | |
| { | |
| "entropy": 0.5276040434837341, | |
| "epoch": 2.6013071895424837, | |
| "grad_norm": 0.04295807331800461, | |
| "learning_rate": 0.0002, | |
| "loss": 0.531209409236908, | |
| "mean_token_accuracy": 0.781694307923317, | |
| "num_tokens": 11360846.0, | |
| "step": 697 | |
| }, | |
| { | |
| "entropy": 0.5329545885324478, | |
| "epoch": 2.6050420168067228, | |
| "grad_norm": 0.04680144414305687, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448673963546753, | |
| "mean_token_accuracy": 0.7803032696247101, | |
| "num_tokens": 11376984.0, | |
| "step": 698 | |
| }, | |
| { | |
| "entropy": 0.5330372750759125, | |
| "epoch": 2.6087768440709618, | |
| "grad_norm": 0.038128506392240524, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5345317125320435, | |
| "mean_token_accuracy": 0.7848279774188995, | |
| "num_tokens": 11393192.0, | |
| "step": 699 | |
| }, | |
| { | |
| "entropy": 0.5620173513889313, | |
| "epoch": 2.612511671335201, | |
| "grad_norm": 0.0405871607363224, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5558884143829346, | |
| "mean_token_accuracy": 0.7717028856277466, | |
| "num_tokens": 11409571.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 0.5401062965393066, | |
| "epoch": 2.61624649859944, | |
| "grad_norm": 0.033952489495277405, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5324668884277344, | |
| "mean_token_accuracy": 0.7836252152919769, | |
| "num_tokens": 11426157.0, | |
| "step": 701 | |
| }, | |
| { | |
| "entropy": 0.5401272624731064, | |
| "epoch": 2.619981325863679, | |
| "grad_norm": 0.03486888110637665, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5405600666999817, | |
| "mean_token_accuracy": 0.780670240521431, | |
| "num_tokens": 11442706.0, | |
| "step": 702 | |
| }, | |
| { | |
| "entropy": 0.5286990851163864, | |
| "epoch": 2.623716153127918, | |
| "grad_norm": 0.03971569985151291, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5301419496536255, | |
| "mean_token_accuracy": 0.7845329642295837, | |
| "num_tokens": 11459059.0, | |
| "step": 703 | |
| }, | |
| { | |
| "entropy": 0.5408699810504913, | |
| "epoch": 2.627450980392157, | |
| "grad_norm": 0.03566860780119896, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422340631484985, | |
| "mean_token_accuracy": 0.7786179780960083, | |
| "num_tokens": 11475473.0, | |
| "step": 704 | |
| }, | |
| { | |
| "entropy": 0.5306770950555801, | |
| "epoch": 2.631185807656396, | |
| "grad_norm": 0.038531865924596786, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5311087965965271, | |
| "mean_token_accuracy": 0.784186452627182, | |
| "num_tokens": 11491765.0, | |
| "step": 705 | |
| }, | |
| { | |
| "entropy": 0.5391299277544022, | |
| "epoch": 2.634920634920635, | |
| "grad_norm": 0.036147549748420715, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5403758883476257, | |
| "mean_token_accuracy": 0.7817845791578293, | |
| "num_tokens": 11508291.0, | |
| "step": 706 | |
| }, | |
| { | |
| "entropy": 0.5316940769553185, | |
| "epoch": 2.638655462184874, | |
| "grad_norm": 0.036513980478048325, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5340716242790222, | |
| "mean_token_accuracy": 0.7847382575273514, | |
| "num_tokens": 11524811.0, | |
| "step": 707 | |
| }, | |
| { | |
| "entropy": 0.5237598121166229, | |
| "epoch": 2.642390289449113, | |
| "grad_norm": 0.03360476344823837, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5258880257606506, | |
| "mean_token_accuracy": 0.7865117788314819, | |
| "num_tokens": 11541335.0, | |
| "step": 708 | |
| }, | |
| { | |
| "entropy": 0.5325336754322052, | |
| "epoch": 2.646125116713352, | |
| "grad_norm": 0.03501066192984581, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5358341336250305, | |
| "mean_token_accuracy": 0.7841547876596451, | |
| "num_tokens": 11557859.0, | |
| "step": 709 | |
| }, | |
| { | |
| "entropy": 0.5220260694622993, | |
| "epoch": 2.649859943977591, | |
| "grad_norm": 0.038072340190410614, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5222914814949036, | |
| "mean_token_accuracy": 0.7870404571294785, | |
| "num_tokens": 11574116.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 0.5257419422268867, | |
| "epoch": 2.65359477124183, | |
| "grad_norm": 0.03713792935013771, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5267120599746704, | |
| "mean_token_accuracy": 0.7852788418531418, | |
| "num_tokens": 11590295.0, | |
| "step": 711 | |
| }, | |
| { | |
| "entropy": 0.5383759438991547, | |
| "epoch": 2.657329598506069, | |
| "grad_norm": 0.04603256285190582, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5421494841575623, | |
| "mean_token_accuracy": 0.781136080622673, | |
| "num_tokens": 11606581.0, | |
| "step": 712 | |
| }, | |
| { | |
| "entropy": 0.5336297005414963, | |
| "epoch": 2.661064425770308, | |
| "grad_norm": 0.03931435942649841, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5313882231712341, | |
| "mean_token_accuracy": 0.7825400978326797, | |
| "num_tokens": 11622793.0, | |
| "step": 713 | |
| }, | |
| { | |
| "entropy": 0.5316190719604492, | |
| "epoch": 2.664799253034547, | |
| "grad_norm": 0.03564710542559624, | |
| "learning_rate": 0.0002, | |
| "loss": 0.530137836933136, | |
| "mean_token_accuracy": 0.7861842215061188, | |
| "num_tokens": 11638909.0, | |
| "step": 714 | |
| }, | |
| { | |
| "entropy": 0.529007188975811, | |
| "epoch": 2.668534080298786, | |
| "grad_norm": 0.03671964257955551, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5294506549835205, | |
| "mean_token_accuracy": 0.7843856066465378, | |
| "num_tokens": 11655048.0, | |
| "step": 715 | |
| }, | |
| { | |
| "entropy": 0.5391807407140732, | |
| "epoch": 2.6722689075630255, | |
| "grad_norm": 0.043020427227020264, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5463923215866089, | |
| "mean_token_accuracy": 0.7798423320055008, | |
| "num_tokens": 11671616.0, | |
| "step": 716 | |
| }, | |
| { | |
| "entropy": 0.5369218289852142, | |
| "epoch": 2.6760037348272645, | |
| "grad_norm": 0.04039768502116203, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5428373217582703, | |
| "mean_token_accuracy": 0.7810155898332596, | |
| "num_tokens": 11687981.0, | |
| "step": 717 | |
| }, | |
| { | |
| "entropy": 0.5410373359918594, | |
| "epoch": 2.6797385620915035, | |
| "grad_norm": 0.032212115824222565, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538726806640625, | |
| "mean_token_accuracy": 0.7823937833309174, | |
| "num_tokens": 11704497.0, | |
| "step": 718 | |
| }, | |
| { | |
| "entropy": 0.5408433228731155, | |
| "epoch": 2.6834733893557425, | |
| "grad_norm": 0.04190416634082794, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5312804579734802, | |
| "mean_token_accuracy": 0.7858607023954391, | |
| "num_tokens": 11720759.0, | |
| "step": 719 | |
| }, | |
| { | |
| "entropy": 0.52065759152174, | |
| "epoch": 2.6872082166199815, | |
| "grad_norm": 0.03749416023492813, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5172442197799683, | |
| "mean_token_accuracy": 0.7908001989126205, | |
| "num_tokens": 11736897.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 0.5223864614963531, | |
| "epoch": 2.6909430438842206, | |
| "grad_norm": 0.03889421746134758, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5262103080749512, | |
| "mean_token_accuracy": 0.7863954603672028, | |
| "num_tokens": 11753026.0, | |
| "step": 721 | |
| }, | |
| { | |
| "entropy": 0.5417105704545975, | |
| "epoch": 2.6946778711484596, | |
| "grad_norm": 0.03900585323572159, | |
| "learning_rate": 0.0002, | |
| "loss": 0.548478901386261, | |
| "mean_token_accuracy": 0.7769544124603271, | |
| "num_tokens": 11769364.0, | |
| "step": 722 | |
| }, | |
| { | |
| "entropy": 0.5348965376615524, | |
| "epoch": 2.6984126984126986, | |
| "grad_norm": 0.040531598031520844, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5366338491439819, | |
| "mean_token_accuracy": 0.7824574261903763, | |
| "num_tokens": 11785662.0, | |
| "step": 723 | |
| }, | |
| { | |
| "entropy": 0.5472202748060226, | |
| "epoch": 2.7021475256769376, | |
| "grad_norm": 0.03544607013463974, | |
| "learning_rate": 0.0002, | |
| "loss": 0.546108067035675, | |
| "mean_token_accuracy": 0.7778937071561813, | |
| "num_tokens": 11802091.0, | |
| "step": 724 | |
| }, | |
| { | |
| "entropy": 0.5445298254489899, | |
| "epoch": 2.7058823529411766, | |
| "grad_norm": 0.045996710658073425, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5458025336265564, | |
| "mean_token_accuracy": 0.7784214168787003, | |
| "num_tokens": 11818307.0, | |
| "step": 725 | |
| }, | |
| { | |
| "entropy": 0.5437731146812439, | |
| "epoch": 2.7096171802054156, | |
| "grad_norm": 0.040692199021577835, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5425392389297485, | |
| "mean_token_accuracy": 0.7800037413835526, | |
| "num_tokens": 11834733.0, | |
| "step": 726 | |
| }, | |
| { | |
| "entropy": 0.5586313903331757, | |
| "epoch": 2.7133520074696547, | |
| "grad_norm": 0.05102645978331566, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5646232962608337, | |
| "mean_token_accuracy": 0.7713905870914459, | |
| "num_tokens": 11851346.0, | |
| "step": 727 | |
| }, | |
| { | |
| "entropy": 0.5276175439357758, | |
| "epoch": 2.7170868347338937, | |
| "grad_norm": 0.04199473559856415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5330867767333984, | |
| "mean_token_accuracy": 0.7841922342777252, | |
| "num_tokens": 11867709.0, | |
| "step": 728 | |
| }, | |
| { | |
| "entropy": 0.5365078300237656, | |
| "epoch": 2.7208216619981327, | |
| "grad_norm": 0.038084954023361206, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5328811407089233, | |
| "mean_token_accuracy": 0.7830130755901337, | |
| "num_tokens": 11884172.0, | |
| "step": 729 | |
| }, | |
| { | |
| "entropy": 0.5306914746761322, | |
| "epoch": 2.7245564892623717, | |
| "grad_norm": 0.04009576886892319, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5335056185722351, | |
| "mean_token_accuracy": 0.784161165356636, | |
| "num_tokens": 11900524.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 0.5325679033994675, | |
| "epoch": 2.7282913165266107, | |
| "grad_norm": 0.0398661270737648, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5311678051948547, | |
| "mean_token_accuracy": 0.7866542786359787, | |
| "num_tokens": 11916696.0, | |
| "step": 731 | |
| }, | |
| { | |
| "entropy": 0.5234319120645523, | |
| "epoch": 2.7320261437908497, | |
| "grad_norm": 0.03887765109539032, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5243536233901978, | |
| "mean_token_accuracy": 0.786685973405838, | |
| "num_tokens": 11933375.0, | |
| "step": 732 | |
| }, | |
| { | |
| "entropy": 0.5323622822761536, | |
| "epoch": 2.7357609710550888, | |
| "grad_norm": 0.041390158236026764, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5382110476493835, | |
| "mean_token_accuracy": 0.7813025563955307, | |
| "num_tokens": 11949641.0, | |
| "step": 733 | |
| }, | |
| { | |
| "entropy": 0.5282771736383438, | |
| "epoch": 2.7394957983193278, | |
| "grad_norm": 0.03821795806288719, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5237923860549927, | |
| "mean_token_accuracy": 0.7858958840370178, | |
| "num_tokens": 11965904.0, | |
| "step": 734 | |
| }, | |
| { | |
| "entropy": 0.5336133688688278, | |
| "epoch": 2.743230625583567, | |
| "grad_norm": 0.040790773928165436, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5322080850601196, | |
| "mean_token_accuracy": 0.7814221978187561, | |
| "num_tokens": 11982242.0, | |
| "step": 735 | |
| }, | |
| { | |
| "entropy": 0.5447276085615158, | |
| "epoch": 2.746965452847806, | |
| "grad_norm": 0.03733038902282715, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5435236096382141, | |
| "mean_token_accuracy": 0.7783806473016739, | |
| "num_tokens": 11998525.0, | |
| "step": 736 | |
| }, | |
| { | |
| "entropy": 0.5370974391698837, | |
| "epoch": 2.750700280112045, | |
| "grad_norm": 0.035691265016794205, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5391957759857178, | |
| "mean_token_accuracy": 0.7787430435419083, | |
| "num_tokens": 12014726.0, | |
| "step": 737 | |
| }, | |
| { | |
| "entropy": 0.5190877616405487, | |
| "epoch": 2.754435107376284, | |
| "grad_norm": 0.037242453545331955, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5239222645759583, | |
| "mean_token_accuracy": 0.7867171913385391, | |
| "num_tokens": 12030648.0, | |
| "step": 738 | |
| }, | |
| { | |
| "entropy": 0.5201060324907303, | |
| "epoch": 2.758169934640523, | |
| "grad_norm": 0.03840528428554535, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5264686942100525, | |
| "mean_token_accuracy": 0.7854082137346268, | |
| "num_tokens": 12046824.0, | |
| "step": 739 | |
| }, | |
| { | |
| "entropy": 0.5208890736103058, | |
| "epoch": 2.761904761904762, | |
| "grad_norm": 0.038443028926849365, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5207111239433289, | |
| "mean_token_accuracy": 0.7860049307346344, | |
| "num_tokens": 12063182.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 0.5337280184030533, | |
| "epoch": 2.765639589169001, | |
| "grad_norm": 0.0408535934984684, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5295891165733337, | |
| "mean_token_accuracy": 0.7857932895421982, | |
| "num_tokens": 12079411.0, | |
| "step": 741 | |
| }, | |
| { | |
| "entropy": 0.5374506562948227, | |
| "epoch": 2.76937441643324, | |
| "grad_norm": 0.04354558512568474, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5414345860481262, | |
| "mean_token_accuracy": 0.7807870209217072, | |
| "num_tokens": 12095874.0, | |
| "step": 742 | |
| }, | |
| { | |
| "entropy": 0.5360343009233475, | |
| "epoch": 2.773109243697479, | |
| "grad_norm": 0.03928976133465767, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5380703210830688, | |
| "mean_token_accuracy": 0.7798075079917908, | |
| "num_tokens": 12112215.0, | |
| "step": 743 | |
| }, | |
| { | |
| "entropy": 0.5264292061328888, | |
| "epoch": 2.776844070961718, | |
| "grad_norm": 0.03775021806359291, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5281617045402527, | |
| "mean_token_accuracy": 0.7842919081449509, | |
| "num_tokens": 12128361.0, | |
| "step": 744 | |
| }, | |
| { | |
| "entropy": 0.5419831871986389, | |
| "epoch": 2.780578898225957, | |
| "grad_norm": 0.032331038266420364, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5362944602966309, | |
| "mean_token_accuracy": 0.7816326916217804, | |
| "num_tokens": 12144755.0, | |
| "step": 745 | |
| }, | |
| { | |
| "entropy": 0.5174460113048553, | |
| "epoch": 2.784313725490196, | |
| "grad_norm": 0.03798742592334747, | |
| "learning_rate": 0.0002, | |
| "loss": 0.515007495880127, | |
| "mean_token_accuracy": 0.7882062345743179, | |
| "num_tokens": 12161034.0, | |
| "step": 746 | |
| }, | |
| { | |
| "entropy": 0.5355328992009163, | |
| "epoch": 2.788048552754435, | |
| "grad_norm": 0.036557331681251526, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5344611406326294, | |
| "mean_token_accuracy": 0.7847500294446945, | |
| "num_tokens": 12177479.0, | |
| "step": 747 | |
| }, | |
| { | |
| "entropy": 0.538584902882576, | |
| "epoch": 2.791783380018674, | |
| "grad_norm": 0.039520300924777985, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5427792072296143, | |
| "mean_token_accuracy": 0.7786386609077454, | |
| "num_tokens": 12193830.0, | |
| "step": 748 | |
| }, | |
| { | |
| "entropy": 0.51973095536232, | |
| "epoch": 2.795518207282913, | |
| "grad_norm": 0.04126165434718132, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5279180407524109, | |
| "mean_token_accuracy": 0.784518226981163, | |
| "num_tokens": 12210022.0, | |
| "step": 749 | |
| }, | |
| { | |
| "entropy": 0.5385647118091583, | |
| "epoch": 2.799253034547152, | |
| "grad_norm": 0.03742329403758049, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5358390808105469, | |
| "mean_token_accuracy": 0.7814119607210159, | |
| "num_tokens": 12226184.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 0.5483904033899307, | |
| "epoch": 2.802987861811391, | |
| "grad_norm": 0.03444087877869606, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5442800521850586, | |
| "mean_token_accuracy": 0.7782953381538391, | |
| "num_tokens": 12242564.0, | |
| "step": 751 | |
| }, | |
| { | |
| "entropy": 0.5447859466075897, | |
| "epoch": 2.80672268907563, | |
| "grad_norm": 0.037425972521305084, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5376838445663452, | |
| "mean_token_accuracy": 0.7805659919977188, | |
| "num_tokens": 12259077.0, | |
| "step": 752 | |
| }, | |
| { | |
| "entropy": 0.526421070098877, | |
| "epoch": 2.810457516339869, | |
| "grad_norm": 0.039544545114040375, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5272819399833679, | |
| "mean_token_accuracy": 0.7836880385875702, | |
| "num_tokens": 12275297.0, | |
| "step": 753 | |
| }, | |
| { | |
| "entropy": 0.538783460855484, | |
| "epoch": 2.814192343604108, | |
| "grad_norm": 0.035788971930742264, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5417999625205994, | |
| "mean_token_accuracy": 0.7819748818874359, | |
| "num_tokens": 12291643.0, | |
| "step": 754 | |
| }, | |
| { | |
| "entropy": 0.5367716252803802, | |
| "epoch": 2.817927170868347, | |
| "grad_norm": 0.040753189474344254, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5376288294792175, | |
| "mean_token_accuracy": 0.7829637825489044, | |
| "num_tokens": 12307987.0, | |
| "step": 755 | |
| }, | |
| { | |
| "entropy": 0.5418078452348709, | |
| "epoch": 2.821661998132586, | |
| "grad_norm": 0.036726806312799454, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5469898581504822, | |
| "mean_token_accuracy": 0.7801835685968399, | |
| "num_tokens": 12324503.0, | |
| "step": 756 | |
| }, | |
| { | |
| "entropy": 0.525896355509758, | |
| "epoch": 2.825396825396825, | |
| "grad_norm": 0.034559980034828186, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5265108942985535, | |
| "mean_token_accuracy": 0.7867930829524994, | |
| "num_tokens": 12340881.0, | |
| "step": 757 | |
| }, | |
| { | |
| "entropy": 0.5369487851858139, | |
| "epoch": 2.8291316526610646, | |
| "grad_norm": 0.03595944494009018, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5396771430969238, | |
| "mean_token_accuracy": 0.7813677042722702, | |
| "num_tokens": 12357352.0, | |
| "step": 758 | |
| }, | |
| { | |
| "entropy": 0.5467210859060287, | |
| "epoch": 2.8328664799253036, | |
| "grad_norm": 0.03524104505777359, | |
| "learning_rate": 0.0002, | |
| "loss": 0.544916570186615, | |
| "mean_token_accuracy": 0.7771721184253693, | |
| "num_tokens": 12373526.0, | |
| "step": 759 | |
| }, | |
| { | |
| "entropy": 0.5246351063251495, | |
| "epoch": 2.8366013071895426, | |
| "grad_norm": 0.036806508898735046, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5221924781799316, | |
| "mean_token_accuracy": 0.7871624380350113, | |
| "num_tokens": 12389771.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 0.530710369348526, | |
| "epoch": 2.8403361344537816, | |
| "grad_norm": 0.04332499951124191, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5322965383529663, | |
| "mean_token_accuracy": 0.7832685261964798, | |
| "num_tokens": 12406028.0, | |
| "step": 761 | |
| }, | |
| { | |
| "entropy": 0.5254833996295929, | |
| "epoch": 2.8440709617180207, | |
| "grad_norm": 0.038304176181554794, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5253804922103882, | |
| "mean_token_accuracy": 0.7873952239751816, | |
| "num_tokens": 12422639.0, | |
| "step": 762 | |
| }, | |
| { | |
| "entropy": 0.5236704498529434, | |
| "epoch": 2.8478057889822597, | |
| "grad_norm": 0.03660830482840538, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5286169052124023, | |
| "mean_token_accuracy": 0.7816056311130524, | |
| "num_tokens": 12438922.0, | |
| "step": 763 | |
| }, | |
| { | |
| "entropy": 0.5321139246225357, | |
| "epoch": 2.8515406162464987, | |
| "grad_norm": 0.04276243969798088, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5400298237800598, | |
| "mean_token_accuracy": 0.7802720963954926, | |
| "num_tokens": 12455234.0, | |
| "step": 764 | |
| }, | |
| { | |
| "entropy": 0.5383250862360001, | |
| "epoch": 2.8552754435107377, | |
| "grad_norm": 0.04291578382253647, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5375620722770691, | |
| "mean_token_accuracy": 0.7810464948415756, | |
| "num_tokens": 12471352.0, | |
| "step": 765 | |
| }, | |
| { | |
| "entropy": 0.5423205345869064, | |
| "epoch": 2.8590102707749767, | |
| "grad_norm": 0.04575496166944504, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5404216647148132, | |
| "mean_token_accuracy": 0.7788951247930527, | |
| "num_tokens": 12487810.0, | |
| "step": 766 | |
| }, | |
| { | |
| "entropy": 0.5412723869085312, | |
| "epoch": 2.8627450980392157, | |
| "grad_norm": 0.03895537182688713, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416159629821777, | |
| "mean_token_accuracy": 0.7791194468736649, | |
| "num_tokens": 12504261.0, | |
| "step": 767 | |
| }, | |
| { | |
| "entropy": 0.551712304353714, | |
| "epoch": 2.8664799253034547, | |
| "grad_norm": 0.04248276725411415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5512599945068359, | |
| "mean_token_accuracy": 0.7787346094846725, | |
| "num_tokens": 12520594.0, | |
| "step": 768 | |
| }, | |
| { | |
| "entropy": 0.5365375429391861, | |
| "epoch": 2.8702147525676938, | |
| "grad_norm": 0.0429382361471653, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5369971990585327, | |
| "mean_token_accuracy": 0.7795698195695877, | |
| "num_tokens": 12537097.0, | |
| "step": 769 | |
| }, | |
| { | |
| "entropy": 0.5311344265937805, | |
| "epoch": 2.8739495798319328, | |
| "grad_norm": 0.03710220381617546, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5327049493789673, | |
| "mean_token_accuracy": 0.784042477607727, | |
| "num_tokens": 12553319.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 0.5425883233547211, | |
| "epoch": 2.877684407096172, | |
| "grad_norm": 0.04352175444364548, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5457234382629395, | |
| "mean_token_accuracy": 0.7795119434595108, | |
| "num_tokens": 12569370.0, | |
| "step": 771 | |
| }, | |
| { | |
| "entropy": 0.5384223312139511, | |
| "epoch": 2.881419234360411, | |
| "grad_norm": 0.046248357743024826, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5449962615966797, | |
| "mean_token_accuracy": 0.7777050882577896, | |
| "num_tokens": 12585550.0, | |
| "step": 772 | |
| }, | |
| { | |
| "entropy": 0.5304270684719086, | |
| "epoch": 2.88515406162465, | |
| "grad_norm": 0.03803584724664688, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5308764576911926, | |
| "mean_token_accuracy": 0.7852406352758408, | |
| "num_tokens": 12601869.0, | |
| "step": 773 | |
| }, | |
| { | |
| "entropy": 0.5238187685608864, | |
| "epoch": 2.888888888888889, | |
| "grad_norm": 0.04374956712126732, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5296017527580261, | |
| "mean_token_accuracy": 0.7867107540369034, | |
| "num_tokens": 12618133.0, | |
| "step": 774 | |
| }, | |
| { | |
| "entropy": 0.545166626572609, | |
| "epoch": 2.892623716153128, | |
| "grad_norm": 0.04235200583934784, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444045066833496, | |
| "mean_token_accuracy": 0.7811264097690582, | |
| "num_tokens": 12634590.0, | |
| "step": 775 | |
| }, | |
| { | |
| "entropy": 0.552961677312851, | |
| "epoch": 2.896358543417367, | |
| "grad_norm": 0.04033121094107628, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5423647165298462, | |
| "mean_token_accuracy": 0.7789802700281143, | |
| "num_tokens": 12650990.0, | |
| "step": 776 | |
| }, | |
| { | |
| "entropy": 0.5362664610147476, | |
| "epoch": 2.900093370681606, | |
| "grad_norm": 0.039799049496650696, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5340068340301514, | |
| "mean_token_accuracy": 0.7801271975040436, | |
| "num_tokens": 12667374.0, | |
| "step": 777 | |
| }, | |
| { | |
| "entropy": 0.540292888879776, | |
| "epoch": 2.903828197945845, | |
| "grad_norm": 0.04687785729765892, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5417227149009705, | |
| "mean_token_accuracy": 0.7800564914941788, | |
| "num_tokens": 12683778.0, | |
| "step": 778 | |
| }, | |
| { | |
| "entropy": 0.5580530762672424, | |
| "epoch": 2.907563025210084, | |
| "grad_norm": 0.04104934632778168, | |
| "learning_rate": 0.0002, | |
| "loss": 0.553903341293335, | |
| "mean_token_accuracy": 0.7754019796848297, | |
| "num_tokens": 12700259.0, | |
| "step": 779 | |
| }, | |
| { | |
| "entropy": 0.5188224613666534, | |
| "epoch": 2.911297852474323, | |
| "grad_norm": 0.04876643791794777, | |
| "learning_rate": 0.0002, | |
| "loss": 0.525776207447052, | |
| "mean_token_accuracy": 0.7853571325540543, | |
| "num_tokens": 12716566.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 0.5420665293931961, | |
| "epoch": 2.915032679738562, | |
| "grad_norm": 0.04760121926665306, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5495279431343079, | |
| "mean_token_accuracy": 0.7769062519073486, | |
| "num_tokens": 12732949.0, | |
| "step": 781 | |
| }, | |
| { | |
| "entropy": 0.5393791049718857, | |
| "epoch": 2.918767507002801, | |
| "grad_norm": 0.0337008535861969, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5375462174415588, | |
| "mean_token_accuracy": 0.7824095785617828, | |
| "num_tokens": 12749208.0, | |
| "step": 782 | |
| }, | |
| { | |
| "entropy": 0.5315912365913391, | |
| "epoch": 2.9225023342670404, | |
| "grad_norm": 0.04428756982088089, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5206541419029236, | |
| "mean_token_accuracy": 0.7908456176519394, | |
| "num_tokens": 12765331.0, | |
| "step": 783 | |
| }, | |
| { | |
| "entropy": 0.5318206250667572, | |
| "epoch": 2.9262371615312794, | |
| "grad_norm": 0.04391348361968994, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5263054370880127, | |
| "mean_token_accuracy": 0.7842861711978912, | |
| "num_tokens": 12781575.0, | |
| "step": 784 | |
| }, | |
| { | |
| "entropy": 0.5414671450853348, | |
| "epoch": 2.9299719887955185, | |
| "grad_norm": 0.03392143175005913, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5417372584342957, | |
| "mean_token_accuracy": 0.779655933380127, | |
| "num_tokens": 12797804.0, | |
| "step": 785 | |
| }, | |
| { | |
| "entropy": 0.5150401219725609, | |
| "epoch": 2.9337068160597575, | |
| "grad_norm": 0.04989241063594818, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5268764495849609, | |
| "mean_token_accuracy": 0.7849253863096237, | |
| "num_tokens": 12814387.0, | |
| "step": 786 | |
| }, | |
| { | |
| "entropy": 0.5104701817035675, | |
| "epoch": 2.9374416433239965, | |
| "grad_norm": 0.04267291724681854, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5144373178482056, | |
| "mean_token_accuracy": 0.7921061366796494, | |
| "num_tokens": 12830547.0, | |
| "step": 787 | |
| }, | |
| { | |
| "entropy": 0.5301306545734406, | |
| "epoch": 2.9411764705882355, | |
| "grad_norm": 0.041861243546009064, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5351182818412781, | |
| "mean_token_accuracy": 0.7849584370851517, | |
| "num_tokens": 12846796.0, | |
| "step": 788 | |
| }, | |
| { | |
| "entropy": 0.5566616058349609, | |
| "epoch": 2.9449112978524745, | |
| "grad_norm": 0.04726849123835564, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5562955737113953, | |
| "mean_token_accuracy": 0.7750595211982727, | |
| "num_tokens": 12863231.0, | |
| "step": 789 | |
| }, | |
| { | |
| "entropy": 0.5550259649753571, | |
| "epoch": 2.9486461251167135, | |
| "grad_norm": 0.04144451022148132, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5501708388328552, | |
| "mean_token_accuracy": 0.7760492265224457, | |
| "num_tokens": 12879599.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 0.5439048856496811, | |
| "epoch": 2.9523809523809526, | |
| "grad_norm": 0.038411688059568405, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5328619480133057, | |
| "mean_token_accuracy": 0.7869621217250824, | |
| "num_tokens": 12895954.0, | |
| "step": 791 | |
| }, | |
| { | |
| "entropy": 0.5426651537418365, | |
| "epoch": 2.9561157796451916, | |
| "grad_norm": 0.035909172147512436, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5376070141792297, | |
| "mean_token_accuracy": 0.7810229063034058, | |
| "num_tokens": 12912468.0, | |
| "step": 792 | |
| }, | |
| { | |
| "entropy": 0.5385068506002426, | |
| "epoch": 2.9598506069094306, | |
| "grad_norm": 0.04422811418771744, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5405643582344055, | |
| "mean_token_accuracy": 0.7827010452747345, | |
| "num_tokens": 12929047.0, | |
| "step": 793 | |
| }, | |
| { | |
| "entropy": 0.5246873497962952, | |
| "epoch": 2.9635854341736696, | |
| "grad_norm": 0.042685672640800476, | |
| "learning_rate": 0.0002, | |
| "loss": 0.537744402885437, | |
| "mean_token_accuracy": 0.7845292538404465, | |
| "num_tokens": 12945498.0, | |
| "step": 794 | |
| }, | |
| { | |
| "entropy": 0.534453883767128, | |
| "epoch": 2.9673202614379086, | |
| "grad_norm": 0.04630210995674133, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448824763298035, | |
| "mean_token_accuracy": 0.7790633589029312, | |
| "num_tokens": 12961911.0, | |
| "step": 795 | |
| }, | |
| { | |
| "entropy": 0.551120862364769, | |
| "epoch": 2.9710550887021476, | |
| "grad_norm": 0.038833893835544586, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5517142415046692, | |
| "mean_token_accuracy": 0.7771248668432236, | |
| "num_tokens": 12978275.0, | |
| "step": 796 | |
| }, | |
| { | |
| "entropy": 0.540284737944603, | |
| "epoch": 2.9747899159663866, | |
| "grad_norm": 0.034402430057525635, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5354663133621216, | |
| "mean_token_accuracy": 0.7817137837409973, | |
| "num_tokens": 12994610.0, | |
| "step": 797 | |
| }, | |
| { | |
| "entropy": 0.5466310381889343, | |
| "epoch": 2.9785247432306257, | |
| "grad_norm": 0.07181618362665176, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5540565848350525, | |
| "mean_token_accuracy": 0.7755098789930344, | |
| "num_tokens": 13011180.0, | |
| "step": 798 | |
| }, | |
| { | |
| "entropy": 0.5366263538599014, | |
| "epoch": 2.9822595704948647, | |
| "grad_norm": 0.038452569395303726, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5375447869300842, | |
| "mean_token_accuracy": 0.7817091047763824, | |
| "num_tokens": 13027553.0, | |
| "step": 799 | |
| }, | |
| { | |
| "entropy": 0.5117043852806091, | |
| "epoch": 2.9859943977591037, | |
| "grad_norm": 0.040419358760118484, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5115300416946411, | |
| "mean_token_accuracy": 0.7910782992839813, | |
| "num_tokens": 13043466.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 0.5549824833869934, | |
| "epoch": 2.9897292250233427, | |
| "grad_norm": 0.04015415534377098, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5516586303710938, | |
| "mean_token_accuracy": 0.7774178683757782, | |
| "num_tokens": 13059980.0, | |
| "step": 801 | |
| }, | |
| { | |
| "entropy": 0.5470731258392334, | |
| "epoch": 2.9934640522875817, | |
| "grad_norm": 0.03732411563396454, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5440268516540527, | |
| "mean_token_accuracy": 0.7784831672906876, | |
| "num_tokens": 13076305.0, | |
| "step": 802 | |
| }, | |
| { | |
| "entropy": 0.5496807992458344, | |
| "epoch": 2.9971988795518207, | |
| "grad_norm": 0.042060188949108124, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5516492128372192, | |
| "mean_token_accuracy": 0.7782593071460724, | |
| "num_tokens": 13092596.0, | |
| "step": 803 | |
| }, | |
| { | |
| "entropy": 0.5623628298441569, | |
| "epoch": 3.0, | |
| "grad_norm": 0.04183833301067352, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5470706820487976, | |
| "mean_token_accuracy": 0.7766743898391724, | |
| "num_tokens": 13094419.0, | |
| "step": 804 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 804, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.2222761209723617e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |