{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 2916,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005145356315924878,
      "grad_norm": 3.516500949859619,
      "learning_rate": 5.47945205479452e-07,
      "loss": 1.0549,
      "mean_token_accuracy": 0.7244073122739791,
      "num_tokens": 10390477.0,
      "step": 5
    },
    {
      "epoch": 0.010290712631849755,
      "grad_norm": 2.3721697330474854,
      "learning_rate": 1.2328767123287673e-06,
      "loss": 1.0331,
      "mean_token_accuracy": 0.7277479201555253,
      "num_tokens": 20784784.0,
      "step": 10
    },
    {
      "epoch": 0.015436068947774634,
      "grad_norm": 1.1673978567123413,
      "learning_rate": 1.9178082191780823e-06,
      "loss": 1.0055,
      "mean_token_accuracy": 0.7293060600757599,
      "num_tokens": 31194057.0,
      "step": 15
    },
    {
      "epoch": 0.02058142526369951,
      "grad_norm": 1.1831053495407104,
      "learning_rate": 2.6027397260273973e-06,
      "loss": 0.9849,
      "mean_token_accuracy": 0.7313222289085388,
      "num_tokens": 41573889.0,
      "step": 20
    },
    {
      "epoch": 0.025726781579624387,
      "grad_norm": 0.7422630786895752,
      "learning_rate": 3.2876712328767123e-06,
      "loss": 0.9496,
      "mean_token_accuracy": 0.7391268193721772,
      "num_tokens": 51955847.0,
      "step": 25
    },
    {
      "epoch": 0.030872137895549268,
      "grad_norm": 0.4367648959159851,
      "learning_rate": 3.972602739726027e-06,
      "loss": 0.9444,
      "mean_token_accuracy": 0.7393645942211151,
      "num_tokens": 62362424.0,
      "step": 30
    },
    {
      "epoch": 0.03601749421147415,
      "grad_norm": 0.38364696502685547,
      "learning_rate": 4.657534246575343e-06,
      "loss": 0.9117,
      "mean_token_accuracy": 0.7468442142009735,
      "num_tokens": 72760156.0,
      "step": 35
    },
    {
      "epoch": 0.04116285052739902,
      "grad_norm": 0.32832905650138855,
      "learning_rate": 5.342465753424658e-06,
      "loss": 0.9062,
      "mean_token_accuracy": 0.7470937430858612,
      "num_tokens": 83160696.0,
      "step": 40
    },
    {
      "epoch": 0.0463082068433239,
      "grad_norm": 0.254768967628479,
      "learning_rate": 6.027397260273973e-06,
      "loss": 0.8938,
      "mean_token_accuracy": 0.7495229691267014,
      "num_tokens": 93541996.0,
      "step": 45
    },
    {
      "epoch": 0.051453563159248775,
      "grad_norm": 0.23051689565181732,
      "learning_rate": 6.712328767123288e-06,
      "loss": 0.9012,
      "mean_token_accuracy": 0.7474822252988815,
      "num_tokens": 103951096.0,
      "step": 50
    },
    {
      "epoch": 0.056598919475173655,
      "grad_norm": 0.20961317420005798,
      "learning_rate": 7.397260273972603e-06,
      "loss": 0.8874,
      "mean_token_accuracy": 0.75033338367939,
      "num_tokens": 114348089.0,
      "step": 55
    },
    {
      "epoch": 0.061744275791098535,
      "grad_norm": 0.20559196174144745,
      "learning_rate": 8.082191780821919e-06,
      "loss": 0.8788,
      "mean_token_accuracy": 0.7523881673812867,
      "num_tokens": 124723656.0,
      "step": 60
    },
    {
      "epoch": 0.06688963210702341,
      "grad_norm": 0.2094876766204834,
      "learning_rate": 8.767123287671233e-06,
      "loss": 0.87,
      "mean_token_accuracy": 0.7535106301307678,
      "num_tokens": 135146677.0,
      "step": 65
    },
    {
      "epoch": 0.0720349884229483,
      "grad_norm": 0.19069737195968628,
      "learning_rate": 9.452054794520548e-06,
      "loss": 0.8749,
      "mean_token_accuracy": 0.7522967606782913,
      "num_tokens": 145548072.0,
      "step": 70
    },
    {
      "epoch": 0.07718034473887317,
      "grad_norm": 0.19116425514221191,
      "learning_rate": 1.0136986301369864e-05,
      "loss": 0.8592,
      "mean_token_accuracy": 0.7557980835437774,
      "num_tokens": 155949133.0,
      "step": 75
    },
    {
      "epoch": 0.08232570105479804,
      "grad_norm": 0.21261022984981537,
      "learning_rate": 1.082191780821918e-05,
      "loss": 0.8596,
      "mean_token_accuracy": 0.7559153437614441,
      "num_tokens": 166313331.0,
      "step": 80
    },
    {
      "epoch": 0.08747105737072293,
      "grad_norm": 0.19367730617523193,
      "learning_rate": 1.1506849315068493e-05,
      "loss": 0.8616,
      "mean_token_accuracy": 0.7551171153783798,
      "num_tokens": 176729555.0,
      "step": 85
    },
    {
      "epoch": 0.0926164136866478,
      "grad_norm": 0.18082526326179504,
      "learning_rate": 1.219178082191781e-05,
      "loss": 0.8489,
      "mean_token_accuracy": 0.757958498597145,
      "num_tokens": 187130427.0,
      "step": 90
    },
    {
      "epoch": 0.09776177000257268,
      "grad_norm": 0.20260317623615265,
      "learning_rate": 1.2876712328767125e-05,
      "loss": 0.8508,
      "mean_token_accuracy": 0.7575758665800094,
      "num_tokens": 197551850.0,
      "step": 95
    },
    {
      "epoch": 0.10290712631849755,
      "grad_norm": 0.23283614218235016,
      "learning_rate": 1.356164383561644e-05,
      "loss": 0.8489,
      "mean_token_accuracy": 0.7578668266534805,
      "num_tokens": 207961400.0,
      "step": 100
    },
    {
      "epoch": 0.10805248263442244,
      "grad_norm": 0.21379666030406952,
      "learning_rate": 1.4246575342465754e-05,
      "loss": 0.8339,
      "mean_token_accuracy": 0.7610192090272904,
      "num_tokens": 218376661.0,
      "step": 105
    },
    {
      "epoch": 0.11319783895034731,
      "grad_norm": 0.20584498345851898,
      "learning_rate": 1.493150684931507e-05,
      "loss": 0.8371,
      "mean_token_accuracy": 0.7601476907730103,
      "num_tokens": 228779024.0,
      "step": 110
    },
    {
      "epoch": 0.11834319526627218,
      "grad_norm": 0.22660651803016663,
      "learning_rate": 1.5616438356164384e-05,
      "loss": 0.8379,
      "mean_token_accuracy": 0.7598486542701721,
      "num_tokens": 239167993.0,
      "step": 115
    },
    {
      "epoch": 0.12348855158219707,
      "grad_norm": 0.21756519377231598,
      "learning_rate": 1.6301369863013702e-05,
      "loss": 0.833,
      "mean_token_accuracy": 0.7604194134473801,
      "num_tokens": 249533588.0,
      "step": 120
    },
    {
      "epoch": 0.12863390789812196,
      "grad_norm": 0.2544439733028412,
      "learning_rate": 1.6986301369863014e-05,
      "loss": 0.8267,
      "mean_token_accuracy": 0.7623803347349167,
      "num_tokens": 259910590.0,
      "step": 125
    },
    {
      "epoch": 0.13377926421404682,
      "grad_norm": 0.22863225638866425,
      "learning_rate": 1.767123287671233e-05,
      "loss": 0.8326,
      "mean_token_accuracy": 0.7607358664274215,
      "num_tokens": 270321780.0,
      "step": 130
    },
    {
      "epoch": 0.1389246205299717,
      "grad_norm": 0.2177596092224121,
      "learning_rate": 1.8356164383561645e-05,
      "loss": 0.8181,
      "mean_token_accuracy": 0.7643768191337585,
      "num_tokens": 280754879.0,
      "step": 135
    },
    {
      "epoch": 0.1440699768458966,
      "grad_norm": 0.25882163643836975,
      "learning_rate": 1.904109589041096e-05,
      "loss": 0.83,
      "mean_token_accuracy": 0.7610959112644196,
      "num_tokens": 291123129.0,
      "step": 140
    },
    {
      "epoch": 0.14921533316182145,
      "grad_norm": 0.3098185062408447,
      "learning_rate": 1.9726027397260276e-05,
      "loss": 0.8421,
      "mean_token_accuracy": 0.7577149778604507,
      "num_tokens": 301526988.0,
      "step": 145
    },
    {
      "epoch": 0.15436068947774634,
      "grad_norm": 0.3141907751560211,
      "learning_rate": 1.999995658762304e-05,
      "loss": 0.8239,
      "mean_token_accuracy": 0.7624665558338165,
      "num_tokens": 311935967.0,
      "step": 150
    },
    {
      "epoch": 0.15950604579367123,
      "grad_norm": 0.2806973159313202,
      "learning_rate": 1.999969129158383e-05,
      "loss": 0.8149,
      "mean_token_accuracy": 0.7648573398590088,
      "num_tokens": 322344265.0,
      "step": 155
    },
    {
      "epoch": 0.16465140210959608,
      "grad_norm": 0.25377604365348816,
      "learning_rate": 1.999918482601347e-05,
      "loss": 0.8115,
      "mean_token_accuracy": 0.7656003296375274,
      "num_tokens": 332734169.0,
      "step": 160
    },
    {
      "epoch": 0.16979675842552097,
      "grad_norm": 0.2188454419374466,
      "learning_rate": 1.9998437207198492e-05,
      "loss": 0.8159,
      "mean_token_accuracy": 0.7638908416032791,
      "num_tokens": 343149437.0,
      "step": 165
    },
    {
      "epoch": 0.17494211474144586,
      "grad_norm": 0.284574031829834,
      "learning_rate": 1.9997448459180285e-05,
      "loss": 0.8125,
      "mean_token_accuracy": 0.7646788358688354,
      "num_tokens": 353540696.0,
      "step": 170
    },
    {
      "epoch": 0.18008747105737072,
      "grad_norm": 0.2330782562494278,
      "learning_rate": 1.999621861375427e-05,
      "loss": 0.8015,
      "mean_token_accuracy": 0.7678918361663818,
      "num_tokens": 363936147.0,
      "step": 175
    },
    {
      "epoch": 0.1852328273732956,
      "grad_norm": 0.3005109429359436,
      "learning_rate": 1.9994747710468907e-05,
      "loss": 0.8221,
      "mean_token_accuracy": 0.7623626977205277,
      "num_tokens": 374341093.0,
      "step": 180
    },
    {
      "epoch": 0.19037818368922046,
      "grad_norm": 0.2539622485637665,
      "learning_rate": 1.9993035796624416e-05,
      "loss": 0.8191,
      "mean_token_accuracy": 0.7632038950920105,
      "num_tokens": 384736305.0,
      "step": 185
    },
    {
      "epoch": 0.19552354000514535,
      "grad_norm": 0.3602747321128845,
      "learning_rate": 1.9991082927271263e-05,
      "loss": 0.8072,
      "mean_token_accuracy": 0.765991085767746,
      "num_tokens": 395134979.0,
      "step": 190
    },
    {
      "epoch": 0.20066889632107024,
      "grad_norm": 0.273478627204895,
      "learning_rate": 1.9988889165208373e-05,
      "loss": 0.8078,
      "mean_token_accuracy": 0.7652726262807846,
      "num_tokens": 405523607.0,
      "step": 195
    },
    {
      "epoch": 0.2058142526369951,
      "grad_norm": 0.2960168719291687,
      "learning_rate": 1.998645458098112e-05,
      "loss": 0.813,
      "mean_token_accuracy": 0.7640766650438309,
      "num_tokens": 415905850.0,
      "step": 200
    },
    {
      "epoch": 0.21095960895291999,
      "grad_norm": 0.23220957815647125,
      "learning_rate": 1.998377925287908e-05,
      "loss": 0.8086,
      "mean_token_accuracy": 0.7652794599533081,
      "num_tokens": 426274540.0,
      "step": 205
    },
    {
      "epoch": 0.21610496526884487,
      "grad_norm": 0.28992024064064026,
      "learning_rate": 1.9980863266933464e-05,
      "loss": 0.8179,
      "mean_token_accuracy": 0.7626183509826661,
      "num_tokens": 436689729.0,
      "step": 210
    },
    {
      "epoch": 0.22125032158476973,
      "grad_norm": 0.31990498304367065,
      "learning_rate": 1.9977706716914402e-05,
      "loss": 0.803,
      "mean_token_accuracy": 0.7667577922344208,
      "num_tokens": 447114386.0,
      "step": 215
    },
    {
      "epoch": 0.22639567790069462,
      "grad_norm": 0.3343593180179596,
      "learning_rate": 1.997430970432789e-05,
      "loss": 0.7933,
      "mean_token_accuracy": 0.7693847328424454,
      "num_tokens": 457532029.0,
      "step": 220
    },
    {
      "epoch": 0.2315410342166195,
      "grad_norm": 0.24708902835845947,
      "learning_rate": 1.9970672338412554e-05,
      "loss": 0.7975,
      "mean_token_accuracy": 0.768178117275238,
      "num_tokens": 467953269.0,
      "step": 225
    },
    {
      "epoch": 0.23668639053254437,
      "grad_norm": 0.33174219727516174,
      "learning_rate": 1.9966794736136114e-05,
      "loss": 0.8089,
      "mean_token_accuracy": 0.7650195062160492,
      "num_tokens": 478360587.0,
      "step": 230
    },
    {
      "epoch": 0.24183174684846925,
      "grad_norm": 0.2640225291252136,
      "learning_rate": 1.9962677022191648e-05,
      "loss": 0.7919,
      "mean_token_accuracy": 0.7692882120609283,
      "num_tokens": 488764842.0,
      "step": 235
    },
    {
      "epoch": 0.24697710316439414,
      "grad_norm": 0.2795109450817108,
      "learning_rate": 1.9958319328993553e-05,
      "loss": 0.8134,
      "mean_token_accuracy": 0.7633749455213547,
      "num_tokens": 499181820.0,
      "step": 240
    },
    {
      "epoch": 0.25212245948031903,
      "grad_norm": 0.220581516623497,
      "learning_rate": 1.99537217966733e-05,
      "loss": 0.8017,
      "mean_token_accuracy": 0.7669906347990036,
      "num_tokens": 509595885.0,
      "step": 245
    },
    {
      "epoch": 0.2572678157962439,
      "grad_norm": 0.2826674282550812,
      "learning_rate": 1.9948884573074948e-05,
      "loss": 0.8109,
      "mean_token_accuracy": 0.7639420449733734,
      "num_tokens": 519993750.0,
      "step": 250
    },
    {
      "epoch": 0.26241317211216875,
      "grad_norm": 0.24169522523880005,
      "learning_rate": 1.9943807813750356e-05,
      "loss": 0.7981,
      "mean_token_accuracy": 0.7674963772296906,
      "num_tokens": 530411077.0,
      "step": 255
    },
    {
      "epoch": 0.26755852842809363,
      "grad_norm": 0.2998713552951813,
      "learning_rate": 1.9938491681954196e-05,
      "loss": 0.7992,
      "mean_token_accuracy": 0.7669602394104004,
      "num_tokens": 540817105.0,
      "step": 260
    },
    {
      "epoch": 0.2727038847440185,
      "grad_norm": 0.3029504716396332,
      "learning_rate": 1.993293634863871e-05,
      "loss": 0.7932,
      "mean_token_accuracy": 0.7686622679233551,
      "num_tokens": 551223833.0,
      "step": 265
    },
    {
      "epoch": 0.2778492410599434,
      "grad_norm": 0.263815701007843,
      "learning_rate": 1.99271419924482e-05,
      "loss": 0.7941,
      "mean_token_accuracy": 0.7685212969779969,
      "num_tokens": 561643697.0,
      "step": 270
    },
    {
      "epoch": 0.2829945973758683,
      "grad_norm": 0.3639836311340332,
      "learning_rate": 1.992110879971329e-05,
      "loss": 0.7973,
      "mean_token_accuracy": 0.7674791067838669,
      "num_tokens": 572054629.0,
      "step": 275
    },
    {
      "epoch": 0.2881399536917932,
      "grad_norm": 0.2445714920759201,
      "learning_rate": 1.9914836964444934e-05,
      "loss": 0.804,
      "mean_token_accuracy": 0.7654124438762665,
      "num_tokens": 582446471.0,
      "step": 280
    },
    {
      "epoch": 0.293285310007718,
      "grad_norm": 0.23441371321678162,
      "learning_rate": 1.990832668832818e-05,
      "loss": 0.7832,
      "mean_token_accuracy": 0.7708245635032653,
      "num_tokens": 592850162.0,
      "step": 285
    },
    {
      "epoch": 0.2984306663236429,
      "grad_norm": 0.27346476912498474,
      "learning_rate": 1.9901578180715674e-05,
      "loss": 0.7892,
      "mean_token_accuracy": 0.7693257629871368,
      "num_tokens": 603254165.0,
      "step": 290
    },
    {
      "epoch": 0.3035760226395678,
      "grad_norm": 0.2540152966976166,
      "learning_rate": 1.989459165862094e-05,
      "loss": 0.7873,
      "mean_token_accuracy": 0.7698075413703919,
      "num_tokens": 613613424.0,
      "step": 295
    },
    {
      "epoch": 0.3087213789554927,
      "grad_norm": 0.24472405016422272,
      "learning_rate": 1.9887367346711387e-05,
      "loss": 0.7903,
      "mean_token_accuracy": 0.7687265604734421,
      "num_tokens": 624004700.0,
      "step": 300
    },
    {
      "epoch": 0.31386673527141756,
      "grad_norm": 0.2586674094200134,
      "learning_rate": 1.987990547730111e-05,
      "loss": 0.7843,
      "mean_token_accuracy": 0.7710338205099105,
      "num_tokens": 634387240.0,
      "step": 305
    },
    {
      "epoch": 0.31901209158734245,
      "grad_norm": 0.23755744099617004,
      "learning_rate": 1.9872206290343384e-05,
      "loss": 0.7865,
      "mean_token_accuracy": 0.7694753050804138,
      "num_tokens": 644778090.0,
      "step": 310
    },
    {
      "epoch": 0.3241574479032673,
      "grad_norm": 0.3347889482975006,
      "learning_rate": 1.9864270033422975e-05,
      "loss": 0.7964,
      "mean_token_accuracy": 0.7670842260122299,
      "num_tokens": 655177170.0,
      "step": 315
    },
    {
      "epoch": 0.32930280421919217,
      "grad_norm": 0.2553674578666687,
      "learning_rate": 1.985609696174817e-05,
      "loss": 0.7909,
      "mean_token_accuracy": 0.7687036842107773,
      "num_tokens": 665585375.0,
      "step": 320
    },
    {
      "epoch": 0.33444816053511706,
      "grad_norm": 0.22028060257434845,
      "learning_rate": 1.984768733814257e-05,
      "loss": 0.7835,
      "mean_token_accuracy": 0.7702562361955643,
      "num_tokens": 675996436.0,
      "step": 325
    },
    {
      "epoch": 0.33959351685104194,
      "grad_norm": 0.24218259751796722,
      "learning_rate": 1.9839041433036636e-05,
      "loss": 0.7882,
      "mean_token_accuracy": 0.7690007984638214,
      "num_tokens": 686407890.0,
      "step": 330
    },
    {
      "epoch": 0.34473887316696683,
      "grad_norm": 0.2648875117301941,
      "learning_rate": 1.9830159524459e-05,
      "loss": 0.7914,
      "mean_token_accuracy": 0.7680494576692581,
      "num_tokens": 696772298.0,
      "step": 335
    },
    {
      "epoch": 0.3498842294828917,
      "grad_norm": 0.3156125247478485,
      "learning_rate": 1.982104189802751e-05,
      "loss": 0.7882,
      "mean_token_accuracy": 0.7689610362052918,
      "num_tokens": 707159995.0,
      "step": 340
    },
    {
      "epoch": 0.35502958579881655,
      "grad_norm": 0.21199235320091248,
      "learning_rate": 1.9811688846940064e-05,
      "loss": 0.7973,
      "mean_token_accuracy": 0.7667856603860855,
      "num_tokens": 717555566.0,
      "step": 345
    },
    {
      "epoch": 0.36017494211474144,
      "grad_norm": 0.2736167013645172,
      "learning_rate": 1.9802100671965167e-05,
      "loss": 0.7927,
      "mean_token_accuracy": 0.767885434627533,
      "num_tokens": 727966808.0,
      "step": 350
    },
    {
      "epoch": 0.3653202984306663,
      "grad_norm": 0.22648128867149353,
      "learning_rate": 1.9792277681432257e-05,
      "loss": 0.7907,
      "mean_token_accuracy": 0.7685991823673248,
      "num_tokens": 738371625.0,
      "step": 355
    },
    {
      "epoch": 0.3704656547465912,
      "grad_norm": 0.26248160004615784,
      "learning_rate": 1.9782220191221818e-05,
      "loss": 0.7972,
      "mean_token_accuracy": 0.7668163865804672,
      "num_tokens": 748771291.0,
      "step": 360
    },
    {
      "epoch": 0.3756110110625161,
      "grad_norm": 0.23813173174858093,
      "learning_rate": 1.9771928524755182e-05,
      "loss": 0.7871,
      "mean_token_accuracy": 0.7691414833068848,
      "num_tokens": 759119097.0,
      "step": 365
    },
    {
      "epoch": 0.38075636737844093,
      "grad_norm": 0.32292458415031433,
      "learning_rate": 1.976140301298416e-05,
      "loss": 0.7858,
      "mean_token_accuracy": 0.7693786770105362,
      "num_tokens": 769547626.0,
      "step": 370
    },
    {
      "epoch": 0.3859017236943658,
      "grad_norm": 0.23817549645900726,
      "learning_rate": 1.9750643994380377e-05,
      "loss": 0.7841,
      "mean_token_accuracy": 0.7698348790407181,
      "num_tokens": 779970748.0,
      "step": 375
    },
    {
      "epoch": 0.3910470800102907,
      "grad_norm": 0.2122715711593628,
      "learning_rate": 1.9739651814924404e-05,
      "loss": 0.79,
      "mean_token_accuracy": 0.76830395758152,
      "num_tokens": 790354749.0,
      "step": 380
    },
    {
      "epoch": 0.3961924363262156,
      "grad_norm": 0.25851792097091675,
      "learning_rate": 1.972842682809463e-05,
      "loss": 0.7844,
      "mean_token_accuracy": 0.7702566295862198,
      "num_tokens": 800785265.0,
      "step": 385
    },
    {
      "epoch": 0.4013377926421405,
      "grad_norm": 0.24672254920005798,
      "learning_rate": 1.9716969394855884e-05,
      "loss": 0.7768,
      "mean_token_accuracy": 0.7723254442214966,
      "num_tokens": 811181165.0,
      "step": 390
    },
    {
      "epoch": 0.40648314895806537,
      "grad_norm": 0.23226742446422577,
      "learning_rate": 1.9705279883647842e-05,
      "loss": 0.7809,
      "mean_token_accuracy": 0.7711203783750534,
      "num_tokens": 821562722.0,
      "step": 395
    },
    {
      "epoch": 0.4116285052739902,
      "grad_norm": 0.22964943945407867,
      "learning_rate": 1.9693358670373162e-05,
      "loss": 0.7772,
      "mean_token_accuracy": 0.7718722522258759,
      "num_tokens": 831975336.0,
      "step": 400
    },
    {
      "epoch": 0.4167738615899151,
      "grad_norm": 0.2204572707414627,
      "learning_rate": 1.9681206138385418e-05,
      "loss": 0.7829,
      "mean_token_accuracy": 0.7703472405672074,
      "num_tokens": 842370514.0,
      "step": 405
    },
    {
      "epoch": 0.42191921790583997,
      "grad_norm": 0.21690765023231506,
      "learning_rate": 1.966882267847675e-05,
      "loss": 0.7778,
      "mean_token_accuracy": 0.7715538173913956,
      "num_tokens": 852764466.0,
      "step": 410
    },
    {
      "epoch": 0.42706457422176486,
      "grad_norm": 0.2913786768913269,
      "learning_rate": 1.9656208688865318e-05,
      "loss": 0.7806,
      "mean_token_accuracy": 0.7708995878696442,
      "num_tokens": 863181464.0,
      "step": 415
    },
    {
      "epoch": 0.43220993053768975,
      "grad_norm": 0.2708909809589386,
      "learning_rate": 1.9643364575182474e-05,
      "loss": 0.7853,
      "mean_token_accuracy": 0.7692730128765106,
      "num_tokens": 873548841.0,
      "step": 420
    },
    {
      "epoch": 0.43735528685361463,
      "grad_norm": 0.20992125570774078,
      "learning_rate": 1.9630290750459733e-05,
      "loss": 0.7835,
      "mean_token_accuracy": 0.7699774980545044,
      "num_tokens": 883941994.0,
      "step": 425
    },
    {
      "epoch": 0.44250064316953946,
      "grad_norm": 0.21337294578552246,
      "learning_rate": 1.9616987635115502e-05,
      "loss": 0.7725,
      "mean_token_accuracy": 0.7728747427463531,
      "num_tokens": 894335136.0,
      "step": 430
    },
    {
      "epoch": 0.44764599948546435,
      "grad_norm": 0.30405837297439575,
      "learning_rate": 1.9603455656941518e-05,
      "loss": 0.7813,
      "mean_token_accuracy": 0.7705006301403046,
      "num_tokens": 904737119.0,
      "step": 435
    },
    {
      "epoch": 0.45279135580138924,
      "grad_norm": 0.22999157011508942,
      "learning_rate": 1.9589695251089154e-05,
      "loss": 0.7804,
      "mean_token_accuracy": 0.7707367807626724,
      "num_tokens": 915122419.0,
      "step": 440
    },
    {
      "epoch": 0.4579367121173141,
      "grad_norm": 0.37796908617019653,
      "learning_rate": 1.9575706860055363e-05,
      "loss": 0.7859,
      "mean_token_accuracy": 0.7693438589572906,
      "num_tokens": 925494224.0,
      "step": 445
    },
    {
      "epoch": 0.463082068433239,
      "grad_norm": 0.4048900306224823,
      "learning_rate": 1.9561490933668492e-05,
      "loss": 0.79,
      "mean_token_accuracy": 0.7678173094987869,
      "num_tokens": 935892275.0,
      "step": 450
    },
    {
      "epoch": 0.4682274247491639,
      "grad_norm": 0.2899525463581085,
      "learning_rate": 1.95470479290738e-05,
      "loss": 0.773,
      "mean_token_accuracy": 0.7723690897226334,
      "num_tokens": 946289615.0,
      "step": 455
    },
    {
      "epoch": 0.47337278106508873,
      "grad_norm": 0.22180290520191193,
      "learning_rate": 1.9532378310718745e-05,
      "loss": 0.7721,
      "mean_token_accuracy": 0.7729926645755768,
      "num_tokens": 956672790.0,
      "step": 460
    },
    {
      "epoch": 0.4785181373810136,
      "grad_norm": 0.32568320631980896,
      "learning_rate": 1.951748255033809e-05,
      "loss": 0.7845,
      "mean_token_accuracy": 0.7696036785840988,
      "num_tokens": 967100741.0,
      "step": 465
    },
    {
      "epoch": 0.4836634936969385,
      "grad_norm": 0.21786752343177795,
      "learning_rate": 1.9502361126938683e-05,
      "loss": 0.7769,
      "mean_token_accuracy": 0.7711740046739578,
      "num_tokens": 977497458.0,
      "step": 470
    },
    {
      "epoch": 0.4888088500128634,
      "grad_norm": 0.274349570274353,
      "learning_rate": 1.9487014526784088e-05,
      "loss": 0.7717,
      "mean_token_accuracy": 0.772866228222847,
      "num_tokens": 987914143.0,
      "step": 475
    },
    {
      "epoch": 0.4939542063287883,
      "grad_norm": 0.27368083596229553,
      "learning_rate": 1.9471443243378934e-05,
      "loss": 0.7812,
      "mean_token_accuracy": 0.7704960852861404,
      "num_tokens": 998321473.0,
      "step": 480
    },
    {
      "epoch": 0.49909956264471317,
      "grad_norm": 0.2256074994802475,
      "learning_rate": 1.9455647777453045e-05,
      "loss": 0.7819,
      "mean_token_accuracy": 0.7698213875293731,
      "num_tokens": 1008748099.0,
      "step": 485
    },
    {
      "epoch": 0.5042449189606381,
      "grad_norm": 0.2857572138309479,
      "learning_rate": 1.9439628636945337e-05,
      "loss": 0.7816,
      "mean_token_accuracy": 0.7701322674751282,
      "num_tokens": 1019143713.0,
      "step": 490
    },
    {
      "epoch": 0.5093902752765629,
      "grad_norm": 1.1966010332107544,
      "learning_rate": 1.9423386336987507e-05,
      "loss": 0.7727,
      "mean_token_accuracy": 0.7724141061306,
      "num_tokens": 1029507845.0,
      "step": 495
    },
    {
      "epoch": 0.5145356315924878,
      "grad_norm": 0.40811723470687866,
      "learning_rate": 1.9406921399887432e-05,
      "loss": 0.8462,
      "mean_token_accuracy": 0.7615606904029846,
      "num_tokens": 1039924539.0,
      "step": 500
    },
    {
      "epoch": 0.5196809879084127,
      "grad_norm": 0.43425825238227844,
      "learning_rate": 1.9390234355112386e-05,
      "loss": 0.776,
      "mean_token_accuracy": 0.7718711495399475,
      "num_tokens": 1050307833.0,
      "step": 505
    },
    {
      "epoch": 0.5248263442243375,
      "grad_norm": 0.2550142705440521,
      "learning_rate": 1.9373325739272035e-05,
      "loss": 0.7718,
      "mean_token_accuracy": 0.772559967637062,
      "num_tokens": 1060725805.0,
      "step": 510
    },
    {
      "epoch": 0.5299717005402624,
      "grad_norm": 0.2491987943649292,
      "learning_rate": 1.9356196096101145e-05,
      "loss": 0.7818,
      "mean_token_accuracy": 0.7696256130933762,
      "num_tokens": 1071091088.0,
      "step": 515
    },
    {
      "epoch": 0.5351170568561873,
      "grad_norm": 0.2185741513967514,
      "learning_rate": 1.9338845976442128e-05,
      "loss": 0.7696,
      "mean_token_accuracy": 0.7729953050613403,
      "num_tokens": 1081507707.0,
      "step": 520
    },
    {
      "epoch": 0.5402624131721122,
      "grad_norm": 0.22735682129859924,
      "learning_rate": 1.9321275938227315e-05,
      "loss": 0.7846,
      "mean_token_accuracy": 0.7691415637731552,
      "num_tokens": 1091915702.0,
      "step": 525
    },
    {
      "epoch": 0.545407769488037,
      "grad_norm": 0.2260189801454544,
      "learning_rate": 1.930348654646101e-05,
      "loss": 0.7783,
      "mean_token_accuracy": 0.7702914178371429,
      "num_tokens": 1102329794.0,
      "step": 530
    },
    {
      "epoch": 0.5505531258039619,
      "grad_norm": 0.24121206998825073,
      "learning_rate": 1.928547837320133e-05,
      "loss": 0.7793,
      "mean_token_accuracy": 0.7705663651227951,
      "num_tokens": 1112744574.0,
      "step": 535
    },
    {
      "epoch": 0.5556984821198868,
      "grad_norm": 0.21719065308570862,
      "learning_rate": 1.92672519975418e-05,
      "loss": 0.7709,
      "mean_token_accuracy": 0.7725832790136338,
      "num_tokens": 1123133981.0,
      "step": 540
    },
    {
      "epoch": 0.5608438384358116,
      "grad_norm": 0.2318485677242279,
      "learning_rate": 1.9248808005592748e-05,
      "loss": 0.7593,
      "mean_token_accuracy": 0.7762016743421555,
      "num_tokens": 1133547201.0,
      "step": 545
    },
    {
      "epoch": 0.5659891947517366,
      "grad_norm": 0.22322027385234833,
      "learning_rate": 1.923014699046244e-05,
      "loss": 0.7781,
      "mean_token_accuracy": 0.7703386157751083,
      "num_tokens": 1143928590.0,
      "step": 550
    },
    {
      "epoch": 0.5711345510676614,
      "grad_norm": 0.25323280692100525,
      "learning_rate": 1.9211269552238006e-05,
      "loss": 0.7674,
      "mean_token_accuracy": 0.7730859339237213,
      "num_tokens": 1154289231.0,
      "step": 555
    },
    {
      "epoch": 0.5762799073835864,
      "grad_norm": 0.19904294610023499,
      "learning_rate": 1.919217629796616e-05,
      "loss": 0.7562,
      "mean_token_accuracy": 0.7763365268707275,
      "num_tokens": 1164696083.0,
      "step": 560
    },
    {
      "epoch": 0.5814252636995112,
      "grad_norm": 0.20939859747886658,
      "learning_rate": 1.917286784163366e-05,
      "loss": 0.7619,
      "mean_token_accuracy": 0.7748125195503235,
      "num_tokens": 1175092473.0,
      "step": 565
    },
    {
      "epoch": 0.586570620015436,
      "grad_norm": 0.19839182496070862,
      "learning_rate": 1.9153344804147583e-05,
      "loss": 0.758,
      "mean_token_accuracy": 0.7757725417613983,
      "num_tokens": 1185494160.0,
      "step": 570
    },
    {
      "epoch": 0.591715976331361,
      "grad_norm": 0.2394329458475113,
      "learning_rate": 1.913360781331535e-05,
      "loss": 0.7761,
      "mean_token_accuracy": 0.770972666144371,
      "num_tokens": 1195885099.0,
      "step": 575
    },
    {
      "epoch": 0.5968613326472858,
      "grad_norm": 0.22963404655456543,
      "learning_rate": 1.9113657503824513e-05,
      "loss": 0.7632,
      "mean_token_accuracy": 0.7746537119150162,
      "num_tokens": 1206298358.0,
      "step": 580
    },
    {
      "epoch": 0.6020066889632107,
      "grad_norm": 0.2229454070329666,
      "learning_rate": 1.9093494517222397e-05,
      "loss": 0.7843,
      "mean_token_accuracy": 0.7687952756881714,
      "num_tokens": 1216694302.0,
      "step": 585
    },
    {
      "epoch": 0.6071520452791356,
      "grad_norm": 0.24715222418308258,
      "learning_rate": 1.907311950189542e-05,
      "loss": 0.7782,
      "mean_token_accuracy": 0.770742443203926,
      "num_tokens": 1227102730.0,
      "step": 590
    },
    {
      "epoch": 0.6122974015950604,
      "grad_norm": 0.26584967970848083,
      "learning_rate": 1.9052533113048274e-05,
      "loss": 0.7656,
      "mean_token_accuracy": 0.7737416863441468,
      "num_tokens": 1237504190.0,
      "step": 595
    },
    {
      "epoch": 0.6174427579109854,
      "grad_norm": 0.23727792501449585,
      "learning_rate": 1.903173601268284e-05,
      "loss": 0.767,
      "mean_token_accuracy": 0.7734443098306656,
      "num_tokens": 1247914802.0,
      "step": 600
    },
    {
      "epoch": 0.6225881142269102,
      "grad_norm": 0.22145575284957886,
      "learning_rate": 1.90107288695769e-05,
      "loss": 0.7717,
      "mean_token_accuracy": 0.7720071583986282,
      "num_tokens": 1258346476.0,
      "step": 605
    },
    {
      "epoch": 0.6277334705428351,
      "grad_norm": 0.2083977609872818,
      "learning_rate": 1.8989512359262643e-05,
      "loss": 0.7679,
      "mean_token_accuracy": 0.7729957222938537,
      "num_tokens": 1268749806.0,
      "step": 610
    },
    {
      "epoch": 0.63287882685876,
      "grad_norm": 0.21598635613918304,
      "learning_rate": 1.8968087164004935e-05,
      "loss": 0.7662,
      "mean_token_accuracy": 0.7731610238552094,
      "num_tokens": 1279117698.0,
      "step": 615
    },
    {
      "epoch": 0.6380241831746849,
      "grad_norm": 0.19322967529296875,
      "learning_rate": 1.894645397277937e-05,
      "loss": 0.7502,
      "mean_token_accuracy": 0.7780898064374924,
      "num_tokens": 1289533458.0,
      "step": 620
    },
    {
      "epoch": 0.6431695394906097,
      "grad_norm": 0.19693732261657715,
      "learning_rate": 1.8924613481250128e-05,
      "loss": 0.7727,
      "mean_token_accuracy": 0.7718368798494339,
      "num_tokens": 1299925979.0,
      "step": 625
    },
    {
      "epoch": 0.6483148958065346,
      "grad_norm": 0.21000495553016663,
      "learning_rate": 1.8902566391747596e-05,
      "loss": 0.7734,
      "mean_token_accuracy": 0.7717230170965195,
      "num_tokens": 1310311084.0,
      "step": 630
    },
    {
      "epoch": 0.6534602521224595,
      "grad_norm": 0.2045949399471283,
      "learning_rate": 1.8880313413245794e-05,
      "loss": 0.7717,
      "mean_token_accuracy": 0.7715155422687531,
      "num_tokens": 1320686452.0,
      "step": 635
    },
    {
      "epoch": 0.6586056084383843,
      "grad_norm": 0.22864918410778046,
      "learning_rate": 1.885785526133956e-05,
      "loss": 0.7579,
      "mean_token_accuracy": 0.7755870014429093,
      "num_tokens": 1331096443.0,
      "step": 640
    },
    {
      "epoch": 0.6637509647543093,
      "grad_norm": 0.2280014157295227,
      "learning_rate": 1.8835192658221545e-05,
      "loss": 0.7643,
      "mean_token_accuracy": 0.7744101166725159,
      "num_tokens": 1341514376.0,
      "step": 645
    },
    {
      "epoch": 0.6688963210702341,
      "grad_norm": 0.224672332406044,
      "learning_rate": 1.8812326332658997e-05,
      "loss": 0.7674,
      "mean_token_accuracy": 0.773426678776741,
      "num_tokens": 1351882404.0,
      "step": 650
    },
    {
      "epoch": 0.6740416773861589,
      "grad_norm": 0.2680542767047882,
      "learning_rate": 1.878925701997032e-05,
      "loss": 0.7573,
      "mean_token_accuracy": 0.7760124772787094,
      "num_tokens": 1362304069.0,
      "step": 655
    },
    {
      "epoch": 0.6791870337020839,
      "grad_norm": 0.29469063878059387,
      "learning_rate": 1.8765985462001424e-05,
      "loss": 0.7636,
      "mean_token_accuracy": 0.7740887552499771,
      "num_tokens": 1372688301.0,
      "step": 660
    },
    {
      "epoch": 0.6843323900180087,
      "grad_norm": 0.21097056567668915,
      "learning_rate": 1.8742512407101875e-05,
      "loss": 0.7578,
      "mean_token_accuracy": 0.7753616005182267,
      "num_tokens": 1383093337.0,
      "step": 665
    },
    {
      "epoch": 0.6894777463339337,
      "grad_norm": 0.21591758728027344,
      "learning_rate": 1.8718838610100832e-05,
      "loss": 0.7566,
      "mean_token_accuracy": 0.7760109454393387,
      "num_tokens": 1393440159.0,
      "step": 670
    },
    {
      "epoch": 0.6946231026498585,
      "grad_norm": 0.2419527769088745,
      "learning_rate": 1.8694964832282764e-05,
      "loss": 0.77,
      "mean_token_accuracy": 0.7725261867046356,
      "num_tokens": 1403836556.0,
      "step": 675
    },
    {
      "epoch": 0.6997684589657834,
      "grad_norm": 0.27528461813926697,
      "learning_rate": 1.8670891841362976e-05,
      "loss": 0.7543,
      "mean_token_accuracy": 0.7764055013656617,
      "num_tokens": 1414213366.0,
      "step": 680
    },
    {
      "epoch": 0.7049138152817083,
      "grad_norm": 0.2578127980232239,
      "learning_rate": 1.8646620411462924e-05,
      "loss": 0.7802,
      "mean_token_accuracy": 0.7694523215293885,
      "num_tokens": 1424633658.0,
      "step": 685
    },
    {
      "epoch": 0.7100591715976331,
      "grad_norm": 0.23547138273715973,
      "learning_rate": 1.8622151323085317e-05,
      "loss": 0.7573,
      "mean_token_accuracy": 0.7754601895809173,
      "num_tokens": 1435040726.0,
      "step": 690
    },
    {
      "epoch": 0.715204527913558,
      "grad_norm": 0.28163185715675354,
      "learning_rate": 1.8597485363089026e-05,
      "loss": 0.7574,
      "mean_token_accuracy": 0.7757174968719482,
      "num_tokens": 1445446395.0,
      "step": 695
    },
    {
      "epoch": 0.7203498842294829,
      "grad_norm": 0.2508062422275543,
      "learning_rate": 1.8572623324663756e-05,
      "loss": 0.767,
      "mean_token_accuracy": 0.7730412214994431,
      "num_tokens": 1455855217.0,
      "step": 700
    },
    {
      "epoch": 0.7254952405454078,
      "grad_norm": 0.20320753753185272,
      "learning_rate": 1.8547566007304577e-05,
      "loss": 0.7687,
      "mean_token_accuracy": 0.7726406931877137,
      "num_tokens": 1466245798.0,
      "step": 705
    },
    {
      "epoch": 0.7306405968613326,
      "grad_norm": 0.2486068159341812,
      "learning_rate": 1.8522314216786186e-05,
      "loss": 0.7559,
      "mean_token_accuracy": 0.77595334649086,
      "num_tokens": 1476629034.0,
      "step": 710
    },
    {
      "epoch": 0.7357859531772575,
      "grad_norm": 0.20530639588832855,
      "learning_rate": 1.8496868765136996e-05,
      "loss": 0.758,
      "mean_token_accuracy": 0.7758067876100541,
      "num_tokens": 1487047420.0,
      "step": 715
    },
    {
      "epoch": 0.7409313094931824,
      "grad_norm": 0.20521405339241028,
      "learning_rate": 1.8471230470613046e-05,
      "loss": 0.7661,
      "mean_token_accuracy": 0.7729949295520783,
      "num_tokens": 1497477237.0,
      "step": 720
    },
    {
      "epoch": 0.7460766658091073,
      "grad_norm": 0.2346915453672409,
      "learning_rate": 1.844540015767167e-05,
      "loss": 0.7633,
      "mean_token_accuracy": 0.7740152984857559,
      "num_tokens": 1507862470.0,
      "step": 725
    },
    {
      "epoch": 0.7512220221250322,
      "grad_norm": 0.25663813948631287,
      "learning_rate": 1.8419378656944983e-05,
      "loss": 0.7633,
      "mean_token_accuracy": 0.7739923149347305,
      "num_tokens": 1518284565.0,
      "step": 730
    },
    {
      "epoch": 0.756367378440957,
      "grad_norm": 0.19483357667922974,
      "learning_rate": 1.8393166805213178e-05,
      "loss": 0.7564,
      "mean_token_accuracy": 0.7762584149837494,
      "num_tokens": 1528673299.0,
      "step": 735
    },
    {
      "epoch": 0.7615127347568819,
      "grad_norm": 0.298998087644577,
      "learning_rate": 1.8366765445377614e-05,
      "loss": 0.7634,
      "mean_token_accuracy": 0.7737476319074631,
      "num_tokens": 1539078416.0,
      "step": 740
    },
    {
      "epoch": 0.7666580910728068,
      "grad_norm": 0.22814105451107025,
      "learning_rate": 1.834017542643372e-05,
      "loss": 0.769,
      "mean_token_accuracy": 0.7724894404411315,
      "num_tokens": 1549479202.0,
      "step": 745
    },
    {
      "epoch": 0.7718034473887316,
      "grad_norm": 0.20318229496479034,
      "learning_rate": 1.8313397603443665e-05,
      "loss": 0.7508,
      "mean_token_accuracy": 0.7772645950317383,
      "num_tokens": 1559862298.0,
      "step": 750
    },
    {
      "epoch": 0.7769488037046566,
      "grad_norm": 0.28188684582710266,
      "learning_rate": 1.828643283750891e-05,
      "loss": 0.7518,
      "mean_token_accuracy": 0.776808711886406,
      "num_tokens": 1570288616.0,
      "step": 755
    },
    {
      "epoch": 0.7820941600205814,
      "grad_norm": 0.24467986822128296,
      "learning_rate": 1.8259281995742467e-05,
      "loss": 0.7663,
      "mean_token_accuracy": 0.7730655431747436,
      "num_tokens": 1580705251.0,
      "step": 760
    },
    {
      "epoch": 0.7872395163365064,
      "grad_norm": 0.24917523562908173,
      "learning_rate": 1.8231945951241043e-05,
      "loss": 0.7625,
      "mean_token_accuracy": 0.773466631770134,
      "num_tokens": 1591106421.0,
      "step": 765
    },
    {
      "epoch": 0.7923848726524312,
      "grad_norm": 0.18945878744125366,
      "learning_rate": 1.8204425583056962e-05,
      "loss": 0.7507,
      "mean_token_accuracy": 0.7773733377456665,
      "num_tokens": 1601520172.0,
      "step": 770
    },
    {
      "epoch": 0.797530228968356,
      "grad_norm": 0.2917018234729767,
      "learning_rate": 1.817672177616989e-05,
      "loss": 0.7604,
      "mean_token_accuracy": 0.7747350037097931,
      "num_tokens": 1611899934.0,
      "step": 775
    },
    {
      "epoch": 0.802675585284281,
      "grad_norm": 0.24668017029762268,
      "learning_rate": 1.8148835421458374e-05,
      "loss": 0.7601,
      "mean_token_accuracy": 0.7746721476316452,
      "num_tokens": 1622304284.0,
      "step": 780
    },
    {
      "epoch": 0.8078209416002058,
      "grad_norm": 0.3010805547237396,
      "learning_rate": 1.8120767415671208e-05,
      "loss": 0.7715,
      "mean_token_accuracy": 0.7717162489891052,
      "num_tokens": 1632665638.0,
      "step": 785
    },
    {
      "epoch": 0.8129662979161307,
      "grad_norm": 0.2387019842863083,
      "learning_rate": 1.809251866139858e-05,
      "loss": 0.7631,
      "mean_token_accuracy": 0.773768350481987,
      "num_tokens": 1643065667.0,
      "step": 790
    },
    {
      "epoch": 0.8181116542320556,
      "grad_norm": 0.207286074757576,
      "learning_rate": 1.8064090067043066e-05,
      "loss": 0.7596,
      "mean_token_accuracy": 0.7746483981609344,
      "num_tokens": 1653489023.0,
      "step": 795
    },
    {
      "epoch": 0.8232570105479804,
      "grad_norm": 0.20019465684890747,
      "learning_rate": 1.8035482546790387e-05,
      "loss": 0.7619,
      "mean_token_accuracy": 0.7740916252136231,
      "num_tokens": 1663916269.0,
      "step": 800
    },
    {
      "epoch": 0.8284023668639053,
      "grad_norm": 0.24347372353076935,
      "learning_rate": 1.8006697020580048e-05,
      "loss": 0.7614,
      "mean_token_accuracy": 0.7742874711751938,
      "num_tokens": 1674339518.0,
      "step": 805
    },
    {
      "epoch": 0.8335477231798302,
      "grad_norm": 0.2331140786409378,
      "learning_rate": 1.7977734414075728e-05,
      "loss": 0.7542,
      "mean_token_accuracy": 0.7760845631361007,
      "num_tokens": 1684753767.0,
      "step": 810
    },
    {
      "epoch": 0.8386930794957551,
      "grad_norm": 0.26996636390686035,
      "learning_rate": 1.7948595658635533e-05,
      "loss": 0.7644,
      "mean_token_accuracy": 0.7731517612934112,
      "num_tokens": 1695167774.0,
      "step": 815
    },
    {
      "epoch": 0.8438384358116799,
      "grad_norm": 0.23208890855312347,
      "learning_rate": 1.791928169128202e-05,
      "loss": 0.754,
      "mean_token_accuracy": 0.7757177084684372,
      "num_tokens": 1705535908.0,
      "step": 820
    },
    {
      "epoch": 0.8489837921276049,
      "grad_norm": 0.22476951777935028,
      "learning_rate": 1.7889793454672104e-05,
      "loss": 0.757,
      "mean_token_accuracy": 0.7750364452600479,
      "num_tokens": 1715945968.0,
      "step": 825
    },
    {
      "epoch": 0.8541291484435297,
      "grad_norm": 0.2300329953432083,
      "learning_rate": 1.7860131897066702e-05,
      "loss": 0.7687,
      "mean_token_accuracy": 0.7721786022186279,
      "num_tokens": 1726366494.0,
      "step": 830
    },
    {
      "epoch": 0.8592745047594545,
      "grad_norm": 0.21463564038276672,
      "learning_rate": 1.7830297972300266e-05,
      "loss": 0.7632,
      "mean_token_accuracy": 0.773630577325821,
      "num_tokens": 1736754760.0,
      "step": 835
    },
    {
      "epoch": 0.8644198610753795,
      "grad_norm": 0.20610825717449188,
      "learning_rate": 1.780029263975011e-05,
      "loss": 0.7428,
      "mean_token_accuracy": 0.7793777525424957,
      "num_tokens": 1747169397.0,
      "step": 840
    },
    {
      "epoch": 0.8695652173913043,
      "grad_norm": 0.17724700272083282,
      "learning_rate": 1.7770116864305543e-05,
      "loss": 0.7542,
      "mean_token_accuracy": 0.7759394317865371,
      "num_tokens": 1757562579.0,
      "step": 845
    },
    {
      "epoch": 0.8747105737072293,
      "grad_norm": 0.20515885949134827,
      "learning_rate": 1.773977161633686e-05,
      "loss": 0.7585,
      "mean_token_accuracy": 0.774996566772461,
      "num_tokens": 1767951452.0,
      "step": 850
    },
    {
      "epoch": 0.8798559300231541,
      "grad_norm": 0.21575596928596497,
      "learning_rate": 1.770925787166412e-05,
      "loss": 0.7352,
      "mean_token_accuracy": 0.7808972954750061,
      "num_tokens": 1778357806.0,
      "step": 855
    },
    {
      "epoch": 0.8850012863390789,
      "grad_norm": 0.22368930280208588,
      "learning_rate": 1.767857661152578e-05,
      "loss": 0.7557,
      "mean_token_accuracy": 0.775502935051918,
      "num_tokens": 1788767633.0,
      "step": 860
    },
    {
      "epoch": 0.8901466426550039,
      "grad_norm": 0.2021559774875641,
      "learning_rate": 1.7647728822547126e-05,
      "loss": 0.7609,
      "mean_token_accuracy": 0.7740152478218079,
      "num_tokens": 1799164623.0,
      "step": 865
    },
    {
      "epoch": 0.8952919989709287,
      "grad_norm": 0.19145594537258148,
      "learning_rate": 1.7616715496708575e-05,
      "loss": 0.7562,
      "mean_token_accuracy": 0.7754783451557159,
      "num_tokens": 1809537031.0,
      "step": 870
    },
    {
      "epoch": 0.9004373552868536,
      "grad_norm": 0.19838295876979828,
      "learning_rate": 1.7585537631313738e-05,
      "loss": 0.7554,
      "mean_token_accuracy": 0.7755459159612655,
      "num_tokens": 1819917273.0,
      "step": 875
    },
    {
      "epoch": 0.9055827116027785,
      "grad_norm": 0.18983587622642517,
      "learning_rate": 1.7554196228957374e-05,
      "loss": 0.7629,
      "mean_token_accuracy": 0.7733017027378082,
      "num_tokens": 1830301370.0,
      "step": 880
    },
    {
      "epoch": 0.9107280679187034,
      "grad_norm": 0.2382967472076416,
      "learning_rate": 1.7522692297493145e-05,
      "loss": 0.7545,
      "mean_token_accuracy": 0.775650081038475,
      "num_tokens": 1840697377.0,
      "step": 885
    },
    {
      "epoch": 0.9158734242346283,
      "grad_norm": 0.2184479981660843,
      "learning_rate": 1.7491026850001195e-05,
      "loss": 0.761,
      "mean_token_accuracy": 0.7739459365606308,
      "num_tokens": 1851095319.0,
      "step": 890
    },
    {
      "epoch": 0.9210187805505531,
      "grad_norm": 0.20179307460784912,
      "learning_rate": 1.745920090475559e-05,
      "loss": 0.7654,
      "mean_token_accuracy": 0.7729564756155014,
      "num_tokens": 1861483986.0,
      "step": 895
    },
    {
      "epoch": 0.926164136866478,
      "grad_norm": 0.2453368604183197,
      "learning_rate": 1.7427215485191567e-05,
      "loss": 0.7555,
      "mean_token_accuracy": 0.7756044954061508,
      "num_tokens": 1871857137.0,
      "step": 900
    },
    {
      "epoch": 0.9313094931824029,
      "grad_norm": 0.21209710836410522,
      "learning_rate": 1.739507161987261e-05,
      "loss": 0.7615,
      "mean_token_accuracy": 0.7739556908607483,
      "num_tokens": 1882273427.0,
      "step": 905
    },
    {
      "epoch": 0.9364548494983278,
      "grad_norm": 0.18556725978851318,
      "learning_rate": 1.736277034245739e-05,
      "loss": 0.7557,
      "mean_token_accuracy": 0.7750442743301391,
      "num_tokens": 1892660604.0,
      "step": 910
    },
    {
      "epoch": 0.9416002058142526,
      "grad_norm": 0.2182847410440445,
      "learning_rate": 1.7330312691666517e-05,
      "loss": 0.7592,
      "mean_token_accuracy": 0.7745744317770005,
      "num_tokens": 1903083350.0,
      "step": 915
    },
    {
      "epoch": 0.9467455621301775,
      "grad_norm": 0.20762157440185547,
      "learning_rate": 1.7297699711249144e-05,
      "loss": 0.7481,
      "mean_token_accuracy": 0.7772045373916626,
      "num_tokens": 1913484635.0,
      "step": 920
    },
    {
      "epoch": 0.9518909184461024,
      "grad_norm": 0.22431345283985138,
      "learning_rate": 1.7264932449949403e-05,
      "loss": 0.752,
      "mean_token_accuracy": 0.7767430722713471,
      "num_tokens": 1923902157.0,
      "step": 925
    },
    {
      "epoch": 0.9570362747620272,
      "grad_norm": 0.21677188575267792,
      "learning_rate": 1.7232011961472666e-05,
      "loss": 0.751,
      "mean_token_accuracy": 0.776458004117012,
      "num_tokens": 1934287093.0,
      "step": 930
    },
    {
      "epoch": 0.9621816310779522,
      "grad_norm": 0.24291808903217316,
      "learning_rate": 1.7198939304451677e-05,
      "loss": 0.7621,
      "mean_token_accuracy": 0.7737038463354111,
      "num_tokens": 1944670753.0,
      "step": 935
    },
    {
      "epoch": 0.967326987393877,
      "grad_norm": 0.18869860470294952,
      "learning_rate": 1.7165715542412505e-05,
      "loss": 0.7474,
      "mean_token_accuracy": 0.7775454163551331,
      "num_tokens": 1955041733.0,
      "step": 940
    },
    {
      "epoch": 0.972472343709802,
      "grad_norm": 0.2036639004945755,
      "learning_rate": 1.7132341743740343e-05,
      "loss": 0.7591,
      "mean_token_accuracy": 0.7744521021842956,
      "num_tokens": 1965413435.0,
      "step": 945
    },
    {
      "epoch": 0.9776177000257268,
      "grad_norm": 0.19717492163181305,
      "learning_rate": 1.709881898164515e-05,
      "loss": 0.7626,
      "mean_token_accuracy": 0.7734819889068604,
      "num_tokens": 1975831716.0,
      "step": 950
    },
    {
      "epoch": 0.9827630563416516,
      "grad_norm": 0.20819610357284546,
      "learning_rate": 1.7065148334127137e-05,
      "loss": 0.7622,
      "mean_token_accuracy": 0.7739403694868088,
      "num_tokens": 1986221267.0,
      "step": 955
    },
    {
      "epoch": 0.9879084126575766,
      "grad_norm": 0.2304711937904358,
      "learning_rate": 1.7031330883942106e-05,
      "loss": 0.7604,
      "mean_token_accuracy": 0.7743852972984314,
      "num_tokens": 1996637419.0,
      "step": 960
    },
    {
      "epoch": 0.9930537689735014,
      "grad_norm": 0.20585313439369202,
      "learning_rate": 1.699736771856664e-05,
      "loss": 0.7417,
      "mean_token_accuracy": 0.7793391734361649,
      "num_tokens": 2007024978.0,
      "step": 965
    },
    {
      "epoch": 0.9981991252894263,
      "grad_norm": 0.20385591685771942,
      "learning_rate": 1.6963259930163104e-05,
      "loss": 0.7442,
      "mean_token_accuracy": 0.7783006697893142,
      "num_tokens": 2017443954.0,
      "step": 970
    },
    {
      "epoch": 1.003087213789555,
      "grad_norm": 0.24495287239551544,
      "learning_rate": 1.692900861554457e-05,
      "loss": 0.7389,
      "mean_token_accuracy": 0.779910335415288,
      "num_tokens": 2027325031.0,
      "step": 975
    },
    {
      "epoch": 1.0082325701054797,
      "grad_norm": 0.2038143426179886,
      "learning_rate": 1.68946148761395e-05,
      "loss": 0.7429,
      "mean_token_accuracy": 0.7774323493242263,
      "num_tokens": 2037733216.0,
      "step": 980
    },
    {
      "epoch": 1.0133779264214047,
      "grad_norm": 0.19667111337184906,
      "learning_rate": 1.6860079817956353e-05,
      "loss": 0.7276,
      "mean_token_accuracy": 0.7817776888608933,
      "num_tokens": 2048163457.0,
      "step": 985
    },
    {
      "epoch": 1.0185232827373296,
      "grad_norm": 0.2280578911304474,
      "learning_rate": 1.682540455154801e-05,
      "loss": 0.7156,
      "mean_token_accuracy": 0.7847582966089248,
      "num_tokens": 2058559958.0,
      "step": 990
    },
    {
      "epoch": 1.0236686390532543,
      "grad_norm": 0.20857077836990356,
      "learning_rate": 1.6790590191976068e-05,
      "loss": 0.7335,
      "mean_token_accuracy": 0.7803497105836869,
      "num_tokens": 2068976029.0,
      "step": 995
    },
    {
      "epoch": 1.0288139953691793,
      "grad_norm": 0.23610693216323853,
      "learning_rate": 1.6755637858774986e-05,
      "loss": 0.7416,
      "mean_token_accuracy": 0.7778135746717453,
      "num_tokens": 2079378100.0,
      "step": 1000
    },
    {
      "epoch": 1.0339593516851042,
      "grad_norm": 0.22110521793365479,
      "learning_rate": 1.6720548675916058e-05,
      "loss": 0.7354,
      "mean_token_accuracy": 0.7797701776027679,
      "num_tokens": 2089780930.0,
      "step": 1005
    },
    {
      "epoch": 1.0391047080010292,
      "grad_norm": 0.2105809450149536,
      "learning_rate": 1.6685323771771306e-05,
      "loss": 0.727,
      "mean_token_accuracy": 0.7818355768918991,
      "num_tokens": 2100188750.0,
      "step": 1010
    },
    {
      "epoch": 1.0442500643169539,
      "grad_norm": 0.20174016058444977,
      "learning_rate": 1.664996427907717e-05,
      "loss": 0.7327,
      "mean_token_accuracy": 0.7801722586154938,
      "num_tokens": 2110572072.0,
      "step": 1015
    },
    {
      "epoch": 1.0493954206328788,
      "grad_norm": 0.1835939884185791,
      "learning_rate": 1.6614471334898086e-05,
      "loss": 0.7334,
      "mean_token_accuracy": 0.7800804376602173,
      "num_tokens": 2120973633.0,
      "step": 1020
    },
    {
      "epoch": 1.0545407769488038,
      "grad_norm": 0.21300305426120758,
      "learning_rate": 1.6578846080589934e-05,
      "loss": 0.7299,
      "mean_token_accuracy": 0.7812818288803101,
      "num_tokens": 2131360621.0,
      "step": 1025
    },
    {
      "epoch": 1.0596861332647285,
      "grad_norm": 0.1908658891916275,
      "learning_rate": 1.6543089661763315e-05,
      "loss": 0.7223,
      "mean_token_accuracy": 0.7834197998046875,
      "num_tokens": 2141793569.0,
      "step": 1030
    },
    {
      "epoch": 1.0648314895806534,
      "grad_norm": 0.19039925932884216,
      "learning_rate": 1.650720322824672e-05,
      "loss": 0.7361,
      "mean_token_accuracy": 0.779027310013771,
      "num_tokens": 2152174814.0,
      "step": 1035
    },
    {
      "epoch": 1.0699768458965784,
      "grad_norm": 0.2063397467136383,
      "learning_rate": 1.6471187934049574e-05,
      "loss": 0.7237,
      "mean_token_accuracy": 0.7826652109622956,
      "num_tokens": 2162584314.0,
      "step": 1040
    },
    {
      "epoch": 1.0751222022125033,
      "grad_norm": 0.20498026907444,
      "learning_rate": 1.643504493732509e-05,
      "loss": 0.7346,
      "mean_token_accuracy": 0.7796628832817077,
      "num_tokens": 2172965246.0,
      "step": 1045
    },
    {
      "epoch": 1.080267558528428,
      "grad_norm": 0.23134800791740417,
      "learning_rate": 1.639877540033305e-05,
      "loss": 0.7271,
      "mean_token_accuracy": 0.7816483587026596,
      "num_tokens": 2183370435.0,
      "step": 1050
    },
    {
      "epoch": 1.085412914844353,
      "grad_norm": 0.19139111042022705,
      "learning_rate": 1.6362380489402433e-05,
      "loss": 0.7228,
      "mean_token_accuracy": 0.7829385250806808,
      "num_tokens": 2193756185.0,
      "step": 1055
    },
    {
      "epoch": 1.090558271160278,
      "grad_norm": 0.23847441375255585,
      "learning_rate": 1.6325861374893885e-05,
      "loss": 0.7357,
      "mean_token_accuracy": 0.7798227697610856,
      "num_tokens": 2204144787.0,
      "step": 1060
    },
    {
      "epoch": 1.0957036274762026,
      "grad_norm": 0.19361141324043274,
      "learning_rate": 1.6289219231162107e-05,
      "loss": 0.7323,
      "mean_token_accuracy": 0.7803295195102692,
      "num_tokens": 2214555301.0,
      "step": 1065
    },
    {
      "epoch": 1.1008489837921276,
      "grad_norm": 0.2365255057811737,
      "learning_rate": 1.6252455236518088e-05,
      "loss": 0.7223,
      "mean_token_accuracy": 0.7834222823381424,
      "num_tokens": 2224964461.0,
      "step": 1070
    },
    {
      "epoch": 1.1059943401080525,
      "grad_norm": 0.22614383697509766,
      "learning_rate": 1.6215570573191203e-05,
      "loss": 0.7324,
      "mean_token_accuracy": 0.7800393998622894,
      "num_tokens": 2235385359.0,
      "step": 1075
    },
    {
      "epoch": 1.1111396964239773,
      "grad_norm": 0.1952890157699585,
      "learning_rate": 1.6178566427291196e-05,
      "loss": 0.7361,
      "mean_token_accuracy": 0.7795492619276047,
      "num_tokens": 2245790049.0,
      "step": 1080
    },
    {
      "epoch": 1.1162850527399022,
      "grad_norm": 0.26438701152801514,
      "learning_rate": 1.614144398877006e-05,
      "loss": 0.733,
      "mean_token_accuracy": 0.7802164793014527,
      "num_tokens": 2256182680.0,
      "step": 1085
    },
    {
      "epoch": 1.1214304090558271,
      "grad_norm": 0.19735948741436005,
      "learning_rate": 1.610420445138373e-05,
      "loss": 0.7238,
      "mean_token_accuracy": 0.7827324986457824,
      "num_tokens": 2266607807.0,
      "step": 1090
    },
    {
      "epoch": 1.126575765371752,
      "grad_norm": 0.20691247284412384,
      "learning_rate": 1.6066849012653745e-05,
      "loss": 0.727,
      "mean_token_accuracy": 0.7817386299371719,
      "num_tokens": 2277008491.0,
      "step": 1095
    },
    {
      "epoch": 1.1317211216876768,
      "grad_norm": 0.22432605922222137,
      "learning_rate": 1.6029378873828695e-05,
      "loss": 0.7298,
      "mean_token_accuracy": 0.7808958977460861,
      "num_tokens": 2287392752.0,
      "step": 1100
    },
    {
      "epoch": 1.1368664780036017,
      "grad_norm": 0.20151114463806152,
      "learning_rate": 1.599179523984562e-05,
      "loss": 0.737,
      "mean_token_accuracy": 0.7791817605495452,
      "num_tokens": 2297803721.0,
      "step": 1105
    },
    {
      "epoch": 1.1420118343195267,
      "grad_norm": 0.2306589037179947,
      "learning_rate": 1.5954099319291256e-05,
      "loss": 0.7325,
      "mean_token_accuracy": 0.7801610469818115,
      "num_tokens": 2308187255.0,
      "step": 1110
    },
    {
      "epoch": 1.1471571906354514,
      "grad_norm": 0.1993226855993271,
      "learning_rate": 1.5916292324363156e-05,
      "loss": 0.7251,
      "mean_token_accuracy": 0.7821116268634796,
      "num_tokens": 2318589605.0,
      "step": 1115
    },
    {
      "epoch": 1.1523025469513763,
      "grad_norm": 0.22113800048828125,
      "learning_rate": 1.5878375470830737e-05,
      "loss": 0.743,
      "mean_token_accuracy": 0.7773738950490952,
      "num_tokens": 2328989752.0,
      "step": 1120
    },
    {
      "epoch": 1.1574479032673013,
      "grad_norm": 0.2003169059753418,
      "learning_rate": 1.584034997799615e-05,
      "loss": 0.728,
      "mean_token_accuracy": 0.7815322816371918,
      "num_tokens": 2339418236.0,
      "step": 1125
    },
    {
      "epoch": 1.1625932595832262,
      "grad_norm": 0.2361934781074524,
      "learning_rate": 1.5802217068655103e-05,
      "loss": 0.7198,
      "mean_token_accuracy": 0.7839639008045196,
      "num_tokens": 2349792828.0,
      "step": 1130
    },
    {
      "epoch": 1.167738615899151,
      "grad_norm": 0.21388159692287445,
      "learning_rate": 1.5763977969057514e-05,
      "loss": 0.7327,
      "mean_token_accuracy": 0.7798545330762863,
      "num_tokens": 2360200634.0,
      "step": 1135
    },
    {
      "epoch": 1.172883972215076,
      "grad_norm": 0.20029175281524658,
      "learning_rate": 1.5725633908868098e-05,
      "loss": 0.7338,
      "mean_token_accuracy": 0.7797937542200089,
      "num_tokens": 2370596817.0,
      "step": 1140
    },
    {
      "epoch": 1.1780293285310008,
      "grad_norm": 0.19658546149730682,
      "learning_rate": 1.568718612112681e-05,
      "loss": 0.721,
      "mean_token_accuracy": 0.7830342799425125,
      "num_tokens": 2380988279.0,
      "step": 1145
    },
    {
      "epoch": 1.1831746848469256,
      "grad_norm": 0.23174834251403809,
      "learning_rate": 1.5648635842209197e-05,
      "loss": 0.7311,
      "mean_token_accuracy": 0.7808991014957428,
      "num_tokens": 2391412429.0,
      "step": 1150
    },
    {
      "epoch": 1.1883200411628505,
      "grad_norm": 0.21775346994400024,
      "learning_rate": 1.5609984311786645e-05,
      "loss": 0.729,
      "mean_token_accuracy": 0.7810937970876694,
| "num_tokens": 2401771777.0, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 1.1934653974787754, | |
| "grad_norm": 0.21019048988819122, | |
| "learning_rate": 1.5571232772786517e-05, | |
| "loss": 0.7253, | |
| "mean_token_accuracy": 0.7820482671260833, | |
| "num_tokens": 2412159833.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.1986107537947004, | |
| "grad_norm": 0.20135393738746643, | |
| "learning_rate": 1.553238247135216e-05, | |
| "loss": 0.7276, | |
| "mean_token_accuracy": 0.7814355909824371, | |
| "num_tokens": 2422558752.0, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 1.203756110110625, | |
| "grad_norm": 0.18914781510829926, | |
| "learning_rate": 1.549343465680287e-05, | |
| "loss": 0.731, | |
| "mean_token_accuracy": 0.7803467661142349, | |
| "num_tokens": 2432969997.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.20890146642655, | |
| "grad_norm": 0.21189342439174652, | |
| "learning_rate": 1.5454390581593687e-05, | |
| "loss": 0.7262, | |
| "mean_token_accuracy": 0.7820929646492004, | |
| "num_tokens": 2443373443.0, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 1.214046822742475, | |
| "grad_norm": 0.1938139647245407, | |
| "learning_rate": 1.541525150127513e-05, | |
| "loss": 0.721, | |
| "mean_token_accuracy": 0.7835055589675903, | |
| "num_tokens": 2453770770.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.2191921790583997, | |
| "grad_norm": 0.21657834947109222, | |
| "learning_rate": 1.537601867445283e-05, | |
| "loss": 0.7319, | |
| "mean_token_accuracy": 0.7800745993852616, | |
| "num_tokens": 2464158108.0, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 1.2243375353743247, | |
| "grad_norm": 0.20266316831111908, | |
| "learning_rate": 1.5336693362747036e-05, | |
| "loss": 0.7274, | |
| "mean_token_accuracy": 0.7812938541173935, | |
| "num_tokens": 2474574860.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.2294828916902496, | |
| "grad_norm": 0.21585378050804138, | |
| "learning_rate": 1.5297276830752074e-05, | |
| "loss": 0.7262, | |
| "mean_token_accuracy": 0.7819346249103546, | |
| "num_tokens": 2484977909.0, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 1.2346282480061745, | |
| "grad_norm": 0.22400428354740143, | |
| "learning_rate": 1.5257770345995648e-05, | |
| "loss": 0.7325, | |
| "mean_token_accuracy": 0.7801924586296082, | |
| "num_tokens": 2495371556.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.2397736043220993, | |
| "grad_norm": 0.2023976743221283, | |
| "learning_rate": 1.5218175178898106e-05, | |
| "loss": 0.7202, | |
| "mean_token_accuracy": 0.7832947343587875, | |
| "num_tokens": 2505752888.0, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 1.2449189606380242, | |
| "grad_norm": 0.22408372163772583, | |
| "learning_rate": 1.5178492602731581e-05, | |
| "loss": 0.7254, | |
| "mean_token_accuracy": 0.7819278568029404, | |
| "num_tokens": 2516182631.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.250064316953949, | |
| "grad_norm": 0.24158449470996857, | |
| "learning_rate": 1.5138723893579028e-05, | |
| "loss": 0.7296, | |
| "mean_token_accuracy": 0.781261432170868, | |
| "num_tokens": 2526605123.0, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 1.2552096732698739, | |
| "grad_norm": 0.2226988673210144, | |
| "learning_rate": 1.5098870330293218e-05, | |
| "loss": 0.7171, | |
| "mean_token_accuracy": 0.7841993749141694, | |
| "num_tokens": 2537002245.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.2603550295857988, | |
| "grad_norm": 0.23769278824329376, | |
| "learning_rate": 1.505893319445559e-05, | |
| "loss": 0.7325, | |
| "mean_token_accuracy": 0.7801701694726944, | |
| "num_tokens": 2547440043.0, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 1.2655003859017238, | |
| "grad_norm": 0.1922103464603424, | |
| "learning_rate": 1.5018913770335046e-05, | |
| "loss": 0.7274, | |
| "mean_token_accuracy": 0.781620192527771, | |
| "num_tokens": 2557858544.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.2706457422176487, | |
| "grad_norm": 0.18663917481899261, | |
| "learning_rate": 1.4978813344846661e-05, | |
| "loss": 0.7391, | |
| "mean_token_accuracy": 0.7782456696033477, | |
| "num_tokens": 2568263241.0, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 1.2757910985335734, | |
| "grad_norm": 0.19044432044029236, | |
| "learning_rate": 1.4938633207510287e-05, | |
| "loss": 0.7267, | |
| "mean_token_accuracy": 0.7819855302572251, | |
| "num_tokens": 2578685798.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.2809364548494984, | |
| "grad_norm": 0.2261897772550583, | |
| "learning_rate": 1.4898374650409094e-05, | |
| "loss": 0.7266, | |
| "mean_token_accuracy": 0.7815109491348267, | |
| "num_tokens": 2589079528.0, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 1.286081811165423, | |
| "grad_norm": 0.27746737003326416, | |
| "learning_rate": 1.485803896814801e-05, | |
| "loss": 0.7255, | |
| "mean_token_accuracy": 0.7821037322282791, | |
| "num_tokens": 2599488251.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.291227167481348, | |
| "grad_norm": 0.21356722712516785, | |
| "learning_rate": 1.4817627457812107e-05, | |
| "loss": 0.7218, | |
| "mean_token_accuracy": 0.7831762701272964, | |
| "num_tokens": 2609884395.0, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 1.296372523797273, | |
| "grad_norm": 0.1928669810295105, | |
| "learning_rate": 1.4777141418924874e-05, | |
| "loss": 0.7222, | |
| "mean_token_accuracy": 0.7831795990467072, | |
| "num_tokens": 2620262838.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.301517880113198, | |
| "grad_norm": 0.20577941834926605, | |
| "learning_rate": 1.4736582153406431e-05, | |
| "loss": 0.7345, | |
| "mean_token_accuracy": 0.7796419382095336, | |
| "num_tokens": 2630615363.0, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 1.3066632364291229, | |
| "grad_norm": 0.2071027308702469, | |
| "learning_rate": 1.4695950965531679e-05, | |
| "loss": 0.7207, | |
| "mean_token_accuracy": 0.7833283305168152, | |
| "num_tokens": 2641010705.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.3118085927450476, | |
| "grad_norm": 0.20452018082141876, | |
| "learning_rate": 1.4655249161888322e-05, | |
| "loss": 0.7219, | |
| "mean_token_accuracy": 0.7826146423816681, | |
| "num_tokens": 2651367538.0, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 1.3169539490609725, | |
| "grad_norm": 0.19734811782836914, | |
| "learning_rate": 1.46144780513349e-05, | |
| "loss": 0.7293, | |
| "mean_token_accuracy": 0.7808092325925827, | |
| "num_tokens": 2661752746.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.3220993053768972, | |
| "grad_norm": 0.2132222205400467, | |
| "learning_rate": 1.4573638944958654e-05, | |
| "loss": 0.7213, | |
| "mean_token_accuracy": 0.783295625448227, | |
| "num_tokens": 2672170022.0, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 1.3272446616928222, | |
| "grad_norm": 0.20818860828876495, | |
| "learning_rate": 1.4532733156033399e-05, | |
| "loss": 0.7257, | |
| "mean_token_accuracy": 0.7817448288202286, | |
| "num_tokens": 2682534056.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.3323900180087471, | |
| "grad_norm": 0.21955865621566772, | |
| "learning_rate": 1.449176199997726e-05, | |
| "loss": 0.7227, | |
| "mean_token_accuracy": 0.7827985614538193, | |
| "num_tokens": 2692942602.0, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 1.337535374324672, | |
| "grad_norm": 0.2140192687511444, | |
| "learning_rate": 1.4450726794310408e-05, | |
| "loss": 0.7245, | |
| "mean_token_accuracy": 0.7822914987802505, | |
| "num_tokens": 2703334544.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.3426807306405968, | |
| "grad_norm": 0.19470317661762238, | |
| "learning_rate": 1.4409628858612665e-05, | |
| "loss": 0.725, | |
| "mean_token_accuracy": 0.7822528421878815, | |
| "num_tokens": 2713729545.0, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 1.3478260869565217, | |
| "grad_norm": 0.19224713742733002, | |
| "learning_rate": 1.4368469514481083e-05, | |
| "loss": 0.7159, | |
| "mean_token_accuracy": 0.784244379401207, | |
| "num_tokens": 2724133639.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.3529714432724467, | |
| "grad_norm": 0.19122512638568878, | |
| "learning_rate": 1.4327250085487435e-05, | |
| "loss": 0.7318, | |
| "mean_token_accuracy": 0.7805237233638763, | |
| "num_tokens": 2734527785.0, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 1.3581167995883714, | |
| "grad_norm": 0.19252869486808777, | |
| "learning_rate": 1.428597189713566e-05, | |
| "loss": 0.721, | |
| "mean_token_accuracy": 0.7833033174276351, | |
| "num_tokens": 2744932620.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.3632621559042963, | |
| "grad_norm": 0.20513495802879333, | |
| "learning_rate": 1.4244636276819247e-05, | |
| "loss": 0.7288, | |
| "mean_token_accuracy": 0.7811249732971192, | |
| "num_tokens": 2755296881.0, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 1.3684075122202213, | |
| "grad_norm": 0.21072103083133698, | |
| "learning_rate": 1.4203244553778523e-05, | |
| "loss": 0.7267, | |
| "mean_token_accuracy": 0.781619307398796, | |
| "num_tokens": 2765692282.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.3735528685361462, | |
| "grad_norm": 0.23436151444911957, | |
| "learning_rate": 1.4161798059057942e-05, | |
| "loss": 0.7221, | |
| "mean_token_accuracy": 0.7828515231609344, | |
| "num_tokens": 2776081454.0, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 1.378698224852071, | |
| "grad_norm": 0.19753047823905945, | |
| "learning_rate": 1.4120298125463252e-05, | |
| "loss": 0.73, | |
| "mean_token_accuracy": 0.7808351576328277, | |
| "num_tokens": 2786502061.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.3838435811679959, | |
| "grad_norm": 0.2203502655029297, | |
| "learning_rate": 1.4078746087518655e-05, | |
| "loss": 0.7337, | |
| "mean_token_accuracy": 0.7793877094984054, | |
| "num_tokens": 2796907592.0, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 1.3889889374839208, | |
| "grad_norm": 0.19259704649448395, | |
| "learning_rate": 1.4037143281423885e-05, | |
| "loss": 0.7254, | |
| "mean_token_accuracy": 0.7815340638160706, | |
| "num_tokens": 2807315293.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.3941342937998455, | |
| "grad_norm": 0.20095007121562958, | |
| "learning_rate": 1.3995491045011243e-05, | |
| "loss": 0.7248, | |
| "mean_token_accuracy": 0.7817043244838715, | |
| "num_tokens": 2817690875.0, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 1.3992796501157705, | |
| "grad_norm": 0.1883144974708557, | |
| "learning_rate": 1.395379071770257e-05, | |
| "loss": 0.7207, | |
| "mean_token_accuracy": 0.7828416913747788, | |
| "num_tokens": 2828099023.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.4044250064316954, | |
| "grad_norm": 0.18824101984500885, | |
| "learning_rate": 1.3912043640466175e-05, | |
| "loss": 0.7194, | |
| "mean_token_accuracy": 0.7835315078496933, | |
| "num_tokens": 2838513670.0, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 1.4095703627476204, | |
| "grad_norm": 0.19366908073425293, | |
| "learning_rate": 1.387025115577373e-05, | |
| "loss": 0.74, | |
| "mean_token_accuracy": 0.7780154138803482, | |
| "num_tokens": 2848908140.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.414715719063545, | |
| "grad_norm": 0.18834272027015686, | |
| "learning_rate": 1.382841460755707e-05, | |
| "loss": 0.7279, | |
| "mean_token_accuracy": 0.7807209342718124, | |
| "num_tokens": 2859277820.0, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 1.41986107537947, | |
| "grad_norm": 0.20872853696346283, | |
| "learning_rate": 1.378653534116501e-05, | |
| "loss": 0.7174, | |
| "mean_token_accuracy": 0.7837792187929153, | |
| "num_tokens": 2869692721.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.425006431695395, | |
| "grad_norm": 0.21112094819545746, | |
| "learning_rate": 1.3744614703320046e-05, | |
| "loss": 0.7229, | |
| "mean_token_accuracy": 0.7826662242412568, | |
| "num_tokens": 2880093189.0, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 1.4301517880113197, | |
| "grad_norm": 0.18763834238052368, | |
| "learning_rate": 1.3702654042075077e-05, | |
| "loss": 0.7244, | |
| "mean_token_accuracy": 0.7820846647024154, | |
| "num_tokens": 2890505046.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.4352971443272446, | |
| "grad_norm": 0.1809283047914505, | |
| "learning_rate": 1.3660654706770045e-05, | |
| "loss": 0.7304, | |
| "mean_token_accuracy": 0.7803399920463562, | |
| "num_tokens": 2900869962.0, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 1.4404425006431696, | |
| "grad_norm": 0.19696731865406036, | |
| "learning_rate": 1.3618618047988541e-05, | |
| "loss": 0.7229, | |
| "mean_token_accuracy": 0.7826083064079284, | |
| "num_tokens": 2911235205.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.4455878569590945, | |
| "grad_norm": 0.21456697583198547, | |
| "learning_rate": 1.3576545417514379e-05, | |
| "loss": 0.7238, | |
| "mean_token_accuracy": 0.7821739882230758, | |
| "num_tokens": 2921630626.0, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 1.4507332132750193, | |
| "grad_norm": 0.2006523162126541, | |
| "learning_rate": 1.3534438168288122e-05, | |
| "loss": 0.7236, | |
| "mean_token_accuracy": 0.7829265475273133, | |
| "num_tokens": 2931987655.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.4558785695909442, | |
| "grad_norm": 0.20633164048194885, | |
| "learning_rate": 1.3492297654363582e-05, | |
| "loss": 0.7303, | |
| "mean_token_accuracy": 0.7806042492389679, | |
| "num_tokens": 2942398851.0, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 1.461023925906869, | |
| "grad_norm": 0.18342465162277222, | |
| "learning_rate": 1.3450125230864265e-05, | |
| "loss": 0.7221, | |
| "mean_token_accuracy": 0.7832267910242081, | |
| "num_tokens": 2952797589.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.4661692822227939, | |
| "grad_norm": 0.18584422767162323, | |
| "learning_rate": 1.3407922253939801e-05, | |
| "loss": 0.7207, | |
| "mean_token_accuracy": 0.7827514231204986, | |
| "num_tokens": 2963142662.0, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 1.4713146385387188, | |
| "grad_norm": 0.2012760490179062, | |
| "learning_rate": 1.3365690080722349e-05, | |
| "loss": 0.7228, | |
| "mean_token_accuracy": 0.7825741022825241, | |
| "num_tokens": 2973551267.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.4764599948546437, | |
| "grad_norm": 0.1869860738515854, | |
| "learning_rate": 1.3323430069282922e-05, | |
| "loss": 0.7123, | |
| "mean_token_accuracy": 0.7854143947362899, | |
| "num_tokens": 2983967802.0, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 1.4816053511705687, | |
| "grad_norm": 0.18766240775585175, | |
| "learning_rate": 1.3281143578587747e-05, | |
| "loss": 0.7204, | |
| "mean_token_accuracy": 0.7834302335977554, | |
| "num_tokens": 2994361017.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.4867507074864934, | |
| "grad_norm": 0.20129527151584625, | |
| "learning_rate": 1.3238831968454547e-05, | |
| "loss": 0.7295, | |
| "mean_token_accuracy": 0.7809592038393021, | |
| "num_tokens": 3004789429.0, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 1.4918960638024183, | |
| "grad_norm": 0.1840723752975464, | |
| "learning_rate": 1.3196496599508818e-05, | |
| "loss": 0.7164, | |
| "mean_token_accuracy": 0.7845001757144928, | |
| "num_tokens": 3015178802.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.497041420118343, | |
| "grad_norm": 0.18060751259326935, | |
| "learning_rate": 1.3154138833140066e-05, | |
| "loss": 0.7194, | |
| "mean_token_accuracy": 0.7834681749343873, | |
| "num_tokens": 3025590677.0, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 1.502186776434268, | |
| "grad_norm": 0.20038992166519165, | |
| "learning_rate": 1.3111760031458056e-05, | |
| "loss": 0.7234, | |
| "mean_token_accuracy": 0.7822674155235291, | |
| "num_tokens": 3036005732.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.507332132750193, | |
| "grad_norm": 0.17819659411907196, | |
| "learning_rate": 1.3069361557248972e-05, | |
| "loss": 0.7255, | |
| "mean_token_accuracy": 0.781733363866806, | |
| "num_tokens": 3046403093.0, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 1.512477489066118, | |
| "grad_norm": 0.18325041234493256, | |
| "learning_rate": 1.3026944773931623e-05, | |
| "loss": 0.7241, | |
| "mean_token_accuracy": 0.7819523394107819, | |
| "num_tokens": 3056805589.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.5176228453820428, | |
| "grad_norm": 0.1812821924686432, | |
| "learning_rate": 1.2984511045513583e-05, | |
| "loss": 0.7336, | |
| "mean_token_accuracy": 0.7796476185321808, | |
| "num_tokens": 3067202755.0, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 1.5227682016979676, | |
| "grad_norm": 0.1757994294166565, | |
| "learning_rate": 1.2942061736547338e-05, | |
| "loss": 0.7252, | |
| "mean_token_accuracy": 0.7821748554706573, | |
| "num_tokens": 3077623015.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.5279135580138925, | |
| "grad_norm": 0.18919821083545685, | |
| "learning_rate": 1.2899598212086407e-05, | |
| "loss": 0.7224, | |
| "mean_token_accuracy": 0.7825713455677032, | |
| "num_tokens": 3088009778.0, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 1.5330589143298172, | |
| "grad_norm": 0.18530713021755219, | |
| "learning_rate": 1.285712183764142e-05, | |
| "loss": 0.7308, | |
| "mean_token_accuracy": 0.7805260062217713, | |
| "num_tokens": 3098430248.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.5382042706457422, | |
| "grad_norm": 0.1960536390542984, | |
| "learning_rate": 1.2814633979136254e-05, | |
| "loss": 0.7224, | |
| "mean_token_accuracy": 0.7830262005329132, | |
| "num_tokens": 3108837811.0, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 1.543349626961667, | |
| "grad_norm": 0.19076813757419586, | |
| "learning_rate": 1.2772136002864067e-05, | |
| "loss": 0.7221, | |
| "mean_token_accuracy": 0.783091539144516, | |
| "num_tokens": 3119231467.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.548494983277592, | |
| "grad_norm": 0.20653417706489563, | |
| "learning_rate": 1.2729629275443373e-05, | |
| "loss": 0.7256, | |
| "mean_token_accuracy": 0.781879261136055, | |
| "num_tokens": 3129640782.0, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 1.553640339593517, | |
| "grad_norm": 0.19807811081409454, | |
| "learning_rate": 1.268711516377411e-05, | |
| "loss": 0.7156, | |
| "mean_token_accuracy": 0.7845076858997345, | |
| "num_tokens": 3140041502.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.5587856959094417, | |
| "grad_norm": 0.18549486994743347, | |
| "learning_rate": 1.2644595034993667e-05, | |
| "loss": 0.7145, | |
| "mean_token_accuracy": 0.7843082189559937, | |
| "num_tokens": 3150443408.0, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 1.5639310522253667, | |
| "grad_norm": 0.18279866874217987, | |
| "learning_rate": 1.260207025643293e-05, | |
| "loss": 0.7247, | |
| "mean_token_accuracy": 0.7818575918674469, | |
| "num_tokens": 3160858821.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.5690764085412914, | |
| "grad_norm": 0.19264079630374908, | |
| "learning_rate": 1.25595421955723e-05, | |
| "loss": 0.7373, | |
| "mean_token_accuracy": 0.7786633670330048, | |
| "num_tokens": 3171243172.0, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 1.5742217648572163, | |
| "grad_norm": 0.20798954367637634, | |
| "learning_rate": 1.2517012219997743e-05, | |
| "loss": 0.7263, | |
| "mean_token_accuracy": 0.7813588201999664, | |
| "num_tokens": 3181666806.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.5793671211731413, | |
| "grad_norm": 0.22132764756679535, | |
| "learning_rate": 1.2474481697356784e-05, | |
| "loss": 0.7158, | |
| "mean_token_accuracy": 0.7841507345438004, | |
| "num_tokens": 3192068095.0, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 1.5845124774890662, | |
| "grad_norm": 0.1778470277786255, | |
| "learning_rate": 1.2431951995314547e-05, | |
| "loss": 0.7112, | |
| "mean_token_accuracy": 0.7860257804393769, | |
| "num_tokens": 3202473819.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.5896578338049911, | |
| "grad_norm": 0.20948489010334015, | |
| "learning_rate": 1.2389424481509766e-05, | |
| "loss": 0.7283, | |
| "mean_token_accuracy": 0.7805652767419815, | |
| "num_tokens": 3212838888.0, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 1.5948031901209159, | |
| "grad_norm": 0.22574825584888458, | |
| "learning_rate": 1.2346900523510804e-05, | |
| "loss": 0.7246, | |
| "mean_token_accuracy": 0.7815930396318436, | |
| "num_tokens": 3223222863.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.5999485464368406, | |
| "grad_norm": 0.19260616600513458, | |
| "learning_rate": 1.2304381488771684e-05, | |
| "loss": 0.7192, | |
| "mean_token_accuracy": 0.7832415938377381, | |
| "num_tokens": 3233636338.0, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 1.6050939027527655, | |
| "grad_norm": 0.2285853922367096, | |
| "learning_rate": 1.2261868744588108e-05, | |
| "loss": 0.7123, | |
| "mean_token_accuracy": 0.7852210611104965, | |
| "num_tokens": 3244050232.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.6102392590686905, | |
| "grad_norm": 0.21371160447597504, | |
| "learning_rate": 1.2219363658053496e-05, | |
| "loss": 0.7236, | |
| "mean_token_accuracy": 0.7823358774185181, | |
| "num_tokens": 3254430770.0, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 1.6153846153846154, | |
| "grad_norm": 0.20527417957782745, | |
| "learning_rate": 1.217686759601501e-05, | |
| "loss": 0.7147, | |
| "mean_token_accuracy": 0.7844323009252548, | |
| "num_tokens": 3264847891.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.6205299717005404, | |
| "grad_norm": 0.18310104310512543, | |
| "learning_rate": 1.2134381925029613e-05, | |
| "loss": 0.725, | |
| "mean_token_accuracy": 0.7815193057060241, | |
| "num_tokens": 3275273883.0, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 1.6256753280164653, | |
| "grad_norm": 0.18352478742599487, | |
| "learning_rate": 1.209190801132012e-05, | |
| "loss": 0.7174, | |
| "mean_token_accuracy": 0.7840063363313675, | |
| "num_tokens": 3285662067.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.63082068433239, | |
| "grad_norm": 0.1829749196767807, | |
| "learning_rate": 1.2049447220731266e-05, | |
| "loss": 0.7256, | |
| "mean_token_accuracy": 0.7813293248414993, | |
| "num_tokens": 3296058473.0, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 1.6359660406483147, | |
| "grad_norm": 0.18960633873939514, | |
| "learning_rate": 1.2007000918685786e-05, | |
| "loss": 0.7283, | |
| "mean_token_accuracy": 0.780817398428917, | |
| "num_tokens": 3306487200.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.6411113969642397, | |
| "grad_norm": 0.18444406986236572, | |
| "learning_rate": 1.196457047014049e-05, | |
| "loss": 0.7166, | |
| "mean_token_accuracy": 0.7841069996356964, | |
| "num_tokens": 3316886128.0, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 1.6462567532801646, | |
| "grad_norm": 0.18856759369373322, | |
| "learning_rate": 1.1922157239542396e-05, | |
| "loss": 0.7224, | |
| "mean_token_accuracy": 0.7824205875396728, | |
| "num_tokens": 3327276884.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.6514021095960896, | |
| "grad_norm": 0.18209627270698547, | |
| "learning_rate": 1.1879762590784832e-05, | |
| "loss": 0.7097, | |
| "mean_token_accuracy": 0.7861823856830596, | |
| "num_tokens": 3337639665.0, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 1.6565474659120145, | |
| "grad_norm": 0.18820519745349884, | |
| "learning_rate": 1.1837387887163594e-05, | |
| "loss": 0.7186, | |
| "mean_token_accuracy": 0.783584377169609, | |
| "num_tokens": 3348052567.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.6616928222279392, | |
| "grad_norm": 0.2538444399833679, | |
| "learning_rate": 1.1795034491333089e-05, | |
| "loss": 0.7258, | |
| "mean_token_accuracy": 0.7812894821166992, | |
| "num_tokens": 3358442738.0, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 1.6668381785438642, | |
| "grad_norm": 0.22863799333572388, | |
| "learning_rate": 1.175270376526252e-05, | |
| "loss": 0.7285, | |
| "mean_token_accuracy": 0.7804809838533402, | |
| "num_tokens": 3368845776.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.671983534859789, | |
| "grad_norm": 0.1943485289812088, | |
| "learning_rate": 1.1710397070192103e-05, | |
| "loss": 0.7131, | |
| "mean_token_accuracy": 0.7848410785198212, | |
| "num_tokens": 3379247751.0, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 1.6771288911757138, | |
| "grad_norm": 0.25817155838012695, | |
| "learning_rate": 1.1668115766589278e-05, | |
| "loss": 0.7209, | |
| "mean_token_accuracy": 0.7828822433948517, | |
| "num_tokens": 3389594723.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.6822742474916388, | |
| "grad_norm": 0.21201983094215393, | |
| "learning_rate": 1.1625861214104967e-05, | |
| "loss": 0.7148, | |
| "mean_token_accuracy": 0.7844018071889878, | |
| "num_tokens": 3399983766.0, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 1.6874196038075637, | |
| "grad_norm": 0.20084577798843384, | |
| "learning_rate": 1.1583634771529843e-05, | |
| "loss": 0.7167, | |
| "mean_token_accuracy": 0.783909472823143, | |
| "num_tokens": 3410348245.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.6925649601234887, | |
| "grad_norm": 0.19142308831214905, | |
| "learning_rate": 1.1541437796750651e-05, | |
| "loss": 0.7216, | |
| "mean_token_accuracy": 0.7828730583190918, | |
| "num_tokens": 3420763568.0, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 1.6977103164394134, | |
| "grad_norm": 0.21393629908561707, | |
| "learning_rate": 1.1499271646706525e-05, | |
| "loss": 0.7299, | |
| "mean_token_accuracy": 0.7805344760417938, | |
| "num_tokens": 3431161357.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.7028556727553383, | |
| "grad_norm": 0.20122776925563812, | |
| "learning_rate": 1.1457137677345362e-05, | |
| "loss": 0.7147, | |
| "mean_token_accuracy": 0.7848177880048752, | |
| "num_tokens": 3441567248.0, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 1.708001029071263, | |
| "grad_norm": 0.21072812378406525, | |
| "learning_rate": 1.1415037243580219e-05, | |
| "loss": 0.7069, | |
| "mean_token_accuracy": 0.7868409931659699, | |
| "num_tokens": 3451983024.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.713146385387188, | |
| "grad_norm": 0.19437959790229797, | |
| "learning_rate": 1.1372971699245732e-05, | |
| "loss": 0.7196, | |
| "mean_token_accuracy": 0.7829753428697586, | |
| "num_tokens": 3462383943.0, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 1.718291741703113, | |
| "grad_norm": 0.19020071625709534, | |
| "learning_rate": 1.1330942397054599e-05, | |
| "loss": 0.7231, | |
| "mean_token_accuracy": 0.7823878258466721, | |
| "num_tokens": 3472795076.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.7234370980190379, | |
| "grad_norm": 0.17885635793209076, | |
| "learning_rate": 1.1288950688554068e-05, | |
| "loss": 0.7307, | |
| "mean_token_accuracy": 0.7804099589586257, | |
| "num_tokens": 3483201272.0, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 1.7285824543349628, | |
| "grad_norm": 0.1998845338821411, | |
| "learning_rate": 1.1246997924082465e-05, | |
| "loss": 0.7178, | |
| "mean_token_accuracy": 0.7835394382476807, | |
| "num_tokens": 3493592730.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.7337278106508875, | |
| "grad_norm": 0.1758834719657898, | |
| "learning_rate": 1.1205085452725796e-05, | |
| "loss": 0.7246, | |
| "mean_token_accuracy": 0.7821025729179383, | |
| "num_tokens": 3504000714.0, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 1.7388731669668125, | |
| "grad_norm": 0.18512143194675446, | |
| "learning_rate": 1.116321462227435e-05, | |
| "loss": 0.7217, | |
| "mean_token_accuracy": 0.7826696693897247, | |
| "num_tokens": 3514403874.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.7440185232827372, | |
| "grad_norm": 0.18202784657478333, | |
| "learning_rate": 1.112138677917935e-05, | |
| "loss": 0.7098, | |
| "mean_token_accuracy": 0.7855264693498611, | |
| "num_tokens": 3524800705.0, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 1.7491638795986622, | |
| "grad_norm": 0.17856541275978088, | |
| "learning_rate": 1.1079603268509671e-05, | |
| "loss": 0.7223, | |
| "mean_token_accuracy": 0.7820982217788697, | |
| "num_tokens": 3535216074.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.754309235914587, | |
| "grad_norm": 0.18560512363910675, | |
| "learning_rate": 1.1037865433908574e-05, | |
| "loss": 0.7163, | |
| "mean_token_accuracy": 0.7839051306247711, | |
| "num_tokens": 3545649484.0, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 1.759454592230512, | |
| "grad_norm": 0.1768876314163208, | |
| "learning_rate": 1.0996174617550506e-05, | |
| "loss": 0.7147, | |
| "mean_token_accuracy": 0.78475923538208, | |
| "num_tokens": 3556016373.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.764599948546437, | |
| "grad_norm": 0.18054403364658356, | |
| "learning_rate": 1.0954532160097937e-05, | |
| "loss": 0.7199, | |
| "mean_token_accuracy": 0.783240556716919, | |
| "num_tokens": 3566403061.0, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 1.7697453048623617, | |
| "grad_norm": 0.1674446016550064, | |
| "learning_rate": 1.0912939400658243e-05, | |
| "loss": 0.7223, | |
| "mean_token_accuracy": 0.7823190867900849, | |
| "num_tokens": 3576811704.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.7748906611782866, | |
| "grad_norm": 0.19289781153202057, | |
| "learning_rate": 1.0871397676740647e-05, | |
| "loss": 0.7268, | |
| "mean_token_accuracy": 0.7810219496488571, | |
| "num_tokens": 3587228256.0, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 1.7800360174942114, | |
| "grad_norm": 0.19123966991901398, | |
| "learning_rate": 1.0829908324213214e-05, | |
| "loss": 0.7203, | |
| "mean_token_accuracy": 0.7828882426023483, | |
| "num_tokens": 3597638203.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.7851813738101363, | |
| "grad_norm": 0.17165765166282654, | |
| "learning_rate": 1.0788472677259888e-05, | |
| "loss": 0.7237, | |
| "mean_token_accuracy": 0.7817003160715104, | |
| "num_tokens": 3608044316.0, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 1.7903267301260612, | |
| "grad_norm": 0.17944514751434326, | |
| "learning_rate": 1.074709206833759e-05, | |
| "loss": 0.7254, | |
| "mean_token_accuracy": 0.7813136577606201, | |
| "num_tokens": 3618445618.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.7954720864419862, | |
| "grad_norm": 0.19152477383613586, | |
| "learning_rate": 1.070576782813336e-05, | |
| "loss": 0.7244, | |
| "mean_token_accuracy": 0.7815535515546799, | |
| "num_tokens": 3628864024.0, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 1.8006174427579111, | |
| "grad_norm": 0.20791219174861908, | |
| "learning_rate": 1.0664501285521585e-05, | |
| "loss": 0.7185, | |
| "mean_token_accuracy": 0.7832733541727066, | |
| "num_tokens": 3639237896.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.8057627990738359, | |
| "grad_norm": 0.17812472581863403, | |
| "learning_rate": 1.0623293767521248e-05, | |
| "loss": 0.7241, | |
| "mean_token_accuracy": 0.7819753557443618, | |
| "num_tokens": 3649639864.0, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 1.8109081553897606, | |
| "grad_norm": 0.2049945890903473, | |
| "learning_rate": 1.0582146599253271e-05, | |
| "loss": 0.7257, | |
| "mean_token_accuracy": 0.7812327802181244, | |
| "num_tokens": 3660058988.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.8160535117056855, | |
| "grad_norm": 0.17638404667377472, | |
| "learning_rate": 1.0541061103897881e-05, | |
| "loss": 0.7224, | |
| "mean_token_accuracy": 0.782220122218132, | |
| "num_tokens": 3670478945.0, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 1.8211988680216105, | |
| "grad_norm": 0.19884805381298065, | |
| "learning_rate": 1.0500038602652087e-05, | |
| "loss": 0.7231, | |
| "mean_token_accuracy": 0.7823522746562958, | |
| "num_tokens": 3680853435.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.8263442243375354, | |
| "grad_norm": 0.2005850225687027, | |
| "learning_rate": 1.0459080414687166e-05, | |
| "loss": 0.7109, | |
| "mean_token_accuracy": 0.7854242146015167, | |
| "num_tokens": 3691255046.0, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 1.8314895806534603, | |
| "grad_norm": 0.18633082509040833, | |
| "learning_rate": 1.041818785710627e-05, | |
| "loss": 0.7138, | |
| "mean_token_accuracy": 0.7844531148672104, | |
| "num_tokens": 3701666509.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.8366349369693853, | |
| "grad_norm": 0.20367996394634247, | |
| "learning_rate": 1.037736224490205e-05, | |
| "loss": 0.7277, | |
| "mean_token_accuracy": 0.7811777710914611, | |
| "num_tokens": 3712030587.0, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 1.84178029328531, | |
| "grad_norm": 0.19179487228393555, | |
| "learning_rate": 1.033660489091437e-05, | |
| "loss": 0.7184, | |
| "mean_token_accuracy": 0.7832197934389115, | |
| "num_tokens": 3722418473.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.8469256496012347, | |
| "grad_norm": 0.1956593543291092, | |
| "learning_rate": 1.0295917105788116e-05, | |
| "loss": 0.7176, | |
| "mean_token_accuracy": 0.7836333483457565, | |
| "num_tokens": 3732817985.0, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 1.8520710059171597, | |
| "grad_norm": 0.18963828682899475, | |
| "learning_rate": 1.0255300197931008e-05, | |
| "loss": 0.7264, | |
| "mean_token_accuracy": 0.781423020362854, | |
| "num_tokens": 3743226210.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.8572163622330846, | |
| "grad_norm": 0.16611480712890625, | |
| "learning_rate": 1.021475547347157e-05, | |
| "loss": 0.7201, | |
| "mean_token_accuracy": 0.7829820960760117, | |
| "num_tokens": 3753595703.0, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 1.8623617185490096, | |
| "grad_norm": 0.17385919392108917, | |
| "learning_rate": 1.017428423621708e-05, | |
| "loss": 0.7294, | |
| "mean_token_accuracy": 0.7805358350276947, | |
| "num_tokens": 3764011843.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.8675070748649345, | |
| "grad_norm": 0.18427050113677979, | |
| "learning_rate": 1.0133887787611691e-05, | |
| "loss": 0.7199, | |
| "mean_token_accuracy": 0.7830179333686829, | |
| "num_tokens": 3774434254.0, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 1.8726524311808592, | |
| "grad_norm": 0.18085253238677979, | |
| "learning_rate": 1.0093567426694544e-05, | |
| "loss": 0.71, | |
| "mean_token_accuracy": 0.7860257983207702, | |
| "num_tokens": 3784829343.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.8777977874967842, | |
| "grad_norm": 0.18546296656131744, | |
| "learning_rate": 1.0053324450058017e-05, | |
| "loss": 0.7121, | |
| "mean_token_accuracy": 0.7851428180932999, | |
| "num_tokens": 3795222093.0, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 1.8829431438127089, | |
| "grad_norm": 0.1925223022699356, | |
| "learning_rate": 1.0013160151806019e-05, | |
| "loss": 0.7235, | |
| "mean_token_accuracy": 0.7820713192224502, | |
| "num_tokens": 3805638950.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.8880885001286338, | |
| "grad_norm": 0.17734932899475098, | |
| "learning_rate": 9.973075823512368e-06, | |
| "loss": 0.7232, | |
| "mean_token_accuracy": 0.78176209628582, | |
| "num_tokens": 3816016677.0, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 1.8932338564445588, | |
| "grad_norm": 0.1860755831003189, | |
| "learning_rate": 9.933072754179285e-06, | |
| "loss": 0.7142, | |
| "mean_token_accuracy": 0.7846502423286438, | |
| "num_tokens": 3826390292.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.8983792127604837, | |
| "grad_norm": 0.17822657525539398, | |
| "learning_rate": 9.893152230195909e-06, | |
| "loss": 0.7158, | |
| "mean_token_accuracy": 0.7840298235416412, | |
| "num_tokens": 3836783019.0, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 1.9035245690764087, | |
| "grad_norm": 0.19308920204639435, | |
| "learning_rate": 9.85331553529696e-06, | |
| "loss": 0.7089, | |
| "mean_token_accuracy": 0.7863038212060929, | |
| "num_tokens": 3847188499.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.9086699253923334, | |
| "grad_norm": 0.18061098456382751, | |
| "learning_rate": 9.813563950521435e-06, | |
| "loss": 0.7116, | |
| "mean_token_accuracy": 0.7850539714097977, | |
| "num_tokens": 3857595213.0, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 1.9138152817082583, | |
| "grad_norm": 0.18107041716575623, | |
| "learning_rate": 9.773898754171425e-06, | |
| "loss": 0.7219, | |
| "mean_token_accuracy": 0.7823956727981567, | |
| "num_tokens": 3867991091.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.918960638024183, | |
| "grad_norm": 0.18848438560962677, | |
| "learning_rate": 9.734321221771003e-06, | |
| "loss": 0.7211, | |
| "mean_token_accuracy": 0.7825707286596298, | |
| "num_tokens": 3878420983.0, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 1.924105994340108, | |
| "grad_norm": 0.1764371693134308, | |
| "learning_rate": 9.69483262602522e-06, | |
| "loss": 0.7193, | |
| "mean_token_accuracy": 0.7829346388578415, | |
| "num_tokens": 3888799338.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.929251350656033, | |
| "grad_norm": 0.21297839283943176, | |
| "learning_rate": 9.655434236779157e-06, | |
| "loss": 0.7255, | |
| "mean_token_accuracy": 0.78101367354393, | |
| "num_tokens": 3899196340.0, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 1.9343967069719579, | |
| "grad_norm": 0.16781088709831238, | |
| "learning_rate": 9.616127320977103e-06, | |
| "loss": 0.7241, | |
| "mean_token_accuracy": 0.7818098127841949, | |
| "num_tokens": 3909607686.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.9395420632878828, | |
| "grad_norm": 0.1716819554567337, | |
| "learning_rate": 9.576913142621814e-06, | |
| "loss": 0.7226, | |
| "mean_token_accuracy": 0.7824906349182129, | |
| "num_tokens": 3920018100.0, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 1.9446874196038075, | |
| "grad_norm": 0.16348664462566376, | |
| "learning_rate": 9.537792962733865e-06, | |
| "loss": 0.7087, | |
| "mean_token_accuracy": 0.7859474241733551, | |
| "num_tokens": 3930421105.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.9498327759197325, | |
| "grad_norm": 0.17886386811733246, | |
| "learning_rate": 9.498768039311091e-06, | |
| "loss": 0.7195, | |
| "mean_token_accuracy": 0.7828368335962296, | |
| "num_tokens": 3940804188.0, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 1.9549781322356572, | |
| "grad_norm": 0.19559861719608307, | |
| "learning_rate": 9.459839627288149e-06, | |
| "loss": 0.7223, | |
| "mean_token_accuracy": 0.7822702258825303, | |
| "num_tokens": 3951164761.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.9601234885515821, | |
| "grad_norm": 0.1999935507774353, | |
| "learning_rate": 9.421008978496147e-06, | |
| "loss": 0.7302, | |
| "mean_token_accuracy": 0.7797028571367264, | |
| "num_tokens": 3961562769.0, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 1.965268844867507, | |
| "grad_norm": 0.17995114624500275, | |
| "learning_rate": 9.3822773416224e-06, | |
| "loss": 0.7242, | |
| "mean_token_accuracy": 0.7817402511835099, | |
| "num_tokens": 3971972365.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.970414201183432, | |
| "grad_norm": 0.19008983671665192, | |
| "learning_rate": 9.343645962170267e-06, | |
| "loss": 0.701, | |
| "mean_token_accuracy": 0.7879698783159256, | |
| "num_tokens": 3982366725.0, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 1.975559557499357, | |
| "grad_norm": 0.1909506916999817, | |
| "learning_rate": 9.305116082419098e-06, | |
| "loss": 0.7189, | |
| "mean_token_accuracy": 0.7828688323497772, | |
| "num_tokens": 3992755868.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.9807049138152817, | |
| "grad_norm": 0.20575623214244843, | |
| "learning_rate": 9.266688941384307e-06, | |
| "loss": 0.7144, | |
| "mean_token_accuracy": 0.7844961941242218, | |
| "num_tokens": 4003176984.0, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 1.9858502701312066, | |
| "grad_norm": 0.19786439836025238, | |
| "learning_rate": 9.228365774777498e-06, | |
| "loss": 0.7134, | |
| "mean_token_accuracy": 0.7853114068508148, | |
| "num_tokens": 4013594555.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.9909956264471314, | |
| "grad_norm": 0.19185397028923035, | |
| "learning_rate": 9.190147814966747e-06, | |
| "loss": 0.717, | |
| "mean_token_accuracy": 0.7838179767131805, | |
| "num_tokens": 4023992898.0, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 1.9961409827630563, | |
| "grad_norm": 0.187905415892601, | |
| "learning_rate": 9.152036290936966e-06, | |
| "loss": 0.7137, | |
| "mean_token_accuracy": 0.7846971601247787, | |
| "num_tokens": 4034394748.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 2.001029071263185, | |
| "grad_norm": 0.2134164422750473, | |
| "learning_rate": 9.114032428250385e-06, | |
| "loss": 0.7088, | |
| "mean_token_accuracy": 0.7855551619278757, | |
| "num_tokens": 4044275069.0, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 2.00617442757911, | |
| "grad_norm": 0.19707649946212769, | |
| "learning_rate": 9.07613744900714e-06, | |
| "loss": 0.6946, | |
| "mean_token_accuracy": 0.7890868008136749, | |
| "num_tokens": 4054664720.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.0113197838950345, | |
| "grad_norm": 0.1966494768857956, | |
| "learning_rate": 9.038352571805973e-06, | |
| "loss": 0.7024, | |
| "mean_token_accuracy": 0.7869040161371231, | |
| "num_tokens": 4065081909.0, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 2.0164651402109595, | |
| "grad_norm": 0.21523414552211761, | |
| "learning_rate": 9.000679011705048e-06, | |
| "loss": 0.7, | |
| "mean_token_accuracy": 0.7871535241603851, | |
| "num_tokens": 4075473101.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 2.0216104965268844, | |
| "grad_norm": 0.20164090394973755, | |
| "learning_rate": 8.963117980182871e-06, | |
| "loss": 0.6879, | |
| "mean_token_accuracy": 0.7908169955015183, | |
| "num_tokens": 4085893536.0, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 2.0267558528428093, | |
| "grad_norm": 0.1790938675403595, | |
| "learning_rate": 8.925670685099344e-06, | |
| "loss": 0.6966, | |
| "mean_token_accuracy": 0.7879111260175705, | |
| "num_tokens": 4096303930.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 2.0319012091587343, | |
| "grad_norm": 0.18033479154109955, | |
| "learning_rate": 8.888338330656909e-06, | |
| "loss": 0.6907, | |
| "mean_token_accuracy": 0.7897056668996811, | |
| "num_tokens": 4106711156.0, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 2.0370465654746592, | |
| "grad_norm": 0.18739460408687592, | |
| "learning_rate": 8.851122117361845e-06, | |
| "loss": 0.6848, | |
| "mean_token_accuracy": 0.7917119234800338, | |
| "num_tokens": 4117100496.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 2.042191921790584, | |
| "grad_norm": 0.18272534012794495, | |
| "learning_rate": 8.814023241985633e-06, | |
| "loss": 0.7014, | |
| "mean_token_accuracy": 0.7867262125015259, | |
| "num_tokens": 4127509126.0, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 2.0473372781065087, | |
| "grad_norm": 0.2082863599061966, | |
| "learning_rate": 8.777042897526491e-06, | |
| "loss": 0.6971, | |
| "mean_token_accuracy": 0.7877671688795089, | |
| "num_tokens": 4137900742.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 2.0524826344224336, | |
| "grad_norm": 0.18298238515853882, | |
| "learning_rate": 8.740182273171021e-06, | |
| "loss": 0.6937, | |
| "mean_token_accuracy": 0.789008492231369, | |
| "num_tokens": 4148307231.0, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 2.0576279907383586, | |
| "grad_norm": 0.20916491746902466, | |
| "learning_rate": 8.703442554255945e-06, | |
| "loss": 0.6971, | |
| "mean_token_accuracy": 0.7878756642341613, | |
| "num_tokens": 4158703927.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.0627733470542835, | |
| "grad_norm": 0.17486633360385895, | |
| "learning_rate": 8.666824922229993e-06, | |
| "loss": 0.6997, | |
| "mean_token_accuracy": 0.7873221039772034, | |
| "num_tokens": 4169110558.0, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 2.0679187033702084, | |
| "grad_norm": 0.18294841051101685, | |
| "learning_rate": 8.630330554615918e-06, | |
| "loss": 0.6957, | |
| "mean_token_accuracy": 0.7886905431747436, | |
| "num_tokens": 4179479610.0, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 2.0730640596861334, | |
| "grad_norm": 0.17662659287452698, | |
| "learning_rate": 8.593960624972635e-06, | |
| "loss": 0.6896, | |
| "mean_token_accuracy": 0.790263557434082, | |
| "num_tokens": 4189878842.0, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 2.0782094160020583, | |
| "grad_norm": 0.18194252252578735, | |
| "learning_rate": 8.557716302857469e-06, | |
| "loss": 0.6955, | |
| "mean_token_accuracy": 0.7886851370334625, | |
| "num_tokens": 4200279229.0, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 2.083354772317983, | |
| "grad_norm": 0.1726061850786209, | |
| "learning_rate": 8.521598753788538e-06, | |
| "loss": 0.6975, | |
| "mean_token_accuracy": 0.7880923539400101, | |
| "num_tokens": 4210680215.0, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 2.0885001286339078, | |
| "grad_norm": 0.17614272236824036, | |
| "learning_rate": 8.485609139207312e-06, | |
| "loss": 0.6859, | |
| "mean_token_accuracy": 0.7908884584903717, | |
| "num_tokens": 4221086681.0, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 2.0936454849498327, | |
| "grad_norm": 0.16882190108299255, | |
| "learning_rate": 8.449748616441217e-06, | |
| "loss": 0.6888, | |
| "mean_token_accuracy": 0.7902822762727737, | |
| "num_tokens": 4231490233.0, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 2.0987908412657577, | |
| "grad_norm": 0.18029114603996277, | |
| "learning_rate": 8.414018338666453e-06, | |
| "loss": 0.6964, | |
| "mean_token_accuracy": 0.7881254225969314, | |
| "num_tokens": 4241880702.0, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 2.1039361975816826, | |
| "grad_norm": 0.1758222132921219, | |
| "learning_rate": 8.378419454870885e-06, | |
| "loss": 0.6922, | |
| "mean_token_accuracy": 0.7890959054231643, | |
| "num_tokens": 4252273338.0, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 2.1090815538976075, | |
| "grad_norm": 0.18869197368621826, | |
| "learning_rate": 8.34295310981712e-06, | |
| "loss": 0.6975, | |
| "mean_token_accuracy": 0.7877348899841309, | |
| "num_tokens": 4262684519.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 2.1142269102135325, | |
| "grad_norm": 0.19612114131450653, | |
| "learning_rate": 8.307620444005675e-06, | |
| "loss": 0.6857, | |
| "mean_token_accuracy": 0.7910007119178772, | |
| "num_tokens": 4273105678.0, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 2.119372266529457, | |
| "grad_norm": 0.18810050189495087, | |
| "learning_rate": 8.272422593638312e-06, | |
| "loss": 0.7012, | |
| "mean_token_accuracy": 0.7865576684474945, | |
| "num_tokens": 4283510594.0, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 2.124517622845382, | |
| "grad_norm": 0.1900843381881714, | |
| "learning_rate": 8.237360690581494e-06, | |
| "loss": 0.6946, | |
| "mean_token_accuracy": 0.7886899948120117, | |
| "num_tokens": 4293904300.0, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 2.129662979161307, | |
| "grad_norm": 0.20008240640163422, | |
| "learning_rate": 8.202435862329992e-06, | |
| "loss": 0.6931, | |
| "mean_token_accuracy": 0.7892868250608445, | |
| "num_tokens": 4304318437.0, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 2.134808335477232, | |
| "grad_norm": 0.19377216696739197, | |
| "learning_rate": 8.167649231970629e-06, | |
| "loss": 0.7087, | |
| "mean_token_accuracy": 0.7848498582839966, | |
| "num_tokens": 4314743524.0, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 2.1399536917931568, | |
| "grad_norm": 0.18533039093017578, | |
| "learning_rate": 8.13300191814616e-06, | |
| "loss": 0.6985, | |
| "mean_token_accuracy": 0.787962692975998, | |
| "num_tokens": 4325118311.0, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 2.1450990481090817, | |
| "grad_norm": 0.18524758517742157, | |
| "learning_rate": 8.098495035019307e-06, | |
| "loss": 0.6933, | |
| "mean_token_accuracy": 0.7891044408082962, | |
| "num_tokens": 4335531487.0, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 2.1502444044250066, | |
| "grad_norm": 0.20954075455665588, | |
| "learning_rate": 8.064129692236914e-06, | |
| "loss": 0.6988, | |
| "mean_token_accuracy": 0.7874211251735688, | |
| "num_tokens": 4345955448.0, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 2.155389760740931, | |
| "grad_norm": 0.19117045402526855, | |
| "learning_rate": 8.029906994894285e-06, | |
| "loss": 0.6847, | |
| "mean_token_accuracy": 0.7915515124797821, | |
| "num_tokens": 4356359783.0, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 2.160535117056856, | |
| "grad_norm": 0.21733401715755463, | |
| "learning_rate": 7.995828043499637e-06, | |
| "loss": 0.6933, | |
| "mean_token_accuracy": 0.7889263033866882, | |
| "num_tokens": 4366739707.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.165680473372781, | |
| "grad_norm": 0.18222884833812714, | |
| "learning_rate": 7.961893933938707e-06, | |
| "loss": 0.7027, | |
| "mean_token_accuracy": 0.7866089105606079, | |
| "num_tokens": 4377106402.0, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 2.170825829688706, | |
| "grad_norm": 0.1800689697265625, | |
| "learning_rate": 7.92810575743952e-06, | |
| "loss": 0.6993, | |
| "mean_token_accuracy": 0.7875191777944565, | |
| "num_tokens": 4387518880.0, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 2.175971186004631, | |
| "grad_norm": 0.1838780641555786, | |
| "learning_rate": 7.89446460053728e-06, | |
| "loss": 0.6941, | |
| "mean_token_accuracy": 0.7888187408447266, | |
| "num_tokens": 4397892379.0, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 2.181116542320556, | |
| "grad_norm": 0.19722041487693787, | |
| "learning_rate": 7.860971545039466e-06, | |
| "loss": 0.6971, | |
| "mean_token_accuracy": 0.7882630676031113, | |
| "num_tokens": 4408268522.0, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 2.186261898636481, | |
| "grad_norm": 0.18653474748134613, | |
| "learning_rate": 7.827627667991e-06, | |
| "loss": 0.6955, | |
| "mean_token_accuracy": 0.787960433959961, | |
| "num_tokens": 4418677517.0, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 2.1914072549524053, | |
| "grad_norm": 0.18152935802936554, | |
| "learning_rate": 7.794434041639651e-06, | |
| "loss": 0.6969, | |
| "mean_token_accuracy": 0.7878915429115295, | |
| "num_tokens": 4429109296.0, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 2.1965526112683302, | |
| "grad_norm": 0.19016778469085693, | |
| "learning_rate": 7.761391733401523e-06, | |
| "loss": 0.6966, | |
| "mean_token_accuracy": 0.7879950881004334, | |
| "num_tokens": 4439524505.0, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 2.201697967584255, | |
| "grad_norm": 0.18335750699043274, | |
| "learning_rate": 7.728501805826751e-06, | |
| "loss": 0.7016, | |
| "mean_token_accuracy": 0.786550509929657, | |
| "num_tokens": 4449922294.0, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 2.20684332390018, | |
| "grad_norm": 0.17287969589233398, | |
| "learning_rate": 7.695765316565326e-06, | |
| "loss": 0.6885, | |
| "mean_token_accuracy": 0.7902193248271943, | |
| "num_tokens": 4460309251.0, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 2.211988680216105, | |
| "grad_norm": 0.17045357823371887, | |
| "learning_rate": 7.66318331833308e-06, | |
| "loss": 0.6822, | |
| "mean_token_accuracy": 0.7919429570436478, | |
| "num_tokens": 4470722415.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 2.21713403653203, | |
| "grad_norm": 0.1837993860244751, | |
| "learning_rate": 7.630756858877835e-06, | |
| "loss": 0.6917, | |
| "mean_token_accuracy": 0.7894322812557221, | |
| "num_tokens": 4481112230.0, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 2.2222793928479545, | |
| "grad_norm": 0.1990688294172287, | |
| "learning_rate": 7.598486980945721e-06, | |
| "loss": 0.696, | |
| "mean_token_accuracy": 0.7881556123495101, | |
| "num_tokens": 4491526989.0, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 2.2274247491638794, | |
| "grad_norm": 0.19104093313217163, | |
| "learning_rate": 7.566374722247625e-06, | |
| "loss": 0.7071, | |
| "mean_token_accuracy": 0.7854187726974488, | |
| "num_tokens": 4501890999.0, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 2.2325701054798044, | |
| "grad_norm": 0.17589214444160461, | |
| "learning_rate": 7.534421115425832e-06, | |
| "loss": 0.7082, | |
| "mean_token_accuracy": 0.7851406782865524, | |
| "num_tokens": 4512263755.0, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 2.2377154617957293, | |
| "grad_norm": 0.18910154700279236, | |
| "learning_rate": 7.502627188020828e-06, | |
| "loss": 0.7018, | |
| "mean_token_accuracy": 0.7865214943885803, | |
| "num_tokens": 4522673409.0, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 2.2428608181116543, | |
| "grad_norm": 0.19419220089912415, | |
| "learning_rate": 7.470993962438233e-06, | |
| "loss": 0.6981, | |
| "mean_token_accuracy": 0.787706145644188, | |
| "num_tokens": 4533050765.0, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 2.248006174427579, | |
| "grad_norm": 0.19314493238925934, | |
| "learning_rate": 7.439522455915941e-06, | |
| "loss": 0.6921, | |
| "mean_token_accuracy": 0.7892083436250686, | |
| "num_tokens": 4543452165.0, | |
| "step": 2185 | |
| }, | |
| { | |
| "epoch": 2.253151530743504, | |
| "grad_norm": 0.21107150614261627, | |
| "learning_rate": 7.408213680491409e-06, | |
| "loss": 0.6969, | |
| "mean_token_accuracy": 0.7882888942956925, | |
| "num_tokens": 4553853739.0, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 2.258296887059429, | |
| "grad_norm": 0.1907646358013153, | |
| "learning_rate": 7.377068642969104e-06, | |
| "loss": 0.6963, | |
| "mean_token_accuracy": 0.7884598582983017, | |
| "num_tokens": 4564274917.0, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 2.2634422433753536, | |
| "grad_norm": 0.17553763091564178, | |
| "learning_rate": 7.346088344888125e-06, | |
| "loss": 0.6951, | |
| "mean_token_accuracy": 0.788986611366272, | |
| "num_tokens": 4574641137.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.2685875996912785, | |
| "grad_norm": 0.18935340642929077, | |
| "learning_rate": 7.315273782490008e-06, | |
| "loss": 0.6994, | |
| "mean_token_accuracy": 0.7872116446495057, | |
| "num_tokens": 4585019689.0, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 2.2737329560072035, | |
| "grad_norm": 0.19580236077308655, | |
| "learning_rate": 7.284625946686685e-06, | |
| "loss": 0.693, | |
| "mean_token_accuracy": 0.7892390996217727, | |
| "num_tokens": 4595438984.0, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 2.2788783123231284, | |
| "grad_norm": 0.19986537098884583, | |
| "learning_rate": 7.254145823028617e-06, | |
| "loss": 0.6936, | |
| "mean_token_accuracy": 0.7890995383262634, | |
| "num_tokens": 4605823855.0, | |
| "step": 2215 | |
| }, | |
| { | |
| "epoch": 2.2840236686390534, | |
| "grad_norm": 0.1742008775472641, | |
| "learning_rate": 7.2238343916730915e-06, | |
| "loss": 0.6993, | |
| "mean_token_accuracy": 0.7871985048055649, | |
| "num_tokens": 4616238871.0, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 2.2891690249549783, | |
| "grad_norm": 0.19267351925373077, | |
| "learning_rate": 7.193692627352726e-06, | |
| "loss": 0.6872, | |
| "mean_token_accuracy": 0.7908998429775238, | |
| "num_tokens": 4626643459.0, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 2.294314381270903, | |
| "grad_norm": 0.17010417580604553, | |
| "learning_rate": 7.163721499344107e-06, | |
| "loss": 0.6966, | |
| "mean_token_accuracy": 0.7883331865072251, | |
| "num_tokens": 4637044833.0, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 2.2994597375868278, | |
| "grad_norm": 0.17460167407989502, | |
| "learning_rate": 7.133921971436622e-06, | |
| "loss": 0.6989, | |
| "mean_token_accuracy": 0.7876080513000489, | |
| "num_tokens": 4647444102.0, | |
| "step": 2235 | |
| }, | |
| { | |
| "epoch": 2.3046050939027527, | |
| "grad_norm": 0.17576715350151062, | |
| "learning_rate": 7.104295001901473e-06, | |
| "loss": 0.6878, | |
| "mean_token_accuracy": 0.7904452890157699, | |
| "num_tokens": 4657824599.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 2.3097504502186776, | |
| "grad_norm": 0.16763444244861603, | |
| "learning_rate": 7.074841543460853e-06, | |
| "loss": 0.6844, | |
| "mean_token_accuracy": 0.7918480813503266, | |
| "num_tokens": 4668181094.0, | |
| "step": 2245 | |
| }, | |
| { | |
| "epoch": 2.3148958065346026, | |
| "grad_norm": 0.1823350489139557, | |
| "learning_rate": 7.0455625432573186e-06, | |
| "loss": 0.6932, | |
| "mean_token_accuracy": 0.7892414182424545, | |
| "num_tokens": 4678584523.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.3200411628505275, | |
| "grad_norm": 0.17247016727924347, | |
| "learning_rate": 7.016458942823321e-06, | |
| "loss": 0.6869, | |
| "mean_token_accuracy": 0.7909802347421646, | |
| "num_tokens": 4688981096.0, | |
| "step": 2255 | |
| }, | |
| { | |
| "epoch": 2.3251865191664525, | |
| "grad_norm": 0.17673242092132568, | |
| "learning_rate": 6.987531678050943e-06, | |
| "loss": 0.6802, | |
| "mean_token_accuracy": 0.7923660695552825, | |
| "num_tokens": 4699404625.0, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 2.330331875482377, | |
| "grad_norm": 0.16854751110076904, | |
| "learning_rate": 6.958781679161788e-06, | |
| "loss": 0.6842, | |
| "mean_token_accuracy": 0.7919197797775268, | |
| "num_tokens": 4709811697.0, | |
| "step": 2265 | |
| }, | |
| { | |
| "epoch": 2.335477231798302, | |
| "grad_norm": 0.1761576384305954, | |
| "learning_rate": 6.930209870677077e-06, | |
| "loss": 0.685, | |
| "mean_token_accuracy": 0.7914377897977829, | |
| "num_tokens": 4720237781.0, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 2.340622588114227, | |
| "grad_norm": 0.17306455969810486, | |
| "learning_rate": 6.901817171387917e-06, | |
| "loss": 0.7019, | |
| "mean_token_accuracy": 0.7869494408369064, | |
| "num_tokens": 4730606260.0, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 2.345767944430152, | |
| "grad_norm": 0.18955180048942566, | |
| "learning_rate": 6.873604494325757e-06, | |
| "loss": 0.6948, | |
| "mean_token_accuracy": 0.7886533975601197, | |
| "num_tokens": 4741014261.0, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 2.3509133007460767, | |
| "grad_norm": 0.1918182373046875, | |
| "learning_rate": 6.845572746733015e-06, | |
| "loss": 0.6907, | |
| "mean_token_accuracy": 0.7898207098245621, | |
| "num_tokens": 4751422939.0, | |
| "step": 2285 | |
| }, | |
| { | |
| "epoch": 2.3560586570620017, | |
| "grad_norm": 0.17227977514266968, | |
| "learning_rate": 6.8177228300339186e-06, | |
| "loss": 0.6926, | |
| "mean_token_accuracy": 0.7893718838691711, | |
| "num_tokens": 4761799091.0, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 2.361204013377926, | |
| "grad_norm": 0.19379264116287231, | |
| "learning_rate": 6.79005563980551e-06, | |
| "loss": 0.6867, | |
| "mean_token_accuracy": 0.7909263670444489, | |
| "num_tokens": 4772203894.0, | |
| "step": 2295 | |
| }, | |
| { | |
| "epoch": 2.366349369693851, | |
| "grad_norm": 0.16864101588726044, | |
| "learning_rate": 6.7625720657488526e-06, | |
| "loss": 0.6954, | |
| "mean_token_accuracy": 0.7885312736034393, | |
| "num_tokens": 4782600873.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.371494726009776, | |
| "grad_norm": 0.17416614294052124, | |
| "learning_rate": 6.735272991660415e-06, | |
| "loss": 0.7108, | |
| "mean_token_accuracy": 0.784038883447647, | |
| "num_tokens": 4793015981.0, | |
| "step": 2305 | |
| }, | |
| { | |
| "epoch": 2.376640082325701, | |
| "grad_norm": 0.17601124942302704, | |
| "learning_rate": 6.708159295403645e-06, | |
| "loss": 0.6931, | |
| "mean_token_accuracy": 0.7890658885240555, | |
| "num_tokens": 4803428229.0, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 2.381785438641626, | |
| "grad_norm": 0.17575791478157043, | |
| "learning_rate": 6.681231848880758e-06, | |
| "loss": 0.6931, | |
| "mean_token_accuracy": 0.7891870647668838, | |
| "num_tokens": 4813832709.0, | |
| "step": 2315 | |
| }, | |
| { | |
| "epoch": 2.386930794957551, | |
| "grad_norm": 0.171165332198143, | |
| "learning_rate": 6.654491518004684e-06, | |
| "loss": 0.6977, | |
| "mean_token_accuracy": 0.7878083676099777, | |
| "num_tokens": 4824217333.0, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 2.392076151273476, | |
| "grad_norm": 0.174204021692276, | |
| "learning_rate": 6.6279391626712195e-06, | |
| "loss": 0.6767, | |
| "mean_token_accuracy": 0.7939162909984588, | |
| "num_tokens": 4834578249.0, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 2.397221507589401, | |
| "grad_norm": 0.17255154252052307, | |
| "learning_rate": 6.601575636731393e-06, | |
| "loss": 0.6853, | |
| "mean_token_accuracy": 0.7911572694778443, | |
| "num_tokens": 4845003859.0, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 2.4023668639053253, | |
| "grad_norm": 0.17027340829372406, | |
| "learning_rate": 6.575401787963991e-06, | |
| "loss": 0.7016, | |
| "mean_token_accuracy": 0.7866772085428237, | |
| "num_tokens": 4855396377.0, | |
| "step": 2335 | |
| }, | |
| { | |
| "epoch": 2.40751222022125, | |
| "grad_norm": 0.1804320216178894, | |
| "learning_rate": 6.549418458048301e-06, | |
| "loss": 0.6944, | |
| "mean_token_accuracy": 0.7887315809726715, | |
| "num_tokens": 4865807607.0, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 2.412657576537175, | |
| "grad_norm": 0.17891834676265717, | |
| "learning_rate": 6.523626482537051e-06, | |
| "loss": 0.6924, | |
| "mean_token_accuracy": 0.7891820967197418, | |
| "num_tokens": 4876203853.0, | |
| "step": 2345 | |
| }, | |
| { | |
| "epoch": 2.4178029328531, | |
| "grad_norm": 0.17687129974365234, | |
| "learning_rate": 6.498026690829529e-06, | |
| "loss": 0.6879, | |
| "mean_token_accuracy": 0.7905671745538712, | |
| "num_tokens": 4886593797.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 2.422948289169025, | |
| "grad_norm": 0.16701921820640564, | |
| "learning_rate": 6.472619906144924e-06, | |
| "loss": 0.7011, | |
| "mean_token_accuracy": 0.7867477118968964, | |
| "num_tokens": 4897020034.0, | |
| "step": 2355 | |
| }, | |
| { | |
| "epoch": 2.42809364548495, | |
| "grad_norm": 0.18092454969882965, | |
| "learning_rate": 6.447406945495843e-06, | |
| "loss": 0.6846, | |
| "mean_token_accuracy": 0.7916429519653321, | |
| "num_tokens": 4907433880.0, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 2.4332390018008745, | |
| "grad_norm": 0.17751817405223846, | |
| "learning_rate": 6.422388619662045e-06, | |
| "loss": 0.694, | |
| "mean_token_accuracy": 0.7888200342655182, | |
| "num_tokens": 4917840148.0, | |
| "step": 2365 | |
| }, | |
| { | |
| "epoch": 2.4383843581167994, | |
| "grad_norm": 0.1927623152732849, | |
| "learning_rate": 6.3975657331643715e-06, | |
| "loss": 0.6959, | |
| "mean_token_accuracy": 0.7883234590291976, | |
| "num_tokens": 4928232237.0, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.4435297144327244, | |
| "grad_norm": 0.17448590695858002, | |
| "learning_rate": 6.3729390842388585e-06, | |
| "loss": 0.7, | |
| "mean_token_accuracy": 0.7875474035739899, | |
| "num_tokens": 4938631938.0, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 2.4486750707486493, | |
| "grad_norm": 0.1870429664850235, | |
| "learning_rate": 6.348509464811088e-06, | |
| "loss": 0.698, | |
| "mean_token_accuracy": 0.7877880901098251, | |
| "num_tokens": 4949047787.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 2.4538204270645743, | |
| "grad_norm": 0.1780596822500229, | |
| "learning_rate": 6.3242776604707144e-06, | |
| "loss": 0.6918, | |
| "mean_token_accuracy": 0.7893176406621933, | |
| "num_tokens": 4959424736.0, | |
| "step": 2385 | |
| }, | |
| { | |
| "epoch": 2.458965783380499, | |
| "grad_norm": 0.16919124126434326, | |
| "learning_rate": 6.300244450446195e-06, | |
| "loss": 0.7012, | |
| "mean_token_accuracy": 0.7870047926902771, | |
| "num_tokens": 4969829870.0, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 2.464111139696424, | |
| "grad_norm": 0.172256737947464, | |
| "learning_rate": 6.27641060757974e-06, | |
| "loss": 0.7041, | |
| "mean_token_accuracy": 0.7858896970748901, | |
| "num_tokens": 4980197224.0, | |
| "step": 2395 | |
| }, | |
| { | |
| "epoch": 2.469256496012349, | |
| "grad_norm": 0.17399129271507263, | |
| "learning_rate": 6.252776898302453e-06, | |
| "loss": 0.6824, | |
| "mean_token_accuracy": 0.7921358823776246, | |
| "num_tokens": 4990600805.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.4744018523282736, | |
| "grad_norm": 0.1758703887462616, | |
| "learning_rate": 6.2293440826097005e-06, | |
| "loss": 0.6961, | |
| "mean_token_accuracy": 0.7880290925502778, | |
| "num_tokens": 5000978207.0, | |
| "step": 2405 | |
| }, | |
| { | |
| "epoch": 2.4795472086441985, | |
| "grad_norm": 0.1842864602804184, | |
| "learning_rate": 6.206112914036657e-06, | |
| "loss": 0.6965, | |
| "mean_token_accuracy": 0.7884801357984543, | |
| "num_tokens": 5011384736.0, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 2.4846925649601235, | |
| "grad_norm": 0.1803123950958252, | |
| "learning_rate": 6.1830841396340705e-06, | |
| "loss": 0.6991, | |
| "mean_token_accuracy": 0.7872378647327423, | |
| "num_tokens": 5021771302.0, | |
| "step": 2415 | |
| }, | |
| { | |
| "epoch": 2.4898379212760484, | |
| "grad_norm": 0.17343245446681976, | |
| "learning_rate": 6.160258499944255e-06, | |
| "loss": 0.6899, | |
| "mean_token_accuracy": 0.7900263160467148, | |
| "num_tokens": 5032158385.0, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 2.4949832775919734, | |
| "grad_norm": 0.18295209109783173, | |
| "learning_rate": 6.137636728977267e-06, | |
| "loss": 0.6873, | |
| "mean_token_accuracy": 0.7904597342014312, | |
| "num_tokens": 5042580817.0, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 2.500128633907898, | |
| "grad_norm": 0.1744978278875351, | |
| "learning_rate": 6.115219554187303e-06, | |
| "loss": 0.6944, | |
| "mean_token_accuracy": 0.7883099675178528, | |
| "num_tokens": 5052996785.0, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 2.505273990223823, | |
| "grad_norm": 0.17044760286808014, | |
| "learning_rate": 6.0930076964493034e-06, | |
| "loss": 0.7044, | |
| "mean_token_accuracy": 0.7858549505472183, | |
| "num_tokens": 5063403777.0, | |
| "step": 2435 | |
| }, | |
| { | |
| "epoch": 2.5104193465397477, | |
| "grad_norm": 0.1745147705078125, | |
| "learning_rate": 6.07100187003578e-06, | |
| "loss": 0.6946, | |
| "mean_token_accuracy": 0.7888891041278839, | |
| "num_tokens": 5073787624.0, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 2.5155647028556727, | |
| "grad_norm": 0.17163583636283875, | |
| "learning_rate": 6.049202782593837e-06, | |
| "loss": 0.7091, | |
| "mean_token_accuracy": 0.784814390540123, | |
| "num_tokens": 5084155762.0, | |
| "step": 2445 | |
| }, | |
| { | |
| "epoch": 2.5207100591715976, | |
| "grad_norm": 0.1630815863609314, | |
| "learning_rate": 6.027611135122423e-06, | |
| "loss": 0.6833, | |
| "mean_token_accuracy": 0.7919480204582214, | |
| "num_tokens": 5094520579.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 2.5258554154875226, | |
| "grad_norm": 0.169570654630661, | |
| "learning_rate": 6.006227621949783e-06, | |
| "loss": 0.6912, | |
| "mean_token_accuracy": 0.7897911489009857, | |
| "num_tokens": 5104935332.0, | |
| "step": 2455 | |
| }, | |
| { | |
| "epoch": 2.5310007718034475, | |
| "grad_norm": 0.17307248711585999, | |
| "learning_rate": 5.985052930711133e-06, | |
| "loss": 0.686, | |
| "mean_token_accuracy": 0.7910365283489227, | |
| "num_tokens": 5115312123.0, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 2.5361461281193725, | |
| "grad_norm": 0.1717165857553482, | |
| "learning_rate": 5.964087742326549e-06, | |
| "loss": 0.7048, | |
| "mean_token_accuracy": 0.7863658338785171, | |
| "num_tokens": 5125722883.0, | |
| "step": 2465 | |
| }, | |
| { | |
| "epoch": 2.5412914844352974, | |
| "grad_norm": 0.16661353409290314, | |
| "learning_rate": 5.943332730979067e-06, | |
| "loss": 0.6982, | |
| "mean_token_accuracy": 0.7878574222326279, | |
| "num_tokens": 5136118397.0, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 2.546436840751222, | |
| "grad_norm": 0.18003134429454803, | |
| "learning_rate": 5.922788564093009e-06, | |
| "loss": 0.6942, | |
| "mean_token_accuracy": 0.788626492023468, | |
| "num_tokens": 5146490125.0, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 2.551582197067147, | |
| "grad_norm": 0.17579644918441772, | |
| "learning_rate": 5.902455902312511e-06, | |
| "loss": 0.7027, | |
| "mean_token_accuracy": 0.7862021327018738, | |
| "num_tokens": 5156884568.0, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 2.556727553383072, | |
| "grad_norm": 0.1727745682001114, | |
| "learning_rate": 5.88233539948029e-06, | |
| "loss": 0.6925, | |
| "mean_token_accuracy": 0.7890074193477631, | |
| "num_tokens": 5167297476.0, | |
| "step": 2485 | |
| }, | |
| { | |
| "epoch": 2.5618729096989967, | |
| "grad_norm": 0.18698406219482422, | |
| "learning_rate": 5.862427702616605e-06, | |
| "loss": 0.6831, | |
| "mean_token_accuracy": 0.7916372120380402, | |
| "num_tokens": 5177725329.0, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 2.5670182660149217, | |
| "grad_norm": 0.17920783162117004, | |
| "learning_rate": 5.842733451898467e-06, | |
| "loss": 0.7028, | |
| "mean_token_accuracy": 0.7861807703971863, | |
| "num_tokens": 5188136857.0, | |
| "step": 2495 | |
| }, | |
| { | |
| "epoch": 2.572163622330846, | |
| "grad_norm": 0.17899206280708313, | |
| "learning_rate": 5.823253280639039e-06, | |
| "loss": 0.6814, | |
| "mean_token_accuracy": 0.7923789769411087, | |
| "num_tokens": 5198537170.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.577308978646771, | |
| "grad_norm": 0.17629799246788025, | |
| "learning_rate": 5.803987815267268e-06, | |
| "loss": 0.6979, | |
| "mean_token_accuracy": 0.7875349700450898, | |
| "num_tokens": 5208920419.0, | |
| "step": 2505 | |
| }, | |
| { | |
| "epoch": 2.582454334962696, | |
| "grad_norm": 0.16480837762355804, | |
| "learning_rate": 5.7849376753077625e-06, | |
| "loss": 0.6856, | |
| "mean_token_accuracy": 0.7911129057407379, | |
| "num_tokens": 5219341909.0, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 2.587599691278621, | |
| "grad_norm": 0.1704104244709015, | |
| "learning_rate": 5.766103473360842e-06, | |
| "loss": 0.6955, | |
| "mean_token_accuracy": 0.7883382886648178, | |
| "num_tokens": 5229692388.0, | |
| "step": 2515 | |
| }, | |
| { | |
| "epoch": 2.592745047594546, | |
| "grad_norm": 0.17449352145195007, | |
| "learning_rate": 5.74748581508286e-06, | |
| "loss": 0.6943, | |
| "mean_token_accuracy": 0.788813516497612, | |
| "num_tokens": 5240057257.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 2.597890403910471, | |
| "grad_norm": 0.17114900052547455, | |
| "learning_rate": 5.729085299166713e-06, | |
| "loss": 0.6925, | |
| "mean_token_accuracy": 0.7888531744480133, | |
| "num_tokens": 5250472272.0, | |
| "step": 2525 | |
| }, | |
| { | |
| "epoch": 2.603035760226396, | |
| "grad_norm": 0.19473806023597717, | |
| "learning_rate": 5.710902517322597e-06, | |
| "loss": 0.7034, | |
| "mean_token_accuracy": 0.7864585638046264, | |
| "num_tokens": 5260858183.0, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 2.6081811165423208, | |
| "grad_norm": 0.17495019733905792, | |
| "learning_rate": 5.6929380542589764e-06, | |
| "loss": 0.6919, | |
| "mean_token_accuracy": 0.7893568813800812, | |
| "num_tokens": 5271278572.0, | |
| "step": 2535 | |
| }, | |
| { | |
| "epoch": 2.6133264728582457, | |
| "grad_norm": 0.16885867714881897, | |
| "learning_rate": 5.675192487663777e-06, | |
| "loss": 0.6922, | |
| "mean_token_accuracy": 0.7892817795276642, | |
| "num_tokens": 5281676141.0, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 2.61847182917417, | |
| "grad_norm": 0.17458729445934296, | |
| "learning_rate": 5.657666388185823e-06, | |
| "loss": 0.6925, | |
| "mean_token_accuracy": 0.7891670197248459, | |
| "num_tokens": 5292081395.0, | |
| "step": 2545 | |
| }, | |
| { | |
| "epoch": 2.623617185490095, | |
| "grad_norm": 0.17178680002689362, | |
| "learning_rate": 5.640360319416467e-06, | |
| "loss": 0.6888, | |
| "mean_token_accuracy": 0.7902668923139572, | |
| "num_tokens": 5302469310.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 2.62876254180602, | |
| "grad_norm": 0.1776588261127472, | |
| "learning_rate": 5.623274837871483e-06, | |
| "loss": 0.694, | |
| "mean_token_accuracy": 0.7890095263719559, | |
| "num_tokens": 5312857824.0, | |
| "step": 2555 | |
| }, | |
| { | |
| "epoch": 2.633907898121945, | |
| "grad_norm": 0.17728105187416077, | |
| "learning_rate": 5.606410492973162e-06, | |
| "loss": 0.6885, | |
| "mean_token_accuracy": 0.7901356816291809, | |
| "num_tokens": 5323248156.0, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 2.63905325443787, | |
| "grad_norm": 0.16960322856903076, | |
| "learning_rate": 5.589767827032649e-06, | |
| "loss": 0.7014, | |
| "mean_token_accuracy": 0.7869833618402481, | |
| "num_tokens": 5333628919.0, | |
| "step": 2565 | |
| }, | |
| { | |
| "epoch": 2.6441986107537945, | |
| "grad_norm": 0.17245355248451233, | |
| "learning_rate": 5.573347375232493e-06, | |
| "loss": 0.6918, | |
| "mean_token_accuracy": 0.7889738440513611, | |
| "num_tokens": 5344021897.0, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 2.6493439670697194, | |
| "grad_norm": 0.16664022207260132, | |
| "learning_rate": 5.557149665609455e-06, | |
| "loss": 0.7, | |
| "mean_token_accuracy": 0.7870820313692093, | |
| "num_tokens": 5354420664.0, | |
| "step": 2575 | |
| }, | |
| { | |
| "epoch": 2.6544893233856444, | |
| "grad_norm": 0.17978526651859283, | |
| "learning_rate": 5.54117521903751e-06, | |
| "loss": 0.6783, | |
| "mean_token_accuracy": 0.793021947145462, | |
| "num_tokens": 5364835797.0, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 2.6596346797015693, | |
| "grad_norm": 0.17351332306861877, | |
| "learning_rate": 5.525424549211112e-06, | |
| "loss": 0.6964, | |
| "mean_token_accuracy": 0.7881974041461944, | |
| "num_tokens": 5375251218.0, | |
| "step": 2585 | |
| }, | |
| { | |
| "epoch": 2.6647800360174942, | |
| "grad_norm": 0.1651052087545395, | |
| "learning_rate": 5.509898162628657e-06, | |
| "loss": 0.6956, | |
| "mean_token_accuracy": 0.7883888274431229, | |
| "num_tokens": 5385653462.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 2.669925392333419, | |
| "grad_norm": 0.17517080903053284, | |
| "learning_rate": 5.494596558576215e-06, | |
| "loss": 0.7, | |
| "mean_token_accuracy": 0.7868314325809479, | |
| "num_tokens": 5396055523.0, | |
| "step": 2595 | |
| }, | |
| { | |
| "epoch": 2.675070748649344, | |
| "grad_norm": 0.18584103882312775, | |
| "learning_rate": 5.4795202291114655e-06, | |
| "loss": 0.6948, | |
| "mean_token_accuracy": 0.7886532038450241, | |
| "num_tokens": 5406483427.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.680216104965269, | |
| "grad_norm": 0.1600140780210495, | |
| "learning_rate": 5.464669659047871e-06, | |
| "loss": 0.7105, | |
| "mean_token_accuracy": 0.7844233065843582, | |
| "num_tokens": 5416893350.0, | |
| "step": 2605 | |
| }, | |
| { | |
| "epoch": 2.6853614612811936, | |
| "grad_norm": 0.16962645947933197, | |
| "learning_rate": 5.450045325939086e-06, | |
| "loss": 0.699, | |
| "mean_token_accuracy": 0.7872962862253189, | |
| "num_tokens": 5427314991.0, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 2.6905068175971185, | |
| "grad_norm": 0.16842614114284515, | |
| "learning_rate": 5.4356477000636155e-06, | |
| "loss": 0.696, | |
| "mean_token_accuracy": 0.7881788671016693, | |
| "num_tokens": 5437722441.0, | |
| "step": 2615 | |
| }, | |
| { | |
| "epoch": 2.6956521739130435, | |
| "grad_norm": 0.1736549735069275, | |
| "learning_rate": 5.42147724440967e-06, | |
| "loss": 0.6804, | |
| "mean_token_accuracy": 0.7926200598478317, | |
| "num_tokens": 5448128609.0, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 2.7007975302289684, | |
| "grad_norm": 0.17829610407352448, | |
| "learning_rate": 5.407534414660296e-06, | |
| "loss": 0.7076, | |
| "mean_token_accuracy": 0.7849996328353882, | |
| "num_tokens": 5458552679.0, | |
| "step": 2625 | |
| }, | |
| { | |
| "epoch": 2.7059428865448933, | |
| "grad_norm": 0.17205742001533508, | |
| "learning_rate": 5.3938196591787055e-06, | |
| "loss": 0.6886, | |
| "mean_token_accuracy": 0.7903157830238342, | |
| "num_tokens": 5468959670.0, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 2.711088242860818, | |
| "grad_norm": 0.19865980744361877, | |
| "learning_rate": 5.380333418993874e-06, | |
| "loss": 0.6969, | |
| "mean_token_accuracy": 0.7879400312900543, | |
| "num_tokens": 5479359994.0, | |
| "step": 2635 | |
| }, | |
| { | |
| "epoch": 2.716233599176743, | |
| "grad_norm": 0.17879824340343475, | |
| "learning_rate": 5.367076127786349e-06, | |
| "loss": 0.6799, | |
| "mean_token_accuracy": 0.7927328020334243, | |
| "num_tokens": 5489776454.0, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 2.7213789554926677, | |
| "grad_norm": 0.16733145713806152, | |
| "learning_rate": 5.354048211874305e-06, | |
| "loss": 0.7009, | |
| "mean_token_accuracy": 0.7868389576673508, | |
| "num_tokens": 5500195720.0, | |
| "step": 2645 | |
| }, | |
| { | |
| "epoch": 2.7265243118085927, | |
| "grad_norm": 0.17073538899421692, | |
| "learning_rate": 5.341250090199836e-06, | |
| "loss": 0.689, | |
| "mean_token_accuracy": 0.7902264356613159, | |
| "num_tokens": 5510590496.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 2.7316696681245176, | |
| "grad_norm": 0.16798222064971924, | |
| "learning_rate": 5.328682174315484e-06, | |
| "loss": 0.6997, | |
| "mean_token_accuracy": 0.7871428996324539, | |
| "num_tokens": 5520984194.0, | |
| "step": 2655 | |
| }, | |
| { | |
| "epoch": 2.7368150244404426, | |
| "grad_norm": 0.17511659860610962, | |
| "learning_rate": 5.316344868370999e-06, | |
| "loss": 0.7027, | |
| "mean_token_accuracy": 0.7862639844417572, | |
| "num_tokens": 5531378820.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 2.7419603807563675, | |
| "grad_norm": 0.1656564474105835, | |
| "learning_rate": 5.304238569100351e-06, | |
| "loss": 0.6919, | |
| "mean_token_accuracy": 0.7892627060413361, | |
| "num_tokens": 5541788819.0, | |
| "step": 2665 | |
| }, | |
| { | |
| "epoch": 2.7471057370722924, | |
| "grad_norm": 0.1768578141927719, | |
| "learning_rate": 5.2923636658089674e-06, | |
| "loss": 0.6951, | |
| "mean_token_accuracy": 0.7887540727853775, | |
| "num_tokens": 5552136282.0, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 2.7522510933882174, | |
| "grad_norm": 0.16567444801330566, | |
| "learning_rate": 5.280720540361213e-06, | |
| "loss": 0.6902, | |
| "mean_token_accuracy": 0.7896697282791137, | |
| "num_tokens": 5562532385.0, | |
| "step": 2675 | |
| }, | |
| { | |
| "epoch": 2.757396449704142, | |
| "grad_norm": 0.16643787920475006, | |
| "learning_rate": 5.2693095671681125e-06, | |
| "loss": 0.6946, | |
| "mean_token_accuracy": 0.7891028523445129, | |
| "num_tokens": 5572953965.0, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 2.762541806020067, | |
| "grad_norm": 0.1679885983467102, | |
| "learning_rate": 5.258131113175312e-06, | |
| "loss": 0.6928, | |
| "mean_token_accuracy": 0.7889256983995437, | |
| "num_tokens": 5583365916.0, | |
| "step": 2685 | |
| }, | |
| { | |
| "epoch": 2.7676871623359918, | |
| "grad_norm": 0.17079950869083405, | |
| "learning_rate": 5.247185537851277e-06, | |
| "loss": 0.693, | |
| "mean_token_accuracy": 0.7887627691030502, | |
| "num_tokens": 5593766520.0, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 2.7728325186519167, | |
| "grad_norm": 0.17430275678634644, | |
| "learning_rate": 5.236473193175727e-06, | |
| "loss": 0.693, | |
| "mean_token_accuracy": 0.7892514944076539, | |
| "num_tokens": 5604146645.0, | |
| "step": 2695 | |
| }, | |
| { | |
| "epoch": 2.7779778749678417, | |
| "grad_norm": 0.1720867156982422, | |
| "learning_rate": 5.225994423628329e-06, | |
| "loss": 0.6982, | |
| "mean_token_accuracy": 0.7877674490213394, | |
| "num_tokens": 5614554383.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.783123231283766, | |
| "grad_norm": 0.17518489062786102, | |
| "learning_rate": 5.215749566177612e-06, | |
| "loss": 0.6908, | |
| "mean_token_accuracy": 0.789122948050499, | |
| "num_tokens": 5624946901.0, | |
| "step": 2705 | |
| }, | |
| { | |
| "epoch": 2.788268587599691, | |
| "grad_norm": 0.17626863718032837, | |
| "learning_rate": 5.2057389502701315e-06, | |
| "loss": 0.6962, | |
| "mean_token_accuracy": 0.7886440306901932, | |
| "num_tokens": 5635349602.0, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 2.793413943915616, | |
| "grad_norm": 0.17560714483261108, | |
| "learning_rate": 5.19596289781988e-06, | |
| "loss": 0.6859, | |
| "mean_token_accuracy": 0.7909172236919403, | |
| "num_tokens": 5645756384.0, | |
| "step": 2715 | |
| }, | |
| { | |
| "epoch": 2.798559300231541, | |
| "grad_norm": 0.16886913776397705, | |
| "learning_rate": 5.186421723197922e-06, | |
| "loss": 0.6868, | |
| "mean_token_accuracy": 0.7907733172178268, | |
| "num_tokens": 5656178271.0, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 2.803704656547466, | |
| "grad_norm": 0.1693251132965088, | |
| "learning_rate": 5.177115733222307e-06, | |
| "loss": 0.6947, | |
| "mean_token_accuracy": 0.7886239379644394, | |
| "num_tokens": 5666600795.0, | |
| "step": 2725 | |
| }, | |
| { | |
| "epoch": 2.808850012863391, | |
| "grad_norm": 0.16803273558616638, | |
| "learning_rate": 5.168045227148184e-06, | |
| "loss": 0.6972, | |
| "mean_token_accuracy": 0.787845715880394, | |
| "num_tokens": 5676999671.0, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 2.813995369179316, | |
| "grad_norm": 0.16651484370231628, | |
| "learning_rate": 5.159210496658182e-06, | |
| "loss": 0.6884, | |
| "mean_token_accuracy": 0.7903055369853973, | |
| "num_tokens": 5687397402.0, | |
| "step": 2735 | |
| }, | |
| { | |
| "epoch": 2.8191407254952408, | |
| "grad_norm": 0.1660391241312027, | |
| "learning_rate": 5.15061182585304e-06, | |
| "loss": 0.6792, | |
| "mean_token_accuracy": 0.7931360393762589, | |
| "num_tokens": 5697794080.0, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 2.8242860818111657, | |
| "grad_norm": 0.17520754039287567, | |
| "learning_rate": 5.1422494912424595e-06, | |
| "loss": 0.6986, | |
| "mean_token_accuracy": 0.7876220345497131, | |
| "num_tokens": 5708182662.0, | |
| "step": 2745 | |
| }, | |
| { | |
| "epoch": 2.82943143812709, | |
| "grad_norm": 0.16872966289520264, | |
| "learning_rate": 5.134123761736216e-06, | |
| "loss": 0.6966, | |
| "mean_token_accuracy": 0.7881220698356628, | |
| "num_tokens": 5718594864.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 2.834576794443015, | |
| "grad_norm": 0.16700060665607452, | |
| "learning_rate": 5.126234898635518e-06, | |
| "loss": 0.6942, | |
| "mean_token_accuracy": 0.788716048002243, | |
| "num_tokens": 5728995814.0, | |
| "step": 2755 | |
| }, | |
| { | |
| "epoch": 2.83972215075894, | |
| "grad_norm": 0.17934595048427582, | |
| "learning_rate": 5.118583155624593e-06, | |
| "loss": 0.6929, | |
| "mean_token_accuracy": 0.7886988967657089, | |
| "num_tokens": 5739395668.0, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 2.844867507074865, | |
| "grad_norm": 0.17512430250644684, | |
| "learning_rate": 5.111168778762542e-06, | |
| "loss": 0.6938, | |
| "mean_token_accuracy": 0.7889688044786454, | |
| "num_tokens": 5749807349.0, | |
| "step": 2765 | |
| }, | |
| { | |
| "epoch": 2.85001286339079, | |
| "grad_norm": 0.1730797290802002, | |
| "learning_rate": 5.103992006475416e-06, | |
| "loss": 0.6951, | |
| "mean_token_accuracy": 0.7884382456541061, | |
| "num_tokens": 5760209986.0, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 2.8551582197067145, | |
| "grad_norm": 0.17350102961063385, | |
| "learning_rate": 5.097053069548554e-06, | |
| "loss": 0.6985, | |
| "mean_token_accuracy": 0.7875759869813919, | |
| "num_tokens": 5770624277.0, | |
| "step": 2775 | |
| }, | |
| { | |
| "epoch": 2.8603035760226394, | |
| "grad_norm": 0.16581964492797852, | |
| "learning_rate": 5.090352191119167e-06, | |
| "loss": 0.6968, | |
| "mean_token_accuracy": 0.7881747186183929, | |
| "num_tokens": 5780998524.0, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 2.8654489323385643, | |
| "grad_norm": 0.1817171722650528, | |
| "learning_rate": 5.083889586669148e-06, | |
| "loss": 0.6957, | |
| "mean_token_accuracy": 0.7882327765226365, | |
| "num_tokens": 5791408666.0, | |
| "step": 2785 | |
| }, | |
| { | |
| "epoch": 2.8705942886544893, | |
| "grad_norm": 0.18993426859378815, | |
| "learning_rate": 5.077665464018158e-06, | |
| "loss": 0.7035, | |
| "mean_token_accuracy": 0.7860898345708847, | |
| "num_tokens": 5801820214.0, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 2.8757396449704142, | |
| "grad_norm": 0.21299272775650024, | |
| "learning_rate": 5.071680023316934e-06, | |
| "loss": 0.688, | |
| "mean_token_accuracy": 0.7904294729232788, | |
| "num_tokens": 5812235390.0, | |
| "step": 2795 | |
| }, | |
| { | |
| "epoch": 2.880885001286339, | |
| "grad_norm": 0.17882496118545532, | |
| "learning_rate": 5.065933457040855e-06, | |
| "loss": 0.6996, | |
| "mean_token_accuracy": 0.7872415781021118, | |
| "num_tokens": 5822639708.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.886030357602264, | |
| "grad_norm": 0.16855992376804352, | |
| "learning_rate": 5.060425949983754e-06, | |
| "loss": 0.682, | |
| "mean_token_accuracy": 0.7919277369976043, | |
| "num_tokens": 5833034354.0, | |
| "step": 2805 | |
| }, | |
| { | |
| "epoch": 2.891175713918189, | |
| "grad_norm": 0.17891888320446014, | |
| "learning_rate": 5.055157679251973e-06, | |
| "loss": 0.6899, | |
| "mean_token_accuracy": 0.7897359609603882, | |
| "num_tokens": 5843403693.0, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 2.8963210702341136, | |
| "grad_norm": 0.16349950432777405, | |
| "learning_rate": 5.05012881425867e-06, | |
| "loss": 0.6863, | |
| "mean_token_accuracy": 0.7908827304840088, | |
| "num_tokens": 5853803722.0, | |
| "step": 2815 | |
| }, | |
| { | |
| "epoch": 2.9014664265500385, | |
| "grad_norm": 0.1745273768901825, | |
| "learning_rate": 5.045339516718369e-06, | |
| "loss": 0.6893, | |
| "mean_token_accuracy": 0.7903022348880768, | |
| "num_tokens": 5864184533.0, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 2.9066117828659634, | |
| "grad_norm": 0.16056472063064575, | |
| "learning_rate": 5.0407899406417626e-06, | |
| "loss": 0.6952, | |
| "mean_token_accuracy": 0.7885043084621429, | |
| "num_tokens": 5874589484.0, | |
| "step": 2825 | |
| }, | |
| { | |
| "epoch": 2.9117571391818884, | |
| "grad_norm": 0.17720390856266022, | |
| "learning_rate": 5.036480232330756e-06, | |
| "loss": 0.6936, | |
| "mean_token_accuracy": 0.7888238668441773, | |
| "num_tokens": 5885001020.0, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 2.9169024954978133, | |
| "grad_norm": 0.16652604937553406, | |
| "learning_rate": 5.032410530373764e-06, | |
| "loss": 0.7, | |
| "mean_token_accuracy": 0.7873083800077438, | |
| "num_tokens": 5895408701.0, | |
| "step": 2835 | |
| }, | |
| { | |
| "epoch": 2.922047851813738, | |
| "grad_norm": 0.17051072418689728, | |
| "learning_rate": 5.028580965641256e-06, | |
| "loss": 0.6925, | |
| "mean_token_accuracy": 0.7890042126178741, | |
| "num_tokens": 5905806664.0, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 2.9271932081296628, | |
| "grad_norm": 0.16726641356945038, | |
| "learning_rate": 5.024991661281546e-06, | |
| "loss": 0.6962, | |
| "mean_token_accuracy": 0.7878126442432404, | |
| "num_tokens": 5916184097.0, | |
| "step": 2845 | |
| }, | |
| { | |
| "epoch": 2.9323385644455877, | |
| "grad_norm": 0.16886568069458008, | |
| "learning_rate": 5.0216427327168295e-06, | |
| "loss": 0.6861, | |
| "mean_token_accuracy": 0.7910468071699143, | |
| "num_tokens": 5926604231.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 2.9374839207615127, | |
| "grad_norm": 0.1645469218492508, | |
| "learning_rate": 5.0185342876394775e-06, | |
| "loss": 0.6954, | |
| "mean_token_accuracy": 0.7883421629667282, | |
| "num_tokens": 5937011174.0, | |
| "step": 2855 | |
| }, | |
| { | |
| "epoch": 2.9426292770774376, | |
| "grad_norm": 0.17823590338230133, | |
| "learning_rate": 5.0156664260085695e-06, | |
| "loss": 0.6896, | |
| "mean_token_accuracy": 0.7900129020214081, | |
| "num_tokens": 5947409312.0, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 2.9477746333933625, | |
| "grad_norm": 0.18835224211215973, | |
| "learning_rate": 5.0130392400466835e-06, | |
| "loss": 0.689, | |
| "mean_token_accuracy": 0.7900780767202378, | |
| "num_tokens": 5957805326.0, | |
| "step": 2865 | |
| }, | |
| { | |
| "epoch": 2.9529199897092875, | |
| "grad_norm": 0.1715887039899826, | |
| "learning_rate": 5.010652814236921e-06, | |
| "loss": 0.6909, | |
| "mean_token_accuracy": 0.7899001896381378, | |
| "num_tokens": 5968218507.0, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 2.9580653460252124, | |
| "grad_norm": 0.18797433376312256, | |
| "learning_rate": 5.008507225320203e-06, | |
| "loss": 0.704, | |
| "mean_token_accuracy": 0.7860912084579468, | |
| "num_tokens": 5978622283.0, | |
| "step": 2875 | |
| }, | |
| { | |
| "epoch": 2.9632107023411374, | |
| "grad_norm": 0.17873001098632812, | |
| "learning_rate": 5.00660254229279e-06, | |
| "loss": 0.6968, | |
| "mean_token_accuracy": 0.7881993442773819, | |
| "num_tokens": 5988997989.0, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 2.968356058657062, | |
| "grad_norm": 0.18319903314113617, | |
| "learning_rate": 5.004938826404073e-06, | |
| "loss": 0.6993, | |
| "mean_token_accuracy": 0.787423062324524, | |
| "num_tokens": 5999359936.0, | |
| "step": 2885 | |
| }, | |
| { | |
| "epoch": 2.973501414972987, | |
| "grad_norm": 0.1805352121591568, | |
| "learning_rate": 5.003516131154598e-06, | |
| "loss": 0.6972, | |
| "mean_token_accuracy": 0.7876656591892243, | |
| "num_tokens": 6009761882.0, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 2.9786467712889118, | |
| "grad_norm": 0.18117545545101166, | |
| "learning_rate": 5.002334502294346e-06, | |
| "loss": 0.6947, | |
| "mean_token_accuracy": 0.7882909893989563, | |
| "num_tokens": 6020167222.0, | |
| "step": 2895 | |
| }, | |
| { | |
| "epoch": 2.9837921276048367, | |
| "grad_norm": 0.1723506599664688, | |
| "learning_rate": 5.001393977821266e-06, | |
| "loss": 0.7042, | |
| "mean_token_accuracy": 0.7860449641942978, | |
| "num_tokens": 6030555418.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.9889374839207616, | |
| "grad_norm": 0.19804443418979645, | |
| "learning_rate": 5.00069458798005e-06, | |
| "loss": 0.6997, | |
| "mean_token_accuracy": 0.7871603965759277, | |
| "num_tokens": 6040947002.0, | |
| "step": 2905 | |
| }, | |
| { | |
| "epoch": 2.994082840236686, | |
| "grad_norm": 0.16171956062316895, | |
| "learning_rate": 5.000236355261159e-06, | |
| "loss": 0.6994, | |
| "mean_token_accuracy": 0.787379264831543, | |
| "num_tokens": 6051343009.0, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 2.999228196552611, | |
| "grad_norm": 0.17762000858783722, | |
| "learning_rate": 5.000019294400102e-06, | |
| "loss": 0.6875, | |
| "mean_token_accuracy": 0.7905127763748169, | |
| "num_tokens": 6061722567.0, | |
| "step": 2915 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 2916, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 450, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.2789425421603045e+19, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
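
The structure above matches a Hugging Face `Trainer` state file (`trainer_state.json`): a `log_history` list of per-step records (loss, learning rate, mean token accuracy, token counts) followed by run-level metadata. As a minimal sketch only (the filename and the summary printed below are assumptions for illustration, not part of the original log), the logged curves can be read back like this:

```python
import json

# Hypothetical usage: load the checkpoint state file shown above
# (assumed to be saved under the standard name "trainer_state.json")
# and summarize the logged loss / learning-rate trajectory.
with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]
steps  = [e["step"] for e in history if "loss" in e]
losses = [e["loss"] for e in history if "loss" in e]
lrs    = [e["learning_rate"] for e in history if "learning_rate" in e]

print(f"logged points: {len(steps)} (global_step={state['global_step']})")
print(f"first loss: {losses[0]:.4f} at step {steps[0]}")
print(f"last loss:  {losses[-1]:.4f} at step {steps[-1]}")
print(f"final learning rate: {lrs[-1]:.3e}")
```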