{
  "best_metric": 1.2152043581008911,
  "best_model_checkpoint": "./output/checkpoint-4650",
  "epoch": 0.3073567321039064,
  "eval_steps": 150,
  "global_step": 4650,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0006609822195782933,
      "grad_norm": 7.413546562194824,
      "learning_rate": 2.2360679774997904e-06,
      "loss": 1.2392,
      "step": 10
    },
    {
      "epoch": 0.0013219644391565867,
      "grad_norm": 7.08538818359375,
      "learning_rate": 4.472135954999581e-06,
      "loss": 1.2951,
      "step": 20
    },
    {
      "epoch": 0.00198294665873488,
      "grad_norm": 15.173999786376953,
      "learning_rate": 6.70820393249937e-06,
      "loss": 1.3208,
      "step": 30
    },
    {
      "epoch": 0.0026439288783131733,
      "grad_norm": 7.055360317230225,
      "learning_rate": 8.944271909999161e-06,
      "loss": 1.2641,
      "step": 40
    },
    {
      "epoch": 0.003304911097891467,
      "grad_norm": 8.638155937194824,
      "learning_rate": 1.118033988749895e-05,
      "loss": 1.2835,
      "step": 50
    },
    {
      "epoch": 0.00396589331746976,
      "grad_norm": 7.482174396514893,
      "learning_rate": 1.341640786499874e-05,
      "loss": 1.1253,
      "step": 60
    },
    {
      "epoch": 0.0046268755370480535,
      "grad_norm": 11.88020133972168,
      "learning_rate": 1.565247584249853e-05,
      "loss": 1.1857,
      "step": 70
    },
    {
      "epoch": 0.005287857756626347,
      "grad_norm": 13.985732078552246,
      "learning_rate": 1.7888543819998323e-05,
      "loss": 1.3447,
      "step": 80
    },
    {
      "epoch": 0.00594883997620464,
      "grad_norm": 7.102285861968994,
      "learning_rate": 2.0124611797498112e-05,
      "loss": 1.3336,
      "step": 90
    },
    {
      "epoch": 0.006609822195782934,
      "grad_norm": 7.646865367889404,
      "learning_rate": 2.23606797749979e-05,
      "loss": 1.176,
      "step": 100
    },
    {
      "epoch": 0.007270804415361227,
      "grad_norm": 6.750139236450195,
      "learning_rate": 2.236044998500671e-05,
      "loss": 1.2429,
      "step": 110
    },
    {
      "epoch": 0.00793178663493952,
      "grad_norm": 10.000078201293945,
      "learning_rate": 2.235976062447891e-05,
      "loss": 1.3139,
      "step": 120
    },
    {
      "epoch": 0.008592768854517813,
      "grad_norm": 12.12943172454834,
      "learning_rate": 2.2358611721751407e-05,
      "loss": 1.3145,
      "step": 130
    },
    {
      "epoch": 0.009253751074096107,
      "grad_norm": 7.1956071853637695,
      "learning_rate": 2.2357003324051093e-05,
      "loss": 1.3055,
      "step": 140
    },
    {
      "epoch": 0.009914733293674401,
      "grad_norm": 6.159770965576172,
      "learning_rate": 2.23549354974929e-05,
      "loss": 1.3298,
      "step": 150
    },
    {
      "epoch": 0.009914733293674401,
      "eval_loss": 1.3606581687927246,
      "eval_runtime": 45.5267,
      "eval_samples_per_second": 11.005,
      "eval_steps_per_second": 11.005,
      "step": 150
    },
    {
      "epoch": 0.010575715513252693,
      "grad_norm": 15.24757194519043,
      "learning_rate": 2.2352408327077078e-05,
      "loss": 1.303,
      "step": 160
    },
    {
      "epoch": 0.011236697732830987,
      "grad_norm": 10.154984474182129,
      "learning_rate": 2.2349421916685704e-05,
      "loss": 1.2568,
      "step": 170
    },
    {
      "epoch": 0.01189767995240928,
      "grad_norm": 7.64827299118042,
      "learning_rate": 2.234597638907841e-05,
      "loss": 1.27,
      "step": 180
    },
    {
      "epoch": 0.012558662171987573,
      "grad_norm": 10.21170711517334,
      "learning_rate": 2.2342071885887346e-05,
      "loss": 1.2995,
      "step": 190
    },
    {
      "epoch": 0.013219644391565867,
      "grad_norm": 10.44480037689209,
      "learning_rate": 2.2337708567611343e-05,
      "loss": 1.3509,
      "step": 200
    },
    {
      "epoch": 0.01388062661114416,
      "grad_norm": 7.435905456542969,
      "learning_rate": 2.233288661360932e-05,
      "loss": 1.1597,
      "step": 210
    },
    {
      "epoch": 0.014541608830722454,
      "grad_norm": 16.616416931152344,
      "learning_rate": 2.232760622209293e-05,
      "loss": 1.2589,
      "step": 220
    },
    {
      "epoch": 0.015202591050300748,
      "grad_norm": 13.498307228088379,
      "learning_rate": 2.2321867610118378e-05,
      "loss": 1.3307,
      "step": 230
    },
    {
      "epoch": 0.01586357326987904,
      "grad_norm": 7.282419681549072,
      "learning_rate": 2.231567101357753e-05,
      "loss": 1.3213,
      "step": 240
    },
    {
      "epoch": 0.016524555489457332,
      "grad_norm": 12.302486419677734,
      "learning_rate": 2.2309016687188194e-05,
      "loss": 1.3124,
      "step": 250
    },
    {
      "epoch": 0.017185537709035626,
      "grad_norm": 8.877416610717773,
      "learning_rate": 2.230190490448367e-05,
      "loss": 1.1267,
      "step": 260
    },
    {
      "epoch": 0.01784651992861392,
      "grad_norm": 10.397753715515137,
      "learning_rate": 2.229433595780149e-05,
      "loss": 1.3197,
      "step": 270
    },
    {
      "epoch": 0.018507502148192214,
      "grad_norm": 9.187607765197754,
      "learning_rate": 2.2286310158271407e-05,
      "loss": 1.1703,
      "step": 280
    },
    {
      "epoch": 0.019168484367770508,
      "grad_norm": 7.458565711975098,
      "learning_rate": 2.22778278358026e-05,
      "loss": 1.2126,
      "step": 290
    },
    {
      "epoch": 0.019829466587348802,
      "grad_norm": 11.090981483459473,
      "learning_rate": 2.2268889339070124e-05,
      "loss": 1.1683,
      "step": 300
    },
    {
      "epoch": 0.019829466587348802,
      "eval_loss": 1.3488467931747437,
      "eval_runtime": 55.8106,
      "eval_samples_per_second": 8.977,
      "eval_steps_per_second": 8.977,
      "step": 300
    },
    {
      "epoch": 0.020490448806927093,
      "grad_norm": 10.89608383178711,
      "learning_rate": 2.2259495035500576e-05,
      "loss": 1.4133,
      "step": 310
    },
    {
      "epoch": 0.021151431026505386,
      "grad_norm": 7.514070510864258,
      "learning_rate": 2.2249645311256972e-05,
      "loss": 1.2241,
      "step": 320
    },
    {
      "epoch": 0.02181241324608368,
      "grad_norm": 12.841883659362793,
      "learning_rate": 2.2239340571222904e-05,
      "loss": 1.2928,
      "step": 330
    },
    {
      "epoch": 0.022473395465661974,
      "grad_norm": 13.028974533081055,
      "learning_rate": 2.2228581238985868e-05,
      "loss": 1.2704,
      "step": 340
    },
    {
      "epoch": 0.02313437768524027,
      "grad_norm": 11.415493965148926,
      "learning_rate": 2.2217367756819878e-05,
      "loss": 1.2951,
      "step": 350
    },
    {
      "epoch": 0.02379535990481856,
      "grad_norm": 14.492388725280762,
      "learning_rate": 2.2205700585667257e-05,
      "loss": 1.2643,
      "step": 360
    },
    {
      "epoch": 0.024456342124396853,
      "grad_norm": 10.009002685546875,
      "learning_rate": 2.2193580205119724e-05,
      "loss": 1.2515,
      "step": 370
    },
    {
      "epoch": 0.025117324343975147,
      "grad_norm": 8.66943073272705,
      "learning_rate": 2.2181007113398642e-05,
      "loss": 1.1653,
      "step": 380
    },
    {
      "epoch": 0.02577830656355344,
      "grad_norm": 13.82745361328125,
      "learning_rate": 2.216798182733457e-05,
      "loss": 1.3251,
      "step": 390
    },
    {
      "epoch": 0.026439288783131735,
      "grad_norm": 9.831866264343262,
      "learning_rate": 2.2154504882346002e-05,
      "loss": 1.3099,
      "step": 400
    },
    {
      "epoch": 0.02710027100271003,
      "grad_norm": 6.000834941864014,
      "learning_rate": 2.214057683241736e-05,
      "loss": 1.2919,
      "step": 410
    },
    {
      "epoch": 0.02776125322228832,
      "grad_norm": 5.438742160797119,
      "learning_rate": 2.2126198250076225e-05,
      "loss": 1.1859,
      "step": 420
    },
    {
      "epoch": 0.028422235441866613,
      "grad_norm": 11.776556968688965,
      "learning_rate": 2.2111369726369802e-05,
      "loss": 1.339,
      "step": 430
    },
    {
      "epoch": 0.029083217661444907,
      "grad_norm": 7.697872638702393,
      "learning_rate": 2.2096091870840613e-05,
      "loss": 1.2235,
      "step": 440
    },
    {
      "epoch": 0.0297441998810232,
      "grad_norm": 12.47408676147461,
      "learning_rate": 2.2080365311501466e-05,
      "loss": 1.0851,
      "step": 450
    },
    {
      "epoch": 0.0297441998810232,
      "eval_loss": 1.3441540002822876,
      "eval_runtime": 45.42,
      "eval_samples_per_second": 11.03,
      "eval_steps_per_second": 11.03,
      "step": 450
    },
    {
      "epoch": 0.030405182100601495,
      "grad_norm": 5.456786155700684,
      "learning_rate": 2.206419069480962e-05,
      "loss": 1.2224,
      "step": 460
    },
    {
      "epoch": 0.031066164320179786,
      "grad_norm": 17.571989059448242,
      "learning_rate": 2.2047568685640212e-05,
      "loss": 1.355,
      "step": 470
    },
    {
      "epoch": 0.03172714653975808,
      "grad_norm": 10.6810302734375,
      "learning_rate": 2.203049996725894e-05,
      "loss": 1.3274,
      "step": 480
    },
    {
      "epoch": 0.032388128759336374,
      "grad_norm": 7.424011707305908,
      "learning_rate": 2.2012985241293954e-05,
      "loss": 1.1497,
      "step": 490
    },
    {
      "epoch": 0.033049110978914664,
      "grad_norm": 12.73671817779541,
      "learning_rate": 2.1995025227707044e-05,
      "loss": 1.3728,
      "step": 500
    },
    {
      "epoch": 0.03371009319849296,
      "grad_norm": 8.181777000427246,
      "learning_rate": 2.1976620664764027e-05,
      "loss": 1.2332,
      "step": 510
    },
    {
      "epoch": 0.03437107541807125,
      "grad_norm": 13.738442420959473,
      "learning_rate": 2.1957772309004394e-05,
      "loss": 1.2833,
      "step": 520
    },
    {
      "epoch": 0.03503205763764955,
      "grad_norm": 13.703083992004395,
      "learning_rate": 2.1938480935210228e-05,
      "loss": 1.4239,
      "step": 530
    },
    {
      "epoch": 0.03569303985722784,
      "grad_norm": 7.870193004608154,
      "learning_rate": 2.1918747336374347e-05,
      "loss": 1.4103,
      "step": 540
    },
    {
      "epoch": 0.03635402207680613,
      "grad_norm": 8.396446228027344,
      "learning_rate": 2.189857232366771e-05,
      "loss": 1.2522,
      "step": 550
    },
    {
      "epoch": 0.03701500429638443,
      "grad_norm": 12.225940704345703,
      "learning_rate": 2.1877956726406063e-05,
      "loss": 1.3464,
      "step": 560
    },
    {
      "epoch": 0.03767598651596272,
      "grad_norm": 11.3760347366333,
      "learning_rate": 2.1856901392015874e-05,
      "loss": 1.2843,
      "step": 570
    },
    {
      "epoch": 0.038336968735541016,
      "grad_norm": 11.334436416625977,
      "learning_rate": 2.183540718599946e-05,
      "loss": 1.2579,
      "step": 580
    },
    {
      "epoch": 0.03899795095511931,
      "grad_norm": 10.890923500061035,
      "learning_rate": 2.1813474991899453e-05,
      "loss": 1.1799,
      "step": 590
    },
    {
      "epoch": 0.039658933174697604,
      "grad_norm": 9.872835159301758,
      "learning_rate": 2.1791105711262442e-05,
      "loss": 1.1629,
      "step": 600
    },
    {
      "epoch": 0.039658933174697604,
      "eval_loss": 1.3372266292572021,
      "eval_runtime": 56.8438,
      "eval_samples_per_second": 8.814,
      "eval_steps_per_second": 8.814,
      "step": 600
    },
    {
      "epoch": 0.040319915394275894,
      "grad_norm": 11.447709083557129,
      "learning_rate": 2.1768300263601945e-05,
      "loss": 1.2011,
      "step": 610
    },
    {
      "epoch": 0.040980897613854185,
      "grad_norm": 12.056636810302734,
      "learning_rate": 2.174505958636059e-05,
      "loss": 1.2068,
      "step": 620
    },
    {
      "epoch": 0.04164187983343248,
      "grad_norm": 8.074010848999023,
      "learning_rate": 2.1721384634871592e-05,
      "loss": 1.1598,
      "step": 630
    },
    {
      "epoch": 0.04230286205301077,
      "grad_norm": 11.10396957397461,
      "learning_rate": 2.169727638231948e-05,
      "loss": 1.0609,
      "step": 640
    },
    {
      "epoch": 0.04296384427258907,
      "grad_norm": 7.929290771484375,
      "learning_rate": 2.1672735819700084e-05,
      "loss": 1.1761,
      "step": 650
    },
    {
      "epoch": 0.04362482649216736,
      "grad_norm": 12.149751663208008,
      "learning_rate": 2.1647763955779823e-05,
      "loss": 1.35,
      "step": 660
    },
    {
      "epoch": 0.04428580871174565,
      "grad_norm": 12.335487365722656,
      "learning_rate": 2.1622361817054213e-05,
      "loss": 1.2615,
      "step": 670
    },
    {
      "epoch": 0.04494679093132395,
      "grad_norm": 10.838406562805176,
      "learning_rate": 2.1596530447705676e-05,
      "loss": 1.1423,
      "step": 680
    },
    {
      "epoch": 0.04560777315090224,
      "grad_norm": 11.29602336883545,
      "learning_rate": 2.157027090956064e-05,
      "loss": 1.2088,
      "step": 690
    },
    {
      "epoch": 0.04626875537048054,
      "grad_norm": 6.865326881408691,
      "learning_rate": 2.1543584282045862e-05,
      "loss": 1.2449,
      "step": 700
    },
    {
      "epoch": 0.04692973759005883,
      "grad_norm": 11.23728084564209,
      "learning_rate": 2.1516471662144077e-05,
      "loss": 1.3072,
      "step": 710
    },
    {
      "epoch": 0.04759071980963712,
      "grad_norm": 9.809483528137207,
      "learning_rate": 2.1488934164348898e-05,
      "loss": 1.2592,
      "step": 720
    },
    {
      "epoch": 0.048251702029215415,
      "grad_norm": 12.237908363342285,
      "learning_rate": 2.1460972920619e-05,
      "loss": 1.2014,
      "step": 730
    },
    {
      "epoch": 0.048912684248793706,
      "grad_norm": 12.795587539672852,
      "learning_rate": 2.143258908033159e-05,
      "loss": 1.2433,
      "step": 740
    },
    {
      "epoch": 0.049573666468372,
      "grad_norm": 13.611194610595703,
      "learning_rate": 2.140378381023518e-05,
      "loss": 1.2548,
      "step": 750
    },
    {
      "epoch": 0.049573666468372,
      "eval_loss": 1.3183883428573608,
      "eval_runtime": 55.6542,
      "eval_samples_per_second": 9.002,
      "eval_steps_per_second": 9.002,
      "step": 750
    },
    {
      "epoch": 0.050234648687950294,
      "grad_norm": 4.964775085449219,
      "learning_rate": 2.1374558294401597e-05,
      "loss": 1.2587,
      "step": 760
    },
    {
      "epoch": 0.050895630907528584,
      "grad_norm": 13.402926445007324,
      "learning_rate": 2.134491373417733e-05,
      "loss": 1.1855,
      "step": 770
    },
    {
      "epoch": 0.05155661312710688,
      "grad_norm": 8.38901138305664,
      "learning_rate": 2.1314851348134134e-05,
      "loss": 1.3289,
      "step": 780
    },
    {
      "epoch": 0.05221759534668517,
      "grad_norm": 6.840709686279297,
      "learning_rate": 2.1284372372018963e-05,
      "loss": 1.1234,
      "step": 790
    },
    {
      "epoch": 0.05287857756626347,
      "grad_norm": 6.543496608734131,
      "learning_rate": 2.125347805870314e-05,
      "loss": 1.2149,
      "step": 800
    },
    {
      "epoch": 0.05353955978584176,
      "grad_norm": 7.223635196685791,
      "learning_rate": 2.122216967813088e-05,
      "loss": 1.0977,
      "step": 810
    },
    {
      "epoch": 0.05420054200542006,
      "grad_norm": 10.436606407165527,
      "learning_rate": 2.1190448517267087e-05,
      "loss": 1.1564,
      "step": 820
    },
    {
      "epoch": 0.05486152422499835,
      "grad_norm": 17.590259552001953,
      "learning_rate": 2.115831588004444e-05,
      "loss": 1.3229,
      "step": 830
    },
    {
      "epoch": 0.05552250644457664,
      "grad_norm": 11.749155044555664,
      "learning_rate": 2.1125773087309798e-05,
      "loss": 1.2345,
      "step": 840
    },
    {
      "epoch": 0.056183488664154936,
      "grad_norm": 11.912696838378906,
      "learning_rate": 2.1092821476769906e-05,
      "loss": 1.1779,
      "step": 850
    },
    {
      "epoch": 0.05684447088373323,
      "grad_norm": 5.420770168304443,
      "learning_rate": 2.1059462402936416e-05,
      "loss": 1.2414,
      "step": 860
    },
    {
      "epoch": 0.057505453103311524,
      "grad_norm": 4.887539863586426,
      "learning_rate": 2.102569723707019e-05,
      "loss": 1.1046,
      "step": 870
    },
    {
      "epoch": 0.058166435322889815,
      "grad_norm": 9.325897216796875,
      "learning_rate": 2.0991527367124955e-05,
      "loss": 1.3145,
      "step": 880
    },
    {
      "epoch": 0.058827417542468105,
      "grad_norm": 14.635684967041016,
      "learning_rate": 2.095695419769022e-05,
      "loss": 1.3592,
      "step": 890
    },
    {
      "epoch": 0.0594883997620464,
      "grad_norm": 8.91545295715332,
      "learning_rate": 2.0921979149933576e-05,
      "loss": 1.3035,
      "step": 900
    },
    {
      "epoch": 0.0594883997620464,
      "eval_loss": 1.3120555877685547,
      "eval_runtime": 52.1726,
      "eval_samples_per_second": 9.603,
      "eval_steps_per_second": 9.603,
      "step": 900
    },
    {
      "epoch": 0.06014938198162469,
      "grad_norm": 6.539499759674072,
      "learning_rate": 2.0886603661542245e-05,
      "loss": 1.2819,
      "step": 910
    },
    {
      "epoch": 0.06081036420120299,
      "grad_norm": 5.03954553604126,
      "learning_rate": 2.0850829186663994e-05,
      "loss": 1.2467,
      "step": 920
    },
    {
      "epoch": 0.06147134642078128,
      "grad_norm": 12.52458381652832,
      "learning_rate": 2.0814657195847375e-05,
      "loss": 1.1568,
      "step": 930
    },
    {
      "epoch": 0.06213232864035957,
      "grad_norm": 11.251747131347656,
      "learning_rate": 2.077808917598125e-05,
      "loss": 1.1703,
      "step": 940
    },
    {
      "epoch": 0.06279331085993786,
      "grad_norm": 10.658408164978027,
      "learning_rate": 2.0741126630233687e-05,
      "loss": 1.1074,
      "step": 950
    },
    {
      "epoch": 0.06345429307951617,
      "grad_norm": 6.95957612991333,
      "learning_rate": 2.070377107799017e-05,
      "loss": 1.1635,
      "step": 960
    },
    {
      "epoch": 0.06411527529909446,
      "grad_norm": 10.898233413696289,
      "learning_rate": 2.0666024054791137e-05,
      "loss": 1.2801,
      "step": 970
    },
    {
      "epoch": 0.06477625751867275,
      "grad_norm": 12.640921592712402,
      "learning_rate": 2.0627887112268875e-05,
      "loss": 1.2982,
      "step": 980
    },
    {
      "epoch": 0.06543723973825104,
      "grad_norm": 6.845248699188232,
      "learning_rate": 2.0589361818083712e-05,
      "loss": 1.0552,
      "step": 990
    },
    {
      "epoch": 0.06609822195782933,
      "grad_norm": 12.774737358093262,
      "learning_rate": 2.0550449755859598e-05,
      "loss": 1.149,
      "step": 1000
    },
    {
      "epoch": 0.06675920417740763,
      "grad_norm": 12.460762977600098,
      "learning_rate": 2.0511152525119014e-05,
      "loss": 1.0864,
      "step": 1010
    },
    {
      "epoch": 0.06742018639698592,
      "grad_norm": 12.369227409362793,
      "learning_rate": 2.0471471741217183e-05,
      "loss": 1.2691,
      "step": 1020
    },
    {
      "epoch": 0.06808116861656421,
      "grad_norm": 15.577491760253906,
      "learning_rate": 2.0431409035275724e-05,
      "loss": 1.3091,
      "step": 1030
    },
    {
      "epoch": 0.0687421508361425,
      "grad_norm": 8.849650382995605,
      "learning_rate": 2.0390966054115558e-05,
      "loss": 1.2703,
      "step": 1040
    },
    {
      "epoch": 0.0694031330557208,
      "grad_norm": 13.82666015625,
      "learning_rate": 2.035014446018924e-05,
      "loss": 1.388,
      "step": 1050
    },
    {
      "epoch": 0.0694031330557208,
      "eval_loss": 1.303145170211792,
      "eval_runtime": 53.8965,
      "eval_samples_per_second": 9.296,
      "eval_steps_per_second": 9.296,
      "step": 1050
    },
    {
      "epoch": 0.0700641152752991,
      "grad_norm": 11.953422546386719,
      "learning_rate": 2.0308945931512606e-05,
      "loss": 1.1849,
      "step": 1060
    },
    {
      "epoch": 0.07072509749487739,
      "grad_norm": 6.583851337432861,
      "learning_rate": 2.0267372161595806e-05,
      "loss": 1.2334,
      "step": 1070
    },
    {
      "epoch": 0.07138607971445568,
      "grad_norm": 10.967381477355957,
      "learning_rate": 2.022542485937369e-05,
      "loss": 1.146,
      "step": 1080
    },
    {
      "epoch": 0.07204706193403397,
      "grad_norm": 11.6732177734375,
      "learning_rate": 2.0183105749135553e-05,
      "loss": 1.1601,
      "step": 1090
    },
    {
      "epoch": 0.07270804415361226,
      "grad_norm": 11.63559341430664,
      "learning_rate": 2.0140416570454266e-05,
      "loss": 1.2845,
      "step": 1100
    },
    {
      "epoch": 0.07336902637319057,
      "grad_norm": 8.482784271240234,
      "learning_rate": 2.0097359078114767e-05,
      "loss": 1.1344,
      "step": 1110
    },
    {
      "epoch": 0.07403000859276886,
      "grad_norm": 11.602831840515137,
      "learning_rate": 2.0053935042041915e-05,
      "loss": 1.2167,
      "step": 1120
    },
    {
      "epoch": 0.07469099081234715,
      "grad_norm": 6.016249179840088,
      "learning_rate": 2.001014624722775e-05,
      "loss": 1.2611,
      "step": 1130
    },
    {
      "epoch": 0.07535197303192544,
      "grad_norm": 6.9794020652771,
      "learning_rate": 1.996599449365813e-05,
      "loss": 1.0101,
      "step": 1140
    },
    {
      "epoch": 0.07601295525150373,
      "grad_norm": 10.84961986541748,
      "learning_rate": 1.9921481596238703e-05,
      "loss": 1.1906,
      "step": 1150
    },
    {
      "epoch": 0.07667393747108203,
      "grad_norm": 13.637924194335938,
      "learning_rate": 1.9876609384720335e-05,
      "loss": 1.2617,
      "step": 1160
    },
    {
      "epoch": 0.07733491969066032,
      "grad_norm": 11.967713356018066,
      "learning_rate": 1.9831379703623903e-05,
      "loss": 1.1903,
      "step": 1170
    },
    {
      "epoch": 0.07799590191023861,
      "grad_norm": 12.296497344970703,
      "learning_rate": 1.978579441216443e-05,
      "loss": 0.9757,
      "step": 1180
    },
    {
      "epoch": 0.0786568841298169,
      "grad_norm": 12.823221206665039,
      "learning_rate": 1.9739855384174708e-05,
      "loss": 1.2341,
      "step": 1190
    },
    {
      "epoch": 0.07931786634939521,
      "grad_norm": 9.349319458007812,
      "learning_rate": 1.969356450802825e-05,
      "loss": 1.1929,
      "step": 1200
    },
    {
      "epoch": 0.07931786634939521,
      "eval_loss": 1.3002644777297974,
      "eval_runtime": 46.8524,
      "eval_samples_per_second": 10.693,
      "eval_steps_per_second": 10.693,
      "step": 1200
    },
    {
      "epoch": 0.0799788485689735,
      "grad_norm": 6.869687080383301,
      "learning_rate": 1.964692368656166e-05,
      "loss": 0.9831,
      "step": 1210
    },
    {
      "epoch": 0.08063983078855179,
      "grad_norm": 12.35352897644043,
      "learning_rate": 1.9599934836996435e-05,
      "loss": 1.1827,
      "step": 1220
    },
    {
      "epoch": 0.08130081300813008,
      "grad_norm": 14.163335800170898,
      "learning_rate": 1.9552599890860126e-05,
      "loss": 1.2183,
      "step": 1230
    },
    {
      "epoch": 0.08196179522770837,
      "grad_norm": 14.357596397399902,
      "learning_rate": 1.9504920793906985e-05,
      "loss": 1.1122,
      "step": 1240
    },
    {
      "epoch": 0.08262277744728667,
      "grad_norm": 12.211373329162598,
      "learning_rate": 1.945689950603793e-05,
      "loss": 1.1785,
      "step": 1250
    },
    {
      "epoch": 0.08328375966686496,
      "grad_norm": 9.271207809448242,
      "learning_rate": 1.9408538001220032e-05,
      "loss": 1.3458,
      "step": 1260
    },
    {
      "epoch": 0.08394474188644326,
      "grad_norm": 8.985238075256348,
      "learning_rate": 1.9359838267405318e-05,
      "loss": 1.2764,
      "step": 1270
    },
    {
      "epoch": 0.08460572410602155,
      "grad_norm": 6.032650947570801,
      "learning_rate": 1.931080230644911e-05,
      "loss": 1.1252,
      "step": 1280
    },
    {
      "epoch": 0.08526670632559984,
      "grad_norm": 8.561097145080566,
      "learning_rate": 1.926143213402771e-05,
      "loss": 1.1761,
      "step": 1290
    },
    {
      "epoch": 0.08592768854517814,
      "grad_norm": 11.316914558410645,
      "learning_rate": 1.921172977955552e-05,
      "loss": 1.2844,
      "step": 1300
    },
    {
      "epoch": 0.08658867076475643,
      "grad_norm": 11.52777099609375,
      "learning_rate": 1.9161697286101677e-05,
      "loss": 1.3252,
      "step": 1310
    },
    {
      "epoch": 0.08724965298433472,
      "grad_norm": 7.112990379333496,
      "learning_rate": 1.9111336710306013e-05,
      "loss": 1.2886,
      "step": 1320
    },
    {
      "epoch": 0.08791063520391301,
      "grad_norm": 11.982434272766113,
      "learning_rate": 1.9060650122294554e-05,
      "loss": 1.2249,
      "step": 1330
    },
    {
      "epoch": 0.0885716174234913,
      "grad_norm": 5.956284046173096,
      "learning_rate": 1.9009639605594407e-05,
      "loss": 1.1993,
      "step": 1340
    },
    {
      "epoch": 0.08923259964306961,
      "grad_norm": 6.896420955657959,
      "learning_rate": 1.8958307257048116e-05,
      "loss": 1.2083,
      "step": 1350
    },
    {
      "epoch": 0.08923259964306961,
      "eval_loss": 1.2925916910171509,
      "eval_runtime": 53.3979,
      "eval_samples_per_second": 9.382,
      "eval_steps_per_second": 9.382,
      "step": 1350
    },
    {
      "epoch": 0.0898935818626479,
      "grad_norm": 11.231532096862793,
      "learning_rate": 1.890665518672748e-05,
      "loss": 1.3071,
      "step": 1360
    },
    {
      "epoch": 0.09055456408222619,
      "grad_norm": 8.269697189331055,
      "learning_rate": 1.88546855178468e-05,
      "loss": 1.3681,
      "step": 1370
    },
    {
      "epoch": 0.09121554630180448,
      "grad_norm": 9.768874168395996,
      "learning_rate": 1.880240038667561e-05,
      "loss": 1.1444,
      "step": 1380
    },
    {
      "epoch": 0.09187652852138277,
      "grad_norm": 12.701289176940918,
      "learning_rate": 1.874980194245087e-05,
      "loss": 1.2358,
      "step": 1390
    },
    {
      "epoch": 0.09253751074096107,
      "grad_norm": 7.481356620788574,
      "learning_rate": 1.8696892347288606e-05,
      "loss": 1.2474,
      "step": 1400
    },
    {
      "epoch": 0.09319849296053936,
      "grad_norm": 5.565570831298828,
      "learning_rate": 1.864367377609504e-05,
      "loss": 1.3041,
      "step": 1410
    },
    {
      "epoch": 0.09385947518011765,
      "grad_norm": 11.658685684204102,
      "learning_rate": 1.8590148416477198e-05,
      "loss": 1.2475,
      "step": 1420
    },
    {
      "epoch": 0.09452045739969595,
      "grad_norm": 7.721464157104492,
      "learning_rate": 1.8536318468652962e-05,
      "loss": 1.2889,
      "step": 1430
    },
    {
      "epoch": 0.09518143961927424,
      "grad_norm": 13.417887687683105,
      "learning_rate": 1.8482186145360648e-05,
      "loss": 1.0137,
      "step": 1440
    },
    {
      "epoch": 0.09584242183885254,
      "grad_norm": 12.11631965637207,
      "learning_rate": 1.8427753671768056e-05,
      "loss": 1.1422,
      "step": 1450
    },
    {
      "epoch": 0.09650340405843083,
      "grad_norm": 10.596673965454102,
      "learning_rate": 1.8373023285380966e-05,
      "loss": 1.3137,
      "step": 1460
    },
    {
      "epoch": 0.09716438627800912,
      "grad_norm": 7.0566558837890625,
      "learning_rate": 1.8317997235951204e-05,
      "loss": 1.1111,
      "step": 1470
    },
    {
      "epoch": 0.09782536849758741,
      "grad_norm": 11.534781455993652,
      "learning_rate": 1.8262677785384142e-05,
      "loss": 1.207,
      "step": 1480
    },
    {
      "epoch": 0.0984863507171657,
      "grad_norm": 10.579961776733398,
      "learning_rate": 1.8207067207645716e-05,
      "loss": 1.0107,
      "step": 1490
    },
    {
      "epoch": 0.099147332936744,
      "grad_norm": 11.584352493286133,
      "learning_rate": 1.815116778866897e-05,
      "loss": 1.3272,
      "step": 1500
    },
    {
      "epoch": 0.099147332936744,
      "eval_loss": 1.2920811176300049,
      "eval_runtime": 56.3843,
      "eval_samples_per_second": 8.885,
      "eval_steps_per_second": 8.885,
      "step": 1500
    },
    {
      "epoch": 0.0998083151563223,
      "grad_norm": 12.167766571044922,
      "learning_rate": 1.8094981826260064e-05,
      "loss": 1.1052,
      "step": 1510
    },
    {
      "epoch": 0.10046929737590059,
      "grad_norm": 6.422857284545898,
      "learning_rate": 1.8038511630003865e-05,
      "loss": 1.2341,
      "step": 1520
    },
    {
      "epoch": 0.10113027959547888,
      "grad_norm": 11.502632141113281,
      "learning_rate": 1.798175952116895e-05,
      "loss": 1.2251,
      "step": 1530
    },
    {
      "epoch": 0.10179126181505717,
      "grad_norm": 13.205157279968262,
      "learning_rate": 1.7924727832612227e-05,
      "loss": 1.2488,
      "step": 1540
    },
    {
      "epoch": 0.10245224403463547,
      "grad_norm": 7.521269798278809,
      "learning_rate": 1.786741890868305e-05,
      "loss": 1.2128,
      "step": 1550
    },
    {
      "epoch": 0.10311322625421376,
      "grad_norm": 7.006454944610596,
      "learning_rate": 1.7809835105126807e-05,
      "loss": 1.1772,
      "step": 1560
    },
    {
      "epoch": 0.10377420847379205,
      "grad_norm": 10.070454597473145,
      "learning_rate": 1.7751978788988123e-05,
      "loss": 1.2622,
      "step": 1570
    },
    {
      "epoch": 0.10443519069337034,
      "grad_norm": 5.716686248779297,
      "learning_rate": 1.7693852338513545e-05,
      "loss": 1.2284,
      "step": 1580
    },
    {
      "epoch": 0.10509617291294863,
      "grad_norm": 9.35854721069336,
      "learning_rate": 1.7635458143053794e-05,
      "loss": 1.1278,
      "step": 1590
    },
    {
      "epoch": 0.10575715513252694,
      "grad_norm": 8.222880363464355,
      "learning_rate": 1.7576798602965525e-05,
      "loss": 1.2629,
      "step": 1600
    },
    {
      "epoch": 0.10641813735210523,
      "grad_norm": 7.391974925994873,
      "learning_rate": 1.7517876129512677e-05,
      "loss": 1.1084,
      "step": 1610
    },
    {
      "epoch": 0.10707911957168352,
      "grad_norm": 9.882158279418945,
      "learning_rate": 1.7458693144767353e-05,
      "loss": 1.1754,
      "step": 1620
    },
    {
      "epoch": 0.10774010179126181,
      "grad_norm": 6.603885173797607,
      "learning_rate": 1.7399252081510248e-05,
      "loss": 1.2642,
      "step": 1630
    },
    {
      "epoch": 0.10840108401084012,
      "grad_norm": 9.928793907165527,
      "learning_rate": 1.733955538313066e-05,
      "loss": 1.2299,
      "step": 1640
    },
    {
      "epoch": 0.1090620662304184,
      "grad_norm": 13.607159614562988,
      "learning_rate": 1.7279605503526047e-05,
      "loss": 1.3297,
      "step": 1650
    },
    {
      "epoch": 0.1090620662304184,
      "eval_loss": 1.2833280563354492,
      "eval_runtime": 56.0628,
      "eval_samples_per_second": 8.936,
      "eval_steps_per_second": 8.936,
      "step": 1650
    },
    {
      "epoch": 0.1097230484499967,
      "grad_norm": 12.829073905944824,
      "learning_rate": 1.721940490700115e-05,
      "loss": 1.1734,
      "step": 1660
    },
    {
      "epoch": 0.11038403066957499,
      "grad_norm": 5.9544548988342285,
      "learning_rate": 1.7158956068166697e-05,
      "loss": 1.0935,
      "step": 1670
    },
    {
      "epoch": 0.11104501288915328,
      "grad_norm": 7.440855503082275,
      "learning_rate": 1.7098261471837696e-05,
      "loss": 1.22,
      "step": 1680
    },
    {
      "epoch": 0.11170599510873158,
      "grad_norm": 5.567168235778809,
      "learning_rate": 1.7037323612931272e-05,
      "loss": 1.1423,
      "step": 1690
    },
    {
      "epoch": 0.11236697732830987,
      "grad_norm": 5.937944412231445,
      "learning_rate": 1.697614499636414e-05,
      "loss": 1.148,
      "step": 1700
    },
    {
      "epoch": 0.11302795954788816,
      "grad_norm": 6.795397758483887,
      "learning_rate": 1.6914728136949594e-05,
      "loss": 1.2881,
      "step": 1710
    },
    {
      "epoch": 0.11368894176746645,
      "grad_norm": 8.981378555297852,
      "learning_rate": 1.6853075559294172e-05,
      "loss": 1.1772,
      "step": 1720
    },
    {
      "epoch": 0.11434992398704474,
      "grad_norm": 9.995403289794922,
      "learning_rate": 1.6791189797693877e-05,
      "loss": 1.1541,
      "step": 1730
    },
    {
      "epoch": 0.11501090620662305,
      "grad_norm": 12.851771354675293,
      "learning_rate": 1.6729073396029965e-05,
      "loss": 1.2167,
      "step": 1740
    },
    {
      "epoch": 0.11567188842620134,
      "grad_norm": 12.812955856323242,
      "learning_rate": 1.666672890766442e-05,
      "loss": 1.1763,
      "step": 1750
    },
    {
      "epoch": 0.11633287064577963,
      "grad_norm": 8.584874153137207,
      "learning_rate": 1.660415889533497e-05,
      "loss": 1.2797,
      "step": 1760
    },
    {
      "epoch": 0.11699385286535792,
      "grad_norm": 8.92071533203125,
      "learning_rate": 1.6541365931049757e-05,
      "loss": 1.23,
      "step": 1770
    },
    {
      "epoch": 0.11765483508493621,
      "grad_norm": 5.1022210121154785,
      "learning_rate": 1.6478352595981594e-05,
      "loss": 1.0536,
      "step": 1780
    },
    {
      "epoch": 0.11831581730451451,
      "grad_norm": 8.801514625549316,
      "learning_rate": 1.6415121480361884e-05,
      "loss": 1.0129,
      "step": 1790
    },
    {
      "epoch": 0.1189767995240928,
      "grad_norm": 11.475573539733887,
      "learning_rate": 1.635167518337413e-05,
      "loss": 1.2538,
      "step": 1800
    },
    {
      "epoch": 0.1189767995240928,
      "eval_loss": 1.278364896774292,
      "eval_runtime": 47.0777,
      "eval_samples_per_second": 10.642,
      "eval_steps_per_second": 10.642,
      "step": 1800
    },
    {
      "epoch": 0.1196377817436711,
      "grad_norm": 10.728155136108398,
      "learning_rate": 1.6288016313047095e-05,
      "loss": 1.2208,
      "step": 1810
    },
    {
      "epoch": 0.12029876396324939,
      "grad_norm": 12.165102005004883,
      "learning_rate": 1.6224147486147602e-05,
      "loss": 1.3179,
      "step": 1820
    },
    {
      "epoch": 0.12095974618282768,
      "grad_norm": 10.370355606079102,
      "learning_rate": 1.616007132807298e-05,
      "loss": 1.226,
      "step": 1830
    },
    {
      "epoch": 0.12162072840240598,
      "grad_norm": 13.64041519165039,
      "learning_rate": 1.6095790472743107e-05,
      "loss": 1.287,
      "step": 1840
    },
    {
      "epoch": 0.12228171062198427,
      "grad_norm": 9.342700958251953,
      "learning_rate": 1.6031307562492174e-05,
      "loss": 1.2169,
      "step": 1850
    },
    {
      "epoch": 0.12294269284156256,
      "grad_norm": 5.222902297973633,
      "learning_rate": 1.5966625247960068e-05,
      "loss": 1.2688,
      "step": 1860
    },
    {
      "epoch": 0.12360367506114085,
      "grad_norm": 6.980830669403076,
      "learning_rate": 1.5901746187983387e-05,
      "loss": 1.1797,
      "step": 1870
    },
    {
      "epoch": 0.12426465728071914,
      "grad_norm": 10.581820487976074,
      "learning_rate": 1.5836673049486175e-05,
      "loss": 1.1752,
      "step": 1880
    },
    {
      "epoch": 0.12492563950029745,
      "grad_norm": 10.523150444030762,
      "learning_rate": 1.577140850737029e-05,
      "loss": 1.2042,
      "step": 1890
    },
    {
      "epoch": 0.12558662171987572,
      "grad_norm": 6.221709251403809,
      "learning_rate": 1.5705955244405423e-05,
      "loss": 1.1912,
      "step": 1900
    },
    {
      "epoch": 0.12624760393945403,
      "grad_norm": 10.54680347442627,
      "learning_rate": 1.564031595111886e-05,
      "loss": 1.2476,
      "step": 1910
    },
    {
      "epoch": 0.12690858615903233,
      "grad_norm": 5.043491840362549,
      "learning_rate": 1.557449332568485e-05,
      "loss": 1.2221,
      "step": 1920
    },
    {
      "epoch": 0.1275695683786106,
      "grad_norm": 10.203733444213867,
      "learning_rate": 1.5508490073813722e-05,
      "loss": 1.1716,
      "step": 1930
    },
    {
      "epoch": 0.1282305505981889,
      "grad_norm": 7.249475955963135,
      "learning_rate": 1.5442308908640636e-05,
      "loss": 1.1548,
      "step": 1940
    },
    {
      "epoch": 0.1288915328177672,
      "grad_norm": 11.740514755249023,
      "learning_rate": 1.537595255061408e-05,
      "loss": 1.1863,
      "step": 1950
    },
    {
      "epoch": 0.1288915328177672,
      "eval_loss": 1.2681256532669067,
      "eval_runtime": 53.9387,
      "eval_samples_per_second": 9.288,
      "eval_steps_per_second": 9.288,
      "step": 1950
    },
    {
      "epoch": 0.1295525150373455,
      "grad_norm": 9.638320922851562,
      "learning_rate": 1.5309423727384037e-05,
      "loss": 1.2506,
      "step": 1960
    },
    {
      "epoch": 0.1302134972569238,
      "grad_norm": 7.702147483825684,
      "learning_rate": 1.5242725173689851e-05,
      "loss": 1.1908,
      "step": 1970
    },
    {
      "epoch": 0.13087447947650208,
      "grad_norm": 15.315128326416016,
      "learning_rate": 1.5175859631247827e-05,
      "loss": 1.1775,
      "step": 1980
    },
    {
      "epoch": 0.13153546169608038,
      "grad_norm": 6.902062892913818,
      "learning_rate": 1.5108829848638515e-05,
      "loss": 1.1696,
      "step": 1990
    },
    {
      "epoch": 0.13219644391565866,
      "grad_norm": 10.421862602233887,
      "learning_rate": 1.5041638581193741e-05,
      "loss": 1.1456,
      "step": 2000
    },
    {
      "epoch": 0.13285742613523696,
      "grad_norm": 12.304083824157715,
      "learning_rate": 1.4974288590883346e-05,
      "loss": 1.0899,
      "step": 2010
    },
    {
      "epoch": 0.13351840835481527,
      "grad_norm": 6.598790645599365,
      "learning_rate": 1.4906782646201634e-05,
      "loss": 1.1023,
      "step": 2020
    },
    {
      "epoch": 0.13417939057439354,
      "grad_norm": 10.214670181274414,
      "learning_rate": 1.4839123522053591e-05,
      "loss": 1.1551,
      "step": 2030
    },
    {
      "epoch": 0.13484037279397185,
      "grad_norm": 9.92830753326416,
      "learning_rate": 1.4771313999640806e-05,
      "loss": 1.1611,
      "step": 2040
    },
    {
      "epoch": 0.13550135501355012,
      "grad_norm": 11.352734565734863,
      "learning_rate": 1.4703356866347155e-05,
      "loss": 1.1261,
      "step": 2050
    },
    {
      "epoch": 0.13616233723312843,
      "grad_norm": 9.193647384643555,
      "learning_rate": 1.4635254915624214e-05,
      "loss": 1.1497,
      "step": 2060
    },
    {
      "epoch": 0.13682331945270673,
      "grad_norm": 8.309967994689941,
      "learning_rate": 1.4567010946876445e-05,
      "loss": 1.2163,
      "step": 2070
    },
    {
      "epoch": 0.137484301672285,
      "grad_norm": 9.005535125732422,
      "learning_rate": 1.4498627765346109e-05,
      "loss": 1.1769,
      "step": 2080
    },
    {
      "epoch": 0.1381452838918633,
      "grad_norm": 6.557043552398682,
      "learning_rate": 1.4430108181997962e-05,
      "loss": 1.093,
      "step": 2090
    },
    {
      "epoch": 0.1388062661114416,
      "grad_norm": 7.859200954437256,
      "learning_rate": 1.4361455013403695e-05,
      "loss": 1.2585,
      "step": 2100
    },
    {
      "epoch": 0.1388062661114416,
      "eval_loss": 1.2679221630096436,
      "eval_runtime": 46.9201,
      "eval_samples_per_second": 10.678,
      "eval_steps_per_second": 10.678,
      "step": 2100
    },
    {
      "epoch": 0.1394672483310199,
      "grad_norm": 12.011978149414062,
      "learning_rate": 1.4292671081626183e-05,
      "loss": 1.2173,
      "step": 2110
    },
    {
      "epoch": 0.1401282305505982,
      "grad_norm": 9.485074996948242,
      "learning_rate": 1.4223759214103443e-05,
      "loss": 1.2501,
      "step": 2120
    },
    {
      "epoch": 0.14078921277017648,
      "grad_norm": 11.757882118225098,
      "learning_rate": 1.4154722243532445e-05,
      "loss": 1.1974,
      "step": 2130
    },
    {
      "epoch": 0.14145019498975478,
      "grad_norm": 13.57962703704834,
      "learning_rate": 1.4085563007752654e-05,
      "loss": 1.1892,
      "step": 2140
    },
    {
      "epoch": 0.14211117720933306,
      "grad_norm": 9.708785057067871,
      "learning_rate": 1.4016284349629364e-05,
      "loss": 1.225,
      "step": 2150
    },
    {
      "epoch": 0.14277215942891136,
      "grad_norm": 10.492091178894043,
      "learning_rate": 1.3946889116936874e-05,
      "loss": 1.208,
      "step": 2160
    },
    {
      "epoch": 0.14343314164848966,
      "grad_norm": 7.376300811767578,
      "learning_rate": 1.3877380162241394e-05,
      "loss": 1.1689,
      "step": 2170
    },
    {
      "epoch": 0.14409412386806794,
      "grad_norm": 6.636634349822998,
      "learning_rate": 1.3807760342783804e-05,
      "loss": 1.1393,
      "step": 2180
    },
    {
      "epoch": 0.14475510608764625,
      "grad_norm": 12.17708969116211,
      "learning_rate": 1.37380325203622e-05,
      "loss": 1.2818,
      "step": 2190
    },
    {
      "epoch": 0.14541608830722452,
      "grad_norm": 12.49779987335205,
      "learning_rate": 1.3668199561214252e-05,
      "loss": 1.133,
      "step": 2200
    },
    {
      "epoch": 0.14607707052680283,
      "grad_norm": 6.741744518280029,
      "learning_rate": 1.35982643358994e-05,
      "loss": 1.1637,
      "step": 2210
    },
    {
      "epoch": 0.14673805274638113,
      "grad_norm": 9.643292427062988,
      "learning_rate": 1.3528229719180835e-05,
      "loss": 1.2758,
      "step": 2220
    },
    {
      "epoch": 0.1473990349659594,
      "grad_norm": 10.941937446594238,
      "learning_rate": 1.3458098589907348e-05,
      "loss": 1.268,
      "step": 2230
    },
    {
      "epoch": 0.1480600171855377,
      "grad_norm": 11.461699485778809,
      "learning_rate": 1.3387873830894973e-05,
      "loss": 1.0558,
      "step": 2240
    },
    {
      "epoch": 0.148720999405116,
      "grad_norm": 6.023902893066406,
      "learning_rate": 1.3317558328808506e-05,
      "loss": 1.1131,
      "step": 2250
    },
    {
      "epoch": 0.148720999405116,
      "eval_loss": 1.259637475013733,
      "eval_runtime": 52.7273,
      "eval_samples_per_second": 9.502,
      "eval_steps_per_second": 9.502,
      "step": 2250
    },
    {
      "epoch": 0.1493819816246943,
      "grad_norm": 11.362767219543457,
      "learning_rate": 1.3247154974042827e-05,
      "loss": 1.2487,
      "step": 2260
    },
    {
      "epoch": 0.1500429638442726,
      "grad_norm": 12.16934585571289,
      "learning_rate": 1.3176666660604102e-05,
      "loss": 1.3317,
      "step": 2270
    },
    {
      "epoch": 0.15070394606385087,
      "grad_norm": 7.8326849937438965,
      "learning_rate": 1.3106096285990812e-05,
      "loss": 1.1973,
      "step": 2280
    },
    {
      "epoch": 0.15136492828342918,
      "grad_norm": 7.108518600463867,
      "learning_rate": 1.3035446751074653e-05,
      "loss": 1.1605,
      "step": 2290
    },
    {
      "epoch": 0.15202591050300746,
      "grad_norm": 11.288322448730469,
      "learning_rate": 1.2964720959981287e-05,
      "loss": 1.1857,
      "step": 2300
    },
    {
      "epoch": 0.15268689272258576,
      "grad_norm": 5.468815803527832,
      "learning_rate": 1.2893921819970972e-05,
      "loss": 1.2428,
      "step": 2310
    },
    {
      "epoch": 0.15334787494216406,
      "grad_norm": 11.970479011535645,
      "learning_rate": 1.2823052241319061e-05,
      "loss": 1.2249,
      "step": 2320
    },
    {
      "epoch": 0.15400885716174234,
      "grad_norm": 9.788006782531738,
      "learning_rate": 1.2752115137196341e-05,
      "loss": 1.1832,
      "step": 2330
    },
    {
      "epoch": 0.15466983938132065,
      "grad_norm": 5.940231800079346,
      "learning_rate": 1.2681113423549334e-05,
      "loss": 1.0796,
      "step": 2340
    },
    {
      "epoch": 0.15533082160089895,
      "grad_norm": 5.606922149658203,
      "learning_rate": 1.2610050018980385e-05,
      "loss": 0.9388,
      "step": 2350
    },
    {
      "epoch": 0.15599180382047723,
      "grad_norm": 6.812578201293945,
      "learning_rate": 1.2538927844627726e-05,
      "loss": 1.12,
      "step": 2360
    },
    {
      "epoch": 0.15665278604005553,
      "grad_norm": 10.468450546264648,
      "learning_rate": 1.2467749824045373e-05,
      "loss": 1.1143,
      "step": 2370
    },
    {
      "epoch": 0.1573137682596338,
      "grad_norm": 6.699043273925781,
      "learning_rate": 1.2396518883082966e-05,
      "loss": 1.1317,
      "step": 2380
    },
    {
      "epoch": 0.1579747504792121,
      "grad_norm": 11.339058876037598,
      "learning_rate": 1.2325237949765496e-05,
      "loss": 1.1824,
      "step": 2390
    },
    {
      "epoch": 0.15863573269879042,
      "grad_norm": 6.434577941894531,
      "learning_rate": 1.225390995417295e-05,
      "loss": 1.0624,
      "step": 2400
    },
    {
      "epoch": 0.15863573269879042,
      "eval_loss": 1.253835678100586,
      "eval_runtime": 47.0266,
      "eval_samples_per_second": 10.654,
      "eval_steps_per_second": 10.654,
      "step": 2400
    },
    {
      "epoch": 0.1592967149183687,
      "grad_norm": 10.957035064697266,
      "learning_rate": 1.2182537828319848e-05,
      "loss": 1.265,
      "step": 2410
    },
    {
      "epoch": 0.159957697137947,
      "grad_norm": 12.669862747192383,
      "learning_rate": 1.2111124506034739e-05,
      "loss": 1.1453,
      "step": 2420
    },
    {
      "epoch": 0.16061867935752527,
      "grad_norm": 12.645952224731445,
      "learning_rate": 1.2039672922839598e-05,
      "loss": 1.1506,
      "step": 2430
    },
    {
      "epoch": 0.16127966157710358,
      "grad_norm": 12.920147895812988,
      "learning_rate": 1.196818601582915e-05,
      "loss": 1.0976,
      "step": 2440
    },
    {
      "epoch": 0.16194064379668188,
      "grad_norm": 13.062854766845703,
      "learning_rate": 1.189666672355015e-05,
      "loss": 1.3518,
      "step": 2450
    },
    {
      "epoch": 0.16260162601626016,
      "grad_norm": 5.583253860473633,
      "learning_rate": 1.1825117985880576e-05,
      "loss": 1.0854,
      "step": 2460
    },
    {
      "epoch": 0.16326260823583846,
      "grad_norm": 12.410826683044434,
      "learning_rate": 1.1753542743908802e-05,
      "loss": 1.1561,
      "step": 2470
    },
    {
      "epoch": 0.16392359045541674,
      "grad_norm": 11.445279121398926,
      "learning_rate": 1.1681943939812688e-05,
      "loss": 1.3584,
      "step": 2480
    },
    {
      "epoch": 0.16458457267499504,
      "grad_norm": 6.8058342933654785,
      "learning_rate": 1.1610324516738626e-05,
      "loss": 1.2373,
      "step": 2490
    },
    {
      "epoch": 0.16524555489457335,
      "grad_norm": 10.376558303833008,
      "learning_rate": 1.1538687418680596e-05,
      "loss": 1.0921,
      "step": 2500
    },
    {
      "epoch": 0.16590653711415163,
      "grad_norm": 6.7869791984558105,
      "learning_rate": 1.1467035590359106e-05,
      "loss": 1.2743,
      "step": 2510
    },
    {
      "epoch": 0.16656751933372993,
      "grad_norm": 12.313713073730469,
      "learning_rate": 1.139537197710018e-05,
      "loss": 1.1243,
      "step": 2520
    },
    {
      "epoch": 0.1672285015533082,
      "grad_norm": 11.535476684570312,
      "learning_rate": 1.1323699524714278e-05,
      "loss": 1.2232,
      "step": 2530
    },
    {
      "epoch": 0.1678894837728865,
      "grad_norm": 9.248635292053223,
      "learning_rate": 1.1252021179375192e-05,
      "loss": 1.0689,
      "step": 2540
    },
    {
      "epoch": 0.16855046599246482,
      "grad_norm": 10.689653396606445,
      "learning_rate": 1.118033988749895e-05,
      "loss": 1.2617,
      "step": 2550
    },
    {
      "epoch": 0.16855046599246482,
      "eval_loss": 1.2488397359848022,
      "eval_runtime": 52.0382,
      "eval_samples_per_second": 9.628,
      "eval_steps_per_second": 9.628,
      "step": 2550
    },
    {
      "epoch": 0.1692114482120431,
      "grad_norm": 12.502510070800781,
      "learning_rate": 1.1108658595622709e-05,
      "loss": 1.2023,
      "step": 2560
    },
    {
      "epoch": 0.1698724304316214,
      "grad_norm": 11.087409973144531,
      "learning_rate": 1.1036980250283621e-05,
      "loss": 1.2207,
      "step": 2570
    },
    {
      "epoch": 0.17053341265119967,
      "grad_norm": 9.92039680480957,
      "learning_rate": 1.096530779789772e-05,
      "loss": 1.1602,
      "step": 2580
    },
    {
      "epoch": 0.17119439487077798,
      "grad_norm": 5.836206912994385,
      "learning_rate": 1.0893644184638797e-05,
      "loss": 1.0523,
      "step": 2590
    },
    {
      "epoch": 0.17185537709035628,
      "grad_norm": 12.243383407592773,
      "learning_rate": 1.0821992356317307e-05,
      "loss": 1.2196,
      "step": 2600
    },
    {
      "epoch": 0.17251635930993456,
      "grad_norm": 6.7921366691589355,
      "learning_rate": 1.0750355258259273e-05,
      "loss": 1.2333,
      "step": 2610
    },
    {
      "epoch": 0.17317734152951286,
      "grad_norm": 11.758354187011719,
      "learning_rate": 1.0678735835185219e-05,
      "loss": 1.1695,
      "step": 2620
    },
    {
      "epoch": 0.17383832374909114,
      "grad_norm": 12.446253776550293,
      "learning_rate": 1.06071370310891e-05,
      "loss": 1.1428,
      "step": 2630
    },
    {
      "epoch": 0.17449930596866944,
      "grad_norm": 7.370149612426758,
      "learning_rate": 1.0535561789117327e-05,
      "loss": 1.262,
      "step": 2640
    },
    {
      "epoch": 0.17516028818824775,
      "grad_norm": 10.489151954650879,
      "learning_rate": 1.0464013051447755e-05,
      "loss": 1.0921,
      "step": 2650
    },
    {
      "epoch": 0.17582127040782602,
      "grad_norm": 10.34467887878418,
      "learning_rate": 1.0392493759168751e-05,
      "loss": 1.1942,
      "step": 2660
    },
    {
      "epoch": 0.17648225262740433,
      "grad_norm": 11.04796314239502,
      "learning_rate": 1.0321006852158306e-05,
      "loss": 1.0937,
      "step": 2670
    },
    {
      "epoch": 0.1771432348469826,
      "grad_norm": 12.193102836608887,
      "learning_rate": 1.0249555268963164e-05,
      "loss": 1.1015,
      "step": 2680
    },
    {
      "epoch": 0.1778042170665609,
      "grad_norm": 11.928840637207031,
      "learning_rate": 1.0178141946678054e-05,
      "loss": 1.2069,
      "step": 2690
    },
    {
      "epoch": 0.17846519928613921,
      "grad_norm": 6.055873870849609,
      "learning_rate": 1.0106769820824951e-05,
      "loss": 1.0915,
      "step": 2700
    },
    {
      "epoch": 0.17846519928613921,
      "eval_loss": 1.246018409729004,
      "eval_runtime": 47.997,
      "eval_samples_per_second": 10.438,
      "eval_steps_per_second": 10.438,
      "step": 2700
    },
    {
      "epoch": 0.1791261815057175,
      "grad_norm": 7.3669586181640625,
      "learning_rate": 1.0035441825232406e-05,
      "loss": 1.0824,
      "step": 2710
    },
    {
      "epoch": 0.1797871637252958,
      "grad_norm": 12.520928382873535,
      "learning_rate": 9.964160891914937e-06,
      "loss": 1.1395,
      "step": 2720
    },
    {
      "epoch": 0.18044814594487407,
      "grad_norm": 6.952485084533691,
      "learning_rate": 9.892929950952532e-06,
      "loss": 1.1727,
      "step": 2730
    },
    {
      "epoch": 0.18110912816445238,
      "grad_norm": 10.507661819458008,
      "learning_rate": 9.821751930370177e-06,
      "loss": 1.184,
      "step": 2740
    },
    {
      "epoch": 0.18177011038403068,
      "grad_norm": 12.77137279510498,
      "learning_rate": 9.750629756017514e-06,
      "loss": 1.228,
      "step": 2750
    },
    {
      "epoch": 0.18243109260360896,
      "grad_norm": 7.609248161315918,
      "learning_rate": 9.679566351448571e-06,
      "loss": 1.1315,
      "step": 2760
    },
    {
      "epoch": 0.18309207482318726,
      "grad_norm": 11.428009986877441,
      "learning_rate": 9.608564637801562e-06,
      "loss": 1.041,
      "step": 2770
    },
    {
      "epoch": 0.18375305704276554,
      "grad_norm": 12.582087516784668,
      "learning_rate": 9.537627533678842e-06,
      "loss": 1.1608,
      "step": 2780
    },
    {
      "epoch": 0.18441403926234384,
      "grad_norm": 10.488136291503906,
      "learning_rate": 9.466757955026925e-06,
      "loss": 1.0935,
      "step": 2790
    },
    {
      "epoch": 0.18507502148192215,
      "grad_norm": 12.54319953918457,
      "learning_rate": 9.395958815016618e-06,
      "loss": 1.1654,
      "step": 2800
    },
    {
      "epoch": 0.18573600370150042,
      "grad_norm": 10.314374923706055,
      "learning_rate": 9.325233023923252e-06,
      "loss": 1.2293,
      "step": 2810
    },
    {
      "epoch": 0.18639698592107873,
      "grad_norm": 7.015604496002197,
      "learning_rate": 9.25458348900709e-06,
      "loss": 1.0994,
      "step": 2820
    },
    {
      "epoch": 0.187057968140657,
      "grad_norm": 6.349636554718018,
      "learning_rate": 9.1840131143938e-06,
      "loss": 1.2272,
      "step": 2830
    },
    {
      "epoch": 0.1877189503602353,
      "grad_norm": 9.584831237792969,
      "learning_rate": 9.113524800955074e-06,
      "loss": 1.1187,
      "step": 2840
    },
    {
      "epoch": 0.1883799325798136,
      "grad_norm": 4.967813491821289,
      "learning_rate": 9.043121446189398e-06,
      "loss": 1.0012,
      "step": 2850
    },
    {
      "epoch": 0.1883799325798136,
      "eval_loss": 1.2398909330368042,
      "eval_runtime": 53.5377,
      "eval_samples_per_second": 9.358,
      "eval_steps_per_second": 9.358,
      "step": 2850
    },
    {
      "epoch": 0.1890409147993919,
      "grad_norm": 11.762967109680176,
      "learning_rate": 8.972805944102928e-06,
      "loss": 1.1628,
      "step": 2860
    },
    {
      "epoch": 0.1897018970189702,
      "grad_norm": 9.806082725524902,
      "learning_rate": 8.902581185090555e-06,
      "loss": 1.0982,
      "step": 2870
    },
    {
      "epoch": 0.19036287923854847,
      "grad_norm": 5.619679927825928,
      "learning_rate": 8.832450055817064e-06,
      "loss": 1.1545,
      "step": 2880
    },
    {
      "epoch": 0.19102386145812678,
      "grad_norm": 12.290181159973145,
      "learning_rate": 8.7624154390985e-06,
      "loss": 1.1625,
      "step": 2890
    },
    {
      "epoch": 0.19168484367770508,
      "grad_norm": 12.353217124938965,
      "learning_rate": 8.692480213783649e-06,
      "loss": 1.159,
      "step": 2900
    },
    {
      "epoch": 0.19234582589728336,
      "grad_norm": 9.661192893981934,
      "learning_rate": 8.622647254635703e-06,
      "loss": 1.2334,
      "step": 2910
    },
    {
      "epoch": 0.19300680811686166,
      "grad_norm": 10.236005783081055,
      "learning_rate": 8.552919432214097e-06,
      "loss": 1.1434,
      "step": 2920
    },
    {
      "epoch": 0.19366779033643994,
      "grad_norm": 11.429096221923828,
      "learning_rate": 8.483299612756505e-06,
      "loss": 1.2204,
      "step": 2930
    },
    {
      "epoch": 0.19432877255601824,
      "grad_norm": 7.723197937011719,
      "learning_rate": 8.413790658061028e-06,
      "loss": 1.2049,
      "step": 2940
    },
    {
      "epoch": 0.19498975477559655,
      "grad_norm": 9.042826652526855,
      "learning_rate": 8.344395425368537e-06,
      "loss": 1.1231,
      "step": 2950
    },
    {
      "epoch": 0.19565073699517482,
      "grad_norm": 11.260157585144043,
      "learning_rate": 8.275116767245251e-06,
      "loss": 1.1543,
      "step": 2960
    },
    {
      "epoch": 0.19631171921475313,
      "grad_norm": 5.6008830070495605,
      "learning_rate": 8.205957531465456e-06,
      "loss": 1.0243,
      "step": 2970
    },
    {
      "epoch": 0.1969727014343314,
      "grad_norm": 5.492390155792236,
      "learning_rate": 8.136920560894458e-06,
      "loss": 1.2962,
      "step": 2980
    },
    {
      "epoch": 0.1976336836539097,
      "grad_norm": 10.791748046875,
      "learning_rate": 8.068008693371723e-06,
      "loss": 1.0384,
      "step": 2990
    },
    {
      "epoch": 0.198294665873488,
      "grad_norm": 6.472116470336914,
      "learning_rate": 7.999224761594206e-06,
      "loss": 1.0479,
      "step": 3000
    },
    {
      "epoch": 0.198294665873488,
      "eval_loss": 1.2349213361740112,
      "eval_runtime": 53.0521,
      "eval_samples_per_second": 9.444,
      "eval_steps_per_second": 9.444,
      "step": 3000
    },
    {
      "epoch": 0.1989556480930663,
      "grad_norm": 7.443964958190918,
      "learning_rate": 7.930571592999942e-06,
      "loss": 1.1367,
      "step": 3010
    },
    {
      "epoch": 0.1996166303126446,
      "grad_norm": 7.271074295043945,
      "learning_rate": 7.86205200965179e-06,
      "loss": 1.1435,
      "step": 3020
    },
    {
      "epoch": 0.20027761253222287,
      "grad_norm": 12.19694995880127,
      "learning_rate": 7.793668828121457e-06,
      "loss": 1.274,
      "step": 3030
    },
    {
      "epoch": 0.20093859475180118,
      "grad_norm": 6.130085468292236,
      "learning_rate": 7.725424859373688e-06,
      "loss": 1.1887,
      "step": 3040
    },
    {
      "epoch": 0.20159957697137948,
      "grad_norm": 8.441886901855469,
      "learning_rate": 7.65732290865075e-06,
      "loss": 1.1228,
      "step": 3050
    },
    {
      "epoch": 0.20226055919095776,
      "grad_norm": 10.298881530761719,
      "learning_rate": 7.589365775357096e-06,
      "loss": 1.1681,
      "step": 3060
    },
    {
      "epoch": 0.20292154141053606,
      "grad_norm": 5.6892218589782715,
      "learning_rate": 7.52155625294431e-06,
      "loss": 1.1967,
      "step": 3070
    },
    {
      "epoch": 0.20358252363011434,
      "grad_norm": 4.733664035797119,
      "learning_rate": 7.453897128796269e-06,
      "loss": 0.9874,
      "step": 3080
    },
    {
      "epoch": 0.20424350584969264,
      "grad_norm": 6.695845603942871,
      "learning_rate": 7.386391184114558e-06,
      "loss": 1.2284,
      "step": 3090
    },
    {
      "epoch": 0.20490448806927095,
      "grad_norm": 11.191842079162598,
      "learning_rate": 7.319041193804161e-06,
      "loss": 1.2232,
      "step": 3100
    },
    {
      "epoch": 0.20556547028884922,
      "grad_norm": 6.132591724395752,
      "learning_rate": 7.2518499263593866e-06,
      "loss": 1.12,
      "step": 3110
    },
    {
      "epoch": 0.20622645250842753,
      "grad_norm": 11.867471694946289,
      "learning_rate": 7.184820143750079e-06,
      "loss": 1.1889,
      "step": 3120
    },
    {
      "epoch": 0.2068874347280058,
      "grad_norm": 10.931007385253906,
      "learning_rate": 7.117954601308052e-06,
| "loss": 1.2347, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.2075484169475841, | |
| "grad_norm": 12.895480155944824, | |
| "learning_rate": 7.051256047613866e-06, | |
| "loss": 1.216, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.2082093991671624, | |
| "grad_norm": 10.634278297424316, | |
| "learning_rate": 6.984727224383822e-06, | |
| "loss": 1.1687, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.2082093991671624, | |
| "eval_loss": 1.2307320833206177, | |
| "eval_runtime": 58.1752, | |
| "eval_samples_per_second": 8.612, | |
| "eval_steps_per_second": 8.612, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.2088703813867407, | |
| "grad_norm": 11.298223495483398, | |
| "learning_rate": 6.918370866357266e-06, | |
| "loss": 1.1429, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.209531363606319, | |
| "grad_norm": 5.801537036895752, | |
| "learning_rate": 6.852189701184183e-06, | |
| "loss": 1.1809, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.21019234582589727, | |
| "grad_norm": 11.565352439880371, | |
| "learning_rate": 6.786186449313051e-06, | |
| "loss": 1.1068, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.21085332804547557, | |
| "grad_norm": 9.563201904296875, | |
| "learning_rate": 6.720363823879042e-06, | |
| "loss": 1.1438, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.21151431026505388, | |
| "grad_norm": 3.7967348098754883, | |
| "learning_rate": 6.6547245305924765e-06, | |
| "loss": 1.1022, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.21217529248463216, | |
| "grad_norm": 9.867331504821777, | |
| "learning_rate": 6.589271267627615e-06, | |
| "loss": 1.0329, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.21283627470421046, | |
| "grad_norm": 10.908332824707031, | |
| "learning_rate": 6.524006725511727e-06, | |
| "loss": 1.0811, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.21349725692378874, | |
| "grad_norm": 11.866363525390625, | |
| "learning_rate": 6.4589335870145165e-06, | |
| "loss": 1.1611, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.21415823914336704, | |
| "grad_norm": 12.108943939208984, | |
| "learning_rate": 6.394054527037837e-06, | |
| "loss": 1.1558, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.21481922136294535, | |
| "grad_norm": 11.09125804901123, | |
| "learning_rate": 6.329372212505727e-06, | |
| "loss": 1.1853, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.21548020358252362, | |
| "grad_norm": 12.74525260925293, | |
| "learning_rate": 6.264889302254797e-06, | |
| "loss": 1.1862, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.21614118580210193, | |
| "grad_norm": 9.876714706420898, | |
| "learning_rate": 6.200608446924922e-06, | |
| "loss": 1.1651, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.21680216802168023, | |
| "grad_norm": 9.700896263122559, | |
| "learning_rate": 6.136532288850295e-06, | |
| "loss": 1.2345, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.2174631502412585, | |
| "grad_norm": 10.941569328308105, | |
| "learning_rate": 6.072663461950806e-06, | |
| "loss": 1.0379, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.2181241324608368, | |
| "grad_norm": 13.29504108428955, | |
| "learning_rate": 6.009004591623776e-06, | |
| "loss": 1.1251, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.2181241324608368, | |
| "eval_loss": 1.2260839939117432, | |
| "eval_runtime": 47.8562, | |
| "eval_samples_per_second": 10.469, | |
| "eval_steps_per_second": 10.469, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.2187851146804151, | |
| "grad_norm": 8.1751708984375, | |
| "learning_rate": 5.945558294636019e-06, | |
| "loss": 1.1452, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.2194460968999934, | |
| "grad_norm": 12.451173782348633, | |
| "learning_rate": 5.882327179016307e-06, | |
| "loss": 1.217, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.2201070791195717, | |
| "grad_norm": 11.116937637329102, | |
| "learning_rate": 5.819313843948146e-06, | |
| "loss": 1.1602, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.22076806133914997, | |
| "grad_norm": 10.272557258605957, | |
| "learning_rate": 5.756520879662929e-06, | |
| "loss": 1.2616, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.22142904355872828, | |
| "grad_norm": 10.73164176940918, | |
| "learning_rate": 5.693950867333488e-06, | |
| "loss": 1.2448, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.22209002577830655, | |
| "grad_norm": 11.405309677124023, | |
| "learning_rate": 5.6316063789679415e-06, | |
| "loss": 1.2419, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.22275100799788486, | |
| "grad_norm": 6.117231369018555, | |
| "learning_rate": 5.569489977304029e-06, | |
| "loss": 1.2027, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.22341199021746316, | |
| "grad_norm": 12.008468627929688, | |
| "learning_rate": 5.507604215703729e-06, | |
| "loss": 1.1525, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.22407297243704144, | |
| "grad_norm": 6.268473148345947, | |
| "learning_rate": 5.44595163804831e-06, | |
| "loss": 1.1422, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.22473395465661974, | |
| "grad_norm": 14.515848159790039, | |
| "learning_rate": 5.384534778633763e-06, | |
| "loss": 1.0998, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.22539493687619802, | |
| "grad_norm": 10.610064506530762, | |
| "learning_rate": 5.323356162066626e-06, | |
| "loss": 1.2074, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.22605591909577633, | |
| "grad_norm": 11.648080825805664, | |
| "learning_rate": 5.262418303160206e-06, | |
| "loss": 1.0755, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.22671690131535463, | |
| "grad_norm": 6.210646629333496, | |
| "learning_rate": 5.201723706831204e-06, | |
| "loss": 1.1203, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.2273778835349329, | |
| "grad_norm": 4.218708038330078, | |
| "learning_rate": 5.141274867996755e-06, | |
| "loss": 0.9939, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.2280388657545112, | |
| "grad_norm": 8.179903030395508, | |
| "learning_rate": 5.081074271471855e-06, | |
| "loss": 1.0597, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.2280388657545112, | |
| "eval_loss": 1.2263822555541992, | |
| "eval_runtime": 52.855, | |
| "eval_samples_per_second": 9.479, | |
| "eval_steps_per_second": 9.479, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.2286998479740895, | |
| "grad_norm": 13.975303649902344, | |
| "learning_rate": 5.021124391867241e-06, | |
| "loss": 1.1898, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.2293608301936678, | |
| "grad_norm": 11.902430534362793, | |
| "learning_rate": 4.961427693487654e-06, | |
| "loss": 1.2382, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.2300218124132461, | |
| "grad_norm": 7.363813877105713, | |
| "learning_rate": 4.901986630230549e-06, | |
| "loss": 1.1337, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.23068279463282437, | |
| "grad_norm": 14.231773376464844, | |
| "learning_rate": 4.842803645485228e-06, | |
| "loss": 1.2631, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.23134377685240268, | |
| "grad_norm": 13.055315971374512, | |
| "learning_rate": 4.7838811720323795e-06, | |
| "loss": 1.2307, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.23200475907198095, | |
| "grad_norm": 11.109673500061035, | |
| "learning_rate": 4.725221631944109e-06, | |
| "loss": 1.0673, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.23266574129155926, | |
| "grad_norm": 9.12000560760498, | |
| "learning_rate": 4.666827436484355e-06, | |
| "loss": 1.2818, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.23332672351113756, | |
| "grad_norm": 11.266242980957031, | |
| "learning_rate": 4.60870098600978e-06, | |
| "loss": 0.9892, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.23398770573071584, | |
| "grad_norm": 13.089488983154297, | |
| "learning_rate": 4.550844669871095e-06, | |
| "loss": 1.1585, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.23464868795029414, | |
| "grad_norm": 9.938103675842285, | |
| "learning_rate": 4.493260866314851e-06, | |
| "loss": 1.1734, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.23530967016987242, | |
| "grad_norm": 10.093935012817383, | |
| "learning_rate": 4.435951942385671e-06, | |
| "loss": 1.1185, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.23597065238945072, | |
| "grad_norm": 4.782352924346924, | |
| "learning_rate": 4.378920253828953e-06, | |
| "loss": 1.1413, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.23663163460902903, | |
| "grad_norm": 11.091765403747559, | |
| "learning_rate": 4.322168144994041e-06, | |
| "loss": 1.2909, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.2372926168286073, | |
| "grad_norm": 10.81592845916748, | |
| "learning_rate": 4.265697948737836e-06, | |
| "loss": 1.2501, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.2379535990481856, | |
| "grad_norm": 11.043889045715332, | |
| "learning_rate": 4.209511986328935e-06, | |
| "loss": 1.1757, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.2379535990481856, | |
| "eval_loss": 1.223681092262268, | |
| "eval_runtime": 54.0238, | |
| "eval_samples_per_second": 9.274, | |
| "eval_steps_per_second": 9.274, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.2386145812677639, | |
| "grad_norm": 6.890323638916016, | |
| "learning_rate": 4.153612567352186e-06, | |
| "loss": 1.0562, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.2392755634873422, | |
| "grad_norm": 8.741559028625488, | |
| "learning_rate": 4.098001989613763e-06, | |
| "loss": 1.1737, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.2399365457069205, | |
| "grad_norm": 12.617691993713379, | |
| "learning_rate": 4.042682539046698e-06, | |
| "loss": 1.2365, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.24059752792649877, | |
| "grad_norm": 6.839216232299805, | |
| "learning_rate": 3.987656489616937e-06, | |
| "loss": 1.1941, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.24125851014607708, | |
| "grad_norm": 10.760446548461914, | |
| "learning_rate": 3.932926103229849e-06, | |
| "loss": 1.1187, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.24191949236565535, | |
| "grad_norm": 7.493879795074463, | |
| "learning_rate": 3.878493629637249e-06, | |
| "loss": 1.1193, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.24258047458523366, | |
| "grad_norm": 8.233012199401855, | |
| "learning_rate": 3.824361306344942e-06, | |
| "loss": 1.1905, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.24324145680481196, | |
| "grad_norm": 8.992157936096191, | |
| "learning_rate": 3.7705313585207056e-06, | |
| "loss": 1.0877, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.24390243902439024, | |
| "grad_norm": 13.892884254455566, | |
| "learning_rate": 3.717005998902859e-06, | |
| "loss": 1.1345, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.24456342124396854, | |
| "grad_norm": 10.53703784942627, | |
| "learning_rate": 3.6637874277092946e-06, | |
| "loss": 1.1473, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.24522440346354682, | |
| "grad_norm": 5.2873406410217285, | |
| "learning_rate": 3.610877832547034e-06, | |
| "loss": 1.0317, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.24588538568312512, | |
| "grad_norm": 8.536104202270508, | |
| "learning_rate": 3.5582793883222923e-06, | |
| "loss": 1.0296, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.24654636790270343, | |
| "grad_norm": 7.4764227867126465, | |
| "learning_rate": 3.5059942571511037e-06, | |
| "loss": 1.0728, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.2472073501222817, | |
| "grad_norm": 9.194038391113281, | |
| "learning_rate": 3.4540245882704213e-06, | |
| "loss": 1.1157, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.24786833234186, | |
| "grad_norm": 10.502184867858887, | |
| "learning_rate": 3.4023725179497848e-06, | |
| "loss": 1.1923, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.24786833234186, | |
| "eval_loss": 1.2212793827056885, | |
| "eval_runtime": 53.4315, | |
| "eval_samples_per_second": 9.376, | |
| "eval_steps_per_second": 9.376, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.24852931456143829, | |
| "grad_norm": 7.8659234046936035, | |
| "learning_rate": 3.351040169403499e-06, | |
| "loss": 1.0991, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.2491902967810166, | |
| "grad_norm": 8.55827808380127, | |
| "learning_rate": 3.30002965270335e-06, | |
| "loss": 1.0168, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.2498512790005949, | |
| "grad_norm": 10.08139705657959, | |
| "learning_rate": 3.2493430646918865e-06, | |
| "loss": 1.188, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.25051226122017317, | |
| "grad_norm": 7.772961139678955, | |
| "learning_rate": 3.1989824888962225e-06, | |
| "loss": 1.1373, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.25117324343975145, | |
| "grad_norm": 7.485221862792969, | |
| "learning_rate": 3.1489499954423797e-06, | |
| "loss": 1.2637, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.2518342256593298, | |
| "grad_norm": 14.595245361328125, | |
| "learning_rate": 3.0992476409701936e-06, | |
| "loss": 1.1433, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.25249520787890806, | |
| "grad_norm": 11.104635238647461, | |
| "learning_rate": 3.0498774685487882e-06, | |
| "loss": 1.1773, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.25315619009848633, | |
| "grad_norm": 6.462589263916016, | |
| "learning_rate": 3.000841507592583e-06, | |
| "loss": 1.0087, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.25381717231806467, | |
| "grad_norm": 12.072765350341797, | |
| "learning_rate": 2.9521417737778717e-06, | |
| "loss": 1.0804, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.25447815453764294, | |
| "grad_norm": 11.500109672546387, | |
| "learning_rate": 2.9037802689599704e-06, | |
| "loss": 1.1597, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.2551391367572212, | |
| "grad_norm": 8.149591445922852, | |
| "learning_rate": 2.855758981090918e-06, | |
| "loss": 1.2028, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.25580011897679955, | |
| "grad_norm": 11.354681015014648, | |
| "learning_rate": 2.8080798841377743e-06, | |
| "loss": 1.1725, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.2564611011963778, | |
| "grad_norm": 9.085524559020996, | |
| "learning_rate": 2.7607449380014703e-06, | |
| "loss": 1.2511, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.2571220834159561, | |
| "grad_norm": 10.283825874328613, | |
| "learning_rate": 2.713756088436244e-06, | |
| "loss": 1.1444, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.2577830656355344, | |
| "grad_norm": 11.607617378234863, | |
| "learning_rate": 2.6671152669696515e-06, | |
| "loss": 1.1419, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.2577830656355344, | |
| "eval_loss": 1.2201364040374756, | |
| "eval_runtime": 55.3983, | |
| "eval_samples_per_second": 9.044, | |
| "eval_steps_per_second": 9.044, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.2584440478551127, | |
| "grad_norm": 7.006284713745117, | |
| "learning_rate": 2.6208243908231916e-06, | |
| "loss": 1.0414, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.259105030074691, | |
| "grad_norm": 10.41873550415039, | |
| "learning_rate": 2.57488536283347e-06, | |
| "loss": 1.1597, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.25976601229426927, | |
| "grad_norm": 9.293778419494629, | |
| "learning_rate": 2.5293000713739977e-06, | |
| "loss": 1.182, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.2604269945138476, | |
| "grad_norm": 11.898356437683105, | |
| "learning_rate": 2.4840703902775642e-06, | |
| "loss": 1.2502, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.2610879767334259, | |
| "grad_norm": 9.323407173156738, | |
| "learning_rate": 2.4391981787592005e-06, | |
| "loss": 1.0892, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.26174895895300415, | |
| "grad_norm": 11.664414405822754, | |
| "learning_rate": 2.3946852813397737e-06, | |
| "loss": 1.1837, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.2624099411725825, | |
| "grad_norm": 11.392061233520508, | |
| "learning_rate": 2.3505335277701494e-06, | |
| "loss": 1.0029, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 0.26307092339216076, | |
| "grad_norm": 10.388303756713867, | |
| "learning_rate": 2.306744732955991e-06, | |
| "loss": 1.172, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.26373190561173904, | |
| "grad_norm": 11.332767486572266, | |
| "learning_rate": 2.2633206968831374e-06, | |
| "loss": 1.1951, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 0.2643928878313173, | |
| "grad_norm": 4.8323259353637695, | |
| "learning_rate": 2.220263204543635e-06, | |
| "loss": 1.0181, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.26505387005089565, | |
| "grad_norm": 11.138567924499512, | |
| "learning_rate": 2.1775740258623492e-06, | |
| "loss": 1.1295, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 0.2657148522704739, | |
| "grad_norm": 7.644820690155029, | |
| "learning_rate": 2.1352549156242126e-06, | |
| "loss": 1.1392, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.2663758344900522, | |
| "grad_norm": 11.998611450195312, | |
| "learning_rate": 2.0933076134020958e-06, | |
| "loss": 1.1516, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 0.26703681670963053, | |
| "grad_norm": 9.40128231048584, | |
| "learning_rate": 2.0517338434852946e-06, | |
| "loss": 1.1157, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.2676977989292088, | |
| "grad_norm": 7.291782379150391, | |
| "learning_rate": 2.010535314808659e-06, | |
| "loss": 1.1069, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.2676977989292088, | |
| "eval_loss": 1.2179657220840454, | |
| "eval_runtime": 53.213, | |
| "eval_samples_per_second": 9.415, | |
| "eval_steps_per_second": 9.415, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.2683587811487871, | |
| "grad_norm": 11.658596992492676, | |
| "learning_rate": 1.9697137208823396e-06, | |
| "loss": 1.172, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.2690197633683654, | |
| "grad_norm": 5.082404613494873, | |
| "learning_rate": 1.9292707397221775e-06, | |
| "loss": 1.1331, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 0.2696807455879437, | |
| "grad_norm": 13.126559257507324, | |
| "learning_rate": 1.8892080337807171e-06, | |
| "loss": 1.1899, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.27034172780752197, | |
| "grad_norm": 11.264731407165527, | |
| "learning_rate": 1.8495272498788887e-06, | |
| "loss": 1.0929, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 0.27100271002710025, | |
| "grad_norm": 12.232498168945312, | |
| "learning_rate": 1.8102300191383008e-06, | |
| "loss": 1.1517, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.2716636922466786, | |
| "grad_norm": 6.517210483551025, | |
| "learning_rate": 1.7713179569141897e-06, | |
| "loss": 1.1451, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 0.27232467446625686, | |
| "grad_norm": 10.073516845703125, | |
| "learning_rate": 1.7327926627290298e-06, | |
| "loss": 1.1757, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.27298565668583513, | |
| "grad_norm": 10.904183387756348, | |
| "learning_rate": 1.6946557202067662e-06, | |
| "loss": 1.201, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 0.27364663890541346, | |
| "grad_norm": 9.502151489257812, | |
| "learning_rate": 1.6569086970077352e-06, | |
| "loss": 1.1649, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.27430762112499174, | |
| "grad_norm": 12.71923542022705, | |
| "learning_rate": 1.6195531447642177e-06, | |
| "loss": 1.2048, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.27496860334457, | |
| "grad_norm": 13.27767562866211, | |
| "learning_rate": 1.582590599016653e-06, | |
| "loss": 1.0894, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.27562958556414835, | |
| "grad_norm": 12.859643936157227, | |
| "learning_rate": 1.5460225791505258e-06, | |
| "loss": 1.1565, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 0.2762905677837266, | |
| "grad_norm": 6.589792728424072, | |
| "learning_rate": 1.509850588333905e-06, | |
| "loss": 1.0296, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.2769515500033049, | |
| "grad_norm": 13.752243995666504, | |
| "learning_rate": 1.4740761134556557e-06, | |
| "loss": 1.312, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 0.2776125322228832, | |
| "grad_norm": 12.691303253173828, | |
| "learning_rate": 1.4387006250643236e-06, | |
| "loss": 1.1494, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.2776125322228832, | |
| "eval_loss": 1.2168010473251343, | |
| "eval_runtime": 51.4283, | |
| "eval_samples_per_second": 9.742, | |
| "eval_steps_per_second": 9.742, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.2782735144424615, | |
| "grad_norm": 11.23477840423584, | |
| "learning_rate": 1.4037255773076804e-06, | |
| "loss": 1.0421, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 0.2789344966620398, | |
| "grad_norm": 10.921051979064941, | |
| "learning_rate": 1.3691524078729481e-06, | |
| "loss": 1.055, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.27959547888161806, | |
| "grad_norm": 7.342863082885742, | |
| "learning_rate": 1.3349825379277099e-06, | |
| "loss": 1.2973, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 0.2802564611011964, | |
| "grad_norm": 11.837105751037598, | |
| "learning_rate": 1.3012173720614862e-06, | |
| "loss": 1.2177, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.2809174433207747, | |
| "grad_norm": 13.415239334106445, | |
| "learning_rate": 1.267858298227995e-06, | |
| "loss": 1.1455, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.28157842554035295, | |
| "grad_norm": 11.301210403442383, | |
| "learning_rate": 1.2349066876881063e-06, | |
| "loss": 1.1602, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.2822394077599313, | |
| "grad_norm": 5.907723903656006, | |
| "learning_rate": 1.202363894953462e-06, | |
| "loss": 1.1053, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 0.28290038997950956, | |
| "grad_norm": 12.926289558410645, | |
| "learning_rate": 1.1702312577308133e-06, | |
| "loss": 1.2056, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.28356137219908784, | |
| "grad_norm": 10.026867866516113, | |
| "learning_rate": 1.1385100968670189e-06, | |
| "loss": 1.1685, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 0.2842223544186661, | |
| "grad_norm": 12.193798065185547, | |
| "learning_rate": 1.107201716294762e-06, | |
| "loss": 1.1253, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.28488333663824444, | |
| "grad_norm": 6.5807294845581055, | |
| "learning_rate": 1.076307402978938e-06, | |
| "loss": 1.1252, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 0.2855443188578227, | |
| "grad_norm": 11.568461418151855, | |
| "learning_rate": 1.0458284268637652e-06, | |
| "loss": 1.2131, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.286205301077401, | |
| "grad_norm": 5.46840238571167, | |
| "learning_rate": 1.0157660408205728e-06, | |
| "loss": 1.0678, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 0.28686628329697933, | |
| "grad_norm": 13.20085334777832, | |
| "learning_rate": 9.861214805963042e-07, | |
| "loss": 1.1974, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 0.2875272655165576, | |
| "grad_norm": 13.585931777954102, | |
| "learning_rate": 9.568959647627223e-07, | |
| "loss": 1.1664, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.2875272655165576, | |
| "eval_loss": 1.21638822555542, | |
| "eval_runtime": 51.7738, | |
| "eval_samples_per_second": 9.677, | |
| "eval_steps_per_second": 9.677, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.2881882477361359, | |
| "grad_norm": 7.628300189971924, | |
| "learning_rate": 9.280906946663111e-07, | |
| "loss": 1.0584, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.2888492299557142, | |
| "grad_norm": 8.380716323852539, | |
| "learning_rate": 8.997068543789051e-07, | |
| "loss": 1.1137, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 0.2895102121752925, | |
| "grad_norm": 12.071667671203613, | |
| "learning_rate": 8.717456106490042e-07, | |
| "loss": 1.0887, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 0.29017119439487077, | |
| "grad_norm": 6.33940315246582, | |
| "learning_rate": 8.442081128538243e-07, | |
| "loss": 1.0145, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 0.29083217661444905, | |
| "grad_norm": 9.972112655639648, | |
| "learning_rate": 8.170954929520389e-07, | |
| "loss": 1.1362, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.2914931588340274, | |
| "grad_norm": 12.998346328735352, | |
| "learning_rate": 7.904088654372622e-07, | |
| "loss": 1.148, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 0.29215414105360565, | |
| "grad_norm": 5.646799087524414, | |
| "learning_rate": 7.641493272922243e-07, | |
| "loss": 1.1281, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 0.29281512327318393, | |
| "grad_norm": 10.702962875366211, | |
| "learning_rate": 7.383179579436903e-07, | |
| "loss": 1.1785, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 0.29347610549276226, | |
| "grad_norm": 5.956870079040527, | |
| "learning_rate": 7.129158192180766e-07, | |
| "loss": 1.1568, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.29413708771234054, | |
| "grad_norm": 11.048665046691895, | |
| "learning_rate": 6.879439552978142e-07, | |
| "loss": 1.0652, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.2947980699319188, | |
| "grad_norm": 5.649775505065918, | |
| "learning_rate": 6.634033926784221e-07, | |
| "loss": 1.1235, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 0.29545905215149715, | |
| "grad_norm": 11.055773735046387, | |
| "learning_rate": 6.392951401263069e-07, | |
| "loss": 1.285, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 0.2961200343710754, | |
| "grad_norm": 7.027043342590332, | |
| "learning_rate": 6.156201886373113e-07, | |
| "loss": 1.209, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.2967810165906537, | |
| "grad_norm": 11.43958854675293, | |
| "learning_rate": 5.923795113959569e-07, | |
| "loss": 1.2139, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 0.297441998810232, | |
| "grad_norm": 11.668280601501465, | |
| "learning_rate": 5.695740637354591e-07, | |
| "loss": 1.2407, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.297441998810232, | |
| "eval_loss": 1.2155283689498901, | |
| "eval_runtime": 48.0067, | |
| "eval_samples_per_second": 10.436, | |
| "eval_steps_per_second": 10.436, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.2981029810298103, | |
| "grad_norm": 10.411969184875488, | |
| "learning_rate": 5.472047830984499e-07, | |
| "loss": 1.1499, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 0.2987639632493886, | |
| "grad_norm": 6.937885761260986, | |
| "learning_rate": 5.252725889984403e-07, | |
| "loss": 1.0297, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.29942494546896686, | |
| "grad_norm": 10.743237495422363, | |
| "learning_rate": 5.037783829820298e-07, | |
| "loss": 1.1198, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 0.3000859276885452, | |
| "grad_norm": 5.665622234344482, | |
| "learning_rate": 4.827230485918372e-07, | |
| "loss": 1.0459, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 0.30074690990812347, | |
| "grad_norm": 9.720799446105957, | |
| "learning_rate": 4.6210745133019236e-07, | |
| "loss": 1.1943, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.30140789212770175, | |
| "grad_norm": 11.57904052734375, | |
| "learning_rate": 4.419324386235529e-07, | |
| "loss": 1.2007, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 0.3020688743472801, | |
| "grad_norm": 10.47191333770752, | |
| "learning_rate": 4.2219883978767386e-07, | |
| "loss": 1.1754, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 0.30272985656685836, | |
| "grad_norm": 8.371639251708984, | |
| "learning_rate": 4.029074659935082e-07, | |
| "loss": 1.0829, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 0.30339083878643663, | |
| "grad_norm": 11.640840530395508, | |
| "learning_rate": 3.8405911023387444e-07, | |
| "loss": 1.0573, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 0.3040518210060149, | |
| "grad_norm": 14.082575798034668, | |
| "learning_rate": 3.6565454729085526e-07, | |
| "loss": 1.2711, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.30471280322559324, | |
| "grad_norm": 8.940695762634277, | |
| "learning_rate": 3.4769453370394753e-07, | |
| "loss": 1.1595, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 0.3053737854451715, | |
| "grad_norm": 7.7234954833984375, | |
| "learning_rate": 3.301798077389637e-07, | |
| "loss": 1.2151, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 0.3060347676647498, | |
| "grad_norm": 4.756081581115723, | |
| "learning_rate": 3.1311108935768926e-07, | |
| "loss": 1.173, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 0.30669574988432813, | |
| "grad_norm": 10.524628639221191, | |
| "learning_rate": 2.964890801882817e-07, | |
| "loss": 1.0992, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.3073567321039064, | |
| "grad_norm": 6.618716716766357, | |
| "learning_rate": 2.8031446349643393e-07, | |
| "loss": 1.1152, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.3073567321039064, | |
| "eval_loss": 1.2152043581008911, | |
| "eval_runtime": 53.4713, | |
| "eval_samples_per_second": 9.37, | |
| "eval_steps_per_second": 9.37, | |
| "step": 4650 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 5000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 150, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.2041141329494016e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |