{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 594,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.003367003367003367,
      "grad_norm": 4.149549460211877,
      "learning_rate": 1.6666666666666668e-07,
      "loss": 2.1262,
      "step": 1
    },
    {
      "epoch": 0.016835016835016835,
      "grad_norm": 4.010411470158707,
      "learning_rate": 8.333333333333333e-07,
      "loss": 2.0953,
      "step": 5
    },
    {
      "epoch": 0.03367003367003367,
      "grad_norm": 2.095167650626825,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 1.8934,
      "step": 10
    },
    {
      "epoch": 0.050505050505050504,
      "grad_norm": 3.1062734406420245,
      "learning_rate": 2.5e-06,
      "loss": 1.4715,
      "step": 15
    },
    {
      "epoch": 0.06734006734006734,
      "grad_norm": 0.5160710231319299,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.9113,
      "step": 20
    },
    {
      "epoch": 0.08417508417508418,
      "grad_norm": 0.42528861790572803,
      "learning_rate": 4.166666666666667e-06,
      "loss": 0.7253,
      "step": 25
    },
    {
      "epoch": 0.10101010101010101,
      "grad_norm": 0.27763208880764356,
      "learning_rate": 5e-06,
      "loss": 0.664,
      "step": 30
    },
    {
      "epoch": 0.11784511784511785,
      "grad_norm": 0.22702639326519608,
      "learning_rate": 5.833333333333334e-06,
      "loss": 0.5673,
      "step": 35
    },
    {
      "epoch": 0.13468013468013468,
      "grad_norm": 0.20027531882369276,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.5462,
      "step": 40
    },
    {
      "epoch": 0.15151515151515152,
      "grad_norm": 0.22331242368395698,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.5208,
      "step": 45
    },
    {
      "epoch": 0.16835016835016836,
      "grad_norm": 0.2495290013519025,
      "learning_rate": 8.333333333333334e-06,
      "loss": 0.5063,
      "step": 50
    },
    {
      "epoch": 0.18518518518518517,
      "grad_norm": 0.22997291660332314,
      "learning_rate": 9.166666666666666e-06,
      "loss": 0.4763,
      "step": 55
    },
    {
      "epoch": 0.20202020202020202,
      "grad_norm": 0.1854483270058641,
      "learning_rate": 1e-05,
      "loss": 0.4797,
      "step": 60
    },
    {
      "epoch": 0.21885521885521886,
      "grad_norm": 0.19584468158412918,
      "learning_rate": 9.997836953115927e-06,
      "loss": 0.4705,
      "step": 65
    },
    {
      "epoch": 0.2356902356902357,
      "grad_norm": 0.20152991740174855,
      "learning_rate": 9.991349683972435e-06,
      "loss": 0.4539,
      "step": 70
    },
    {
      "epoch": 0.25252525252525254,
      "grad_norm": 0.16770527024092638,
      "learning_rate": 9.980543805476447e-06,
      "loss": 0.4611,
      "step": 75
    },
    {
      "epoch": 0.26936026936026936,
      "grad_norm": 0.1345303934796462,
      "learning_rate": 9.965428667076687e-06,
      "loss": 0.4208,
      "step": 80
    },
    {
      "epoch": 0.28619528619528617,
      "grad_norm": 0.14467263254757398,
      "learning_rate": 9.946017346674362e-06,
      "loss": 0.4291,
      "step": 85
    },
    {
      "epoch": 0.30303030303030304,
      "grad_norm": 0.11675624115717066,
      "learning_rate": 9.922326639307918e-06,
      "loss": 0.4118,
      "step": 90
    },
    {
      "epoch": 0.31986531986531985,
      "grad_norm": 0.12419897003228939,
      "learning_rate": 9.894377042621654e-06,
      "loss": 0.4169,
      "step": 95
    },
    {
      "epoch": 0.3367003367003367,
      "grad_norm": 0.1411171642205521,
      "learning_rate": 9.86219273913078e-06,
      "loss": 0.4336,
      "step": 100
    },
    {
      "epoch": 0.35353535353535354,
      "grad_norm": 0.10846956134773478,
      "learning_rate": 9.825801575298248e-06,
      "loss": 0.3873,
      "step": 105
    },
    {
      "epoch": 0.37037037037037035,
      "grad_norm": 0.12002423610117473,
      "learning_rate": 9.785235037441473e-06,
      "loss": 0.3619,
      "step": 110
    },
    {
      "epoch": 0.3872053872053872,
      "grad_norm": 0.11343783522484456,
      "learning_rate": 9.74052822448978e-06,
      "loss": 0.3993,
      "step": 115
    },
    {
      "epoch": 0.40404040404040403,
      "grad_norm": 0.11192347591145078,
      "learning_rate": 9.691719817616148e-06,
      "loss": 0.4268,
      "step": 120
    },
    {
      "epoch": 0.4208754208754209,
      "grad_norm": 0.1139058653749672,
      "learning_rate": 9.63885204676954e-06,
      "loss": 0.3725,
      "step": 125
    },
    {
      "epoch": 0.4377104377104377,
      "grad_norm": 0.1254583279635675,
      "learning_rate": 9.581970654136752e-06,
      "loss": 0.3645,
      "step": 130
    },
    {
      "epoch": 0.45454545454545453,
      "grad_norm": 0.11137112662931582,
      "learning_rate": 9.521124854565425e-06,
      "loss": 0.3796,
      "step": 135
    },
    {
      "epoch": 0.4713804713804714,
      "grad_norm": 0.12134068097374133,
      "learning_rate": 9.45636729298243e-06,
      "loss": 0.4135,
      "step": 140
    },
    {
      "epoch": 0.4882154882154882,
      "grad_norm": 0.10390787212244738,
      "learning_rate": 9.387753998844482e-06,
      "loss": 0.4136,
      "step": 145
    },
    {
      "epoch": 0.5050505050505051,
      "grad_norm": 0.10067346916725926,
      "learning_rate": 9.315344337660422e-06,
      "loss": 0.3814,
      "step": 150
    },
    {
      "epoch": 0.5218855218855218,
      "grad_norm": 0.12175671218216305,
      "learning_rate": 9.239200959627048e-06,
      "loss": 0.4118,
      "step": 155
    },
    {
      "epoch": 0.5387205387205387,
      "grad_norm": 0.10919894991390798,
      "learning_rate": 9.159389745423003e-06,
      "loss": 0.4231,
      "step": 160
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 0.10398902410046555,
      "learning_rate": 9.07597974920756e-06,
      "loss": 0.4062,
      "step": 165
    },
    {
      "epoch": 0.5723905723905723,
      "grad_norm": 0.100265725065611,
      "learning_rate": 8.98904313887369e-06,
      "loss": 0.3677,
      "step": 170
    },
    {
      "epoch": 0.5892255892255892,
      "grad_norm": 0.099307367621639,
      "learning_rate": 8.89865513360703e-06,
      "loss": 0.3758,
      "step": 175
    },
    {
      "epoch": 0.6060606060606061,
      "grad_norm": 0.09507905089725636,
      "learning_rate": 8.804893938804839e-06,
      "loss": 0.3703,
      "step": 180
    },
    {
      "epoch": 0.622895622895623,
      "grad_norm": 0.11024852756485287,
      "learning_rate": 8.707840678411223e-06,
      "loss": 0.4052,
      "step": 185
    },
    {
      "epoch": 0.6397306397306397,
      "grad_norm": 0.10365136825762392,
      "learning_rate": 8.607579324727175e-06,
      "loss": 0.3955,
      "step": 190
    },
    {
      "epoch": 0.6565656565656566,
      "grad_norm": 0.11091339739436984,
      "learning_rate": 8.504196625756166e-06,
      "loss": 0.3884,
      "step": 195
    },
    {
      "epoch": 0.6734006734006734,
      "grad_norm": 0.10685048356906915,
      "learning_rate": 8.397782030148147e-06,
      "loss": 0.3702,
      "step": 200
    },
    {
      "epoch": 0.6902356902356902,
      "grad_norm": 0.1246306899349896,
      "learning_rate": 8.288427609806899e-06,
      "loss": 0.4024,
      "step": 205
    },
    {
      "epoch": 0.7070707070707071,
      "grad_norm": 0.1066849900460222,
      "learning_rate": 8.176227980227693e-06,
      "loss": 0.4052,
      "step": 210
    },
    {
      "epoch": 0.7239057239057239,
      "grad_norm": 0.11875691065211648,
      "learning_rate": 8.061280218634192e-06,
      "loss": 0.3896,
      "step": 215
    },
    {
      "epoch": 0.7407407407407407,
      "grad_norm": 0.09762435583125909,
      "learning_rate": 7.943683779985412e-06,
      "loss": 0.3868,
      "step": 220
    },
    {
      "epoch": 0.7575757575757576,
      "grad_norm": 0.10054336444833502,
      "learning_rate": 7.823540410925434e-06,
      "loss": 0.3674,
      "step": 225
    },
    {
      "epoch": 0.7744107744107744,
      "grad_norm": 0.10842059524327032,
      "learning_rate": 7.700954061750295e-06,
      "loss": 0.3918,
      "step": 230
    },
    {
      "epoch": 0.7912457912457912,
      "grad_norm": 0.09745155947498828,
      "learning_rate": 7.576030796468233e-06,
      "loss": 0.3679,
      "step": 235
    },
    {
      "epoch": 0.8080808080808081,
      "grad_norm": 0.10372602136364313,
      "learning_rate": 7.4488787010311425e-06,
      "loss": 0.386,
      "step": 240
    },
    {
      "epoch": 0.8249158249158249,
      "grad_norm": 0.10925288274096943,
      "learning_rate": 7.319607789816555e-06,
      "loss": 0.3822,
      "step": 245
    },
    {
      "epoch": 0.8417508417508418,
      "grad_norm": 0.10360616006701923,
      "learning_rate": 7.188329910441154e-06,
      "loss": 0.3656,
      "step": 250
    },
    {
      "epoch": 0.8585858585858586,
      "grad_norm": 0.09128323758171268,
      "learning_rate": 7.05515864698811e-06,
      "loss": 0.3801,
      "step": 255
    },
    {
      "epoch": 0.8754208754208754,
      "grad_norm": 0.09321485064307464,
      "learning_rate": 6.920209221732007e-06,
      "loss": 0.3453,
      "step": 260
    },
    {
      "epoch": 0.8922558922558923,
      "grad_norm": 0.11568566960803582,
      "learning_rate": 6.783598395446371e-06,
      "loss": 0.4071,
      "step": 265
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 0.09087682600845605,
      "learning_rate": 6.64544436638005e-06,
      "loss": 0.3685,
      "step": 270
    },
    {
      "epoch": 0.9259259259259259,
      "grad_norm": 0.08403968322909508,
      "learning_rate": 6.505866667989884e-06,
      "loss": 0.3535,
      "step": 275
    },
    {
      "epoch": 0.9427609427609428,
      "grad_norm": 0.09338895709806748,
      "learning_rate": 6.364986065518106e-06,
      "loss": 0.3425,
      "step": 280
    },
    {
      "epoch": 0.9595959595959596,
      "grad_norm": 0.11009895402833714,
      "learning_rate": 6.222924451504001e-06,
      "loss": 0.3903,
      "step": 285
    },
    {
      "epoch": 0.9764309764309764,
      "grad_norm": 0.0962540505203609,
      "learning_rate": 6.079804740320181e-06,
      "loss": 0.3743,
      "step": 290
    },
    {
      "epoch": 0.9932659932659933,
      "grad_norm": 0.11859376799293515,
      "learning_rate": 5.935750761824777e-06,
      "loss": 0.3911,
      "step": 295
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.3753235638141632,
      "eval_runtime": 28.7233,
      "eval_samples_per_second": 20.506,
      "eval_steps_per_second": 5.153,
      "step": 297
    },
    {
      "epoch": 1.0101010101010102,
      "grad_norm": 0.09638846774704153,
      "learning_rate": 5.790887154221521e-06,
      "loss": 0.3592,
      "step": 300
    },
    {
      "epoch": 1.026936026936027,
      "grad_norm": 0.11218182778453092,
      "learning_rate": 5.645339256220427e-06,
      "loss": 0.3346,
      "step": 305
    },
    {
      "epoch": 1.0437710437710437,
      "grad_norm": 0.0894851822596739,
      "learning_rate": 5.499232998592399e-06,
      "loss": 0.3261,
      "step": 310
    },
    {
      "epoch": 1.0606060606060606,
      "grad_norm": 0.11093920612376884,
      "learning_rate": 5.352694795211555e-06,
      "loss": 0.3812,
      "step": 315
    },
    {
      "epoch": 1.0774410774410774,
      "grad_norm": 0.10664119581590888,
      "learning_rate": 5.20585143367959e-06,
      "loss": 0.3603,
      "step": 320
    },
    {
      "epoch": 1.0942760942760943,
      "grad_norm": 0.09997813791060199,
      "learning_rate": 5.058829965626742e-06,
      "loss": 0.3335,
      "step": 325
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 0.09409979391826565,
      "learning_rate": 4.911757596784358e-06,
      "loss": 0.3611,
      "step": 330
    },
    {
      "epoch": 1.127946127946128,
      "grad_norm": 0.10458387692519078,
      "learning_rate": 4.7647615769241e-06,
      "loss": 0.3335,
      "step": 335
    },
    {
      "epoch": 1.144781144781145,
      "grad_norm": 0.09111079710967974,
      "learning_rate": 4.617969089759066e-06,
      "loss": 0.3428,
      "step": 340
    },
    {
      "epoch": 1.1616161616161615,
      "grad_norm": 0.08443699111109511,
      "learning_rate": 4.471507142902036e-06,
      "loss": 0.3223,
      "step": 345
    },
    {
      "epoch": 1.1784511784511784,
      "grad_norm": 0.09133102611827909,
      "learning_rate": 4.325502457976126e-06,
      "loss": 0.3435,
      "step": 350
    },
    {
      "epoch": 1.1952861952861953,
      "grad_norm": 0.09066367143881884,
      "learning_rate": 4.180081360972852e-06,
      "loss": 0.3651,
      "step": 355
    },
    {
      "epoch": 1.2121212121212122,
      "grad_norm": 0.10084499213383416,
      "learning_rate": 4.035369672952516e-06,
      "loss": 0.3956,
      "step": 360
    },
    {
      "epoch": 1.228956228956229,
      "grad_norm": 0.09491527150886968,
      "learning_rate": 3.891492601181462e-06,
      "loss": 0.3453,
      "step": 365
    },
    {
      "epoch": 1.2457912457912457,
      "grad_norm": 0.1019704740198795,
      "learning_rate": 3.7485746308004013e-06,
      "loss": 0.3479,
      "step": 370
    },
    {
      "epoch": 1.2626262626262625,
      "grad_norm": 0.0968882851124518,
      "learning_rate": 3.6067394171175397e-06,
      "loss": 0.3303,
      "step": 375
    },
    {
      "epoch": 1.2794612794612794,
      "grad_norm": 0.08550746429570652,
      "learning_rate": 3.466109678619681e-06,
      "loss": 0.3269,
      "step": 380
    },
    {
      "epoch": 1.2962962962962963,
      "grad_norm": 0.10176082406674497,
      "learning_rate": 3.3268070907938915e-06,
      "loss": 0.3401,
      "step": 385
    },
    {
      "epoch": 1.3131313131313131,
      "grad_norm": 0.09276368484156704,
      "learning_rate": 3.1889521808515888e-06,
      "loss": 0.3308,
      "step": 390
    },
    {
      "epoch": 1.32996632996633,
      "grad_norm": 0.10581834870886321,
      "learning_rate": 3.0526642234461313e-06,
      "loss": 0.3238,
      "step": 395
    },
    {
      "epoch": 1.3468013468013469,
      "grad_norm": 0.10428102445335462,
      "learning_rate": 2.9180611374741623e-06,
      "loss": 0.3595,
      "step": 400
    },
    {
      "epoch": 1.3636363636363638,
      "grad_norm": 0.0899593621588282,
      "learning_rate": 2.785259384049959e-06,
      "loss": 0.3298,
      "step": 405
    },
    {
      "epoch": 1.3804713804713804,
      "grad_norm": 0.08797336350474357,
      "learning_rate": 2.6543738657411033e-06,
      "loss": 0.354,
      "step": 410
    },
    {
      "epoch": 1.3973063973063973,
      "grad_norm": 0.09021559950024842,
      "learning_rate": 2.525517827152614e-06,
      "loss": 0.3298,
      "step": 415
    },
    {
      "epoch": 1.4141414141414141,
      "grad_norm": 0.10219971409103493,
      "learning_rate": 2.3988027569455895e-06,
      "loss": 0.3468,
      "step": 420
    },
    {
      "epoch": 1.430976430976431,
      "grad_norm": 0.10290620157032898,
      "learning_rate": 2.274338291375147e-06,
      "loss": 0.3577,
      "step": 425
    },
    {
      "epoch": 1.4478114478114479,
      "grad_norm": 0.08826604210110597,
      "learning_rate": 2.1522321194310577e-06,
      "loss": 0.3473,
      "step": 430
    },
    {
      "epoch": 1.4646464646464645,
      "grad_norm": 0.08903591800358818,
      "learning_rate": 2.0325898896632178e-06,
      "loss": 0.3339,
      "step": 435
    },
    {
      "epoch": 1.4814814814814814,
      "grad_norm": 0.08848973658266114,
      "learning_rate": 1.915515118772555e-06,
      "loss": 0.3215,
      "step": 440
    },
    {
      "epoch": 1.4983164983164983,
      "grad_norm": 0.09450509559743611,
      "learning_rate": 1.8011091020464138e-06,
      "loss": 0.3519,
      "step": 445
    },
    {
      "epoch": 1.5151515151515151,
      "grad_norm": 0.08781947425372004,
      "learning_rate": 1.689470825715998e-06,
      "loss": 0.3344,
      "step": 450
    },
    {
      "epoch": 1.531986531986532,
      "grad_norm": 0.09293777777558654,
      "learning_rate": 1.580696881311611e-06,
      "loss": 0.3429,
      "step": 455
    },
    {
      "epoch": 1.5488215488215489,
      "grad_norm": 0.10333592478818118,
      "learning_rate": 1.4748813820898554e-06,
      "loss": 0.3583,
      "step": 460
    },
    {
      "epoch": 1.5656565656565657,
      "grad_norm": 0.10312555456577105,
      "learning_rate": 1.3721158816050872e-06,
      "loss": 0.3245,
      "step": 465
    },
    {
      "epoch": 1.5824915824915826,
      "grad_norm": 0.09235484309875117,
      "learning_rate": 1.272489294495548e-06,
      "loss": 0.3203,
      "step": 470
    },
    {
      "epoch": 1.5993265993265995,
      "grad_norm": 0.09652201115483773,
      "learning_rate": 1.1760878195527642e-06,
      "loss": 0.3346,
      "step": 475
    },
    {
      "epoch": 1.6161616161616161,
      "grad_norm": 0.10091941077870167,
      "learning_rate": 1.0829948651407374e-06,
      "loss": 0.3212,
      "step": 480
    },
    {
      "epoch": 1.632996632996633,
      "grad_norm": 0.08814459685515268,
      "learning_rate": 9.932909770294542e-07,
      "loss": 0.3278,
      "step": 485
    },
    {
      "epoch": 1.6498316498316499,
      "grad_norm": 0.09440301955370818,
      "learning_rate": 9.070537687051817e-07,
      "loss": 0.3194,
      "step": 490
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.08616003726483284,
      "learning_rate": 8.243578542178227e-07,
      "loss": 0.3066,
      "step": 495
    },
    {
      "epoch": 1.6835016835016834,
      "grad_norm": 0.09770744203946605,
      "learning_rate": 7.452747836234392e-07,
      "loss": 0.319,
      "step": 500
    },
    {
      "epoch": 1.7003367003367003,
      "grad_norm": 0.10679236845336801,
      "learning_rate": 6.698729810778065e-07,
      "loss": 0.3533,
      "step": 505
    },
    {
      "epoch": 1.7171717171717171,
      "grad_norm": 0.08984439281172861,
      "learning_rate": 5.982176856345445e-07,
      "loss": 0.3408,
      "step": 510
    },
    {
      "epoch": 1.734006734006734,
      "grad_norm": 0.0992116905666168,
      "learning_rate": 5.303708947990638e-07,
      "loss": 0.3397,
      "step": 515
    },
    {
      "epoch": 1.7508417508417509,
      "grad_norm": 0.08492852347673017,
      "learning_rate": 4.663913108871726e-07,
      "loss": 0.3661,
      "step": 520
    },
    {
      "epoch": 1.7676767676767677,
      "grad_norm": 0.08919997641532293,
      "learning_rate": 4.0633429023472004e-07,
      "loss": 0.3507,
      "step": 525
    },
    {
      "epoch": 1.7845117845117846,
      "grad_norm": 0.08884595301820054,
      "learning_rate": 3.5025179530225995e-07,
      "loss": 0.3475,
      "step": 530
    },
    {
      "epoch": 1.8013468013468015,
      "grad_norm": 0.10375781310984283,
      "learning_rate": 2.9819234971616154e-07,
      "loss": 0.3453,
      "step": 535
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 0.08617062897445908,
      "learning_rate": 2.5020099628504603e-07,
      "loss": 0.3225,
      "step": 540
    },
    {
      "epoch": 1.835016835016835,
      "grad_norm": 0.08459363441220882,
      "learning_rate": 2.0631925802791608e-07,
      "loss": 0.343,
      "step": 545
    },
    {
      "epoch": 1.8518518518518519,
      "grad_norm": 0.08738001505560129,
      "learning_rate": 1.6658510224765333e-07,
      "loss": 0.346,
      "step": 550
    },
    {
      "epoch": 1.8686868686868687,
      "grad_norm": 0.14091077594058707,
      "learning_rate": 1.3103290768099796e-07,
      "loss": 0.3525,
      "step": 555
    },
    {
      "epoch": 1.8855218855218854,
      "grad_norm": 0.10724350369660748,
      "learning_rate": 9.969343475342285e-08,
      "loss": 0.3415,
      "step": 560
    },
    {
      "epoch": 1.9023569023569022,
      "grad_norm": 0.10272663637676055,
      "learning_rate": 7.259379896463248e-08,
      "loss": 0.3283,
      "step": 565
    },
    {
      "epoch": 1.9191919191919191,
      "grad_norm": 0.10531925815392336,
      "learning_rate": 4.975744742772848e-08,
      "loss": 0.3437,
      "step": 570
    },
    {
      "epoch": 1.936026936026936,
      "grad_norm": 0.09578215650108557,
      "learning_rate": 3.120413858232474e-08,
      "loss": 0.3387,
      "step": 575
    },
    {
      "epoch": 1.9528619528619529,
      "grad_norm": 0.09866364512320212,
      "learning_rate": 1.69499250991767e-08,
      "loss": 0.327,
      "step": 580
    },
    {
      "epoch": 1.9696969696969697,
      "grad_norm": 0.09454492383929458,
      "learning_rate": 7.007139991108136e-09,
      "loss": 0.3122,
      "step": 585
    },
    {
      "epoch": 1.9865319865319866,
      "grad_norm": 0.10021678758376606,
      "learning_rate": 1.3843859422574269e-09,
      "loss": 0.3852,
      "step": 590
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.37048205733299255,
      "eval_runtime": 28.2507,
      "eval_samples_per_second": 20.849,
      "eval_steps_per_second": 5.239,
      "step": 594
    },
    {
      "epoch": 2.0,
      "step": 594,
      "total_flos": 1.5456743568084828e+18,
      "train_loss": 0.4185141025970279,
      "train_runtime": 4834.4452,
      "train_samples_per_second": 5.896,
      "train_steps_per_second": 0.123
    }
  ],
  "logging_steps": 5,
  "max_steps": 594,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.5456743568084828e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}