{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 7335,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004091234530019434,
      "grad_norm": 5.139719573502004,
      "learning_rate": 6.9791071983348295e-06,
      "loss": 2.653,
      "step": 10
    },
    {
      "epoch": 0.008182469060038867,
      "grad_norm": 3.820838784929218,
      "learning_rate": 9.080027807988022e-06,
      "loss": 0.4708,
      "step": 20
    },
    {
      "epoch": 0.0122737035900583,
      "grad_norm": 2.6483227024127802,
      "learning_rate": 1.030898758162737e-05,
      "loss": 0.4622,
      "step": 30
    },
    {
      "epoch": 0.016364938120077735,
      "grad_norm": 3.0325419802512723,
      "learning_rate": 1.1180948417641216e-05,
      "loss": 0.4572,
      "step": 40
    },
    {
      "epoch": 0.020456172650097165,
      "grad_norm": 3.3008143753804378,
      "learning_rate": 1.1857293787016462e-05,
      "loss": 0.4569,
      "step": 50
    },
    {
      "epoch": 0.0245474071801166,
      "grad_norm": 2.8131794752844383,
      "learning_rate": 1.2409908191280561e-05,
      "loss": 0.4378,
      "step": 60
    },
    {
      "epoch": 0.028638641710136033,
      "grad_norm": 2.7183268247171717,
      "learning_rate": 1.2877137012696984e-05,
      "loss": 0.4523,
      "step": 70
    },
    {
      "epoch": 0.03272987624015547,
      "grad_norm": 1.812836545554904,
      "learning_rate": 1.3281869027294408e-05,
      "loss": 0.4517,
      "step": 80
    },
    {
      "epoch": 0.0368211107701749,
      "grad_norm": 1.5605721318709478,
      "learning_rate": 1.363886796491991e-05,
      "loss": 0.4396,
      "step": 90
    },
    {
      "epoch": 0.04091234530019433,
      "grad_norm": 3.00608523793705,
      "learning_rate": 1.3958214396669659e-05,
      "loss": 0.4393,
      "step": 100
    },
    {
      "epoch": 0.04500357983021377,
      "grad_norm": 2.004900329434675,
      "learning_rate": 1.4247098383615834e-05,
      "loss": 0.4287,
      "step": 110
    },
    {
      "epoch": 0.0490948143602332,
      "grad_norm": 3.1505903529524826,
      "learning_rate": 1.4510828800933757e-05,
      "loss": 0.4461,
      "step": 120
    },
    {
      "epoch": 0.053186048890252635,
      "grad_norm": 1.4909337148061823,
      "learning_rate": 1.47534372669567e-05,
      "loss": 0.4475,
      "step": 130
    },
    {
      "epoch": 0.057277283420272065,
      "grad_norm": 1.910562126750776,
      "learning_rate": 1.4978057622350176e-05,
      "loss": 0.456,
      "step": 140
    },
    {
      "epoch": 0.0613685179502915,
      "grad_norm": 1.071878138097679,
      "learning_rate": 1.5187174170309003e-05,
      "loss": 0.4444,
      "step": 150
    },
    {
      "epoch": 0.06545975248031094,
      "grad_norm": 1.182162791937331,
      "learning_rate": 1.5382789636947598e-05,
      "loss": 0.4544,
      "step": 160
    },
    {
      "epoch": 0.06955098701033037,
      "grad_norm": 2.804609224620368,
      "learning_rate": 1.5566542122709266e-05,
      "loss": 0.4431,
      "step": 170
    },
    {
      "epoch": 0.0736422215403498,
      "grad_norm": 2.30126890056861,
      "learning_rate": 1.5739788574573106e-05,
      "loss": 0.4304,
      "step": 180
    },
    {
      "epoch": 0.07773345607036923,
      "grad_norm": 1.8322154114594529,
      "learning_rate": 1.59036656596413e-05,
      "loss": 0.4488,
      "step": 190
    },
    {
      "epoch": 0.08182469060038866,
      "grad_norm": 1.6909197474415698,
      "learning_rate": 1.605913500632285e-05,
      "loss": 0.4244,
      "step": 200
    },
    {
      "epoch": 0.0859159251304081,
      "grad_norm": 1.9267153990137886,
      "learning_rate": 1.6207017395989525e-05,
      "loss": 0.4452,
      "step": 210
    },
    {
      "epoch": 0.09000715966042754,
      "grad_norm": 1.9513020149733689,
      "learning_rate": 1.6348018993269024e-05,
      "loss": 0.441,
      "step": 220
    },
    {
      "epoch": 0.09409839419044697,
      "grad_norm": 1.6992552537559622,
      "learning_rate": 1.648275174085812e-05,
      "loss": 0.4142,
      "step": 230
    },
    {
      "epoch": 0.0981896287204664,
      "grad_norm": 2.506670138718048,
      "learning_rate": 1.661174941058695e-05,
      "loss": 0.4309,
      "step": 240
    },
    {
      "epoch": 0.10228086325048584,
      "grad_norm": 1.1649232947269594,
      "learning_rate": 1.6735480375698097e-05,
      "loss": 0.4468,
      "step": 250
    },
    {
      "epoch": 0.10637209778050527,
      "grad_norm": 2.2776528295818723,
      "learning_rate": 1.6854357876609896e-05,
      "loss": 0.4263,
      "step": 260
    },
    {
      "epoch": 0.1104633323105247,
      "grad_norm": 1.09704187408726,
      "learning_rate": 1.6968748348212453e-05,
      "loss": 0.4199,
      "step": 270
    },
    {
      "epoch": 0.11455456684054413,
      "grad_norm": 0.8342510727420896,
      "learning_rate": 1.7078978232003368e-05,
      "loss": 0.4313,
      "step": 280
    },
    {
      "epoch": 0.11864580137056356,
      "grad_norm": 0.8821458905420587,
      "learning_rate": 1.7185339592301872e-05,
      "loss": 0.4259,
      "step": 290
    },
    {
      "epoch": 0.122737035900583,
      "grad_norm": 1.7825705984955846,
      "learning_rate": 1.7288094779962197e-05,
      "loss": 0.4261,
      "step": 300
    },
    {
      "epoch": 0.12682827043060244,
      "grad_norm": 1.904238409153354,
      "learning_rate": 1.7387480331094423e-05,
      "loss": 0.4259,
      "step": 310
    },
    {
      "epoch": 0.13091950496062188,
      "grad_norm": 1.725649730023905,
      "learning_rate": 1.7483710246600792e-05,
      "loss": 0.4314,
      "step": 320
    },
    {
      "epoch": 0.1350107394906413,
      "grad_norm": 1.4341207306128956,
      "learning_rate": 1.757697876690837e-05,
      "loss": 0.4419,
      "step": 330
    },
    {
      "epoch": 0.13910197402066074,
      "grad_norm": 1.2643467480950796,
      "learning_rate": 1.766746273236246e-05,
      "loss": 0.4312,
      "step": 340
    },
    {
      "epoch": 0.14319320855068016,
      "grad_norm": 1.563315600788671,
      "learning_rate": 1.7755323601378616e-05,
      "loss": 0.4211,
      "step": 350
    },
    {
      "epoch": 0.1472844430806996,
      "grad_norm": 1.3601337663266007,
      "learning_rate": 1.7840709184226296e-05,
      "loss": 0.4187,
      "step": 360
    },
    {
      "epoch": 0.15137567761071904,
      "grad_norm": 1.230870033216971,
      "learning_rate": 1.792375513921188e-05,
      "loss": 0.4317,
      "step": 370
    },
    {
      "epoch": 0.15546691214073846,
      "grad_norm": 0.935994666832451,
      "learning_rate": 1.8004586269294493e-05,
      "loss": 0.4347,
      "step": 380
    },
    {
      "epoch": 0.1595581466707579,
      "grad_norm": 1.3347135910248187,
      "learning_rate": 1.8083317650249243e-05,
      "loss": 0.4323,
      "step": 390
    },
    {
      "epoch": 0.16364938120077732,
      "grad_norm": 1.929506823385391,
      "learning_rate": 1.8160055615976043e-05,
      "loss": 0.4346,
      "step": 400
    },
    {
      "epoch": 0.16774061573079677,
      "grad_norm": 0.9726505093912414,
      "learning_rate": 1.8234898622125742e-05,
      "loss": 0.416,
      "step": 410
    },
    {
      "epoch": 0.1718318502608162,
      "grad_norm": 0.9397510565012722,
      "learning_rate": 1.8307938005642715e-05,
      "loss": 0.4182,
      "step": 420
    },
    {
      "epoch": 0.17592308479083563,
      "grad_norm": 1.2862427800883758,
      "learning_rate": 1.8379258654923192e-05,
      "loss": 0.4165,
      "step": 430
    },
    {
      "epoch": 0.18001431932085507,
      "grad_norm": 1.455717102975378,
      "learning_rate": 1.8448939602922218e-05,
      "loss": 0.4287,
      "step": 440
    },
    {
      "epoch": 0.18410555385087451,
      "grad_norm": 1.5001908608590255,
      "learning_rate": 1.8517054553601544e-05,
      "loss": 0.4303,
      "step": 450
    },
    {
      "epoch": 0.18819678838089393,
      "grad_norm": 1.6709222890878312,
      "learning_rate": 1.8583672350511313e-05,
      "loss": 0.4328,
      "step": 460
    },
    {
      "epoch": 0.19228802291091338,
      "grad_norm": 1.3805672401271125,
      "learning_rate": 1.864885739497424e-05,
      "loss": 0.4179,
      "step": 470
    },
    {
      "epoch": 0.1963792574409328,
      "grad_norm": 1.4202659814214615,
      "learning_rate": 1.8712670020240143e-05,
      "loss": 0.4129,
      "step": 480
    },
    {
      "epoch": 0.20047049197095224,
      "grad_norm": 1.3802506203673055,
      "learning_rate": 1.8775166827059134e-05,
      "loss": 0.4203,
      "step": 490
    },
    {
      "epoch": 0.20456172650097168,
      "grad_norm": 1.4249688406884504,
      "learning_rate": 1.883640098535129e-05,
      "loss": 0.4246,
      "step": 500
    },
    {
      "epoch": 0.20456172650097168,
      "eval_loss": 0.4226590692996979,
      "eval_runtime": 565.8176,
      "eval_samples_per_second": 5.458,
      "eval_steps_per_second": 0.91,
      "step": 500
    },
    {
      "epoch": 0.2086529610309911,
      "grad_norm": 0.8798533540054851,
      "learning_rate": 1.8896422506001807e-05,
      "loss": 0.4267,
      "step": 510
    },
    {
      "epoch": 0.21274419556101054,
      "grad_norm": 1.279435399555335,
      "learning_rate": 1.895527848626309e-05,
      "loss": 0.4103,
      "step": 520
    },
    {
      "epoch": 0.21683543009102996,
      "grad_norm": 1.2983197150849732,
      "learning_rate": 1.901301333178074e-05,
      "loss": 0.416,
      "step": 530
    },
    {
      "epoch": 0.2209266646210494,
      "grad_norm": 1.3784001180176515,
      "learning_rate": 1.9069668957865647e-05,
      "loss": 0.4249,
      "step": 540
    },
    {
      "epoch": 0.22501789915106885,
      "grad_norm": 1.4315659756421384,
      "learning_rate": 1.9125284972297466e-05,
      "loss": 0.4244,
      "step": 550
    },
    {
      "epoch": 0.22910913368108826,
      "grad_norm": 1.1438536096514877,
      "learning_rate": 1.9179898841656562e-05,
      "loss": 0.4175,
      "step": 560
    },
    {
      "epoch": 0.2332003682111077,
      "grad_norm": 1.5784295688235042,
      "learning_rate": 1.923354604293384e-05,
      "loss": 0.4338,
      "step": 570
    },
    {
      "epoch": 0.23729160274112712,
      "grad_norm": 1.1792683199314111,
      "learning_rate": 1.9286260201955066e-05,
      "loss": 0.4235,
      "step": 580
    },
    {
      "epoch": 0.24138283727114657,
      "grad_norm": 0.9242432483061238,
      "learning_rate": 1.9338073219972227e-05,
      "loss": 0.4206,
      "step": 590
    },
    {
      "epoch": 0.245474071801166,
      "grad_norm": 1.4278436783956994,
      "learning_rate": 1.938901538961539e-05,
      "loss": 0.4288,
      "step": 600
    },
    {
      "epoch": 0.24956530633118543,
      "grad_norm": 1.7892184247748346,
      "learning_rate": 1.9439115501260403e-05,
      "loss": 0.4314,
      "step": 610
    },
    {
      "epoch": 0.25365654086120487,
      "grad_norm": 1.3459176439410694,
      "learning_rate": 1.9488400940747617e-05,
      "loss": 0.4252,
      "step": 620
    },
    {
      "epoch": 0.2577477753912243,
      "grad_norm": 1.0979452662978122,
      "learning_rate": 1.9536897779282066e-05,
      "loss": 0.4159,
      "step": 630
    },
    {
      "epoch": 0.26183900992124376,
      "grad_norm": 1.1558624628355403,
      "learning_rate": 1.958463085625399e-05,
      "loss": 0.425,
      "step": 640
    },
    {
      "epoch": 0.2659302444512632,
      "grad_norm": 0.9606366329728439,
      "learning_rate": 1.9631623855638338e-05,
      "loss": 0.4084,
      "step": 650
    },
    {
      "epoch": 0.2700214789812826,
      "grad_norm": 1.0597290473469163,
      "learning_rate": 1.9677899376561565e-05,
      "loss": 0.4099,
      "step": 660
    },
    {
      "epoch": 0.274112713511302,
      "grad_norm": 1.0819100913671185,
      "learning_rate": 1.9723478998562017e-05,
      "loss": 0.4151,
      "step": 670
    },
    {
      "epoch": 0.2782039480413215,
      "grad_norm": 1.4892435767285719,
      "learning_rate": 1.976838334201565e-05,
      "loss": 0.4164,
      "step": 680
    },
    {
      "epoch": 0.2822951825713409,
      "grad_norm": 1.1025628764173787,
      "learning_rate": 1.981263212415066e-05,
      "loss": 0.4193,
      "step": 690
    },
    {
      "epoch": 0.2863864171013603,
      "grad_norm": 1.0753475991516983,
      "learning_rate": 1.985624421103181e-05,
      "loss": 0.4108,
      "step": 700
    },
    {
      "epoch": 0.2904776516313798,
      "grad_norm": 0.905939605392397,
      "learning_rate": 1.9899237665857572e-05,
      "loss": 0.4089,
      "step": 710
    },
    {
      "epoch": 0.2945688861613992,
      "grad_norm": 1.1713616964839249,
      "learning_rate": 1.994162979387949e-05,
      "loss": 0.4201,
      "step": 720
    },
    {
      "epoch": 0.2986601206914186,
      "grad_norm": 0.9299543253519471,
      "learning_rate": 1.998343718422334e-05,
      "loss": 0.4139,
      "step": 730
    },
    {
      "epoch": 0.3027513552214381,
      "grad_norm": 1.516913403786176,
      "learning_rate": 1.998485078018482e-05,
      "loss": 0.4067,
      "step": 740
    },
    {
      "epoch": 0.3068425897514575,
      "grad_norm": 0.7991131845266628,
      "learning_rate": 1.9954552340554464e-05,
      "loss": 0.4123,
      "step": 750
    },
    {
      "epoch": 0.3109338242814769,
      "grad_norm": 1.3657331223383264,
      "learning_rate": 1.9924253900924104e-05,
      "loss": 0.4108,
      "step": 760
    },
    {
      "epoch": 0.3150250588114964,
      "grad_norm": 0.8920875301003005,
      "learning_rate": 1.9893955461293747e-05,
      "loss": 0.4191,
      "step": 770
    },
    {
      "epoch": 0.3191162933415158,
      "grad_norm": 0.9744817931110429,
      "learning_rate": 1.9863657021663386e-05,
      "loss": 0.4053,
      "step": 780
    },
    {
      "epoch": 0.3232075278715352,
      "grad_norm": 1.1889083717413498,
      "learning_rate": 1.9833358582033025e-05,
      "loss": 0.4629,
      "step": 790
    },
    {
      "epoch": 0.32729876240155464,
      "grad_norm": 2.2559102163995965,
      "learning_rate": 1.9803060142402668e-05,
      "loss": 0.7059,
      "step": 800
    },
    {
      "epoch": 0.3313899969315741,
      "grad_norm": 0.7012331362796347,
      "learning_rate": 1.977276170277231e-05,
      "loss": 0.418,
      "step": 810
    },
    {
      "epoch": 0.33548123146159353,
      "grad_norm": 1.1193457098820139,
      "learning_rate": 1.9742463263141947e-05,
      "loss": 0.4022,
      "step": 820
    },
    {
      "epoch": 0.33957246599161295,
      "grad_norm": 1.0787812784251993,
      "learning_rate": 1.971216482351159e-05,
      "loss": 0.4147,
      "step": 830
    },
    {
      "epoch": 0.3436637005216324,
      "grad_norm": 1.2144732211866651,
      "learning_rate": 1.9681866383881233e-05,
      "loss": 0.4118,
      "step": 840
    },
    {
      "epoch": 0.34775493505165184,
      "grad_norm": 1.2104042207994736,
      "learning_rate": 1.9651567944250872e-05,
      "loss": 0.4081,
      "step": 850
    },
    {
      "epoch": 0.35184616958167125,
      "grad_norm": 1.2049689205461667,
      "learning_rate": 1.9621269504620512e-05,
      "loss": 0.4073,
      "step": 860
    },
    {
      "epoch": 0.3559374041116907,
      "grad_norm": 1.230689211204093,
      "learning_rate": 1.9590971064990155e-05,
      "loss": 0.4104,
      "step": 870
    },
    {
      "epoch": 0.36002863864171014,
      "grad_norm": 0.8629950816912576,
      "learning_rate": 1.9560672625359794e-05,
      "loss": 0.4147,
      "step": 880
    },
    {
      "epoch": 0.36411987317172956,
      "grad_norm": 1.1029857217191437,
      "learning_rate": 1.9530374185729437e-05,
      "loss": 0.4221,
      "step": 890
    },
    {
      "epoch": 0.36821110770174903,
      "grad_norm": 1.1513671079248653,
      "learning_rate": 1.9500075746099076e-05,
      "loss": 0.4143,
      "step": 900
    },
    {
      "epoch": 0.37230234223176845,
      "grad_norm": 1.3187073752135716,
      "learning_rate": 1.946977730646872e-05,
      "loss": 0.4193,
      "step": 910
    },
    {
      "epoch": 0.37639357676178786,
      "grad_norm": 0.8494318204927609,
      "learning_rate": 1.943947886683836e-05,
      "loss": 0.4103,
      "step": 920
    },
    {
      "epoch": 0.3804848112918073,
      "grad_norm": 1.2378655056760652,
      "learning_rate": 1.9409180427208e-05,
      "loss": 0.413,
      "step": 930
    },
    {
      "epoch": 0.38457604582182675,
      "grad_norm": 0.8742974569121568,
      "learning_rate": 1.937888198757764e-05,
      "loss": 0.4178,
      "step": 940
    },
    {
      "epoch": 0.38866728035184617,
      "grad_norm": 0.6659680293817604,
      "learning_rate": 1.934858354794728e-05,
      "loss": 0.3992,
      "step": 950
    },
    {
      "epoch": 0.3927585148818656,
      "grad_norm": 1.0085028880533002,
      "learning_rate": 1.9318285108316923e-05,
      "loss": 0.3957,
      "step": 960
    },
    {
      "epoch": 0.39684974941188506,
      "grad_norm": 0.7913168849253336,
      "learning_rate": 1.9287986668686566e-05,
      "loss": 0.4111,
      "step": 970
    },
    {
      "epoch": 0.40094098394190447,
      "grad_norm": 0.9593948086372496,
      "learning_rate": 1.9257688229056206e-05,
      "loss": 0.4097,
      "step": 980
    },
    {
      "epoch": 0.4050322184719239,
      "grad_norm": 1.1075304924315132,
      "learning_rate": 1.9227389789425845e-05,
      "loss": 0.4029,
      "step": 990
    },
    {
      "epoch": 0.40912345300194336,
      "grad_norm": 1.3189534557378433,
      "learning_rate": 1.9197091349795488e-05,
      "loss": 0.4114,
      "step": 1000
    },
    {
      "epoch": 0.40912345300194336,
      "eval_loss": 0.4164506494998932,
      "eval_runtime": 567.741,
      "eval_samples_per_second": 5.439,
      "eval_steps_per_second": 0.907,
      "step": 1000
    },
    {
      "epoch": 0.4132146875319628,
      "grad_norm": 1.3479557524857961,
      "learning_rate": 1.9166792910165127e-05,
      "loss": 0.4034,
      "step": 1010
    },
    {
      "epoch": 0.4173059220619822,
      "grad_norm": 0.7981045727445666,
      "learning_rate": 1.913649447053477e-05,
      "loss": 0.3984,
      "step": 1020
    },
    {
      "epoch": 0.4213971565920016,
      "grad_norm": 1.3167949575672933,
      "learning_rate": 1.910619603090441e-05,
      "loss": 0.418,
      "step": 1030
    },
    {
      "epoch": 0.4254883911220211,
      "grad_norm": 0.875082404450118,
      "learning_rate": 1.907589759127405e-05,
      "loss": 0.4165,
      "step": 1040
    },
    {
      "epoch": 0.4295796256520405,
      "grad_norm": 0.7010201186949262,
      "learning_rate": 1.9045599151643692e-05,
      "loss": 0.4043,
      "step": 1050
    },
    {
      "epoch": 0.4336708601820599,
      "grad_norm": 0.8106312112876102,
      "learning_rate": 1.9015300712013335e-05,
      "loss": 0.3946,
      "step": 1060
    },
    {
      "epoch": 0.4377620947120794,
      "grad_norm": 1.2069253813775411,
      "learning_rate": 1.8985002272382974e-05,
      "loss": 0.4142,
      "step": 1070
    },
    {
      "epoch": 0.4418533292420988,
      "grad_norm": 1.4074507315896057,
      "learning_rate": 1.8954703832752614e-05,
      "loss": 0.4216,
      "step": 1080
    },
    {
      "epoch": 0.4459445637721182,
      "grad_norm": 1.0142221478478237,
      "learning_rate": 1.8924405393122257e-05,
      "loss": 0.399,
      "step": 1090
    },
    {
      "epoch": 0.4500357983021377,
      "grad_norm": 1.0549341445926665,
      "learning_rate": 1.8894106953491896e-05,
      "loss": 0.4036,
      "step": 1100
    },
    {
      "epoch": 0.4541270328321571,
      "grad_norm": 0.965773892068146,
      "learning_rate": 1.8863808513861535e-05,
      "loss": 0.4064,
      "step": 1110
    },
    {
      "epoch": 0.4582182673621765,
      "grad_norm": 0.9064204193354054,
      "learning_rate": 1.8833510074231178e-05,
      "loss": 0.3909,
      "step": 1120
    },
    {
      "epoch": 0.462309501892196,
      "grad_norm": 0.9674069704546393,
      "learning_rate": 1.880321163460082e-05,
      "loss": 0.4016,
      "step": 1130
    },
    {
      "epoch": 0.4664007364222154,
      "grad_norm": 0.7415264356489151,
      "learning_rate": 1.877291319497046e-05,
      "loss": 0.418,
      "step": 1140
    },
    {
      "epoch": 0.47049197095223483,
      "grad_norm": 0.610984069094811,
      "learning_rate": 1.87426147553401e-05,
      "loss": 0.4163,
      "step": 1150
    },
    {
      "epoch": 0.47458320548225424,
      "grad_norm": 0.8399219229428194,
      "learning_rate": 1.8712316315709743e-05,
      "loss": 0.3986,
      "step": 1160
    },
    {
      "epoch": 0.4786744400122737,
      "grad_norm": 1.157757088776256,
      "learning_rate": 1.8682017876079382e-05,
      "loss": 0.4108,
      "step": 1170
    },
    {
      "epoch": 0.48276567454229313,
      "grad_norm": 0.967241129437065,
      "learning_rate": 1.8651719436449025e-05,
      "loss": 0.4217,
      "step": 1180
    },
    {
      "epoch": 0.48685690907231255,
      "grad_norm": 2.091651865632623,
      "learning_rate": 1.8621420996818665e-05,
      "loss": 0.4091,
      "step": 1190
    },
    {
      "epoch": 0.490948143602332,
      "grad_norm": 0.6567666937577481,
      "learning_rate": 1.8591122557188307e-05,
      "loss": 0.4206,
      "step": 1200
    },
    {
      "epoch": 0.49503937813235144,
      "grad_norm": 1.3838725189967105,
      "learning_rate": 1.8560824117557947e-05,
      "loss": 0.3986,
      "step": 1210
    },
    {
      "epoch": 0.49913061266237085,
      "grad_norm": 0.7561244143868575,
      "learning_rate": 1.853052567792759e-05,
      "loss": 0.4091,
      "step": 1220
    },
    {
      "epoch": 0.5032218471923903,
      "grad_norm": 1.1843346363848066,
      "learning_rate": 1.850022723829723e-05,
      "loss": 0.4009,
      "step": 1230
    },
    {
      "epoch": 0.5073130817224097,
      "grad_norm": 0.9612775939058839,
      "learning_rate": 1.846992879866687e-05,
      "loss": 0.4312,
      "step": 1240
    },
    {
      "epoch": 0.5114043162524292,
      "grad_norm": 0.9653095988733074,
      "learning_rate": 1.843963035903651e-05,
      "loss": 0.411,
      "step": 1250
    },
    {
      "epoch": 0.5154955507824486,
      "grad_norm": 0.9538148396031432,
      "learning_rate": 1.840933191940615e-05,
      "loss": 0.4189,
      "step": 1260
    },
    {
      "epoch": 0.519586785312468,
      "grad_norm": 0.831951461503928,
      "learning_rate": 1.8379033479775794e-05,
      "loss": 0.3866,
      "step": 1270
    },
    {
      "epoch": 0.5236780198424875,
      "grad_norm": 0.8415502915298407,
      "learning_rate": 1.8348735040145433e-05,
      "loss": 0.43,
      "step": 1280
    },
    {
      "epoch": 0.5277692543725069,
      "grad_norm": 0.8927957692335573,
      "learning_rate": 1.8318436600515076e-05,
      "loss": 0.4143,
      "step": 1290
    },
    {
      "epoch": 0.5318604889025264,
      "grad_norm": 0.9951847425278025,
      "learning_rate": 1.8288138160884716e-05,
      "loss": 0.4082,
      "step": 1300
    },
    {
      "epoch": 0.5359517234325458,
      "grad_norm": 0.9794640728750337,
      "learning_rate": 1.825783972125436e-05,
      "loss": 0.4123,
      "step": 1310
    },
    {
      "epoch": 0.5400429579625652,
      "grad_norm": 1.2936619002334293,
      "learning_rate": 1.8227541281623998e-05,
      "loss": 0.4063,
      "step": 1320
    },
    {
      "epoch": 0.5441341924925847,
      "grad_norm": 0.9539785376828354,
      "learning_rate": 1.8197242841993637e-05,
      "loss": 0.4001,
      "step": 1330
    },
    {
      "epoch": 0.548225427022604,
      "grad_norm": 0.7647319210581001,
      "learning_rate": 1.816694440236328e-05,
      "loss": 0.4067,
      "step": 1340
    },
    {
      "epoch": 0.5523166615526235,
      "grad_norm": 0.7391526930684824,
      "learning_rate": 1.8136645962732923e-05,
      "loss": 0.4135,
      "step": 1350
    },
    {
      "epoch": 0.556407896082643,
      "grad_norm": 1.2278179621110827,
      "learning_rate": 1.8106347523102562e-05,
      "loss": 0.3955,
      "step": 1360
    },
    {
      "epoch": 0.5604991306126623,
      "grad_norm": 0.9263102201334721,
      "learning_rate": 1.8076049083472202e-05,
      "loss": 0.4053,
      "step": 1370
    },
    {
      "epoch": 0.5645903651426818,
      "grad_norm": 1.270439289582048,
      "learning_rate": 1.8045750643841845e-05,
      "loss": 0.4065,
      "step": 1380
    },
    {
      "epoch": 0.5686815996727013,
      "grad_norm": 0.8192132223098715,
      "learning_rate": 1.8015452204211484e-05,
      "loss": 0.4085,
      "step": 1390
    },
    {
      "epoch": 0.5727728342027206,
      "grad_norm": 0.6191336734377946,
      "learning_rate": 1.7985153764581124e-05,
      "loss": 0.402,
      "step": 1400
    },
    {
      "epoch": 0.5768640687327401,
      "grad_norm": 0.7122309486400348,
      "learning_rate": 1.7954855324950766e-05,
      "loss": 0.3985,
      "step": 1410
    },
    {
      "epoch": 0.5809553032627596,
      "grad_norm": 1.0903768849480002,
      "learning_rate": 1.792455688532041e-05,
      "loss": 0.4142,
      "step": 1420
    },
    {
      "epoch": 0.5850465377927789,
      "grad_norm": 1.1699038018329126,
      "learning_rate": 1.789425844569005e-05,
      "loss": 0.4055,
      "step": 1430
    },
    {
      "epoch": 0.5891377723227984,
      "grad_norm": 0.7701176154609344,
      "learning_rate": 1.7863960006059688e-05,
      "loss": 0.403,
      "step": 1440
    },
    {
      "epoch": 0.5932290068528179,
      "grad_norm": 0.9632444193994341,
      "learning_rate": 1.783366156642933e-05,
      "loss": 0.3897,
      "step": 1450
    },
    {
      "epoch": 0.5973202413828372,
      "grad_norm": 0.9564215846627407,
      "learning_rate": 1.780336312679897e-05,
      "loss": 0.4117,
      "step": 1460
    },
    {
      "epoch": 0.6014114759128567,
      "grad_norm": 0.6899350345691984,
      "learning_rate": 1.7773064687168613e-05,
      "loss": 0.4137,
      "step": 1470
    },
    {
      "epoch": 0.6055027104428762,
      "grad_norm": 0.9095599036748141,
      "learning_rate": 1.7742766247538253e-05,
      "loss": 0.415,
      "step": 1480
    },
    {
      "epoch": 0.6095939449728955,
      "grad_norm": 0.9109255603659862,
      "learning_rate": 1.7712467807907892e-05,
      "loss": 0.407,
      "step": 1490
    },
    {
      "epoch": 0.613685179502915,
      "grad_norm": 1.2266191388411232,
      "learning_rate": 1.7682169368277535e-05,
      "loss": 0.4055,
      "step": 1500
    },
    {
      "epoch": 0.613685179502915,
      "eval_loss": 0.4121568500995636,
      "eval_runtime": 568.64,
      "eval_samples_per_second": 5.431,
      "eval_steps_per_second": 0.906,
      "step": 1500
    },
    {
      "epoch": 0.6177764140329345,
      "grad_norm": 2.106549959554808,
      "learning_rate": 1.7651870928647178e-05,
      "loss": 0.397,
      "step": 1510
    },
    {
      "epoch": 0.6218676485629538,
      "grad_norm": 0.5026183280127312,
      "learning_rate": 1.7621572489016817e-05,
      "loss": 0.3986,
      "step": 1520
    },
    {
      "epoch": 0.6259588830929733,
      "grad_norm": 1.141004596209656,
      "learning_rate": 1.7591274049386457e-05,
      "loss": 0.4074,
      "step": 1530
    },
    {
      "epoch": 0.6300501176229928,
      "grad_norm": 0.8191222686339406,
      "learning_rate": 1.75609756097561e-05,
      "loss": 0.4002,
      "step": 1540
    },
    {
      "epoch": 0.6341413521530121,
      "grad_norm": 0.6764982914151534,
      "learning_rate": 1.753067717012574e-05,
      "loss": 0.4107,
      "step": 1550
    },
    {
      "epoch": 0.6382325866830316,
      "grad_norm": 1.3684032943814484,
      "learning_rate": 1.750037873049538e-05,
      "loss": 0.4036,
      "step": 1560
    },
    {
      "epoch": 0.642323821213051,
      "grad_norm": 0.8576599206196178,
      "learning_rate": 1.747008029086502e-05,
      "loss": 0.4067,
      "step": 1570
    },
    {
      "epoch": 0.6464150557430705,
      "grad_norm": 1.199412066961356,
      "learning_rate": 1.7439781851234664e-05,
      "loss": 0.4069,
      "step": 1580
    },
    {
      "epoch": 0.6505062902730899,
      "grad_norm": 0.9099518471943355,
      "learning_rate": 1.7409483411604304e-05,
      "loss": 0.3995,
      "step": 1590
    },
    {
      "epoch": 0.6545975248031093,
      "grad_norm": 1.1070377119289831,
      "learning_rate": 1.7379184971973947e-05,
      "loss": 0.4225,
      "step": 1600
    },
    {
      "epoch": 0.6586887593331288,
      "grad_norm": 0.8753865952879469,
      "learning_rate": 1.7348886532343586e-05,
      "loss": 0.4032,
      "step": 1610
    },
    {
      "epoch": 0.6627799938631482,
      "grad_norm": 0.7973497440286291,
      "learning_rate": 1.7318588092713226e-05,
      "loss": 0.4207,
      "step": 1620
    },
    {
      "epoch": 0.6668712283931676,
      "grad_norm": 0.6925989497344892,
      "learning_rate": 1.728828965308287e-05,
      "loss": 0.4123,
      "step": 1630
    },
    {
      "epoch": 0.6709624629231871,
      "grad_norm": 0.9669611664338155,
      "learning_rate": 1.725799121345251e-05,
      "loss": 0.3916,
      "step": 1640
    },
    {
      "epoch": 0.6750536974532065,
      "grad_norm": 1.2385150364852342,
      "learning_rate": 1.7227692773822147e-05,
      "loss": 0.4136,
      "step": 1650
    },
    {
      "epoch": 0.6791449319832259,
      "grad_norm": 0.7549418298109767,
      "learning_rate": 1.719739433419179e-05,
      "loss": 0.4146,
      "step": 1660
    },
    {
      "epoch": 0.6832361665132454,
      "grad_norm": 1.0424008728514387,
      "learning_rate": 1.7167095894561433e-05,
      "loss": 0.3971,
      "step": 1670
    },
    {
      "epoch": 0.6873274010432648,
      "grad_norm": 0.7360079123740301,
      "learning_rate": 1.7136797454931072e-05,
      "loss": 0.3947,
      "step": 1680
    },
    {
      "epoch": 0.6914186355732842,
      "grad_norm": 0.961447300959097,
      "learning_rate": 1.7106499015300712e-05,
      "loss": 0.4035,
      "step": 1690
    },
    {
      "epoch": 0.6955098701033037,
      "grad_norm": 0.7277946304181864,
      "learning_rate": 1.7076200575670355e-05,
      "loss": 0.3995,
      "step": 1700
    },
    {
      "epoch": 0.6996011046333231,
      "grad_norm": 0.7930916610357551,
      "learning_rate": 1.7045902136039994e-05,
      "loss": 0.4057,
      "step": 1710
    },
    {
      "epoch": 0.7036923391633425,
      "grad_norm": 0.7442545537670275,
      "learning_rate": 1.7015603696409637e-05,
      "loss": 0.4042,
      "step": 1720
    },
    {
      "epoch": 0.707783573693362,
      "grad_norm": 1.1048107524067752,
      "learning_rate": 1.6985305256779276e-05,
      "loss": 0.4132,
      "step": 1730
    },
    {
      "epoch": 0.7118748082233815,
      "grad_norm": 1.1900958503859818,
      "learning_rate": 1.695500681714892e-05,
      "loss": 0.4166,
      "step": 1740
    },
    {
      "epoch": 0.7159660427534008,
      "grad_norm": 0.7615970705911563,
      "learning_rate": 1.692470837751856e-05,
      "loss": 0.3937,
      "step": 1750
    },
    {
      "epoch": 0.7200572772834203,
      "grad_norm": 1.0147007380179698,
      "learning_rate": 1.68944099378882e-05,
      "loss": 0.3918,
      "step": 1760
    },
    {
      "epoch": 0.7241485118134398,
      "grad_norm": 0.8482189838251506,
      "learning_rate": 1.686411149825784e-05,
      "loss": 0.3839,
      "step": 1770
    },
    {
      "epoch": 0.7282397463434591,
      "grad_norm": 0.711159267866409,
      "learning_rate": 1.683381305862748e-05,
      "loss": 0.3948,
      "step": 1780
    },
    {
      "epoch": 0.7323309808734786,
      "grad_norm": 1.371790290922361,
      "learning_rate": 1.6803514618997123e-05,
      "loss": 0.3984,
      "step": 1790
    },
    {
      "epoch": 0.7364222154034981,
      "grad_norm": 0.7270726304455154,
      "learning_rate": 1.6773216179366766e-05,
      "loss": 0.4058,
      "step": 1800
    },
    {
      "epoch": 0.7405134499335174,
      "grad_norm": 0.993765933652885,
      "learning_rate": 1.6742917739736406e-05,
      "loss": 0.3964,
      "step": 1810
    },
    {
      "epoch": 0.7446046844635369,
      "grad_norm": 0.7396054882960263,
      "learning_rate": 1.6712619300106045e-05,
      "loss": 0.3988,
      "step": 1820
    },
    {
      "epoch": 0.7486959189935563,
      "grad_norm": 1.1310726932463144,
      "learning_rate": 1.6682320860475688e-05,
      "loss": 0.4146,
      "step": 1830
    },
    {
      "epoch": 0.7527871535235757,
      "grad_norm": 0.8570685688377147,
      "learning_rate": 1.6652022420845327e-05,
      "loss": 0.3971,
      "step": 1840
    },
    {
      "epoch": 0.7568783880535952,
      "grad_norm": 1.2492633396416357,
      "learning_rate": 1.6621723981214967e-05,
      "loss": 0.4087,
      "step": 1850
    },
    {
      "epoch": 0.7609696225836146,
      "grad_norm": 1.0021357130993627,
      "learning_rate": 1.659142554158461e-05,
      "loss": 0.4162,
      "step": 1860
    },
    {
      "epoch": 0.765060857113634,
      "grad_norm": 0.7813489106628776,
      "learning_rate": 1.656112710195425e-05,
      "loss": 0.405,
      "step": 1870
    },
    {
      "epoch": 0.7691520916436535,
      "grad_norm": 0.95305010521556,
      "learning_rate": 1.6530828662323892e-05,
      "loss": 0.391,
      "step": 1880
    },
    {
      "epoch": 0.7732433261736729,
      "grad_norm": 0.6688770767238499,
      "learning_rate": 1.650053022269353e-05,
      "loss": 0.3802,
      "step": 1890
    },
    {
      "epoch": 0.7773345607036923,
      "grad_norm": 0.6474194044675128,
      "learning_rate": 1.6470231783063174e-05,
      "loss": 0.4094,
      "step": 1900
    },
    {
      "epoch": 0.7814257952337118,
      "grad_norm": 0.8498584394336631,
      "learning_rate": 1.6439933343432814e-05,
      "loss": 0.4073,
      "step": 1910
    },
    {
      "epoch": 0.7855170297637312,
      "grad_norm": 0.7377058782682137,
      "learning_rate": 1.6409634903802457e-05,
      "loss": 0.3958,
      "step": 1920
    },
    {
      "epoch": 0.7896082642937506,
      "grad_norm": 0.843204215889971,
      "learning_rate": 1.6379336464172096e-05,
      "loss": 0.414,
      "step": 1930
    },
    {
      "epoch": 0.7936994988237701,
      "grad_norm": 0.6130692021490832,
      "learning_rate": 1.6349038024541736e-05,
      "loss": 0.4114,
      "step": 1940
    },
    {
      "epoch": 0.7977907333537895,
      "grad_norm": 1.2350933998678075,
      "learning_rate": 1.631873958491138e-05,
      "loss": 0.4111,
      "step": 1950
    },
    {
      "epoch": 0.8018819678838089,
      "grad_norm": 0.5814935040298003,
      "learning_rate": 1.628844114528102e-05,
      "loss": 0.4038,
      "step": 1960
    },
    {
      "epoch": 0.8059732024138284,
      "grad_norm": 1.1917635251998295,
      "learning_rate": 1.625814270565066e-05,
      "loss": 0.4108,
      "step": 1970
    },
    {
      "epoch": 0.8100644369438478,
      "grad_norm": 0.6526809291051177,
      "learning_rate": 1.62278442660203e-05,
      "loss": 0.3924,
      "step": 1980
    },
    {
      "epoch": 0.8141556714738672,
      "grad_norm": 0.8339385093933975,
      "learning_rate": 1.6197545826389943e-05,
      "loss": 0.4007,
      "step": 1990
    },
    {
      "epoch": 0.8182469060038867,
      "grad_norm": 0.8226184081347072,
      "learning_rate": 1.6167247386759582e-05,
      "loss": 0.4055,
      "step": 2000
    },
    {
      "epoch": 0.8182469060038867,
      "eval_loss": 0.40546923875808716,
      "eval_runtime": 569.0883,
      "eval_samples_per_second": 5.426,
      "eval_steps_per_second": 0.905,
      "step": 2000
    },
    {
      "epoch": 0.8223381405339061,
      "grad_norm": 0.6918624248021936,
      "learning_rate": 1.6136948947129225e-05,
      "loss": 0.3996,
      "step": 2010
    },
    {
      "epoch": 0.8264293750639256,
      "grad_norm": 0.9233306557406306,
      "learning_rate": 1.6106650507498865e-05,
      "loss": 0.4044,
      "step": 2020
    },
    {
      "epoch": 0.830520609593945,
      "grad_norm": 1.036109547461375,
      "learning_rate": 1.6076352067868508e-05,
      "loss": 0.4078,
      "step": 2030
    },
    {
      "epoch": 0.8346118441239644,
      "grad_norm": 0.6487594500506121,
      "learning_rate": 1.6046053628238147e-05,
      "loss": 0.3992,
      "step": 2040
    },
    {
      "epoch": 0.8387030786539839,
      "grad_norm": 0.9061995762710394,
      "learning_rate": 1.601575518860779e-05,
      "loss": 0.4065,
      "step": 2050
    },
    {
      "epoch": 0.8427943131840032,
      "grad_norm": 1.55527142608359,
      "learning_rate": 1.598545674897743e-05,
      "loss": 0.4126,
      "step": 2060
    },
    {
      "epoch": 0.8468855477140227,
      "grad_norm": 0.6076317290425526,
      "learning_rate": 1.595515830934707e-05,
      "loss": 0.4027,
      "step": 2070
    },
    {
      "epoch": 0.8509767822440422,
      "grad_norm": 1.100916449726025,
      "learning_rate": 1.592485986971671e-05,
      "loss": 0.4136,
      "step": 2080
    },
    {
      "epoch": 0.8550680167740615,
      "grad_norm": 0.9334359781171854,
      "learning_rate": 1.589456143008635e-05,
      "loss": 0.4018,
      "step": 2090
    },
    {
      "epoch": 0.859159251304081,
      "grad_norm": 1.1538729493388855,
      "learning_rate": 1.586426299045599e-05,
      "loss": 0.3981,
      "step": 2100
    },
    {
      "epoch": 0.8632504858341005,
      "grad_norm": 0.6672272992408166,
      "learning_rate": 1.5833964550825633e-05,
      "loss": 0.3989,
      "step": 2110
    },
    {
      "epoch": 0.8673417203641198,
      "grad_norm": 0.6790211847421547,
      "learning_rate": 1.5803666111195276e-05,
      "loss": 0.3999,
      "step": 2120
    },
    {
      "epoch": 0.8714329548941393,
      "grad_norm": 0.8625927450358163,
      "learning_rate": 1.5773367671564916e-05,
      "loss": 0.4103,
      "step": 2130
    },
    {
      "epoch": 0.8755241894241588,
      "grad_norm": 0.8799687483547954,
      "learning_rate": 1.5743069231934555e-05,
      "loss": 0.4,
      "step": 2140
    },
    {
      "epoch": 0.8796154239541781,
      "grad_norm": 0.6045044925297284,
      "learning_rate": 1.5712770792304198e-05,
      "loss": 0.4127,
      "step": 2150
    },
    {
      "epoch": 0.8837066584841976,
      "grad_norm": 1.1939155172076787,
      "learning_rate": 1.5682472352673837e-05,
      "loss": 0.3982,
      "step": 2160
    },
    {
      "epoch": 0.8877978930142171,
      "grad_norm": 0.7558373479016637,
      "learning_rate": 1.565217391304348e-05,
      "loss": 0.4064,
      "step": 2170
    },
    {
      "epoch": 0.8918891275442364,
      "grad_norm": 0.7840338054367947,
      "learning_rate": 1.562187547341312e-05,
      "loss": 0.411,
      "step": 2180
    },
    {
      "epoch": 0.8959803620742559,
      "grad_norm": 0.6836078155952856,
      "learning_rate": 1.5591577033782763e-05,
      "loss": 0.4011,
      "step": 2190
    },
    {
      "epoch": 0.9000715966042754,
      "grad_norm": 0.6588002590309792,
      "learning_rate": 1.5561278594152402e-05,
      "loss": 0.4034,
      "step": 2200
    },
    {
      "epoch": 0.9041628311342947,
      "grad_norm": 0.7477069671187269,
      "learning_rate": 1.5530980154522045e-05,
      "loss": 0.414,
      "step": 2210
    },
    {
      "epoch": 0.9082540656643142,
      "grad_norm": 0.9689258448123745,
      "learning_rate": 1.5500681714891684e-05,
      "loss": 0.3976,
      "step": 2220
    },
    {
      "epoch": 0.9123453001943337,
      "grad_norm": 0.9394818001617259,
      "learning_rate": 1.5470383275261324e-05,
      "loss": 0.414,
      "step": 2230
    },
    {
      "epoch": 0.916436534724353,
      "grad_norm": 0.7596601012213272,
      "learning_rate": 1.5440084835630967e-05,
      "loss": 0.3963,
      "step": 2240
    },
    {
      "epoch": 0.9205277692543725,
      "grad_norm": 0.7148793324533078,
      "learning_rate": 1.5409786396000606e-05,
      "loss": 0.3933,
      "step": 2250
    },
    {
      "epoch": 0.924619003784392,
      "grad_norm": 0.9033351893520957,
      "learning_rate": 1.537948795637025e-05,
      "loss": 0.4013,
      "step": 2260
    },
    {
      "epoch": 0.9287102383144114,
      "grad_norm": 1.0192314023889302,
      "learning_rate": 1.534918951673989e-05,
      "loss": 0.4011,
      "step": 2270
    },
    {
      "epoch": 0.9328014728444308,
      "grad_norm": 0.7958434523694122,
      "learning_rate": 1.531889107710953e-05,
      "loss": 0.4069,
      "step": 2280
    },
    {
      "epoch": 0.9368927073744503,
      "grad_norm": 0.7201210026528915,
      "learning_rate": 1.528859263747917e-05,
      "loss": 0.3951,
      "step": 2290
    },
    {
      "epoch": 0.9409839419044697,
      "grad_norm": 0.6175942086431213,
      "learning_rate": 1.5258294197848814e-05,
      "loss": 0.4047,
      "step": 2300
    },
    {
      "epoch": 0.9450751764344891,
      "grad_norm": 0.7698111963051264,
      "learning_rate": 1.5227995758218453e-05,
      "loss": 0.4199,
      "step": 2310
    },
    {
      "epoch": 0.9491664109645085,
      "grad_norm": 1.9264632916618274,
      "learning_rate": 1.5197697318588094e-05,
      "loss": 0.3931,
      "step": 2320
    },
    {
      "epoch": 0.953257645494528,
      "grad_norm": 1.7879957288452364,
      "learning_rate": 1.5167398878957735e-05,
      "loss": 0.4007,
      "step": 2330
    },
    {
      "epoch": 0.9573488800245474,
      "grad_norm": 0.5780222684091814,
      "learning_rate": 1.5137100439327376e-05,
      "loss": 0.4104,
      "step": 2340
    },
    {
      "epoch": 0.9614401145545668,
      "grad_norm": 0.8683726024029582,
      "learning_rate": 1.5106801999697016e-05,
      "loss": 0.4036,
      "step": 2350
    },
    {
      "epoch": 0.9655313490845863,
      "grad_norm": 0.9116114901687677,
      "learning_rate": 1.5076503560066657e-05,
      "loss": 0.4021,
      "step": 2360
    },
    {
      "epoch": 0.9696225836146057,
      "grad_norm": 0.7992815803043055,
      "learning_rate": 1.50462051204363e-05,
      "loss": 0.4116,
      "step": 2370
    },
    {
      "epoch": 0.9737138181446251,
      "grad_norm": 0.9553393046537682,
      "learning_rate": 1.5015906680805941e-05,
      "loss": 0.4081,
      "step": 2380
    },
    {
      "epoch": 0.9778050526746446,
      "grad_norm": 0.588071607214625,
      "learning_rate": 1.498560824117558e-05,
      "loss": 0.4094,
      "step": 2390
    },
    {
      "epoch": 0.981896287204664,
      "grad_norm": 0.8852293721983012,
      "learning_rate": 1.4955309801545222e-05,
      "loss": 0.4055,
      "step": 2400
    },
    {
      "epoch": 0.9859875217346834,
      "grad_norm": 0.49652425498018843,
      "learning_rate": 1.4925011361914863e-05,
      "loss": 0.4,
      "step": 2410
    },
    {
      "epoch": 0.9900787562647029,
      "grad_norm": 1.0572558341566427,
      "learning_rate": 1.4894712922284504e-05,
      "loss": 0.402,
      "step": 2420
    },
    {
      "epoch": 0.9941699907947223,
      "grad_norm": 0.5472572586554295,
      "learning_rate": 1.4864414482654143e-05,
      "loss": 0.3824,
      "step": 2430
    },
    {
      "epoch": 0.9982612253247417,
      "grad_norm": 0.6950966780791227,
      "learning_rate": 1.4834116043023785e-05,
      "loss": 0.3987,
      "step": 2440
    },
    {
      "epoch": 1.0020456172650096,
      "grad_norm": 0.7596326868738933,
      "learning_rate": 1.4803817603393427e-05,
      "loss": 0.3989,
      "step": 2450
    },
    {
      "epoch": 1.006136851795029,
      "grad_norm": 0.6633330801065885,
      "learning_rate": 1.4773519163763069e-05,
      "loss": 0.409,
      "step": 2460
    },
    {
      "epoch": 1.0102280863250486,
      "grad_norm": 0.8877771135554996,
      "learning_rate": 1.4743220724132708e-05,
      "loss": 0.4127,
      "step": 2470
    },
    {
      "epoch": 1.014319320855068,
      "grad_norm": 0.8214654626739897,
      "learning_rate": 1.4712922284502349e-05,
      "loss": 0.3883,
      "step": 2480
    },
    {
      "epoch": 1.0184105553850875,
      "grad_norm": 1.0024461048478996,
      "learning_rate": 1.468262384487199e-05,
      "loss": 0.3684,
      "step": 2490
    },
    {
      "epoch": 1.022501789915107,
      "grad_norm": 0.6824573340958298,
      "learning_rate": 1.4652325405241631e-05,
      "loss": 0.3826,
      "step": 2500
    },
    {
      "epoch": 1.022501789915107,
      "eval_loss": 0.4052634835243225,
      "eval_runtime": 566.8912,
      "eval_samples_per_second": 5.447,
      "eval_steps_per_second": 0.908,
      "step": 2500
    },
    {
      "epoch": 1.0265930244451262,
      "grad_norm": 0.7110510772881399,
      "learning_rate": 1.4622026965611271e-05,
      "loss": 0.3871,
      "step": 2510
    },
    {
      "epoch": 1.0306842589751457,
      "grad_norm": 0.7933140123736048,
      "learning_rate": 1.4591728525980912e-05,
      "loss": 0.3936,
      "step": 2520
    },
    {
      "epoch": 1.0347754935051652,
      "grad_norm": 0.8306902290417011,
      "learning_rate": 1.4561430086350555e-05,
      "loss": 0.3987,
      "step": 2530
    },
    {
      "epoch": 1.0388667280351847,
      "grad_norm": 0.7119087181302277,
      "learning_rate": 1.4531131646720196e-05,
      "loss": 0.3967,
      "step": 2540
    },
    {
      "epoch": 1.0429579625652041,
      "grad_norm": 1.0370136579582763,
      "learning_rate": 1.4500833207089835e-05,
      "loss": 0.401,
      "step": 2550
    },
    {
      "epoch": 1.0470491970952236,
      "grad_norm": 0.8532007552527441,
      "learning_rate": 1.4470534767459477e-05,
      "loss": 0.4037,
      "step": 2560
    },
    {
      "epoch": 1.0511404316252428,
      "grad_norm": 0.6599598500909406,
      "learning_rate": 1.4440236327829118e-05,
      "loss": 0.3743,
      "step": 2570
    },
    {
      "epoch": 1.0552316661552623,
      "grad_norm": 1.0157223230619488,
      "learning_rate": 1.4409937888198759e-05,
      "loss": 0.3845,
      "step": 2580
    },
    {
      "epoch": 1.0593229006852818,
      "grad_norm": 0.885522215179448,
      "learning_rate": 1.4379639448568402e-05,
      "loss": 0.3903,
      "step": 2590
    },
    {
      "epoch": 1.0634141352153013,
      "grad_norm": 0.6463142293297175,
      "learning_rate": 1.434934100893804e-05,
      "loss": 0.3781,
      "step": 2600
    },
    {
      "epoch": 1.0675053697453207,
      "grad_norm": 1.0072005790653578,
      "learning_rate": 1.4319042569307682e-05,
      "loss": 0.3949,
      "step": 2610
    },
    {
      "epoch": 1.07159660427534,
      "grad_norm": 0.7263224753671689,
      "learning_rate": 1.4288744129677324e-05,
      "loss": 0.3883,
      "step": 2620
    },
    {
      "epoch": 1.0756878388053595,
      "grad_norm": 0.8333184288546814,
      "learning_rate": 1.4258445690046965e-05,
      "loss": 0.3815,
      "step": 2630
    },
    {
      "epoch": 1.079779073335379,
      "grad_norm": 1.0113253670629012,
      "learning_rate": 1.4228147250416604e-05,
      "loss": 0.3813,
      "step": 2640
    },
    {
      "epoch": 1.0838703078653984,
      "grad_norm": 0.8528216478869027,
      "learning_rate": 1.4197848810786245e-05,
      "loss": 0.4065,
      "step": 2650
    },
    {
      "epoch": 1.0879615423954179,
      "grad_norm": 0.9440475129085675,
      "learning_rate": 1.4167550371155886e-05,
      "loss": 0.3962,
      "step": 2660
    },
    {
      "epoch": 1.0920527769254373,
      "grad_norm": 0.6055474396833757,
      "learning_rate": 1.413725193152553e-05,
      "loss": 0.3995,
      "step": 2670
    },
    {
      "epoch": 1.0961440114554566,
      "grad_norm": 1.0259816645911408,
      "learning_rate": 1.4106953491895167e-05,
      "loss": 0.3904,
      "step": 2680
    },
    {
      "epoch": 1.100235245985476,
      "grad_norm": 0.5928389552557433,
      "learning_rate": 1.407665505226481e-05,
      "loss": 0.4021,
      "step": 2690
    },
    {
      "epoch": 1.1043264805154955,
      "grad_norm": 0.6258598420479162,
      "learning_rate": 1.4046356612634451e-05,
      "loss": 0.391,
      "step": 2700
    },
    {
      "epoch": 1.108417715045515,
      "grad_norm": 0.7147294619644624,
      "learning_rate": 1.4016058173004092e-05,
      "loss": 0.3837,
      "step": 2710
    },
    {
      "epoch": 1.1125089495755345,
      "grad_norm": 0.646834791543815,
      "learning_rate": 1.3985759733373732e-05,
      "loss": 0.4035,
      "step": 2720
    },
    {
      "epoch": 1.116600184105554,
      "grad_norm": 0.5735852182826044,
      "learning_rate": 1.3955461293743373e-05,
      "loss": 0.3857,
      "step": 2730
    },
    {
      "epoch": 1.1206914186355732,
      "grad_norm": 0.7476886392647508,
      "learning_rate": 1.3925162854113014e-05,
      "loss": 0.4017,
      "step": 2740
    },
    {
      "epoch": 1.1247826531655927,
      "grad_norm": 0.9517953594151923,
      "learning_rate": 1.3894864414482657e-05,
      "loss": 0.3792,
      "step": 2750
    },
    {
      "epoch": 1.1288738876956121,
      "grad_norm": 0.8648296597362533,
      "learning_rate": 1.3864565974852296e-05,
      "loss": 0.3918,
      "step": 2760
    },
    {
      "epoch": 1.1329651222256316,
      "grad_norm": 0.758091624498141,
      "learning_rate": 1.3834267535221937e-05,
      "loss": 0.3849,
      "step": 2770
    },
    {
      "epoch": 1.137056356755651,
      "grad_norm": 0.8292099507010133,
      "learning_rate": 1.3803969095591579e-05,
      "loss": 0.4008,
      "step": 2780
    },
    {
      "epoch": 1.1411475912856703,
      "grad_norm": 0.8886800402947126,
      "learning_rate": 1.377367065596122e-05,
      "loss": 0.4094,
      "step": 2790
    },
    {
      "epoch": 1.1452388258156898,
      "grad_norm": 0.6921902768142778,
      "learning_rate": 1.3743372216330859e-05,
      "loss": 0.3857,
      "step": 2800
    },
    {
      "epoch": 1.1493300603457093,
      "grad_norm": 0.7941140268815301,
      "learning_rate": 1.37130737767005e-05,
      "loss": 0.3946,
      "step": 2810
    },
    {
      "epoch": 1.1534212948757288,
      "grad_norm": 0.8388193299897188,
      "learning_rate": 1.3682775337070141e-05,
      "loss": 0.3846,
      "step": 2820
    },
    {
      "epoch": 1.1575125294057482,
      "grad_norm": 0.8827526348826681,
      "learning_rate": 1.3652476897439784e-05,
      "loss": 0.392,
      "step": 2830
    },
    {
      "epoch": 1.1616037639357677,
      "grad_norm": 0.6399834967699533,
      "learning_rate": 1.3622178457809424e-05,
      "loss": 0.3936,
      "step": 2840
    },
    {
      "epoch": 1.1656949984657872,
      "grad_norm": 0.7858076115450058,
      "learning_rate": 1.3591880018179065e-05,
      "loss": 0.4021,
      "step": 2850
    },
    {
      "epoch": 1.1697862329958064,
      "grad_norm": 0.7641925003434336,
      "learning_rate": 1.3561581578548706e-05,
      "loss": 0.3855,
      "step": 2860
    },
    {
      "epoch": 1.173877467525826,
      "grad_norm": 0.4485280825495544,
      "learning_rate": 1.3531283138918347e-05,
      "loss": 0.4015,
      "step": 2870
    },
    {
      "epoch": 1.1779687020558454,
      "grad_norm": 0.8448609372779392,
      "learning_rate": 1.3500984699287987e-05,
      "loss": 0.4038,
      "step": 2880
    },
    {
      "epoch": 1.1820599365858648,
      "grad_norm": 0.6112956233981559,
      "learning_rate": 1.3470686259657628e-05,
      "loss": 0.3941,
      "step": 2890
    },
    {
      "epoch": 1.1861511711158843,
      "grad_norm": 0.787858901854597,
      "learning_rate": 1.3440387820027269e-05,
      "loss": 0.396,
      "step": 2900
    },
    {
      "epoch": 1.1902424056459036,
      "grad_norm": 0.7489331322179997,
      "learning_rate": 1.3410089380396912e-05,
      "loss": 0.3855,
      "step": 2910
    },
    {
      "epoch": 1.194333640175923,
      "grad_norm": 0.8245275477001581,
      "learning_rate": 1.3379790940766553e-05,
      "loss": 0.3903,
      "step": 2920
    },
    {
      "epoch": 1.1984248747059425,
      "grad_norm": 0.5684832752577668,
      "learning_rate": 1.3349492501136192e-05,
      "loss": 0.3892,
      "step": 2930
    },
    {
      "epoch": 1.202516109235962,
      "grad_norm": 0.5725195354548462,
      "learning_rate": 1.3319194061505834e-05,
      "loss": 0.3842,
      "step": 2940
    },
    {
      "epoch": 1.2066073437659814,
      "grad_norm": 0.8605746695624041,
      "learning_rate": 1.3288895621875475e-05,
      "loss": 0.3989,
      "step": 2950
    },
    {
      "epoch": 1.2106985782960007,
      "grad_norm": 0.569306392368477,
      "learning_rate": 1.3258597182245116e-05,
      "loss": 0.3886,
      "step": 2960
    },
    {
      "epoch": 1.2147898128260202,
      "grad_norm": 0.8210447551557634,
      "learning_rate": 1.3228298742614755e-05,
      "loss": 0.378,
      "step": 2970
    },
    {
      "epoch": 1.2188810473560396,
      "grad_norm": 0.5973996299783053,
      "learning_rate": 1.3198000302984398e-05,
      "loss": 0.4038,
      "step": 2980
    },
    {
      "epoch": 1.222972281886059,
      "grad_norm": 1.5938167927490012,
      "learning_rate": 1.316770186335404e-05,
      "loss": 0.4133,
      "step": 2990
    },
    {
      "epoch": 1.2270635164160786,
      "grad_norm": 0.6551073648017411,
      "learning_rate": 1.313740342372368e-05,
      "loss": 0.388,
      "step": 3000
    },
    {
      "epoch": 1.2270635164160786,
      "eval_loss": 0.40439197421073914,
      "eval_runtime": 566.8216,
      "eval_samples_per_second": 5.448,
      "eval_steps_per_second": 0.909,
      "step": 3000
    },
    {
      "epoch": 1.231154750946098,
      "grad_norm": 0.9122941726483766,
      "learning_rate": 1.310710498409332e-05,
      "loss": 0.3962,
      "step": 3010
    },
    {
      "epoch": 1.2352459854761175,
      "grad_norm": 0.8391097553941964,
      "learning_rate": 1.3076806544462961e-05,
      "loss": 0.387,
      "step": 3020
    },
    {
      "epoch": 1.2393372200061368,
      "grad_norm": 0.8126464750636853,
      "learning_rate": 1.3046508104832602e-05,
      "loss": 0.3914,
      "step": 3030
    },
    {
      "epoch": 1.2434284545361562,
      "grad_norm": 0.7877632963656168,
      "learning_rate": 1.3016209665202243e-05,
      "loss": 0.3678,
      "step": 3040
    },
    {
      "epoch": 1.2475196890661757,
      "grad_norm": 0.7204057647654071,
      "learning_rate": 1.2985911225571883e-05,
      "loss": 0.4056,
      "step": 3050
    },
    {
      "epoch": 1.2516109235961952,
      "grad_norm": 1.0360947842710033,
      "learning_rate": 1.2955612785941526e-05,
      "loss": 0.3921,
      "step": 3060
    },
    {
      "epoch": 1.2557021581262147,
      "grad_norm": 0.8740894371532404,
      "learning_rate": 1.2925314346311167e-05,
      "loss": 0.3785,
      "step": 3070
    },
    {
      "epoch": 1.259793392656234,
      "grad_norm": 0.7168443376463302,
      "learning_rate": 1.2895015906680808e-05,
      "loss": 0.403,
      "step": 3080
    },
    {
      "epoch": 1.2638846271862534,
      "grad_norm": 0.5888660550300815,
      "learning_rate": 1.2864717467050447e-05,
      "loss": 0.3716,
      "step": 3090
    },
    {
      "epoch": 1.2679758617162729,
      "grad_norm": 0.5289574947955048,
      "learning_rate": 1.2834419027420089e-05,
      "loss": 0.3764,
      "step": 3100
    },
    {
      "epoch": 1.2720670962462923,
      "grad_norm": 0.7080617124171087,
      "learning_rate": 1.280412058778973e-05,
      "loss": 0.3927,
      "step": 3110
    },
    {
      "epoch": 1.2761583307763118,
      "grad_norm": 0.8062114426141331,
      "learning_rate": 1.277382214815937e-05,
      "loss": 0.4044,
      "step": 3120
    },
    {
      "epoch": 1.280249565306331,
      "grad_norm": 0.476668956425164,
      "learning_rate": 1.274352370852901e-05,
      "loss": 0.3868,
      "step": 3130
    },
    {
      "epoch": 1.2843407998363507,
      "grad_norm": 0.6871622346909918,
      "learning_rate": 1.2713225268898653e-05,
      "loss": 0.3947,
      "step": 3140
    },
    {
      "epoch": 1.28843203436637,
      "grad_norm": 0.7061613344838735,
      "learning_rate": 1.2682926829268294e-05,
      "loss": 0.3872,
      "step": 3150
    },
    {
      "epoch": 1.2925232688963895,
      "grad_norm": 0.9863242302948113,
      "learning_rate": 1.2652628389637935e-05,
      "loss": 0.3889,
      "step": 3160
    },
    {
      "epoch": 1.296614503426409,
      "grad_norm": 0.5738534501395554,
      "learning_rate": 1.2622329950007575e-05,
      "loss": 0.4054,
      "step": 3170
    },
    {
      "epoch": 1.3007057379564284,
      "grad_norm": 0.8389559630933702,
      "learning_rate": 1.2592031510377216e-05,
      "loss": 0.3917,
      "step": 3180
    },
    {
      "epoch": 1.3047969724864479,
      "grad_norm": 0.7991010292600728,
      "learning_rate": 1.2561733070746857e-05,
      "loss": 0.3901,
      "step": 3190
    },
    {
      "epoch": 1.3088882070164671,
      "grad_norm": 0.5541164921899168,
      "learning_rate": 1.25314346311165e-05,
      "loss": 0.4008,
      "step": 3200
    },
    {
      "epoch": 1.3129794415464866,
      "grad_norm": 0.8663675517676568,
      "learning_rate": 1.2501136191486138e-05,
      "loss": 0.3966,
      "step": 3210
    },
    {
      "epoch": 1.317070676076506,
      "grad_norm": 0.5807714675119107,
      "learning_rate": 1.247083775185578e-05,
      "loss": 0.3881,
      "step": 3220
    },
    {
      "epoch": 1.3211619106065255,
      "grad_norm": 0.5949773301489646,
      "learning_rate": 1.2440539312225422e-05,
      "loss": 0.3967,
      "step": 3230
    },
    {
      "epoch": 1.325253145136545,
      "grad_norm": 1.0156281369557891,
      "learning_rate": 1.2410240872595063e-05,
      "loss": 0.3914,
      "step": 3240
    },
    {
      "epoch": 1.3293443796665643,
      "grad_norm": 0.5546340288088691,
      "learning_rate": 1.2379942432964704e-05,
      "loss": 0.3869,
      "step": 3250
    },
    {
      "epoch": 1.3334356141965837,
      "grad_norm": 1.0378625122625662,
      "learning_rate": 1.2349643993334344e-05,
      "loss": 0.4012,
      "step": 3260
    },
    {
      "epoch": 1.3375268487266032,
      "grad_norm": 1.0535947037253341,
      "learning_rate": 1.2319345553703985e-05,
      "loss": 0.4132,
      "step": 3270
    },
    {
      "epoch": 1.3416180832566227,
      "grad_norm": 0.6479776053412775,
      "learning_rate": 1.2289047114073628e-05,
      "loss": 0.3989,
      "step": 3280
    },
    {
      "epoch": 1.3457093177866422,
      "grad_norm": 0.6474716962137215,
      "learning_rate": 1.2258748674443269e-05,
      "loss": 0.3937,
      "step": 3290
    },
    {
      "epoch": 1.3498005523166616,
      "grad_norm": 0.770667683748129,
      "learning_rate": 1.2228450234812908e-05,
      "loss": 0.4021,
      "step": 3300
    },
    {
      "epoch": 1.353891786846681,
| "grad_norm": 0.9562402793665876, | |
| "learning_rate": 1.219815179518255e-05, | |
| "loss": 0.3895, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 1.3579830213767003, | |
| "grad_norm": 0.7835866454881345, | |
| "learning_rate": 1.216785335555219e-05, | |
| "loss": 0.3915, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 1.3620742559067198, | |
| "grad_norm": 0.6840970037680498, | |
| "learning_rate": 1.2137554915921832e-05, | |
| "loss": 0.3933, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 1.3661654904367393, | |
| "grad_norm": 0.6363375592990289, | |
| "learning_rate": 1.2107256476291471e-05, | |
| "loss": 0.3779, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 1.3702567249667588, | |
| "grad_norm": 0.6422064337095349, | |
| "learning_rate": 1.2076958036661112e-05, | |
| "loss": 0.3781, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.3743479594967782, | |
| "grad_norm": 0.7443280549443633, | |
| "learning_rate": 1.2046659597030755e-05, | |
| "loss": 0.3829, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 1.3784391940267975, | |
| "grad_norm": 0.9031635762318649, | |
| "learning_rate": 1.2016361157400396e-05, | |
| "loss": 0.3964, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 1.382530428556817, | |
| "grad_norm": 0.5980799440932613, | |
| "learning_rate": 1.1986062717770036e-05, | |
| "loss": 0.3887, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 1.3866216630868364, | |
| "grad_norm": 0.6477461394839297, | |
| "learning_rate": 1.1955764278139677e-05, | |
| "loss": 0.3912, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 1.390712897616856, | |
| "grad_norm": 0.8829826372548863, | |
| "learning_rate": 1.1925465838509318e-05, | |
| "loss": 0.4026, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.3948041321468754, | |
| "grad_norm": 0.602437282415112, | |
| "learning_rate": 1.1895167398878959e-05, | |
| "loss": 0.3968, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 1.3988953666768946, | |
| "grad_norm": 1.0853438418518562, | |
| "learning_rate": 1.1864868959248599e-05, | |
| "loss": 0.3795, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 1.4029866012069143, | |
| "grad_norm": 0.68812770168341, | |
| "learning_rate": 1.183457051961824e-05, | |
| "loss": 0.3903, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 1.4070778357369336, | |
| "grad_norm": 0.673891970112453, | |
| "learning_rate": 1.1804272079987883e-05, | |
| "loss": 0.3955, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 1.411169070266953, | |
| "grad_norm": 0.9658876480715098, | |
| "learning_rate": 1.1773973640357524e-05, | |
| "loss": 0.3806, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.4152603047969725, | |
| "grad_norm": 0.9072014909826842, | |
| "learning_rate": 1.1743675200727163e-05, | |
| "loss": 0.393, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 1.419351539326992, | |
| "grad_norm": 0.8593625419024139, | |
| "learning_rate": 1.1713376761096804e-05, | |
| "loss": 0.3829, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 1.4234427738570115, | |
| "grad_norm": 1.1598468495920022, | |
| "learning_rate": 1.1683078321466445e-05, | |
| "loss": 0.3895, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 1.4275340083870307, | |
| "grad_norm": 0.6212230421582741, | |
| "learning_rate": 1.1652779881836087e-05, | |
| "loss": 0.404, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 1.4316252429170502, | |
| "grad_norm": 1.052418138214992, | |
| "learning_rate": 1.1622481442205726e-05, | |
| "loss": 0.3788, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.4316252429170502, | |
| "eval_loss": 0.3989790678024292, | |
| "eval_runtime": 567.3009, | |
| "eval_samples_per_second": 5.443, | |
| "eval_steps_per_second": 0.908, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.4357164774470696, | |
| "grad_norm": 0.9457200564933581, | |
| "learning_rate": 1.1592183002575367e-05, | |
| "loss": 0.3848, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 1.4398077119770891, | |
| "grad_norm": 0.711408339200041, | |
| "learning_rate": 1.156188456294501e-05, | |
| "loss": 0.3777, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 1.4438989465071086, | |
| "grad_norm": 0.765462240881764, | |
| "learning_rate": 1.1531586123314651e-05, | |
| "loss": 0.392, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 1.4479901810371278, | |
| "grad_norm": 0.8447863863134836, | |
| "learning_rate": 1.150128768368429e-05, | |
| "loss": 0.394, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 1.4520814155671473, | |
| "grad_norm": 0.7219223617330234, | |
| "learning_rate": 1.1470989244053932e-05, | |
| "loss": 0.3854, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.4561726500971668, | |
| "grad_norm": 0.9253301498291125, | |
| "learning_rate": 1.1440690804423573e-05, | |
| "loss": 0.383, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 1.4602638846271863, | |
| "grad_norm": 0.7352515331825862, | |
| "learning_rate": 1.1410392364793214e-05, | |
| "loss": 0.4006, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 1.4643551191572057, | |
| "grad_norm": 0.7903238163122892, | |
| "learning_rate": 1.1380093925162854e-05, | |
| "loss": 0.3966, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 1.4684463536872252, | |
| "grad_norm": 0.8163601731666391, | |
| "learning_rate": 1.1349795485532495e-05, | |
| "loss": 0.3843, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 1.4725375882172447, | |
| "grad_norm": 0.7031404690333898, | |
| "learning_rate": 1.1319497045902137e-05, | |
| "loss": 0.3889, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.476628822747264, | |
| "grad_norm": 0.9035695820122014, | |
| "learning_rate": 1.1289198606271779e-05, | |
| "loss": 0.3827, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 1.4807200572772834, | |
| "grad_norm": 0.8712016470060545, | |
| "learning_rate": 1.125890016664142e-05, | |
| "loss": 0.3833, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 1.4848112918073029, | |
| "grad_norm": 0.5886133981154346, | |
| "learning_rate": 1.122860172701106e-05, | |
| "loss": 0.3852, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 1.4889025263373223, | |
| "grad_norm": 0.6641821222496354, | |
| "learning_rate": 1.11983032873807e-05, | |
| "loss": 0.3883, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 1.4929937608673418, | |
| "grad_norm": 0.71470696861074, | |
| "learning_rate": 1.1168004847750342e-05, | |
| "loss": 0.39, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.497084995397361, | |
| "grad_norm": 0.7599626830703158, | |
| "learning_rate": 1.1137706408119984e-05, | |
| "loss": 0.3988, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 1.5011762299273805, | |
| "grad_norm": 0.8131240458205017, | |
| "learning_rate": 1.1107407968489622e-05, | |
| "loss": 0.3837, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 1.5052674644574, | |
| "grad_norm": 0.5981406315717848, | |
| "learning_rate": 1.1077109528859265e-05, | |
| "loss": 0.3976, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 1.5093586989874195, | |
| "grad_norm": 0.6546000876796034, | |
| "learning_rate": 1.1046811089228906e-05, | |
| "loss": 0.3845, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 1.513449933517439, | |
| "grad_norm": 0.6618741576872935, | |
| "learning_rate": 1.1016512649598547e-05, | |
| "loss": 0.3829, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.5175411680474582, | |
| "grad_norm": 0.7644320579880938, | |
| "learning_rate": 1.0986214209968187e-05, | |
| "loss": 0.3904, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 1.521632402577478, | |
| "grad_norm": 0.7078963682359172, | |
| "learning_rate": 1.0955915770337828e-05, | |
| "loss": 0.3943, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 1.5257236371074971, | |
| "grad_norm": 0.9863976210557551, | |
| "learning_rate": 1.0925617330707469e-05, | |
| "loss": 0.3836, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 1.5298148716375166, | |
| "grad_norm": 0.7431834628180725, | |
| "learning_rate": 1.0895318891077112e-05, | |
| "loss": 0.4033, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 1.533906106167536, | |
| "grad_norm": 0.9543361591228587, | |
| "learning_rate": 1.0865020451446751e-05, | |
| "loss": 0.3928, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.5379973406975553, | |
| "grad_norm": 0.7174707063961077, | |
| "learning_rate": 1.0834722011816392e-05, | |
| "loss": 0.3848, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 1.542088575227575, | |
| "grad_norm": 0.8245320777882585, | |
| "learning_rate": 1.0804423572186034e-05, | |
| "loss": 0.3992, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 1.5461798097575943, | |
| "grad_norm": 1.0937610813639995, | |
| "learning_rate": 1.0774125132555675e-05, | |
| "loss": 0.3922, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 1.5502710442876138, | |
| "grad_norm": 0.6595221788634811, | |
| "learning_rate": 1.0743826692925314e-05, | |
| "loss": 0.3846, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 1.5543622788176332, | |
| "grad_norm": 0.6933714247729369, | |
| "learning_rate": 1.0713528253294955e-05, | |
| "loss": 0.3924, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.5584535133476527, | |
| "grad_norm": 0.6322021390419345, | |
| "learning_rate": 1.0683229813664597e-05, | |
| "loss": 0.382, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 1.5625447478776722, | |
| "grad_norm": 0.8617688710364446, | |
| "learning_rate": 1.065293137403424e-05, | |
| "loss": 0.39, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 1.5666359824076914, | |
| "grad_norm": 1.244329874318961, | |
| "learning_rate": 1.0622632934403879e-05, | |
| "loss": 0.3963, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 1.5707272169377111, | |
| "grad_norm": 0.9027740160226115, | |
| "learning_rate": 1.059233449477352e-05, | |
| "loss": 0.382, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 1.5748184514677304, | |
| "grad_norm": 0.5021808277805254, | |
| "learning_rate": 1.0562036055143161e-05, | |
| "loss": 0.399, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.5789096859977498, | |
| "grad_norm": 0.6718417113604481, | |
| "learning_rate": 1.0531737615512802e-05, | |
| "loss": 0.3947, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 1.5830009205277693, | |
| "grad_norm": 0.6732165543554006, | |
| "learning_rate": 1.0501439175882442e-05, | |
| "loss": 0.3921, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 1.5870921550577886, | |
| "grad_norm": 0.8949389121109214, | |
| "learning_rate": 1.0471140736252083e-05, | |
| "loss": 0.3789, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 1.5911833895878083, | |
| "grad_norm": 0.8368104145013396, | |
| "learning_rate": 1.0440842296621724e-05, | |
| "loss": 0.3838, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 1.5952746241178275, | |
| "grad_norm": 0.6115609968754325, | |
| "learning_rate": 1.0410543856991367e-05, | |
| "loss": 0.3919, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.599365858647847, | |
| "grad_norm": 0.8379228899852589, | |
| "learning_rate": 1.0380245417361006e-05, | |
| "loss": 0.3991, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 1.6034570931778664, | |
| "grad_norm": 0.7827671214396511, | |
| "learning_rate": 1.0349946977730647e-05, | |
| "loss": 0.3794, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 1.6075483277078857, | |
| "grad_norm": 0.6805284446607271, | |
| "learning_rate": 1.0319648538100289e-05, | |
| "loss": 0.3936, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 1.6116395622379054, | |
| "grad_norm": 0.5081615910622816, | |
| "learning_rate": 1.028935009846993e-05, | |
| "loss": 0.3882, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 1.6157307967679246, | |
| "grad_norm": 0.585926687974076, | |
| "learning_rate": 1.0259051658839571e-05, | |
| "loss": 0.4009, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.619822031297944, | |
| "grad_norm": 0.6755208541842371, | |
| "learning_rate": 1.022875321920921e-05, | |
| "loss": 0.3797, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 1.6239132658279636, | |
| "grad_norm": 0.7578805362403562, | |
| "learning_rate": 1.0198454779578853e-05, | |
| "loss": 0.3877, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 1.628004500357983, | |
| "grad_norm": 0.7490700009059831, | |
| "learning_rate": 1.0168156339948494e-05, | |
| "loss": 0.3779, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 1.6320957348880025, | |
| "grad_norm": 0.6801784359822746, | |
| "learning_rate": 1.0137857900318136e-05, | |
| "loss": 0.3741, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 1.6361869694180218, | |
| "grad_norm": 0.6583694408666441, | |
| "learning_rate": 1.0107559460687775e-05, | |
| "loss": 0.3959, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.6361869694180218, | |
| "eval_loss": 0.39864814281463623, | |
| "eval_runtime": 580.5731, | |
| "eval_samples_per_second": 5.319, | |
| "eval_steps_per_second": 0.887, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.6402782039480415, | |
| "grad_norm": 0.9122028487385195, | |
| "learning_rate": 1.0077261021057416e-05, | |
| "loss": 0.3933, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 1.6443694384780607, | |
| "grad_norm": 0.9631240105647585, | |
| "learning_rate": 1.0046962581427057e-05, | |
| "loss": 0.3886, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 1.6484606730080802, | |
| "grad_norm": 0.8266805005833362, | |
| "learning_rate": 1.0016664141796698e-05, | |
| "loss": 0.3878, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 1.6525519075380997, | |
| "grad_norm": 0.692721855989811, | |
| "learning_rate": 9.98636570216634e-06, | |
| "loss": 0.3732, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 1.656643142068119, | |
| "grad_norm": 0.5908823120184169, | |
| "learning_rate": 9.95606726253598e-06, | |
| "loss": 0.3808, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.6607343765981386, | |
| "grad_norm": 0.9905003235273305, | |
| "learning_rate": 9.925768822905622e-06, | |
| "loss": 0.3807, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 1.6648256111281579, | |
| "grad_norm": 0.731190147784246, | |
| "learning_rate": 9.895470383275261e-06, | |
| "loss": 0.3826, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 1.6689168456581773, | |
| "grad_norm": 0.5342009151761759, | |
| "learning_rate": 9.865171943644904e-06, | |
| "loss": 0.3931, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 1.6730080801881968, | |
| "grad_norm": 0.6299401650866043, | |
| "learning_rate": 9.834873504014544e-06, | |
| "loss": 0.3655, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 1.6770993147182163, | |
| "grad_norm": 0.6634845001555149, | |
| "learning_rate": 9.804575064384185e-06, | |
| "loss": 0.3857, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.6811905492482357, | |
| "grad_norm": 0.47553645688690244, | |
| "learning_rate": 9.774276624753826e-06, | |
| "loss": 0.3819, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 1.685281783778255, | |
| "grad_norm": 0.946430943640409, | |
| "learning_rate": 9.743978185123467e-06, | |
| "loss": 0.3761, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 1.6893730183082747, | |
| "grad_norm": 0.9939471324083836, | |
| "learning_rate": 9.713679745493108e-06, | |
| "loss": 0.388, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 1.693464252838294, | |
| "grad_norm": 0.6474361686497936, | |
| "learning_rate": 9.68338130586275e-06, | |
| "loss": 0.3662, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 1.6975554873683134, | |
| "grad_norm": 0.7980334202517952, | |
| "learning_rate": 9.653082866232389e-06, | |
| "loss": 0.3849, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.7016467218983329, | |
| "grad_norm": 0.728268244955796, | |
| "learning_rate": 9.622784426602032e-06, | |
| "loss": 0.3842, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 1.7057379564283521, | |
| "grad_norm": 0.9131018127080367, | |
| "learning_rate": 9.592485986971671e-06, | |
| "loss": 0.3824, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 1.7098291909583718, | |
| "grad_norm": 0.7674900891840208, | |
| "learning_rate": 9.562187547341312e-06, | |
| "loss": 0.3733, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 1.713920425488391, | |
| "grad_norm": 0.9246508059789594, | |
| "learning_rate": 9.531889107710953e-06, | |
| "loss": 0.4029, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 1.7180116600184105, | |
| "grad_norm": 0.6178369369789531, | |
| "learning_rate": 9.501590668080595e-06, | |
| "loss": 0.37, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.72210289454843, | |
| "grad_norm": 0.9227660541364927, | |
| "learning_rate": 9.471292228450236e-06, | |
| "loss": 0.3841, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 1.7261941290784493, | |
| "grad_norm": 0.6860127218022202, | |
| "learning_rate": 9.440993788819877e-06, | |
| "loss": 0.3859, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 1.730285363608469, | |
| "grad_norm": 0.6968645833002234, | |
| "learning_rate": 9.410695349189516e-06, | |
| "loss": 0.4007, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 1.7343765981384882, | |
| "grad_norm": 0.7545598370400031, | |
| "learning_rate": 9.38039690955916e-06, | |
| "loss": 0.3859, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 1.7384678326685077, | |
| "grad_norm": 0.7363763263567308, | |
| "learning_rate": 9.350098469928799e-06, | |
| "loss": 0.3867, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.7425590671985272, | |
| "grad_norm": 0.48614259001613647, | |
| "learning_rate": 9.31980003029844e-06, | |
| "loss": 0.3798, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 1.7466503017285466, | |
| "grad_norm": 0.7729758585437992, | |
| "learning_rate": 9.289501590668083e-06, | |
| "loss": 0.3912, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 1.750741536258566, | |
| "grad_norm": 0.7305646510615461, | |
| "learning_rate": 9.259203151037722e-06, | |
| "loss": 0.385, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 1.7548327707885853, | |
| "grad_norm": 0.9399629054581843, | |
| "learning_rate": 9.228904711407363e-06, | |
| "loss": 0.3875, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 1.758924005318605, | |
| "grad_norm": 0.49789660727149787, | |
| "learning_rate": 9.198606271777004e-06, | |
| "loss": 0.3886, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.7630152398486243, | |
| "grad_norm": 0.8040334242726411, | |
| "learning_rate": 9.168307832146646e-06, | |
| "loss": 0.3854, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 1.7671064743786438, | |
| "grad_norm": 0.520719829405063, | |
| "learning_rate": 9.138009392516287e-06, | |
| "loss": 0.3717, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 1.7711977089086632, | |
| "grad_norm": 1.0848704176681172, | |
| "learning_rate": 9.107710952885928e-06, | |
| "loss": 0.3888, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 1.7752889434386825, | |
| "grad_norm": 1.1159019237173737, | |
| "learning_rate": 9.077412513255567e-06, | |
| "loss": 0.3773, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 1.7793801779687022, | |
| "grad_norm": 0.45334318412994085, | |
| "learning_rate": 9.04711407362521e-06, | |
| "loss": 0.3795, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.7834714124987214, | |
| "grad_norm": 0.6934540742121007, | |
| "learning_rate": 9.01681563399485e-06, | |
| "loss": 0.3796, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 1.787562647028741, | |
| "grad_norm": 0.5578670479631883, | |
| "learning_rate": 8.98651719436449e-06, | |
| "loss": 0.3845, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 1.7916538815587604, | |
| "grad_norm": 0.7503759650869845, | |
| "learning_rate": 8.956218754734132e-06, | |
| "loss": 0.3942, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 1.7957451160887796, | |
| "grad_norm": 0.867971089595999, | |
| "learning_rate": 8.925920315103773e-06, | |
| "loss": 0.3862, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 1.7998363506187993, | |
| "grad_norm": 0.8400133056337923, | |
| "learning_rate": 8.895621875473414e-06, | |
| "loss": 0.4045, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.8039275851488186, | |
| "grad_norm": 0.8593095780014482, | |
| "learning_rate": 8.865323435843055e-06, | |
| "loss": 0.3972, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 1.808018819678838, | |
| "grad_norm": 0.8648015436921023, | |
| "learning_rate": 8.835024996212695e-06, | |
| "loss": 0.3885, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 1.8121100542088575, | |
| "grad_norm": 0.8474119696284984, | |
| "learning_rate": 8.804726556582338e-06, | |
| "loss": 0.3797, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 1.816201288738877, | |
| "grad_norm": 0.6888945197943561, | |
| "learning_rate": 8.774428116951977e-06, | |
| "loss": 0.386, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 1.8202925232688965, | |
| "grad_norm": 0.8152832539209761, | |
| "learning_rate": 8.744129677321618e-06, | |
| "loss": 0.3903, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.8243837577989157, | |
| "grad_norm": 0.7728650210318904, | |
| "learning_rate": 8.71383123769126e-06, | |
| "loss": 0.3919, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 1.8284749923289354, | |
| "grad_norm": 0.5391104409638143, | |
| "learning_rate": 8.6835327980609e-06, | |
| "loss": 0.3803, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 1.8325662268589546, | |
| "grad_norm": 0.7567717191116343, | |
| "learning_rate": 8.653234358430542e-06, | |
| "loss": 0.3815, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 1.8366574613889741, | |
| "grad_norm": 0.8749818376847743, | |
| "learning_rate": 8.622935918800183e-06, | |
| "loss": 0.3883, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 1.8407486959189936, | |
| "grad_norm": 0.8019778003533239, | |
| "learning_rate": 8.592637479169822e-06, | |
| "loss": 0.383, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.8407486959189936, | |
| "eval_loss": 0.3955570459365845, | |
| "eval_runtime": 568.3755, | |
| "eval_samples_per_second": 5.433, | |
| "eval_steps_per_second": 0.906, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.8448399304490128, | |
| "grad_norm": 0.6293625351988416, | |
| "learning_rate": 8.562339039539465e-06, | |
| "loss": 0.387, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 1.8489311649790325, | |
| "grad_norm": 0.6963219813926529, | |
| "learning_rate": 8.532040599909105e-06, | |
| "loss": 0.3802, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 1.8530223995090518, | |
| "grad_norm": 0.6929306602792736, | |
| "learning_rate": 8.501742160278746e-06, | |
| "loss": 0.3797, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 1.8571136340390713, | |
| "grad_norm": 0.669690717256871, | |
| "learning_rate": 8.471443720648387e-06, | |
| "loss": 0.394, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 1.8612048685690907, | |
| "grad_norm": 0.6233392743914433, | |
| "learning_rate": 8.441145281018028e-06, | |
| "loss": 0.3896, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.8652961030991102, | |
| "grad_norm": 0.7239502206719675, | |
| "learning_rate": 8.41084684138767e-06, | |
| "loss": 0.3947, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 1.8693873376291297, | |
| "grad_norm": 0.6756956090605876, | |
| "learning_rate": 8.38054840175731e-06, | |
| "loss": 0.3925, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 1.873478572159149, | |
| "grad_norm": 0.7446762514738522, | |
| "learning_rate": 8.35024996212695e-06, | |
| "loss": 0.3833, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 1.8775698066891686, | |
| "grad_norm": 0.7701766571709215, | |
| "learning_rate": 8.319951522496593e-06, | |
| "loss": 0.3864, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 1.8816610412191879, | |
| "grad_norm": 0.6441859481438332, | |
| "learning_rate": 8.289653082866234e-06, | |
| "loss": 0.3776, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.8857522757492073, | |
| "grad_norm": 0.7880686132167092, | |
| "learning_rate": 8.259354643235873e-06, | |
| "loss": 0.3933, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 1.8898435102792268, | |
| "grad_norm": 0.6332952763922544, | |
| "learning_rate": 8.229056203605516e-06, | |
| "loss": 0.3853, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 1.893934744809246, | |
| "grad_norm": 0.7243342267131859, | |
| "learning_rate": 8.198757763975156e-06, | |
| "loss": 0.3792, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 1.8980259793392658, | |
| "grad_norm": 0.7901459850102072, | |
| "learning_rate": 8.168459324344797e-06, | |
| "loss": 0.3869, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 1.902117213869285, | |
| "grad_norm": 0.51044070307818, | |
| "learning_rate": 8.138160884714438e-06, | |
| "loss": 0.3775, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.9062084483993045, | |
| "grad_norm": 0.8558077236451173, | |
| "learning_rate": 8.107862445084079e-06, | |
| "loss": 0.3894, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 1.910299682929324, | |
| "grad_norm": 0.7131026847220464, | |
| "learning_rate": 8.07756400545372e-06, | |
| "loss": 0.3862, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 1.9143909174593432, | |
| "grad_norm": 0.9750476025242552, | |
| "learning_rate": 8.047265565823361e-06, | |
| "loss": 0.3957, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 1.918482151989363, | |
| "grad_norm": 0.7225249415951167, | |
| "learning_rate": 8.016967126193e-06, | |
| "loss": 0.4027, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 1.9225733865193821, | |
| "grad_norm": 0.5891488270484448, | |
| "learning_rate": 7.986668686562644e-06, | |
| "loss": 0.3886, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.9266646210494016, | |
| "grad_norm": 1.0628090466492606, | |
| "learning_rate": 7.956370246932283e-06, | |
| "loss": 0.398, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 1.930755855579421, | |
| "grad_norm": 0.6647239527835125, | |
| "learning_rate": 7.926071807301924e-06, | |
| "loss": 0.3918, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 1.9348470901094406, | |
| "grad_norm": 0.7463185047324199, | |
| "learning_rate": 7.895773367671565e-06, | |
| "loss": 0.3827, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 1.93893832463946, | |
| "grad_norm": 0.7195015094736638, | |
| "learning_rate": 7.865474928041206e-06, | |
| "loss": 0.4016, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 1.9430295591694793, | |
| "grad_norm": 0.8535138221139457, | |
| "learning_rate": 7.835176488410848e-06, | |
| "loss": 0.3869, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.947120793699499, | |
| "grad_norm": 0.7791996740996957, | |
| "learning_rate": 7.804878048780489e-06, | |
| "loss": 0.3714, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 1.9512120282295182, | |
| "grad_norm": 0.5772248432427366, | |
| "learning_rate": 7.77457960915013e-06, | |
| "loss": 0.3822, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 1.9553032627595377, | |
| "grad_norm": 0.7604443317770228, | |
| "learning_rate": 7.744281169519771e-06, | |
| "loss": 0.379, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 1.9593944972895572, | |
| "grad_norm": 0.588549852538312, | |
| "learning_rate": 7.71398272988941e-06, | |
| "loss": 0.3814, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 1.9634857318195764, | |
| "grad_norm": 1.0609622015863114, | |
| "learning_rate": 7.683684290259052e-06, | |
| "loss": 0.3844, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.9675769663495961, | |
| "grad_norm": 0.8833450757521427, | |
| "learning_rate": 7.653385850628693e-06, | |
| "loss": 0.3941, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 1.9716682008796154, | |
| "grad_norm": 0.6612140126470812, | |
| "learning_rate": 7.623087410998334e-06, | |
| "loss": 0.3805, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 1.9757594354096348, | |
| "grad_norm": 0.653419144919706, | |
| "learning_rate": 7.592788971367975e-06, | |
| "loss": 0.3666, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 1.9798506699396543, | |
| "grad_norm": 0.6797670566745962, | |
| "learning_rate": 7.562490531737616e-06, | |
| "loss": 0.3742, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 1.9839419044696738, | |
| "grad_norm": 0.6644531103160585, | |
| "learning_rate": 7.5321920921072566e-06, | |
| "loss": 0.3842, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.9880331389996933, | |
| "grad_norm": 1.2106290360521903, | |
| "learning_rate": 7.5018936524768986e-06, | |
| "loss": 0.3911, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 1.9921243735297125, | |
| "grad_norm": 0.5569971531476807, | |
| "learning_rate": 7.471595212846539e-06, | |
| "loss": 0.4069, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 1.996215608059732, | |
| "grad_norm": 0.5191981517163119, | |
| "learning_rate": 7.44129677321618e-06, | |
| "loss": 0.3763, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.3233158209509672, | |
| "learning_rate": 7.41099833358582e-06, | |
| "loss": 0.368, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 2.0040912345300192, | |
| "grad_norm": 0.6324023695845297, | |
| "learning_rate": 7.380699893955462e-06, | |
| "loss": 0.3502, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 2.008182469060039, | |
| "grad_norm": 0.6060365694093942, | |
| "learning_rate": 7.350401454325103e-06, | |
| "loss": 0.3634, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 2.012273703590058, | |
| "grad_norm": 0.9148807418922357, | |
| "learning_rate": 7.320103014694744e-06, | |
| "loss": 0.3799, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 2.016364938120078, | |
| "grad_norm": 0.5414356512353191, | |
| "learning_rate": 7.289804575064385e-06, | |
| "loss": 0.3637, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 2.020456172650097, | |
| "grad_norm": 0.9191131346375879, | |
| "learning_rate": 7.259506135434026e-06, | |
| "loss": 0.3502, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 2.0245474071801164, | |
| "grad_norm": 0.5288098199455225, | |
| "learning_rate": 7.229207695803667e-06, | |
| "loss": 0.3594, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 2.028638641710136, | |
| "grad_norm": 0.6644017187035499, | |
| "learning_rate": 7.1989092561733075e-06, | |
| "loss": 0.3566, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 2.0327298762401553, | |
| "grad_norm": 0.7063344184257762, | |
| "learning_rate": 7.1686108165429495e-06, | |
| "loss": 0.3729, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 2.036821110770175, | |
| "grad_norm": 0.716588405815958, | |
| "learning_rate": 7.13831237691259e-06, | |
| "loss": 0.3632, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 2.0409123453001943, | |
| "grad_norm": 0.780104450356012, | |
| "learning_rate": 7.108013937282231e-06, | |
| "loss": 0.3503, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 2.045003579830214, | |
| "grad_norm": 0.9214206047619449, | |
| "learning_rate": 7.077715497651871e-06, | |
| "loss": 0.3583, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 2.045003579830214, | |
| "eval_loss": 0.40945789217948914, | |
| "eval_runtime": 566.8305, | |
| "eval_samples_per_second": 5.448, | |
| "eval_steps_per_second": 0.909, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 2.049094814360233, | |
| "grad_norm": 0.8803623081885309, | |
| "learning_rate": 7.047417058021513e-06, | |
| "loss": 0.3524, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 2.0531860488902525, | |
| "grad_norm": 0.8089956621396417, | |
| "learning_rate": 7.0171186183911536e-06, | |
| "loss": 0.3423, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 2.057277283420272, | |
| "grad_norm": 1.026614900390396, | |
| "learning_rate": 6.986820178760795e-06, | |
| "loss": 0.3612, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 2.0613685179502914, | |
| "grad_norm": 0.8153155935459168, | |
| "learning_rate": 6.956521739130435e-06, | |
| "loss": 0.3619, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 2.065459752480311, | |
| "grad_norm": 0.931239718869365, | |
| "learning_rate": 6.926223299500077e-06, | |
| "loss": 0.3475, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 2.0695509870103304, | |
| "grad_norm": 1.0467750800215039, | |
| "learning_rate": 6.895924859869717e-06, | |
| "loss": 0.3535, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 2.0736422215403496, | |
| "grad_norm": 0.9059104329232058, | |
| "learning_rate": 6.8656264202393585e-06, | |
| "loss": 0.3697, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 2.0777334560703693, | |
| "grad_norm": 0.9807695952493246, | |
| "learning_rate": 6.835327980608999e-06, | |
| "loss": 0.3465, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 2.0818246906003885, | |
| "grad_norm": 0.8912366875608871, | |
| "learning_rate": 6.805029540978641e-06, | |
| "loss": 0.3423, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 2.0859159251304082, | |
| "grad_norm": 0.7770463857589687, | |
| "learning_rate": 6.774731101348281e-06, | |
| "loss": 0.3514, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 2.0900071596604275, | |
| "grad_norm": 0.9470302336462076, | |
| "learning_rate": 6.744432661717922e-06, | |
| "loss": 0.3282, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 2.094098394190447, | |
| "grad_norm": 0.9649585745676128, | |
| "learning_rate": 6.7141342220875625e-06, | |
| "loss": 0.3525, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 2.0981896287204664, | |
| "grad_norm": 0.9294501847855035, | |
| "learning_rate": 6.6838357824572045e-06, | |
| "loss": 0.3568, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 2.1022808632504857, | |
| "grad_norm": 0.7971412344974254, | |
| "learning_rate": 6.653537342826845e-06, | |
| "loss": 0.3649, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 2.1063720977805054, | |
| "grad_norm": 0.9239690361706143, | |
| "learning_rate": 6.623238903196486e-06, | |
| "loss": 0.3671, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 2.1104633323105246, | |
| "grad_norm": 0.792788321441495, | |
| "learning_rate": 6.592940463566126e-06, | |
| "loss": 0.3696, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 2.1145545668405443, | |
| "grad_norm": 0.96794000584284, | |
| "learning_rate": 6.562642023935768e-06, | |
| "loss": 0.3526, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 2.1186458013705636, | |
| "grad_norm": 0.9461764596400418, | |
| "learning_rate": 6.5323435843054086e-06, | |
| "loss": 0.3481, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 2.122737035900583, | |
| "grad_norm": 0.9746653976507995, | |
| "learning_rate": 6.50204514467505e-06, | |
| "loss": 0.3641, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 2.1268282704306025, | |
| "grad_norm": 0.9134747859457668, | |
| "learning_rate": 6.47174670504469e-06, | |
| "loss": 0.3477, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 2.1309195049606218, | |
| "grad_norm": 0.9422179764228524, | |
| "learning_rate": 6.441448265414332e-06, | |
| "loss": 0.3538, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 2.1350107394906415, | |
| "grad_norm": 0.9768549519399179, | |
| "learning_rate": 6.411149825783972e-06, | |
| "loss": 0.36, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 2.1391019740206607, | |
| "grad_norm": 0.8493532914358172, | |
| "learning_rate": 6.3808513861536135e-06, | |
| "loss": 0.3551, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 2.14319320855068, | |
| "grad_norm": 0.850886523603342, | |
| "learning_rate": 6.350552946523254e-06, | |
| "loss": 0.3497, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 2.1472844430806997, | |
| "grad_norm": 1.192315078529062, | |
| "learning_rate": 6.320254506892896e-06, | |
| "loss": 0.3618, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 2.151375677610719, | |
| "grad_norm": 0.8946571568288144, | |
| "learning_rate": 6.289956067262537e-06, | |
| "loss": 0.3458, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 2.1554669121407386, | |
| "grad_norm": 0.8759600168097015, | |
| "learning_rate": 6.259657627632177e-06, | |
| "loss": 0.3681, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 2.159558146670758, | |
| "grad_norm": 0.8486779868515286, | |
| "learning_rate": 6.229359188001819e-06, | |
| "loss": 0.3674, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 2.163649381200777, | |
| "grad_norm": 0.9573497878508632, | |
| "learning_rate": 6.1990607483714595e-06, | |
| "loss": 0.3515, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 2.167740615730797, | |
| "grad_norm": 1.0608696592581122, | |
| "learning_rate": 6.168762308741101e-06, | |
| "loss": 0.3575, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 2.171831850260816, | |
| "grad_norm": 0.9550842012101914, | |
| "learning_rate": 6.138463869110741e-06, | |
| "loss": 0.3591, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 2.1759230847908357, | |
| "grad_norm": 0.7852881818968737, | |
| "learning_rate": 6.108165429480383e-06, | |
| "loss": 0.3609, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 2.180014319320855, | |
| "grad_norm": 1.1566940002899782, | |
| "learning_rate": 6.077866989850023e-06, | |
| "loss": 0.3407, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 2.1841055538508747, | |
| "grad_norm": 0.8078702927449839, | |
| "learning_rate": 6.047568550219664e-06, | |
| "loss": 0.3486, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 2.188196788380894, | |
| "grad_norm": 0.9239475839958554, | |
| "learning_rate": 6.017270110589305e-06, | |
| "loss": 0.3503, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 2.192288022910913, | |
| "grad_norm": 0.9902787811712669, | |
| "learning_rate": 5.986971670958947e-06, | |
| "loss": 0.3616, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 2.196379257440933, | |
| "grad_norm": 0.993917561828579, | |
| "learning_rate": 5.956673231328587e-06, | |
| "loss": 0.3564, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 2.200470491970952, | |
| "grad_norm": 0.9912437866525817, | |
| "learning_rate": 5.926374791698228e-06, | |
| "loss": 0.3515, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 2.204561726500972, | |
| "grad_norm": 0.9063774423655953, | |
| "learning_rate": 5.8960763520678685e-06, | |
| "loss": 0.3597, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 2.208652961030991, | |
| "grad_norm": 1.2314911545390765, | |
| "learning_rate": 5.8657779124375105e-06, | |
| "loss": 0.343, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 2.2127441955610108, | |
| "grad_norm": 0.9949853000873754, | |
| "learning_rate": 5.835479472807151e-06, | |
| "loss": 0.3514, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 2.21683543009103, | |
| "grad_norm": 1.000104375303856, | |
| "learning_rate": 5.805181033176792e-06, | |
| "loss": 0.3638, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 2.2209266646210493, | |
| "grad_norm": 1.128813445069188, | |
| "learning_rate": 5.774882593546432e-06, | |
| "loss": 0.3656, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 2.225017899151069, | |
| "grad_norm": 0.9792714948138356, | |
| "learning_rate": 5.744584153916074e-06, | |
| "loss": 0.3536, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 2.229109133681088, | |
| "grad_norm": 1.0987526220359756, | |
| "learning_rate": 5.7142857142857145e-06, | |
| "loss": 0.3721, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 2.233200368211108, | |
| "grad_norm": 0.8952391612311649, | |
| "learning_rate": 5.683987274655356e-06, | |
| "loss": 0.354, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 2.237291602741127, | |
| "grad_norm": 1.1608074782040154, | |
| "learning_rate": 5.653688835024996e-06, | |
| "loss": 0.3561, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 2.2413828372711464, | |
| "grad_norm": 1.003805233436142, | |
| "learning_rate": 5.623390395394638e-06, | |
| "loss": 0.3551, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 2.245474071801166, | |
| "grad_norm": 0.8422927034761718, | |
| "learning_rate": 5.593091955764278e-06, | |
| "loss": 0.3467, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 2.2495653063311853, | |
| "grad_norm": 1.034912305431081, | |
| "learning_rate": 5.562793516133919e-06, | |
| "loss": 0.3762, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 2.2495653063311853, | |
| "eval_loss": 0.4107515215873718, | |
| "eval_runtime": 566.5709, | |
| "eval_samples_per_second": 5.45, | |
| "eval_steps_per_second": 0.909, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 2.253656540861205, | |
| "grad_norm": 0.9665919628552063, | |
| "learning_rate": 5.53249507650356e-06, | |
| "loss": 0.3683, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 2.2577477753912243, | |
| "grad_norm": 1.1208116392163985, | |
| "learning_rate": 5.502196636873202e-06, | |
| "loss": 0.3492, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 2.2618390099212435, | |
| "grad_norm": 0.8340800205831346, | |
| "learning_rate": 5.471898197242842e-06, | |
| "loss": 0.3372, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 2.2659302444512632, | |
| "grad_norm": 0.9589224290151471, | |
| "learning_rate": 5.441599757612483e-06, | |
| "loss": 0.3478, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 2.2700214789812825, | |
| "grad_norm": 1.015598680602829, | |
| "learning_rate": 5.4113013179821235e-06, | |
| "loss": 0.3607, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 2.274112713511302, | |
| "grad_norm": 0.9746676080601815, | |
| "learning_rate": 5.3810028783517655e-06, | |
| "loss": 0.3611, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 2.2782039480413214, | |
| "grad_norm": 0.9296538878972167, | |
| "learning_rate": 5.350704438721406e-06, | |
| "loss": 0.3576, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 2.2822951825713407, | |
| "grad_norm": 0.8789512395616204, | |
| "learning_rate": 5.320405999091047e-06, | |
| "loss": 0.3629, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 2.2863864171013604, | |
| "grad_norm": 1.0582855408609264, | |
| "learning_rate": 5.290107559460688e-06, | |
| "loss": 0.3634, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 2.2904776516313796, | |
| "grad_norm": 0.9546020518135027, | |
| "learning_rate": 5.259809119830329e-06, | |
| "loss": 0.3586, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 2.2945688861613993, | |
| "grad_norm": 0.962752971888759, | |
| "learning_rate": 5.22951068019997e-06, | |
| "loss": 0.3605, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 2.2986601206914186, | |
| "grad_norm": 1.0245121086717093, | |
| "learning_rate": 5.199212240569611e-06, | |
| "loss": 0.3587, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 2.3027513552214383, | |
| "grad_norm": 1.0040046723513294, | |
| "learning_rate": 5.168913800939253e-06, | |
| "loss": 0.3648, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 2.3068425897514575, | |
| "grad_norm": 1.171908094587368, | |
| "learning_rate": 5.138615361308893e-06, | |
| "loss": 0.3592, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 2.3109338242814768, | |
| "grad_norm": 1.011025217862205, | |
| "learning_rate": 5.108316921678534e-06, | |
| "loss": 0.3442, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 2.3150250588114965, | |
| "grad_norm": 1.1859171170651996, | |
| "learning_rate": 5.078018482048174e-06, | |
| "loss": 0.3633, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 2.3191162933415157, | |
| "grad_norm": 1.0924583307810276, | |
| "learning_rate": 5.047720042417816e-06, | |
| "loss": 0.3532, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 2.3232075278715354, | |
| "grad_norm": 1.167947573027158, | |
| "learning_rate": 5.017421602787457e-06, | |
| "loss": 0.3604, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 2.3272987624015546, | |
| "grad_norm": 1.2875926245828595, | |
| "learning_rate": 4.987123163157098e-06, | |
| "loss": 0.3429, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 2.3313899969315743, | |
| "grad_norm": 1.156830068761064, | |
| "learning_rate": 4.956824723526739e-06, | |
| "loss": 0.3434, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 2.3354812314615936, | |
| "grad_norm": 0.9404794900229477, | |
| "learning_rate": 4.926526283896379e-06, | |
| "loss": 0.3399, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 2.339572465991613, | |
| "grad_norm": 1.116557440039318, | |
| "learning_rate": 4.8962278442660205e-06, | |
| "loss": 0.3584, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 2.3436637005216325, | |
| "grad_norm": 1.2190130786864881, | |
| "learning_rate": 4.865929404635662e-06, | |
| "loss": 0.3701, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 2.347754935051652, | |
| "grad_norm": 1.2099886678141103, | |
| "learning_rate": 4.835630965005303e-06, | |
| "loss": 0.3675, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 2.3518461695816715, | |
| "grad_norm": 1.1364465919872204, | |
| "learning_rate": 4.805332525374943e-06, | |
| "loss": 0.3608, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 2.3559374041116907, | |
| "grad_norm": 0.9321036699083428, | |
| "learning_rate": 4.775034085744584e-06, | |
| "loss": 0.3496, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 2.36002863864171, | |
| "grad_norm": 0.9146634319851078, | |
| "learning_rate": 4.744735646114225e-06, | |
| "loss": 0.3502, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 2.3641198731717297, | |
| "grad_norm": 1.000647504917848, | |
| "learning_rate": 4.7144372064838665e-06, | |
| "loss": 0.3544, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 2.368211107701749, | |
| "grad_norm": 1.1557436169214348, | |
| "learning_rate": 4.684138766853508e-06, | |
| "loss": 0.3518, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 2.3723023422317686, | |
| "grad_norm": 1.2926612320528568, | |
| "learning_rate": 4.653840327223149e-06, | |
| "loss": 0.3486, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 2.376393576761788, | |
| "grad_norm": 1.2169663560603732, | |
| "learning_rate": 4.62354188759279e-06, | |
| "loss": 0.3539, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 2.380484811291807, | |
| "grad_norm": 0.9319379770517142, | |
| "learning_rate": 4.59324344796243e-06, | |
| "loss": 0.328, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 2.384576045821827, | |
| "grad_norm": 1.0263622334503117, | |
| "learning_rate": 4.562945008332071e-06, | |
| "loss": 0.3536, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 2.388667280351846, | |
| "grad_norm": 1.152645840965358, | |
| "learning_rate": 4.5326465687017126e-06, | |
| "loss": 0.3536, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 2.3927585148818658, | |
| "grad_norm": 0.9658502725314514, | |
| "learning_rate": 4.502348129071354e-06, | |
| "loss": 0.3548, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 2.396849749411885, | |
| "grad_norm": 1.2197202047381088, | |
| "learning_rate": 4.472049689440994e-06, | |
| "loss": 0.3559, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 2.4009409839419042, | |
| "grad_norm": 1.2765015130610389, | |
| "learning_rate": 4.441751249810635e-06, | |
| "loss": 0.3497, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 2.405032218471924, | |
| "grad_norm": 1.38104638770045, | |
| "learning_rate": 4.411452810180276e-06, | |
| "loss": 0.3448, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 2.409123453001943, | |
| "grad_norm": 1.1400186315945817, | |
| "learning_rate": 4.3811543705499174e-06, | |
| "loss": 0.3561, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 2.413214687531963, | |
| "grad_norm": 1.332445802283346, | |
| "learning_rate": 4.350855930919558e-06, | |
| "loss": 0.3558, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 2.417305922061982, | |
| "grad_norm": 0.8518719177973914, | |
| "learning_rate": 4.320557491289199e-06, | |
| "loss": 0.3557, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 2.4213971565920014, | |
| "grad_norm": 1.3083347016848765, | |
| "learning_rate": 4.29025905165884e-06, | |
| "loss": 0.3443, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 2.425488391122021, | |
| "grad_norm": 1.2245072085864033, | |
| "learning_rate": 4.259960612028481e-06, | |
| "loss": 0.3424, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 2.4295796256520403, | |
| "grad_norm": 1.0512105297819025, | |
| "learning_rate": 4.2296621723981215e-06, | |
| "loss": 0.3668, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 2.43367086018206, | |
| "grad_norm": 1.2181678876630533, | |
| "learning_rate": 4.199363732767763e-06, | |
| "loss": 0.3514, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 2.4377620947120793, | |
| "grad_norm": 1.2107998993154634, | |
| "learning_rate": 4.169065293137404e-06, | |
| "loss": 0.3634, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 2.441853329242099, | |
| "grad_norm": 1.162395707185428, | |
| "learning_rate": 4.138766853507045e-06, | |
| "loss": 0.35, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 2.445944563772118, | |
| "grad_norm": 1.0932545213281812, | |
| "learning_rate": 4.108468413876685e-06, | |
| "loss": 0.3472, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 2.450035798302138, | |
| "grad_norm": 1.0010475016537828, | |
| "learning_rate": 4.078169974246326e-06, | |
| "loss": 0.3584, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 2.454127032832157, | |
| "grad_norm": 0.8180158364405802, | |
| "learning_rate": 4.0478715346159675e-06, | |
| "loss": 0.3597, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.454127032832157, | |
| "eval_loss": 0.4111001789569855, | |
| "eval_runtime": 567.1279, | |
| "eval_samples_per_second": 5.445, | |
| "eval_steps_per_second": 0.908, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.4582182673621764, | |
| "grad_norm": 1.152408348113746, | |
| "learning_rate": 4.017573094985609e-06, | |
| "loss": 0.352, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 2.462309501892196, | |
| "grad_norm": 0.9901037242730653, | |
| "learning_rate": 3.987274655355249e-06, | |
| "loss": 0.3413, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 2.4664007364222154, | |
| "grad_norm": 1.0744413310129686, | |
| "learning_rate": 3.95697621572489e-06, | |
| "loss": 0.3572, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 2.470491970952235, | |
| "grad_norm": 1.1304586395449463, | |
| "learning_rate": 3.926677776094531e-06, | |
| "loss": 0.3611, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 2.4745832054822543, | |
| "grad_norm": 1.1937074296439871, | |
| "learning_rate": 3.8963793364641724e-06, | |
| "loss": 0.351, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 2.4786744400122735, | |
| "grad_norm": 0.9420890149335949, | |
| "learning_rate": 3.866080896833813e-06, | |
| "loss": 0.3556, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 2.4827656745422932, | |
| "grad_norm": 1.0885349991773454, | |
| "learning_rate": 3.835782457203454e-06, | |
| "loss": 0.3545, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 2.4868569090723125, | |
| "grad_norm": 1.076589826454013, | |
| "learning_rate": 3.805484017573095e-06, | |
| "loss": 0.353, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 2.490948143602332, | |
| "grad_norm": 0.9678457177285116, | |
| "learning_rate": 3.775185577942736e-06, | |
| "loss": 0.3535, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 2.4950393781323514, | |
| "grad_norm": 1.2568971383029166, | |
| "learning_rate": 3.7448871383123773e-06, | |
| "loss": 0.3535, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 2.4991306126623707, | |
| "grad_norm": 1.1438602826766222, | |
| "learning_rate": 3.7145886986820185e-06, | |
| "loss": 0.3548, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 2.5032218471923904, | |
| "grad_norm": 1.1827797517036478, | |
| "learning_rate": 3.6842902590516592e-06, | |
| "loss": 0.3537, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 2.5073130817224096, | |
| "grad_norm": 1.1604168543066307, | |
| "learning_rate": 3.6539918194213004e-06, | |
| "loss": 0.3443, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 2.5114043162524293, | |
| "grad_norm": 1.5271313980044559, | |
| "learning_rate": 3.623693379790941e-06, | |
| "loss": 0.3676, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 2.5154955507824486, | |
| "grad_norm": 1.0170843727726653, | |
| "learning_rate": 3.5933949401605822e-06, | |
| "loss": 0.333, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 2.519586785312468, | |
| "grad_norm": 1.478391272829513, | |
| "learning_rate": 3.563096500530223e-06, | |
| "loss": 0.3289, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 2.5236780198424875, | |
| "grad_norm": 1.1327119923685498, | |
| "learning_rate": 3.532798060899864e-06, | |
| "loss": 0.3465, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 2.5277692543725068, | |
| "grad_norm": 1.2833494932025962, | |
| "learning_rate": 3.502499621269505e-06, | |
| "loss": 0.3603, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 2.5318604889025265, | |
| "grad_norm": 1.3483786326224019, | |
| "learning_rate": 3.472201181639146e-06, | |
| "loss": 0.3499, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 2.5359517234325457, | |
| "grad_norm": 1.41685799213282, | |
| "learning_rate": 3.441902742008787e-06, | |
| "loss": 0.3557, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 2.540042957962565, | |
| "grad_norm": 1.1147817059656389, | |
| "learning_rate": 3.411604302378428e-06, | |
| "loss": 0.3525, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 2.5441341924925847, | |
| "grad_norm": 1.24256482526061, | |
| "learning_rate": 3.381305862748069e-06, | |
| "loss": 0.3557, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 2.548225427022604, | |
| "grad_norm": 1.14276520963034, | |
| "learning_rate": 3.3510074231177097e-06, | |
| "loss": 0.3443, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 2.5523166615526236, | |
| "grad_norm": 1.4308356706747418, | |
| "learning_rate": 3.320708983487351e-06, | |
| "loss": 0.3642, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 2.556407896082643, | |
| "grad_norm": 1.0603812873827165, | |
| "learning_rate": 3.2904105438569916e-06, | |
| "loss": 0.3507, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 2.560499130612662, | |
| "grad_norm": 1.1444596493356962, | |
| "learning_rate": 3.2601121042266328e-06, | |
| "loss": 0.3463, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 2.564590365142682, | |
| "grad_norm": 1.1550573279318823, | |
| "learning_rate": 3.2298136645962735e-06, | |
| "loss": 0.3517, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 2.5686815996727015, | |
| "grad_norm": 1.2279449599940317, | |
| "learning_rate": 3.1995152249659146e-06, | |
| "loss": 0.3419, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 2.5727728342027207, | |
| "grad_norm": 1.1531936082950656, | |
| "learning_rate": 3.1692167853355554e-06, | |
| "loss": 0.3553, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 2.57686406873274, | |
| "grad_norm": 1.1430327649681449, | |
| "learning_rate": 3.1389183457051965e-06, | |
| "loss": 0.3481, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.5809553032627597, | |
| "grad_norm": 1.2985006323282675, | |
| "learning_rate": 3.1086199060748372e-06, | |
| "loss": 0.3486, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 2.585046537792779, | |
| "grad_norm": 1.1447086789004135, | |
| "learning_rate": 3.0783214664444784e-06, | |
| "loss": 0.3516, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 2.5891377723227986, | |
| "grad_norm": 1.464516109493368, | |
| "learning_rate": 3.048023026814119e-06, | |
| "loss": 0.3478, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 2.593229006852818, | |
| "grad_norm": 1.1333941032924868, | |
| "learning_rate": 3.0177245871837603e-06, | |
| "loss": 0.3372, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 2.597320241382837, | |
| "grad_norm": 1.1610526768921359, | |
| "learning_rate": 2.987426147553401e-06, | |
| "loss": 0.3537, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 2.601411475912857, | |
| "grad_norm": 1.166132435226011, | |
| "learning_rate": 2.957127707923042e-06, | |
| "loss": 0.3495, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 2.605502710442876, | |
| "grad_norm": 1.1286755958679484, | |
| "learning_rate": 2.926829268292683e-06, | |
| "loss": 0.3567, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 2.6095939449728958, | |
| "grad_norm": 1.0343679227746663, | |
| "learning_rate": 2.896530828662324e-06, | |
| "loss": 0.3471, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 2.613685179502915, | |
| "grad_norm": 1.1778557485063874, | |
| "learning_rate": 2.8662323890319647e-06, | |
| "loss": 0.3549, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 2.6177764140329343, | |
| "grad_norm": 1.226656877106952, | |
| "learning_rate": 2.835933949401606e-06, | |
| "loss": 0.3496, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.621867648562954, | |
| "grad_norm": 1.3948209564375484, | |
| "learning_rate": 2.8056355097712466e-06, | |
| "loss": 0.3544, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 2.625958883092973, | |
| "grad_norm": 1.233392087784029, | |
| "learning_rate": 2.7753370701408878e-06, | |
| "loss": 0.34, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 2.630050117622993, | |
| "grad_norm": 1.2620457074981337, | |
| "learning_rate": 2.7450386305105285e-06, | |
| "loss": 0.3304, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 2.634141352153012, | |
| "grad_norm": 1.269720329253969, | |
| "learning_rate": 2.71474019088017e-06, | |
| "loss": 0.3589, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 2.6382325866830314, | |
| "grad_norm": 1.1908184218530549, | |
| "learning_rate": 2.6844417512498112e-06, | |
| "loss": 0.3511, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 2.642323821213051, | |
| "grad_norm": 1.1293285956281658, | |
| "learning_rate": 2.654143311619452e-06, | |
| "loss": 0.3507, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 2.6464150557430703, | |
| "grad_norm": 1.3110950677273854, | |
| "learning_rate": 2.623844871989093e-06, | |
| "loss": 0.3433, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 2.65050629027309, | |
| "grad_norm": 1.4086576967977462, | |
| "learning_rate": 2.593546432358734e-06, | |
| "loss": 0.34, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 2.6545975248031093, | |
| "grad_norm": 1.34700090210642, | |
| "learning_rate": 2.563247992728375e-06, | |
| "loss": 0.349, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 2.6586887593331285, | |
| "grad_norm": 1.0282320900902353, | |
| "learning_rate": 2.5329495530980157e-06, | |
| "loss": 0.3601, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.6586887593331285, | |
| "eval_loss": 0.41547685861587524, | |
| "eval_runtime": 565.4311, | |
| "eval_samples_per_second": 5.461, | |
| "eval_steps_per_second": 0.911, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.6627799938631482, | |
| "grad_norm": 1.1250564513049757, | |
| "learning_rate": 2.502651113467657e-06, | |
| "loss": 0.3493, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 2.6668712283931675, | |
| "grad_norm": 1.1544902670015906, | |
| "learning_rate": 2.4723526738372976e-06, | |
| "loss": 0.356, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 2.670962462923187, | |
| "grad_norm": 1.6062647614114969, | |
| "learning_rate": 2.4420542342069387e-06, | |
| "loss": 0.3537, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 2.6750536974532064, | |
| "grad_norm": 1.4055380071214372, | |
| "learning_rate": 2.4117557945765794e-06, | |
| "loss": 0.3589, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 2.6791449319832257, | |
| "grad_norm": 1.0767099444486827, | |
| "learning_rate": 2.3814573549462206e-06, | |
| "loss": 0.3564, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 2.6832361665132454, | |
| "grad_norm": 1.0775655234852821, | |
| "learning_rate": 2.3511589153158613e-06, | |
| "loss": 0.3508, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 2.687327401043265, | |
| "grad_norm": 1.3945094308830326, | |
| "learning_rate": 2.3208604756855025e-06, | |
| "loss": 0.3462, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 2.6914186355732843, | |
| "grad_norm": 1.4325900380219518, | |
| "learning_rate": 2.290562036055143e-06, | |
| "loss": 0.34, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 2.6955098701033036, | |
| "grad_norm": 1.0603337925865337, | |
| "learning_rate": 2.2602635964247843e-06, | |
| "loss": 0.341, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 2.6996011046333233, | |
| "grad_norm": 1.1150418643447688, | |
| "learning_rate": 2.229965156794425e-06, | |
| "loss": 0.3427, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 2.7036923391633425, | |
| "grad_norm": 1.5841244349700083, | |
| "learning_rate": 2.199666717164066e-06, | |
| "loss": 0.3446, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 2.707783573693362, | |
| "grad_norm": 1.1978695972855773, | |
| "learning_rate": 2.1693682775337074e-06, | |
| "loss": 0.3447, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 2.7118748082233815, | |
| "grad_norm": 1.0752732100706306, | |
| "learning_rate": 2.139069837903348e-06, | |
| "loss": 0.3338, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 2.7159660427534007, | |
| "grad_norm": 1.2105228186701131, | |
| "learning_rate": 2.1087713982729892e-06, | |
| "loss": 0.3352, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 2.7200572772834204, | |
| "grad_norm": 1.1393844873051664, | |
| "learning_rate": 2.07847295864263e-06, | |
| "loss": 0.3531, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 2.7241485118134396, | |
| "grad_norm": 1.0238817342799573, | |
| "learning_rate": 2.048174519012271e-06, | |
| "loss": 0.3528, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 2.7282397463434593, | |
| "grad_norm": 1.1816844502503707, | |
| "learning_rate": 2.017876079381912e-06, | |
| "loss": 0.3512, | |
| "step": 6670 | |
| }, | |
| { | |
| "epoch": 2.7323309808734786, | |
| "grad_norm": 1.305926419960111, | |
| "learning_rate": 1.987577639751553e-06, | |
| "loss": 0.3442, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 2.736422215403498, | |
| "grad_norm": 1.4261243840727893, | |
| "learning_rate": 1.957279200121194e-06, | |
| "loss": 0.3455, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 2.7405134499335175, | |
| "grad_norm": 1.6388906420384781, | |
| "learning_rate": 1.926980760490835e-06, | |
| "loss": 0.3535, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 2.744604684463537, | |
| "grad_norm": 1.5426677425739872, | |
| "learning_rate": 1.896682320860476e-06, | |
| "loss": 0.3475, | |
| "step": 6710 | |
| }, | |
| { | |
| "epoch": 2.7486959189935565, | |
| "grad_norm": 1.1578285273120823, | |
| "learning_rate": 1.866383881230117e-06, | |
| "loss": 0.3434, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 2.7527871535235757, | |
| "grad_norm": 1.0960855524012971, | |
| "learning_rate": 1.8360854415997579e-06, | |
| "loss": 0.3578, | |
| "step": 6730 | |
| }, | |
| { | |
| "epoch": 2.756878388053595, | |
| "grad_norm": 1.1655103795481652, | |
| "learning_rate": 1.8057870019693988e-06, | |
| "loss": 0.3566, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 2.7609696225836147, | |
| "grad_norm": 1.4270524025940055, | |
| "learning_rate": 1.7754885623390398e-06, | |
| "loss": 0.3517, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 2.765060857113634, | |
| "grad_norm": 1.3348787554214543, | |
| "learning_rate": 1.7451901227086807e-06, | |
| "loss": 0.3518, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 2.7691520916436536, | |
| "grad_norm": 1.0118054266554564, | |
| "learning_rate": 1.7148916830783216e-06, | |
| "loss": 0.3517, | |
| "step": 6770 | |
| }, | |
| { | |
| "epoch": 2.773243326173673, | |
| "grad_norm": 1.5502107988903457, | |
| "learning_rate": 1.6845932434479626e-06, | |
| "loss": 0.3402, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 2.777334560703692, | |
| "grad_norm": 1.061216156435797, | |
| "learning_rate": 1.6542948038176035e-06, | |
| "loss": 0.3521, | |
| "step": 6790 | |
| }, | |
| { | |
| "epoch": 2.781425795233712, | |
| "grad_norm": 1.5424114671353548, | |
| "learning_rate": 1.6239963641872444e-06, | |
| "loss": 0.3456, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 2.785517029763731, | |
| "grad_norm": 1.2155670661756302, | |
| "learning_rate": 1.5936979245568854e-06, | |
| "loss": 0.3444, | |
| "step": 6810 | |
| }, | |
| { | |
| "epoch": 2.7896082642937508, | |
| "grad_norm": 1.2991785598636614, | |
| "learning_rate": 1.5633994849265263e-06, | |
| "loss": 0.3453, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 2.79369949882377, | |
| "grad_norm": 1.5117541536726462, | |
| "learning_rate": 1.5331010452961673e-06, | |
| "loss": 0.3426, | |
| "step": 6830 | |
| }, | |
| { | |
| "epoch": 2.7977907333537892, | |
| "grad_norm": 1.397208477585223, | |
| "learning_rate": 1.5028026056658082e-06, | |
| "loss": 0.3505, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 2.801881967883809, | |
| "grad_norm": 1.2017192873007096, | |
| "learning_rate": 1.4725041660354491e-06, | |
| "loss": 0.3341, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 2.8059732024138286, | |
| "grad_norm": 1.3674252863596916, | |
| "learning_rate": 1.4422057264050903e-06, | |
| "loss": 0.3578, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 2.810064436943848, | |
| "grad_norm": 1.2963144848882437, | |
| "learning_rate": 1.4119072867747312e-06, | |
| "loss": 0.3448, | |
| "step": 6870 | |
| }, | |
| { | |
| "epoch": 2.814155671473867, | |
| "grad_norm": 1.3472526890767895, | |
| "learning_rate": 1.3816088471443724e-06, | |
| "loss": 0.3545, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 2.818246906003887, | |
| "grad_norm": 1.4300718931297243, | |
| "learning_rate": 1.3513104075140133e-06, | |
| "loss": 0.3458, | |
| "step": 6890 | |
| }, | |
| { | |
| "epoch": 2.822338140533906, | |
| "grad_norm": 1.1374915010677178, | |
| "learning_rate": 1.3210119678836542e-06, | |
| "loss": 0.3452, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.8264293750639258, | |
| "grad_norm": 1.3186888057848343, | |
| "learning_rate": 1.2907135282532952e-06, | |
| "loss": 0.3582, | |
| "step": 6910 | |
| }, | |
| { | |
| "epoch": 2.830520609593945, | |
| "grad_norm": 1.4076831798674376, | |
| "learning_rate": 1.2604150886229361e-06, | |
| "loss": 0.3473, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 2.8346118441239643, | |
| "grad_norm": 1.236226000169149, | |
| "learning_rate": 1.230116648992577e-06, | |
| "loss": 0.3508, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 2.838703078653984, | |
| "grad_norm": 1.1598603618467764, | |
| "learning_rate": 1.199818209362218e-06, | |
| "loss": 0.3135, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 2.842794313184003, | |
| "grad_norm": 1.4470943796781588, | |
| "learning_rate": 1.169519769731859e-06, | |
| "loss": 0.3369, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 2.846885547714023, | |
| "grad_norm": 1.4514521872416606, | |
| "learning_rate": 1.1392213301014999e-06, | |
| "loss": 0.3652, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 2.850976782244042, | |
| "grad_norm": 1.508158438836179, | |
| "learning_rate": 1.1089228904711408e-06, | |
| "loss": 0.3424, | |
| "step": 6970 | |
| }, | |
| { | |
| "epoch": 2.8550680167740614, | |
| "grad_norm": 1.2409802713756901, | |
| "learning_rate": 1.078624450840782e-06, | |
| "loss": 0.3494, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 2.859159251304081, | |
| "grad_norm": 1.277646969163353, | |
| "learning_rate": 1.0483260112104229e-06, | |
| "loss": 0.3323, | |
| "step": 6990 | |
| }, | |
| { | |
| "epoch": 2.8632504858341004, | |
| "grad_norm": 1.181273366360208, | |
| "learning_rate": 1.0180275715800638e-06, | |
| "loss": 0.3454, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.8632504858341004, | |
| "eval_loss": 0.4169977903366089, | |
| "eval_runtime": 566.1639, | |
| "eval_samples_per_second": 5.454, | |
| "eval_steps_per_second": 0.91, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.86734172036412, | |
| "grad_norm": 1.2542825969651887, | |
| "learning_rate": 9.877291319497048e-07, | |
| "loss": 0.3441, | |
| "step": 7010 | |
| }, | |
| { | |
| "epoch": 2.8714329548941393, | |
| "grad_norm": 1.2663043582399385, | |
| "learning_rate": 9.574306923193457e-07, | |
| "loss": 0.3577, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 2.8755241894241585, | |
| "grad_norm": 1.2989911359603148, | |
| "learning_rate": 9.271322526889865e-07, | |
| "loss": 0.3441, | |
| "step": 7030 | |
| }, | |
| { | |
| "epoch": 2.8796154239541782, | |
| "grad_norm": 1.4071532233625517, | |
| "learning_rate": 8.968338130586275e-07, | |
| "loss": 0.3461, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 2.8837066584841975, | |
| "grad_norm": 1.3845463524426405, | |
| "learning_rate": 8.665353734282685e-07, | |
| "loss": 0.3497, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 2.887797893014217, | |
| "grad_norm": 1.1682379379190977, | |
| "learning_rate": 8.362369337979096e-07, | |
| "loss": 0.3559, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 2.8918891275442364, | |
| "grad_norm": 1.3470954040058543, | |
| "learning_rate": 8.059384941675505e-07, | |
| "loss": 0.3429, | |
| "step": 7070 | |
| }, | |
| { | |
| "epoch": 2.8959803620742557, | |
| "grad_norm": 1.208626776638643, | |
| "learning_rate": 7.756400545371914e-07, | |
| "loss": 0.3415, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 2.9000715966042754, | |
| "grad_norm": 1.4017945161316288, | |
| "learning_rate": 7.453416149068324e-07, | |
| "loss": 0.3554, | |
| "step": 7090 | |
| }, | |
| { | |
| "epoch": 2.9041628311342946, | |
| "grad_norm": 1.183169645634326, | |
| "learning_rate": 7.150431752764733e-07, | |
| "loss": 0.3464, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.9082540656643143, | |
| "grad_norm": 1.321129525250198, | |
| "learning_rate": 6.847447356461142e-07, | |
| "loss": 0.3508, | |
| "step": 7110 | |
| }, | |
| { | |
| "epoch": 2.9123453001943336, | |
| "grad_norm": 1.4041097528620985, | |
| "learning_rate": 6.544462960157552e-07, | |
| "loss": 0.3475, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 2.916436534724353, | |
| "grad_norm": 1.2651747018935453, | |
| "learning_rate": 6.241478563853962e-07, | |
| "loss": 0.353, | |
| "step": 7130 | |
| }, | |
| { | |
| "epoch": 2.9205277692543725, | |
| "grad_norm": 1.2832363456828761, | |
| "learning_rate": 5.938494167550372e-07, | |
| "loss": 0.3416, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 2.924619003784392, | |
| "grad_norm": 1.1421739779615996, | |
| "learning_rate": 5.635509771246781e-07, | |
| "loss": 0.3472, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 2.9287102383144115, | |
| "grad_norm": 1.643848869321631, | |
| "learning_rate": 5.33252537494319e-07, | |
| "loss": 0.3405, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 2.9328014728444307, | |
| "grad_norm": 1.1839381749310496, | |
| "learning_rate": 5.029540978639601e-07, | |
| "loss": 0.3301, | |
| "step": 7170 | |
| }, | |
| { | |
| "epoch": 2.9368927073744504, | |
| "grad_norm": 1.2749563659235175, | |
| "learning_rate": 4.72655658233601e-07, | |
| "loss": 0.3344, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 2.9409839419044697, | |
| "grad_norm": 1.5435865262538777, | |
| "learning_rate": 4.4235721860324195e-07, | |
| "loss": 0.3409, | |
| "step": 7190 | |
| }, | |
| { | |
| "epoch": 2.9450751764344894, | |
| "grad_norm": 1.5137986871644649, | |
| "learning_rate": 4.120587789728829e-07, | |
| "loss": 0.3475, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.9491664109645086, | |
| "grad_norm": 1.2316496489032414, | |
| "learning_rate": 3.8176033934252394e-07, | |
| "loss": 0.3348, | |
| "step": 7210 | |
| }, | |
| { | |
| "epoch": 2.953257645494528, | |
| "grad_norm": 1.2680106473026544, | |
| "learning_rate": 3.5146189971216487e-07, | |
| "loss": 0.3335, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 2.9573488800245475, | |
| "grad_norm": 1.0733135362099837, | |
| "learning_rate": 3.211634600818058e-07, | |
| "loss": 0.347, | |
| "step": 7230 | |
| }, | |
| { | |
| "epoch": 2.961440114554567, | |
| "grad_norm": 1.5331378983904185, | |
| "learning_rate": 2.908650204514468e-07, | |
| "loss": 0.3489, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 2.9655313490845865, | |
| "grad_norm": 1.1721604841420616, | |
| "learning_rate": 2.6056658082108774e-07, | |
| "loss": 0.3407, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 2.9696225836146057, | |
| "grad_norm": 1.2850118367826986, | |
| "learning_rate": 2.302681411907287e-07, | |
| "loss": 0.3589, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 2.973713818144625, | |
| "grad_norm": 1.4413311960347719, | |
| "learning_rate": 1.9996970156036967e-07, | |
| "loss": 0.3369, | |
| "step": 7270 | |
| }, | |
| { | |
| "epoch": 2.9778050526746447, | |
| "grad_norm": 1.314019429686636, | |
| "learning_rate": 1.6967126193001063e-07, | |
| "loss": 0.3507, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 2.981896287204664, | |
| "grad_norm": 1.113303777593494, | |
| "learning_rate": 1.3937282229965157e-07, | |
| "loss": 0.3523, | |
| "step": 7290 | |
| }, | |
| { | |
| "epoch": 2.9859875217346836, | |
| "grad_norm": 1.1248866357161083, | |
| "learning_rate": 1.0907438266929254e-07, | |
| "loss": 0.3437, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.990078756264703, | |
| "grad_norm": 1.1898353136027997, | |
| "learning_rate": 7.877594303893351e-08, | |
| "loss": 0.3545, | |
| "step": 7310 | |
| }, | |
| { | |
| "epoch": 2.994169990794722, | |
| "grad_norm": 1.4708707926240419, | |
| "learning_rate": 4.8477503408574464e-08, | |
| "loss": 0.3666, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 2.998261225324742, | |
| "grad_norm": 1.3652249352362715, | |
| "learning_rate": 1.8179063778215425e-08, | |
| "loss": 0.3625, | |
| "step": 7330 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 7335, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 307331176169472.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
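
The record above matches the state file the Hugging Face `Trainer` writes next to each checkpoint (training-loss entries carry a `loss` key, evaluation entries an `eval_loss` key, and the run metadata follows the `log_history` array). Below is a minimal sketch of how one might read it back and summarize the run; the file name `trainer_state.json` and its location are assumptions for illustration, not something stated in the log itself.

```python
# Minimal sketch: load the state file shown above and summarize the run.
# Assumption: the JSON is saved locally as "trainer_state.json" (the usual
# name inside a checkpoint directory); adjust the path as needed.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Training-loss records carry "loss"; evaluation records carry "eval_loss".
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

print(f"logged training points: {len(train_log)} "
      f"(one every {state['logging_steps']} steps, "
      f"{state['max_steps']} steps total)")
print(f"final training loss at step {train_log[-1]['step']}: "
      f"{train_log[-1]['loss']}")

for e in eval_log:
    print(f"eval @ step {e['step']}: loss={e['eval_loss']:.4f}, "
          f"{e['eval_samples_per_second']:.2f} samples/s")
```

Run against the data above, this would report the training loss settling around 0.34 to 0.36 late in epoch 3 and the two evaluation losses logged at steps 6500 and 7000 (about 0.415 to 0.417), which is a quick way to sanity-check a finished run without re-opening the raw log.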