diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5277 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 7335, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004091234530019434, + "grad_norm": 5.139719573502004, + "learning_rate": 6.9791071983348295e-06, + "loss": 2.653, + "step": 10 + }, + { + "epoch": 0.008182469060038867, + "grad_norm": 3.820838784929218, + "learning_rate": 9.080027807988022e-06, + "loss": 0.4708, + "step": 20 + }, + { + "epoch": 0.0122737035900583, + "grad_norm": 2.6483227024127802, + "learning_rate": 1.030898758162737e-05, + "loss": 0.4622, + "step": 30 + }, + { + "epoch": 0.016364938120077735, + "grad_norm": 3.0325419802512723, + "learning_rate": 1.1180948417641216e-05, + "loss": 0.4572, + "step": 40 + }, + { + "epoch": 0.020456172650097165, + "grad_norm": 3.3008143753804378, + "learning_rate": 1.1857293787016462e-05, + "loss": 0.4569, + "step": 50 + }, + { + "epoch": 0.0245474071801166, + "grad_norm": 2.8131794752844383, + "learning_rate": 1.2409908191280561e-05, + "loss": 0.4378, + "step": 60 + }, + { + "epoch": 0.028638641710136033, + "grad_norm": 2.7183268247171717, + "learning_rate": 1.2877137012696984e-05, + "loss": 0.4523, + "step": 70 + }, + { + "epoch": 0.03272987624015547, + "grad_norm": 1.812836545554904, + "learning_rate": 1.3281869027294408e-05, + "loss": 0.4517, + "step": 80 + }, + { + "epoch": 0.0368211107701749, + "grad_norm": 1.5605721318709478, + "learning_rate": 1.363886796491991e-05, + "loss": 0.4396, + "step": 90 + }, + { + "epoch": 0.04091234530019433, + "grad_norm": 3.00608523793705, + "learning_rate": 1.3958214396669659e-05, + "loss": 0.4393, + "step": 100 + }, + { + "epoch": 0.04500357983021377, + "grad_norm": 2.004900329434675, + "learning_rate": 1.4247098383615834e-05, + "loss": 0.4287, + "step": 110 + }, + { + "epoch": 0.0490948143602332, + "grad_norm": 3.1505903529524826, + "learning_rate": 1.4510828800933757e-05, + "loss": 0.4461, + "step": 120 + }, + { + "epoch": 0.053186048890252635, + "grad_norm": 1.4909337148061823, + "learning_rate": 1.47534372669567e-05, + "loss": 0.4475, + "step": 130 + }, + { + "epoch": 0.057277283420272065, + "grad_norm": 1.910562126750776, + "learning_rate": 1.4978057622350176e-05, + "loss": 0.456, + "step": 140 + }, + { + "epoch": 0.0613685179502915, + "grad_norm": 1.071878138097679, + "learning_rate": 1.5187174170309003e-05, + "loss": 0.4444, + "step": 150 + }, + { + "epoch": 0.06545975248031094, + "grad_norm": 1.182162791937331, + "learning_rate": 1.5382789636947598e-05, + "loss": 0.4544, + "step": 160 + }, + { + "epoch": 0.06955098701033037, + "grad_norm": 2.804609224620368, + "learning_rate": 1.5566542122709266e-05, + "loss": 0.4431, + "step": 170 + }, + { + "epoch": 0.0736422215403498, + "grad_norm": 2.30126890056861, + "learning_rate": 1.5739788574573106e-05, + "loss": 0.4304, + "step": 180 + }, + { + "epoch": 0.07773345607036923, + "grad_norm": 1.8322154114594529, + "learning_rate": 1.59036656596413e-05, + "loss": 0.4488, + "step": 190 + }, + { + "epoch": 0.08182469060038866, + "grad_norm": 1.6909197474415698, + "learning_rate": 1.605913500632285e-05, + "loss": 0.4244, + "step": 200 + }, + { + "epoch": 0.0859159251304081, + "grad_norm": 1.9267153990137886, + "learning_rate": 1.6207017395989525e-05, + "loss": 0.4452, + "step": 210 + }, + { + "epoch": 0.09000715966042754, + "grad_norm": 1.9513020149733689, + "learning_rate": 1.6348018993269024e-05, + "loss": 0.441, + "step": 220 + }, + { + "epoch": 0.09409839419044697, + "grad_norm": 1.6992552537559622, + "learning_rate": 1.648275174085812e-05, + "loss": 0.4142, + "step": 230 + }, + { + "epoch": 0.0981896287204664, + "grad_norm": 2.506670138718048, + "learning_rate": 1.661174941058695e-05, + "loss": 0.4309, + "step": 240 + }, + { + "epoch": 0.10228086325048584, + "grad_norm": 1.1649232947269594, + "learning_rate": 1.6735480375698097e-05, + "loss": 0.4468, + "step": 250 + }, + { + "epoch": 0.10637209778050527, + "grad_norm": 2.2776528295818723, + "learning_rate": 1.6854357876609896e-05, + "loss": 0.4263, + "step": 260 + }, + { + "epoch": 0.1104633323105247, + "grad_norm": 1.09704187408726, + "learning_rate": 1.6968748348212453e-05, + "loss": 0.4199, + "step": 270 + }, + { + "epoch": 0.11455456684054413, + "grad_norm": 0.8342510727420896, + "learning_rate": 1.7078978232003368e-05, + "loss": 0.4313, + "step": 280 + }, + { + "epoch": 0.11864580137056356, + "grad_norm": 0.8821458905420587, + "learning_rate": 1.7185339592301872e-05, + "loss": 0.4259, + "step": 290 + }, + { + "epoch": 0.122737035900583, + "grad_norm": 1.7825705984955846, + "learning_rate": 1.7288094779962197e-05, + "loss": 0.4261, + "step": 300 + }, + { + "epoch": 0.12682827043060244, + "grad_norm": 1.904238409153354, + "learning_rate": 1.7387480331094423e-05, + "loss": 0.4259, + "step": 310 + }, + { + "epoch": 0.13091950496062188, + "grad_norm": 1.725649730023905, + "learning_rate": 1.7483710246600792e-05, + "loss": 0.4314, + "step": 320 + }, + { + "epoch": 0.1350107394906413, + "grad_norm": 1.4341207306128956, + "learning_rate": 1.757697876690837e-05, + "loss": 0.4419, + "step": 330 + }, + { + "epoch": 0.13910197402066074, + "grad_norm": 1.2643467480950796, + "learning_rate": 1.766746273236246e-05, + "loss": 0.4312, + "step": 340 + }, + { + "epoch": 0.14319320855068016, + "grad_norm": 1.563315600788671, + "learning_rate": 1.7755323601378616e-05, + "loss": 0.4211, + "step": 350 + }, + { + "epoch": 0.1472844430806996, + "grad_norm": 1.3601337663266007, + "learning_rate": 1.7840709184226296e-05, + "loss": 0.4187, + "step": 360 + }, + { + "epoch": 0.15137567761071904, + "grad_norm": 1.230870033216971, + "learning_rate": 1.792375513921188e-05, + "loss": 0.4317, + "step": 370 + }, + { + "epoch": 0.15546691214073846, + "grad_norm": 0.935994666832451, + "learning_rate": 1.8004586269294493e-05, + "loss": 0.4347, + "step": 380 + }, + { + "epoch": 0.1595581466707579, + "grad_norm": 1.3347135910248187, + "learning_rate": 1.8083317650249243e-05, + "loss": 0.4323, + "step": 390 + }, + { + "epoch": 0.16364938120077732, + "grad_norm": 1.929506823385391, + "learning_rate": 1.8160055615976043e-05, + "loss": 0.4346, + "step": 400 + }, + { + "epoch": 0.16774061573079677, + "grad_norm": 0.9726505093912414, + "learning_rate": 1.8234898622125742e-05, + "loss": 0.416, + "step": 410 + }, + { + "epoch": 0.1718318502608162, + "grad_norm": 0.9397510565012722, + "learning_rate": 1.8307938005642715e-05, + "loss": 0.4182, + "step": 420 + }, + { + "epoch": 0.17592308479083563, + "grad_norm": 1.2862427800883758, + "learning_rate": 1.8379258654923192e-05, + "loss": 0.4165, + "step": 430 + }, + { + "epoch": 0.18001431932085507, + "grad_norm": 1.455717102975378, + "learning_rate": 1.8448939602922218e-05, + "loss": 0.4287, + "step": 440 + }, + { + "epoch": 0.18410555385087451, + "grad_norm": 1.5001908608590255, + "learning_rate": 1.8517054553601544e-05, + "loss": 0.4303, + "step": 450 + }, + { + "epoch": 0.18819678838089393, + "grad_norm": 1.6709222890878312, + "learning_rate": 1.8583672350511313e-05, + "loss": 0.4328, + "step": 460 + }, + { + "epoch": 0.19228802291091338, + "grad_norm": 1.3805672401271125, + "learning_rate": 1.864885739497424e-05, + "loss": 0.4179, + "step": 470 + }, + { + "epoch": 0.1963792574409328, + "grad_norm": 1.4202659814214615, + "learning_rate": 1.8712670020240143e-05, + "loss": 0.4129, + "step": 480 + }, + { + "epoch": 0.20047049197095224, + "grad_norm": 1.3802506203673055, + "learning_rate": 1.8775166827059134e-05, + "loss": 0.4203, + "step": 490 + }, + { + "epoch": 0.20456172650097168, + "grad_norm": 1.4249688406884504, + "learning_rate": 1.883640098535129e-05, + "loss": 0.4246, + "step": 500 + }, + { + "epoch": 0.20456172650097168, + "eval_loss": 0.4226590692996979, + "eval_runtime": 565.8176, + "eval_samples_per_second": 5.458, + "eval_steps_per_second": 0.91, + "step": 500 + }, + { + "epoch": 0.2086529610309911, + "grad_norm": 0.8798533540054851, + "learning_rate": 1.8896422506001807e-05, + "loss": 0.4267, + "step": 510 + }, + { + "epoch": 0.21274419556101054, + "grad_norm": 1.279435399555335, + "learning_rate": 1.895527848626309e-05, + "loss": 0.4103, + "step": 520 + }, + { + "epoch": 0.21683543009102996, + "grad_norm": 1.2983197150849732, + "learning_rate": 1.901301333178074e-05, + "loss": 0.416, + "step": 530 + }, + { + "epoch": 0.2209266646210494, + "grad_norm": 1.3784001180176515, + "learning_rate": 1.9069668957865647e-05, + "loss": 0.4249, + "step": 540 + }, + { + "epoch": 0.22501789915106885, + "grad_norm": 1.4315659756421384, + "learning_rate": 1.9125284972297466e-05, + "loss": 0.4244, + "step": 550 + }, + { + "epoch": 0.22910913368108826, + "grad_norm": 1.1438536096514877, + "learning_rate": 1.9179898841656562e-05, + "loss": 0.4175, + "step": 560 + }, + { + "epoch": 0.2332003682111077, + "grad_norm": 1.5784295688235042, + "learning_rate": 1.923354604293384e-05, + "loss": 0.4338, + "step": 570 + }, + { + "epoch": 0.23729160274112712, + "grad_norm": 1.1792683199314111, + "learning_rate": 1.9286260201955066e-05, + "loss": 0.4235, + "step": 580 + }, + { + "epoch": 0.24138283727114657, + "grad_norm": 0.9242432483061238, + "learning_rate": 1.9338073219972227e-05, + "loss": 0.4206, + "step": 590 + }, + { + "epoch": 0.245474071801166, + "grad_norm": 1.4278436783956994, + "learning_rate": 1.938901538961539e-05, + "loss": 0.4288, + "step": 600 + }, + { + "epoch": 0.24956530633118543, + "grad_norm": 1.7892184247748346, + "learning_rate": 1.9439115501260403e-05, + "loss": 0.4314, + "step": 610 + }, + { + "epoch": 0.25365654086120487, + "grad_norm": 1.3459176439410694, + "learning_rate": 1.9488400940747617e-05, + "loss": 0.4252, + "step": 620 + }, + { + "epoch": 0.2577477753912243, + "grad_norm": 1.0979452662978122, + "learning_rate": 1.9536897779282066e-05, + "loss": 0.4159, + "step": 630 + }, + { + "epoch": 0.26183900992124376, + "grad_norm": 1.1558624628355403, + "learning_rate": 1.958463085625399e-05, + "loss": 0.425, + "step": 640 + }, + { + "epoch": 0.2659302444512632, + "grad_norm": 0.9606366329728439, + "learning_rate": 1.9631623855638338e-05, + "loss": 0.4084, + "step": 650 + }, + { + "epoch": 0.2700214789812826, + "grad_norm": 1.0597290473469163, + "learning_rate": 1.9677899376561565e-05, + "loss": 0.4099, + "step": 660 + }, + { + "epoch": 0.274112713511302, + "grad_norm": 1.0819100913671185, + "learning_rate": 1.9723478998562017e-05, + "loss": 0.4151, + "step": 670 + }, + { + "epoch": 0.2782039480413215, + "grad_norm": 1.4892435767285719, + "learning_rate": 1.976838334201565e-05, + "loss": 0.4164, + "step": 680 + }, + { + "epoch": 0.2822951825713409, + "grad_norm": 1.1025628764173787, + "learning_rate": 1.981263212415066e-05, + "loss": 0.4193, + "step": 690 + }, + { + "epoch": 0.2863864171013603, + "grad_norm": 1.0753475991516983, + "learning_rate": 1.985624421103181e-05, + "loss": 0.4108, + "step": 700 + }, + { + "epoch": 0.2904776516313798, + "grad_norm": 0.905939605392397, + "learning_rate": 1.9899237665857572e-05, + "loss": 0.4089, + "step": 710 + }, + { + "epoch": 0.2945688861613992, + "grad_norm": 1.1713616964839249, + "learning_rate": 1.994162979387949e-05, + "loss": 0.4201, + "step": 720 + }, + { + "epoch": 0.2986601206914186, + "grad_norm": 0.9299543253519471, + "learning_rate": 1.998343718422334e-05, + "loss": 0.4139, + "step": 730 + }, + { + "epoch": 0.3027513552214381, + "grad_norm": 1.516913403786176, + "learning_rate": 1.998485078018482e-05, + "loss": 0.4067, + "step": 740 + }, + { + "epoch": 0.3068425897514575, + "grad_norm": 0.7991131845266628, + "learning_rate": 1.9954552340554464e-05, + "loss": 0.4123, + "step": 750 + }, + { + "epoch": 0.3109338242814769, + "grad_norm": 1.3657331223383264, + "learning_rate": 1.9924253900924104e-05, + "loss": 0.4108, + "step": 760 + }, + { + "epoch": 0.3150250588114964, + "grad_norm": 0.8920875301003005, + "learning_rate": 1.9893955461293747e-05, + "loss": 0.4191, + "step": 770 + }, + { + "epoch": 0.3191162933415158, + "grad_norm": 0.9744817931110429, + "learning_rate": 1.9863657021663386e-05, + "loss": 0.4053, + "step": 780 + }, + { + "epoch": 0.3232075278715352, + "grad_norm": 1.1889083717413498, + "learning_rate": 1.9833358582033025e-05, + "loss": 0.4629, + "step": 790 + }, + { + "epoch": 0.32729876240155464, + "grad_norm": 2.2559102163995965, + "learning_rate": 1.9803060142402668e-05, + "loss": 0.7059, + "step": 800 + }, + { + "epoch": 0.3313899969315741, + "grad_norm": 0.7012331362796347, + "learning_rate": 1.977276170277231e-05, + "loss": 0.418, + "step": 810 + }, + { + "epoch": 0.33548123146159353, + "grad_norm": 1.1193457098820139, + "learning_rate": 1.9742463263141947e-05, + "loss": 0.4022, + "step": 820 + }, + { + "epoch": 0.33957246599161295, + "grad_norm": 1.0787812784251993, + "learning_rate": 1.971216482351159e-05, + "loss": 0.4147, + "step": 830 + }, + { + "epoch": 0.3436637005216324, + "grad_norm": 1.2144732211866651, + "learning_rate": 1.9681866383881233e-05, + "loss": 0.4118, + "step": 840 + }, + { + "epoch": 0.34775493505165184, + "grad_norm": 1.2104042207994736, + "learning_rate": 1.9651567944250872e-05, + "loss": 0.4081, + "step": 850 + }, + { + "epoch": 0.35184616958167125, + "grad_norm": 1.2049689205461667, + "learning_rate": 1.9621269504620512e-05, + "loss": 0.4073, + "step": 860 + }, + { + "epoch": 0.3559374041116907, + "grad_norm": 1.230689211204093, + "learning_rate": 1.9590971064990155e-05, + "loss": 0.4104, + "step": 870 + }, + { + "epoch": 0.36002863864171014, + "grad_norm": 0.8629950816912576, + "learning_rate": 1.9560672625359794e-05, + "loss": 0.4147, + "step": 880 + }, + { + "epoch": 0.36411987317172956, + "grad_norm": 1.1029857217191437, + "learning_rate": 1.9530374185729437e-05, + "loss": 0.4221, + "step": 890 + }, + { + "epoch": 0.36821110770174903, + "grad_norm": 1.1513671079248653, + "learning_rate": 1.9500075746099076e-05, + "loss": 0.4143, + "step": 900 + }, + { + "epoch": 0.37230234223176845, + "grad_norm": 1.3187073752135716, + "learning_rate": 1.946977730646872e-05, + "loss": 0.4193, + "step": 910 + }, + { + "epoch": 0.37639357676178786, + "grad_norm": 0.8494318204927609, + "learning_rate": 1.943947886683836e-05, + "loss": 0.4103, + "step": 920 + }, + { + "epoch": 0.3804848112918073, + "grad_norm": 1.2378655056760652, + "learning_rate": 1.9409180427208e-05, + "loss": 0.413, + "step": 930 + }, + { + "epoch": 0.38457604582182675, + "grad_norm": 0.8742974569121568, + "learning_rate": 1.937888198757764e-05, + "loss": 0.4178, + "step": 940 + }, + { + "epoch": 0.38866728035184617, + "grad_norm": 0.6659680293817604, + "learning_rate": 1.934858354794728e-05, + "loss": 0.3992, + "step": 950 + }, + { + "epoch": 0.3927585148818656, + "grad_norm": 1.0085028880533002, + "learning_rate": 1.9318285108316923e-05, + "loss": 0.3957, + "step": 960 + }, + { + "epoch": 0.39684974941188506, + "grad_norm": 0.7913168849253336, + "learning_rate": 1.9287986668686566e-05, + "loss": 0.4111, + "step": 970 + }, + { + "epoch": 0.40094098394190447, + "grad_norm": 0.9593948086372496, + "learning_rate": 1.9257688229056206e-05, + "loss": 0.4097, + "step": 980 + }, + { + "epoch": 0.4050322184719239, + "grad_norm": 1.1075304924315132, + "learning_rate": 1.9227389789425845e-05, + "loss": 0.4029, + "step": 990 + }, + { + "epoch": 0.40912345300194336, + "grad_norm": 1.3189534557378433, + "learning_rate": 1.9197091349795488e-05, + "loss": 0.4114, + "step": 1000 + }, + { + "epoch": 0.40912345300194336, + "eval_loss": 0.4164506494998932, + "eval_runtime": 567.741, + "eval_samples_per_second": 5.439, + "eval_steps_per_second": 0.907, + "step": 1000 + }, + { + "epoch": 0.4132146875319628, + "grad_norm": 1.3479557524857961, + "learning_rate": 1.9166792910165127e-05, + "loss": 0.4034, + "step": 1010 + }, + { + "epoch": 0.4173059220619822, + "grad_norm": 0.7981045727445666, + "learning_rate": 1.913649447053477e-05, + "loss": 0.3984, + "step": 1020 + }, + { + "epoch": 0.4213971565920016, + "grad_norm": 1.3167949575672933, + "learning_rate": 1.910619603090441e-05, + "loss": 0.418, + "step": 1030 + }, + { + "epoch": 0.4254883911220211, + "grad_norm": 0.875082404450118, + "learning_rate": 1.907589759127405e-05, + "loss": 0.4165, + "step": 1040 + }, + { + "epoch": 0.4295796256520405, + "grad_norm": 0.7010201186949262, + "learning_rate": 1.9045599151643692e-05, + "loss": 0.4043, + "step": 1050 + }, + { + "epoch": 0.4336708601820599, + "grad_norm": 0.8106312112876102, + "learning_rate": 1.9015300712013335e-05, + "loss": 0.3946, + "step": 1060 + }, + { + "epoch": 0.4377620947120794, + "grad_norm": 1.2069253813775411, + "learning_rate": 1.8985002272382974e-05, + "loss": 0.4142, + "step": 1070 + }, + { + "epoch": 0.4418533292420988, + "grad_norm": 1.4074507315896057, + "learning_rate": 1.8954703832752614e-05, + "loss": 0.4216, + "step": 1080 + }, + { + "epoch": 0.4459445637721182, + "grad_norm": 1.0142221478478237, + "learning_rate": 1.8924405393122257e-05, + "loss": 0.399, + "step": 1090 + }, + { + "epoch": 0.4500357983021377, + "grad_norm": 1.0549341445926665, + "learning_rate": 1.8894106953491896e-05, + "loss": 0.4036, + "step": 1100 + }, + { + "epoch": 0.4541270328321571, + "grad_norm": 0.965773892068146, + "learning_rate": 1.8863808513861535e-05, + "loss": 0.4064, + "step": 1110 + }, + { + "epoch": 0.4582182673621765, + "grad_norm": 0.9064204193354054, + "learning_rate": 1.8833510074231178e-05, + "loss": 0.3909, + "step": 1120 + }, + { + "epoch": 0.462309501892196, + "grad_norm": 0.9674069704546393, + "learning_rate": 1.880321163460082e-05, + "loss": 0.4016, + "step": 1130 + }, + { + "epoch": 0.4664007364222154, + "grad_norm": 0.7415264356489151, + "learning_rate": 1.877291319497046e-05, + "loss": 0.418, + "step": 1140 + }, + { + "epoch": 0.47049197095223483, + "grad_norm": 0.610984069094811, + "learning_rate": 1.87426147553401e-05, + "loss": 0.4163, + "step": 1150 + }, + { + "epoch": 0.47458320548225424, + "grad_norm": 0.8399219229428194, + "learning_rate": 1.8712316315709743e-05, + "loss": 0.3986, + "step": 1160 + }, + { + "epoch": 0.4786744400122737, + "grad_norm": 1.157757088776256, + "learning_rate": 1.8682017876079382e-05, + "loss": 0.4108, + "step": 1170 + }, + { + "epoch": 0.48276567454229313, + "grad_norm": 0.967241129437065, + "learning_rate": 1.8651719436449025e-05, + "loss": 0.4217, + "step": 1180 + }, + { + "epoch": 0.48685690907231255, + "grad_norm": 2.091651865632623, + "learning_rate": 1.8621420996818665e-05, + "loss": 0.4091, + "step": 1190 + }, + { + "epoch": 0.490948143602332, + "grad_norm": 0.6567666937577481, + "learning_rate": 1.8591122557188307e-05, + "loss": 0.4206, + "step": 1200 + }, + { + "epoch": 0.49503937813235144, + "grad_norm": 1.3838725189967105, + "learning_rate": 1.8560824117557947e-05, + "loss": 0.3986, + "step": 1210 + }, + { + "epoch": 0.49913061266237085, + "grad_norm": 0.7561244143868575, + "learning_rate": 1.853052567792759e-05, + "loss": 0.4091, + "step": 1220 + }, + { + "epoch": 0.5032218471923903, + "grad_norm": 1.1843346363848066, + "learning_rate": 1.850022723829723e-05, + "loss": 0.4009, + "step": 1230 + }, + { + "epoch": 0.5073130817224097, + "grad_norm": 0.9612775939058839, + "learning_rate": 1.846992879866687e-05, + "loss": 0.4312, + "step": 1240 + }, + { + "epoch": 0.5114043162524292, + "grad_norm": 0.9653095988733074, + "learning_rate": 1.843963035903651e-05, + "loss": 0.411, + "step": 1250 + }, + { + "epoch": 0.5154955507824486, + "grad_norm": 0.9538148396031432, + "learning_rate": 1.840933191940615e-05, + "loss": 0.4189, + "step": 1260 + }, + { + "epoch": 0.519586785312468, + "grad_norm": 0.831951461503928, + "learning_rate": 1.8379033479775794e-05, + "loss": 0.3866, + "step": 1270 + }, + { + "epoch": 0.5236780198424875, + "grad_norm": 0.8415502915298407, + "learning_rate": 1.8348735040145433e-05, + "loss": 0.43, + "step": 1280 + }, + { + "epoch": 0.5277692543725069, + "grad_norm": 0.8927957692335573, + "learning_rate": 1.8318436600515076e-05, + "loss": 0.4143, + "step": 1290 + }, + { + "epoch": 0.5318604889025264, + "grad_norm": 0.9951847425278025, + "learning_rate": 1.8288138160884716e-05, + "loss": 0.4082, + "step": 1300 + }, + { + "epoch": 0.5359517234325458, + "grad_norm": 0.9794640728750337, + "learning_rate": 1.825783972125436e-05, + "loss": 0.4123, + "step": 1310 + }, + { + "epoch": 0.5400429579625652, + "grad_norm": 1.2936619002334293, + "learning_rate": 1.8227541281623998e-05, + "loss": 0.4063, + "step": 1320 + }, + { + "epoch": 0.5441341924925847, + "grad_norm": 0.9539785376828354, + "learning_rate": 1.8197242841993637e-05, + "loss": 0.4001, + "step": 1330 + }, + { + "epoch": 0.548225427022604, + "grad_norm": 0.7647319210581001, + "learning_rate": 1.816694440236328e-05, + "loss": 0.4067, + "step": 1340 + }, + { + "epoch": 0.5523166615526235, + "grad_norm": 0.7391526930684824, + "learning_rate": 1.8136645962732923e-05, + "loss": 0.4135, + "step": 1350 + }, + { + "epoch": 0.556407896082643, + "grad_norm": 1.2278179621110827, + "learning_rate": 1.8106347523102562e-05, + "loss": 0.3955, + "step": 1360 + }, + { + "epoch": 0.5604991306126623, + "grad_norm": 0.9263102201334721, + "learning_rate": 1.8076049083472202e-05, + "loss": 0.4053, + "step": 1370 + }, + { + "epoch": 0.5645903651426818, + "grad_norm": 1.270439289582048, + "learning_rate": 1.8045750643841845e-05, + "loss": 0.4065, + "step": 1380 + }, + { + "epoch": 0.5686815996727013, + "grad_norm": 0.8192132223098715, + "learning_rate": 1.8015452204211484e-05, + "loss": 0.4085, + "step": 1390 + }, + { + "epoch": 0.5727728342027206, + "grad_norm": 0.6191336734377946, + "learning_rate": 1.7985153764581124e-05, + "loss": 0.402, + "step": 1400 + }, + { + "epoch": 0.5768640687327401, + "grad_norm": 0.7122309486400348, + "learning_rate": 1.7954855324950766e-05, + "loss": 0.3985, + "step": 1410 + }, + { + "epoch": 0.5809553032627596, + "grad_norm": 1.0903768849480002, + "learning_rate": 1.792455688532041e-05, + "loss": 0.4142, + "step": 1420 + }, + { + "epoch": 0.5850465377927789, + "grad_norm": 1.1699038018329126, + "learning_rate": 1.789425844569005e-05, + "loss": 0.4055, + "step": 1430 + }, + { + "epoch": 0.5891377723227984, + "grad_norm": 0.7701176154609344, + "learning_rate": 1.7863960006059688e-05, + "loss": 0.403, + "step": 1440 + }, + { + "epoch": 0.5932290068528179, + "grad_norm": 0.9632444193994341, + "learning_rate": 1.783366156642933e-05, + "loss": 0.3897, + "step": 1450 + }, + { + "epoch": 0.5973202413828372, + "grad_norm": 0.9564215846627407, + "learning_rate": 1.780336312679897e-05, + "loss": 0.4117, + "step": 1460 + }, + { + "epoch": 0.6014114759128567, + "grad_norm": 0.6899350345691984, + "learning_rate": 1.7773064687168613e-05, + "loss": 0.4137, + "step": 1470 + }, + { + "epoch": 0.6055027104428762, + "grad_norm": 0.9095599036748141, + "learning_rate": 1.7742766247538253e-05, + "loss": 0.415, + "step": 1480 + }, + { + "epoch": 0.6095939449728955, + "grad_norm": 0.9109255603659862, + "learning_rate": 1.7712467807907892e-05, + "loss": 0.407, + "step": 1490 + }, + { + "epoch": 0.613685179502915, + "grad_norm": 1.2266191388411232, + "learning_rate": 1.7682169368277535e-05, + "loss": 0.4055, + "step": 1500 + }, + { + "epoch": 0.613685179502915, + "eval_loss": 0.4121568500995636, + "eval_runtime": 568.64, + "eval_samples_per_second": 5.431, + "eval_steps_per_second": 0.906, + "step": 1500 + }, + { + "epoch": 0.6177764140329345, + "grad_norm": 2.106549959554808, + "learning_rate": 1.7651870928647178e-05, + "loss": 0.397, + "step": 1510 + }, + { + "epoch": 0.6218676485629538, + "grad_norm": 0.5026183280127312, + "learning_rate": 1.7621572489016817e-05, + "loss": 0.3986, + "step": 1520 + }, + { + "epoch": 0.6259588830929733, + "grad_norm": 1.141004596209656, + "learning_rate": 1.7591274049386457e-05, + "loss": 0.4074, + "step": 1530 + }, + { + "epoch": 0.6300501176229928, + "grad_norm": 0.8191222686339406, + "learning_rate": 1.75609756097561e-05, + "loss": 0.4002, + "step": 1540 + }, + { + "epoch": 0.6341413521530121, + "grad_norm": 0.6764982914151534, + "learning_rate": 1.753067717012574e-05, + "loss": 0.4107, + "step": 1550 + }, + { + "epoch": 0.6382325866830316, + "grad_norm": 1.3684032943814484, + "learning_rate": 1.750037873049538e-05, + "loss": 0.4036, + "step": 1560 + }, + { + "epoch": 0.642323821213051, + "grad_norm": 0.8576599206196178, + "learning_rate": 1.747008029086502e-05, + "loss": 0.4067, + "step": 1570 + }, + { + "epoch": 0.6464150557430705, + "grad_norm": 1.199412066961356, + "learning_rate": 1.7439781851234664e-05, + "loss": 0.4069, + "step": 1580 + }, + { + "epoch": 0.6505062902730899, + "grad_norm": 0.9099518471943355, + "learning_rate": 1.7409483411604304e-05, + "loss": 0.3995, + "step": 1590 + }, + { + "epoch": 0.6545975248031093, + "grad_norm": 1.1070377119289831, + "learning_rate": 1.7379184971973947e-05, + "loss": 0.4225, + "step": 1600 + }, + { + "epoch": 0.6586887593331288, + "grad_norm": 0.8753865952879469, + "learning_rate": 1.7348886532343586e-05, + "loss": 0.4032, + "step": 1610 + }, + { + "epoch": 0.6627799938631482, + "grad_norm": 0.7973497440286291, + "learning_rate": 1.7318588092713226e-05, + "loss": 0.4207, + "step": 1620 + }, + { + "epoch": 0.6668712283931676, + "grad_norm": 0.6925989497344892, + "learning_rate": 1.728828965308287e-05, + "loss": 0.4123, + "step": 1630 + }, + { + "epoch": 0.6709624629231871, + "grad_norm": 0.9669611664338155, + "learning_rate": 1.725799121345251e-05, + "loss": 0.3916, + "step": 1640 + }, + { + "epoch": 0.6750536974532065, + "grad_norm": 1.2385150364852342, + "learning_rate": 1.7227692773822147e-05, + "loss": 0.4136, + "step": 1650 + }, + { + "epoch": 0.6791449319832259, + "grad_norm": 0.7549418298109767, + "learning_rate": 1.719739433419179e-05, + "loss": 0.4146, + "step": 1660 + }, + { + "epoch": 0.6832361665132454, + "grad_norm": 1.0424008728514387, + "learning_rate": 1.7167095894561433e-05, + "loss": 0.3971, + "step": 1670 + }, + { + "epoch": 0.6873274010432648, + "grad_norm": 0.7360079123740301, + "learning_rate": 1.7136797454931072e-05, + "loss": 0.3947, + "step": 1680 + }, + { + "epoch": 0.6914186355732842, + "grad_norm": 0.961447300959097, + "learning_rate": 1.7106499015300712e-05, + "loss": 0.4035, + "step": 1690 + }, + { + "epoch": 0.6955098701033037, + "grad_norm": 0.7277946304181864, + "learning_rate": 1.7076200575670355e-05, + "loss": 0.3995, + "step": 1700 + }, + { + "epoch": 0.6996011046333231, + "grad_norm": 0.7930916610357551, + "learning_rate": 1.7045902136039994e-05, + "loss": 0.4057, + "step": 1710 + }, + { + "epoch": 0.7036923391633425, + "grad_norm": 0.7442545537670275, + "learning_rate": 1.7015603696409637e-05, + "loss": 0.4042, + "step": 1720 + }, + { + "epoch": 0.707783573693362, + "grad_norm": 1.1048107524067752, + "learning_rate": 1.6985305256779276e-05, + "loss": 0.4132, + "step": 1730 + }, + { + "epoch": 0.7118748082233815, + "grad_norm": 1.1900958503859818, + "learning_rate": 1.695500681714892e-05, + "loss": 0.4166, + "step": 1740 + }, + { + "epoch": 0.7159660427534008, + "grad_norm": 0.7615970705911563, + "learning_rate": 1.692470837751856e-05, + "loss": 0.3937, + "step": 1750 + }, + { + "epoch": 0.7200572772834203, + "grad_norm": 1.0147007380179698, + "learning_rate": 1.68944099378882e-05, + "loss": 0.3918, + "step": 1760 + }, + { + "epoch": 0.7241485118134398, + "grad_norm": 0.8482189838251506, + "learning_rate": 1.686411149825784e-05, + "loss": 0.3839, + "step": 1770 + }, + { + "epoch": 0.7282397463434591, + "grad_norm": 0.711159267866409, + "learning_rate": 1.683381305862748e-05, + "loss": 0.3948, + "step": 1780 + }, + { + "epoch": 0.7323309808734786, + "grad_norm": 1.371790290922361, + "learning_rate": 1.6803514618997123e-05, + "loss": 0.3984, + "step": 1790 + }, + { + "epoch": 0.7364222154034981, + "grad_norm": 0.7270726304455154, + "learning_rate": 1.6773216179366766e-05, + "loss": 0.4058, + "step": 1800 + }, + { + "epoch": 0.7405134499335174, + "grad_norm": 0.993765933652885, + "learning_rate": 1.6742917739736406e-05, + "loss": 0.3964, + "step": 1810 + }, + { + "epoch": 0.7446046844635369, + "grad_norm": 0.7396054882960263, + "learning_rate": 1.6712619300106045e-05, + "loss": 0.3988, + "step": 1820 + }, + { + "epoch": 0.7486959189935563, + "grad_norm": 1.1310726932463144, + "learning_rate": 1.6682320860475688e-05, + "loss": 0.4146, + "step": 1830 + }, + { + "epoch": 0.7527871535235757, + "grad_norm": 0.8570685688377147, + "learning_rate": 1.6652022420845327e-05, + "loss": 0.3971, + "step": 1840 + }, + { + "epoch": 0.7568783880535952, + "grad_norm": 1.2492633396416357, + "learning_rate": 1.6621723981214967e-05, + "loss": 0.4087, + "step": 1850 + }, + { + "epoch": 0.7609696225836146, + "grad_norm": 1.0021357130993627, + "learning_rate": 1.659142554158461e-05, + "loss": 0.4162, + "step": 1860 + }, + { + "epoch": 0.765060857113634, + "grad_norm": 0.7813489106628776, + "learning_rate": 1.656112710195425e-05, + "loss": 0.405, + "step": 1870 + }, + { + "epoch": 0.7691520916436535, + "grad_norm": 0.95305010521556, + "learning_rate": 1.6530828662323892e-05, + "loss": 0.391, + "step": 1880 + }, + { + "epoch": 0.7732433261736729, + "grad_norm": 0.6688770767238499, + "learning_rate": 1.650053022269353e-05, + "loss": 0.3802, + "step": 1890 + }, + { + "epoch": 0.7773345607036923, + "grad_norm": 0.6474194044675128, + "learning_rate": 1.6470231783063174e-05, + "loss": 0.4094, + "step": 1900 + }, + { + "epoch": 0.7814257952337118, + "grad_norm": 0.8498584394336631, + "learning_rate": 1.6439933343432814e-05, + "loss": 0.4073, + "step": 1910 + }, + { + "epoch": 0.7855170297637312, + "grad_norm": 0.7377058782682137, + "learning_rate": 1.6409634903802457e-05, + "loss": 0.3958, + "step": 1920 + }, + { + "epoch": 0.7896082642937506, + "grad_norm": 0.843204215889971, + "learning_rate": 1.6379336464172096e-05, + "loss": 0.414, + "step": 1930 + }, + { + "epoch": 0.7936994988237701, + "grad_norm": 0.6130692021490832, + "learning_rate": 1.6349038024541736e-05, + "loss": 0.4114, + "step": 1940 + }, + { + "epoch": 0.7977907333537895, + "grad_norm": 1.2350933998678075, + "learning_rate": 1.631873958491138e-05, + "loss": 0.4111, + "step": 1950 + }, + { + "epoch": 0.8018819678838089, + "grad_norm": 0.5814935040298003, + "learning_rate": 1.628844114528102e-05, + "loss": 0.4038, + "step": 1960 + }, + { + "epoch": 0.8059732024138284, + "grad_norm": 1.1917635251998295, + "learning_rate": 1.625814270565066e-05, + "loss": 0.4108, + "step": 1970 + }, + { + "epoch": 0.8100644369438478, + "grad_norm": 0.6526809291051177, + "learning_rate": 1.62278442660203e-05, + "loss": 0.3924, + "step": 1980 + }, + { + "epoch": 0.8141556714738672, + "grad_norm": 0.8339385093933975, + "learning_rate": 1.6197545826389943e-05, + "loss": 0.4007, + "step": 1990 + }, + { + "epoch": 0.8182469060038867, + "grad_norm": 0.8226184081347072, + "learning_rate": 1.6167247386759582e-05, + "loss": 0.4055, + "step": 2000 + }, + { + "epoch": 0.8182469060038867, + "eval_loss": 0.40546923875808716, + "eval_runtime": 569.0883, + "eval_samples_per_second": 5.426, + "eval_steps_per_second": 0.905, + "step": 2000 + }, + { + "epoch": 0.8223381405339061, + "grad_norm": 0.6918624248021936, + "learning_rate": 1.6136948947129225e-05, + "loss": 0.3996, + "step": 2010 + }, + { + "epoch": 0.8264293750639256, + "grad_norm": 0.9233306557406306, + "learning_rate": 1.6106650507498865e-05, + "loss": 0.4044, + "step": 2020 + }, + { + "epoch": 0.830520609593945, + "grad_norm": 1.036109547461375, + "learning_rate": 1.6076352067868508e-05, + "loss": 0.4078, + "step": 2030 + }, + { + "epoch": 0.8346118441239644, + "grad_norm": 0.6487594500506121, + "learning_rate": 1.6046053628238147e-05, + "loss": 0.3992, + "step": 2040 + }, + { + "epoch": 0.8387030786539839, + "grad_norm": 0.9061995762710394, + "learning_rate": 1.601575518860779e-05, + "loss": 0.4065, + "step": 2050 + }, + { + "epoch": 0.8427943131840032, + "grad_norm": 1.55527142608359, + "learning_rate": 1.598545674897743e-05, + "loss": 0.4126, + "step": 2060 + }, + { + "epoch": 0.8468855477140227, + "grad_norm": 0.6076317290425526, + "learning_rate": 1.595515830934707e-05, + "loss": 0.4027, + "step": 2070 + }, + { + "epoch": 0.8509767822440422, + "grad_norm": 1.100916449726025, + "learning_rate": 1.592485986971671e-05, + "loss": 0.4136, + "step": 2080 + }, + { + "epoch": 0.8550680167740615, + "grad_norm": 0.9334359781171854, + "learning_rate": 1.589456143008635e-05, + "loss": 0.4018, + "step": 2090 + }, + { + "epoch": 0.859159251304081, + "grad_norm": 1.1538729493388855, + "learning_rate": 1.586426299045599e-05, + "loss": 0.3981, + "step": 2100 + }, + { + "epoch": 0.8632504858341005, + "grad_norm": 0.6672272992408166, + "learning_rate": 1.5833964550825633e-05, + "loss": 0.3989, + "step": 2110 + }, + { + "epoch": 0.8673417203641198, + "grad_norm": 0.6790211847421547, + "learning_rate": 1.5803666111195276e-05, + "loss": 0.3999, + "step": 2120 + }, + { + "epoch": 0.8714329548941393, + "grad_norm": 0.8625927450358163, + "learning_rate": 1.5773367671564916e-05, + "loss": 0.4103, + "step": 2130 + }, + { + "epoch": 0.8755241894241588, + "grad_norm": 0.8799687483547954, + "learning_rate": 1.5743069231934555e-05, + "loss": 0.4, + "step": 2140 + }, + { + "epoch": 0.8796154239541781, + "grad_norm": 0.6045044925297284, + "learning_rate": 1.5712770792304198e-05, + "loss": 0.4127, + "step": 2150 + }, + { + "epoch": 0.8837066584841976, + "grad_norm": 1.1939155172076787, + "learning_rate": 1.5682472352673837e-05, + "loss": 0.3982, + "step": 2160 + }, + { + "epoch": 0.8877978930142171, + "grad_norm": 0.7558373479016637, + "learning_rate": 1.565217391304348e-05, + "loss": 0.4064, + "step": 2170 + }, + { + "epoch": 0.8918891275442364, + "grad_norm": 0.7840338054367947, + "learning_rate": 1.562187547341312e-05, + "loss": 0.411, + "step": 2180 + }, + { + "epoch": 0.8959803620742559, + "grad_norm": 0.6836078155952856, + "learning_rate": 1.5591577033782763e-05, + "loss": 0.4011, + "step": 2190 + }, + { + "epoch": 0.9000715966042754, + "grad_norm": 0.6588002590309792, + "learning_rate": 1.5561278594152402e-05, + "loss": 0.4034, + "step": 2200 + }, + { + "epoch": 0.9041628311342947, + "grad_norm": 0.7477069671187269, + "learning_rate": 1.5530980154522045e-05, + "loss": 0.414, + "step": 2210 + }, + { + "epoch": 0.9082540656643142, + "grad_norm": 0.9689258448123745, + "learning_rate": 1.5500681714891684e-05, + "loss": 0.3976, + "step": 2220 + }, + { + "epoch": 0.9123453001943337, + "grad_norm": 0.9394818001617259, + "learning_rate": 1.5470383275261324e-05, + "loss": 0.414, + "step": 2230 + }, + { + "epoch": 0.916436534724353, + "grad_norm": 0.7596601012213272, + "learning_rate": 1.5440084835630967e-05, + "loss": 0.3963, + "step": 2240 + }, + { + "epoch": 0.9205277692543725, + "grad_norm": 0.7148793324533078, + "learning_rate": 1.5409786396000606e-05, + "loss": 0.3933, + "step": 2250 + }, + { + "epoch": 0.924619003784392, + "grad_norm": 0.9033351893520957, + "learning_rate": 1.537948795637025e-05, + "loss": 0.4013, + "step": 2260 + }, + { + "epoch": 0.9287102383144114, + "grad_norm": 1.0192314023889302, + "learning_rate": 1.534918951673989e-05, + "loss": 0.4011, + "step": 2270 + }, + { + "epoch": 0.9328014728444308, + "grad_norm": 0.7958434523694122, + "learning_rate": 1.531889107710953e-05, + "loss": 0.4069, + "step": 2280 + }, + { + "epoch": 0.9368927073744503, + "grad_norm": 0.7201210026528915, + "learning_rate": 1.528859263747917e-05, + "loss": 0.3951, + "step": 2290 + }, + { + "epoch": 0.9409839419044697, + "grad_norm": 0.6175942086431213, + "learning_rate": 1.5258294197848814e-05, + "loss": 0.4047, + "step": 2300 + }, + { + "epoch": 0.9450751764344891, + "grad_norm": 0.7698111963051264, + "learning_rate": 1.5227995758218453e-05, + "loss": 0.4199, + "step": 2310 + }, + { + "epoch": 0.9491664109645085, + "grad_norm": 1.9264632916618274, + "learning_rate": 1.5197697318588094e-05, + "loss": 0.3931, + "step": 2320 + }, + { + "epoch": 0.953257645494528, + "grad_norm": 1.7879957288452364, + "learning_rate": 1.5167398878957735e-05, + "loss": 0.4007, + "step": 2330 + }, + { + "epoch": 0.9573488800245474, + "grad_norm": 0.5780222684091814, + "learning_rate": 1.5137100439327376e-05, + "loss": 0.4104, + "step": 2340 + }, + { + "epoch": 0.9614401145545668, + "grad_norm": 0.8683726024029582, + "learning_rate": 1.5106801999697016e-05, + "loss": 0.4036, + "step": 2350 + }, + { + "epoch": 0.9655313490845863, + "grad_norm": 0.9116114901687677, + "learning_rate": 1.5076503560066657e-05, + "loss": 0.4021, + "step": 2360 + }, + { + "epoch": 0.9696225836146057, + "grad_norm": 0.7992815803043055, + "learning_rate": 1.50462051204363e-05, + "loss": 0.4116, + "step": 2370 + }, + { + "epoch": 0.9737138181446251, + "grad_norm": 0.9553393046537682, + "learning_rate": 1.5015906680805941e-05, + "loss": 0.4081, + "step": 2380 + }, + { + "epoch": 0.9778050526746446, + "grad_norm": 0.588071607214625, + "learning_rate": 1.498560824117558e-05, + "loss": 0.4094, + "step": 2390 + }, + { + "epoch": 0.981896287204664, + "grad_norm": 0.8852293721983012, + "learning_rate": 1.4955309801545222e-05, + "loss": 0.4055, + "step": 2400 + }, + { + "epoch": 0.9859875217346834, + "grad_norm": 0.49652425498018843, + "learning_rate": 1.4925011361914863e-05, + "loss": 0.4, + "step": 2410 + }, + { + "epoch": 0.9900787562647029, + "grad_norm": 1.0572558341566427, + "learning_rate": 1.4894712922284504e-05, + "loss": 0.402, + "step": 2420 + }, + { + "epoch": 0.9941699907947223, + "grad_norm": 0.5472572586554295, + "learning_rate": 1.4864414482654143e-05, + "loss": 0.3824, + "step": 2430 + }, + { + "epoch": 0.9982612253247417, + "grad_norm": 0.6950966780791227, + "learning_rate": 1.4834116043023785e-05, + "loss": 0.3987, + "step": 2440 + }, + { + "epoch": 1.0020456172650096, + "grad_norm": 0.7596326868738933, + "learning_rate": 1.4803817603393427e-05, + "loss": 0.3989, + "step": 2450 + }, + { + "epoch": 1.006136851795029, + "grad_norm": 0.6633330801065885, + "learning_rate": 1.4773519163763069e-05, + "loss": 0.409, + "step": 2460 + }, + { + "epoch": 1.0102280863250486, + "grad_norm": 0.8877771135554996, + "learning_rate": 1.4743220724132708e-05, + "loss": 0.4127, + "step": 2470 + }, + { + "epoch": 1.014319320855068, + "grad_norm": 0.8214654626739897, + "learning_rate": 1.4712922284502349e-05, + "loss": 0.3883, + "step": 2480 + }, + { + "epoch": 1.0184105553850875, + "grad_norm": 1.0024461048478996, + "learning_rate": 1.468262384487199e-05, + "loss": 0.3684, + "step": 2490 + }, + { + "epoch": 1.022501789915107, + "grad_norm": 0.6824573340958298, + "learning_rate": 1.4652325405241631e-05, + "loss": 0.3826, + "step": 2500 + }, + { + "epoch": 1.022501789915107, + "eval_loss": 0.4052634835243225, + "eval_runtime": 566.8912, + "eval_samples_per_second": 5.447, + "eval_steps_per_second": 0.908, + "step": 2500 + }, + { + "epoch": 1.0265930244451262, + "grad_norm": 0.7110510772881399, + "learning_rate": 1.4622026965611271e-05, + "loss": 0.3871, + "step": 2510 + }, + { + "epoch": 1.0306842589751457, + "grad_norm": 0.7933140123736048, + "learning_rate": 1.4591728525980912e-05, + "loss": 0.3936, + "step": 2520 + }, + { + "epoch": 1.0347754935051652, + "grad_norm": 0.8306902290417011, + "learning_rate": 1.4561430086350555e-05, + "loss": 0.3987, + "step": 2530 + }, + { + "epoch": 1.0388667280351847, + "grad_norm": 0.7119087181302277, + "learning_rate": 1.4531131646720196e-05, + "loss": 0.3967, + "step": 2540 + }, + { + "epoch": 1.0429579625652041, + "grad_norm": 1.0370136579582763, + "learning_rate": 1.4500833207089835e-05, + "loss": 0.401, + "step": 2550 + }, + { + "epoch": 1.0470491970952236, + "grad_norm": 0.8532007552527441, + "learning_rate": 1.4470534767459477e-05, + "loss": 0.4037, + "step": 2560 + }, + { + "epoch": 1.0511404316252428, + "grad_norm": 0.6599598500909406, + "learning_rate": 1.4440236327829118e-05, + "loss": 0.3743, + "step": 2570 + }, + { + "epoch": 1.0552316661552623, + "grad_norm": 1.0157223230619488, + "learning_rate": 1.4409937888198759e-05, + "loss": 0.3845, + "step": 2580 + }, + { + "epoch": 1.0593229006852818, + "grad_norm": 0.885522215179448, + "learning_rate": 1.4379639448568402e-05, + "loss": 0.3903, + "step": 2590 + }, + { + "epoch": 1.0634141352153013, + "grad_norm": 0.6463142293297175, + "learning_rate": 1.434934100893804e-05, + "loss": 0.3781, + "step": 2600 + }, + { + "epoch": 1.0675053697453207, + "grad_norm": 1.0072005790653578, + "learning_rate": 1.4319042569307682e-05, + "loss": 0.3949, + "step": 2610 + }, + { + "epoch": 1.07159660427534, + "grad_norm": 0.7263224753671689, + "learning_rate": 1.4288744129677324e-05, + "loss": 0.3883, + "step": 2620 + }, + { + "epoch": 1.0756878388053595, + "grad_norm": 0.8333184288546814, + "learning_rate": 1.4258445690046965e-05, + "loss": 0.3815, + "step": 2630 + }, + { + "epoch": 1.079779073335379, + "grad_norm": 1.0113253670629012, + "learning_rate": 1.4228147250416604e-05, + "loss": 0.3813, + "step": 2640 + }, + { + "epoch": 1.0838703078653984, + "grad_norm": 0.8528216478869027, + "learning_rate": 1.4197848810786245e-05, + "loss": 0.4065, + "step": 2650 + }, + { + "epoch": 1.0879615423954179, + "grad_norm": 0.9440475129085675, + "learning_rate": 1.4167550371155886e-05, + "loss": 0.3962, + "step": 2660 + }, + { + "epoch": 1.0920527769254373, + "grad_norm": 0.6055474396833757, + "learning_rate": 1.413725193152553e-05, + "loss": 0.3995, + "step": 2670 + }, + { + "epoch": 1.0961440114554566, + "grad_norm": 1.0259816645911408, + "learning_rate": 1.4106953491895167e-05, + "loss": 0.3904, + "step": 2680 + }, + { + "epoch": 1.100235245985476, + "grad_norm": 0.5928389552557433, + "learning_rate": 1.407665505226481e-05, + "loss": 0.4021, + "step": 2690 + }, + { + "epoch": 1.1043264805154955, + "grad_norm": 0.6258598420479162, + "learning_rate": 1.4046356612634451e-05, + "loss": 0.391, + "step": 2700 + }, + { + "epoch": 1.108417715045515, + "grad_norm": 0.7147294619644624, + "learning_rate": 1.4016058173004092e-05, + "loss": 0.3837, + "step": 2710 + }, + { + "epoch": 1.1125089495755345, + "grad_norm": 0.646834791543815, + "learning_rate": 1.3985759733373732e-05, + "loss": 0.4035, + "step": 2720 + }, + { + "epoch": 1.116600184105554, + "grad_norm": 0.5735852182826044, + "learning_rate": 1.3955461293743373e-05, + "loss": 0.3857, + "step": 2730 + }, + { + "epoch": 1.1206914186355732, + "grad_norm": 0.7476886392647508, + "learning_rate": 1.3925162854113014e-05, + "loss": 0.4017, + "step": 2740 + }, + { + "epoch": 1.1247826531655927, + "grad_norm": 0.9517953594151923, + "learning_rate": 1.3894864414482657e-05, + "loss": 0.3792, + "step": 2750 + }, + { + "epoch": 1.1288738876956121, + "grad_norm": 0.8648296597362533, + "learning_rate": 1.3864565974852296e-05, + "loss": 0.3918, + "step": 2760 + }, + { + "epoch": 1.1329651222256316, + "grad_norm": 0.758091624498141, + "learning_rate": 1.3834267535221937e-05, + "loss": 0.3849, + "step": 2770 + }, + { + "epoch": 1.137056356755651, + "grad_norm": 0.8292099507010133, + "learning_rate": 1.3803969095591579e-05, + "loss": 0.4008, + "step": 2780 + }, + { + "epoch": 1.1411475912856703, + "grad_norm": 0.8886800402947126, + "learning_rate": 1.377367065596122e-05, + "loss": 0.4094, + "step": 2790 + }, + { + "epoch": 1.1452388258156898, + "grad_norm": 0.6921902768142778, + "learning_rate": 1.3743372216330859e-05, + "loss": 0.3857, + "step": 2800 + }, + { + "epoch": 1.1493300603457093, + "grad_norm": 0.7941140268815301, + "learning_rate": 1.37130737767005e-05, + "loss": 0.3946, + "step": 2810 + }, + { + "epoch": 1.1534212948757288, + "grad_norm": 0.8388193299897188, + "learning_rate": 1.3682775337070141e-05, + "loss": 0.3846, + "step": 2820 + }, + { + "epoch": 1.1575125294057482, + "grad_norm": 0.8827526348826681, + "learning_rate": 1.3652476897439784e-05, + "loss": 0.392, + "step": 2830 + }, + { + "epoch": 1.1616037639357677, + "grad_norm": 0.6399834967699533, + "learning_rate": 1.3622178457809424e-05, + "loss": 0.3936, + "step": 2840 + }, + { + "epoch": 1.1656949984657872, + "grad_norm": 0.7858076115450058, + "learning_rate": 1.3591880018179065e-05, + "loss": 0.4021, + "step": 2850 + }, + { + "epoch": 1.1697862329958064, + "grad_norm": 0.7641925003434336, + "learning_rate": 1.3561581578548706e-05, + "loss": 0.3855, + "step": 2860 + }, + { + "epoch": 1.173877467525826, + "grad_norm": 0.4485280825495544, + "learning_rate": 1.3531283138918347e-05, + "loss": 0.4015, + "step": 2870 + }, + { + "epoch": 1.1779687020558454, + "grad_norm": 0.8448609372779392, + "learning_rate": 1.3500984699287987e-05, + "loss": 0.4038, + "step": 2880 + }, + { + "epoch": 1.1820599365858648, + "grad_norm": 0.6112956233981559, + "learning_rate": 1.3470686259657628e-05, + "loss": 0.3941, + "step": 2890 + }, + { + "epoch": 1.1861511711158843, + "grad_norm": 0.787858901854597, + "learning_rate": 1.3440387820027269e-05, + "loss": 0.396, + "step": 2900 + }, + { + "epoch": 1.1902424056459036, + "grad_norm": 0.7489331322179997, + "learning_rate": 1.3410089380396912e-05, + "loss": 0.3855, + "step": 2910 + }, + { + "epoch": 1.194333640175923, + "grad_norm": 0.8245275477001581, + "learning_rate": 1.3379790940766553e-05, + "loss": 0.3903, + "step": 2920 + }, + { + "epoch": 1.1984248747059425, + "grad_norm": 0.5684832752577668, + "learning_rate": 1.3349492501136192e-05, + "loss": 0.3892, + "step": 2930 + }, + { + "epoch": 1.202516109235962, + "grad_norm": 0.5725195354548462, + "learning_rate": 1.3319194061505834e-05, + "loss": 0.3842, + "step": 2940 + }, + { + "epoch": 1.2066073437659814, + "grad_norm": 0.8605746695624041, + "learning_rate": 1.3288895621875475e-05, + "loss": 0.3989, + "step": 2950 + }, + { + "epoch": 1.2106985782960007, + "grad_norm": 0.569306392368477, + "learning_rate": 1.3258597182245116e-05, + "loss": 0.3886, + "step": 2960 + }, + { + "epoch": 1.2147898128260202, + "grad_norm": 0.8210447551557634, + "learning_rate": 1.3228298742614755e-05, + "loss": 0.378, + "step": 2970 + }, + { + "epoch": 1.2188810473560396, + "grad_norm": 0.5973996299783053, + "learning_rate": 1.3198000302984398e-05, + "loss": 0.4038, + "step": 2980 + }, + { + "epoch": 1.222972281886059, + "grad_norm": 1.5938167927490012, + "learning_rate": 1.316770186335404e-05, + "loss": 0.4133, + "step": 2990 + }, + { + "epoch": 1.2270635164160786, + "grad_norm": 0.6551073648017411, + "learning_rate": 1.313740342372368e-05, + "loss": 0.388, + "step": 3000 + }, + { + "epoch": 1.2270635164160786, + "eval_loss": 0.40439197421073914, + "eval_runtime": 566.8216, + "eval_samples_per_second": 5.448, + "eval_steps_per_second": 0.909, + "step": 3000 + }, + { + "epoch": 1.231154750946098, + "grad_norm": 0.9122941726483766, + "learning_rate": 1.310710498409332e-05, + "loss": 0.3962, + "step": 3010 + }, + { + "epoch": 1.2352459854761175, + "grad_norm": 0.8391097553941964, + "learning_rate": 1.3076806544462961e-05, + "loss": 0.387, + "step": 3020 + }, + { + "epoch": 1.2393372200061368, + "grad_norm": 0.8126464750636853, + "learning_rate": 1.3046508104832602e-05, + "loss": 0.3914, + "step": 3030 + }, + { + "epoch": 1.2434284545361562, + "grad_norm": 0.7877632963656168, + "learning_rate": 1.3016209665202243e-05, + "loss": 0.3678, + "step": 3040 + }, + { + "epoch": 1.2475196890661757, + "grad_norm": 0.7204057647654071, + "learning_rate": 1.2985911225571883e-05, + "loss": 0.4056, + "step": 3050 + }, + { + "epoch": 1.2516109235961952, + "grad_norm": 1.0360947842710033, + "learning_rate": 1.2955612785941526e-05, + "loss": 0.3921, + "step": 3060 + }, + { + "epoch": 1.2557021581262147, + "grad_norm": 0.8740894371532404, + "learning_rate": 1.2925314346311167e-05, + "loss": 0.3785, + "step": 3070 + }, + { + "epoch": 1.259793392656234, + "grad_norm": 0.7168443376463302, + "learning_rate": 1.2895015906680808e-05, + "loss": 0.403, + "step": 3080 + }, + { + "epoch": 1.2638846271862534, + "grad_norm": 0.5888660550300815, + "learning_rate": 1.2864717467050447e-05, + "loss": 0.3716, + "step": 3090 + }, + { + "epoch": 1.2679758617162729, + "grad_norm": 0.5289574947955048, + "learning_rate": 1.2834419027420089e-05, + "loss": 0.3764, + "step": 3100 + }, + { + "epoch": 1.2720670962462923, + "grad_norm": 0.7080617124171087, + "learning_rate": 1.280412058778973e-05, + "loss": 0.3927, + "step": 3110 + }, + { + "epoch": 1.2761583307763118, + "grad_norm": 0.8062114426141331, + "learning_rate": 1.277382214815937e-05, + "loss": 0.4044, + "step": 3120 + }, + { + "epoch": 1.280249565306331, + "grad_norm": 0.476668956425164, + "learning_rate": 1.274352370852901e-05, + "loss": 0.3868, + "step": 3130 + }, + { + "epoch": 1.2843407998363507, + "grad_norm": 0.6871622346909918, + "learning_rate": 1.2713225268898653e-05, + "loss": 0.3947, + "step": 3140 + }, + { + "epoch": 1.28843203436637, + "grad_norm": 0.7061613344838735, + "learning_rate": 1.2682926829268294e-05, + "loss": 0.3872, + "step": 3150 + }, + { + "epoch": 1.2925232688963895, + "grad_norm": 0.9863242302948113, + "learning_rate": 1.2652628389637935e-05, + "loss": 0.3889, + "step": 3160 + }, + { + "epoch": 1.296614503426409, + "grad_norm": 0.5738534501395554, + "learning_rate": 1.2622329950007575e-05, + "loss": 0.4054, + "step": 3170 + }, + { + "epoch": 1.3007057379564284, + "grad_norm": 0.8389559630933702, + "learning_rate": 1.2592031510377216e-05, + "loss": 0.3917, + "step": 3180 + }, + { + "epoch": 1.3047969724864479, + "grad_norm": 0.7991010292600728, + "learning_rate": 1.2561733070746857e-05, + "loss": 0.3901, + "step": 3190 + }, + { + "epoch": 1.3088882070164671, + "grad_norm": 0.5541164921899168, + "learning_rate": 1.25314346311165e-05, + "loss": 0.4008, + "step": 3200 + }, + { + "epoch": 1.3129794415464866, + "grad_norm": 0.8663675517676568, + "learning_rate": 1.2501136191486138e-05, + "loss": 0.3966, + "step": 3210 + }, + { + "epoch": 1.317070676076506, + "grad_norm": 0.5807714675119107, + "learning_rate": 1.247083775185578e-05, + "loss": 0.3881, + "step": 3220 + }, + { + "epoch": 1.3211619106065255, + "grad_norm": 0.5949773301489646, + "learning_rate": 1.2440539312225422e-05, + "loss": 0.3967, + "step": 3230 + }, + { + "epoch": 1.325253145136545, + "grad_norm": 1.0156281369557891, + "learning_rate": 1.2410240872595063e-05, + "loss": 0.3914, + "step": 3240 + }, + { + "epoch": 1.3293443796665643, + "grad_norm": 0.5546340288088691, + "learning_rate": 1.2379942432964704e-05, + "loss": 0.3869, + "step": 3250 + }, + { + "epoch": 1.3334356141965837, + "grad_norm": 1.0378625122625662, + "learning_rate": 1.2349643993334344e-05, + "loss": 0.4012, + "step": 3260 + }, + { + "epoch": 1.3375268487266032, + "grad_norm": 1.0535947037253341, + "learning_rate": 1.2319345553703985e-05, + "loss": 0.4132, + "step": 3270 + }, + { + "epoch": 1.3416180832566227, + "grad_norm": 0.6479776053412775, + "learning_rate": 1.2289047114073628e-05, + "loss": 0.3989, + "step": 3280 + }, + { + "epoch": 1.3457093177866422, + "grad_norm": 0.6474716962137215, + "learning_rate": 1.2258748674443269e-05, + "loss": 0.3937, + "step": 3290 + }, + { + "epoch": 1.3498005523166616, + "grad_norm": 0.770667683748129, + "learning_rate": 1.2228450234812908e-05, + "loss": 0.4021, + "step": 3300 + }, + { + "epoch": 1.353891786846681, + "grad_norm": 0.9562402793665876, + "learning_rate": 1.219815179518255e-05, + "loss": 0.3895, + "step": 3310 + }, + { + "epoch": 1.3579830213767003, + "grad_norm": 0.7835866454881345, + "learning_rate": 1.216785335555219e-05, + "loss": 0.3915, + "step": 3320 + }, + { + "epoch": 1.3620742559067198, + "grad_norm": 0.6840970037680498, + "learning_rate": 1.2137554915921832e-05, + "loss": 0.3933, + "step": 3330 + }, + { + "epoch": 1.3661654904367393, + "grad_norm": 0.6363375592990289, + "learning_rate": 1.2107256476291471e-05, + "loss": 0.3779, + "step": 3340 + }, + { + "epoch": 1.3702567249667588, + "grad_norm": 0.6422064337095349, + "learning_rate": 1.2076958036661112e-05, + "loss": 0.3781, + "step": 3350 + }, + { + "epoch": 1.3743479594967782, + "grad_norm": 0.7443280549443633, + "learning_rate": 1.2046659597030755e-05, + "loss": 0.3829, + "step": 3360 + }, + { + "epoch": 1.3784391940267975, + "grad_norm": 0.9031635762318649, + "learning_rate": 1.2016361157400396e-05, + "loss": 0.3964, + "step": 3370 + }, + { + "epoch": 1.382530428556817, + "grad_norm": 0.5980799440932613, + "learning_rate": 1.1986062717770036e-05, + "loss": 0.3887, + "step": 3380 + }, + { + "epoch": 1.3866216630868364, + "grad_norm": 0.6477461394839297, + "learning_rate": 1.1955764278139677e-05, + "loss": 0.3912, + "step": 3390 + }, + { + "epoch": 1.390712897616856, + "grad_norm": 0.8829826372548863, + "learning_rate": 1.1925465838509318e-05, + "loss": 0.4026, + "step": 3400 + }, + { + "epoch": 1.3948041321468754, + "grad_norm": 0.602437282415112, + "learning_rate": 1.1895167398878959e-05, + "loss": 0.3968, + "step": 3410 + }, + { + "epoch": 1.3988953666768946, + "grad_norm": 1.0853438418518562, + "learning_rate": 1.1864868959248599e-05, + "loss": 0.3795, + "step": 3420 + }, + { + "epoch": 1.4029866012069143, + "grad_norm": 0.68812770168341, + "learning_rate": 1.183457051961824e-05, + "loss": 0.3903, + "step": 3430 + }, + { + "epoch": 1.4070778357369336, + "grad_norm": 0.673891970112453, + "learning_rate": 1.1804272079987883e-05, + "loss": 0.3955, + "step": 3440 + }, + { + "epoch": 1.411169070266953, + "grad_norm": 0.9658876480715098, + "learning_rate": 1.1773973640357524e-05, + "loss": 0.3806, + "step": 3450 + }, + { + "epoch": 1.4152603047969725, + "grad_norm": 0.9072014909826842, + "learning_rate": 1.1743675200727163e-05, + "loss": 0.393, + "step": 3460 + }, + { + "epoch": 1.419351539326992, + "grad_norm": 0.8593625419024139, + "learning_rate": 1.1713376761096804e-05, + "loss": 0.3829, + "step": 3470 + }, + { + "epoch": 1.4234427738570115, + "grad_norm": 1.1598468495920022, + "learning_rate": 1.1683078321466445e-05, + "loss": 0.3895, + "step": 3480 + }, + { + "epoch": 1.4275340083870307, + "grad_norm": 0.6212230421582741, + "learning_rate": 1.1652779881836087e-05, + "loss": 0.404, + "step": 3490 + }, + { + "epoch": 1.4316252429170502, + "grad_norm": 1.052418138214992, + "learning_rate": 1.1622481442205726e-05, + "loss": 0.3788, + "step": 3500 + }, + { + "epoch": 1.4316252429170502, + "eval_loss": 0.3989790678024292, + "eval_runtime": 567.3009, + "eval_samples_per_second": 5.443, + "eval_steps_per_second": 0.908, + "step": 3500 + }, + { + "epoch": 1.4357164774470696, + "grad_norm": 0.9457200564933581, + "learning_rate": 1.1592183002575367e-05, + "loss": 0.3848, + "step": 3510 + }, + { + "epoch": 1.4398077119770891, + "grad_norm": 0.711408339200041, + "learning_rate": 1.156188456294501e-05, + "loss": 0.3777, + "step": 3520 + }, + { + "epoch": 1.4438989465071086, + "grad_norm": 0.765462240881764, + "learning_rate": 1.1531586123314651e-05, + "loss": 0.392, + "step": 3530 + }, + { + "epoch": 1.4479901810371278, + "grad_norm": 0.8447863863134836, + "learning_rate": 1.150128768368429e-05, + "loss": 0.394, + "step": 3540 + }, + { + "epoch": 1.4520814155671473, + "grad_norm": 0.7219223617330234, + "learning_rate": 1.1470989244053932e-05, + "loss": 0.3854, + "step": 3550 + }, + { + "epoch": 1.4561726500971668, + "grad_norm": 0.9253301498291125, + "learning_rate": 1.1440690804423573e-05, + "loss": 0.383, + "step": 3560 + }, + { + "epoch": 1.4602638846271863, + "grad_norm": 0.7352515331825862, + "learning_rate": 1.1410392364793214e-05, + "loss": 0.4006, + "step": 3570 + }, + { + "epoch": 1.4643551191572057, + "grad_norm": 0.7903238163122892, + "learning_rate": 1.1380093925162854e-05, + "loss": 0.3966, + "step": 3580 + }, + { + "epoch": 1.4684463536872252, + "grad_norm": 0.8163601731666391, + "learning_rate": 1.1349795485532495e-05, + "loss": 0.3843, + "step": 3590 + }, + { + "epoch": 1.4725375882172447, + "grad_norm": 0.7031404690333898, + "learning_rate": 1.1319497045902137e-05, + "loss": 0.3889, + "step": 3600 + }, + { + "epoch": 1.476628822747264, + "grad_norm": 0.9035695820122014, + "learning_rate": 1.1289198606271779e-05, + "loss": 0.3827, + "step": 3610 + }, + { + "epoch": 1.4807200572772834, + "grad_norm": 0.8712016470060545, + "learning_rate": 1.125890016664142e-05, + "loss": 0.3833, + "step": 3620 + }, + { + "epoch": 1.4848112918073029, + "grad_norm": 0.5886133981154346, + "learning_rate": 1.122860172701106e-05, + "loss": 0.3852, + "step": 3630 + }, + { + "epoch": 1.4889025263373223, + "grad_norm": 0.6641821222496354, + "learning_rate": 1.11983032873807e-05, + "loss": 0.3883, + "step": 3640 + }, + { + "epoch": 1.4929937608673418, + "grad_norm": 0.71470696861074, + "learning_rate": 1.1168004847750342e-05, + "loss": 0.39, + "step": 3650 + }, + { + "epoch": 1.497084995397361, + "grad_norm": 0.7599626830703158, + "learning_rate": 1.1137706408119984e-05, + "loss": 0.3988, + "step": 3660 + }, + { + "epoch": 1.5011762299273805, + "grad_norm": 0.8131240458205017, + "learning_rate": 1.1107407968489622e-05, + "loss": 0.3837, + "step": 3670 + }, + { + "epoch": 1.5052674644574, + "grad_norm": 0.5981406315717848, + "learning_rate": 1.1077109528859265e-05, + "loss": 0.3976, + "step": 3680 + }, + { + "epoch": 1.5093586989874195, + "grad_norm": 0.6546000876796034, + "learning_rate": 1.1046811089228906e-05, + "loss": 0.3845, + "step": 3690 + }, + { + "epoch": 1.513449933517439, + "grad_norm": 0.6618741576872935, + "learning_rate": 1.1016512649598547e-05, + "loss": 0.3829, + "step": 3700 + }, + { + "epoch": 1.5175411680474582, + "grad_norm": 0.7644320579880938, + "learning_rate": 1.0986214209968187e-05, + "loss": 0.3904, + "step": 3710 + }, + { + "epoch": 1.521632402577478, + "grad_norm": 0.7078963682359172, + "learning_rate": 1.0955915770337828e-05, + "loss": 0.3943, + "step": 3720 + }, + { + "epoch": 1.5257236371074971, + "grad_norm": 0.9863976210557551, + "learning_rate": 1.0925617330707469e-05, + "loss": 0.3836, + "step": 3730 + }, + { + "epoch": 1.5298148716375166, + "grad_norm": 0.7431834628180725, + "learning_rate": 1.0895318891077112e-05, + "loss": 0.4033, + "step": 3740 + }, + { + "epoch": 1.533906106167536, + "grad_norm": 0.9543361591228587, + "learning_rate": 1.0865020451446751e-05, + "loss": 0.3928, + "step": 3750 + }, + { + "epoch": 1.5379973406975553, + "grad_norm": 0.7174707063961077, + "learning_rate": 1.0834722011816392e-05, + "loss": 0.3848, + "step": 3760 + }, + { + "epoch": 1.542088575227575, + "grad_norm": 0.8245320777882585, + "learning_rate": 1.0804423572186034e-05, + "loss": 0.3992, + "step": 3770 + }, + { + "epoch": 1.5461798097575943, + "grad_norm": 1.0937610813639995, + "learning_rate": 1.0774125132555675e-05, + "loss": 0.3922, + "step": 3780 + }, + { + "epoch": 1.5502710442876138, + "grad_norm": 0.6595221788634811, + "learning_rate": 1.0743826692925314e-05, + "loss": 0.3846, + "step": 3790 + }, + { + "epoch": 1.5543622788176332, + "grad_norm": 0.6933714247729369, + "learning_rate": 1.0713528253294955e-05, + "loss": 0.3924, + "step": 3800 + }, + { + "epoch": 1.5584535133476527, + "grad_norm": 0.6322021390419345, + "learning_rate": 1.0683229813664597e-05, + "loss": 0.382, + "step": 3810 + }, + { + "epoch": 1.5625447478776722, + "grad_norm": 0.8617688710364446, + "learning_rate": 1.065293137403424e-05, + "loss": 0.39, + "step": 3820 + }, + { + "epoch": 1.5666359824076914, + "grad_norm": 1.244329874318961, + "learning_rate": 1.0622632934403879e-05, + "loss": 0.3963, + "step": 3830 + }, + { + "epoch": 1.5707272169377111, + "grad_norm": 0.9027740160226115, + "learning_rate": 1.059233449477352e-05, + "loss": 0.382, + "step": 3840 + }, + { + "epoch": 1.5748184514677304, + "grad_norm": 0.5021808277805254, + "learning_rate": 1.0562036055143161e-05, + "loss": 0.399, + "step": 3850 + }, + { + "epoch": 1.5789096859977498, + "grad_norm": 0.6718417113604481, + "learning_rate": 1.0531737615512802e-05, + "loss": 0.3947, + "step": 3860 + }, + { + "epoch": 1.5830009205277693, + "grad_norm": 0.6732165543554006, + "learning_rate": 1.0501439175882442e-05, + "loss": 0.3921, + "step": 3870 + }, + { + "epoch": 1.5870921550577886, + "grad_norm": 0.8949389121109214, + "learning_rate": 1.0471140736252083e-05, + "loss": 0.3789, + "step": 3880 + }, + { + "epoch": 1.5911833895878083, + "grad_norm": 0.8368104145013396, + "learning_rate": 1.0440842296621724e-05, + "loss": 0.3838, + "step": 3890 + }, + { + "epoch": 1.5952746241178275, + "grad_norm": 0.6115609968754325, + "learning_rate": 1.0410543856991367e-05, + "loss": 0.3919, + "step": 3900 + }, + { + "epoch": 1.599365858647847, + "grad_norm": 0.8379228899852589, + "learning_rate": 1.0380245417361006e-05, + "loss": 0.3991, + "step": 3910 + }, + { + "epoch": 1.6034570931778664, + "grad_norm": 0.7827671214396511, + "learning_rate": 1.0349946977730647e-05, + "loss": 0.3794, + "step": 3920 + }, + { + "epoch": 1.6075483277078857, + "grad_norm": 0.6805284446607271, + "learning_rate": 1.0319648538100289e-05, + "loss": 0.3936, + "step": 3930 + }, + { + "epoch": 1.6116395622379054, + "grad_norm": 0.5081615910622816, + "learning_rate": 1.028935009846993e-05, + "loss": 0.3882, + "step": 3940 + }, + { + "epoch": 1.6157307967679246, + "grad_norm": 0.585926687974076, + "learning_rate": 1.0259051658839571e-05, + "loss": 0.4009, + "step": 3950 + }, + { + "epoch": 1.619822031297944, + "grad_norm": 0.6755208541842371, + "learning_rate": 1.022875321920921e-05, + "loss": 0.3797, + "step": 3960 + }, + { + "epoch": 1.6239132658279636, + "grad_norm": 0.7578805362403562, + "learning_rate": 1.0198454779578853e-05, + "loss": 0.3877, + "step": 3970 + }, + { + "epoch": 1.628004500357983, + "grad_norm": 0.7490700009059831, + "learning_rate": 1.0168156339948494e-05, + "loss": 0.3779, + "step": 3980 + }, + { + "epoch": 1.6320957348880025, + "grad_norm": 0.6801784359822746, + "learning_rate": 1.0137857900318136e-05, + "loss": 0.3741, + "step": 3990 + }, + { + "epoch": 1.6361869694180218, + "grad_norm": 0.6583694408666441, + "learning_rate": 1.0107559460687775e-05, + "loss": 0.3959, + "step": 4000 + }, + { + "epoch": 1.6361869694180218, + "eval_loss": 0.39864814281463623, + "eval_runtime": 580.5731, + "eval_samples_per_second": 5.319, + "eval_steps_per_second": 0.887, + "step": 4000 + }, + { + "epoch": 1.6402782039480415, + "grad_norm": 0.9122028487385195, + "learning_rate": 1.0077261021057416e-05, + "loss": 0.3933, + "step": 4010 + }, + { + "epoch": 1.6443694384780607, + "grad_norm": 0.9631240105647585, + "learning_rate": 1.0046962581427057e-05, + "loss": 0.3886, + "step": 4020 + }, + { + "epoch": 1.6484606730080802, + "grad_norm": 0.8266805005833362, + "learning_rate": 1.0016664141796698e-05, + "loss": 0.3878, + "step": 4030 + }, + { + "epoch": 1.6525519075380997, + "grad_norm": 0.692721855989811, + "learning_rate": 9.98636570216634e-06, + "loss": 0.3732, + "step": 4040 + }, + { + "epoch": 1.656643142068119, + "grad_norm": 0.5908823120184169, + "learning_rate": 9.95606726253598e-06, + "loss": 0.3808, + "step": 4050 + }, + { + "epoch": 1.6607343765981386, + "grad_norm": 0.9905003235273305, + "learning_rate": 9.925768822905622e-06, + "loss": 0.3807, + "step": 4060 + }, + { + "epoch": 1.6648256111281579, + "grad_norm": 0.731190147784246, + "learning_rate": 9.895470383275261e-06, + "loss": 0.3826, + "step": 4070 + }, + { + "epoch": 1.6689168456581773, + "grad_norm": 0.5342009151761759, + "learning_rate": 9.865171943644904e-06, + "loss": 0.3931, + "step": 4080 + }, + { + "epoch": 1.6730080801881968, + "grad_norm": 0.6299401650866043, + "learning_rate": 9.834873504014544e-06, + "loss": 0.3655, + "step": 4090 + }, + { + "epoch": 1.6770993147182163, + "grad_norm": 0.6634845001555149, + "learning_rate": 9.804575064384185e-06, + "loss": 0.3857, + "step": 4100 + }, + { + "epoch": 1.6811905492482357, + "grad_norm": 0.47553645688690244, + "learning_rate": 9.774276624753826e-06, + "loss": 0.3819, + "step": 4110 + }, + { + "epoch": 1.685281783778255, + "grad_norm": 0.946430943640409, + "learning_rate": 9.743978185123467e-06, + "loss": 0.3761, + "step": 4120 + }, + { + "epoch": 1.6893730183082747, + "grad_norm": 0.9939471324083836, + "learning_rate": 9.713679745493108e-06, + "loss": 0.388, + "step": 4130 + }, + { + "epoch": 1.693464252838294, + "grad_norm": 0.6474361686497936, + "learning_rate": 9.68338130586275e-06, + "loss": 0.3662, + "step": 4140 + }, + { + "epoch": 1.6975554873683134, + "grad_norm": 0.7980334202517952, + "learning_rate": 9.653082866232389e-06, + "loss": 0.3849, + "step": 4150 + }, + { + "epoch": 1.7016467218983329, + "grad_norm": 0.728268244955796, + "learning_rate": 9.622784426602032e-06, + "loss": 0.3842, + "step": 4160 + }, + { + "epoch": 1.7057379564283521, + "grad_norm": 0.9131018127080367, + "learning_rate": 9.592485986971671e-06, + "loss": 0.3824, + "step": 4170 + }, + { + "epoch": 1.7098291909583718, + "grad_norm": 0.7674900891840208, + "learning_rate": 9.562187547341312e-06, + "loss": 0.3733, + "step": 4180 + }, + { + "epoch": 1.713920425488391, + "grad_norm": 0.9246508059789594, + "learning_rate": 9.531889107710953e-06, + "loss": 0.4029, + "step": 4190 + }, + { + "epoch": 1.7180116600184105, + "grad_norm": 0.6178369369789531, + "learning_rate": 9.501590668080595e-06, + "loss": 0.37, + "step": 4200 + }, + { + "epoch": 1.72210289454843, + "grad_norm": 0.9227660541364927, + "learning_rate": 9.471292228450236e-06, + "loss": 0.3841, + "step": 4210 + }, + { + "epoch": 1.7261941290784493, + "grad_norm": 0.6860127218022202, + "learning_rate": 9.440993788819877e-06, + "loss": 0.3859, + "step": 4220 + }, + { + "epoch": 1.730285363608469, + "grad_norm": 0.6968645833002234, + "learning_rate": 9.410695349189516e-06, + "loss": 0.4007, + "step": 4230 + }, + { + "epoch": 1.7343765981384882, + "grad_norm": 0.7545598370400031, + "learning_rate": 9.38039690955916e-06, + "loss": 0.3859, + "step": 4240 + }, + { + "epoch": 1.7384678326685077, + "grad_norm": 0.7363763263567308, + "learning_rate": 9.350098469928799e-06, + "loss": 0.3867, + "step": 4250 + }, + { + "epoch": 1.7425590671985272, + "grad_norm": 0.48614259001613647, + "learning_rate": 9.31980003029844e-06, + "loss": 0.3798, + "step": 4260 + }, + { + "epoch": 1.7466503017285466, + "grad_norm": 0.7729758585437992, + "learning_rate": 9.289501590668083e-06, + "loss": 0.3912, + "step": 4270 + }, + { + "epoch": 1.750741536258566, + "grad_norm": 0.7305646510615461, + "learning_rate": 9.259203151037722e-06, + "loss": 0.385, + "step": 4280 + }, + { + "epoch": 1.7548327707885853, + "grad_norm": 0.9399629054581843, + "learning_rate": 9.228904711407363e-06, + "loss": 0.3875, + "step": 4290 + }, + { + "epoch": 1.758924005318605, + "grad_norm": 0.49789660727149787, + "learning_rate": 9.198606271777004e-06, + "loss": 0.3886, + "step": 4300 + }, + { + "epoch": 1.7630152398486243, + "grad_norm": 0.8040334242726411, + "learning_rate": 9.168307832146646e-06, + "loss": 0.3854, + "step": 4310 + }, + { + "epoch": 1.7671064743786438, + "grad_norm": 0.520719829405063, + "learning_rate": 9.138009392516287e-06, + "loss": 0.3717, + "step": 4320 + }, + { + "epoch": 1.7711977089086632, + "grad_norm": 1.0848704176681172, + "learning_rate": 9.107710952885928e-06, + "loss": 0.3888, + "step": 4330 + }, + { + "epoch": 1.7752889434386825, + "grad_norm": 1.1159019237173737, + "learning_rate": 9.077412513255567e-06, + "loss": 0.3773, + "step": 4340 + }, + { + "epoch": 1.7793801779687022, + "grad_norm": 0.45334318412994085, + "learning_rate": 9.04711407362521e-06, + "loss": 0.3795, + "step": 4350 + }, + { + "epoch": 1.7834714124987214, + "grad_norm": 0.6934540742121007, + "learning_rate": 9.01681563399485e-06, + "loss": 0.3796, + "step": 4360 + }, + { + "epoch": 1.787562647028741, + "grad_norm": 0.5578670479631883, + "learning_rate": 8.98651719436449e-06, + "loss": 0.3845, + "step": 4370 + }, + { + "epoch": 1.7916538815587604, + "grad_norm": 0.7503759650869845, + "learning_rate": 8.956218754734132e-06, + "loss": 0.3942, + "step": 4380 + }, + { + "epoch": 1.7957451160887796, + "grad_norm": 0.867971089595999, + "learning_rate": 8.925920315103773e-06, + "loss": 0.3862, + "step": 4390 + }, + { + "epoch": 1.7998363506187993, + "grad_norm": 0.8400133056337923, + "learning_rate": 8.895621875473414e-06, + "loss": 0.4045, + "step": 4400 + }, + { + "epoch": 1.8039275851488186, + "grad_norm": 0.8593095780014482, + "learning_rate": 8.865323435843055e-06, + "loss": 0.3972, + "step": 4410 + }, + { + "epoch": 1.808018819678838, + "grad_norm": 0.8648015436921023, + "learning_rate": 8.835024996212695e-06, + "loss": 0.3885, + "step": 4420 + }, + { + "epoch": 1.8121100542088575, + "grad_norm": 0.8474119696284984, + "learning_rate": 8.804726556582338e-06, + "loss": 0.3797, + "step": 4430 + }, + { + "epoch": 1.816201288738877, + "grad_norm": 0.6888945197943561, + "learning_rate": 8.774428116951977e-06, + "loss": 0.386, + "step": 4440 + }, + { + "epoch": 1.8202925232688965, + "grad_norm": 0.8152832539209761, + "learning_rate": 8.744129677321618e-06, + "loss": 0.3903, + "step": 4450 + }, + { + "epoch": 1.8243837577989157, + "grad_norm": 0.7728650210318904, + "learning_rate": 8.71383123769126e-06, + "loss": 0.3919, + "step": 4460 + }, + { + "epoch": 1.8284749923289354, + "grad_norm": 0.5391104409638143, + "learning_rate": 8.6835327980609e-06, + "loss": 0.3803, + "step": 4470 + }, + { + "epoch": 1.8325662268589546, + "grad_norm": 0.7567717191116343, + "learning_rate": 8.653234358430542e-06, + "loss": 0.3815, + "step": 4480 + }, + { + "epoch": 1.8366574613889741, + "grad_norm": 0.8749818376847743, + "learning_rate": 8.622935918800183e-06, + "loss": 0.3883, + "step": 4490 + }, + { + "epoch": 1.8407486959189936, + "grad_norm": 0.8019778003533239, + "learning_rate": 8.592637479169822e-06, + "loss": 0.383, + "step": 4500 + }, + { + "epoch": 1.8407486959189936, + "eval_loss": 0.3955570459365845, + "eval_runtime": 568.3755, + "eval_samples_per_second": 5.433, + "eval_steps_per_second": 0.906, + "step": 4500 + }, + { + "epoch": 1.8448399304490128, + "grad_norm": 0.6293625351988416, + "learning_rate": 8.562339039539465e-06, + "loss": 0.387, + "step": 4510 + }, + { + "epoch": 1.8489311649790325, + "grad_norm": 0.6963219813926529, + "learning_rate": 8.532040599909105e-06, + "loss": 0.3802, + "step": 4520 + }, + { + "epoch": 1.8530223995090518, + "grad_norm": 0.6929306602792736, + "learning_rate": 8.501742160278746e-06, + "loss": 0.3797, + "step": 4530 + }, + { + "epoch": 1.8571136340390713, + "grad_norm": 0.669690717256871, + "learning_rate": 8.471443720648387e-06, + "loss": 0.394, + "step": 4540 + }, + { + "epoch": 1.8612048685690907, + "grad_norm": 0.6233392743914433, + "learning_rate": 8.441145281018028e-06, + "loss": 0.3896, + "step": 4550 + }, + { + "epoch": 1.8652961030991102, + "grad_norm": 0.7239502206719675, + "learning_rate": 8.41084684138767e-06, + "loss": 0.3947, + "step": 4560 + }, + { + "epoch": 1.8693873376291297, + "grad_norm": 0.6756956090605876, + "learning_rate": 8.38054840175731e-06, + "loss": 0.3925, + "step": 4570 + }, + { + "epoch": 1.873478572159149, + "grad_norm": 0.7446762514738522, + "learning_rate": 8.35024996212695e-06, + "loss": 0.3833, + "step": 4580 + }, + { + "epoch": 1.8775698066891686, + "grad_norm": 0.7701766571709215, + "learning_rate": 8.319951522496593e-06, + "loss": 0.3864, + "step": 4590 + }, + { + "epoch": 1.8816610412191879, + "grad_norm": 0.6441859481438332, + "learning_rate": 8.289653082866234e-06, + "loss": 0.3776, + "step": 4600 + }, + { + "epoch": 1.8857522757492073, + "grad_norm": 0.7880686132167092, + "learning_rate": 8.259354643235873e-06, + "loss": 0.3933, + "step": 4610 + }, + { + "epoch": 1.8898435102792268, + "grad_norm": 0.6332952763922544, + "learning_rate": 8.229056203605516e-06, + "loss": 0.3853, + "step": 4620 + }, + { + "epoch": 1.893934744809246, + "grad_norm": 0.7243342267131859, + "learning_rate": 8.198757763975156e-06, + "loss": 0.3792, + "step": 4630 + }, + { + "epoch": 1.8980259793392658, + "grad_norm": 0.7901459850102072, + "learning_rate": 8.168459324344797e-06, + "loss": 0.3869, + "step": 4640 + }, + { + "epoch": 1.902117213869285, + "grad_norm": 0.51044070307818, + "learning_rate": 8.138160884714438e-06, + "loss": 0.3775, + "step": 4650 + }, + { + "epoch": 1.9062084483993045, + "grad_norm": 0.8558077236451173, + "learning_rate": 8.107862445084079e-06, + "loss": 0.3894, + "step": 4660 + }, + { + "epoch": 1.910299682929324, + "grad_norm": 0.7131026847220464, + "learning_rate": 8.07756400545372e-06, + "loss": 0.3862, + "step": 4670 + }, + { + "epoch": 1.9143909174593432, + "grad_norm": 0.9750476025242552, + "learning_rate": 8.047265565823361e-06, + "loss": 0.3957, + "step": 4680 + }, + { + "epoch": 1.918482151989363, + "grad_norm": 0.7225249415951167, + "learning_rate": 8.016967126193e-06, + "loss": 0.4027, + "step": 4690 + }, + { + "epoch": 1.9225733865193821, + "grad_norm": 0.5891488270484448, + "learning_rate": 7.986668686562644e-06, + "loss": 0.3886, + "step": 4700 + }, + { + "epoch": 1.9266646210494016, + "grad_norm": 1.0628090466492606, + "learning_rate": 7.956370246932283e-06, + "loss": 0.398, + "step": 4710 + }, + { + "epoch": 1.930755855579421, + "grad_norm": 0.6647239527835125, + "learning_rate": 7.926071807301924e-06, + "loss": 0.3918, + "step": 4720 + }, + { + "epoch": 1.9348470901094406, + "grad_norm": 0.7463185047324199, + "learning_rate": 7.895773367671565e-06, + "loss": 0.3827, + "step": 4730 + }, + { + "epoch": 1.93893832463946, + "grad_norm": 0.7195015094736638, + "learning_rate": 7.865474928041206e-06, + "loss": 0.4016, + "step": 4740 + }, + { + "epoch": 1.9430295591694793, + "grad_norm": 0.8535138221139457, + "learning_rate": 7.835176488410848e-06, + "loss": 0.3869, + "step": 4750 + }, + { + "epoch": 1.947120793699499, + "grad_norm": 0.7791996740996957, + "learning_rate": 7.804878048780489e-06, + "loss": 0.3714, + "step": 4760 + }, + { + "epoch": 1.9512120282295182, + "grad_norm": 0.5772248432427366, + "learning_rate": 7.77457960915013e-06, + "loss": 0.3822, + "step": 4770 + }, + { + "epoch": 1.9553032627595377, + "grad_norm": 0.7604443317770228, + "learning_rate": 7.744281169519771e-06, + "loss": 0.379, + "step": 4780 + }, + { + "epoch": 1.9593944972895572, + "grad_norm": 0.588549852538312, + "learning_rate": 7.71398272988941e-06, + "loss": 0.3814, + "step": 4790 + }, + { + "epoch": 1.9634857318195764, + "grad_norm": 1.0609622015863114, + "learning_rate": 7.683684290259052e-06, + "loss": 0.3844, + "step": 4800 + }, + { + "epoch": 1.9675769663495961, + "grad_norm": 0.8833450757521427, + "learning_rate": 7.653385850628693e-06, + "loss": 0.3941, + "step": 4810 + }, + { + "epoch": 1.9716682008796154, + "grad_norm": 0.6612140126470812, + "learning_rate": 7.623087410998334e-06, + "loss": 0.3805, + "step": 4820 + }, + { + "epoch": 1.9757594354096348, + "grad_norm": 0.653419144919706, + "learning_rate": 7.592788971367975e-06, + "loss": 0.3666, + "step": 4830 + }, + { + "epoch": 1.9798506699396543, + "grad_norm": 0.6797670566745962, + "learning_rate": 7.562490531737616e-06, + "loss": 0.3742, + "step": 4840 + }, + { + "epoch": 1.9839419044696738, + "grad_norm": 0.6644531103160585, + "learning_rate": 7.5321920921072566e-06, + "loss": 0.3842, + "step": 4850 + }, + { + "epoch": 1.9880331389996933, + "grad_norm": 1.2106290360521903, + "learning_rate": 7.5018936524768986e-06, + "loss": 0.3911, + "step": 4860 + }, + { + "epoch": 1.9921243735297125, + "grad_norm": 0.5569971531476807, + "learning_rate": 7.471595212846539e-06, + "loss": 0.4069, + "step": 4870 + }, + { + "epoch": 1.996215608059732, + "grad_norm": 0.5191981517163119, + "learning_rate": 7.44129677321618e-06, + "loss": 0.3763, + "step": 4880 + }, + { + "epoch": 2.0, + "grad_norm": 1.3233158209509672, + "learning_rate": 7.41099833358582e-06, + "loss": 0.368, + "step": 4890 + }, + { + "epoch": 2.0040912345300192, + "grad_norm": 0.6324023695845297, + "learning_rate": 7.380699893955462e-06, + "loss": 0.3502, + "step": 4900 + }, + { + "epoch": 2.008182469060039, + "grad_norm": 0.6060365694093942, + "learning_rate": 7.350401454325103e-06, + "loss": 0.3634, + "step": 4910 + }, + { + "epoch": 2.012273703590058, + "grad_norm": 0.9148807418922357, + "learning_rate": 7.320103014694744e-06, + "loss": 0.3799, + "step": 4920 + }, + { + "epoch": 2.016364938120078, + "grad_norm": 0.5414356512353191, + "learning_rate": 7.289804575064385e-06, + "loss": 0.3637, + "step": 4930 + }, + { + "epoch": 2.020456172650097, + "grad_norm": 0.9191131346375879, + "learning_rate": 7.259506135434026e-06, + "loss": 0.3502, + "step": 4940 + }, + { + "epoch": 2.0245474071801164, + "grad_norm": 0.5288098199455225, + "learning_rate": 7.229207695803667e-06, + "loss": 0.3594, + "step": 4950 + }, + { + "epoch": 2.028638641710136, + "grad_norm": 0.6644017187035499, + "learning_rate": 7.1989092561733075e-06, + "loss": 0.3566, + "step": 4960 + }, + { + "epoch": 2.0327298762401553, + "grad_norm": 0.7063344184257762, + "learning_rate": 7.1686108165429495e-06, + "loss": 0.3729, + "step": 4970 + }, + { + "epoch": 2.036821110770175, + "grad_norm": 0.716588405815958, + "learning_rate": 7.13831237691259e-06, + "loss": 0.3632, + "step": 4980 + }, + { + "epoch": 2.0409123453001943, + "grad_norm": 0.780104450356012, + "learning_rate": 7.108013937282231e-06, + "loss": 0.3503, + "step": 4990 + }, + { + "epoch": 2.045003579830214, + "grad_norm": 0.9214206047619449, + "learning_rate": 7.077715497651871e-06, + "loss": 0.3583, + "step": 5000 + }, + { + "epoch": 2.045003579830214, + "eval_loss": 0.40945789217948914, + "eval_runtime": 566.8305, + "eval_samples_per_second": 5.448, + "eval_steps_per_second": 0.909, + "step": 5000 + }, + { + "epoch": 2.049094814360233, + "grad_norm": 0.8803623081885309, + "learning_rate": 7.047417058021513e-06, + "loss": 0.3524, + "step": 5010 + }, + { + "epoch": 2.0531860488902525, + "grad_norm": 0.8089956621396417, + "learning_rate": 7.0171186183911536e-06, + "loss": 0.3423, + "step": 5020 + }, + { + "epoch": 2.057277283420272, + "grad_norm": 1.026614900390396, + "learning_rate": 6.986820178760795e-06, + "loss": 0.3612, + "step": 5030 + }, + { + "epoch": 2.0613685179502914, + "grad_norm": 0.8153155935459168, + "learning_rate": 6.956521739130435e-06, + "loss": 0.3619, + "step": 5040 + }, + { + "epoch": 2.065459752480311, + "grad_norm": 0.931239718869365, + "learning_rate": 6.926223299500077e-06, + "loss": 0.3475, + "step": 5050 + }, + { + "epoch": 2.0695509870103304, + "grad_norm": 1.0467750800215039, + "learning_rate": 6.895924859869717e-06, + "loss": 0.3535, + "step": 5060 + }, + { + "epoch": 2.0736422215403496, + "grad_norm": 0.9059104329232058, + "learning_rate": 6.8656264202393585e-06, + "loss": 0.3697, + "step": 5070 + }, + { + "epoch": 2.0777334560703693, + "grad_norm": 0.9807695952493246, + "learning_rate": 6.835327980608999e-06, + "loss": 0.3465, + "step": 5080 + }, + { + "epoch": 2.0818246906003885, + "grad_norm": 0.8912366875608871, + "learning_rate": 6.805029540978641e-06, + "loss": 0.3423, + "step": 5090 + }, + { + "epoch": 2.0859159251304082, + "grad_norm": 0.7770463857589687, + "learning_rate": 6.774731101348281e-06, + "loss": 0.3514, + "step": 5100 + }, + { + "epoch": 2.0900071596604275, + "grad_norm": 0.9470302336462076, + "learning_rate": 6.744432661717922e-06, + "loss": 0.3282, + "step": 5110 + }, + { + "epoch": 2.094098394190447, + "grad_norm": 0.9649585745676128, + "learning_rate": 6.7141342220875625e-06, + "loss": 0.3525, + "step": 5120 + }, + { + "epoch": 2.0981896287204664, + "grad_norm": 0.9294501847855035, + "learning_rate": 6.6838357824572045e-06, + "loss": 0.3568, + "step": 5130 + }, + { + "epoch": 2.1022808632504857, + "grad_norm": 0.7971412344974254, + "learning_rate": 6.653537342826845e-06, + "loss": 0.3649, + "step": 5140 + }, + { + "epoch": 2.1063720977805054, + "grad_norm": 0.9239690361706143, + "learning_rate": 6.623238903196486e-06, + "loss": 0.3671, + "step": 5150 + }, + { + "epoch": 2.1104633323105246, + "grad_norm": 0.792788321441495, + "learning_rate": 6.592940463566126e-06, + "loss": 0.3696, + "step": 5160 + }, + { + "epoch": 2.1145545668405443, + "grad_norm": 0.96794000584284, + "learning_rate": 6.562642023935768e-06, + "loss": 0.3526, + "step": 5170 + }, + { + "epoch": 2.1186458013705636, + "grad_norm": 0.9461764596400418, + "learning_rate": 6.5323435843054086e-06, + "loss": 0.3481, + "step": 5180 + }, + { + "epoch": 2.122737035900583, + "grad_norm": 0.9746653976507995, + "learning_rate": 6.50204514467505e-06, + "loss": 0.3641, + "step": 5190 + }, + { + "epoch": 2.1268282704306025, + "grad_norm": 0.9134747859457668, + "learning_rate": 6.47174670504469e-06, + "loss": 0.3477, + "step": 5200 + }, + { + "epoch": 2.1309195049606218, + "grad_norm": 0.9422179764228524, + "learning_rate": 6.441448265414332e-06, + "loss": 0.3538, + "step": 5210 + }, + { + "epoch": 2.1350107394906415, + "grad_norm": 0.9768549519399179, + "learning_rate": 6.411149825783972e-06, + "loss": 0.36, + "step": 5220 + }, + { + "epoch": 2.1391019740206607, + "grad_norm": 0.8493532914358172, + "learning_rate": 6.3808513861536135e-06, + "loss": 0.3551, + "step": 5230 + }, + { + "epoch": 2.14319320855068, + "grad_norm": 0.850886523603342, + "learning_rate": 6.350552946523254e-06, + "loss": 0.3497, + "step": 5240 + }, + { + "epoch": 2.1472844430806997, + "grad_norm": 1.192315078529062, + "learning_rate": 6.320254506892896e-06, + "loss": 0.3618, + "step": 5250 + }, + { + "epoch": 2.151375677610719, + "grad_norm": 0.8946571568288144, + "learning_rate": 6.289956067262537e-06, + "loss": 0.3458, + "step": 5260 + }, + { + "epoch": 2.1554669121407386, + "grad_norm": 0.8759600168097015, + "learning_rate": 6.259657627632177e-06, + "loss": 0.3681, + "step": 5270 + }, + { + "epoch": 2.159558146670758, + "grad_norm": 0.8486779868515286, + "learning_rate": 6.229359188001819e-06, + "loss": 0.3674, + "step": 5280 + }, + { + "epoch": 2.163649381200777, + "grad_norm": 0.9573497878508632, + "learning_rate": 6.1990607483714595e-06, + "loss": 0.3515, + "step": 5290 + }, + { + "epoch": 2.167740615730797, + "grad_norm": 1.0608696592581122, + "learning_rate": 6.168762308741101e-06, + "loss": 0.3575, + "step": 5300 + }, + { + "epoch": 2.171831850260816, + "grad_norm": 0.9550842012101914, + "learning_rate": 6.138463869110741e-06, + "loss": 0.3591, + "step": 5310 + }, + { + "epoch": 2.1759230847908357, + "grad_norm": 0.7852881818968737, + "learning_rate": 6.108165429480383e-06, + "loss": 0.3609, + "step": 5320 + }, + { + "epoch": 2.180014319320855, + "grad_norm": 1.1566940002899782, + "learning_rate": 6.077866989850023e-06, + "loss": 0.3407, + "step": 5330 + }, + { + "epoch": 2.1841055538508747, + "grad_norm": 0.8078702927449839, + "learning_rate": 6.047568550219664e-06, + "loss": 0.3486, + "step": 5340 + }, + { + "epoch": 2.188196788380894, + "grad_norm": 0.9239475839958554, + "learning_rate": 6.017270110589305e-06, + "loss": 0.3503, + "step": 5350 + }, + { + "epoch": 2.192288022910913, + "grad_norm": 0.9902787811712669, + "learning_rate": 5.986971670958947e-06, + "loss": 0.3616, + "step": 5360 + }, + { + "epoch": 2.196379257440933, + "grad_norm": 0.993917561828579, + "learning_rate": 5.956673231328587e-06, + "loss": 0.3564, + "step": 5370 + }, + { + "epoch": 2.200470491970952, + "grad_norm": 0.9912437866525817, + "learning_rate": 5.926374791698228e-06, + "loss": 0.3515, + "step": 5380 + }, + { + "epoch": 2.204561726500972, + "grad_norm": 0.9063774423655953, + "learning_rate": 5.8960763520678685e-06, + "loss": 0.3597, + "step": 5390 + }, + { + "epoch": 2.208652961030991, + "grad_norm": 1.2314911545390765, + "learning_rate": 5.8657779124375105e-06, + "loss": 0.343, + "step": 5400 + }, + { + "epoch": 2.2127441955610108, + "grad_norm": 0.9949853000873754, + "learning_rate": 5.835479472807151e-06, + "loss": 0.3514, + "step": 5410 + }, + { + "epoch": 2.21683543009103, + "grad_norm": 1.000104375303856, + "learning_rate": 5.805181033176792e-06, + "loss": 0.3638, + "step": 5420 + }, + { + "epoch": 2.2209266646210493, + "grad_norm": 1.128813445069188, + "learning_rate": 5.774882593546432e-06, + "loss": 0.3656, + "step": 5430 + }, + { + "epoch": 2.225017899151069, + "grad_norm": 0.9792714948138356, + "learning_rate": 5.744584153916074e-06, + "loss": 0.3536, + "step": 5440 + }, + { + "epoch": 2.229109133681088, + "grad_norm": 1.0987526220359756, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.3721, + "step": 5450 + }, + { + "epoch": 2.233200368211108, + "grad_norm": 0.8952391612311649, + "learning_rate": 5.683987274655356e-06, + "loss": 0.354, + "step": 5460 + }, + { + "epoch": 2.237291602741127, + "grad_norm": 1.1608074782040154, + "learning_rate": 5.653688835024996e-06, + "loss": 0.3561, + "step": 5470 + }, + { + "epoch": 2.2413828372711464, + "grad_norm": 1.003805233436142, + "learning_rate": 5.623390395394638e-06, + "loss": 0.3551, + "step": 5480 + }, + { + "epoch": 2.245474071801166, + "grad_norm": 0.8422927034761718, + "learning_rate": 5.593091955764278e-06, + "loss": 0.3467, + "step": 5490 + }, + { + "epoch": 2.2495653063311853, + "grad_norm": 1.034912305431081, + "learning_rate": 5.562793516133919e-06, + "loss": 0.3762, + "step": 5500 + }, + { + "epoch": 2.2495653063311853, + "eval_loss": 0.4107515215873718, + "eval_runtime": 566.5709, + "eval_samples_per_second": 5.45, + "eval_steps_per_second": 0.909, + "step": 5500 + }, + { + "epoch": 2.253656540861205, + "grad_norm": 0.9665919628552063, + "learning_rate": 5.53249507650356e-06, + "loss": 0.3683, + "step": 5510 + }, + { + "epoch": 2.2577477753912243, + "grad_norm": 1.1208116392163985, + "learning_rate": 5.502196636873202e-06, + "loss": 0.3492, + "step": 5520 + }, + { + "epoch": 2.2618390099212435, + "grad_norm": 0.8340800205831346, + "learning_rate": 5.471898197242842e-06, + "loss": 0.3372, + "step": 5530 + }, + { + "epoch": 2.2659302444512632, + "grad_norm": 0.9589224290151471, + "learning_rate": 5.441599757612483e-06, + "loss": 0.3478, + "step": 5540 + }, + { + "epoch": 2.2700214789812825, + "grad_norm": 1.015598680602829, + "learning_rate": 5.4113013179821235e-06, + "loss": 0.3607, + "step": 5550 + }, + { + "epoch": 2.274112713511302, + "grad_norm": 0.9746676080601815, + "learning_rate": 5.3810028783517655e-06, + "loss": 0.3611, + "step": 5560 + }, + { + "epoch": 2.2782039480413214, + "grad_norm": 0.9296538878972167, + "learning_rate": 5.350704438721406e-06, + "loss": 0.3576, + "step": 5570 + }, + { + "epoch": 2.2822951825713407, + "grad_norm": 0.8789512395616204, + "learning_rate": 5.320405999091047e-06, + "loss": 0.3629, + "step": 5580 + }, + { + "epoch": 2.2863864171013604, + "grad_norm": 1.0582855408609264, + "learning_rate": 5.290107559460688e-06, + "loss": 0.3634, + "step": 5590 + }, + { + "epoch": 2.2904776516313796, + "grad_norm": 0.9546020518135027, + "learning_rate": 5.259809119830329e-06, + "loss": 0.3586, + "step": 5600 + }, + { + "epoch": 2.2945688861613993, + "grad_norm": 0.962752971888759, + "learning_rate": 5.22951068019997e-06, + "loss": 0.3605, + "step": 5610 + }, + { + "epoch": 2.2986601206914186, + "grad_norm": 1.0245121086717093, + "learning_rate": 5.199212240569611e-06, + "loss": 0.3587, + "step": 5620 + }, + { + "epoch": 2.3027513552214383, + "grad_norm": 1.0040046723513294, + "learning_rate": 5.168913800939253e-06, + "loss": 0.3648, + "step": 5630 + }, + { + "epoch": 2.3068425897514575, + "grad_norm": 1.171908094587368, + "learning_rate": 5.138615361308893e-06, + "loss": 0.3592, + "step": 5640 + }, + { + "epoch": 2.3109338242814768, + "grad_norm": 1.011025217862205, + "learning_rate": 5.108316921678534e-06, + "loss": 0.3442, + "step": 5650 + }, + { + "epoch": 2.3150250588114965, + "grad_norm": 1.1859171170651996, + "learning_rate": 5.078018482048174e-06, + "loss": 0.3633, + "step": 5660 + }, + { + "epoch": 2.3191162933415157, + "grad_norm": 1.0924583307810276, + "learning_rate": 5.047720042417816e-06, + "loss": 0.3532, + "step": 5670 + }, + { + "epoch": 2.3232075278715354, + "grad_norm": 1.167947573027158, + "learning_rate": 5.017421602787457e-06, + "loss": 0.3604, + "step": 5680 + }, + { + "epoch": 2.3272987624015546, + "grad_norm": 1.2875926245828595, + "learning_rate": 4.987123163157098e-06, + "loss": 0.3429, + "step": 5690 + }, + { + "epoch": 2.3313899969315743, + "grad_norm": 1.156830068761064, + "learning_rate": 4.956824723526739e-06, + "loss": 0.3434, + "step": 5700 + }, + { + "epoch": 2.3354812314615936, + "grad_norm": 0.9404794900229477, + "learning_rate": 4.926526283896379e-06, + "loss": 0.3399, + "step": 5710 + }, + { + "epoch": 2.339572465991613, + "grad_norm": 1.116557440039318, + "learning_rate": 4.8962278442660205e-06, + "loss": 0.3584, + "step": 5720 + }, + { + "epoch": 2.3436637005216325, + "grad_norm": 1.2190130786864881, + "learning_rate": 4.865929404635662e-06, + "loss": 0.3701, + "step": 5730 + }, + { + "epoch": 2.347754935051652, + "grad_norm": 1.2099886678141103, + "learning_rate": 4.835630965005303e-06, + "loss": 0.3675, + "step": 5740 + }, + { + "epoch": 2.3518461695816715, + "grad_norm": 1.1364465919872204, + "learning_rate": 4.805332525374943e-06, + "loss": 0.3608, + "step": 5750 + }, + { + "epoch": 2.3559374041116907, + "grad_norm": 0.9321036699083428, + "learning_rate": 4.775034085744584e-06, + "loss": 0.3496, + "step": 5760 + }, + { + "epoch": 2.36002863864171, + "grad_norm": 0.9146634319851078, + "learning_rate": 4.744735646114225e-06, + "loss": 0.3502, + "step": 5770 + }, + { + "epoch": 2.3641198731717297, + "grad_norm": 1.000647504917848, + "learning_rate": 4.7144372064838665e-06, + "loss": 0.3544, + "step": 5780 + }, + { + "epoch": 2.368211107701749, + "grad_norm": 1.1557436169214348, + "learning_rate": 4.684138766853508e-06, + "loss": 0.3518, + "step": 5790 + }, + { + "epoch": 2.3723023422317686, + "grad_norm": 1.2926612320528568, + "learning_rate": 4.653840327223149e-06, + "loss": 0.3486, + "step": 5800 + }, + { + "epoch": 2.376393576761788, + "grad_norm": 1.2169663560603732, + "learning_rate": 4.62354188759279e-06, + "loss": 0.3539, + "step": 5810 + }, + { + "epoch": 2.380484811291807, + "grad_norm": 0.9319379770517142, + "learning_rate": 4.59324344796243e-06, + "loss": 0.328, + "step": 5820 + }, + { + "epoch": 2.384576045821827, + "grad_norm": 1.0263622334503117, + "learning_rate": 4.562945008332071e-06, + "loss": 0.3536, + "step": 5830 + }, + { + "epoch": 2.388667280351846, + "grad_norm": 1.152645840965358, + "learning_rate": 4.5326465687017126e-06, + "loss": 0.3536, + "step": 5840 + }, + { + "epoch": 2.3927585148818658, + "grad_norm": 0.9658502725314514, + "learning_rate": 4.502348129071354e-06, + "loss": 0.3548, + "step": 5850 + }, + { + "epoch": 2.396849749411885, + "grad_norm": 1.2197202047381088, + "learning_rate": 4.472049689440994e-06, + "loss": 0.3559, + "step": 5860 + }, + { + "epoch": 2.4009409839419042, + "grad_norm": 1.2765015130610389, + "learning_rate": 4.441751249810635e-06, + "loss": 0.3497, + "step": 5870 + }, + { + "epoch": 2.405032218471924, + "grad_norm": 1.38104638770045, + "learning_rate": 4.411452810180276e-06, + "loss": 0.3448, + "step": 5880 + }, + { + "epoch": 2.409123453001943, + "grad_norm": 1.1400186315945817, + "learning_rate": 4.3811543705499174e-06, + "loss": 0.3561, + "step": 5890 + }, + { + "epoch": 2.413214687531963, + "grad_norm": 1.332445802283346, + "learning_rate": 4.350855930919558e-06, + "loss": 0.3558, + "step": 5900 + }, + { + "epoch": 2.417305922061982, + "grad_norm": 0.8518719177973914, + "learning_rate": 4.320557491289199e-06, + "loss": 0.3557, + "step": 5910 + }, + { + "epoch": 2.4213971565920014, + "grad_norm": 1.3083347016848765, + "learning_rate": 4.29025905165884e-06, + "loss": 0.3443, + "step": 5920 + }, + { + "epoch": 2.425488391122021, + "grad_norm": 1.2245072085864033, + "learning_rate": 4.259960612028481e-06, + "loss": 0.3424, + "step": 5930 + }, + { + "epoch": 2.4295796256520403, + "grad_norm": 1.0512105297819025, + "learning_rate": 4.2296621723981215e-06, + "loss": 0.3668, + "step": 5940 + }, + { + "epoch": 2.43367086018206, + "grad_norm": 1.2181678876630533, + "learning_rate": 4.199363732767763e-06, + "loss": 0.3514, + "step": 5950 + }, + { + "epoch": 2.4377620947120793, + "grad_norm": 1.2107998993154634, + "learning_rate": 4.169065293137404e-06, + "loss": 0.3634, + "step": 5960 + }, + { + "epoch": 2.441853329242099, + "grad_norm": 1.162395707185428, + "learning_rate": 4.138766853507045e-06, + "loss": 0.35, + "step": 5970 + }, + { + "epoch": 2.445944563772118, + "grad_norm": 1.0932545213281812, + "learning_rate": 4.108468413876685e-06, + "loss": 0.3472, + "step": 5980 + }, + { + "epoch": 2.450035798302138, + "grad_norm": 1.0010475016537828, + "learning_rate": 4.078169974246326e-06, + "loss": 0.3584, + "step": 5990 + }, + { + "epoch": 2.454127032832157, + "grad_norm": 0.8180158364405802, + "learning_rate": 4.0478715346159675e-06, + "loss": 0.3597, + "step": 6000 + }, + { + "epoch": 2.454127032832157, + "eval_loss": 0.4111001789569855, + "eval_runtime": 567.1279, + "eval_samples_per_second": 5.445, + "eval_steps_per_second": 0.908, + "step": 6000 + }, + { + "epoch": 2.4582182673621764, + "grad_norm": 1.152408348113746, + "learning_rate": 4.017573094985609e-06, + "loss": 0.352, + "step": 6010 + }, + { + "epoch": 2.462309501892196, + "grad_norm": 0.9901037242730653, + "learning_rate": 3.987274655355249e-06, + "loss": 0.3413, + "step": 6020 + }, + { + "epoch": 2.4664007364222154, + "grad_norm": 1.0744413310129686, + "learning_rate": 3.95697621572489e-06, + "loss": 0.3572, + "step": 6030 + }, + { + "epoch": 2.470491970952235, + "grad_norm": 1.1304586395449463, + "learning_rate": 3.926677776094531e-06, + "loss": 0.3611, + "step": 6040 + }, + { + "epoch": 2.4745832054822543, + "grad_norm": 1.1937074296439871, + "learning_rate": 3.8963793364641724e-06, + "loss": 0.351, + "step": 6050 + }, + { + "epoch": 2.4786744400122735, + "grad_norm": 0.9420890149335949, + "learning_rate": 3.866080896833813e-06, + "loss": 0.3556, + "step": 6060 + }, + { + "epoch": 2.4827656745422932, + "grad_norm": 1.0885349991773454, + "learning_rate": 3.835782457203454e-06, + "loss": 0.3545, + "step": 6070 + }, + { + "epoch": 2.4868569090723125, + "grad_norm": 1.076589826454013, + "learning_rate": 3.805484017573095e-06, + "loss": 0.353, + "step": 6080 + }, + { + "epoch": 2.490948143602332, + "grad_norm": 0.9678457177285116, + "learning_rate": 3.775185577942736e-06, + "loss": 0.3535, + "step": 6090 + }, + { + "epoch": 2.4950393781323514, + "grad_norm": 1.2568971383029166, + "learning_rate": 3.7448871383123773e-06, + "loss": 0.3535, + "step": 6100 + }, + { + "epoch": 2.4991306126623707, + "grad_norm": 1.1438602826766222, + "learning_rate": 3.7145886986820185e-06, + "loss": 0.3548, + "step": 6110 + }, + { + "epoch": 2.5032218471923904, + "grad_norm": 1.1827797517036478, + "learning_rate": 3.6842902590516592e-06, + "loss": 0.3537, + "step": 6120 + }, + { + "epoch": 2.5073130817224096, + "grad_norm": 1.1604168543066307, + "learning_rate": 3.6539918194213004e-06, + "loss": 0.3443, + "step": 6130 + }, + { + "epoch": 2.5114043162524293, + "grad_norm": 1.5271313980044559, + "learning_rate": 3.623693379790941e-06, + "loss": 0.3676, + "step": 6140 + }, + { + "epoch": 2.5154955507824486, + "grad_norm": 1.0170843727726653, + "learning_rate": 3.5933949401605822e-06, + "loss": 0.333, + "step": 6150 + }, + { + "epoch": 2.519586785312468, + "grad_norm": 1.478391272829513, + "learning_rate": 3.563096500530223e-06, + "loss": 0.3289, + "step": 6160 + }, + { + "epoch": 2.5236780198424875, + "grad_norm": 1.1327119923685498, + "learning_rate": 3.532798060899864e-06, + "loss": 0.3465, + "step": 6170 + }, + { + "epoch": 2.5277692543725068, + "grad_norm": 1.2833494932025962, + "learning_rate": 3.502499621269505e-06, + "loss": 0.3603, + "step": 6180 + }, + { + "epoch": 2.5318604889025265, + "grad_norm": 1.3483786326224019, + "learning_rate": 3.472201181639146e-06, + "loss": 0.3499, + "step": 6190 + }, + { + "epoch": 2.5359517234325457, + "grad_norm": 1.41685799213282, + "learning_rate": 3.441902742008787e-06, + "loss": 0.3557, + "step": 6200 + }, + { + "epoch": 2.540042957962565, + "grad_norm": 1.1147817059656389, + "learning_rate": 3.411604302378428e-06, + "loss": 0.3525, + "step": 6210 + }, + { + "epoch": 2.5441341924925847, + "grad_norm": 1.24256482526061, + "learning_rate": 3.381305862748069e-06, + "loss": 0.3557, + "step": 6220 + }, + { + "epoch": 2.548225427022604, + "grad_norm": 1.14276520963034, + "learning_rate": 3.3510074231177097e-06, + "loss": 0.3443, + "step": 6230 + }, + { + "epoch": 2.5523166615526236, + "grad_norm": 1.4308356706747418, + "learning_rate": 3.320708983487351e-06, + "loss": 0.3642, + "step": 6240 + }, + { + "epoch": 2.556407896082643, + "grad_norm": 1.0603812873827165, + "learning_rate": 3.2904105438569916e-06, + "loss": 0.3507, + "step": 6250 + }, + { + "epoch": 2.560499130612662, + "grad_norm": 1.1444596493356962, + "learning_rate": 3.2601121042266328e-06, + "loss": 0.3463, + "step": 6260 + }, + { + "epoch": 2.564590365142682, + "grad_norm": 1.1550573279318823, + "learning_rate": 3.2298136645962735e-06, + "loss": 0.3517, + "step": 6270 + }, + { + "epoch": 2.5686815996727015, + "grad_norm": 1.2279449599940317, + "learning_rate": 3.1995152249659146e-06, + "loss": 0.3419, + "step": 6280 + }, + { + "epoch": 2.5727728342027207, + "grad_norm": 1.1531936082950656, + "learning_rate": 3.1692167853355554e-06, + "loss": 0.3553, + "step": 6290 + }, + { + "epoch": 2.57686406873274, + "grad_norm": 1.1430327649681449, + "learning_rate": 3.1389183457051965e-06, + "loss": 0.3481, + "step": 6300 + }, + { + "epoch": 2.5809553032627597, + "grad_norm": 1.2985006323282675, + "learning_rate": 3.1086199060748372e-06, + "loss": 0.3486, + "step": 6310 + }, + { + "epoch": 2.585046537792779, + "grad_norm": 1.1447086789004135, + "learning_rate": 3.0783214664444784e-06, + "loss": 0.3516, + "step": 6320 + }, + { + "epoch": 2.5891377723227986, + "grad_norm": 1.464516109493368, + "learning_rate": 3.048023026814119e-06, + "loss": 0.3478, + "step": 6330 + }, + { + "epoch": 2.593229006852818, + "grad_norm": 1.1333941032924868, + "learning_rate": 3.0177245871837603e-06, + "loss": 0.3372, + "step": 6340 + }, + { + "epoch": 2.597320241382837, + "grad_norm": 1.1610526768921359, + "learning_rate": 2.987426147553401e-06, + "loss": 0.3537, + "step": 6350 + }, + { + "epoch": 2.601411475912857, + "grad_norm": 1.166132435226011, + "learning_rate": 2.957127707923042e-06, + "loss": 0.3495, + "step": 6360 + }, + { + "epoch": 2.605502710442876, + "grad_norm": 1.1286755958679484, + "learning_rate": 2.926829268292683e-06, + "loss": 0.3567, + "step": 6370 + }, + { + "epoch": 2.6095939449728958, + "grad_norm": 1.0343679227746663, + "learning_rate": 2.896530828662324e-06, + "loss": 0.3471, + "step": 6380 + }, + { + "epoch": 2.613685179502915, + "grad_norm": 1.1778557485063874, + "learning_rate": 2.8662323890319647e-06, + "loss": 0.3549, + "step": 6390 + }, + { + "epoch": 2.6177764140329343, + "grad_norm": 1.226656877106952, + "learning_rate": 2.835933949401606e-06, + "loss": 0.3496, + "step": 6400 + }, + { + "epoch": 2.621867648562954, + "grad_norm": 1.3948209564375484, + "learning_rate": 2.8056355097712466e-06, + "loss": 0.3544, + "step": 6410 + }, + { + "epoch": 2.625958883092973, + "grad_norm": 1.233392087784029, + "learning_rate": 2.7753370701408878e-06, + "loss": 0.34, + "step": 6420 + }, + { + "epoch": 2.630050117622993, + "grad_norm": 1.2620457074981337, + "learning_rate": 2.7450386305105285e-06, + "loss": 0.3304, + "step": 6430 + }, + { + "epoch": 2.634141352153012, + "grad_norm": 1.269720329253969, + "learning_rate": 2.71474019088017e-06, + "loss": 0.3589, + "step": 6440 + }, + { + "epoch": 2.6382325866830314, + "grad_norm": 1.1908184218530549, + "learning_rate": 2.6844417512498112e-06, + "loss": 0.3511, + "step": 6450 + }, + { + "epoch": 2.642323821213051, + "grad_norm": 1.1293285956281658, + "learning_rate": 2.654143311619452e-06, + "loss": 0.3507, + "step": 6460 + }, + { + "epoch": 2.6464150557430703, + "grad_norm": 1.3110950677273854, + "learning_rate": 2.623844871989093e-06, + "loss": 0.3433, + "step": 6470 + }, + { + "epoch": 2.65050629027309, + "grad_norm": 1.4086576967977462, + "learning_rate": 2.593546432358734e-06, + "loss": 0.34, + "step": 6480 + }, + { + "epoch": 2.6545975248031093, + "grad_norm": 1.34700090210642, + "learning_rate": 2.563247992728375e-06, + "loss": 0.349, + "step": 6490 + }, + { + "epoch": 2.6586887593331285, + "grad_norm": 1.0282320900902353, + "learning_rate": 2.5329495530980157e-06, + "loss": 0.3601, + "step": 6500 + }, + { + "epoch": 2.6586887593331285, + "eval_loss": 0.41547685861587524, + "eval_runtime": 565.4311, + "eval_samples_per_second": 5.461, + "eval_steps_per_second": 0.911, + "step": 6500 + }, + { + "epoch": 2.6627799938631482, + "grad_norm": 1.1250564513049757, + "learning_rate": 2.502651113467657e-06, + "loss": 0.3493, + "step": 6510 + }, + { + "epoch": 2.6668712283931675, + "grad_norm": 1.1544902670015906, + "learning_rate": 2.4723526738372976e-06, + "loss": 0.356, + "step": 6520 + }, + { + "epoch": 2.670962462923187, + "grad_norm": 1.6062647614114969, + "learning_rate": 2.4420542342069387e-06, + "loss": 0.3537, + "step": 6530 + }, + { + "epoch": 2.6750536974532064, + "grad_norm": 1.4055380071214372, + "learning_rate": 2.4117557945765794e-06, + "loss": 0.3589, + "step": 6540 + }, + { + "epoch": 2.6791449319832257, + "grad_norm": 1.0767099444486827, + "learning_rate": 2.3814573549462206e-06, + "loss": 0.3564, + "step": 6550 + }, + { + "epoch": 2.6832361665132454, + "grad_norm": 1.0775655234852821, + "learning_rate": 2.3511589153158613e-06, + "loss": 0.3508, + "step": 6560 + }, + { + "epoch": 2.687327401043265, + "grad_norm": 1.3945094308830326, + "learning_rate": 2.3208604756855025e-06, + "loss": 0.3462, + "step": 6570 + }, + { + "epoch": 2.6914186355732843, + "grad_norm": 1.4325900380219518, + "learning_rate": 2.290562036055143e-06, + "loss": 0.34, + "step": 6580 + }, + { + "epoch": 2.6955098701033036, + "grad_norm": 1.0603337925865337, + "learning_rate": 2.2602635964247843e-06, + "loss": 0.341, + "step": 6590 + }, + { + "epoch": 2.6996011046333233, + "grad_norm": 1.1150418643447688, + "learning_rate": 2.229965156794425e-06, + "loss": 0.3427, + "step": 6600 + }, + { + "epoch": 2.7036923391633425, + "grad_norm": 1.5841244349700083, + "learning_rate": 2.199666717164066e-06, + "loss": 0.3446, + "step": 6610 + }, + { + "epoch": 2.707783573693362, + "grad_norm": 1.1978695972855773, + "learning_rate": 2.1693682775337074e-06, + "loss": 0.3447, + "step": 6620 + }, + { + "epoch": 2.7118748082233815, + "grad_norm": 1.0752732100706306, + "learning_rate": 2.139069837903348e-06, + "loss": 0.3338, + "step": 6630 + }, + { + "epoch": 2.7159660427534007, + "grad_norm": 1.2105228186701131, + "learning_rate": 2.1087713982729892e-06, + "loss": 0.3352, + "step": 6640 + }, + { + "epoch": 2.7200572772834204, + "grad_norm": 1.1393844873051664, + "learning_rate": 2.07847295864263e-06, + "loss": 0.3531, + "step": 6650 + }, + { + "epoch": 2.7241485118134396, + "grad_norm": 1.0238817342799573, + "learning_rate": 2.048174519012271e-06, + "loss": 0.3528, + "step": 6660 + }, + { + "epoch": 2.7282397463434593, + "grad_norm": 1.1816844502503707, + "learning_rate": 2.017876079381912e-06, + "loss": 0.3512, + "step": 6670 + }, + { + "epoch": 2.7323309808734786, + "grad_norm": 1.305926419960111, + "learning_rate": 1.987577639751553e-06, + "loss": 0.3442, + "step": 6680 + }, + { + "epoch": 2.736422215403498, + "grad_norm": 1.4261243840727893, + "learning_rate": 1.957279200121194e-06, + "loss": 0.3455, + "step": 6690 + }, + { + "epoch": 2.7405134499335175, + "grad_norm": 1.6388906420384781, + "learning_rate": 1.926980760490835e-06, + "loss": 0.3535, + "step": 6700 + }, + { + "epoch": 2.744604684463537, + "grad_norm": 1.5426677425739872, + "learning_rate": 1.896682320860476e-06, + "loss": 0.3475, + "step": 6710 + }, + { + "epoch": 2.7486959189935565, + "grad_norm": 1.1578285273120823, + "learning_rate": 1.866383881230117e-06, + "loss": 0.3434, + "step": 6720 + }, + { + "epoch": 2.7527871535235757, + "grad_norm": 1.0960855524012971, + "learning_rate": 1.8360854415997579e-06, + "loss": 0.3578, + "step": 6730 + }, + { + "epoch": 2.756878388053595, + "grad_norm": 1.1655103795481652, + "learning_rate": 1.8057870019693988e-06, + "loss": 0.3566, + "step": 6740 + }, + { + "epoch": 2.7609696225836147, + "grad_norm": 1.4270524025940055, + "learning_rate": 1.7754885623390398e-06, + "loss": 0.3517, + "step": 6750 + }, + { + "epoch": 2.765060857113634, + "grad_norm": 1.3348787554214543, + "learning_rate": 1.7451901227086807e-06, + "loss": 0.3518, + "step": 6760 + }, + { + "epoch": 2.7691520916436536, + "grad_norm": 1.0118054266554564, + "learning_rate": 1.7148916830783216e-06, + "loss": 0.3517, + "step": 6770 + }, + { + "epoch": 2.773243326173673, + "grad_norm": 1.5502107988903457, + "learning_rate": 1.6845932434479626e-06, + "loss": 0.3402, + "step": 6780 + }, + { + "epoch": 2.777334560703692, + "grad_norm": 1.061216156435797, + "learning_rate": 1.6542948038176035e-06, + "loss": 0.3521, + "step": 6790 + }, + { + "epoch": 2.781425795233712, + "grad_norm": 1.5424114671353548, + "learning_rate": 1.6239963641872444e-06, + "loss": 0.3456, + "step": 6800 + }, + { + "epoch": 2.785517029763731, + "grad_norm": 1.2155670661756302, + "learning_rate": 1.5936979245568854e-06, + "loss": 0.3444, + "step": 6810 + }, + { + "epoch": 2.7896082642937508, + "grad_norm": 1.2991785598636614, + "learning_rate": 1.5633994849265263e-06, + "loss": 0.3453, + "step": 6820 + }, + { + "epoch": 2.79369949882377, + "grad_norm": 1.5117541536726462, + "learning_rate": 1.5331010452961673e-06, + "loss": 0.3426, + "step": 6830 + }, + { + "epoch": 2.7977907333537892, + "grad_norm": 1.397208477585223, + "learning_rate": 1.5028026056658082e-06, + "loss": 0.3505, + "step": 6840 + }, + { + "epoch": 2.801881967883809, + "grad_norm": 1.2017192873007096, + "learning_rate": 1.4725041660354491e-06, + "loss": 0.3341, + "step": 6850 + }, + { + "epoch": 2.8059732024138286, + "grad_norm": 1.3674252863596916, + "learning_rate": 1.4422057264050903e-06, + "loss": 0.3578, + "step": 6860 + }, + { + "epoch": 2.810064436943848, + "grad_norm": 1.2963144848882437, + "learning_rate": 1.4119072867747312e-06, + "loss": 0.3448, + "step": 6870 + }, + { + "epoch": 2.814155671473867, + "grad_norm": 1.3472526890767895, + "learning_rate": 1.3816088471443724e-06, + "loss": 0.3545, + "step": 6880 + }, + { + "epoch": 2.818246906003887, + "grad_norm": 1.4300718931297243, + "learning_rate": 1.3513104075140133e-06, + "loss": 0.3458, + "step": 6890 + }, + { + "epoch": 2.822338140533906, + "grad_norm": 1.1374915010677178, + "learning_rate": 1.3210119678836542e-06, + "loss": 0.3452, + "step": 6900 + }, + { + "epoch": 2.8264293750639258, + "grad_norm": 1.3186888057848343, + "learning_rate": 1.2907135282532952e-06, + "loss": 0.3582, + "step": 6910 + }, + { + "epoch": 2.830520609593945, + "grad_norm": 1.4076831798674376, + "learning_rate": 1.2604150886229361e-06, + "loss": 0.3473, + "step": 6920 + }, + { + "epoch": 2.8346118441239643, + "grad_norm": 1.236226000169149, + "learning_rate": 1.230116648992577e-06, + "loss": 0.3508, + "step": 6930 + }, + { + "epoch": 2.838703078653984, + "grad_norm": 1.1598603618467764, + "learning_rate": 1.199818209362218e-06, + "loss": 0.3135, + "step": 6940 + }, + { + "epoch": 2.842794313184003, + "grad_norm": 1.4470943796781588, + "learning_rate": 1.169519769731859e-06, + "loss": 0.3369, + "step": 6950 + }, + { + "epoch": 2.846885547714023, + "grad_norm": 1.4514521872416606, + "learning_rate": 1.1392213301014999e-06, + "loss": 0.3652, + "step": 6960 + }, + { + "epoch": 2.850976782244042, + "grad_norm": 1.508158438836179, + "learning_rate": 1.1089228904711408e-06, + "loss": 0.3424, + "step": 6970 + }, + { + "epoch": 2.8550680167740614, + "grad_norm": 1.2409802713756901, + "learning_rate": 1.078624450840782e-06, + "loss": 0.3494, + "step": 6980 + }, + { + "epoch": 2.859159251304081, + "grad_norm": 1.277646969163353, + "learning_rate": 1.0483260112104229e-06, + "loss": 0.3323, + "step": 6990 + }, + { + "epoch": 2.8632504858341004, + "grad_norm": 1.181273366360208, + "learning_rate": 1.0180275715800638e-06, + "loss": 0.3454, + "step": 7000 + }, + { + "epoch": 2.8632504858341004, + "eval_loss": 0.4169977903366089, + "eval_runtime": 566.1639, + "eval_samples_per_second": 5.454, + "eval_steps_per_second": 0.91, + "step": 7000 + }, + { + "epoch": 2.86734172036412, + "grad_norm": 1.2542825969651887, + "learning_rate": 9.877291319497048e-07, + "loss": 0.3441, + "step": 7010 + }, + { + "epoch": 2.8714329548941393, + "grad_norm": 1.2663043582399385, + "learning_rate": 9.574306923193457e-07, + "loss": 0.3577, + "step": 7020 + }, + { + "epoch": 2.8755241894241585, + "grad_norm": 1.2989911359603148, + "learning_rate": 9.271322526889865e-07, + "loss": 0.3441, + "step": 7030 + }, + { + "epoch": 2.8796154239541782, + "grad_norm": 1.4071532233625517, + "learning_rate": 8.968338130586275e-07, + "loss": 0.3461, + "step": 7040 + }, + { + "epoch": 2.8837066584841975, + "grad_norm": 1.3845463524426405, + "learning_rate": 8.665353734282685e-07, + "loss": 0.3497, + "step": 7050 + }, + { + "epoch": 2.887797893014217, + "grad_norm": 1.1682379379190977, + "learning_rate": 8.362369337979096e-07, + "loss": 0.3559, + "step": 7060 + }, + { + "epoch": 2.8918891275442364, + "grad_norm": 1.3470954040058543, + "learning_rate": 8.059384941675505e-07, + "loss": 0.3429, + "step": 7070 + }, + { + "epoch": 2.8959803620742557, + "grad_norm": 1.208626776638643, + "learning_rate": 7.756400545371914e-07, + "loss": 0.3415, + "step": 7080 + }, + { + "epoch": 2.9000715966042754, + "grad_norm": 1.4017945161316288, + "learning_rate": 7.453416149068324e-07, + "loss": 0.3554, + "step": 7090 + }, + { + "epoch": 2.9041628311342946, + "grad_norm": 1.183169645634326, + "learning_rate": 7.150431752764733e-07, + "loss": 0.3464, + "step": 7100 + }, + { + "epoch": 2.9082540656643143, + "grad_norm": 1.321129525250198, + "learning_rate": 6.847447356461142e-07, + "loss": 0.3508, + "step": 7110 + }, + { + "epoch": 2.9123453001943336, + "grad_norm": 1.4041097528620985, + "learning_rate": 6.544462960157552e-07, + "loss": 0.3475, + "step": 7120 + }, + { + "epoch": 2.916436534724353, + "grad_norm": 1.2651747018935453, + "learning_rate": 6.241478563853962e-07, + "loss": 0.353, + "step": 7130 + }, + { + "epoch": 2.9205277692543725, + "grad_norm": 1.2832363456828761, + "learning_rate": 5.938494167550372e-07, + "loss": 0.3416, + "step": 7140 + }, + { + "epoch": 2.924619003784392, + "grad_norm": 1.1421739779615996, + "learning_rate": 5.635509771246781e-07, + "loss": 0.3472, + "step": 7150 + }, + { + "epoch": 2.9287102383144115, + "grad_norm": 1.643848869321631, + "learning_rate": 5.33252537494319e-07, + "loss": 0.3405, + "step": 7160 + }, + { + "epoch": 2.9328014728444307, + "grad_norm": 1.1839381749310496, + "learning_rate": 5.029540978639601e-07, + "loss": 0.3301, + "step": 7170 + }, + { + "epoch": 2.9368927073744504, + "grad_norm": 1.2749563659235175, + "learning_rate": 4.72655658233601e-07, + "loss": 0.3344, + "step": 7180 + }, + { + "epoch": 2.9409839419044697, + "grad_norm": 1.5435865262538777, + "learning_rate": 4.4235721860324195e-07, + "loss": 0.3409, + "step": 7190 + }, + { + "epoch": 2.9450751764344894, + "grad_norm": 1.5137986871644649, + "learning_rate": 4.120587789728829e-07, + "loss": 0.3475, + "step": 7200 + }, + { + "epoch": 2.9491664109645086, + "grad_norm": 1.2316496489032414, + "learning_rate": 3.8176033934252394e-07, + "loss": 0.3348, + "step": 7210 + }, + { + "epoch": 2.953257645494528, + "grad_norm": 1.2680106473026544, + "learning_rate": 3.5146189971216487e-07, + "loss": 0.3335, + "step": 7220 + }, + { + "epoch": 2.9573488800245475, + "grad_norm": 1.0733135362099837, + "learning_rate": 3.211634600818058e-07, + "loss": 0.347, + "step": 7230 + }, + { + "epoch": 2.961440114554567, + "grad_norm": 1.5331378983904185, + "learning_rate": 2.908650204514468e-07, + "loss": 0.3489, + "step": 7240 + }, + { + "epoch": 2.9655313490845865, + "grad_norm": 1.1721604841420616, + "learning_rate": 2.6056658082108774e-07, + "loss": 0.3407, + "step": 7250 + }, + { + "epoch": 2.9696225836146057, + "grad_norm": 1.2850118367826986, + "learning_rate": 2.302681411907287e-07, + "loss": 0.3589, + "step": 7260 + }, + { + "epoch": 2.973713818144625, + "grad_norm": 1.4413311960347719, + "learning_rate": 1.9996970156036967e-07, + "loss": 0.3369, + "step": 7270 + }, + { + "epoch": 2.9778050526746447, + "grad_norm": 1.314019429686636, + "learning_rate": 1.6967126193001063e-07, + "loss": 0.3507, + "step": 7280 + }, + { + "epoch": 2.981896287204664, + "grad_norm": 1.113303777593494, + "learning_rate": 1.3937282229965157e-07, + "loss": 0.3523, + "step": 7290 + }, + { + "epoch": 2.9859875217346836, + "grad_norm": 1.1248866357161083, + "learning_rate": 1.0907438266929254e-07, + "loss": 0.3437, + "step": 7300 + }, + { + "epoch": 2.990078756264703, + "grad_norm": 1.1898353136027997, + "learning_rate": 7.877594303893351e-08, + "loss": 0.3545, + "step": 7310 + }, + { + "epoch": 2.994169990794722, + "grad_norm": 1.4708707926240419, + "learning_rate": 4.8477503408574464e-08, + "loss": 0.3666, + "step": 7320 + }, + { + "epoch": 2.998261225324742, + "grad_norm": 1.3652249352362715, + "learning_rate": 1.8179063778215425e-08, + "loss": 0.3625, + "step": 7330 + } + ], + "logging_steps": 10, + "max_steps": 7335, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 307331176169472.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}