{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 7335,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004091234530019434,
"grad_norm": 5.139719573502004,
"learning_rate": 6.9791071983348295e-06,
"loss": 2.653,
"step": 10
},
{
"epoch": 0.008182469060038867,
"grad_norm": 3.820838784929218,
"learning_rate": 9.080027807988022e-06,
"loss": 0.4708,
"step": 20
},
{
"epoch": 0.0122737035900583,
"grad_norm": 2.6483227024127802,
"learning_rate": 1.030898758162737e-05,
"loss": 0.4622,
"step": 30
},
{
"epoch": 0.016364938120077735,
"grad_norm": 3.0325419802512723,
"learning_rate": 1.1180948417641216e-05,
"loss": 0.4572,
"step": 40
},
{
"epoch": 0.020456172650097165,
"grad_norm": 3.3008143753804378,
"learning_rate": 1.1857293787016462e-05,
"loss": 0.4569,
"step": 50
},
{
"epoch": 0.0245474071801166,
"grad_norm": 2.8131794752844383,
"learning_rate": 1.2409908191280561e-05,
"loss": 0.4378,
"step": 60
},
{
"epoch": 0.028638641710136033,
"grad_norm": 2.7183268247171717,
"learning_rate": 1.2877137012696984e-05,
"loss": 0.4523,
"step": 70
},
{
"epoch": 0.03272987624015547,
"grad_norm": 1.812836545554904,
"learning_rate": 1.3281869027294408e-05,
"loss": 0.4517,
"step": 80
},
{
"epoch": 0.0368211107701749,
"grad_norm": 1.5605721318709478,
"learning_rate": 1.363886796491991e-05,
"loss": 0.4396,
"step": 90
},
{
"epoch": 0.04091234530019433,
"grad_norm": 3.00608523793705,
"learning_rate": 1.3958214396669659e-05,
"loss": 0.4393,
"step": 100
},
{
"epoch": 0.04500357983021377,
"grad_norm": 2.004900329434675,
"learning_rate": 1.4247098383615834e-05,
"loss": 0.4287,
"step": 110
},
{
"epoch": 0.0490948143602332,
"grad_norm": 3.1505903529524826,
"learning_rate": 1.4510828800933757e-05,
"loss": 0.4461,
"step": 120
},
{
"epoch": 0.053186048890252635,
"grad_norm": 1.4909337148061823,
"learning_rate": 1.47534372669567e-05,
"loss": 0.4475,
"step": 130
},
{
"epoch": 0.057277283420272065,
"grad_norm": 1.910562126750776,
"learning_rate": 1.4978057622350176e-05,
"loss": 0.456,
"step": 140
},
{
"epoch": 0.0613685179502915,
"grad_norm": 1.071878138097679,
"learning_rate": 1.5187174170309003e-05,
"loss": 0.4444,
"step": 150
},
{
"epoch": 0.06545975248031094,
"grad_norm": 1.182162791937331,
"learning_rate": 1.5382789636947598e-05,
"loss": 0.4544,
"step": 160
},
{
"epoch": 0.06955098701033037,
"grad_norm": 2.804609224620368,
"learning_rate": 1.5566542122709266e-05,
"loss": 0.4431,
"step": 170
},
{
"epoch": 0.0736422215403498,
"grad_norm": 2.30126890056861,
"learning_rate": 1.5739788574573106e-05,
"loss": 0.4304,
"step": 180
},
{
"epoch": 0.07773345607036923,
"grad_norm": 1.8322154114594529,
"learning_rate": 1.59036656596413e-05,
"loss": 0.4488,
"step": 190
},
{
"epoch": 0.08182469060038866,
"grad_norm": 1.6909197474415698,
"learning_rate": 1.605913500632285e-05,
"loss": 0.4244,
"step": 200
},
{
"epoch": 0.0859159251304081,
"grad_norm": 1.9267153990137886,
"learning_rate": 1.6207017395989525e-05,
"loss": 0.4452,
"step": 210
},
{
"epoch": 0.09000715966042754,
"grad_norm": 1.9513020149733689,
"learning_rate": 1.6348018993269024e-05,
"loss": 0.441,
"step": 220
},
{
"epoch": 0.09409839419044697,
"grad_norm": 1.6992552537559622,
"learning_rate": 1.648275174085812e-05,
"loss": 0.4142,
"step": 230
},
{
"epoch": 0.0981896287204664,
"grad_norm": 2.506670138718048,
"learning_rate": 1.661174941058695e-05,
"loss": 0.4309,
"step": 240
},
{
"epoch": 0.10228086325048584,
"grad_norm": 1.1649232947269594,
"learning_rate": 1.6735480375698097e-05,
"loss": 0.4468,
"step": 250
},
{
"epoch": 0.10637209778050527,
"grad_norm": 2.2776528295818723,
"learning_rate": 1.6854357876609896e-05,
"loss": 0.4263,
"step": 260
},
{
"epoch": 0.1104633323105247,
"grad_norm": 1.09704187408726,
"learning_rate": 1.6968748348212453e-05,
"loss": 0.4199,
"step": 270
},
{
"epoch": 0.11455456684054413,
"grad_norm": 0.8342510727420896,
"learning_rate": 1.7078978232003368e-05,
"loss": 0.4313,
"step": 280
},
{
"epoch": 0.11864580137056356,
"grad_norm": 0.8821458905420587,
"learning_rate": 1.7185339592301872e-05,
"loss": 0.4259,
"step": 290
},
{
"epoch": 0.122737035900583,
"grad_norm": 1.7825705984955846,
"learning_rate": 1.7288094779962197e-05,
"loss": 0.4261,
"step": 300
},
{
"epoch": 0.12682827043060244,
"grad_norm": 1.904238409153354,
"learning_rate": 1.7387480331094423e-05,
"loss": 0.4259,
"step": 310
},
{
"epoch": 0.13091950496062188,
"grad_norm": 1.725649730023905,
"learning_rate": 1.7483710246600792e-05,
"loss": 0.4314,
"step": 320
},
{
"epoch": 0.1350107394906413,
"grad_norm": 1.4341207306128956,
"learning_rate": 1.757697876690837e-05,
"loss": 0.4419,
"step": 330
},
{
"epoch": 0.13910197402066074,
"grad_norm": 1.2643467480950796,
"learning_rate": 1.766746273236246e-05,
"loss": 0.4312,
"step": 340
},
{
"epoch": 0.14319320855068016,
"grad_norm": 1.563315600788671,
"learning_rate": 1.7755323601378616e-05,
"loss": 0.4211,
"step": 350
},
{
"epoch": 0.1472844430806996,
"grad_norm": 1.3601337663266007,
"learning_rate": 1.7840709184226296e-05,
"loss": 0.4187,
"step": 360
},
{
"epoch": 0.15137567761071904,
"grad_norm": 1.230870033216971,
"learning_rate": 1.792375513921188e-05,
"loss": 0.4317,
"step": 370
},
{
"epoch": 0.15546691214073846,
"grad_norm": 0.935994666832451,
"learning_rate": 1.8004586269294493e-05,
"loss": 0.4347,
"step": 380
},
{
"epoch": 0.1595581466707579,
"grad_norm": 1.3347135910248187,
"learning_rate": 1.8083317650249243e-05,
"loss": 0.4323,
"step": 390
},
{
"epoch": 0.16364938120077732,
"grad_norm": 1.929506823385391,
"learning_rate": 1.8160055615976043e-05,
"loss": 0.4346,
"step": 400
},
{
"epoch": 0.16774061573079677,
"grad_norm": 0.9726505093912414,
"learning_rate": 1.8234898622125742e-05,
"loss": 0.416,
"step": 410
},
{
"epoch": 0.1718318502608162,
"grad_norm": 0.9397510565012722,
"learning_rate": 1.8307938005642715e-05,
"loss": 0.4182,
"step": 420
},
{
"epoch": 0.17592308479083563,
"grad_norm": 1.2862427800883758,
"learning_rate": 1.8379258654923192e-05,
"loss": 0.4165,
"step": 430
},
{
"epoch": 0.18001431932085507,
"grad_norm": 1.455717102975378,
"learning_rate": 1.8448939602922218e-05,
"loss": 0.4287,
"step": 440
},
{
"epoch": 0.18410555385087451,
"grad_norm": 1.5001908608590255,
"learning_rate": 1.8517054553601544e-05,
"loss": 0.4303,
"step": 450
},
{
"epoch": 0.18819678838089393,
"grad_norm": 1.6709222890878312,
"learning_rate": 1.8583672350511313e-05,
"loss": 0.4328,
"step": 460
},
{
"epoch": 0.19228802291091338,
"grad_norm": 1.3805672401271125,
"learning_rate": 1.864885739497424e-05,
"loss": 0.4179,
"step": 470
},
{
"epoch": 0.1963792574409328,
"grad_norm": 1.4202659814214615,
"learning_rate": 1.8712670020240143e-05,
"loss": 0.4129,
"step": 480
},
{
"epoch": 0.20047049197095224,
"grad_norm": 1.3802506203673055,
"learning_rate": 1.8775166827059134e-05,
"loss": 0.4203,
"step": 490
},
{
"epoch": 0.20456172650097168,
"grad_norm": 1.4249688406884504,
"learning_rate": 1.883640098535129e-05,
"loss": 0.4246,
"step": 500
},
{
"epoch": 0.20456172650097168,
"eval_loss": 0.4226590692996979,
"eval_runtime": 565.8176,
"eval_samples_per_second": 5.458,
"eval_steps_per_second": 0.91,
"step": 500
},
{
"epoch": 0.2086529610309911,
"grad_norm": 0.8798533540054851,
"learning_rate": 1.8896422506001807e-05,
"loss": 0.4267,
"step": 510
},
{
"epoch": 0.21274419556101054,
"grad_norm": 1.279435399555335,
"learning_rate": 1.895527848626309e-05,
"loss": 0.4103,
"step": 520
},
{
"epoch": 0.21683543009102996,
"grad_norm": 1.2983197150849732,
"learning_rate": 1.901301333178074e-05,
"loss": 0.416,
"step": 530
},
{
"epoch": 0.2209266646210494,
"grad_norm": 1.3784001180176515,
"learning_rate": 1.9069668957865647e-05,
"loss": 0.4249,
"step": 540
},
{
"epoch": 0.22501789915106885,
"grad_norm": 1.4315659756421384,
"learning_rate": 1.9125284972297466e-05,
"loss": 0.4244,
"step": 550
},
{
"epoch": 0.22910913368108826,
"grad_norm": 1.1438536096514877,
"learning_rate": 1.9179898841656562e-05,
"loss": 0.4175,
"step": 560
},
{
"epoch": 0.2332003682111077,
"grad_norm": 1.5784295688235042,
"learning_rate": 1.923354604293384e-05,
"loss": 0.4338,
"step": 570
},
{
"epoch": 0.23729160274112712,
"grad_norm": 1.1792683199314111,
"learning_rate": 1.9286260201955066e-05,
"loss": 0.4235,
"step": 580
},
{
"epoch": 0.24138283727114657,
"grad_norm": 0.9242432483061238,
"learning_rate": 1.9338073219972227e-05,
"loss": 0.4206,
"step": 590
},
{
"epoch": 0.245474071801166,
"grad_norm": 1.4278436783956994,
"learning_rate": 1.938901538961539e-05,
"loss": 0.4288,
"step": 600
},
{
"epoch": 0.24956530633118543,
"grad_norm": 1.7892184247748346,
"learning_rate": 1.9439115501260403e-05,
"loss": 0.4314,
"step": 610
},
{
"epoch": 0.25365654086120487,
"grad_norm": 1.3459176439410694,
"learning_rate": 1.9488400940747617e-05,
"loss": 0.4252,
"step": 620
},
{
"epoch": 0.2577477753912243,
"grad_norm": 1.0979452662978122,
"learning_rate": 1.9536897779282066e-05,
"loss": 0.4159,
"step": 630
},
{
"epoch": 0.26183900992124376,
"grad_norm": 1.1558624628355403,
"learning_rate": 1.958463085625399e-05,
"loss": 0.425,
"step": 640
},
{
"epoch": 0.2659302444512632,
"grad_norm": 0.9606366329728439,
"learning_rate": 1.9631623855638338e-05,
"loss": 0.4084,
"step": 650
},
{
"epoch": 0.2700214789812826,
"grad_norm": 1.0597290473469163,
"learning_rate": 1.9677899376561565e-05,
"loss": 0.4099,
"step": 660
},
{
"epoch": 0.274112713511302,
"grad_norm": 1.0819100913671185,
"learning_rate": 1.9723478998562017e-05,
"loss": 0.4151,
"step": 670
},
{
"epoch": 0.2782039480413215,
"grad_norm": 1.4892435767285719,
"learning_rate": 1.976838334201565e-05,
"loss": 0.4164,
"step": 680
},
{
"epoch": 0.2822951825713409,
"grad_norm": 1.1025628764173787,
"learning_rate": 1.981263212415066e-05,
"loss": 0.4193,
"step": 690
},
{
"epoch": 0.2863864171013603,
"grad_norm": 1.0753475991516983,
"learning_rate": 1.985624421103181e-05,
"loss": 0.4108,
"step": 700
},
{
"epoch": 0.2904776516313798,
"grad_norm": 0.905939605392397,
"learning_rate": 1.9899237665857572e-05,
"loss": 0.4089,
"step": 710
},
{
"epoch": 0.2945688861613992,
"grad_norm": 1.1713616964839249,
"learning_rate": 1.994162979387949e-05,
"loss": 0.4201,
"step": 720
},
{
"epoch": 0.2986601206914186,
"grad_norm": 0.9299543253519471,
"learning_rate": 1.998343718422334e-05,
"loss": 0.4139,
"step": 730
},
{
"epoch": 0.3027513552214381,
"grad_norm": 1.516913403786176,
"learning_rate": 1.998485078018482e-05,
"loss": 0.4067,
"step": 740
},
{
"epoch": 0.3068425897514575,
"grad_norm": 0.7991131845266628,
"learning_rate": 1.9954552340554464e-05,
"loss": 0.4123,
"step": 750
},
{
"epoch": 0.3109338242814769,
"grad_norm": 1.3657331223383264,
"learning_rate": 1.9924253900924104e-05,
"loss": 0.4108,
"step": 760
},
{
"epoch": 0.3150250588114964,
"grad_norm": 0.8920875301003005,
"learning_rate": 1.9893955461293747e-05,
"loss": 0.4191,
"step": 770
},
{
"epoch": 0.3191162933415158,
"grad_norm": 0.9744817931110429,
"learning_rate": 1.9863657021663386e-05,
"loss": 0.4053,
"step": 780
},
{
"epoch": 0.3232075278715352,
"grad_norm": 1.1889083717413498,
"learning_rate": 1.9833358582033025e-05,
"loss": 0.4629,
"step": 790
},
{
"epoch": 0.32729876240155464,
"grad_norm": 2.2559102163995965,
"learning_rate": 1.9803060142402668e-05,
"loss": 0.7059,
"step": 800
},
{
"epoch": 0.3313899969315741,
"grad_norm": 0.7012331362796347,
"learning_rate": 1.977276170277231e-05,
"loss": 0.418,
"step": 810
},
{
"epoch": 0.33548123146159353,
"grad_norm": 1.1193457098820139,
"learning_rate": 1.9742463263141947e-05,
"loss": 0.4022,
"step": 820
},
{
"epoch": 0.33957246599161295,
"grad_norm": 1.0787812784251993,
"learning_rate": 1.971216482351159e-05,
"loss": 0.4147,
"step": 830
},
{
"epoch": 0.3436637005216324,
"grad_norm": 1.2144732211866651,
"learning_rate": 1.9681866383881233e-05,
"loss": 0.4118,
"step": 840
},
{
"epoch": 0.34775493505165184,
"grad_norm": 1.2104042207994736,
"learning_rate": 1.9651567944250872e-05,
"loss": 0.4081,
"step": 850
},
{
"epoch": 0.35184616958167125,
"grad_norm": 1.2049689205461667,
"learning_rate": 1.9621269504620512e-05,
"loss": 0.4073,
"step": 860
},
{
"epoch": 0.3559374041116907,
"grad_norm": 1.230689211204093,
"learning_rate": 1.9590971064990155e-05,
"loss": 0.4104,
"step": 870
},
{
"epoch": 0.36002863864171014,
"grad_norm": 0.8629950816912576,
"learning_rate": 1.9560672625359794e-05,
"loss": 0.4147,
"step": 880
},
{
"epoch": 0.36411987317172956,
"grad_norm": 1.1029857217191437,
"learning_rate": 1.9530374185729437e-05,
"loss": 0.4221,
"step": 890
},
{
"epoch": 0.36821110770174903,
"grad_norm": 1.1513671079248653,
"learning_rate": 1.9500075746099076e-05,
"loss": 0.4143,
"step": 900
},
{
"epoch": 0.37230234223176845,
"grad_norm": 1.3187073752135716,
"learning_rate": 1.946977730646872e-05,
"loss": 0.4193,
"step": 910
},
{
"epoch": 0.37639357676178786,
"grad_norm": 0.8494318204927609,
"learning_rate": 1.943947886683836e-05,
"loss": 0.4103,
"step": 920
},
{
"epoch": 0.3804848112918073,
"grad_norm": 1.2378655056760652,
"learning_rate": 1.9409180427208e-05,
"loss": 0.413,
"step": 930
},
{
"epoch": 0.38457604582182675,
"grad_norm": 0.8742974569121568,
"learning_rate": 1.937888198757764e-05,
"loss": 0.4178,
"step": 940
},
{
"epoch": 0.38866728035184617,
"grad_norm": 0.6659680293817604,
"learning_rate": 1.934858354794728e-05,
"loss": 0.3992,
"step": 950
},
{
"epoch": 0.3927585148818656,
"grad_norm": 1.0085028880533002,
"learning_rate": 1.9318285108316923e-05,
"loss": 0.3957,
"step": 960
},
{
"epoch": 0.39684974941188506,
"grad_norm": 0.7913168849253336,
"learning_rate": 1.9287986668686566e-05,
"loss": 0.4111,
"step": 970
},
{
"epoch": 0.40094098394190447,
"grad_norm": 0.9593948086372496,
"learning_rate": 1.9257688229056206e-05,
"loss": 0.4097,
"step": 980
},
{
"epoch": 0.4050322184719239,
"grad_norm": 1.1075304924315132,
"learning_rate": 1.9227389789425845e-05,
"loss": 0.4029,
"step": 990
},
{
"epoch": 0.40912345300194336,
"grad_norm": 1.3189534557378433,
"learning_rate": 1.9197091349795488e-05,
"loss": 0.4114,
"step": 1000
},
{
"epoch": 0.40912345300194336,
"eval_loss": 0.4164506494998932,
"eval_runtime": 567.741,
"eval_samples_per_second": 5.439,
"eval_steps_per_second": 0.907,
"step": 1000
},
{
"epoch": 0.4132146875319628,
"grad_norm": 1.3479557524857961,
"learning_rate": 1.9166792910165127e-05,
"loss": 0.4034,
"step": 1010
},
{
"epoch": 0.4173059220619822,
"grad_norm": 0.7981045727445666,
"learning_rate": 1.913649447053477e-05,
"loss": 0.3984,
"step": 1020
},
{
"epoch": 0.4213971565920016,
"grad_norm": 1.3167949575672933,
"learning_rate": 1.910619603090441e-05,
"loss": 0.418,
"step": 1030
},
{
"epoch": 0.4254883911220211,
"grad_norm": 0.875082404450118,
"learning_rate": 1.907589759127405e-05,
"loss": 0.4165,
"step": 1040
},
{
"epoch": 0.4295796256520405,
"grad_norm": 0.7010201186949262,
"learning_rate": 1.9045599151643692e-05,
"loss": 0.4043,
"step": 1050
},
{
"epoch": 0.4336708601820599,
"grad_norm": 0.8106312112876102,
"learning_rate": 1.9015300712013335e-05,
"loss": 0.3946,
"step": 1060
},
{
"epoch": 0.4377620947120794,
"grad_norm": 1.2069253813775411,
"learning_rate": 1.8985002272382974e-05,
"loss": 0.4142,
"step": 1070
},
{
"epoch": 0.4418533292420988,
"grad_norm": 1.4074507315896057,
"learning_rate": 1.8954703832752614e-05,
"loss": 0.4216,
"step": 1080
},
{
"epoch": 0.4459445637721182,
"grad_norm": 1.0142221478478237,
"learning_rate": 1.8924405393122257e-05,
"loss": 0.399,
"step": 1090
},
{
"epoch": 0.4500357983021377,
"grad_norm": 1.0549341445926665,
"learning_rate": 1.8894106953491896e-05,
"loss": 0.4036,
"step": 1100
},
{
"epoch": 0.4541270328321571,
"grad_norm": 0.965773892068146,
"learning_rate": 1.8863808513861535e-05,
"loss": 0.4064,
"step": 1110
},
{
"epoch": 0.4582182673621765,
"grad_norm": 0.9064204193354054,
"learning_rate": 1.8833510074231178e-05,
"loss": 0.3909,
"step": 1120
},
{
"epoch": 0.462309501892196,
"grad_norm": 0.9674069704546393,
"learning_rate": 1.880321163460082e-05,
"loss": 0.4016,
"step": 1130
},
{
"epoch": 0.4664007364222154,
"grad_norm": 0.7415264356489151,
"learning_rate": 1.877291319497046e-05,
"loss": 0.418,
"step": 1140
},
{
"epoch": 0.47049197095223483,
"grad_norm": 0.610984069094811,
"learning_rate": 1.87426147553401e-05,
"loss": 0.4163,
"step": 1150
},
{
"epoch": 0.47458320548225424,
"grad_norm": 0.8399219229428194,
"learning_rate": 1.8712316315709743e-05,
"loss": 0.3986,
"step": 1160
},
{
"epoch": 0.4786744400122737,
"grad_norm": 1.157757088776256,
"learning_rate": 1.8682017876079382e-05,
"loss": 0.4108,
"step": 1170
},
{
"epoch": 0.48276567454229313,
"grad_norm": 0.967241129437065,
"learning_rate": 1.8651719436449025e-05,
"loss": 0.4217,
"step": 1180
},
{
"epoch": 0.48685690907231255,
"grad_norm": 2.091651865632623,
"learning_rate": 1.8621420996818665e-05,
"loss": 0.4091,
"step": 1190
},
{
"epoch": 0.490948143602332,
"grad_norm": 0.6567666937577481,
"learning_rate": 1.8591122557188307e-05,
"loss": 0.4206,
"step": 1200
},
{
"epoch": 0.49503937813235144,
"grad_norm": 1.3838725189967105,
"learning_rate": 1.8560824117557947e-05,
"loss": 0.3986,
"step": 1210
},
{
"epoch": 0.49913061266237085,
"grad_norm": 0.7561244143868575,
"learning_rate": 1.853052567792759e-05,
"loss": 0.4091,
"step": 1220
},
{
"epoch": 0.5032218471923903,
"grad_norm": 1.1843346363848066,
"learning_rate": 1.850022723829723e-05,
"loss": 0.4009,
"step": 1230
},
{
"epoch": 0.5073130817224097,
"grad_norm": 0.9612775939058839,
"learning_rate": 1.846992879866687e-05,
"loss": 0.4312,
"step": 1240
},
{
"epoch": 0.5114043162524292,
"grad_norm": 0.9653095988733074,
"learning_rate": 1.843963035903651e-05,
"loss": 0.411,
"step": 1250
},
{
"epoch": 0.5154955507824486,
"grad_norm": 0.9538148396031432,
"learning_rate": 1.840933191940615e-05,
"loss": 0.4189,
"step": 1260
},
{
"epoch": 0.519586785312468,
"grad_norm": 0.831951461503928,
"learning_rate": 1.8379033479775794e-05,
"loss": 0.3866,
"step": 1270
},
{
"epoch": 0.5236780198424875,
"grad_norm": 0.8415502915298407,
"learning_rate": 1.8348735040145433e-05,
"loss": 0.43,
"step": 1280
},
{
"epoch": 0.5277692543725069,
"grad_norm": 0.8927957692335573,
"learning_rate": 1.8318436600515076e-05,
"loss": 0.4143,
"step": 1290
},
{
"epoch": 0.5318604889025264,
"grad_norm": 0.9951847425278025,
"learning_rate": 1.8288138160884716e-05,
"loss": 0.4082,
"step": 1300
},
{
"epoch": 0.5359517234325458,
"grad_norm": 0.9794640728750337,
"learning_rate": 1.825783972125436e-05,
"loss": 0.4123,
"step": 1310
},
{
"epoch": 0.5400429579625652,
"grad_norm": 1.2936619002334293,
"learning_rate": 1.8227541281623998e-05,
"loss": 0.4063,
"step": 1320
},
{
"epoch": 0.5441341924925847,
"grad_norm": 0.9539785376828354,
"learning_rate": 1.8197242841993637e-05,
"loss": 0.4001,
"step": 1330
},
{
"epoch": 0.548225427022604,
"grad_norm": 0.7647319210581001,
"learning_rate": 1.816694440236328e-05,
"loss": 0.4067,
"step": 1340
},
{
"epoch": 0.5523166615526235,
"grad_norm": 0.7391526930684824,
"learning_rate": 1.8136645962732923e-05,
"loss": 0.4135,
"step": 1350
},
{
"epoch": 0.556407896082643,
"grad_norm": 1.2278179621110827,
"learning_rate": 1.8106347523102562e-05,
"loss": 0.3955,
"step": 1360
},
{
"epoch": 0.5604991306126623,
"grad_norm": 0.9263102201334721,
"learning_rate": 1.8076049083472202e-05,
"loss": 0.4053,
"step": 1370
},
{
"epoch": 0.5645903651426818,
"grad_norm": 1.270439289582048,
"learning_rate": 1.8045750643841845e-05,
"loss": 0.4065,
"step": 1380
},
{
"epoch": 0.5686815996727013,
"grad_norm": 0.8192132223098715,
"learning_rate": 1.8015452204211484e-05,
"loss": 0.4085,
"step": 1390
},
{
"epoch": 0.5727728342027206,
"grad_norm": 0.6191336734377946,
"learning_rate": 1.7985153764581124e-05,
"loss": 0.402,
"step": 1400
},
{
"epoch": 0.5768640687327401,
"grad_norm": 0.7122309486400348,
"learning_rate": 1.7954855324950766e-05,
"loss": 0.3985,
"step": 1410
},
{
"epoch": 0.5809553032627596,
"grad_norm": 1.0903768849480002,
"learning_rate": 1.792455688532041e-05,
"loss": 0.4142,
"step": 1420
},
{
"epoch": 0.5850465377927789,
"grad_norm": 1.1699038018329126,
"learning_rate": 1.789425844569005e-05,
"loss": 0.4055,
"step": 1430
},
{
"epoch": 0.5891377723227984,
"grad_norm": 0.7701176154609344,
"learning_rate": 1.7863960006059688e-05,
"loss": 0.403,
"step": 1440
},
{
"epoch": 0.5932290068528179,
"grad_norm": 0.9632444193994341,
"learning_rate": 1.783366156642933e-05,
"loss": 0.3897,
"step": 1450
},
{
"epoch": 0.5973202413828372,
"grad_norm": 0.9564215846627407,
"learning_rate": 1.780336312679897e-05,
"loss": 0.4117,
"step": 1460
},
{
"epoch": 0.6014114759128567,
"grad_norm": 0.6899350345691984,
"learning_rate": 1.7773064687168613e-05,
"loss": 0.4137,
"step": 1470
},
{
"epoch": 0.6055027104428762,
"grad_norm": 0.9095599036748141,
"learning_rate": 1.7742766247538253e-05,
"loss": 0.415,
"step": 1480
},
{
"epoch": 0.6095939449728955,
"grad_norm": 0.9109255603659862,
"learning_rate": 1.7712467807907892e-05,
"loss": 0.407,
"step": 1490
},
{
"epoch": 0.613685179502915,
"grad_norm": 1.2266191388411232,
"learning_rate": 1.7682169368277535e-05,
"loss": 0.4055,
"step": 1500
},
{
"epoch": 0.613685179502915,
"eval_loss": 0.4121568500995636,
"eval_runtime": 568.64,
"eval_samples_per_second": 5.431,
"eval_steps_per_second": 0.906,
"step": 1500
},
{
"epoch": 0.6177764140329345,
"grad_norm": 2.106549959554808,
"learning_rate": 1.7651870928647178e-05,
"loss": 0.397,
"step": 1510
},
{
"epoch": 0.6218676485629538,
"grad_norm": 0.5026183280127312,
"learning_rate": 1.7621572489016817e-05,
"loss": 0.3986,
"step": 1520
},
{
"epoch": 0.6259588830929733,
"grad_norm": 1.141004596209656,
"learning_rate": 1.7591274049386457e-05,
"loss": 0.4074,
"step": 1530
},
{
"epoch": 0.6300501176229928,
"grad_norm": 0.8191222686339406,
"learning_rate": 1.75609756097561e-05,
"loss": 0.4002,
"step": 1540
},
{
"epoch": 0.6341413521530121,
"grad_norm": 0.6764982914151534,
"learning_rate": 1.753067717012574e-05,
"loss": 0.4107,
"step": 1550
},
{
"epoch": 0.6382325866830316,
"grad_norm": 1.3684032943814484,
"learning_rate": 1.750037873049538e-05,
"loss": 0.4036,
"step": 1560
},
{
"epoch": 0.642323821213051,
"grad_norm": 0.8576599206196178,
"learning_rate": 1.747008029086502e-05,
"loss": 0.4067,
"step": 1570
},
{
"epoch": 0.6464150557430705,
"grad_norm": 1.199412066961356,
"learning_rate": 1.7439781851234664e-05,
"loss": 0.4069,
"step": 1580
},
{
"epoch": 0.6505062902730899,
"grad_norm": 0.9099518471943355,
"learning_rate": 1.7409483411604304e-05,
"loss": 0.3995,
"step": 1590
},
{
"epoch": 0.6545975248031093,
"grad_norm": 1.1070377119289831,
"learning_rate": 1.7379184971973947e-05,
"loss": 0.4225,
"step": 1600
},
{
"epoch": 0.6586887593331288,
"grad_norm": 0.8753865952879469,
"learning_rate": 1.7348886532343586e-05,
"loss": 0.4032,
"step": 1610
},
{
"epoch": 0.6627799938631482,
"grad_norm": 0.7973497440286291,
"learning_rate": 1.7318588092713226e-05,
"loss": 0.4207,
"step": 1620
},
{
"epoch": 0.6668712283931676,
"grad_norm": 0.6925989497344892,
"learning_rate": 1.728828965308287e-05,
"loss": 0.4123,
"step": 1630
},
{
"epoch": 0.6709624629231871,
"grad_norm": 0.9669611664338155,
"learning_rate": 1.725799121345251e-05,
"loss": 0.3916,
"step": 1640
},
{
"epoch": 0.6750536974532065,
"grad_norm": 1.2385150364852342,
"learning_rate": 1.7227692773822147e-05,
"loss": 0.4136,
"step": 1650
},
{
"epoch": 0.6791449319832259,
"grad_norm": 0.7549418298109767,
"learning_rate": 1.719739433419179e-05,
"loss": 0.4146,
"step": 1660
},
{
"epoch": 0.6832361665132454,
"grad_norm": 1.0424008728514387,
"learning_rate": 1.7167095894561433e-05,
"loss": 0.3971,
"step": 1670
},
{
"epoch": 0.6873274010432648,
"grad_norm": 0.7360079123740301,
"learning_rate": 1.7136797454931072e-05,
"loss": 0.3947,
"step": 1680
},
{
"epoch": 0.6914186355732842,
"grad_norm": 0.961447300959097,
"learning_rate": 1.7106499015300712e-05,
"loss": 0.4035,
"step": 1690
},
{
"epoch": 0.6955098701033037,
"grad_norm": 0.7277946304181864,
"learning_rate": 1.7076200575670355e-05,
"loss": 0.3995,
"step": 1700
},
{
"epoch": 0.6996011046333231,
"grad_norm": 0.7930916610357551,
"learning_rate": 1.7045902136039994e-05,
"loss": 0.4057,
"step": 1710
},
{
"epoch": 0.7036923391633425,
"grad_norm": 0.7442545537670275,
"learning_rate": 1.7015603696409637e-05,
"loss": 0.4042,
"step": 1720
},
{
"epoch": 0.707783573693362,
"grad_norm": 1.1048107524067752,
"learning_rate": 1.6985305256779276e-05,
"loss": 0.4132,
"step": 1730
},
{
"epoch": 0.7118748082233815,
"grad_norm": 1.1900958503859818,
"learning_rate": 1.695500681714892e-05,
"loss": 0.4166,
"step": 1740
},
{
"epoch": 0.7159660427534008,
"grad_norm": 0.7615970705911563,
"learning_rate": 1.692470837751856e-05,
"loss": 0.3937,
"step": 1750
},
{
"epoch": 0.7200572772834203,
"grad_norm": 1.0147007380179698,
"learning_rate": 1.68944099378882e-05,
"loss": 0.3918,
"step": 1760
},
{
"epoch": 0.7241485118134398,
"grad_norm": 0.8482189838251506,
"learning_rate": 1.686411149825784e-05,
"loss": 0.3839,
"step": 1770
},
{
"epoch": 0.7282397463434591,
"grad_norm": 0.711159267866409,
"learning_rate": 1.683381305862748e-05,
"loss": 0.3948,
"step": 1780
},
{
"epoch": 0.7323309808734786,
"grad_norm": 1.371790290922361,
"learning_rate": 1.6803514618997123e-05,
"loss": 0.3984,
"step": 1790
},
{
"epoch": 0.7364222154034981,
"grad_norm": 0.7270726304455154,
"learning_rate": 1.6773216179366766e-05,
"loss": 0.4058,
"step": 1800
},
{
"epoch": 0.7405134499335174,
"grad_norm": 0.993765933652885,
"learning_rate": 1.6742917739736406e-05,
"loss": 0.3964,
"step": 1810
},
{
"epoch": 0.7446046844635369,
"grad_norm": 0.7396054882960263,
"learning_rate": 1.6712619300106045e-05,
"loss": 0.3988,
"step": 1820
},
{
"epoch": 0.7486959189935563,
"grad_norm": 1.1310726932463144,
"learning_rate": 1.6682320860475688e-05,
"loss": 0.4146,
"step": 1830
},
{
"epoch": 0.7527871535235757,
"grad_norm": 0.8570685688377147,
"learning_rate": 1.6652022420845327e-05,
"loss": 0.3971,
"step": 1840
},
{
"epoch": 0.7568783880535952,
"grad_norm": 1.2492633396416357,
"learning_rate": 1.6621723981214967e-05,
"loss": 0.4087,
"step": 1850
},
{
"epoch": 0.7609696225836146,
"grad_norm": 1.0021357130993627,
"learning_rate": 1.659142554158461e-05,
"loss": 0.4162,
"step": 1860
},
{
"epoch": 0.765060857113634,
"grad_norm": 0.7813489106628776,
"learning_rate": 1.656112710195425e-05,
"loss": 0.405,
"step": 1870
},
{
"epoch": 0.7691520916436535,
"grad_norm": 0.95305010521556,
"learning_rate": 1.6530828662323892e-05,
"loss": 0.391,
"step": 1880
},
{
"epoch": 0.7732433261736729,
"grad_norm": 0.6688770767238499,
"learning_rate": 1.650053022269353e-05,
"loss": 0.3802,
"step": 1890
},
{
"epoch": 0.7773345607036923,
"grad_norm": 0.6474194044675128,
"learning_rate": 1.6470231783063174e-05,
"loss": 0.4094,
"step": 1900
},
{
"epoch": 0.7814257952337118,
"grad_norm": 0.8498584394336631,
"learning_rate": 1.6439933343432814e-05,
"loss": 0.4073,
"step": 1910
},
{
"epoch": 0.7855170297637312,
"grad_norm": 0.7377058782682137,
"learning_rate": 1.6409634903802457e-05,
"loss": 0.3958,
"step": 1920
},
{
"epoch": 0.7896082642937506,
"grad_norm": 0.843204215889971,
"learning_rate": 1.6379336464172096e-05,
"loss": 0.414,
"step": 1930
},
{
"epoch": 0.7936994988237701,
"grad_norm": 0.6130692021490832,
"learning_rate": 1.6349038024541736e-05,
"loss": 0.4114,
"step": 1940
},
{
"epoch": 0.7977907333537895,
"grad_norm": 1.2350933998678075,
"learning_rate": 1.631873958491138e-05,
"loss": 0.4111,
"step": 1950
},
{
"epoch": 0.8018819678838089,
"grad_norm": 0.5814935040298003,
"learning_rate": 1.628844114528102e-05,
"loss": 0.4038,
"step": 1960
},
{
"epoch": 0.8059732024138284,
"grad_norm": 1.1917635251998295,
"learning_rate": 1.625814270565066e-05,
"loss": 0.4108,
"step": 1970
},
{
"epoch": 0.8100644369438478,
"grad_norm": 0.6526809291051177,
"learning_rate": 1.62278442660203e-05,
"loss": 0.3924,
"step": 1980
},
{
"epoch": 0.8141556714738672,
"grad_norm": 0.8339385093933975,
"learning_rate": 1.6197545826389943e-05,
"loss": 0.4007,
"step": 1990
},
{
"epoch": 0.8182469060038867,
"grad_norm": 0.8226184081347072,
"learning_rate": 1.6167247386759582e-05,
"loss": 0.4055,
"step": 2000
},
{
"epoch": 0.8182469060038867,
"eval_loss": 0.40546923875808716,
"eval_runtime": 569.0883,
"eval_samples_per_second": 5.426,
"eval_steps_per_second": 0.905,
"step": 2000
},
{
"epoch": 0.8223381405339061,
"grad_norm": 0.6918624248021936,
"learning_rate": 1.6136948947129225e-05,
"loss": 0.3996,
"step": 2010
},
{
"epoch": 0.8264293750639256,
"grad_norm": 0.9233306557406306,
"learning_rate": 1.6106650507498865e-05,
"loss": 0.4044,
"step": 2020
},
{
"epoch": 0.830520609593945,
"grad_norm": 1.036109547461375,
"learning_rate": 1.6076352067868508e-05,
"loss": 0.4078,
"step": 2030
},
{
"epoch": 0.8346118441239644,
"grad_norm": 0.6487594500506121,
"learning_rate": 1.6046053628238147e-05,
"loss": 0.3992,
"step": 2040
},
{
"epoch": 0.8387030786539839,
"grad_norm": 0.9061995762710394,
"learning_rate": 1.601575518860779e-05,
"loss": 0.4065,
"step": 2050
},
{
"epoch": 0.8427943131840032,
"grad_norm": 1.55527142608359,
"learning_rate": 1.598545674897743e-05,
"loss": 0.4126,
"step": 2060
},
{
"epoch": 0.8468855477140227,
"grad_norm": 0.6076317290425526,
"learning_rate": 1.595515830934707e-05,
"loss": 0.4027,
"step": 2070
},
{
"epoch": 0.8509767822440422,
"grad_norm": 1.100916449726025,
"learning_rate": 1.592485986971671e-05,
"loss": 0.4136,
"step": 2080
},
{
"epoch": 0.8550680167740615,
"grad_norm": 0.9334359781171854,
"learning_rate": 1.589456143008635e-05,
"loss": 0.4018,
"step": 2090
},
{
"epoch": 0.859159251304081,
"grad_norm": 1.1538729493388855,
"learning_rate": 1.586426299045599e-05,
"loss": 0.3981,
"step": 2100
},
{
"epoch": 0.8632504858341005,
"grad_norm": 0.6672272992408166,
"learning_rate": 1.5833964550825633e-05,
"loss": 0.3989,
"step": 2110
},
{
"epoch": 0.8673417203641198,
"grad_norm": 0.6790211847421547,
"learning_rate": 1.5803666111195276e-05,
"loss": 0.3999,
"step": 2120
},
{
"epoch": 0.8714329548941393,
"grad_norm": 0.8625927450358163,
"learning_rate": 1.5773367671564916e-05,
"loss": 0.4103,
"step": 2130
},
{
"epoch": 0.8755241894241588,
"grad_norm": 0.8799687483547954,
"learning_rate": 1.5743069231934555e-05,
"loss": 0.4,
"step": 2140
},
{
"epoch": 0.8796154239541781,
"grad_norm": 0.6045044925297284,
"learning_rate": 1.5712770792304198e-05,
"loss": 0.4127,
"step": 2150
},
{
"epoch": 0.8837066584841976,
"grad_norm": 1.1939155172076787,
"learning_rate": 1.5682472352673837e-05,
"loss": 0.3982,
"step": 2160
},
{
"epoch": 0.8877978930142171,
"grad_norm": 0.7558373479016637,
"learning_rate": 1.565217391304348e-05,
"loss": 0.4064,
"step": 2170
},
{
"epoch": 0.8918891275442364,
"grad_norm": 0.7840338054367947,
"learning_rate": 1.562187547341312e-05,
"loss": 0.411,
"step": 2180
},
{
"epoch": 0.8959803620742559,
"grad_norm": 0.6836078155952856,
"learning_rate": 1.5591577033782763e-05,
"loss": 0.4011,
"step": 2190
},
{
"epoch": 0.9000715966042754,
"grad_norm": 0.6588002590309792,
"learning_rate": 1.5561278594152402e-05,
"loss": 0.4034,
"step": 2200
},
{
"epoch": 0.9041628311342947,
"grad_norm": 0.7477069671187269,
"learning_rate": 1.5530980154522045e-05,
"loss": 0.414,
"step": 2210
},
{
"epoch": 0.9082540656643142,
"grad_norm": 0.9689258448123745,
"learning_rate": 1.5500681714891684e-05,
"loss": 0.3976,
"step": 2220
},
{
"epoch": 0.9123453001943337,
"grad_norm": 0.9394818001617259,
"learning_rate": 1.5470383275261324e-05,
"loss": 0.414,
"step": 2230
},
{
"epoch": 0.916436534724353,
"grad_norm": 0.7596601012213272,
"learning_rate": 1.5440084835630967e-05,
"loss": 0.3963,
"step": 2240
},
{
"epoch": 0.9205277692543725,
"grad_norm": 0.7148793324533078,
"learning_rate": 1.5409786396000606e-05,
"loss": 0.3933,
"step": 2250
},
{
"epoch": 0.924619003784392,
"grad_norm": 0.9033351893520957,
"learning_rate": 1.537948795637025e-05,
"loss": 0.4013,
"step": 2260
},
{
"epoch": 0.9287102383144114,
"grad_norm": 1.0192314023889302,
"learning_rate": 1.534918951673989e-05,
"loss": 0.4011,
"step": 2270
},
{
"epoch": 0.9328014728444308,
"grad_norm": 0.7958434523694122,
"learning_rate": 1.531889107710953e-05,
"loss": 0.4069,
"step": 2280
},
{
"epoch": 0.9368927073744503,
"grad_norm": 0.7201210026528915,
"learning_rate": 1.528859263747917e-05,
"loss": 0.3951,
"step": 2290
},
{
"epoch": 0.9409839419044697,
"grad_norm": 0.6175942086431213,
"learning_rate": 1.5258294197848814e-05,
"loss": 0.4047,
"step": 2300
},
{
"epoch": 0.9450751764344891,
"grad_norm": 0.7698111963051264,
"learning_rate": 1.5227995758218453e-05,
"loss": 0.4199,
"step": 2310
},
{
"epoch": 0.9491664109645085,
"grad_norm": 1.9264632916618274,
"learning_rate": 1.5197697318588094e-05,
"loss": 0.3931,
"step": 2320
},
{
"epoch": 0.953257645494528,
"grad_norm": 1.7879957288452364,
"learning_rate": 1.5167398878957735e-05,
"loss": 0.4007,
"step": 2330
},
{
"epoch": 0.9573488800245474,
"grad_norm": 0.5780222684091814,
"learning_rate": 1.5137100439327376e-05,
"loss": 0.4104,
"step": 2340
},
{
"epoch": 0.9614401145545668,
"grad_norm": 0.8683726024029582,
"learning_rate": 1.5106801999697016e-05,
"loss": 0.4036,
"step": 2350
},
{
"epoch": 0.9655313490845863,
"grad_norm": 0.9116114901687677,
"learning_rate": 1.5076503560066657e-05,
"loss": 0.4021,
"step": 2360
},
{
"epoch": 0.9696225836146057,
"grad_norm": 0.7992815803043055,
"learning_rate": 1.50462051204363e-05,
"loss": 0.4116,
"step": 2370
},
{
"epoch": 0.9737138181446251,
"grad_norm": 0.9553393046537682,
"learning_rate": 1.5015906680805941e-05,
"loss": 0.4081,
"step": 2380
},
{
"epoch": 0.9778050526746446,
"grad_norm": 0.588071607214625,
"learning_rate": 1.498560824117558e-05,
"loss": 0.4094,
"step": 2390
},
{
"epoch": 0.981896287204664,
"grad_norm": 0.8852293721983012,
"learning_rate": 1.4955309801545222e-05,
"loss": 0.4055,
"step": 2400
},
{
"epoch": 0.9859875217346834,
"grad_norm": 0.49652425498018843,
"learning_rate": 1.4925011361914863e-05,
"loss": 0.4,
"step": 2410
},
{
"epoch": 0.9900787562647029,
"grad_norm": 1.0572558341566427,
"learning_rate": 1.4894712922284504e-05,
"loss": 0.402,
"step": 2420
},
{
"epoch": 0.9941699907947223,
"grad_norm": 0.5472572586554295,
"learning_rate": 1.4864414482654143e-05,
"loss": 0.3824,
"step": 2430
},
{
"epoch": 0.9982612253247417,
"grad_norm": 0.6950966780791227,
"learning_rate": 1.4834116043023785e-05,
"loss": 0.3987,
"step": 2440
},
{
"epoch": 1.0020456172650096,
"grad_norm": 0.7596326868738933,
"learning_rate": 1.4803817603393427e-05,
"loss": 0.3989,
"step": 2450
},
{
"epoch": 1.006136851795029,
"grad_norm": 0.6633330801065885,
"learning_rate": 1.4773519163763069e-05,
"loss": 0.409,
"step": 2460
},
{
"epoch": 1.0102280863250486,
"grad_norm": 0.8877771135554996,
"learning_rate": 1.4743220724132708e-05,
"loss": 0.4127,
"step": 2470
},
{
"epoch": 1.014319320855068,
"grad_norm": 0.8214654626739897,
"learning_rate": 1.4712922284502349e-05,
"loss": 0.3883,
"step": 2480
},
{
"epoch": 1.0184105553850875,
"grad_norm": 1.0024461048478996,
"learning_rate": 1.468262384487199e-05,
"loss": 0.3684,
"step": 2490
},
{
"epoch": 1.022501789915107,
"grad_norm": 0.6824573340958298,
"learning_rate": 1.4652325405241631e-05,
"loss": 0.3826,
"step": 2500
},
{
"epoch": 1.022501789915107,
"eval_loss": 0.4052634835243225,
"eval_runtime": 566.8912,
"eval_samples_per_second": 5.447,
"eval_steps_per_second": 0.908,
"step": 2500
},
{
"epoch": 1.0265930244451262,
"grad_norm": 0.7110510772881399,
"learning_rate": 1.4622026965611271e-05,
"loss": 0.3871,
"step": 2510
},
{
"epoch": 1.0306842589751457,
"grad_norm": 0.7933140123736048,
"learning_rate": 1.4591728525980912e-05,
"loss": 0.3936,
"step": 2520
},
{
"epoch": 1.0347754935051652,
"grad_norm": 0.8306902290417011,
"learning_rate": 1.4561430086350555e-05,
"loss": 0.3987,
"step": 2530
},
{
"epoch": 1.0388667280351847,
"grad_norm": 0.7119087181302277,
"learning_rate": 1.4531131646720196e-05,
"loss": 0.3967,
"step": 2540
},
{
"epoch": 1.0429579625652041,
"grad_norm": 1.0370136579582763,
"learning_rate": 1.4500833207089835e-05,
"loss": 0.401,
"step": 2550
},
{
"epoch": 1.0470491970952236,
"grad_norm": 0.8532007552527441,
"learning_rate": 1.4470534767459477e-05,
"loss": 0.4037,
"step": 2560
},
{
"epoch": 1.0511404316252428,
"grad_norm": 0.6599598500909406,
"learning_rate": 1.4440236327829118e-05,
"loss": 0.3743,
"step": 2570
},
{
"epoch": 1.0552316661552623,
"grad_norm": 1.0157223230619488,
"learning_rate": 1.4409937888198759e-05,
"loss": 0.3845,
"step": 2580
},
{
"epoch": 1.0593229006852818,
"grad_norm": 0.885522215179448,
"learning_rate": 1.4379639448568402e-05,
"loss": 0.3903,
"step": 2590
},
{
"epoch": 1.0634141352153013,
"grad_norm": 0.6463142293297175,
"learning_rate": 1.434934100893804e-05,
"loss": 0.3781,
"step": 2600
},
{
"epoch": 1.0675053697453207,
"grad_norm": 1.0072005790653578,
"learning_rate": 1.4319042569307682e-05,
"loss": 0.3949,
"step": 2610
},
{
"epoch": 1.07159660427534,
"grad_norm": 0.7263224753671689,
"learning_rate": 1.4288744129677324e-05,
"loss": 0.3883,
"step": 2620
},
{
"epoch": 1.0756878388053595,
"grad_norm": 0.8333184288546814,
"learning_rate": 1.4258445690046965e-05,
"loss": 0.3815,
"step": 2630
},
{
"epoch": 1.079779073335379,
"grad_norm": 1.0113253670629012,
"learning_rate": 1.4228147250416604e-05,
"loss": 0.3813,
"step": 2640
},
{
"epoch": 1.0838703078653984,
"grad_norm": 0.8528216478869027,
"learning_rate": 1.4197848810786245e-05,
"loss": 0.4065,
"step": 2650
},
{
"epoch": 1.0879615423954179,
"grad_norm": 0.9440475129085675,
"learning_rate": 1.4167550371155886e-05,
"loss": 0.3962,
"step": 2660
},
{
"epoch": 1.0920527769254373,
"grad_norm": 0.6055474396833757,
"learning_rate": 1.413725193152553e-05,
"loss": 0.3995,
"step": 2670
},
{
"epoch": 1.0961440114554566,
"grad_norm": 1.0259816645911408,
"learning_rate": 1.4106953491895167e-05,
"loss": 0.3904,
"step": 2680
},
{
"epoch": 1.100235245985476,
"grad_norm": 0.5928389552557433,
"learning_rate": 1.407665505226481e-05,
"loss": 0.4021,
"step": 2690
},
{
"epoch": 1.1043264805154955,
"grad_norm": 0.6258598420479162,
"learning_rate": 1.4046356612634451e-05,
"loss": 0.391,
"step": 2700
},
{
"epoch": 1.108417715045515,
"grad_norm": 0.7147294619644624,
"learning_rate": 1.4016058173004092e-05,
"loss": 0.3837,
"step": 2710
},
{
"epoch": 1.1125089495755345,
"grad_norm": 0.646834791543815,
"learning_rate": 1.3985759733373732e-05,
"loss": 0.4035,
"step": 2720
},
{
"epoch": 1.116600184105554,
"grad_norm": 0.5735852182826044,
"learning_rate": 1.3955461293743373e-05,
"loss": 0.3857,
"step": 2730
},
{
"epoch": 1.1206914186355732,
"grad_norm": 0.7476886392647508,
"learning_rate": 1.3925162854113014e-05,
"loss": 0.4017,
"step": 2740
},
{
"epoch": 1.1247826531655927,
"grad_norm": 0.9517953594151923,
"learning_rate": 1.3894864414482657e-05,
"loss": 0.3792,
"step": 2750
},
{
"epoch": 1.1288738876956121,
"grad_norm": 0.8648296597362533,
"learning_rate": 1.3864565974852296e-05,
"loss": 0.3918,
"step": 2760
},
{
"epoch": 1.1329651222256316,
"grad_norm": 0.758091624498141,
"learning_rate": 1.3834267535221937e-05,
"loss": 0.3849,
"step": 2770
},
{
"epoch": 1.137056356755651,
"grad_norm": 0.8292099507010133,
"learning_rate": 1.3803969095591579e-05,
"loss": 0.4008,
"step": 2780
},
{
"epoch": 1.1411475912856703,
"grad_norm": 0.8886800402947126,
"learning_rate": 1.377367065596122e-05,
"loss": 0.4094,
"step": 2790
},
{
"epoch": 1.1452388258156898,
"grad_norm": 0.6921902768142778,
"learning_rate": 1.3743372216330859e-05,
"loss": 0.3857,
"step": 2800
},
{
"epoch": 1.1493300603457093,
"grad_norm": 0.7941140268815301,
"learning_rate": 1.37130737767005e-05,
"loss": 0.3946,
"step": 2810
},
{
"epoch": 1.1534212948757288,
"grad_norm": 0.8388193299897188,
"learning_rate": 1.3682775337070141e-05,
"loss": 0.3846,
"step": 2820
},
{
"epoch": 1.1575125294057482,
"grad_norm": 0.8827526348826681,
"learning_rate": 1.3652476897439784e-05,
"loss": 0.392,
"step": 2830
},
{
"epoch": 1.1616037639357677,
"grad_norm": 0.6399834967699533,
"learning_rate": 1.3622178457809424e-05,
"loss": 0.3936,
"step": 2840
},
{
"epoch": 1.1656949984657872,
"grad_norm": 0.7858076115450058,
"learning_rate": 1.3591880018179065e-05,
"loss": 0.4021,
"step": 2850
},
{
"epoch": 1.1697862329958064,
"grad_norm": 0.7641925003434336,
"learning_rate": 1.3561581578548706e-05,
"loss": 0.3855,
"step": 2860
},
{
"epoch": 1.173877467525826,
"grad_norm": 0.4485280825495544,
"learning_rate": 1.3531283138918347e-05,
"loss": 0.4015,
"step": 2870
},
{
"epoch": 1.1779687020558454,
"grad_norm": 0.8448609372779392,
"learning_rate": 1.3500984699287987e-05,
"loss": 0.4038,
"step": 2880
},
{
"epoch": 1.1820599365858648,
"grad_norm": 0.6112956233981559,
"learning_rate": 1.3470686259657628e-05,
"loss": 0.3941,
"step": 2890
},
{
"epoch": 1.1861511711158843,
"grad_norm": 0.787858901854597,
"learning_rate": 1.3440387820027269e-05,
"loss": 0.396,
"step": 2900
},
{
"epoch": 1.1902424056459036,
"grad_norm": 0.7489331322179997,
"learning_rate": 1.3410089380396912e-05,
"loss": 0.3855,
"step": 2910
},
{
"epoch": 1.194333640175923,
"grad_norm": 0.8245275477001581,
"learning_rate": 1.3379790940766553e-05,
"loss": 0.3903,
"step": 2920
},
{
"epoch": 1.1984248747059425,
"grad_norm": 0.5684832752577668,
"learning_rate": 1.3349492501136192e-05,
"loss": 0.3892,
"step": 2930
},
{
"epoch": 1.202516109235962,
"grad_norm": 0.5725195354548462,
"learning_rate": 1.3319194061505834e-05,
"loss": 0.3842,
"step": 2940
},
{
"epoch": 1.2066073437659814,
"grad_norm": 0.8605746695624041,
"learning_rate": 1.3288895621875475e-05,
"loss": 0.3989,
"step": 2950
},
{
"epoch": 1.2106985782960007,
"grad_norm": 0.569306392368477,
"learning_rate": 1.3258597182245116e-05,
"loss": 0.3886,
"step": 2960
},
{
"epoch": 1.2147898128260202,
"grad_norm": 0.8210447551557634,
"learning_rate": 1.3228298742614755e-05,
"loss": 0.378,
"step": 2970
},
{
"epoch": 1.2188810473560396,
"grad_norm": 0.5973996299783053,
"learning_rate": 1.3198000302984398e-05,
"loss": 0.4038,
"step": 2980
},
{
"epoch": 1.222972281886059,
"grad_norm": 1.5938167927490012,
"learning_rate": 1.316770186335404e-05,
"loss": 0.4133,
"step": 2990
},
{
"epoch": 1.2270635164160786,
"grad_norm": 0.6551073648017411,
"learning_rate": 1.313740342372368e-05,
"loss": 0.388,
"step": 3000
},
{
"epoch": 1.2270635164160786,
"eval_loss": 0.40439197421073914,
"eval_runtime": 566.8216,
"eval_samples_per_second": 5.448,
"eval_steps_per_second": 0.909,
"step": 3000
},
{
"epoch": 1.231154750946098,
"grad_norm": 0.9122941726483766,
"learning_rate": 1.310710498409332e-05,
"loss": 0.3962,
"step": 3010
},
{
"epoch": 1.2352459854761175,
"grad_norm": 0.8391097553941964,
"learning_rate": 1.3076806544462961e-05,
"loss": 0.387,
"step": 3020
},
{
"epoch": 1.2393372200061368,
"grad_norm": 0.8126464750636853,
"learning_rate": 1.3046508104832602e-05,
"loss": 0.3914,
"step": 3030
},
{
"epoch": 1.2434284545361562,
"grad_norm": 0.7877632963656168,
"learning_rate": 1.3016209665202243e-05,
"loss": 0.3678,
"step": 3040
},
{
"epoch": 1.2475196890661757,
"grad_norm": 0.7204057647654071,
"learning_rate": 1.2985911225571883e-05,
"loss": 0.4056,
"step": 3050
},
{
"epoch": 1.2516109235961952,
"grad_norm": 1.0360947842710033,
"learning_rate": 1.2955612785941526e-05,
"loss": 0.3921,
"step": 3060
},
{
"epoch": 1.2557021581262147,
"grad_norm": 0.8740894371532404,
"learning_rate": 1.2925314346311167e-05,
"loss": 0.3785,
"step": 3070
},
{
"epoch": 1.259793392656234,
"grad_norm": 0.7168443376463302,
"learning_rate": 1.2895015906680808e-05,
"loss": 0.403,
"step": 3080
},
{
"epoch": 1.2638846271862534,
"grad_norm": 0.5888660550300815,
"learning_rate": 1.2864717467050447e-05,
"loss": 0.3716,
"step": 3090
},
{
"epoch": 1.2679758617162729,
"grad_norm": 0.5289574947955048,
"learning_rate": 1.2834419027420089e-05,
"loss": 0.3764,
"step": 3100
},
{
"epoch": 1.2720670962462923,
"grad_norm": 0.7080617124171087,
"learning_rate": 1.280412058778973e-05,
"loss": 0.3927,
"step": 3110
},
{
"epoch": 1.2761583307763118,
"grad_norm": 0.8062114426141331,
"learning_rate": 1.277382214815937e-05,
"loss": 0.4044,
"step": 3120
},
{
"epoch": 1.280249565306331,
"grad_norm": 0.476668956425164,
"learning_rate": 1.274352370852901e-05,
"loss": 0.3868,
"step": 3130
},
{
"epoch": 1.2843407998363507,
"grad_norm": 0.6871622346909918,
"learning_rate": 1.2713225268898653e-05,
"loss": 0.3947,
"step": 3140
},
{
"epoch": 1.28843203436637,
"grad_norm": 0.7061613344838735,
"learning_rate": 1.2682926829268294e-05,
"loss": 0.3872,
"step": 3150
},
{
"epoch": 1.2925232688963895,
"grad_norm": 0.9863242302948113,
"learning_rate": 1.2652628389637935e-05,
"loss": 0.3889,
"step": 3160
},
{
"epoch": 1.296614503426409,
"grad_norm": 0.5738534501395554,
"learning_rate": 1.2622329950007575e-05,
"loss": 0.4054,
"step": 3170
},
{
"epoch": 1.3007057379564284,
"grad_norm": 0.8389559630933702,
"learning_rate": 1.2592031510377216e-05,
"loss": 0.3917,
"step": 3180
},
{
"epoch": 1.3047969724864479,
"grad_norm": 0.7991010292600728,
"learning_rate": 1.2561733070746857e-05,
"loss": 0.3901,
"step": 3190
},
{
"epoch": 1.3088882070164671,
"grad_norm": 0.5541164921899168,
"learning_rate": 1.25314346311165e-05,
"loss": 0.4008,
"step": 3200
},
{
"epoch": 1.3129794415464866,
"grad_norm": 0.8663675517676568,
"learning_rate": 1.2501136191486138e-05,
"loss": 0.3966,
"step": 3210
},
{
"epoch": 1.317070676076506,
"grad_norm": 0.5807714675119107,
"learning_rate": 1.247083775185578e-05,
"loss": 0.3881,
"step": 3220
},
{
"epoch": 1.3211619106065255,
"grad_norm": 0.5949773301489646,
"learning_rate": 1.2440539312225422e-05,
"loss": 0.3967,
"step": 3230
},
{
"epoch": 1.325253145136545,
"grad_norm": 1.0156281369557891,
"learning_rate": 1.2410240872595063e-05,
"loss": 0.3914,
"step": 3240
},
{
"epoch": 1.3293443796665643,
"grad_norm": 0.5546340288088691,
"learning_rate": 1.2379942432964704e-05,
"loss": 0.3869,
"step": 3250
},
{
"epoch": 1.3334356141965837,
"grad_norm": 1.0378625122625662,
"learning_rate": 1.2349643993334344e-05,
"loss": 0.4012,
"step": 3260
},
{
"epoch": 1.3375268487266032,
"grad_norm": 1.0535947037253341,
"learning_rate": 1.2319345553703985e-05,
"loss": 0.4132,
"step": 3270
},
{
"epoch": 1.3416180832566227,
"grad_norm": 0.6479776053412775,
"learning_rate": 1.2289047114073628e-05,
"loss": 0.3989,
"step": 3280
},
{
"epoch": 1.3457093177866422,
"grad_norm": 0.6474716962137215,
"learning_rate": 1.2258748674443269e-05,
"loss": 0.3937,
"step": 3290
},
{
"epoch": 1.3498005523166616,
"grad_norm": 0.770667683748129,
"learning_rate": 1.2228450234812908e-05,
"loss": 0.4021,
"step": 3300
},
{
"epoch": 1.353891786846681,
"grad_norm": 0.9562402793665876,
"learning_rate": 1.219815179518255e-05,
"loss": 0.3895,
"step": 3310
},
{
"epoch": 1.3579830213767003,
"grad_norm": 0.7835866454881345,
"learning_rate": 1.216785335555219e-05,
"loss": 0.3915,
"step": 3320
},
{
"epoch": 1.3620742559067198,
"grad_norm": 0.6840970037680498,
"learning_rate": 1.2137554915921832e-05,
"loss": 0.3933,
"step": 3330
},
{
"epoch": 1.3661654904367393,
"grad_norm": 0.6363375592990289,
"learning_rate": 1.2107256476291471e-05,
"loss": 0.3779,
"step": 3340
},
{
"epoch": 1.3702567249667588,
"grad_norm": 0.6422064337095349,
"learning_rate": 1.2076958036661112e-05,
"loss": 0.3781,
"step": 3350
},
{
"epoch": 1.3743479594967782,
"grad_norm": 0.7443280549443633,
"learning_rate": 1.2046659597030755e-05,
"loss": 0.3829,
"step": 3360
},
{
"epoch": 1.3784391940267975,
"grad_norm": 0.9031635762318649,
"learning_rate": 1.2016361157400396e-05,
"loss": 0.3964,
"step": 3370
},
{
"epoch": 1.382530428556817,
"grad_norm": 0.5980799440932613,
"learning_rate": 1.1986062717770036e-05,
"loss": 0.3887,
"step": 3380
},
{
"epoch": 1.3866216630868364,
"grad_norm": 0.6477461394839297,
"learning_rate": 1.1955764278139677e-05,
"loss": 0.3912,
"step": 3390
},
{
"epoch": 1.390712897616856,
"grad_norm": 0.8829826372548863,
"learning_rate": 1.1925465838509318e-05,
"loss": 0.4026,
"step": 3400
},
{
"epoch": 1.3948041321468754,
"grad_norm": 0.602437282415112,
"learning_rate": 1.1895167398878959e-05,
"loss": 0.3968,
"step": 3410
},
{
"epoch": 1.3988953666768946,
"grad_norm": 1.0853438418518562,
"learning_rate": 1.1864868959248599e-05,
"loss": 0.3795,
"step": 3420
},
{
"epoch": 1.4029866012069143,
"grad_norm": 0.68812770168341,
"learning_rate": 1.183457051961824e-05,
"loss": 0.3903,
"step": 3430
},
{
"epoch": 1.4070778357369336,
"grad_norm": 0.673891970112453,
"learning_rate": 1.1804272079987883e-05,
"loss": 0.3955,
"step": 3440
},
{
"epoch": 1.411169070266953,
"grad_norm": 0.9658876480715098,
"learning_rate": 1.1773973640357524e-05,
"loss": 0.3806,
"step": 3450
},
{
"epoch": 1.4152603047969725,
"grad_norm": 0.9072014909826842,
"learning_rate": 1.1743675200727163e-05,
"loss": 0.393,
"step": 3460
},
{
"epoch": 1.419351539326992,
"grad_norm": 0.8593625419024139,
"learning_rate": 1.1713376761096804e-05,
"loss": 0.3829,
"step": 3470
},
{
"epoch": 1.4234427738570115,
"grad_norm": 1.1598468495920022,
"learning_rate": 1.1683078321466445e-05,
"loss": 0.3895,
"step": 3480
},
{
"epoch": 1.4275340083870307,
"grad_norm": 0.6212230421582741,
"learning_rate": 1.1652779881836087e-05,
"loss": 0.404,
"step": 3490
},
{
"epoch": 1.4316252429170502,
"grad_norm": 1.052418138214992,
"learning_rate": 1.1622481442205726e-05,
"loss": 0.3788,
"step": 3500
},
{
"epoch": 1.4316252429170502,
"eval_loss": 0.3989790678024292,
"eval_runtime": 567.3009,
"eval_samples_per_second": 5.443,
"eval_steps_per_second": 0.908,
"step": 3500
},
{
"epoch": 1.4357164774470696,
"grad_norm": 0.9457200564933581,
"learning_rate": 1.1592183002575367e-05,
"loss": 0.3848,
"step": 3510
},
{
"epoch": 1.4398077119770891,
"grad_norm": 0.711408339200041,
"learning_rate": 1.156188456294501e-05,
"loss": 0.3777,
"step": 3520
},
{
"epoch": 1.4438989465071086,
"grad_norm": 0.765462240881764,
"learning_rate": 1.1531586123314651e-05,
"loss": 0.392,
"step": 3530
},
{
"epoch": 1.4479901810371278,
"grad_norm": 0.8447863863134836,
"learning_rate": 1.150128768368429e-05,
"loss": 0.394,
"step": 3540
},
{
"epoch": 1.4520814155671473,
"grad_norm": 0.7219223617330234,
"learning_rate": 1.1470989244053932e-05,
"loss": 0.3854,
"step": 3550
},
{
"epoch": 1.4561726500971668,
"grad_norm": 0.9253301498291125,
"learning_rate": 1.1440690804423573e-05,
"loss": 0.383,
"step": 3560
},
{
"epoch": 1.4602638846271863,
"grad_norm": 0.7352515331825862,
"learning_rate": 1.1410392364793214e-05,
"loss": 0.4006,
"step": 3570
},
{
"epoch": 1.4643551191572057,
"grad_norm": 0.7903238163122892,
"learning_rate": 1.1380093925162854e-05,
"loss": 0.3966,
"step": 3580
},
{
"epoch": 1.4684463536872252,
"grad_norm": 0.8163601731666391,
"learning_rate": 1.1349795485532495e-05,
"loss": 0.3843,
"step": 3590
},
{
"epoch": 1.4725375882172447,
"grad_norm": 0.7031404690333898,
"learning_rate": 1.1319497045902137e-05,
"loss": 0.3889,
"step": 3600
},
{
"epoch": 1.476628822747264,
"grad_norm": 0.9035695820122014,
"learning_rate": 1.1289198606271779e-05,
"loss": 0.3827,
"step": 3610
},
{
"epoch": 1.4807200572772834,
"grad_norm": 0.8712016470060545,
"learning_rate": 1.125890016664142e-05,
"loss": 0.3833,
"step": 3620
},
{
"epoch": 1.4848112918073029,
"grad_norm": 0.5886133981154346,
"learning_rate": 1.122860172701106e-05,
"loss": 0.3852,
"step": 3630
},
{
"epoch": 1.4889025263373223,
"grad_norm": 0.6641821222496354,
"learning_rate": 1.11983032873807e-05,
"loss": 0.3883,
"step": 3640
},
{
"epoch": 1.4929937608673418,
"grad_norm": 0.71470696861074,
"learning_rate": 1.1168004847750342e-05,
"loss": 0.39,
"step": 3650
},
{
"epoch": 1.497084995397361,
"grad_norm": 0.7599626830703158,
"learning_rate": 1.1137706408119984e-05,
"loss": 0.3988,
"step": 3660
},
{
"epoch": 1.5011762299273805,
"grad_norm": 0.8131240458205017,
"learning_rate": 1.1107407968489622e-05,
"loss": 0.3837,
"step": 3670
},
{
"epoch": 1.5052674644574,
"grad_norm": 0.5981406315717848,
"learning_rate": 1.1077109528859265e-05,
"loss": 0.3976,
"step": 3680
},
{
"epoch": 1.5093586989874195,
"grad_norm": 0.6546000876796034,
"learning_rate": 1.1046811089228906e-05,
"loss": 0.3845,
"step": 3690
},
{
"epoch": 1.513449933517439,
"grad_norm": 0.6618741576872935,
"learning_rate": 1.1016512649598547e-05,
"loss": 0.3829,
"step": 3700
},
{
"epoch": 1.5175411680474582,
"grad_norm": 0.7644320579880938,
"learning_rate": 1.0986214209968187e-05,
"loss": 0.3904,
"step": 3710
},
{
"epoch": 1.521632402577478,
"grad_norm": 0.7078963682359172,
"learning_rate": 1.0955915770337828e-05,
"loss": 0.3943,
"step": 3720
},
{
"epoch": 1.5257236371074971,
"grad_norm": 0.9863976210557551,
"learning_rate": 1.0925617330707469e-05,
"loss": 0.3836,
"step": 3730
},
{
"epoch": 1.5298148716375166,
"grad_norm": 0.7431834628180725,
"learning_rate": 1.0895318891077112e-05,
"loss": 0.4033,
"step": 3740
},
{
"epoch": 1.533906106167536,
"grad_norm": 0.9543361591228587,
"learning_rate": 1.0865020451446751e-05,
"loss": 0.3928,
"step": 3750
},
{
"epoch": 1.5379973406975553,
"grad_norm": 0.7174707063961077,
"learning_rate": 1.0834722011816392e-05,
"loss": 0.3848,
"step": 3760
},
{
"epoch": 1.542088575227575,
"grad_norm": 0.8245320777882585,
"learning_rate": 1.0804423572186034e-05,
"loss": 0.3992,
"step": 3770
},
{
"epoch": 1.5461798097575943,
"grad_norm": 1.0937610813639995,
"learning_rate": 1.0774125132555675e-05,
"loss": 0.3922,
"step": 3780
},
{
"epoch": 1.5502710442876138,
"grad_norm": 0.6595221788634811,
"learning_rate": 1.0743826692925314e-05,
"loss": 0.3846,
"step": 3790
},
{
"epoch": 1.5543622788176332,
"grad_norm": 0.6933714247729369,
"learning_rate": 1.0713528253294955e-05,
"loss": 0.3924,
"step": 3800
},
{
"epoch": 1.5584535133476527,
"grad_norm": 0.6322021390419345,
"learning_rate": 1.0683229813664597e-05,
"loss": 0.382,
"step": 3810
},
{
"epoch": 1.5625447478776722,
"grad_norm": 0.8617688710364446,
"learning_rate": 1.065293137403424e-05,
"loss": 0.39,
"step": 3820
},
{
"epoch": 1.5666359824076914,
"grad_norm": 1.244329874318961,
"learning_rate": 1.0622632934403879e-05,
"loss": 0.3963,
"step": 3830
},
{
"epoch": 1.5707272169377111,
"grad_norm": 0.9027740160226115,
"learning_rate": 1.059233449477352e-05,
"loss": 0.382,
"step": 3840
},
{
"epoch": 1.5748184514677304,
"grad_norm": 0.5021808277805254,
"learning_rate": 1.0562036055143161e-05,
"loss": 0.399,
"step": 3850
},
{
"epoch": 1.5789096859977498,
"grad_norm": 0.6718417113604481,
"learning_rate": 1.0531737615512802e-05,
"loss": 0.3947,
"step": 3860
},
{
"epoch": 1.5830009205277693,
"grad_norm": 0.6732165543554006,
"learning_rate": 1.0501439175882442e-05,
"loss": 0.3921,
"step": 3870
},
{
"epoch": 1.5870921550577886,
"grad_norm": 0.8949389121109214,
"learning_rate": 1.0471140736252083e-05,
"loss": 0.3789,
"step": 3880
},
{
"epoch": 1.5911833895878083,
"grad_norm": 0.8368104145013396,
"learning_rate": 1.0440842296621724e-05,
"loss": 0.3838,
"step": 3890
},
{
"epoch": 1.5952746241178275,
"grad_norm": 0.6115609968754325,
"learning_rate": 1.0410543856991367e-05,
"loss": 0.3919,
"step": 3900
},
{
"epoch": 1.599365858647847,
"grad_norm": 0.8379228899852589,
"learning_rate": 1.0380245417361006e-05,
"loss": 0.3991,
"step": 3910
},
{
"epoch": 1.6034570931778664,
"grad_norm": 0.7827671214396511,
"learning_rate": 1.0349946977730647e-05,
"loss": 0.3794,
"step": 3920
},
{
"epoch": 1.6075483277078857,
"grad_norm": 0.6805284446607271,
"learning_rate": 1.0319648538100289e-05,
"loss": 0.3936,
"step": 3930
},
{
"epoch": 1.6116395622379054,
"grad_norm": 0.5081615910622816,
"learning_rate": 1.028935009846993e-05,
"loss": 0.3882,
"step": 3940
},
{
"epoch": 1.6157307967679246,
"grad_norm": 0.585926687974076,
"learning_rate": 1.0259051658839571e-05,
"loss": 0.4009,
"step": 3950
},
{
"epoch": 1.619822031297944,
"grad_norm": 0.6755208541842371,
"learning_rate": 1.022875321920921e-05,
"loss": 0.3797,
"step": 3960
},
{
"epoch": 1.6239132658279636,
"grad_norm": 0.7578805362403562,
"learning_rate": 1.0198454779578853e-05,
"loss": 0.3877,
"step": 3970
},
{
"epoch": 1.628004500357983,
"grad_norm": 0.7490700009059831,
"learning_rate": 1.0168156339948494e-05,
"loss": 0.3779,
"step": 3980
},
{
"epoch": 1.6320957348880025,
"grad_norm": 0.6801784359822746,
"learning_rate": 1.0137857900318136e-05,
"loss": 0.3741,
"step": 3990
},
{
"epoch": 1.6361869694180218,
"grad_norm": 0.6583694408666441,
"learning_rate": 1.0107559460687775e-05,
"loss": 0.3959,
"step": 4000
},
{
"epoch": 1.6361869694180218,
"eval_loss": 0.39864814281463623,
"eval_runtime": 580.5731,
"eval_samples_per_second": 5.319,
"eval_steps_per_second": 0.887,
"step": 4000
},
{
"epoch": 1.6402782039480415,
"grad_norm": 0.9122028487385195,
"learning_rate": 1.0077261021057416e-05,
"loss": 0.3933,
"step": 4010
},
{
"epoch": 1.6443694384780607,
"grad_norm": 0.9631240105647585,
"learning_rate": 1.0046962581427057e-05,
"loss": 0.3886,
"step": 4020
},
{
"epoch": 1.6484606730080802,
"grad_norm": 0.8266805005833362,
"learning_rate": 1.0016664141796698e-05,
"loss": 0.3878,
"step": 4030
},
{
"epoch": 1.6525519075380997,
"grad_norm": 0.692721855989811,
"learning_rate": 9.98636570216634e-06,
"loss": 0.3732,
"step": 4040
},
{
"epoch": 1.656643142068119,
"grad_norm": 0.5908823120184169,
"learning_rate": 9.95606726253598e-06,
"loss": 0.3808,
"step": 4050
},
{
"epoch": 1.6607343765981386,
"grad_norm": 0.9905003235273305,
"learning_rate": 9.925768822905622e-06,
"loss": 0.3807,
"step": 4060
},
{
"epoch": 1.6648256111281579,
"grad_norm": 0.731190147784246,
"learning_rate": 9.895470383275261e-06,
"loss": 0.3826,
"step": 4070
},
{
"epoch": 1.6689168456581773,
"grad_norm": 0.5342009151761759,
"learning_rate": 9.865171943644904e-06,
"loss": 0.3931,
"step": 4080
},
{
"epoch": 1.6730080801881968,
"grad_norm": 0.6299401650866043,
"learning_rate": 9.834873504014544e-06,
"loss": 0.3655,
"step": 4090
},
{
"epoch": 1.6770993147182163,
"grad_norm": 0.6634845001555149,
"learning_rate": 9.804575064384185e-06,
"loss": 0.3857,
"step": 4100
},
{
"epoch": 1.6811905492482357,
"grad_norm": 0.47553645688690244,
"learning_rate": 9.774276624753826e-06,
"loss": 0.3819,
"step": 4110
},
{
"epoch": 1.685281783778255,
"grad_norm": 0.946430943640409,
"learning_rate": 9.743978185123467e-06,
"loss": 0.3761,
"step": 4120
},
{
"epoch": 1.6893730183082747,
"grad_norm": 0.9939471324083836,
"learning_rate": 9.713679745493108e-06,
"loss": 0.388,
"step": 4130
},
{
"epoch": 1.693464252838294,
"grad_norm": 0.6474361686497936,
"learning_rate": 9.68338130586275e-06,
"loss": 0.3662,
"step": 4140
},
{
"epoch": 1.6975554873683134,
"grad_norm": 0.7980334202517952,
"learning_rate": 9.653082866232389e-06,
"loss": 0.3849,
"step": 4150
},
{
"epoch": 1.7016467218983329,
"grad_norm": 0.728268244955796,
"learning_rate": 9.622784426602032e-06,
"loss": 0.3842,
"step": 4160
},
{
"epoch": 1.7057379564283521,
"grad_norm": 0.9131018127080367,
"learning_rate": 9.592485986971671e-06,
"loss": 0.3824,
"step": 4170
},
{
"epoch": 1.7098291909583718,
"grad_norm": 0.7674900891840208,
"learning_rate": 9.562187547341312e-06,
"loss": 0.3733,
"step": 4180
},
{
"epoch": 1.713920425488391,
"grad_norm": 0.9246508059789594,
"learning_rate": 9.531889107710953e-06,
"loss": 0.4029,
"step": 4190
},
{
"epoch": 1.7180116600184105,
"grad_norm": 0.6178369369789531,
"learning_rate": 9.501590668080595e-06,
"loss": 0.37,
"step": 4200
},
{
"epoch": 1.72210289454843,
"grad_norm": 0.9227660541364927,
"learning_rate": 9.471292228450236e-06,
"loss": 0.3841,
"step": 4210
},
{
"epoch": 1.7261941290784493,
"grad_norm": 0.6860127218022202,
"learning_rate": 9.440993788819877e-06,
"loss": 0.3859,
"step": 4220
},
{
"epoch": 1.730285363608469,
"grad_norm": 0.6968645833002234,
"learning_rate": 9.410695349189516e-06,
"loss": 0.4007,
"step": 4230
},
{
"epoch": 1.7343765981384882,
"grad_norm": 0.7545598370400031,
"learning_rate": 9.38039690955916e-06,
"loss": 0.3859,
"step": 4240
},
{
"epoch": 1.7384678326685077,
"grad_norm": 0.7363763263567308,
"learning_rate": 9.350098469928799e-06,
"loss": 0.3867,
"step": 4250
},
{
"epoch": 1.7425590671985272,
"grad_norm": 0.48614259001613647,
"learning_rate": 9.31980003029844e-06,
"loss": 0.3798,
"step": 4260
},
{
"epoch": 1.7466503017285466,
"grad_norm": 0.7729758585437992,
"learning_rate": 9.289501590668083e-06,
"loss": 0.3912,
"step": 4270
},
{
"epoch": 1.750741536258566,
"grad_norm": 0.7305646510615461,
"learning_rate": 9.259203151037722e-06,
"loss": 0.385,
"step": 4280
},
{
"epoch": 1.7548327707885853,
"grad_norm": 0.9399629054581843,
"learning_rate": 9.228904711407363e-06,
"loss": 0.3875,
"step": 4290
},
{
"epoch": 1.758924005318605,
"grad_norm": 0.49789660727149787,
"learning_rate": 9.198606271777004e-06,
"loss": 0.3886,
"step": 4300
},
{
"epoch": 1.7630152398486243,
"grad_norm": 0.8040334242726411,
"learning_rate": 9.168307832146646e-06,
"loss": 0.3854,
"step": 4310
},
{
"epoch": 1.7671064743786438,
"grad_norm": 0.520719829405063,
"learning_rate": 9.138009392516287e-06,
"loss": 0.3717,
"step": 4320
},
{
"epoch": 1.7711977089086632,
"grad_norm": 1.0848704176681172,
"learning_rate": 9.107710952885928e-06,
"loss": 0.3888,
"step": 4330
},
{
"epoch": 1.7752889434386825,
"grad_norm": 1.1159019237173737,
"learning_rate": 9.077412513255567e-06,
"loss": 0.3773,
"step": 4340
},
{
"epoch": 1.7793801779687022,
"grad_norm": 0.45334318412994085,
"learning_rate": 9.04711407362521e-06,
"loss": 0.3795,
"step": 4350
},
{
"epoch": 1.7834714124987214,
"grad_norm": 0.6934540742121007,
"learning_rate": 9.01681563399485e-06,
"loss": 0.3796,
"step": 4360
},
{
"epoch": 1.787562647028741,
"grad_norm": 0.5578670479631883,
"learning_rate": 8.98651719436449e-06,
"loss": 0.3845,
"step": 4370
},
{
"epoch": 1.7916538815587604,
"grad_norm": 0.7503759650869845,
"learning_rate": 8.956218754734132e-06,
"loss": 0.3942,
"step": 4380
},
{
"epoch": 1.7957451160887796,
"grad_norm": 0.867971089595999,
"learning_rate": 8.925920315103773e-06,
"loss": 0.3862,
"step": 4390
},
{
"epoch": 1.7998363506187993,
"grad_norm": 0.8400133056337923,
"learning_rate": 8.895621875473414e-06,
"loss": 0.4045,
"step": 4400
},
{
"epoch": 1.8039275851488186,
"grad_norm": 0.8593095780014482,
"learning_rate": 8.865323435843055e-06,
"loss": 0.3972,
"step": 4410
},
{
"epoch": 1.808018819678838,
"grad_norm": 0.8648015436921023,
"learning_rate": 8.835024996212695e-06,
"loss": 0.3885,
"step": 4420
},
{
"epoch": 1.8121100542088575,
"grad_norm": 0.8474119696284984,
"learning_rate": 8.804726556582338e-06,
"loss": 0.3797,
"step": 4430
},
{
"epoch": 1.816201288738877,
"grad_norm": 0.6888945197943561,
"learning_rate": 8.774428116951977e-06,
"loss": 0.386,
"step": 4440
},
{
"epoch": 1.8202925232688965,
"grad_norm": 0.8152832539209761,
"learning_rate": 8.744129677321618e-06,
"loss": 0.3903,
"step": 4450
},
{
"epoch": 1.8243837577989157,
"grad_norm": 0.7728650210318904,
"learning_rate": 8.71383123769126e-06,
"loss": 0.3919,
"step": 4460
},
{
"epoch": 1.8284749923289354,
"grad_norm": 0.5391104409638143,
"learning_rate": 8.6835327980609e-06,
"loss": 0.3803,
"step": 4470
},
{
"epoch": 1.8325662268589546,
"grad_norm": 0.7567717191116343,
"learning_rate": 8.653234358430542e-06,
"loss": 0.3815,
"step": 4480
},
{
"epoch": 1.8366574613889741,
"grad_norm": 0.8749818376847743,
"learning_rate": 8.622935918800183e-06,
"loss": 0.3883,
"step": 4490
},
{
"epoch": 1.8407486959189936,
"grad_norm": 0.8019778003533239,
"learning_rate": 8.592637479169822e-06,
"loss": 0.383,
"step": 4500
},
{
"epoch": 1.8407486959189936,
"eval_loss": 0.3955570459365845,
"eval_runtime": 568.3755,
"eval_samples_per_second": 5.433,
"eval_steps_per_second": 0.906,
"step": 4500
},
{
"epoch": 1.8448399304490128,
"grad_norm": 0.6293625351988416,
"learning_rate": 8.562339039539465e-06,
"loss": 0.387,
"step": 4510
},
{
"epoch": 1.8489311649790325,
"grad_norm": 0.6963219813926529,
"learning_rate": 8.532040599909105e-06,
"loss": 0.3802,
"step": 4520
},
{
"epoch": 1.8530223995090518,
"grad_norm": 0.6929306602792736,
"learning_rate": 8.501742160278746e-06,
"loss": 0.3797,
"step": 4530
},
{
"epoch": 1.8571136340390713,
"grad_norm": 0.669690717256871,
"learning_rate": 8.471443720648387e-06,
"loss": 0.394,
"step": 4540
},
{
"epoch": 1.8612048685690907,
"grad_norm": 0.6233392743914433,
"learning_rate": 8.441145281018028e-06,
"loss": 0.3896,
"step": 4550
},
{
"epoch": 1.8652961030991102,
"grad_norm": 0.7239502206719675,
"learning_rate": 8.41084684138767e-06,
"loss": 0.3947,
"step": 4560
},
{
"epoch": 1.8693873376291297,
"grad_norm": 0.6756956090605876,
"learning_rate": 8.38054840175731e-06,
"loss": 0.3925,
"step": 4570
},
{
"epoch": 1.873478572159149,
"grad_norm": 0.7446762514738522,
"learning_rate": 8.35024996212695e-06,
"loss": 0.3833,
"step": 4580
},
{
"epoch": 1.8775698066891686,
"grad_norm": 0.7701766571709215,
"learning_rate": 8.319951522496593e-06,
"loss": 0.3864,
"step": 4590
},
{
"epoch": 1.8816610412191879,
"grad_norm": 0.6441859481438332,
"learning_rate": 8.289653082866234e-06,
"loss": 0.3776,
"step": 4600
},
{
"epoch": 1.8857522757492073,
"grad_norm": 0.7880686132167092,
"learning_rate": 8.259354643235873e-06,
"loss": 0.3933,
"step": 4610
},
{
"epoch": 1.8898435102792268,
"grad_norm": 0.6332952763922544,
"learning_rate": 8.229056203605516e-06,
"loss": 0.3853,
"step": 4620
},
{
"epoch": 1.893934744809246,
"grad_norm": 0.7243342267131859,
"learning_rate": 8.198757763975156e-06,
"loss": 0.3792,
"step": 4630
},
{
"epoch": 1.8980259793392658,
"grad_norm": 0.7901459850102072,
"learning_rate": 8.168459324344797e-06,
"loss": 0.3869,
"step": 4640
},
{
"epoch": 1.902117213869285,
"grad_norm": 0.51044070307818,
"learning_rate": 8.138160884714438e-06,
"loss": 0.3775,
"step": 4650
},
{
"epoch": 1.9062084483993045,
"grad_norm": 0.8558077236451173,
"learning_rate": 8.107862445084079e-06,
"loss": 0.3894,
"step": 4660
},
{
"epoch": 1.910299682929324,
"grad_norm": 0.7131026847220464,
"learning_rate": 8.07756400545372e-06,
"loss": 0.3862,
"step": 4670
},
{
"epoch": 1.9143909174593432,
"grad_norm": 0.9750476025242552,
"learning_rate": 8.047265565823361e-06,
"loss": 0.3957,
"step": 4680
},
{
"epoch": 1.918482151989363,
"grad_norm": 0.7225249415951167,
"learning_rate": 8.016967126193e-06,
"loss": 0.4027,
"step": 4690
},
{
"epoch": 1.9225733865193821,
"grad_norm": 0.5891488270484448,
"learning_rate": 7.986668686562644e-06,
"loss": 0.3886,
"step": 4700
},
{
"epoch": 1.9266646210494016,
"grad_norm": 1.0628090466492606,
"learning_rate": 7.956370246932283e-06,
"loss": 0.398,
"step": 4710
},
{
"epoch": 1.930755855579421,
"grad_norm": 0.6647239527835125,
"learning_rate": 7.926071807301924e-06,
"loss": 0.3918,
"step": 4720
},
{
"epoch": 1.9348470901094406,
"grad_norm": 0.7463185047324199,
"learning_rate": 7.895773367671565e-06,
"loss": 0.3827,
"step": 4730
},
{
"epoch": 1.93893832463946,
"grad_norm": 0.7195015094736638,
"learning_rate": 7.865474928041206e-06,
"loss": 0.4016,
"step": 4740
},
{
"epoch": 1.9430295591694793,
"grad_norm": 0.8535138221139457,
"learning_rate": 7.835176488410848e-06,
"loss": 0.3869,
"step": 4750
},
{
"epoch": 1.947120793699499,
"grad_norm": 0.7791996740996957,
"learning_rate": 7.804878048780489e-06,
"loss": 0.3714,
"step": 4760
},
{
"epoch": 1.9512120282295182,
"grad_norm": 0.5772248432427366,
"learning_rate": 7.77457960915013e-06,
"loss": 0.3822,
"step": 4770
},
{
"epoch": 1.9553032627595377,
"grad_norm": 0.7604443317770228,
"learning_rate": 7.744281169519771e-06,
"loss": 0.379,
"step": 4780
},
{
"epoch": 1.9593944972895572,
"grad_norm": 0.588549852538312,
"learning_rate": 7.71398272988941e-06,
"loss": 0.3814,
"step": 4790
},
{
"epoch": 1.9634857318195764,
"grad_norm": 1.0609622015863114,
"learning_rate": 7.683684290259052e-06,
"loss": 0.3844,
"step": 4800
},
{
"epoch": 1.9675769663495961,
"grad_norm": 0.8833450757521427,
"learning_rate": 7.653385850628693e-06,
"loss": 0.3941,
"step": 4810
},
{
"epoch": 1.9716682008796154,
"grad_norm": 0.6612140126470812,
"learning_rate": 7.623087410998334e-06,
"loss": 0.3805,
"step": 4820
},
{
"epoch": 1.9757594354096348,
"grad_norm": 0.653419144919706,
"learning_rate": 7.592788971367975e-06,
"loss": 0.3666,
"step": 4830
},
{
"epoch": 1.9798506699396543,
"grad_norm": 0.6797670566745962,
"learning_rate": 7.562490531737616e-06,
"loss": 0.3742,
"step": 4840
},
{
"epoch": 1.9839419044696738,
"grad_norm": 0.6644531103160585,
"learning_rate": 7.5321920921072566e-06,
"loss": 0.3842,
"step": 4850
},
{
"epoch": 1.9880331389996933,
"grad_norm": 1.2106290360521903,
"learning_rate": 7.5018936524768986e-06,
"loss": 0.3911,
"step": 4860
},
{
"epoch": 1.9921243735297125,
"grad_norm": 0.5569971531476807,
"learning_rate": 7.471595212846539e-06,
"loss": 0.4069,
"step": 4870
},
{
"epoch": 1.996215608059732,
"grad_norm": 0.5191981517163119,
"learning_rate": 7.44129677321618e-06,
"loss": 0.3763,
"step": 4880
},
{
"epoch": 2.0,
"grad_norm": 1.3233158209509672,
"learning_rate": 7.41099833358582e-06,
"loss": 0.368,
"step": 4890
},
{
"epoch": 2.0040912345300192,
"grad_norm": 0.6324023695845297,
"learning_rate": 7.380699893955462e-06,
"loss": 0.3502,
"step": 4900
},
{
"epoch": 2.008182469060039,
"grad_norm": 0.6060365694093942,
"learning_rate": 7.350401454325103e-06,
"loss": 0.3634,
"step": 4910
},
{
"epoch": 2.012273703590058,
"grad_norm": 0.9148807418922357,
"learning_rate": 7.320103014694744e-06,
"loss": 0.3799,
"step": 4920
},
{
"epoch": 2.016364938120078,
"grad_norm": 0.5414356512353191,
"learning_rate": 7.289804575064385e-06,
"loss": 0.3637,
"step": 4930
},
{
"epoch": 2.020456172650097,
"grad_norm": 0.9191131346375879,
"learning_rate": 7.259506135434026e-06,
"loss": 0.3502,
"step": 4940
},
{
"epoch": 2.0245474071801164,
"grad_norm": 0.5288098199455225,
"learning_rate": 7.229207695803667e-06,
"loss": 0.3594,
"step": 4950
},
{
"epoch": 2.028638641710136,
"grad_norm": 0.6644017187035499,
"learning_rate": 7.1989092561733075e-06,
"loss": 0.3566,
"step": 4960
},
{
"epoch": 2.0327298762401553,
"grad_norm": 0.7063344184257762,
"learning_rate": 7.1686108165429495e-06,
"loss": 0.3729,
"step": 4970
},
{
"epoch": 2.036821110770175,
"grad_norm": 0.716588405815958,
"learning_rate": 7.13831237691259e-06,
"loss": 0.3632,
"step": 4980
},
{
"epoch": 2.0409123453001943,
"grad_norm": 0.780104450356012,
"learning_rate": 7.108013937282231e-06,
"loss": 0.3503,
"step": 4990
},
{
"epoch": 2.045003579830214,
"grad_norm": 0.9214206047619449,
"learning_rate": 7.077715497651871e-06,
"loss": 0.3583,
"step": 5000
},
{
"epoch": 2.045003579830214,
"eval_loss": 0.40945789217948914,
"eval_runtime": 566.8305,
"eval_samples_per_second": 5.448,
"eval_steps_per_second": 0.909,
"step": 5000
},
{
"epoch": 2.049094814360233,
"grad_norm": 0.8803623081885309,
"learning_rate": 7.047417058021513e-06,
"loss": 0.3524,
"step": 5010
},
{
"epoch": 2.0531860488902525,
"grad_norm": 0.8089956621396417,
"learning_rate": 7.0171186183911536e-06,
"loss": 0.3423,
"step": 5020
},
{
"epoch": 2.057277283420272,
"grad_norm": 1.026614900390396,
"learning_rate": 6.986820178760795e-06,
"loss": 0.3612,
"step": 5030
},
{
"epoch": 2.0613685179502914,
"grad_norm": 0.8153155935459168,
"learning_rate": 6.956521739130435e-06,
"loss": 0.3619,
"step": 5040
},
{
"epoch": 2.065459752480311,
"grad_norm": 0.931239718869365,
"learning_rate": 6.926223299500077e-06,
"loss": 0.3475,
"step": 5050
},
{
"epoch": 2.0695509870103304,
"grad_norm": 1.0467750800215039,
"learning_rate": 6.895924859869717e-06,
"loss": 0.3535,
"step": 5060
},
{
"epoch": 2.0736422215403496,
"grad_norm": 0.9059104329232058,
"learning_rate": 6.8656264202393585e-06,
"loss": 0.3697,
"step": 5070
},
{
"epoch": 2.0777334560703693,
"grad_norm": 0.9807695952493246,
"learning_rate": 6.835327980608999e-06,
"loss": 0.3465,
"step": 5080
},
{
"epoch": 2.0818246906003885,
"grad_norm": 0.8912366875608871,
"learning_rate": 6.805029540978641e-06,
"loss": 0.3423,
"step": 5090
},
{
"epoch": 2.0859159251304082,
"grad_norm": 0.7770463857589687,
"learning_rate": 6.774731101348281e-06,
"loss": 0.3514,
"step": 5100
},
{
"epoch": 2.0900071596604275,
"grad_norm": 0.9470302336462076,
"learning_rate": 6.744432661717922e-06,
"loss": 0.3282,
"step": 5110
},
{
"epoch": 2.094098394190447,
"grad_norm": 0.9649585745676128,
"learning_rate": 6.7141342220875625e-06,
"loss": 0.3525,
"step": 5120
},
{
"epoch": 2.0981896287204664,
"grad_norm": 0.9294501847855035,
"learning_rate": 6.6838357824572045e-06,
"loss": 0.3568,
"step": 5130
},
{
"epoch": 2.1022808632504857,
"grad_norm": 0.7971412344974254,
"learning_rate": 6.653537342826845e-06,
"loss": 0.3649,
"step": 5140
},
{
"epoch": 2.1063720977805054,
"grad_norm": 0.9239690361706143,
"learning_rate": 6.623238903196486e-06,
"loss": 0.3671,
"step": 5150
},
{
"epoch": 2.1104633323105246,
"grad_norm": 0.792788321441495,
"learning_rate": 6.592940463566126e-06,
"loss": 0.3696,
"step": 5160
},
{
"epoch": 2.1145545668405443,
"grad_norm": 0.96794000584284,
"learning_rate": 6.562642023935768e-06,
"loss": 0.3526,
"step": 5170
},
{
"epoch": 2.1186458013705636,
"grad_norm": 0.9461764596400418,
"learning_rate": 6.5323435843054086e-06,
"loss": 0.3481,
"step": 5180
},
{
"epoch": 2.122737035900583,
"grad_norm": 0.9746653976507995,
"learning_rate": 6.50204514467505e-06,
"loss": 0.3641,
"step": 5190
},
{
"epoch": 2.1268282704306025,
"grad_norm": 0.9134747859457668,
"learning_rate": 6.47174670504469e-06,
"loss": 0.3477,
"step": 5200
},
{
"epoch": 2.1309195049606218,
"grad_norm": 0.9422179764228524,
"learning_rate": 6.441448265414332e-06,
"loss": 0.3538,
"step": 5210
},
{
"epoch": 2.1350107394906415,
"grad_norm": 0.9768549519399179,
"learning_rate": 6.411149825783972e-06,
"loss": 0.36,
"step": 5220
},
{
"epoch": 2.1391019740206607,
"grad_norm": 0.8493532914358172,
"learning_rate": 6.3808513861536135e-06,
"loss": 0.3551,
"step": 5230
},
{
"epoch": 2.14319320855068,
"grad_norm": 0.850886523603342,
"learning_rate": 6.350552946523254e-06,
"loss": 0.3497,
"step": 5240
},
{
"epoch": 2.1472844430806997,
"grad_norm": 1.192315078529062,
"learning_rate": 6.320254506892896e-06,
"loss": 0.3618,
"step": 5250
},
{
"epoch": 2.151375677610719,
"grad_norm": 0.8946571568288144,
"learning_rate": 6.289956067262537e-06,
"loss": 0.3458,
"step": 5260
},
{
"epoch": 2.1554669121407386,
"grad_norm": 0.8759600168097015,
"learning_rate": 6.259657627632177e-06,
"loss": 0.3681,
"step": 5270
},
{
"epoch": 2.159558146670758,
"grad_norm": 0.8486779868515286,
"learning_rate": 6.229359188001819e-06,
"loss": 0.3674,
"step": 5280
},
{
"epoch": 2.163649381200777,
"grad_norm": 0.9573497878508632,
"learning_rate": 6.1990607483714595e-06,
"loss": 0.3515,
"step": 5290
},
{
"epoch": 2.167740615730797,
"grad_norm": 1.0608696592581122,
"learning_rate": 6.168762308741101e-06,
"loss": 0.3575,
"step": 5300
},
{
"epoch": 2.171831850260816,
"grad_norm": 0.9550842012101914,
"learning_rate": 6.138463869110741e-06,
"loss": 0.3591,
"step": 5310
},
{
"epoch": 2.1759230847908357,
"grad_norm": 0.7852881818968737,
"learning_rate": 6.108165429480383e-06,
"loss": 0.3609,
"step": 5320
},
{
"epoch": 2.180014319320855,
"grad_norm": 1.1566940002899782,
"learning_rate": 6.077866989850023e-06,
"loss": 0.3407,
"step": 5330
},
{
"epoch": 2.1841055538508747,
"grad_norm": 0.8078702927449839,
"learning_rate": 6.047568550219664e-06,
"loss": 0.3486,
"step": 5340
},
{
"epoch": 2.188196788380894,
"grad_norm": 0.9239475839958554,
"learning_rate": 6.017270110589305e-06,
"loss": 0.3503,
"step": 5350
},
{
"epoch": 2.192288022910913,
"grad_norm": 0.9902787811712669,
"learning_rate": 5.986971670958947e-06,
"loss": 0.3616,
"step": 5360
},
{
"epoch": 2.196379257440933,
"grad_norm": 0.993917561828579,
"learning_rate": 5.956673231328587e-06,
"loss": 0.3564,
"step": 5370
},
{
"epoch": 2.200470491970952,
"grad_norm": 0.9912437866525817,
"learning_rate": 5.926374791698228e-06,
"loss": 0.3515,
"step": 5380
},
{
"epoch": 2.204561726500972,
"grad_norm": 0.9063774423655953,
"learning_rate": 5.8960763520678685e-06,
"loss": 0.3597,
"step": 5390
},
{
"epoch": 2.208652961030991,
"grad_norm": 1.2314911545390765,
"learning_rate": 5.8657779124375105e-06,
"loss": 0.343,
"step": 5400
},
{
"epoch": 2.2127441955610108,
"grad_norm": 0.9949853000873754,
"learning_rate": 5.835479472807151e-06,
"loss": 0.3514,
"step": 5410
},
{
"epoch": 2.21683543009103,
"grad_norm": 1.000104375303856,
"learning_rate": 5.805181033176792e-06,
"loss": 0.3638,
"step": 5420
},
{
"epoch": 2.2209266646210493,
"grad_norm": 1.128813445069188,
"learning_rate": 5.774882593546432e-06,
"loss": 0.3656,
"step": 5430
},
{
"epoch": 2.225017899151069,
"grad_norm": 0.9792714948138356,
"learning_rate": 5.744584153916074e-06,
"loss": 0.3536,
"step": 5440
},
{
"epoch": 2.229109133681088,
"grad_norm": 1.0987526220359756,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.3721,
"step": 5450
},
{
"epoch": 2.233200368211108,
"grad_norm": 0.8952391612311649,
"learning_rate": 5.683987274655356e-06,
"loss": 0.354,
"step": 5460
},
{
"epoch": 2.237291602741127,
"grad_norm": 1.1608074782040154,
"learning_rate": 5.653688835024996e-06,
"loss": 0.3561,
"step": 5470
},
{
"epoch": 2.2413828372711464,
"grad_norm": 1.003805233436142,
"learning_rate": 5.623390395394638e-06,
"loss": 0.3551,
"step": 5480
},
{
"epoch": 2.245474071801166,
"grad_norm": 0.8422927034761718,
"learning_rate": 5.593091955764278e-06,
"loss": 0.3467,
"step": 5490
},
{
"epoch": 2.2495653063311853,
"grad_norm": 1.034912305431081,
"learning_rate": 5.562793516133919e-06,
"loss": 0.3762,
"step": 5500
},
{
"epoch": 2.2495653063311853,
"eval_loss": 0.4107515215873718,
"eval_runtime": 566.5709,
"eval_samples_per_second": 5.45,
"eval_steps_per_second": 0.909,
"step": 5500
},
{
"epoch": 2.253656540861205,
"grad_norm": 0.9665919628552063,
"learning_rate": 5.53249507650356e-06,
"loss": 0.3683,
"step": 5510
},
{
"epoch": 2.2577477753912243,
"grad_norm": 1.1208116392163985,
"learning_rate": 5.502196636873202e-06,
"loss": 0.3492,
"step": 5520
},
{
"epoch": 2.2618390099212435,
"grad_norm": 0.8340800205831346,
"learning_rate": 5.471898197242842e-06,
"loss": 0.3372,
"step": 5530
},
{
"epoch": 2.2659302444512632,
"grad_norm": 0.9589224290151471,
"learning_rate": 5.441599757612483e-06,
"loss": 0.3478,
"step": 5540
},
{
"epoch": 2.2700214789812825,
"grad_norm": 1.015598680602829,
"learning_rate": 5.4113013179821235e-06,
"loss": 0.3607,
"step": 5550
},
{
"epoch": 2.274112713511302,
"grad_norm": 0.9746676080601815,
"learning_rate": 5.3810028783517655e-06,
"loss": 0.3611,
"step": 5560
},
{
"epoch": 2.2782039480413214,
"grad_norm": 0.9296538878972167,
"learning_rate": 5.350704438721406e-06,
"loss": 0.3576,
"step": 5570
},
{
"epoch": 2.2822951825713407,
"grad_norm": 0.8789512395616204,
"learning_rate": 5.320405999091047e-06,
"loss": 0.3629,
"step": 5580
},
{
"epoch": 2.2863864171013604,
"grad_norm": 1.0582855408609264,
"learning_rate": 5.290107559460688e-06,
"loss": 0.3634,
"step": 5590
},
{
"epoch": 2.2904776516313796,
"grad_norm": 0.9546020518135027,
"learning_rate": 5.259809119830329e-06,
"loss": 0.3586,
"step": 5600
},
{
"epoch": 2.2945688861613993,
"grad_norm": 0.962752971888759,
"learning_rate": 5.22951068019997e-06,
"loss": 0.3605,
"step": 5610
},
{
"epoch": 2.2986601206914186,
"grad_norm": 1.0245121086717093,
"learning_rate": 5.199212240569611e-06,
"loss": 0.3587,
"step": 5620
},
{
"epoch": 2.3027513552214383,
"grad_norm": 1.0040046723513294,
"learning_rate": 5.168913800939253e-06,
"loss": 0.3648,
"step": 5630
},
{
"epoch": 2.3068425897514575,
"grad_norm": 1.171908094587368,
"learning_rate": 5.138615361308893e-06,
"loss": 0.3592,
"step": 5640
},
{
"epoch": 2.3109338242814768,
"grad_norm": 1.011025217862205,
"learning_rate": 5.108316921678534e-06,
"loss": 0.3442,
"step": 5650
},
{
"epoch": 2.3150250588114965,
"grad_norm": 1.1859171170651996,
"learning_rate": 5.078018482048174e-06,
"loss": 0.3633,
"step": 5660
},
{
"epoch": 2.3191162933415157,
"grad_norm": 1.0924583307810276,
"learning_rate": 5.047720042417816e-06,
"loss": 0.3532,
"step": 5670
},
{
"epoch": 2.3232075278715354,
"grad_norm": 1.167947573027158,
"learning_rate": 5.017421602787457e-06,
"loss": 0.3604,
"step": 5680
},
{
"epoch": 2.3272987624015546,
"grad_norm": 1.2875926245828595,
"learning_rate": 4.987123163157098e-06,
"loss": 0.3429,
"step": 5690
},
{
"epoch": 2.3313899969315743,
"grad_norm": 1.156830068761064,
"learning_rate": 4.956824723526739e-06,
"loss": 0.3434,
"step": 5700
},
{
"epoch": 2.3354812314615936,
"grad_norm": 0.9404794900229477,
"learning_rate": 4.926526283896379e-06,
"loss": 0.3399,
"step": 5710
},
{
"epoch": 2.339572465991613,
"grad_norm": 1.116557440039318,
"learning_rate": 4.8962278442660205e-06,
"loss": 0.3584,
"step": 5720
},
{
"epoch": 2.3436637005216325,
"grad_norm": 1.2190130786864881,
"learning_rate": 4.865929404635662e-06,
"loss": 0.3701,
"step": 5730
},
{
"epoch": 2.347754935051652,
"grad_norm": 1.2099886678141103,
"learning_rate": 4.835630965005303e-06,
"loss": 0.3675,
"step": 5740
},
{
"epoch": 2.3518461695816715,
"grad_norm": 1.1364465919872204,
"learning_rate": 4.805332525374943e-06,
"loss": 0.3608,
"step": 5750
},
{
"epoch": 2.3559374041116907,
"grad_norm": 0.9321036699083428,
"learning_rate": 4.775034085744584e-06,
"loss": 0.3496,
"step": 5760
},
{
"epoch": 2.36002863864171,
"grad_norm": 0.9146634319851078,
"learning_rate": 4.744735646114225e-06,
"loss": 0.3502,
"step": 5770
},
{
"epoch": 2.3641198731717297,
"grad_norm": 1.000647504917848,
"learning_rate": 4.7144372064838665e-06,
"loss": 0.3544,
"step": 5780
},
{
"epoch": 2.368211107701749,
"grad_norm": 1.1557436169214348,
"learning_rate": 4.684138766853508e-06,
"loss": 0.3518,
"step": 5790
},
{
"epoch": 2.3723023422317686,
"grad_norm": 1.2926612320528568,
"learning_rate": 4.653840327223149e-06,
"loss": 0.3486,
"step": 5800
},
{
"epoch": 2.376393576761788,
"grad_norm": 1.2169663560603732,
"learning_rate": 4.62354188759279e-06,
"loss": 0.3539,
"step": 5810
},
{
"epoch": 2.380484811291807,
"grad_norm": 0.9319379770517142,
"learning_rate": 4.59324344796243e-06,
"loss": 0.328,
"step": 5820
},
{
"epoch": 2.384576045821827,
"grad_norm": 1.0263622334503117,
"learning_rate": 4.562945008332071e-06,
"loss": 0.3536,
"step": 5830
},
{
"epoch": 2.388667280351846,
"grad_norm": 1.152645840965358,
"learning_rate": 4.5326465687017126e-06,
"loss": 0.3536,
"step": 5840
},
{
"epoch": 2.3927585148818658,
"grad_norm": 0.9658502725314514,
"learning_rate": 4.502348129071354e-06,
"loss": 0.3548,
"step": 5850
},
{
"epoch": 2.396849749411885,
"grad_norm": 1.2197202047381088,
"learning_rate": 4.472049689440994e-06,
"loss": 0.3559,
"step": 5860
},
{
"epoch": 2.4009409839419042,
"grad_norm": 1.2765015130610389,
"learning_rate": 4.441751249810635e-06,
"loss": 0.3497,
"step": 5870
},
{
"epoch": 2.405032218471924,
"grad_norm": 1.38104638770045,
"learning_rate": 4.411452810180276e-06,
"loss": 0.3448,
"step": 5880
},
{
"epoch": 2.409123453001943,
"grad_norm": 1.1400186315945817,
"learning_rate": 4.3811543705499174e-06,
"loss": 0.3561,
"step": 5890
},
{
"epoch": 2.413214687531963,
"grad_norm": 1.332445802283346,
"learning_rate": 4.350855930919558e-06,
"loss": 0.3558,
"step": 5900
},
{
"epoch": 2.417305922061982,
"grad_norm": 0.8518719177973914,
"learning_rate": 4.320557491289199e-06,
"loss": 0.3557,
"step": 5910
},
{
"epoch": 2.4213971565920014,
"grad_norm": 1.3083347016848765,
"learning_rate": 4.29025905165884e-06,
"loss": 0.3443,
"step": 5920
},
{
"epoch": 2.425488391122021,
"grad_norm": 1.2245072085864033,
"learning_rate": 4.259960612028481e-06,
"loss": 0.3424,
"step": 5930
},
{
"epoch": 2.4295796256520403,
"grad_norm": 1.0512105297819025,
"learning_rate": 4.2296621723981215e-06,
"loss": 0.3668,
"step": 5940
},
{
"epoch": 2.43367086018206,
"grad_norm": 1.2181678876630533,
"learning_rate": 4.199363732767763e-06,
"loss": 0.3514,
"step": 5950
},
{
"epoch": 2.4377620947120793,
"grad_norm": 1.2107998993154634,
"learning_rate": 4.169065293137404e-06,
"loss": 0.3634,
"step": 5960
},
{
"epoch": 2.441853329242099,
"grad_norm": 1.162395707185428,
"learning_rate": 4.138766853507045e-06,
"loss": 0.35,
"step": 5970
},
{
"epoch": 2.445944563772118,
"grad_norm": 1.0932545213281812,
"learning_rate": 4.108468413876685e-06,
"loss": 0.3472,
"step": 5980
},
{
"epoch": 2.450035798302138,
"grad_norm": 1.0010475016537828,
"learning_rate": 4.078169974246326e-06,
"loss": 0.3584,
"step": 5990
},
{
"epoch": 2.454127032832157,
"grad_norm": 0.8180158364405802,
"learning_rate": 4.0478715346159675e-06,
"loss": 0.3597,
"step": 6000
},
{
"epoch": 2.454127032832157,
"eval_loss": 0.4111001789569855,
"eval_runtime": 567.1279,
"eval_samples_per_second": 5.445,
"eval_steps_per_second": 0.908,
"step": 6000
},
{
"epoch": 2.4582182673621764,
"grad_norm": 1.152408348113746,
"learning_rate": 4.017573094985609e-06,
"loss": 0.352,
"step": 6010
},
{
"epoch": 2.462309501892196,
"grad_norm": 0.9901037242730653,
"learning_rate": 3.987274655355249e-06,
"loss": 0.3413,
"step": 6020
},
{
"epoch": 2.4664007364222154,
"grad_norm": 1.0744413310129686,
"learning_rate": 3.95697621572489e-06,
"loss": 0.3572,
"step": 6030
},
{
"epoch": 2.470491970952235,
"grad_norm": 1.1304586395449463,
"learning_rate": 3.926677776094531e-06,
"loss": 0.3611,
"step": 6040
},
{
"epoch": 2.4745832054822543,
"grad_norm": 1.1937074296439871,
"learning_rate": 3.8963793364641724e-06,
"loss": 0.351,
"step": 6050
},
{
"epoch": 2.4786744400122735,
"grad_norm": 0.9420890149335949,
"learning_rate": 3.866080896833813e-06,
"loss": 0.3556,
"step": 6060
},
{
"epoch": 2.4827656745422932,
"grad_norm": 1.0885349991773454,
"learning_rate": 3.835782457203454e-06,
"loss": 0.3545,
"step": 6070
},
{
"epoch": 2.4868569090723125,
"grad_norm": 1.076589826454013,
"learning_rate": 3.805484017573095e-06,
"loss": 0.353,
"step": 6080
},
{
"epoch": 2.490948143602332,
"grad_norm": 0.9678457177285116,
"learning_rate": 3.775185577942736e-06,
"loss": 0.3535,
"step": 6090
},
{
"epoch": 2.4950393781323514,
"grad_norm": 1.2568971383029166,
"learning_rate": 3.7448871383123773e-06,
"loss": 0.3535,
"step": 6100
},
{
"epoch": 2.4991306126623707,
"grad_norm": 1.1438602826766222,
"learning_rate": 3.7145886986820185e-06,
"loss": 0.3548,
"step": 6110
},
{
"epoch": 2.5032218471923904,
"grad_norm": 1.1827797517036478,
"learning_rate": 3.6842902590516592e-06,
"loss": 0.3537,
"step": 6120
},
{
"epoch": 2.5073130817224096,
"grad_norm": 1.1604168543066307,
"learning_rate": 3.6539918194213004e-06,
"loss": 0.3443,
"step": 6130
},
{
"epoch": 2.5114043162524293,
"grad_norm": 1.5271313980044559,
"learning_rate": 3.623693379790941e-06,
"loss": 0.3676,
"step": 6140
},
{
"epoch": 2.5154955507824486,
"grad_norm": 1.0170843727726653,
"learning_rate": 3.5933949401605822e-06,
"loss": 0.333,
"step": 6150
},
{
"epoch": 2.519586785312468,
"grad_norm": 1.478391272829513,
"learning_rate": 3.563096500530223e-06,
"loss": 0.3289,
"step": 6160
},
{
"epoch": 2.5236780198424875,
"grad_norm": 1.1327119923685498,
"learning_rate": 3.532798060899864e-06,
"loss": 0.3465,
"step": 6170
},
{
"epoch": 2.5277692543725068,
"grad_norm": 1.2833494932025962,
"learning_rate": 3.502499621269505e-06,
"loss": 0.3603,
"step": 6180
},
{
"epoch": 2.5318604889025265,
"grad_norm": 1.3483786326224019,
"learning_rate": 3.472201181639146e-06,
"loss": 0.3499,
"step": 6190
},
{
"epoch": 2.5359517234325457,
"grad_norm": 1.41685799213282,
"learning_rate": 3.441902742008787e-06,
"loss": 0.3557,
"step": 6200
},
{
"epoch": 2.540042957962565,
"grad_norm": 1.1147817059656389,
"learning_rate": 3.411604302378428e-06,
"loss": 0.3525,
"step": 6210
},
{
"epoch": 2.5441341924925847,
"grad_norm": 1.24256482526061,
"learning_rate": 3.381305862748069e-06,
"loss": 0.3557,
"step": 6220
},
{
"epoch": 2.548225427022604,
"grad_norm": 1.14276520963034,
"learning_rate": 3.3510074231177097e-06,
"loss": 0.3443,
"step": 6230
},
{
"epoch": 2.5523166615526236,
"grad_norm": 1.4308356706747418,
"learning_rate": 3.320708983487351e-06,
"loss": 0.3642,
"step": 6240
},
{
"epoch": 2.556407896082643,
"grad_norm": 1.0603812873827165,
"learning_rate": 3.2904105438569916e-06,
"loss": 0.3507,
"step": 6250
},
{
"epoch": 2.560499130612662,
"grad_norm": 1.1444596493356962,
"learning_rate": 3.2601121042266328e-06,
"loss": 0.3463,
"step": 6260
},
{
"epoch": 2.564590365142682,
"grad_norm": 1.1550573279318823,
"learning_rate": 3.2298136645962735e-06,
"loss": 0.3517,
"step": 6270
},
{
"epoch": 2.5686815996727015,
"grad_norm": 1.2279449599940317,
"learning_rate": 3.1995152249659146e-06,
"loss": 0.3419,
"step": 6280
},
{
"epoch": 2.5727728342027207,
"grad_norm": 1.1531936082950656,
"learning_rate": 3.1692167853355554e-06,
"loss": 0.3553,
"step": 6290
},
{
"epoch": 2.57686406873274,
"grad_norm": 1.1430327649681449,
"learning_rate": 3.1389183457051965e-06,
"loss": 0.3481,
"step": 6300
},
{
"epoch": 2.5809553032627597,
"grad_norm": 1.2985006323282675,
"learning_rate": 3.1086199060748372e-06,
"loss": 0.3486,
"step": 6310
},
{
"epoch": 2.585046537792779,
"grad_norm": 1.1447086789004135,
"learning_rate": 3.0783214664444784e-06,
"loss": 0.3516,
"step": 6320
},
{
"epoch": 2.5891377723227986,
"grad_norm": 1.464516109493368,
"learning_rate": 3.048023026814119e-06,
"loss": 0.3478,
"step": 6330
},
{
"epoch": 2.593229006852818,
"grad_norm": 1.1333941032924868,
"learning_rate": 3.0177245871837603e-06,
"loss": 0.3372,
"step": 6340
},
{
"epoch": 2.597320241382837,
"grad_norm": 1.1610526768921359,
"learning_rate": 2.987426147553401e-06,
"loss": 0.3537,
"step": 6350
},
{
"epoch": 2.601411475912857,
"grad_norm": 1.166132435226011,
"learning_rate": 2.957127707923042e-06,
"loss": 0.3495,
"step": 6360
},
{
"epoch": 2.605502710442876,
"grad_norm": 1.1286755958679484,
"learning_rate": 2.926829268292683e-06,
"loss": 0.3567,
"step": 6370
},
{
"epoch": 2.6095939449728958,
"grad_norm": 1.0343679227746663,
"learning_rate": 2.896530828662324e-06,
"loss": 0.3471,
"step": 6380
},
{
"epoch": 2.613685179502915,
"grad_norm": 1.1778557485063874,
"learning_rate": 2.8662323890319647e-06,
"loss": 0.3549,
"step": 6390
},
{
"epoch": 2.6177764140329343,
"grad_norm": 1.226656877106952,
"learning_rate": 2.835933949401606e-06,
"loss": 0.3496,
"step": 6400
},
{
"epoch": 2.621867648562954,
"grad_norm": 1.3948209564375484,
"learning_rate": 2.8056355097712466e-06,
"loss": 0.3544,
"step": 6410
},
{
"epoch": 2.625958883092973,
"grad_norm": 1.233392087784029,
"learning_rate": 2.7753370701408878e-06,
"loss": 0.34,
"step": 6420
},
{
"epoch": 2.630050117622993,
"grad_norm": 1.2620457074981337,
"learning_rate": 2.7450386305105285e-06,
"loss": 0.3304,
"step": 6430
},
{
"epoch": 2.634141352153012,
"grad_norm": 1.269720329253969,
"learning_rate": 2.71474019088017e-06,
"loss": 0.3589,
"step": 6440
},
{
"epoch": 2.6382325866830314,
"grad_norm": 1.1908184218530549,
"learning_rate": 2.6844417512498112e-06,
"loss": 0.3511,
"step": 6450
},
{
"epoch": 2.642323821213051,
"grad_norm": 1.1293285956281658,
"learning_rate": 2.654143311619452e-06,
"loss": 0.3507,
"step": 6460
},
{
"epoch": 2.6464150557430703,
"grad_norm": 1.3110950677273854,
"learning_rate": 2.623844871989093e-06,
"loss": 0.3433,
"step": 6470
},
{
"epoch": 2.65050629027309,
"grad_norm": 1.4086576967977462,
"learning_rate": 2.593546432358734e-06,
"loss": 0.34,
"step": 6480
},
{
"epoch": 2.6545975248031093,
"grad_norm": 1.34700090210642,
"learning_rate": 2.563247992728375e-06,
"loss": 0.349,
"step": 6490
},
{
"epoch": 2.6586887593331285,
"grad_norm": 1.0282320900902353,
"learning_rate": 2.5329495530980157e-06,
"loss": 0.3601,
"step": 6500
},
{
"epoch": 2.6586887593331285,
"eval_loss": 0.41547685861587524,
"eval_runtime": 565.4311,
"eval_samples_per_second": 5.461,
"eval_steps_per_second": 0.911,
"step": 6500
},
{
"epoch": 2.6627799938631482,
"grad_norm": 1.1250564513049757,
"learning_rate": 2.502651113467657e-06,
"loss": 0.3493,
"step": 6510
},
{
"epoch": 2.6668712283931675,
"grad_norm": 1.1544902670015906,
"learning_rate": 2.4723526738372976e-06,
"loss": 0.356,
"step": 6520
},
{
"epoch": 2.670962462923187,
"grad_norm": 1.6062647614114969,
"learning_rate": 2.4420542342069387e-06,
"loss": 0.3537,
"step": 6530
},
{
"epoch": 2.6750536974532064,
"grad_norm": 1.4055380071214372,
"learning_rate": 2.4117557945765794e-06,
"loss": 0.3589,
"step": 6540
},
{
"epoch": 2.6791449319832257,
"grad_norm": 1.0767099444486827,
"learning_rate": 2.3814573549462206e-06,
"loss": 0.3564,
"step": 6550
},
{
"epoch": 2.6832361665132454,
"grad_norm": 1.0775655234852821,
"learning_rate": 2.3511589153158613e-06,
"loss": 0.3508,
"step": 6560
},
{
"epoch": 2.687327401043265,
"grad_norm": 1.3945094308830326,
"learning_rate": 2.3208604756855025e-06,
"loss": 0.3462,
"step": 6570
},
{
"epoch": 2.6914186355732843,
"grad_norm": 1.4325900380219518,
"learning_rate": 2.290562036055143e-06,
"loss": 0.34,
"step": 6580
},
{
"epoch": 2.6955098701033036,
"grad_norm": 1.0603337925865337,
"learning_rate": 2.2602635964247843e-06,
"loss": 0.341,
"step": 6590
},
{
"epoch": 2.6996011046333233,
"grad_norm": 1.1150418643447688,
"learning_rate": 2.229965156794425e-06,
"loss": 0.3427,
"step": 6600
},
{
"epoch": 2.7036923391633425,
"grad_norm": 1.5841244349700083,
"learning_rate": 2.199666717164066e-06,
"loss": 0.3446,
"step": 6610
},
{
"epoch": 2.707783573693362,
"grad_norm": 1.1978695972855773,
"learning_rate": 2.1693682775337074e-06,
"loss": 0.3447,
"step": 6620
},
{
"epoch": 2.7118748082233815,
"grad_norm": 1.0752732100706306,
"learning_rate": 2.139069837903348e-06,
"loss": 0.3338,
"step": 6630
},
{
"epoch": 2.7159660427534007,
"grad_norm": 1.2105228186701131,
"learning_rate": 2.1087713982729892e-06,
"loss": 0.3352,
"step": 6640
},
{
"epoch": 2.7200572772834204,
"grad_norm": 1.1393844873051664,
"learning_rate": 2.07847295864263e-06,
"loss": 0.3531,
"step": 6650
},
{
"epoch": 2.7241485118134396,
"grad_norm": 1.0238817342799573,
"learning_rate": 2.048174519012271e-06,
"loss": 0.3528,
"step": 6660
},
{
"epoch": 2.7282397463434593,
"grad_norm": 1.1816844502503707,
"learning_rate": 2.017876079381912e-06,
"loss": 0.3512,
"step": 6670
},
{
"epoch": 2.7323309808734786,
"grad_norm": 1.305926419960111,
"learning_rate": 1.987577639751553e-06,
"loss": 0.3442,
"step": 6680
},
{
"epoch": 2.736422215403498,
"grad_norm": 1.4261243840727893,
"learning_rate": 1.957279200121194e-06,
"loss": 0.3455,
"step": 6690
},
{
"epoch": 2.7405134499335175,
"grad_norm": 1.6388906420384781,
"learning_rate": 1.926980760490835e-06,
"loss": 0.3535,
"step": 6700
},
{
"epoch": 2.744604684463537,
"grad_norm": 1.5426677425739872,
"learning_rate": 1.896682320860476e-06,
"loss": 0.3475,
"step": 6710
},
{
"epoch": 2.7486959189935565,
"grad_norm": 1.1578285273120823,
"learning_rate": 1.866383881230117e-06,
"loss": 0.3434,
"step": 6720
},
{
"epoch": 2.7527871535235757,
"grad_norm": 1.0960855524012971,
"learning_rate": 1.8360854415997579e-06,
"loss": 0.3578,
"step": 6730
},
{
"epoch": 2.756878388053595,
"grad_norm": 1.1655103795481652,
"learning_rate": 1.8057870019693988e-06,
"loss": 0.3566,
"step": 6740
},
{
"epoch": 2.7609696225836147,
"grad_norm": 1.4270524025940055,
"learning_rate": 1.7754885623390398e-06,
"loss": 0.3517,
"step": 6750
},
{
"epoch": 2.765060857113634,
"grad_norm": 1.3348787554214543,
"learning_rate": 1.7451901227086807e-06,
"loss": 0.3518,
"step": 6760
},
{
"epoch": 2.7691520916436536,
"grad_norm": 1.0118054266554564,
"learning_rate": 1.7148916830783216e-06,
"loss": 0.3517,
"step": 6770
},
{
"epoch": 2.773243326173673,
"grad_norm": 1.5502107988903457,
"learning_rate": 1.6845932434479626e-06,
"loss": 0.3402,
"step": 6780
},
{
"epoch": 2.777334560703692,
"grad_norm": 1.061216156435797,
"learning_rate": 1.6542948038176035e-06,
"loss": 0.3521,
"step": 6790
},
{
"epoch": 2.781425795233712,
"grad_norm": 1.5424114671353548,
"learning_rate": 1.6239963641872444e-06,
"loss": 0.3456,
"step": 6800
},
{
"epoch": 2.785517029763731,
"grad_norm": 1.2155670661756302,
"learning_rate": 1.5936979245568854e-06,
"loss": 0.3444,
"step": 6810
},
{
"epoch": 2.7896082642937508,
"grad_norm": 1.2991785598636614,
"learning_rate": 1.5633994849265263e-06,
"loss": 0.3453,
"step": 6820
},
{
"epoch": 2.79369949882377,
"grad_norm": 1.5117541536726462,
"learning_rate": 1.5331010452961673e-06,
"loss": 0.3426,
"step": 6830
},
{
"epoch": 2.7977907333537892,
"grad_norm": 1.397208477585223,
"learning_rate": 1.5028026056658082e-06,
"loss": 0.3505,
"step": 6840
},
{
"epoch": 2.801881967883809,
"grad_norm": 1.2017192873007096,
"learning_rate": 1.4725041660354491e-06,
"loss": 0.3341,
"step": 6850
},
{
"epoch": 2.8059732024138286,
"grad_norm": 1.3674252863596916,
"learning_rate": 1.4422057264050903e-06,
"loss": 0.3578,
"step": 6860
},
{
"epoch": 2.810064436943848,
"grad_norm": 1.2963144848882437,
"learning_rate": 1.4119072867747312e-06,
"loss": 0.3448,
"step": 6870
},
{
"epoch": 2.814155671473867,
"grad_norm": 1.3472526890767895,
"learning_rate": 1.3816088471443724e-06,
"loss": 0.3545,
"step": 6880
},
{
"epoch": 2.818246906003887,
"grad_norm": 1.4300718931297243,
"learning_rate": 1.3513104075140133e-06,
"loss": 0.3458,
"step": 6890
},
{
"epoch": 2.822338140533906,
"grad_norm": 1.1374915010677178,
"learning_rate": 1.3210119678836542e-06,
"loss": 0.3452,
"step": 6900
},
{
"epoch": 2.8264293750639258,
"grad_norm": 1.3186888057848343,
"learning_rate": 1.2907135282532952e-06,
"loss": 0.3582,
"step": 6910
},
{
"epoch": 2.830520609593945,
"grad_norm": 1.4076831798674376,
"learning_rate": 1.2604150886229361e-06,
"loss": 0.3473,
"step": 6920
},
{
"epoch": 2.8346118441239643,
"grad_norm": 1.236226000169149,
"learning_rate": 1.230116648992577e-06,
"loss": 0.3508,
"step": 6930
},
{
"epoch": 2.838703078653984,
"grad_norm": 1.1598603618467764,
"learning_rate": 1.199818209362218e-06,
"loss": 0.3135,
"step": 6940
},
{
"epoch": 2.842794313184003,
"grad_norm": 1.4470943796781588,
"learning_rate": 1.169519769731859e-06,
"loss": 0.3369,
"step": 6950
},
{
"epoch": 2.846885547714023,
"grad_norm": 1.4514521872416606,
"learning_rate": 1.1392213301014999e-06,
"loss": 0.3652,
"step": 6960
},
{
"epoch": 2.850976782244042,
"grad_norm": 1.508158438836179,
"learning_rate": 1.1089228904711408e-06,
"loss": 0.3424,
"step": 6970
},
{
"epoch": 2.8550680167740614,
"grad_norm": 1.2409802713756901,
"learning_rate": 1.078624450840782e-06,
"loss": 0.3494,
"step": 6980
},
{
"epoch": 2.859159251304081,
"grad_norm": 1.277646969163353,
"learning_rate": 1.0483260112104229e-06,
"loss": 0.3323,
"step": 6990
},
{
"epoch": 2.8632504858341004,
"grad_norm": 1.181273366360208,
"learning_rate": 1.0180275715800638e-06,
"loss": 0.3454,
"step": 7000
},
{
"epoch": 2.8632504858341004,
"eval_loss": 0.4169977903366089,
"eval_runtime": 566.1639,
"eval_samples_per_second": 5.454,
"eval_steps_per_second": 0.91,
"step": 7000
},
{
"epoch": 2.86734172036412,
"grad_norm": 1.2542825969651887,
"learning_rate": 9.877291319497048e-07,
"loss": 0.3441,
"step": 7010
},
{
"epoch": 2.8714329548941393,
"grad_norm": 1.2663043582399385,
"learning_rate": 9.574306923193457e-07,
"loss": 0.3577,
"step": 7020
},
{
"epoch": 2.8755241894241585,
"grad_norm": 1.2989911359603148,
"learning_rate": 9.271322526889865e-07,
"loss": 0.3441,
"step": 7030
},
{
"epoch": 2.8796154239541782,
"grad_norm": 1.4071532233625517,
"learning_rate": 8.968338130586275e-07,
"loss": 0.3461,
"step": 7040
},
{
"epoch": 2.8837066584841975,
"grad_norm": 1.3845463524426405,
"learning_rate": 8.665353734282685e-07,
"loss": 0.3497,
"step": 7050
},
{
"epoch": 2.887797893014217,
"grad_norm": 1.1682379379190977,
"learning_rate": 8.362369337979096e-07,
"loss": 0.3559,
"step": 7060
},
{
"epoch": 2.8918891275442364,
"grad_norm": 1.3470954040058543,
"learning_rate": 8.059384941675505e-07,
"loss": 0.3429,
"step": 7070
},
{
"epoch": 2.8959803620742557,
"grad_norm": 1.208626776638643,
"learning_rate": 7.756400545371914e-07,
"loss": 0.3415,
"step": 7080
},
{
"epoch": 2.9000715966042754,
"grad_norm": 1.4017945161316288,
"learning_rate": 7.453416149068324e-07,
"loss": 0.3554,
"step": 7090
},
{
"epoch": 2.9041628311342946,
"grad_norm": 1.183169645634326,
"learning_rate": 7.150431752764733e-07,
"loss": 0.3464,
"step": 7100
},
{
"epoch": 2.9082540656643143,
"grad_norm": 1.321129525250198,
"learning_rate": 6.847447356461142e-07,
"loss": 0.3508,
"step": 7110
},
{
"epoch": 2.9123453001943336,
"grad_norm": 1.4041097528620985,
"learning_rate": 6.544462960157552e-07,
"loss": 0.3475,
"step": 7120
},
{
"epoch": 2.916436534724353,
"grad_norm": 1.2651747018935453,
"learning_rate": 6.241478563853962e-07,
"loss": 0.353,
"step": 7130
},
{
"epoch": 2.9205277692543725,
"grad_norm": 1.2832363456828761,
"learning_rate": 5.938494167550372e-07,
"loss": 0.3416,
"step": 7140
},
{
"epoch": 2.924619003784392,
"grad_norm": 1.1421739779615996,
"learning_rate": 5.635509771246781e-07,
"loss": 0.3472,
"step": 7150
},
{
"epoch": 2.9287102383144115,
"grad_norm": 1.643848869321631,
"learning_rate": 5.33252537494319e-07,
"loss": 0.3405,
"step": 7160
},
{
"epoch": 2.9328014728444307,
"grad_norm": 1.1839381749310496,
"learning_rate": 5.029540978639601e-07,
"loss": 0.3301,
"step": 7170
},
{
"epoch": 2.9368927073744504,
"grad_norm": 1.2749563659235175,
"learning_rate": 4.72655658233601e-07,
"loss": 0.3344,
"step": 7180
},
{
"epoch": 2.9409839419044697,
"grad_norm": 1.5435865262538777,
"learning_rate": 4.4235721860324195e-07,
"loss": 0.3409,
"step": 7190
},
{
"epoch": 2.9450751764344894,
"grad_norm": 1.5137986871644649,
"learning_rate": 4.120587789728829e-07,
"loss": 0.3475,
"step": 7200
},
{
"epoch": 2.9491664109645086,
"grad_norm": 1.2316496489032414,
"learning_rate": 3.8176033934252394e-07,
"loss": 0.3348,
"step": 7210
},
{
"epoch": 2.953257645494528,
"grad_norm": 1.2680106473026544,
"learning_rate": 3.5146189971216487e-07,
"loss": 0.3335,
"step": 7220
},
{
"epoch": 2.9573488800245475,
"grad_norm": 1.0733135362099837,
"learning_rate": 3.211634600818058e-07,
"loss": 0.347,
"step": 7230
},
{
"epoch": 2.961440114554567,
"grad_norm": 1.5331378983904185,
"learning_rate": 2.908650204514468e-07,
"loss": 0.3489,
"step": 7240
},
{
"epoch": 2.9655313490845865,
"grad_norm": 1.1721604841420616,
"learning_rate": 2.6056658082108774e-07,
"loss": 0.3407,
"step": 7250
},
{
"epoch": 2.9696225836146057,
"grad_norm": 1.2850118367826986,
"learning_rate": 2.302681411907287e-07,
"loss": 0.3589,
"step": 7260
},
{
"epoch": 2.973713818144625,
"grad_norm": 1.4413311960347719,
"learning_rate": 1.9996970156036967e-07,
"loss": 0.3369,
"step": 7270
},
{
"epoch": 2.9778050526746447,
"grad_norm": 1.314019429686636,
"learning_rate": 1.6967126193001063e-07,
"loss": 0.3507,
"step": 7280
},
{
"epoch": 2.981896287204664,
"grad_norm": 1.113303777593494,
"learning_rate": 1.3937282229965157e-07,
"loss": 0.3523,
"step": 7290
},
{
"epoch": 2.9859875217346836,
"grad_norm": 1.1248866357161083,
"learning_rate": 1.0907438266929254e-07,
"loss": 0.3437,
"step": 7300
},
{
"epoch": 2.990078756264703,
"grad_norm": 1.1898353136027997,
"learning_rate": 7.877594303893351e-08,
"loss": 0.3545,
"step": 7310
},
{
"epoch": 2.994169990794722,
"grad_norm": 1.4708707926240419,
"learning_rate": 4.8477503408574464e-08,
"loss": 0.3666,
"step": 7320
},
{
"epoch": 2.998261225324742,
"grad_norm": 1.3652249352362715,
"learning_rate": 1.8179063778215425e-08,
"loss": 0.3625,
"step": 7330
}
],
"logging_steps": 10,
"max_steps": 7335,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 307331176169472.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}