{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 6508,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0030736130321192563,
"grad_norm": 2.8125,
"learning_rate": 1.9972341733251385e-05,
"loss": 1.2571,
"step": 10
},
{
"epoch": 0.0061472260642385125,
"grad_norm": 2.53125,
"learning_rate": 1.994161032575292e-05,
"loss": 0.6897,
"step": 20
},
{
"epoch": 0.009220839096357769,
"grad_norm": 2.25,
"learning_rate": 1.9910878918254458e-05,
"loss": 0.7244,
"step": 30
},
{
"epoch": 0.012294452128477025,
"grad_norm": 1.984375,
"learning_rate": 1.9880147510755993e-05,
"loss": 0.662,
"step": 40
},
{
"epoch": 0.015368065160596281,
"grad_norm": 1.7890625,
"learning_rate": 1.984941610325753e-05,
"loss": 0.6253,
"step": 50
},
{
"epoch": 0.018441678192715538,
"grad_norm": 2.15625,
"learning_rate": 1.9818684695759067e-05,
"loss": 0.6154,
"step": 60
},
{
"epoch": 0.021515291224834792,
"grad_norm": 1.796875,
"learning_rate": 1.9787953288260605e-05,
"loss": 0.5772,
"step": 70
},
{
"epoch": 0.02458890425695405,
"grad_norm": 1.6484375,
"learning_rate": 1.975722188076214e-05,
"loss": 0.6075,
"step": 80
},
{
"epoch": 0.027662517289073305,
"grad_norm": 2.140625,
"learning_rate": 1.972649047326368e-05,
"loss": 0.5716,
"step": 90
},
{
"epoch": 0.030736130321192563,
"grad_norm": 1.8828125,
"learning_rate": 1.9695759065765213e-05,
"loss": 0.6216,
"step": 100
},
{
"epoch": 0.03380974335331182,
"grad_norm": 1.59375,
"learning_rate": 1.9665027658266752e-05,
"loss": 0.5748,
"step": 110
},
{
"epoch": 0.036883356385431075,
"grad_norm": 2.359375,
"learning_rate": 1.9634296250768287e-05,
"loss": 0.6002,
"step": 120
},
{
"epoch": 0.03995696941755033,
"grad_norm": 2.3125,
"learning_rate": 1.9603564843269825e-05,
"loss": 0.603,
"step": 130
},
{
"epoch": 0.043030582449669584,
"grad_norm": 1.90625,
"learning_rate": 1.957283343577136e-05,
"loss": 0.5885,
"step": 140
},
{
"epoch": 0.046104195481788846,
"grad_norm": 1.8203125,
"learning_rate": 1.9542102028272895e-05,
"loss": 0.5882,
"step": 150
},
{
"epoch": 0.0491778085139081,
"grad_norm": 2.0,
"learning_rate": 1.9511370620774434e-05,
"loss": 0.5882,
"step": 160
},
{
"epoch": 0.052251421546027355,
"grad_norm": 1.7109375,
"learning_rate": 1.948063921327597e-05,
"loss": 0.617,
"step": 170
},
{
"epoch": 0.05532503457814661,
"grad_norm": 1.921875,
"learning_rate": 1.9449907805777507e-05,
"loss": 0.5883,
"step": 180
},
{
"epoch": 0.05839864761026587,
"grad_norm": 2.234375,
"learning_rate": 1.9419176398279042e-05,
"loss": 0.5665,
"step": 190
},
{
"epoch": 0.061472260642385125,
"grad_norm": 1.9140625,
"learning_rate": 1.938844499078058e-05,
"loss": 0.5579,
"step": 200
},
{
"epoch": 0.06454587367450439,
"grad_norm": 1.6875,
"learning_rate": 1.9357713583282115e-05,
"loss": 0.5114,
"step": 210
},
{
"epoch": 0.06761948670662364,
"grad_norm": 1.96875,
"learning_rate": 1.9326982175783654e-05,
"loss": 0.5922,
"step": 220
},
{
"epoch": 0.0706930997387429,
"grad_norm": 1.9375,
"learning_rate": 1.929625076828519e-05,
"loss": 0.5241,
"step": 230
},
{
"epoch": 0.07376671277086215,
"grad_norm": 1.859375,
"learning_rate": 1.9265519360786727e-05,
"loss": 0.5573,
"step": 240
},
{
"epoch": 0.0768403258029814,
"grad_norm": 1.8984375,
"learning_rate": 1.9234787953288262e-05,
"loss": 0.5565,
"step": 250
},
{
"epoch": 0.07991393883510066,
"grad_norm": 1.9765625,
"learning_rate": 1.92040565457898e-05,
"loss": 0.5994,
"step": 260
},
{
"epoch": 0.08298755186721991,
"grad_norm": 1.8515625,
"learning_rate": 1.9173325138291336e-05,
"loss": 0.5496,
"step": 270
},
{
"epoch": 0.08606116489933917,
"grad_norm": 1.3671875,
"learning_rate": 1.914259373079287e-05,
"loss": 0.4966,
"step": 280
},
{
"epoch": 0.08913477793145842,
"grad_norm": 1.9765625,
"learning_rate": 1.911186232329441e-05,
"loss": 0.5337,
"step": 290
},
{
"epoch": 0.09220839096357769,
"grad_norm": 2.421875,
"learning_rate": 1.9081130915795944e-05,
"loss": 0.5477,
"step": 300
},
{
"epoch": 0.09528200399569695,
"grad_norm": 1.921875,
"learning_rate": 1.9050399508297482e-05,
"loss": 0.5003,
"step": 310
},
{
"epoch": 0.0983556170278162,
"grad_norm": 1.9296875,
"learning_rate": 1.9019668100799017e-05,
"loss": 0.5316,
"step": 320
},
{
"epoch": 0.10142923005993545,
"grad_norm": 1.921875,
"learning_rate": 1.8988936693300556e-05,
"loss": 0.5299,
"step": 330
},
{
"epoch": 0.10450284309205471,
"grad_norm": 2.140625,
"learning_rate": 1.895820528580209e-05,
"loss": 0.5126,
"step": 340
},
{
"epoch": 0.10757645612417396,
"grad_norm": 1.890625,
"learning_rate": 1.892747387830363e-05,
"loss": 0.5374,
"step": 350
},
{
"epoch": 0.11065006915629322,
"grad_norm": 1.65625,
"learning_rate": 1.8896742470805164e-05,
"loss": 0.5385,
"step": 360
},
{
"epoch": 0.11372368218841247,
"grad_norm": 1.953125,
"learning_rate": 1.8866011063306702e-05,
"loss": 0.5072,
"step": 370
},
{
"epoch": 0.11679729522053174,
"grad_norm": 1.4453125,
"learning_rate": 1.8835279655808237e-05,
"loss": 0.5379,
"step": 380
},
{
"epoch": 0.119870908252651,
"grad_norm": 1.9140625,
"learning_rate": 1.8804548248309776e-05,
"loss": 0.5164,
"step": 390
},
{
"epoch": 0.12294452128477025,
"grad_norm": 2.390625,
"learning_rate": 1.877381684081131e-05,
"loss": 0.5303,
"step": 400
},
{
"epoch": 0.1260181343168895,
"grad_norm": 1.9296875,
"learning_rate": 1.8743085433312846e-05,
"loss": 0.5223,
"step": 410
},
{
"epoch": 0.12909174734900877,
"grad_norm": 2.0625,
"learning_rate": 1.8712354025814384e-05,
"loss": 0.5457,
"step": 420
},
{
"epoch": 0.13216536038112803,
"grad_norm": 2.234375,
"learning_rate": 1.868162261831592e-05,
"loss": 0.5,
"step": 430
},
{
"epoch": 0.13523897341324728,
"grad_norm": 2.015625,
"learning_rate": 1.8650891210817458e-05,
"loss": 0.525,
"step": 440
},
{
"epoch": 0.13831258644536654,
"grad_norm": 1.546875,
"learning_rate": 1.8620159803318993e-05,
"loss": 0.4843,
"step": 450
},
{
"epoch": 0.1413861994774858,
"grad_norm": 1.9765625,
"learning_rate": 1.858942839582053e-05,
"loss": 0.5126,
"step": 460
},
{
"epoch": 0.14445981250960505,
"grad_norm": 2.09375,
"learning_rate": 1.8558696988322066e-05,
"loss": 0.5159,
"step": 470
},
{
"epoch": 0.1475334255417243,
"grad_norm": 1.9921875,
"learning_rate": 1.8527965580823604e-05,
"loss": 0.5117,
"step": 480
},
{
"epoch": 0.15060703857384355,
"grad_norm": 1.9609375,
"learning_rate": 1.849723417332514e-05,
"loss": 0.4941,
"step": 490
},
{
"epoch": 0.1536806516059628,
"grad_norm": 1.78125,
"learning_rate": 1.8466502765826678e-05,
"loss": 0.5206,
"step": 500
},
{
"epoch": 0.15675426463808206,
"grad_norm": 2.328125,
"learning_rate": 1.8435771358328213e-05,
"loss": 0.5227,
"step": 510
},
{
"epoch": 0.15982787767020132,
"grad_norm": 1.890625,
"learning_rate": 1.840503995082975e-05,
"loss": 0.4969,
"step": 520
},
{
"epoch": 0.16290149070232057,
"grad_norm": 1.9765625,
"learning_rate": 1.8374308543331286e-05,
"loss": 0.5237,
"step": 530
},
{
"epoch": 0.16597510373443983,
"grad_norm": 2.078125,
"learning_rate": 1.834357713583282e-05,
"loss": 0.5529,
"step": 540
},
{
"epoch": 0.16904871676655908,
"grad_norm": 2.15625,
"learning_rate": 1.831284572833436e-05,
"loss": 0.535,
"step": 550
},
{
"epoch": 0.17212232979867834,
"grad_norm": 2.28125,
"learning_rate": 1.8282114320835895e-05,
"loss": 0.495,
"step": 560
},
{
"epoch": 0.1751959428307976,
"grad_norm": 1.96875,
"learning_rate": 1.8251382913337433e-05,
"loss": 0.4784,
"step": 570
},
{
"epoch": 0.17826955586291685,
"grad_norm": 1.90625,
"learning_rate": 1.8220651505838968e-05,
"loss": 0.5158,
"step": 580
},
{
"epoch": 0.18134316889503613,
"grad_norm": 1.5859375,
"learning_rate": 1.8189920098340506e-05,
"loss": 0.5033,
"step": 590
},
{
"epoch": 0.18441678192715538,
"grad_norm": 1.765625,
"learning_rate": 1.815918869084204e-05,
"loss": 0.5106,
"step": 600
},
{
"epoch": 0.18749039495927464,
"grad_norm": 2.203125,
"learning_rate": 1.812845728334358e-05,
"loss": 0.5103,
"step": 610
},
{
"epoch": 0.1905640079913939,
"grad_norm": 1.5390625,
"learning_rate": 1.8097725875845115e-05,
"loss": 0.5091,
"step": 620
},
{
"epoch": 0.19363762102351315,
"grad_norm": 1.7578125,
"learning_rate": 1.8066994468346653e-05,
"loss": 0.485,
"step": 630
},
{
"epoch": 0.1967112340556324,
"grad_norm": 2.03125,
"learning_rate": 1.8036263060848188e-05,
"loss": 0.4814,
"step": 640
},
{
"epoch": 0.19978484708775165,
"grad_norm": 1.9765625,
"learning_rate": 1.8005531653349727e-05,
"loss": 0.4873,
"step": 650
},
{
"epoch": 0.2028584601198709,
"grad_norm": 1.9921875,
"learning_rate": 1.797480024585126e-05,
"loss": 0.4744,
"step": 660
},
{
"epoch": 0.20593207315199016,
"grad_norm": 1.890625,
"learning_rate": 1.7944068838352797e-05,
"loss": 0.5105,
"step": 670
},
{
"epoch": 0.20900568618410942,
"grad_norm": 2.0625,
"learning_rate": 1.7913337430854335e-05,
"loss": 0.496,
"step": 680
},
{
"epoch": 0.21207929921622867,
"grad_norm": 2.1875,
"learning_rate": 1.788260602335587e-05,
"loss": 0.4867,
"step": 690
},
{
"epoch": 0.21515291224834793,
"grad_norm": 1.8984375,
"learning_rate": 1.785187461585741e-05,
"loss": 0.4406,
"step": 700
},
{
"epoch": 0.21822652528046718,
"grad_norm": 1.71875,
"learning_rate": 1.7821143208358943e-05,
"loss": 0.4459,
"step": 710
},
{
"epoch": 0.22130013831258644,
"grad_norm": 1.9609375,
"learning_rate": 1.7790411800860482e-05,
"loss": 0.4765,
"step": 720
},
{
"epoch": 0.2243737513447057,
"grad_norm": 2.28125,
"learning_rate": 1.7759680393362017e-05,
"loss": 0.4879,
"step": 730
},
{
"epoch": 0.22744736437682495,
"grad_norm": 2.171875,
"learning_rate": 1.7728948985863555e-05,
"loss": 0.4889,
"step": 740
},
{
"epoch": 0.23052097740894423,
"grad_norm": 1.9921875,
"learning_rate": 1.769821757836509e-05,
"loss": 0.4491,
"step": 750
},
{
"epoch": 0.23359459044106348,
"grad_norm": 1.7890625,
"learning_rate": 1.766748617086663e-05,
"loss": 0.4794,
"step": 760
},
{
"epoch": 0.23666820347318274,
"grad_norm": 2.171875,
"learning_rate": 1.7636754763368163e-05,
"loss": 0.4935,
"step": 770
},
{
"epoch": 0.239741816505302,
"grad_norm": 1.6953125,
"learning_rate": 1.7606023355869702e-05,
"loss": 0.4491,
"step": 780
},
{
"epoch": 0.24281542953742125,
"grad_norm": 1.8984375,
"learning_rate": 1.7575291948371237e-05,
"loss": 0.4424,
"step": 790
},
{
"epoch": 0.2458890425695405,
"grad_norm": 2.03125,
"learning_rate": 1.7544560540872772e-05,
"loss": 0.4756,
"step": 800
},
{
"epoch": 0.24896265560165975,
"grad_norm": 2.296875,
"learning_rate": 1.751382913337431e-05,
"loss": 0.4743,
"step": 810
},
{
"epoch": 0.252036268633779,
"grad_norm": 1.8046875,
"learning_rate": 1.7483097725875845e-05,
"loss": 0.4598,
"step": 820
},
{
"epoch": 0.25510988166589826,
"grad_norm": 2.03125,
"learning_rate": 1.7452366318377384e-05,
"loss": 0.4849,
"step": 830
},
{
"epoch": 0.25818349469801755,
"grad_norm": 1.984375,
"learning_rate": 1.742163491087892e-05,
"loss": 0.4555,
"step": 840
},
{
"epoch": 0.2612571077301368,
"grad_norm": 1.859375,
"learning_rate": 1.7390903503380457e-05,
"loss": 0.4487,
"step": 850
},
{
"epoch": 0.26433072076225606,
"grad_norm": 2.453125,
"learning_rate": 1.7360172095881992e-05,
"loss": 0.4816,
"step": 860
},
{
"epoch": 0.2674043337943753,
"grad_norm": 2.078125,
"learning_rate": 1.732944068838353e-05,
"loss": 0.4694,
"step": 870
},
{
"epoch": 0.27047794682649456,
"grad_norm": 1.8828125,
"learning_rate": 1.7298709280885065e-05,
"loss": 0.4826,
"step": 880
},
{
"epoch": 0.2735515598586138,
"grad_norm": 1.890625,
"learning_rate": 1.7267977873386604e-05,
"loss": 0.4397,
"step": 890
},
{
"epoch": 0.2766251728907331,
"grad_norm": 2.046875,
"learning_rate": 1.723724646588814e-05,
"loss": 0.4297,
"step": 900
},
{
"epoch": 0.2796987859228523,
"grad_norm": 1.859375,
"learning_rate": 1.7206515058389677e-05,
"loss": 0.4964,
"step": 910
},
{
"epoch": 0.2827723989549716,
"grad_norm": 1.4921875,
"learning_rate": 1.7175783650891212e-05,
"loss": 0.4641,
"step": 920
},
{
"epoch": 0.2858460119870908,
"grad_norm": 1.4765625,
"learning_rate": 1.7145052243392747e-05,
"loss": 0.4507,
"step": 930
},
{
"epoch": 0.2889196250192101,
"grad_norm": 1.9921875,
"learning_rate": 1.7114320835894286e-05,
"loss": 0.4675,
"step": 940
},
{
"epoch": 0.2919932380513293,
"grad_norm": 1.703125,
"learning_rate": 1.708358942839582e-05,
"loss": 0.4329,
"step": 950
},
{
"epoch": 0.2950668510834486,
"grad_norm": 1.859375,
"learning_rate": 1.705285802089736e-05,
"loss": 0.4666,
"step": 960
},
{
"epoch": 0.2981404641155678,
"grad_norm": 2.0625,
"learning_rate": 1.7022126613398894e-05,
"loss": 0.445,
"step": 970
},
{
"epoch": 0.3012140771476871,
"grad_norm": 1.8125,
"learning_rate": 1.6991395205900432e-05,
"loss": 0.4755,
"step": 980
},
{
"epoch": 0.30428769017980634,
"grad_norm": 1.71875,
"learning_rate": 1.6960663798401967e-05,
"loss": 0.4107,
"step": 990
},
{
"epoch": 0.3073613032119256,
"grad_norm": 2.03125,
"learning_rate": 1.6929932390903506e-05,
"loss": 0.4595,
"step": 1000
},
{
"epoch": 0.3104349162440449,
"grad_norm": 2.0625,
"learning_rate": 1.689920098340504e-05,
"loss": 0.4523,
"step": 1010
},
{
"epoch": 0.3135085292761641,
"grad_norm": 1.984375,
"learning_rate": 1.686846957590658e-05,
"loss": 0.4645,
"step": 1020
},
{
"epoch": 0.3165821423082834,
"grad_norm": 2.28125,
"learning_rate": 1.6837738168408114e-05,
"loss": 0.4895,
"step": 1030
},
{
"epoch": 0.31965575534040264,
"grad_norm": 1.984375,
"learning_rate": 1.6807006760909653e-05,
"loss": 0.4519,
"step": 1040
},
{
"epoch": 0.3227293683725219,
"grad_norm": 2.21875,
"learning_rate": 1.6776275353411188e-05,
"loss": 0.4727,
"step": 1050
},
{
"epoch": 0.32580298140464115,
"grad_norm": 1.8203125,
"learning_rate": 1.6745543945912723e-05,
"loss": 0.442,
"step": 1060
},
{
"epoch": 0.3288765944367604,
"grad_norm": 2.234375,
"learning_rate": 1.671481253841426e-05,
"loss": 0.4645,
"step": 1070
},
{
"epoch": 0.33195020746887965,
"grad_norm": 2.015625,
"learning_rate": 1.6684081130915796e-05,
"loss": 0.4463,
"step": 1080
},
{
"epoch": 0.33502382050099894,
"grad_norm": 1.9296875,
"learning_rate": 1.6653349723417334e-05,
"loss": 0.4446,
"step": 1090
},
{
"epoch": 0.33809743353311816,
"grad_norm": 1.734375,
"learning_rate": 1.662261831591887e-05,
"loss": 0.4594,
"step": 1100
},
{
"epoch": 0.34117104656523745,
"grad_norm": 1.9765625,
"learning_rate": 1.6591886908420408e-05,
"loss": 0.4874,
"step": 1110
},
{
"epoch": 0.3442446595973567,
"grad_norm": 1.921875,
"learning_rate": 1.6561155500921943e-05,
"loss": 0.4757,
"step": 1120
},
{
"epoch": 0.34731827262947595,
"grad_norm": 2.203125,
"learning_rate": 1.653042409342348e-05,
"loss": 0.4756,
"step": 1130
},
{
"epoch": 0.3503918856615952,
"grad_norm": 2.25,
"learning_rate": 1.6499692685925016e-05,
"loss": 0.4271,
"step": 1140
},
{
"epoch": 0.35346549869371446,
"grad_norm": 2.3125,
"learning_rate": 1.6468961278426554e-05,
"loss": 0.4882,
"step": 1150
},
{
"epoch": 0.3565391117258337,
"grad_norm": 2.078125,
"learning_rate": 1.643822987092809e-05,
"loss": 0.4381,
"step": 1160
},
{
"epoch": 0.359612724757953,
"grad_norm": 1.796875,
"learning_rate": 1.6407498463429628e-05,
"loss": 0.4276,
"step": 1170
},
{
"epoch": 0.36268633779007226,
"grad_norm": 1.8046875,
"learning_rate": 1.6376767055931163e-05,
"loss": 0.4088,
"step": 1180
},
{
"epoch": 0.3657599508221915,
"grad_norm": 1.7109375,
"learning_rate": 1.6346035648432698e-05,
"loss": 0.4368,
"step": 1190
},
{
"epoch": 0.36883356385431076,
"grad_norm": 1.625,
"learning_rate": 1.6315304240934236e-05,
"loss": 0.4488,
"step": 1200
},
{
"epoch": 0.37190717688643,
"grad_norm": 2.15625,
"learning_rate": 1.628457283343577e-05,
"loss": 0.4213,
"step": 1210
},
{
"epoch": 0.3749807899185493,
"grad_norm": 2.078125,
"learning_rate": 1.625384142593731e-05,
"loss": 0.4275,
"step": 1220
},
{
"epoch": 0.3780544029506685,
"grad_norm": 1.8359375,
"learning_rate": 1.6223110018438845e-05,
"loss": 0.4721,
"step": 1230
},
{
"epoch": 0.3811280159827878,
"grad_norm": 1.9140625,
"learning_rate": 1.6192378610940383e-05,
"loss": 0.3997,
"step": 1240
},
{
"epoch": 0.384201629014907,
"grad_norm": 2.015625,
"learning_rate": 1.6161647203441918e-05,
"loss": 0.433,
"step": 1250
},
{
"epoch": 0.3872752420470263,
"grad_norm": 2.359375,
"learning_rate": 1.6130915795943456e-05,
"loss": 0.4386,
"step": 1260
},
{
"epoch": 0.3903488550791455,
"grad_norm": 2.046875,
"learning_rate": 1.610018438844499e-05,
"loss": 0.4366,
"step": 1270
},
{
"epoch": 0.3934224681112648,
"grad_norm": 1.8046875,
"learning_rate": 1.606945298094653e-05,
"loss": 0.41,
"step": 1280
},
{
"epoch": 0.396496081143384,
"grad_norm": 2.203125,
"learning_rate": 1.6038721573448065e-05,
"loss": 0.4296,
"step": 1290
},
{
"epoch": 0.3995696941755033,
"grad_norm": 2.03125,
"learning_rate": 1.6007990165949603e-05,
"loss": 0.4292,
"step": 1300
},
{
"epoch": 0.40264330720762254,
"grad_norm": 1.875,
"learning_rate": 1.5977258758451138e-05,
"loss": 0.4056,
"step": 1310
},
{
"epoch": 0.4057169202397418,
"grad_norm": 1.90625,
"learning_rate": 1.5946527350952673e-05,
"loss": 0.4191,
"step": 1320
},
{
"epoch": 0.40879053327186105,
"grad_norm": 1.7265625,
"learning_rate": 1.591579594345421e-05,
"loss": 0.4153,
"step": 1330
},
{
"epoch": 0.4118641463039803,
"grad_norm": 2.40625,
"learning_rate": 1.5885064535955747e-05,
"loss": 0.4415,
"step": 1340
},
{
"epoch": 0.4149377593360996,
"grad_norm": 2.359375,
"learning_rate": 1.5854333128457285e-05,
"loss": 0.4004,
"step": 1350
},
{
"epoch": 0.41801137236821884,
"grad_norm": 1.78125,
"learning_rate": 1.582360172095882e-05,
"loss": 0.4418,
"step": 1360
},
{
"epoch": 0.4210849854003381,
"grad_norm": 2.34375,
"learning_rate": 1.579287031346036e-05,
"loss": 0.4311,
"step": 1370
},
{
"epoch": 0.42415859843245735,
"grad_norm": 2.015625,
"learning_rate": 1.5762138905961893e-05,
"loss": 0.4112,
"step": 1380
},
{
"epoch": 0.42723221146457663,
"grad_norm": 2.25,
"learning_rate": 1.5731407498463432e-05,
"loss": 0.4352,
"step": 1390
},
{
"epoch": 0.43030582449669585,
"grad_norm": 1.9453125,
"learning_rate": 1.5700676090964967e-05,
"loss": 0.4617,
"step": 1400
},
{
"epoch": 0.43337943752881514,
"grad_norm": 2.15625,
"learning_rate": 1.5669944683466505e-05,
"loss": 0.3976,
"step": 1410
},
{
"epoch": 0.43645305056093436,
"grad_norm": 2.203125,
"learning_rate": 1.563921327596804e-05,
"loss": 0.432,
"step": 1420
},
{
"epoch": 0.43952666359305365,
"grad_norm": 1.8203125,
"learning_rate": 1.560848186846958e-05,
"loss": 0.3597,
"step": 1430
},
{
"epoch": 0.4426002766251729,
"grad_norm": 1.6015625,
"learning_rate": 1.5577750460971114e-05,
"loss": 0.4091,
"step": 1440
},
{
"epoch": 0.44567388965729215,
"grad_norm": 2.125,
"learning_rate": 1.554701905347265e-05,
"loss": 0.4275,
"step": 1450
},
{
"epoch": 0.4487475026894114,
"grad_norm": 1.9375,
"learning_rate": 1.5516287645974187e-05,
"loss": 0.4125,
"step": 1460
},
{
"epoch": 0.45182111572153066,
"grad_norm": 1.8203125,
"learning_rate": 1.5485556238475722e-05,
"loss": 0.3917,
"step": 1470
},
{
"epoch": 0.4548947287536499,
"grad_norm": 1.8515625,
"learning_rate": 1.545482483097726e-05,
"loss": 0.364,
"step": 1480
},
{
"epoch": 0.4579683417857692,
"grad_norm": 1.625,
"learning_rate": 1.5424093423478795e-05,
"loss": 0.4297,
"step": 1490
},
{
"epoch": 0.46104195481788846,
"grad_norm": 2.109375,
"learning_rate": 1.5393362015980334e-05,
"loss": 0.4274,
"step": 1500
},
{
"epoch": 0.4641155678500077,
"grad_norm": 1.9453125,
"learning_rate": 1.536263060848187e-05,
"loss": 0.3786,
"step": 1510
},
{
"epoch": 0.46718918088212696,
"grad_norm": 2.53125,
"learning_rate": 1.5331899200983407e-05,
"loss": 0.4062,
"step": 1520
},
{
"epoch": 0.4702627939142462,
"grad_norm": 2.359375,
"learning_rate": 1.5301167793484942e-05,
"loss": 0.3928,
"step": 1530
},
{
"epoch": 0.4733364069463655,
"grad_norm": 2.375,
"learning_rate": 1.527043638598648e-05,
"loss": 0.4124,
"step": 1540
},
{
"epoch": 0.4764100199784847,
"grad_norm": 1.8125,
"learning_rate": 1.5239704978488017e-05,
"loss": 0.4111,
"step": 1550
},
{
"epoch": 0.479483633010604,
"grad_norm": 1.546875,
"learning_rate": 1.5208973570989554e-05,
"loss": 0.385,
"step": 1560
},
{
"epoch": 0.4825572460427232,
"grad_norm": 1.953125,
"learning_rate": 1.5178242163491089e-05,
"loss": 0.4165,
"step": 1570
},
{
"epoch": 0.4856308590748425,
"grad_norm": 1.7265625,
"learning_rate": 1.5147510755992626e-05,
"loss": 0.3929,
"step": 1580
},
{
"epoch": 0.4887044721069617,
"grad_norm": 2.46875,
"learning_rate": 1.5116779348494162e-05,
"loss": 0.4247,
"step": 1590
},
{
"epoch": 0.491778085139081,
"grad_norm": 2.09375,
"learning_rate": 1.5086047940995699e-05,
"loss": 0.4078,
"step": 1600
},
{
"epoch": 0.4948516981712002,
"grad_norm": 1.5546875,
"learning_rate": 1.5055316533497236e-05,
"loss": 0.3808,
"step": 1610
},
{
"epoch": 0.4979253112033195,
"grad_norm": 1.9609375,
"learning_rate": 1.5024585125998772e-05,
"loss": 0.4193,
"step": 1620
},
{
"epoch": 0.5009989242354388,
"grad_norm": 1.9921875,
"learning_rate": 1.4993853718500309e-05,
"loss": 0.3915,
"step": 1630
},
{
"epoch": 0.504072537267558,
"grad_norm": 2.109375,
"learning_rate": 1.4963122311001846e-05,
"loss": 0.4306,
"step": 1640
},
{
"epoch": 0.5071461502996772,
"grad_norm": 2.171875,
"learning_rate": 1.4932390903503382e-05,
"loss": 0.4278,
"step": 1650
},
{
"epoch": 0.5102197633317965,
"grad_norm": 2.3125,
"learning_rate": 1.490165949600492e-05,
"loss": 0.4045,
"step": 1660
},
{
"epoch": 0.5132933763639158,
"grad_norm": 1.828125,
"learning_rate": 1.4870928088506456e-05,
"loss": 0.3882,
"step": 1670
},
{
"epoch": 0.5163669893960351,
"grad_norm": 2.140625,
"learning_rate": 1.4840196681007993e-05,
"loss": 0.4204,
"step": 1680
},
{
"epoch": 0.5194406024281543,
"grad_norm": 2.046875,
"learning_rate": 1.480946527350953e-05,
"loss": 0.362,
"step": 1690
},
{
"epoch": 0.5225142154602735,
"grad_norm": 2.328125,
"learning_rate": 1.4778733866011064e-05,
"loss": 0.3536,
"step": 1700
},
{
"epoch": 0.5255878284923928,
"grad_norm": 1.953125,
"learning_rate": 1.4748002458512601e-05,
"loss": 0.3586,
"step": 1710
},
{
"epoch": 0.5286614415245121,
"grad_norm": 1.7890625,
"learning_rate": 1.4717271051014138e-05,
"loss": 0.3937,
"step": 1720
},
{
"epoch": 0.5317350545566313,
"grad_norm": 2.3125,
"learning_rate": 1.4686539643515674e-05,
"loss": 0.3939,
"step": 1730
},
{
"epoch": 0.5348086675887506,
"grad_norm": 1.9375,
"learning_rate": 1.4655808236017211e-05,
"loss": 0.3851,
"step": 1740
},
{
"epoch": 0.5378822806208698,
"grad_norm": 1.8984375,
"learning_rate": 1.4625076828518748e-05,
"loss": 0.429,
"step": 1750
},
{
"epoch": 0.5409558936529891,
"grad_norm": 2.078125,
"learning_rate": 1.4594345421020284e-05,
"loss": 0.3902,
"step": 1760
},
{
"epoch": 0.5440295066851083,
"grad_norm": 2.140625,
"learning_rate": 1.4563614013521821e-05,
"loss": 0.3783,
"step": 1770
},
{
"epoch": 0.5471031197172276,
"grad_norm": 1.9296875,
"learning_rate": 1.4532882606023358e-05,
"loss": 0.4017,
"step": 1780
},
{
"epoch": 0.5501767327493469,
"grad_norm": 2.359375,
"learning_rate": 1.4502151198524894e-05,
"loss": 0.3858,
"step": 1790
},
{
"epoch": 0.5532503457814661,
"grad_norm": 2.375,
"learning_rate": 1.4471419791026431e-05,
"loss": 0.3847,
"step": 1800
},
{
"epoch": 0.5563239588135853,
"grad_norm": 1.8671875,
"learning_rate": 1.4440688383527968e-05,
"loss": 0.4029,
"step": 1810
},
{
"epoch": 0.5593975718457046,
"grad_norm": 1.8828125,
"learning_rate": 1.4409956976029505e-05,
"loss": 0.3941,
"step": 1820
},
{
"epoch": 0.5624711848778239,
"grad_norm": 2.46875,
"learning_rate": 1.437922556853104e-05,
"loss": 0.3994,
"step": 1830
},
{
"epoch": 0.5655447979099432,
"grad_norm": 2.21875,
"learning_rate": 1.4348494161032576e-05,
"loss": 0.3863,
"step": 1840
},
{
"epoch": 0.5686184109420624,
"grad_norm": 1.9296875,
"learning_rate": 1.4317762753534113e-05,
"loss": 0.3733,
"step": 1850
},
{
"epoch": 0.5716920239741816,
"grad_norm": 1.828125,
"learning_rate": 1.428703134603565e-05,
"loss": 0.367,
"step": 1860
},
{
"epoch": 0.5747656370063009,
"grad_norm": 1.9375,
"learning_rate": 1.4256299938537186e-05,
"loss": 0.3405,
"step": 1870
},
{
"epoch": 0.5778392500384202,
"grad_norm": 1.9453125,
"learning_rate": 1.4225568531038723e-05,
"loss": 0.39,
"step": 1880
},
{
"epoch": 0.5809128630705395,
"grad_norm": 1.6328125,
"learning_rate": 1.419483712354026e-05,
"loss": 0.3968,
"step": 1890
},
{
"epoch": 0.5839864761026586,
"grad_norm": 2.421875,
"learning_rate": 1.4164105716041796e-05,
"loss": 0.3926,
"step": 1900
},
{
"epoch": 0.5870600891347779,
"grad_norm": 2.203125,
"learning_rate": 1.4133374308543333e-05,
"loss": 0.3772,
"step": 1910
},
{
"epoch": 0.5901337021668972,
"grad_norm": 1.8359375,
"learning_rate": 1.410264290104487e-05,
"loss": 0.4271,
"step": 1920
},
{
"epoch": 0.5932073151990165,
"grad_norm": 2.96875,
"learning_rate": 1.4071911493546407e-05,
"loss": 0.3683,
"step": 1930
},
{
"epoch": 0.5962809282311357,
"grad_norm": 2.0,
"learning_rate": 1.4041180086047943e-05,
"loss": 0.3764,
"step": 1940
},
{
"epoch": 0.5993545412632549,
"grad_norm": 1.8515625,
"learning_rate": 1.401044867854948e-05,
"loss": 0.4047,
"step": 1950
},
{
"epoch": 0.6024281542953742,
"grad_norm": 2.0625,
"learning_rate": 1.3979717271051015e-05,
"loss": 0.3684,
"step": 1960
},
{
"epoch": 0.6055017673274935,
"grad_norm": 2.5625,
"learning_rate": 1.3948985863552552e-05,
"loss": 0.4148,
"step": 1970
},
{
"epoch": 0.6085753803596127,
"grad_norm": 2.078125,
"learning_rate": 1.3918254456054088e-05,
"loss": 0.3858,
"step": 1980
},
{
"epoch": 0.611648993391732,
"grad_norm": 2.515625,
"learning_rate": 1.3887523048555625e-05,
"loss": 0.3473,
"step": 1990
},
{
"epoch": 0.6147226064238512,
"grad_norm": 1.875,
"learning_rate": 1.3856791641057162e-05,
"loss": 0.3727,
"step": 2000
},
{
"epoch": 0.6177962194559705,
"grad_norm": 2.40625,
"learning_rate": 1.3826060233558698e-05,
"loss": 0.3615,
"step": 2010
},
{
"epoch": 0.6208698324880898,
"grad_norm": 1.9453125,
"learning_rate": 1.3795328826060235e-05,
"loss": 0.39,
"step": 2020
},
{
"epoch": 0.623943445520209,
"grad_norm": 1.7109375,
"learning_rate": 1.3764597418561772e-05,
"loss": 0.3736,
"step": 2030
},
{
"epoch": 0.6270170585523283,
"grad_norm": 1.828125,
"learning_rate": 1.3733866011063308e-05,
"loss": 0.3375,
"step": 2040
},
{
"epoch": 0.6300906715844475,
"grad_norm": 1.9765625,
"learning_rate": 1.3703134603564845e-05,
"loss": 0.3694,
"step": 2050
},
{
"epoch": 0.6331642846165668,
"grad_norm": 1.9765625,
"learning_rate": 1.3672403196066382e-05,
"loss": 0.3941,
"step": 2060
},
{
"epoch": 0.636237897648686,
"grad_norm": 1.640625,
"learning_rate": 1.3641671788567919e-05,
"loss": 0.333,
"step": 2070
},
{
"epoch": 0.6393115106808053,
"grad_norm": 1.8125,
"learning_rate": 1.3610940381069455e-05,
"loss": 0.3883,
"step": 2080
},
{
"epoch": 0.6423851237129246,
"grad_norm": 1.8515625,
"learning_rate": 1.358020897357099e-05,
"loss": 0.3817,
"step": 2090
},
{
"epoch": 0.6454587367450438,
"grad_norm": 2.515625,
"learning_rate": 1.3549477566072527e-05,
"loss": 0.37,
"step": 2100
},
{
"epoch": 0.648532349777163,
"grad_norm": 2.046875,
"learning_rate": 1.3518746158574064e-05,
"loss": 0.372,
"step": 2110
},
{
"epoch": 0.6516059628092823,
"grad_norm": 3.109375,
"learning_rate": 1.34880147510756e-05,
"loss": 0.388,
"step": 2120
},
{
"epoch": 0.6546795758414016,
"grad_norm": 2.140625,
"learning_rate": 1.3457283343577137e-05,
"loss": 0.3619,
"step": 2130
},
{
"epoch": 0.6577531888735209,
"grad_norm": 2.265625,
"learning_rate": 1.3426551936078674e-05,
"loss": 0.408,
"step": 2140
},
{
"epoch": 0.66082680190564,
"grad_norm": 2.453125,
"learning_rate": 1.339582052858021e-05,
"loss": 0.3902,
"step": 2150
},
{
"epoch": 0.6639004149377593,
"grad_norm": 2.234375,
"learning_rate": 1.3365089121081747e-05,
"loss": 0.3731,
"step": 2160
},
{
"epoch": 0.6669740279698786,
"grad_norm": 2.015625,
"learning_rate": 1.3334357713583284e-05,
"loss": 0.3645,
"step": 2170
},
{
"epoch": 0.6700476410019979,
"grad_norm": 2.3125,
"learning_rate": 1.330362630608482e-05,
"loss": 0.3926,
"step": 2180
},
{
"epoch": 0.6731212540341172,
"grad_norm": 2.109375,
"learning_rate": 1.3272894898586357e-05,
"loss": 0.3805,
"step": 2190
},
{
"epoch": 0.6761948670662363,
"grad_norm": 1.8359375,
"learning_rate": 1.3242163491087894e-05,
"loss": 0.3481,
"step": 2200
},
{
"epoch": 0.6792684800983556,
"grad_norm": 2.109375,
"learning_rate": 1.321143208358943e-05,
"loss": 0.3531,
"step": 2210
},
{
"epoch": 0.6823420931304749,
"grad_norm": 1.9765625,
"learning_rate": 1.3180700676090966e-05,
"loss": 0.3419,
"step": 2220
},
{
"epoch": 0.6854157061625942,
"grad_norm": 1.609375,
"learning_rate": 1.3149969268592502e-05,
"loss": 0.3551,
"step": 2230
},
{
"epoch": 0.6884893191947133,
"grad_norm": 2.28125,
"learning_rate": 1.3119237861094039e-05,
"loss": 0.3668,
"step": 2240
},
{
"epoch": 0.6915629322268326,
"grad_norm": 2.46875,
"learning_rate": 1.3088506453595576e-05,
"loss": 0.3915,
"step": 2250
},
{
"epoch": 0.6946365452589519,
"grad_norm": 2.484375,
"learning_rate": 1.3057775046097112e-05,
"loss": 0.3845,
"step": 2260
},
{
"epoch": 0.6977101582910712,
"grad_norm": 2.578125,
"learning_rate": 1.3027043638598649e-05,
"loss": 0.3735,
"step": 2270
},
{
"epoch": 0.7007837713231904,
"grad_norm": 2.25,
"learning_rate": 1.2996312231100186e-05,
"loss": 0.355,
"step": 2280
},
{
"epoch": 0.7038573843553096,
"grad_norm": 2.109375,
"learning_rate": 1.2965580823601722e-05,
"loss": 0.3618,
"step": 2290
},
{
"epoch": 0.7069309973874289,
"grad_norm": 2.546875,
"learning_rate": 1.293484941610326e-05,
"loss": 0.3698,
"step": 2300
},
{
"epoch": 0.7100046104195482,
"grad_norm": 2.546875,
"learning_rate": 1.2904118008604796e-05,
"loss": 0.359,
"step": 2310
},
{
"epoch": 0.7130782234516674,
"grad_norm": 2.15625,
"learning_rate": 1.2873386601106333e-05,
"loss": 0.3443,
"step": 2320
},
{
"epoch": 0.7161518364837867,
"grad_norm": 2.65625,
"learning_rate": 1.284265519360787e-05,
"loss": 0.3849,
"step": 2330
},
{
"epoch": 0.719225449515906,
"grad_norm": 2.84375,
"learning_rate": 1.2811923786109406e-05,
"loss": 0.4124,
"step": 2340
},
{
"epoch": 0.7222990625480252,
"grad_norm": 2.140625,
"learning_rate": 1.2781192378610941e-05,
"loss": 0.3298,
"step": 2350
},
{
"epoch": 0.7253726755801445,
"grad_norm": 2.515625,
"learning_rate": 1.2750460971112478e-05,
"loss": 0.37,
"step": 2360
},
{
"epoch": 0.7284462886122637,
"grad_norm": 2.0625,
"learning_rate": 1.2719729563614014e-05,
"loss": 0.388,
"step": 2370
},
{
"epoch": 0.731519901644383,
"grad_norm": 1.796875,
"learning_rate": 1.2688998156115551e-05,
"loss": 0.3375,
"step": 2380
},
{
"epoch": 0.7345935146765022,
"grad_norm": 2.40625,
"learning_rate": 1.2658266748617088e-05,
"loss": 0.3783,
"step": 2390
},
{
"epoch": 0.7376671277086215,
"grad_norm": 1.828125,
"learning_rate": 1.2627535341118624e-05,
"loss": 0.3779,
"step": 2400
},
{
"epoch": 0.7407407407407407,
"grad_norm": 1.8203125,
"learning_rate": 1.2596803933620161e-05,
"loss": 0.3391,
"step": 2410
},
{
"epoch": 0.74381435377286,
"grad_norm": 2.5625,
"learning_rate": 1.2566072526121698e-05,
"loss": 0.3817,
"step": 2420
},
{
"epoch": 0.7468879668049793,
"grad_norm": 2.453125,
"learning_rate": 1.2535341118623235e-05,
"loss": 0.3502,
"step": 2430
},
{
"epoch": 0.7499615798370985,
"grad_norm": 2.359375,
"learning_rate": 1.2504609711124771e-05,
"loss": 0.3552,
"step": 2440
},
{
"epoch": 0.7530351928692177,
"grad_norm": 2.09375,
"learning_rate": 1.2473878303626308e-05,
"loss": 0.3718,
"step": 2450
},
{
"epoch": 0.756108805901337,
"grad_norm": 2.03125,
"learning_rate": 1.2443146896127845e-05,
"loss": 0.3423,
"step": 2460
},
{
"epoch": 0.7591824189334563,
"grad_norm": 2.109375,
"learning_rate": 1.2412415488629381e-05,
"loss": 0.3806,
"step": 2470
},
{
"epoch": 0.7622560319655756,
"grad_norm": 2.15625,
"learning_rate": 1.2381684081130916e-05,
"loss": 0.353,
"step": 2480
},
{
"epoch": 0.7653296449976947,
"grad_norm": 2.046875,
"learning_rate": 1.2350952673632453e-05,
"loss": 0.3523,
"step": 2490
},
{
"epoch": 0.768403258029814,
"grad_norm": 2.109375,
"learning_rate": 1.232022126613399e-05,
"loss": 0.3781,
"step": 2500
},
{
"epoch": 0.7714768710619333,
"grad_norm": 2.21875,
"learning_rate": 1.2289489858635526e-05,
"loss": 0.3533,
"step": 2510
},
{
"epoch": 0.7745504840940526,
"grad_norm": 1.703125,
"learning_rate": 1.2258758451137063e-05,
"loss": 0.346,
"step": 2520
},
{
"epoch": 0.7776240971261719,
"grad_norm": 2.21875,
"learning_rate": 1.22280270436386e-05,
"loss": 0.3564,
"step": 2530
},
{
"epoch": 0.780697710158291,
"grad_norm": 2.015625,
"learning_rate": 1.2197295636140136e-05,
"loss": 0.3397,
"step": 2540
},
{
"epoch": 0.7837713231904103,
"grad_norm": 1.7421875,
"learning_rate": 1.2166564228641673e-05,
"loss": 0.3456,
"step": 2550
},
{
"epoch": 0.7868449362225296,
"grad_norm": 2.109375,
"learning_rate": 1.213583282114321e-05,
"loss": 0.3214,
"step": 2560
},
{
"epoch": 0.7899185492546489,
"grad_norm": 2.046875,
"learning_rate": 1.2105101413644747e-05,
"loss": 0.3448,
"step": 2570
},
{
"epoch": 0.792992162286768,
"grad_norm": 2.171875,
"learning_rate": 1.2074370006146283e-05,
"loss": 0.3729,
"step": 2580
},
{
"epoch": 0.7960657753188873,
"grad_norm": 2.390625,
"learning_rate": 1.204363859864782e-05,
"loss": 0.3235,
"step": 2590
},
{
"epoch": 0.7991393883510066,
"grad_norm": 2.796875,
"learning_rate": 1.2012907191149357e-05,
"loss": 0.3423,
"step": 2600
},
{
"epoch": 0.8022130013831259,
"grad_norm": 2.28125,
"learning_rate": 1.1982175783650892e-05,
"loss": 0.3604,
"step": 2610
},
{
"epoch": 0.8052866144152451,
"grad_norm": 1.9765625,
"learning_rate": 1.1951444376152428e-05,
"loss": 0.3702,
"step": 2620
},
{
"epoch": 0.8083602274473644,
"grad_norm": 1.9921875,
"learning_rate": 1.1920712968653965e-05,
"loss": 0.3613,
"step": 2630
},
{
"epoch": 0.8114338404794836,
"grad_norm": 2.296875,
"learning_rate": 1.1889981561155502e-05,
"loss": 0.3198,
"step": 2640
},
{
"epoch": 0.8145074535116029,
"grad_norm": 2.625,
"learning_rate": 1.1859250153657038e-05,
"loss": 0.3552,
"step": 2650
},
{
"epoch": 0.8175810665437221,
"grad_norm": 2.234375,
"learning_rate": 1.1828518746158575e-05,
"loss": 0.3481,
"step": 2660
},
{
"epoch": 0.8206546795758414,
"grad_norm": 1.8359375,
"learning_rate": 1.1797787338660112e-05,
"loss": 0.3437,
"step": 2670
},
{
"epoch": 0.8237282926079607,
"grad_norm": 2.0,
"learning_rate": 1.1767055931161648e-05,
"loss": 0.3552,
"step": 2680
},
{
"epoch": 0.8268019056400799,
"grad_norm": 2.34375,
"learning_rate": 1.1736324523663185e-05,
"loss": 0.3293,
"step": 2690
},
{
"epoch": 0.8298755186721992,
"grad_norm": 2.203125,
"learning_rate": 1.1705593116164722e-05,
"loss": 0.329,
"step": 2700
},
{
"epoch": 0.8329491317043184,
"grad_norm": 1.8046875,
"learning_rate": 1.1674861708666259e-05,
"loss": 0.338,
"step": 2710
},
{
"epoch": 0.8360227447364377,
"grad_norm": 2.140625,
"learning_rate": 1.1644130301167795e-05,
"loss": 0.3558,
"step": 2720
},
{
"epoch": 0.839096357768557,
"grad_norm": 2.875,
"learning_rate": 1.1613398893669332e-05,
"loss": 0.3521,
"step": 2730
},
{
"epoch": 0.8421699708006762,
"grad_norm": 2.359375,
"learning_rate": 1.1582667486170867e-05,
"loss": 0.381,
"step": 2740
},
{
"epoch": 0.8452435838327954,
"grad_norm": 2.046875,
"learning_rate": 1.1551936078672404e-05,
"loss": 0.351,
"step": 2750
},
{
"epoch": 0.8483171968649147,
"grad_norm": 1.9375,
"learning_rate": 1.152120467117394e-05,
"loss": 0.3478,
"step": 2760
},
{
"epoch": 0.851390809897034,
"grad_norm": 2.25,
"learning_rate": 1.1490473263675477e-05,
"loss": 0.3434,
"step": 2770
},
{
"epoch": 0.8544644229291533,
"grad_norm": 1.9375,
"learning_rate": 1.1459741856177014e-05,
"loss": 0.3457,
"step": 2780
},
{
"epoch": 0.8575380359612724,
"grad_norm": 1.96875,
"learning_rate": 1.142901044867855e-05,
"loss": 0.3267,
"step": 2790
},
{
"epoch": 0.8606116489933917,
"grad_norm": 3.171875,
"learning_rate": 1.1398279041180087e-05,
"loss": 0.3545,
"step": 2800
},
{
"epoch": 0.863685262025511,
"grad_norm": 1.7734375,
"learning_rate": 1.1367547633681624e-05,
"loss": 0.3394,
"step": 2810
},
{
"epoch": 0.8667588750576303,
"grad_norm": 2.421875,
"learning_rate": 1.133681622618316e-05,
"loss": 0.3388,
"step": 2820
},
{
"epoch": 0.8698324880897496,
"grad_norm": 2.265625,
"learning_rate": 1.1306084818684697e-05,
"loss": 0.3326,
"step": 2830
},
{
"epoch": 0.8729061011218687,
"grad_norm": 2.328125,
"learning_rate": 1.1275353411186234e-05,
"loss": 0.2943,
"step": 2840
},
{
"epoch": 0.875979714153988,
"grad_norm": 2.765625,
"learning_rate": 1.124462200368777e-05,
"loss": 0.3485,
"step": 2850
},
{
"epoch": 0.8790533271861073,
"grad_norm": 2.640625,
"learning_rate": 1.1213890596189307e-05,
"loss": 0.3435,
"step": 2860
},
{
"epoch": 0.8821269402182266,
"grad_norm": 1.90625,
"learning_rate": 1.1183159188690842e-05,
"loss": 0.336,
"step": 2870
},
{
"epoch": 0.8852005532503457,
"grad_norm": 2.234375,
"learning_rate": 1.1152427781192379e-05,
"loss": 0.3299,
"step": 2880
},
{
"epoch": 0.888274166282465,
"grad_norm": 2.359375,
"learning_rate": 1.1121696373693916e-05,
"loss": 0.3236,
"step": 2890
},
{
"epoch": 0.8913477793145843,
"grad_norm": 2.6875,
"learning_rate": 1.1090964966195452e-05,
"loss": 0.3486,
"step": 2900
},
{
"epoch": 0.8944213923467036,
"grad_norm": 2.46875,
"learning_rate": 1.1060233558696989e-05,
"loss": 0.361,
"step": 2910
},
{
"epoch": 0.8974950053788228,
"grad_norm": 2.15625,
"learning_rate": 1.1029502151198526e-05,
"loss": 0.3271,
"step": 2920
},
{
"epoch": 0.900568618410942,
"grad_norm": 2.71875,
"learning_rate": 1.0998770743700062e-05,
"loss": 0.3285,
"step": 2930
},
{
"epoch": 0.9036422314430613,
"grad_norm": 2.015625,
"learning_rate": 1.09680393362016e-05,
"loss": 0.3021,
"step": 2940
},
{
"epoch": 0.9067158444751806,
"grad_norm": 2.515625,
"learning_rate": 1.0937307928703136e-05,
"loss": 0.3023,
"step": 2950
},
{
"epoch": 0.9097894575072998,
"grad_norm": 2.203125,
"learning_rate": 1.0906576521204673e-05,
"loss": 0.351,
"step": 2960
},
{
"epoch": 0.9128630705394191,
"grad_norm": 2.515625,
"learning_rate": 1.087584511370621e-05,
"loss": 0.3478,
"step": 2970
},
{
"epoch": 0.9159366835715383,
"grad_norm": 2.09375,
"learning_rate": 1.0845113706207746e-05,
"loss": 0.3674,
"step": 2980
},
{
"epoch": 0.9190102966036576,
"grad_norm": 2.25,
"learning_rate": 1.0814382298709283e-05,
"loss": 0.3331,
"step": 2990
},
{
"epoch": 0.9220839096357769,
"grad_norm": 2.109375,
"learning_rate": 1.0783650891210818e-05,
"loss": 0.3487,
"step": 3000
},
{
"epoch": 0.9251575226678961,
"grad_norm": 1.984375,
"learning_rate": 1.0752919483712354e-05,
"loss": 0.3305,
"step": 3010
},
{
"epoch": 0.9282311357000154,
"grad_norm": 2.640625,
"learning_rate": 1.0722188076213891e-05,
"loss": 0.3359,
"step": 3020
},
{
"epoch": 0.9313047487321346,
"grad_norm": 2.15625,
"learning_rate": 1.0691456668715428e-05,
"loss": 0.3223,
"step": 3030
},
{
"epoch": 0.9343783617642539,
"grad_norm": 2.515625,
"learning_rate": 1.0660725261216964e-05,
"loss": 0.3388,
"step": 3040
},
{
"epoch": 0.9374519747963731,
"grad_norm": 2.15625,
"learning_rate": 1.0629993853718501e-05,
"loss": 0.3115,
"step": 3050
},
{
"epoch": 0.9405255878284924,
"grad_norm": 2.421875,
"learning_rate": 1.0599262446220038e-05,
"loss": 0.3434,
"step": 3060
},
{
"epoch": 0.9435992008606117,
"grad_norm": 1.859375,
"learning_rate": 1.0568531038721575e-05,
"loss": 0.333,
"step": 3070
},
{
"epoch": 0.946672813892731,
"grad_norm": 2.28125,
"learning_rate": 1.0537799631223111e-05,
"loss": 0.3097,
"step": 3080
},
{
"epoch": 0.9497464269248501,
"grad_norm": 1.78125,
"learning_rate": 1.0507068223724648e-05,
"loss": 0.3393,
"step": 3090
},
{
"epoch": 0.9528200399569694,
"grad_norm": 2.296875,
"learning_rate": 1.0476336816226185e-05,
"loss": 0.32,
"step": 3100
},
{
"epoch": 0.9558936529890887,
"grad_norm": 2.359375,
"learning_rate": 1.0445605408727721e-05,
"loss": 0.3458,
"step": 3110
},
{
"epoch": 0.958967266021208,
"grad_norm": 2.578125,
"learning_rate": 1.0414874001229258e-05,
"loss": 0.3408,
"step": 3120
},
{
"epoch": 0.9620408790533271,
"grad_norm": 2.3125,
"learning_rate": 1.0384142593730793e-05,
"loss": 0.3059,
"step": 3130
},
{
"epoch": 0.9651144920854464,
"grad_norm": 2.625,
"learning_rate": 1.035341118623233e-05,
"loss": 0.3436,
"step": 3140
},
{
"epoch": 0.9681881051175657,
"grad_norm": 2.03125,
"learning_rate": 1.0322679778733866e-05,
"loss": 0.3455,
"step": 3150
},
{
"epoch": 0.971261718149685,
"grad_norm": 2.21875,
"learning_rate": 1.0291948371235403e-05,
"loss": 0.3236,
"step": 3160
},
{
"epoch": 0.9743353311818043,
"grad_norm": 2.203125,
"learning_rate": 1.026121696373694e-05,
"loss": 0.3414,
"step": 3170
},
{
"epoch": 0.9774089442139234,
"grad_norm": 2.390625,
"learning_rate": 1.0230485556238476e-05,
"loss": 0.3181,
"step": 3180
},
{
"epoch": 0.9804825572460427,
"grad_norm": 2.40625,
"learning_rate": 1.0199754148740013e-05,
"loss": 0.3312,
"step": 3190
},
{
"epoch": 0.983556170278162,
"grad_norm": 2.28125,
"learning_rate": 1.016902274124155e-05,
"loss": 0.3483,
"step": 3200
},
{
"epoch": 0.9866297833102813,
"grad_norm": 1.84375,
"learning_rate": 1.0138291333743087e-05,
"loss": 0.3294,
"step": 3210
},
{
"epoch": 0.9897033963424005,
"grad_norm": 1.953125,
"learning_rate": 1.0107559926244623e-05,
"loss": 0.3232,
"step": 3220
},
{
"epoch": 0.9927770093745197,
"grad_norm": 2.34375,
"learning_rate": 1.007682851874616e-05,
"loss": 0.2939,
"step": 3230
},
{
"epoch": 0.995850622406639,
"grad_norm": 1.9453125,
"learning_rate": 1.0046097111247697e-05,
"loss": 0.3432,
"step": 3240
},
{
"epoch": 0.9989242354387583,
"grad_norm": 2.3125,
"learning_rate": 1.0015365703749233e-05,
"loss": 0.3194,
"step": 3250
},
{
"epoch": 1.0018441678192715,
"grad_norm": 3.0,
"learning_rate": 9.98463429625077e-06,
"loss": 0.2676,
"step": 3260
},
{
"epoch": 1.0049177808513907,
"grad_norm": 2.734375,
"learning_rate": 9.953902888752307e-06,
"loss": 0.2694,
"step": 3270
},
{
"epoch": 1.0079913938835101,
"grad_norm": 1.875,
"learning_rate": 9.923171481253843e-06,
"loss": 0.2422,
"step": 3280
},
{
"epoch": 1.0110650069156293,
"grad_norm": 1.8515625,
"learning_rate": 9.892440073755378e-06,
"loss": 0.2634,
"step": 3290
},
{
"epoch": 1.0141386199477487,
"grad_norm": 2.0625,
"learning_rate": 9.861708666256915e-06,
"loss": 0.252,
"step": 3300
},
{
"epoch": 1.0172122329798678,
"grad_norm": 2.484375,
"learning_rate": 9.830977258758452e-06,
"loss": 0.2702,
"step": 3310
},
{
"epoch": 1.020285846011987,
"grad_norm": 2.1875,
"learning_rate": 9.800245851259988e-06,
"loss": 0.2959,
"step": 3320
},
{
"epoch": 1.0233594590441064,
"grad_norm": 1.890625,
"learning_rate": 9.769514443761525e-06,
"loss": 0.2707,
"step": 3330
},
{
"epoch": 1.0264330720762256,
"grad_norm": 2.125,
"learning_rate": 9.738783036263062e-06,
"loss": 0.2567,
"step": 3340
},
{
"epoch": 1.0295066851083448,
"grad_norm": 2.40625,
"learning_rate": 9.708051628764599e-06,
"loss": 0.2623,
"step": 3350
},
{
"epoch": 1.0325802981404641,
"grad_norm": 2.171875,
"learning_rate": 9.677320221266134e-06,
"loss": 0.2846,
"step": 3360
},
{
"epoch": 1.0356539111725833,
"grad_norm": 2.296875,
"learning_rate": 9.64658881376767e-06,
"loss": 0.2979,
"step": 3370
},
{
"epoch": 1.0387275242047027,
"grad_norm": 2.140625,
"learning_rate": 9.615857406269209e-06,
"loss": 0.2692,
"step": 3380
},
{
"epoch": 1.0418011372368219,
"grad_norm": 2.203125,
"learning_rate": 9.585125998770745e-06,
"loss": 0.2733,
"step": 3390
},
{
"epoch": 1.044874750268941,
"grad_norm": 2.375,
"learning_rate": 9.554394591272282e-06,
"loss": 0.2625,
"step": 3400
},
{
"epoch": 1.0479483633010604,
"grad_norm": 2.0625,
"learning_rate": 9.523663183773819e-06,
"loss": 0.283,
"step": 3410
},
{
"epoch": 1.0510219763331796,
"grad_norm": 2.171875,
"learning_rate": 9.492931776275354e-06,
"loss": 0.2823,
"step": 3420
},
{
"epoch": 1.054095589365299,
"grad_norm": 2.25,
"learning_rate": 9.46220036877689e-06,
"loss": 0.2728,
"step": 3430
},
{
"epoch": 1.0571692023974182,
"grad_norm": 1.8203125,
"learning_rate": 9.431468961278427e-06,
"loss": 0.2437,
"step": 3440
},
{
"epoch": 1.0602428154295374,
"grad_norm": 2.375,
"learning_rate": 9.400737553779964e-06,
"loss": 0.2473,
"step": 3450
},
{
"epoch": 1.0633164284616567,
"grad_norm": 1.7421875,
"learning_rate": 9.3700061462815e-06,
"loss": 0.27,
"step": 3460
},
{
"epoch": 1.066390041493776,
"grad_norm": 2.65625,
"learning_rate": 9.339274738783037e-06,
"loss": 0.293,
"step": 3470
},
{
"epoch": 1.069463654525895,
"grad_norm": 3.109375,
"learning_rate": 9.308543331284574e-06,
"loss": 0.2806,
"step": 3480
},
{
"epoch": 1.0725372675580145,
"grad_norm": 2.546875,
"learning_rate": 9.277811923786109e-06,
"loss": 0.2824,
"step": 3490
},
{
"epoch": 1.0756108805901337,
"grad_norm": 3.0625,
"learning_rate": 9.247080516287647e-06,
"loss": 0.2685,
"step": 3500
},
{
"epoch": 1.078684493622253,
"grad_norm": 2.15625,
"learning_rate": 9.216349108789184e-06,
"loss": 0.2478,
"step": 3510
},
{
"epoch": 1.0817581066543722,
"grad_norm": 2.140625,
"learning_rate": 9.18561770129072e-06,
"loss": 0.2639,
"step": 3520
},
{
"epoch": 1.0848317196864914,
"grad_norm": 2.609375,
"learning_rate": 9.154886293792257e-06,
"loss": 0.2966,
"step": 3530
},
{
"epoch": 1.0879053327186108,
"grad_norm": 2.078125,
"learning_rate": 9.124154886293794e-06,
"loss": 0.2538,
"step": 3540
},
{
"epoch": 1.09097894575073,
"grad_norm": 2.0625,
"learning_rate": 9.093423478795329e-06,
"loss": 0.283,
"step": 3550
},
{
"epoch": 1.0940525587828493,
"grad_norm": 2.578125,
"learning_rate": 9.062692071296866e-06,
"loss": 0.295,
"step": 3560
},
{
"epoch": 1.0971261718149685,
"grad_norm": 2.046875,
"learning_rate": 9.031960663798402e-06,
"loss": 0.2694,
"step": 3570
},
{
"epoch": 1.1001997848470877,
"grad_norm": 2.328125,
"learning_rate": 9.00122925629994e-06,
"loss": 0.275,
"step": 3580
},
{
"epoch": 1.103273397879207,
"grad_norm": 2.640625,
"learning_rate": 8.970497848801476e-06,
"loss": 0.3061,
"step": 3590
},
{
"epoch": 1.1063470109113263,
"grad_norm": 2.53125,
"learning_rate": 8.939766441303013e-06,
"loss": 0.2823,
"step": 3600
},
{
"epoch": 1.1094206239434454,
"grad_norm": 2.296875,
"learning_rate": 8.90903503380455e-06,
"loss": 0.2522,
"step": 3610
},
{
"epoch": 1.1124942369755648,
"grad_norm": 2.078125,
"learning_rate": 8.878303626306086e-06,
"loss": 0.2925,
"step": 3620
},
{
"epoch": 1.115567850007684,
"grad_norm": 2.765625,
"learning_rate": 8.847572218807623e-06,
"loss": 0.2762,
"step": 3630
},
{
"epoch": 1.1186414630398034,
"grad_norm": 2.65625,
"learning_rate": 8.81684081130916e-06,
"loss": 0.2761,
"step": 3640
},
{
"epoch": 1.1217150760719226,
"grad_norm": 2.296875,
"learning_rate": 8.786109403810696e-06,
"loss": 0.273,
"step": 3650
},
{
"epoch": 1.1247886891040417,
"grad_norm": 2.328125,
"learning_rate": 8.755377996312233e-06,
"loss": 0.2805,
"step": 3660
},
{
"epoch": 1.1278623021361611,
"grad_norm": 2.703125,
"learning_rate": 8.72464658881377e-06,
"loss": 0.2725,
"step": 3670
},
{
"epoch": 1.1309359151682803,
"grad_norm": 2.4375,
"learning_rate": 8.693915181315304e-06,
"loss": 0.2676,
"step": 3680
},
{
"epoch": 1.1340095282003997,
"grad_norm": 2.359375,
"learning_rate": 8.663183773816841e-06,
"loss": 0.2852,
"step": 3690
},
{
"epoch": 1.1370831412325189,
"grad_norm": 2.765625,
"learning_rate": 8.632452366318378e-06,
"loss": 0.2739,
"step": 3700
},
{
"epoch": 1.140156754264638,
"grad_norm": 2.078125,
"learning_rate": 8.601720958819915e-06,
"loss": 0.2742,
"step": 3710
},
{
"epoch": 1.1432303672967574,
"grad_norm": 2.640625,
"learning_rate": 8.570989551321451e-06,
"loss": 0.2766,
"step": 3720
},
{
"epoch": 1.1463039803288766,
"grad_norm": 2.375,
"learning_rate": 8.540258143822988e-06,
"loss": 0.2897,
"step": 3730
},
{
"epoch": 1.1493775933609958,
"grad_norm": 2.0625,
"learning_rate": 8.509526736324525e-06,
"loss": 0.2664,
"step": 3740
},
{
"epoch": 1.1524512063931152,
"grad_norm": 2.359375,
"learning_rate": 8.478795328826061e-06,
"loss": 0.2723,
"step": 3750
},
{
"epoch": 1.1555248194252343,
"grad_norm": 1.765625,
"learning_rate": 8.448063921327598e-06,
"loss": 0.2647,
"step": 3760
},
{
"epoch": 1.1585984324573535,
"grad_norm": 2.59375,
"learning_rate": 8.417332513829135e-06,
"loss": 0.3002,
"step": 3770
},
{
"epoch": 1.161672045489473,
"grad_norm": 1.796875,
"learning_rate": 8.386601106330671e-06,
"loss": 0.2566,
"step": 3780
},
{
"epoch": 1.164745658521592,
"grad_norm": 2.328125,
"learning_rate": 8.355869698832208e-06,
"loss": 0.2624,
"step": 3790
},
{
"epoch": 1.1678192715537115,
"grad_norm": 2.140625,
"learning_rate": 8.325138291333745e-06,
"loss": 0.2831,
"step": 3800
},
{
"epoch": 1.1708928845858306,
"grad_norm": 2.171875,
"learning_rate": 8.29440688383528e-06,
"loss": 0.2856,
"step": 3810
},
{
"epoch": 1.17396649761795,
"grad_norm": 2.21875,
"learning_rate": 8.263675476336816e-06,
"loss": 0.2612,
"step": 3820
},
{
"epoch": 1.1770401106500692,
"grad_norm": 2.296875,
"learning_rate": 8.232944068838353e-06,
"loss": 0.2484,
"step": 3830
},
{
"epoch": 1.1801137236821884,
"grad_norm": 1.828125,
"learning_rate": 8.20221266133989e-06,
"loss": 0.2772,
"step": 3840
},
{
"epoch": 1.1831873367143078,
"grad_norm": 2.375,
"learning_rate": 8.171481253841427e-06,
"loss": 0.2308,
"step": 3850
},
{
"epoch": 1.186260949746427,
"grad_norm": 2.203125,
"learning_rate": 8.140749846342963e-06,
"loss": 0.2431,
"step": 3860
},
{
"epoch": 1.189334562778546,
"grad_norm": 2.5625,
"learning_rate": 8.1100184388445e-06,
"loss": 0.262,
"step": 3870
},
{
"epoch": 1.1924081758106655,
"grad_norm": 2.578125,
"learning_rate": 8.079287031346037e-06,
"loss": 0.2678,
"step": 3880
},
{
"epoch": 1.1954817888427847,
"grad_norm": 2.75,
"learning_rate": 8.048555623847573e-06,
"loss": 0.2382,
"step": 3890
},
{
"epoch": 1.1985554018749038,
"grad_norm": 2.46875,
"learning_rate": 8.01782421634911e-06,
"loss": 0.2762,
"step": 3900
},
{
"epoch": 1.2016290149070232,
"grad_norm": 2.0,
"learning_rate": 7.987092808850647e-06,
"loss": 0.2571,
"step": 3910
},
{
"epoch": 1.2047026279391424,
"grad_norm": 2.734375,
"learning_rate": 7.956361401352183e-06,
"loss": 0.314,
"step": 3920
},
{
"epoch": 1.2077762409712618,
"grad_norm": 2.3125,
"learning_rate": 7.92562999385372e-06,
"loss": 0.2412,
"step": 3930
},
{
"epoch": 1.210849854003381,
"grad_norm": 2.515625,
"learning_rate": 7.894898586355255e-06,
"loss": 0.2633,
"step": 3940
},
{
"epoch": 1.2139234670355001,
"grad_norm": 2.75,
"learning_rate": 7.864167178856792e-06,
"loss": 0.2563,
"step": 3950
},
{
"epoch": 1.2169970800676195,
"grad_norm": 2.125,
"learning_rate": 7.833435771358328e-06,
"loss": 0.2917,
"step": 3960
},
{
"epoch": 1.2200706930997387,
"grad_norm": 2.453125,
"learning_rate": 7.802704363859865e-06,
"loss": 0.2385,
"step": 3970
},
{
"epoch": 1.223144306131858,
"grad_norm": 1.984375,
"learning_rate": 7.771972956361402e-06,
"loss": 0.265,
"step": 3980
},
{
"epoch": 1.2262179191639773,
"grad_norm": 2.75,
"learning_rate": 7.741241548862939e-06,
"loss": 0.2519,
"step": 3990
},
{
"epoch": 1.2292915321960964,
"grad_norm": 2.0625,
"learning_rate": 7.710510141364475e-06,
"loss": 0.2523,
"step": 4000
},
{
"epoch": 1.2323651452282158,
"grad_norm": 3.03125,
"learning_rate": 7.679778733866012e-06,
"loss": 0.2485,
"step": 4010
},
{
"epoch": 1.235438758260335,
"grad_norm": 2.484375,
"learning_rate": 7.649047326367549e-06,
"loss": 0.2591,
"step": 4020
},
{
"epoch": 1.2385123712924542,
"grad_norm": 2.265625,
"learning_rate": 7.6183159188690845e-06,
"loss": 0.2765,
"step": 4030
},
{
"epoch": 1.2415859843245736,
"grad_norm": 2.921875,
"learning_rate": 7.587584511370621e-06,
"loss": 0.2541,
"step": 4040
},
{
"epoch": 1.2446595973566927,
"grad_norm": 2.484375,
"learning_rate": 7.556853103872158e-06,
"loss": 0.2941,
"step": 4050
},
{
"epoch": 1.2477332103888121,
"grad_norm": 2.265625,
"learning_rate": 7.526121696373695e-06,
"loss": 0.2644,
"step": 4060
},
{
"epoch": 1.2508068234209313,
"grad_norm": 1.859375,
"learning_rate": 7.4953902888752304e-06,
"loss": 0.2743,
"step": 4070
},
{
"epoch": 1.2538804364530507,
"grad_norm": 2.359375,
"learning_rate": 7.464658881376767e-06,
"loss": 0.2657,
"step": 4080
},
{
"epoch": 1.2569540494851699,
"grad_norm": 2.578125,
"learning_rate": 7.433927473878304e-06,
"loss": 0.2711,
"step": 4090
},
{
"epoch": 1.260027662517289,
"grad_norm": 2.828125,
"learning_rate": 7.4031960663798405e-06,
"loss": 0.2613,
"step": 4100
},
{
"epoch": 1.2631012755494084,
"grad_norm": 2.65625,
"learning_rate": 7.372464658881377e-06,
"loss": 0.2861,
"step": 4110
},
{
"epoch": 1.2661748885815276,
"grad_norm": 2.125,
"learning_rate": 7.341733251382914e-06,
"loss": 0.2608,
"step": 4120
},
{
"epoch": 1.2692485016136468,
"grad_norm": 2.90625,
"learning_rate": 7.311001843884451e-06,
"loss": 0.268,
"step": 4130
},
{
"epoch": 1.2723221146457662,
"grad_norm": 2.78125,
"learning_rate": 7.2802704363859865e-06,
"loss": 0.2485,
"step": 4140
},
{
"epoch": 1.2753957276778853,
"grad_norm": 1.7890625,
"learning_rate": 7.249539028887523e-06,
"loss": 0.2528,
"step": 4150
},
{
"epoch": 1.2784693407100045,
"grad_norm": 2.15625,
"learning_rate": 7.21880762138906e-06,
"loss": 0.2659,
"step": 4160
},
{
"epoch": 1.281542953742124,
"grad_norm": 2.359375,
"learning_rate": 7.1880762138905965e-06,
"loss": 0.2727,
"step": 4170
},
{
"epoch": 1.284616566774243,
"grad_norm": 2.21875,
"learning_rate": 7.157344806392133e-06,
"loss": 0.277,
"step": 4180
},
{
"epoch": 1.2876901798063622,
"grad_norm": 3.0,
"learning_rate": 7.12661339889367e-06,
"loss": 0.2812,
"step": 4190
},
{
"epoch": 1.2907637928384816,
"grad_norm": 2.34375,
"learning_rate": 7.095881991395206e-06,
"loss": 0.2952,
"step": 4200
},
{
"epoch": 1.2938374058706008,
"grad_norm": 2.28125,
"learning_rate": 7.0651505838967425e-06,
"loss": 0.2447,
"step": 4210
},
{
"epoch": 1.2969110189027202,
"grad_norm": 2.78125,
"learning_rate": 7.034419176398279e-06,
"loss": 0.2678,
"step": 4220
},
{
"epoch": 1.2999846319348394,
"grad_norm": 1.8359375,
"learning_rate": 7.003687768899816e-06,
"loss": 0.2617,
"step": 4230
},
{
"epoch": 1.3030582449669588,
"grad_norm": 2.140625,
"learning_rate": 6.9729563614013526e-06,
"loss": 0.2657,
"step": 4240
},
{
"epoch": 1.306131857999078,
"grad_norm": 2.65625,
"learning_rate": 6.942224953902889e-06,
"loss": 0.28,
"step": 4250
},
{
"epoch": 1.309205471031197,
"grad_norm": 2.5,
"learning_rate": 6.911493546404427e-06,
"loss": 0.2695,
"step": 4260
},
{
"epoch": 1.3122790840633165,
"grad_norm": 2.515625,
"learning_rate": 6.880762138905962e-06,
"loss": 0.2686,
"step": 4270
},
{
"epoch": 1.3153526970954357,
"grad_norm": 2.3125,
"learning_rate": 6.8500307314074985e-06,
"loss": 0.2472,
"step": 4280
},
{
"epoch": 1.3184263101275548,
"grad_norm": 3.21875,
"learning_rate": 6.819299323909035e-06,
"loss": 0.2498,
"step": 4290
},
{
"epoch": 1.3214999231596742,
"grad_norm": 3.03125,
"learning_rate": 6.788567916410572e-06,
"loss": 0.2758,
"step": 4300
},
{
"epoch": 1.3245735361917934,
"grad_norm": 2.484375,
"learning_rate": 6.7578365089121086e-06,
"loss": 0.2772,
"step": 4310
},
{
"epoch": 1.3276471492239126,
"grad_norm": 3.078125,
"learning_rate": 6.727105101413646e-06,
"loss": 0.2693,
"step": 4320
},
{
"epoch": 1.330720762256032,
"grad_norm": 2.140625,
"learning_rate": 6.696373693915181e-06,
"loss": 0.275,
"step": 4330
},
{
"epoch": 1.3337943752881511,
"grad_norm": 2.625,
"learning_rate": 6.665642286416718e-06,
"loss": 0.2852,
"step": 4340
},
{
"epoch": 1.3368679883202705,
"grad_norm": 2.59375,
"learning_rate": 6.6349108789182545e-06,
"loss": 0.2865,
"step": 4350
},
{
"epoch": 1.3399416013523897,
"grad_norm": 2.296875,
"learning_rate": 6.604179471419791e-06,
"loss": 0.2714,
"step": 4360
},
{
"epoch": 1.343015214384509,
"grad_norm": 2.515625,
"learning_rate": 6.573448063921328e-06,
"loss": 0.2551,
"step": 4370
},
{
"epoch": 1.3460888274166283,
"grad_norm": 1.984375,
"learning_rate": 6.5427166564228654e-06,
"loss": 0.255,
"step": 4380
},
{
"epoch": 1.3491624404487474,
"grad_norm": 2.328125,
"learning_rate": 6.511985248924402e-06,
"loss": 0.236,
"step": 4390
},
{
"epoch": 1.3522360534808668,
"grad_norm": 1.9453125,
"learning_rate": 6.481253841425937e-06,
"loss": 0.2599,
"step": 4400
},
{
"epoch": 1.355309666512986,
"grad_norm": 2.546875,
"learning_rate": 6.450522433927474e-06,
"loss": 0.2586,
"step": 4410
},
{
"epoch": 1.3583832795451052,
"grad_norm": 1.8515625,
"learning_rate": 6.4197910264290105e-06,
"loss": 0.2531,
"step": 4420
},
{
"epoch": 1.3614568925772246,
"grad_norm": 2.171875,
"learning_rate": 6.389059618930547e-06,
"loss": 0.2543,
"step": 4430
},
{
"epoch": 1.3645305056093437,
"grad_norm": 2.6875,
"learning_rate": 6.358328211432085e-06,
"loss": 0.2641,
"step": 4440
},
{
"epoch": 1.367604118641463,
"grad_norm": 2.328125,
"learning_rate": 6.3275968039336215e-06,
"loss": 0.2631,
"step": 4450
},
{
"epoch": 1.3706777316735823,
"grad_norm": 2.5625,
"learning_rate": 6.2968653964351565e-06,
"loss": 0.2782,
"step": 4460
},
{
"epoch": 1.3737513447057015,
"grad_norm": 2.765625,
"learning_rate": 6.266133988936693e-06,
"loss": 0.2614,
"step": 4470
},
{
"epoch": 1.3768249577378209,
"grad_norm": 1.984375,
"learning_rate": 6.23540258143823e-06,
"loss": 0.2633,
"step": 4480
},
{
"epoch": 1.37989857076994,
"grad_norm": 2.515625,
"learning_rate": 6.2046711739397665e-06,
"loss": 0.2587,
"step": 4490
},
{
"epoch": 1.3829721838020594,
"grad_norm": 2.015625,
"learning_rate": 6.173939766441303e-06,
"loss": 0.2503,
"step": 4500
},
{
"epoch": 1.3860457968341786,
"grad_norm": 2.0,
"learning_rate": 6.143208358942841e-06,
"loss": 0.2426,
"step": 4510
},
{
"epoch": 1.3891194098662978,
"grad_norm": 2.890625,
"learning_rate": 6.1124769514443775e-06,
"loss": 0.2734,
"step": 4520
},
{
"epoch": 1.3921930228984172,
"grad_norm": 2.484375,
"learning_rate": 6.0817455439459125e-06,
"loss": 0.2667,
"step": 4530
},
{
"epoch": 1.3952666359305363,
"grad_norm": 2.53125,
"learning_rate": 6.051014136447449e-06,
"loss": 0.2595,
"step": 4540
},
{
"epoch": 1.3983402489626555,
"grad_norm": 2.015625,
"learning_rate": 6.020282728948986e-06,
"loss": 0.2494,
"step": 4550
},
{
"epoch": 1.401413861994775,
"grad_norm": 2.453125,
"learning_rate": 5.9895513214505226e-06,
"loss": 0.207,
"step": 4560
},
{
"epoch": 1.404487475026894,
"grad_norm": 2.375,
"learning_rate": 5.95881991395206e-06,
"loss": 0.2547,
"step": 4570
},
{
"epoch": 1.4075610880590133,
"grad_norm": 2.421875,
"learning_rate": 5.928088506453597e-06,
"loss": 0.2823,
"step": 4580
},
{
"epoch": 1.4106347010911326,
"grad_norm": 2.4375,
"learning_rate": 5.897357098955132e-06,
"loss": 0.2654,
"step": 4590
},
{
"epoch": 1.4137083141232518,
"grad_norm": 2.359375,
"learning_rate": 5.8666256914566685e-06,
"loss": 0.2591,
"step": 4600
},
{
"epoch": 1.4167819271553712,
"grad_norm": 2.671875,
"learning_rate": 5.835894283958205e-06,
"loss": 0.2856,
"step": 4610
},
{
"epoch": 1.4198555401874904,
"grad_norm": 2.3125,
"learning_rate": 5.805162876459742e-06,
"loss": 0.2865,
"step": 4620
},
{
"epoch": 1.4229291532196098,
"grad_norm": 2.28125,
"learning_rate": 5.7744314689612794e-06,
"loss": 0.2241,
"step": 4630
},
{
"epoch": 1.426002766251729,
"grad_norm": 2.578125,
"learning_rate": 5.743700061462816e-06,
"loss": 0.2939,
"step": 4640
},
{
"epoch": 1.4290763792838481,
"grad_norm": 2.375,
"learning_rate": 5.712968653964353e-06,
"loss": 0.2837,
"step": 4650
},
{
"epoch": 1.4321499923159675,
"grad_norm": 1.796875,
"learning_rate": 5.682237246465888e-06,
"loss": 0.2509,
"step": 4660
},
{
"epoch": 1.4352236053480867,
"grad_norm": 2.296875,
"learning_rate": 5.6515058389674245e-06,
"loss": 0.2812,
"step": 4670
},
{
"epoch": 1.4382972183802059,
"grad_norm": 2.28125,
"learning_rate": 5.620774431468961e-06,
"loss": 0.2667,
"step": 4680
},
{
"epoch": 1.4413708314123252,
"grad_norm": 1.3125,
"learning_rate": 5.590043023970499e-06,
"loss": 0.2411,
"step": 4690
},
{
"epoch": 1.4444444444444444,
"grad_norm": 2.234375,
"learning_rate": 5.5593116164720354e-06,
"loss": 0.2485,
"step": 4700
},
{
"epoch": 1.4475180574765636,
"grad_norm": 2.609375,
"learning_rate": 5.528580208973572e-06,
"loss": 0.2606,
"step": 4710
},
{
"epoch": 1.450591670508683,
"grad_norm": 2.8125,
"learning_rate": 5.497848801475107e-06,
"loss": 0.2536,
"step": 4720
},
{
"epoch": 1.4536652835408022,
"grad_norm": 2.578125,
"learning_rate": 5.467117393976644e-06,
"loss": 0.2716,
"step": 4730
},
{
"epoch": 1.4567388965729215,
"grad_norm": 1.8671875,
"learning_rate": 5.4363859864781805e-06,
"loss": 0.2297,
"step": 4740
},
{
"epoch": 1.4598125096050407,
"grad_norm": 2.078125,
"learning_rate": 5.405654578979718e-06,
"loss": 0.2937,
"step": 4750
},
{
"epoch": 1.4628861226371601,
"grad_norm": 2.375,
"learning_rate": 5.374923171481255e-06,
"loss": 0.2927,
"step": 4760
},
{
"epoch": 1.4659597356692793,
"grad_norm": 2.421875,
"learning_rate": 5.3441917639827915e-06,
"loss": 0.2757,
"step": 4770
},
{
"epoch": 1.4690333487013985,
"grad_norm": 1.9765625,
"learning_rate": 5.313460356484328e-06,
"loss": 0.2751,
"step": 4780
},
{
"epoch": 1.4721069617335178,
"grad_norm": 2.6875,
"learning_rate": 5.282728948985863e-06,
"loss": 0.2831,
"step": 4790
},
{
"epoch": 1.475180574765637,
"grad_norm": 2.078125,
"learning_rate": 5.2519975414874e-06,
"loss": 0.2273,
"step": 4800
},
{
"epoch": 1.4782541877977562,
"grad_norm": 2.046875,
"learning_rate": 5.221266133988937e-06,
"loss": 0.265,
"step": 4810
},
{
"epoch": 1.4813278008298756,
"grad_norm": 2.515625,
"learning_rate": 5.190534726490474e-06,
"loss": 0.2605,
"step": 4820
},
{
"epoch": 1.4844014138619948,
"grad_norm": 2.25,
"learning_rate": 5.159803318992011e-06,
"loss": 0.2595,
"step": 4830
},
{
"epoch": 1.487475026894114,
"grad_norm": 2.25,
"learning_rate": 5.1290719114935475e-06,
"loss": 0.2491,
"step": 4840
},
{
"epoch": 1.4905486399262333,
"grad_norm": 1.9921875,
"learning_rate": 5.0983405039950825e-06,
"loss": 0.2739,
"step": 4850
},
{
"epoch": 1.4936222529583525,
"grad_norm": 2.71875,
"learning_rate": 5.067609096496619e-06,
"loss": 0.2642,
"step": 4860
},
{
"epoch": 1.4966958659904717,
"grad_norm": 2.59375,
"learning_rate": 5.036877688998157e-06,
"loss": 0.2876,
"step": 4870
},
{
"epoch": 1.499769479022591,
"grad_norm": 2.34375,
"learning_rate": 5.006146281499693e-06,
"loss": 0.2167,
"step": 4880
},
{
"epoch": 1.5028430920547104,
"grad_norm": 1.953125,
"learning_rate": 4.97541487400123e-06,
"loss": 0.2668,
"step": 4890
},
{
"epoch": 1.5059167050868296,
"grad_norm": 2.703125,
"learning_rate": 4.944683466502766e-06,
"loss": 0.274,
"step": 4900
},
{
"epoch": 1.5089903181189488,
"grad_norm": 2.125,
"learning_rate": 4.913952059004303e-06,
"loss": 0.2559,
"step": 4910
},
{
"epoch": 1.5120639311510682,
"grad_norm": 2.28125,
"learning_rate": 4.883220651505839e-06,
"loss": 0.253,
"step": 4920
},
{
"epoch": 1.5151375441831874,
"grad_norm": 2.484375,
"learning_rate": 4.852489244007376e-06,
"loss": 0.2538,
"step": 4930
},
{
"epoch": 1.5182111572153065,
"grad_norm": 2.75,
"learning_rate": 4.821757836508913e-06,
"loss": 0.2763,
"step": 4940
},
{
"epoch": 1.521284770247426,
"grad_norm": 2.203125,
"learning_rate": 4.7910264290104494e-06,
"loss": 0.2621,
"step": 4950
},
{
"epoch": 1.524358383279545,
"grad_norm": 2.21875,
"learning_rate": 4.760295021511986e-06,
"loss": 0.269,
"step": 4960
},
{
"epoch": 1.5274319963116643,
"grad_norm": 2.265625,
"learning_rate": 4.729563614013522e-06,
"loss": 0.2617,
"step": 4970
},
{
"epoch": 1.5305056093437837,
"grad_norm": 2.359375,
"learning_rate": 4.698832206515059e-06,
"loss": 0.2691,
"step": 4980
},
{
"epoch": 1.5335792223759028,
"grad_norm": 2.703125,
"learning_rate": 4.668100799016595e-06,
"loss": 0.2592,
"step": 4990
},
{
"epoch": 1.536652835408022,
"grad_norm": 2.890625,
"learning_rate": 4.637369391518132e-06,
"loss": 0.2441,
"step": 5000
},
{
"epoch": 1.5397264484401414,
"grad_norm": 2.140625,
"learning_rate": 4.606637984019669e-06,
"loss": 0.2212,
"step": 5010
},
{
"epoch": 1.5428000614722608,
"grad_norm": 2.75,
"learning_rate": 4.5759065765212054e-06,
"loss": 0.2572,
"step": 5020
},
{
"epoch": 1.5458736745043797,
"grad_norm": 2.5625,
"learning_rate": 4.545175169022741e-06,
"loss": 0.2554,
"step": 5030
},
{
"epoch": 1.5489472875364991,
"grad_norm": 2.75,
"learning_rate": 4.514443761524278e-06,
"loss": 0.2633,
"step": 5040
},
{
"epoch": 1.5520209005686185,
"grad_norm": 3.5625,
"learning_rate": 4.483712354025815e-06,
"loss": 0.2433,
"step": 5050
},
{
"epoch": 1.5550945136007377,
"grad_norm": 2.796875,
"learning_rate": 4.452980946527351e-06,
"loss": 0.284,
"step": 5060
},
{
"epoch": 1.5581681266328569,
"grad_norm": 3.53125,
"learning_rate": 4.422249539028888e-06,
"loss": 0.2655,
"step": 5070
},
{
"epoch": 1.5612417396649763,
"grad_norm": 2.171875,
"learning_rate": 4.391518131530425e-06,
"loss": 0.2448,
"step": 5080
},
{
"epoch": 1.5643153526970954,
"grad_norm": 2.65625,
"learning_rate": 4.3607867240319615e-06,
"loss": 0.2536,
"step": 5090
},
{
"epoch": 1.5673889657292146,
"grad_norm": 2.3125,
"learning_rate": 4.330055316533497e-06,
"loss": 0.2619,
"step": 5100
},
{
"epoch": 1.570462578761334,
"grad_norm": 2.265625,
"learning_rate": 4.299323909035034e-06,
"loss": 0.2762,
"step": 5110
},
{
"epoch": 1.5735361917934532,
"grad_norm": 2.296875,
"learning_rate": 4.268592501536571e-06,
"loss": 0.2555,
"step": 5120
},
{
"epoch": 1.5766098048255723,
"grad_norm": 2.046875,
"learning_rate": 4.237861094038107e-06,
"loss": 0.2457,
"step": 5130
},
{
"epoch": 1.5796834178576917,
"grad_norm": 2.578125,
"learning_rate": 4.207129686539644e-06,
"loss": 0.27,
"step": 5140
},
{
"epoch": 1.5827570308898111,
"grad_norm": 2.1875,
"learning_rate": 4.176398279041181e-06,
"loss": 0.2525,
"step": 5150
},
{
"epoch": 1.58583064392193,
"grad_norm": 2.390625,
"learning_rate": 4.145666871542717e-06,
"loss": 0.2788,
"step": 5160
},
{
"epoch": 1.5889042569540495,
"grad_norm": 2.46875,
"learning_rate": 4.114935464044253e-06,
"loss": 0.2732,
"step": 5170
},
{
"epoch": 1.5919778699861689,
"grad_norm": 2.359375,
"learning_rate": 4.08420405654579e-06,
"loss": 0.2415,
"step": 5180
},
{
"epoch": 1.595051483018288,
"grad_norm": 2.5625,
"learning_rate": 4.053472649047327e-06,
"loss": 0.2461,
"step": 5190
},
{
"epoch": 1.5981250960504072,
"grad_norm": 2.46875,
"learning_rate": 4.022741241548863e-06,
"loss": 0.2554,
"step": 5200
},
{
"epoch": 1.6011987090825266,
"grad_norm": 2.5,
"learning_rate": 3.9920098340504e-06,
"loss": 0.2824,
"step": 5210
},
{
"epoch": 1.6042723221146458,
"grad_norm": 2.328125,
"learning_rate": 3.961278426551937e-06,
"loss": 0.2289,
"step": 5220
},
{
"epoch": 1.607345935146765,
"grad_norm": 2.453125,
"learning_rate": 3.930547019053473e-06,
"loss": 0.2515,
"step": 5230
},
{
"epoch": 1.6104195481788843,
"grad_norm": 2.703125,
"learning_rate": 3.899815611555009e-06,
"loss": 0.2636,
"step": 5240
},
{
"epoch": 1.6134931612110035,
"grad_norm": 2.375,
"learning_rate": 3.869084204056546e-06,
"loss": 0.2656,
"step": 5250
},
{
"epoch": 1.6165667742431227,
"grad_norm": 2.4375,
"learning_rate": 3.838352796558083e-06,
"loss": 0.2758,
"step": 5260
},
{
"epoch": 1.619640387275242,
"grad_norm": 2.375,
"learning_rate": 3.807621389059619e-06,
"loss": 0.2656,
"step": 5270
},
{
"epoch": 1.6227140003073615,
"grad_norm": 2.453125,
"learning_rate": 3.776889981561156e-06,
"loss": 0.2808,
"step": 5280
},
{
"epoch": 1.6257876133394804,
"grad_norm": 2.140625,
"learning_rate": 3.746158574062692e-06,
"loss": 0.2827,
"step": 5290
},
{
"epoch": 1.6288612263715998,
"grad_norm": 2.265625,
"learning_rate": 3.7154271665642287e-06,
"loss": 0.2532,
"step": 5300
},
{
"epoch": 1.6319348394037192,
"grad_norm": 2.90625,
"learning_rate": 3.6846957590657658e-06,
"loss": 0.2599,
"step": 5310
},
{
"epoch": 1.6350084524358384,
"grad_norm": 2.625,
"learning_rate": 3.6539643515673025e-06,
"loss": 0.2759,
"step": 5320
},
{
"epoch": 1.6380820654679575,
"grad_norm": 2.640625,
"learning_rate": 3.6232329440688383e-06,
"loss": 0.262,
"step": 5330
},
{
"epoch": 1.641155678500077,
"grad_norm": 2.40625,
"learning_rate": 3.5925015365703754e-06,
"loss": 0.2804,
"step": 5340
},
{
"epoch": 1.644229291532196,
"grad_norm": 2.15625,
"learning_rate": 3.561770129071912e-06,
"loss": 0.2631,
"step": 5350
},
{
"epoch": 1.6473029045643153,
"grad_norm": 2.28125,
"learning_rate": 3.531038721573448e-06,
"loss": 0.2603,
"step": 5360
},
{
"epoch": 1.6503765175964347,
"grad_norm": 1.9921875,
"learning_rate": 3.500307314074985e-06,
"loss": 0.234,
"step": 5370
},
{
"epoch": 1.6534501306285538,
"grad_norm": 1.859375,
"learning_rate": 3.469575906576522e-06,
"loss": 0.2589,
"step": 5380
},
{
"epoch": 1.656523743660673,
"grad_norm": 2.71875,
"learning_rate": 3.4388444990780576e-06,
"loss": 0.2643,
"step": 5390
},
{
"epoch": 1.6595973566927924,
"grad_norm": 2.015625,
"learning_rate": 3.4081130915795948e-06,
"loss": 0.252,
"step": 5400
},
{
"epoch": 1.6626709697249118,
"grad_norm": 2.34375,
"learning_rate": 3.3773816840811315e-06,
"loss": 0.2721,
"step": 5410
},
{
"epoch": 1.6657445827570307,
"grad_norm": 2.515625,
"learning_rate": 3.3466502765826673e-06,
"loss": 0.2749,
"step": 5420
},
{
"epoch": 1.6688181957891501,
"grad_norm": 2.59375,
"learning_rate": 3.3159188690842044e-06,
"loss": 0.2675,
"step": 5430
},
{
"epoch": 1.6718918088212695,
"grad_norm": 2.765625,
"learning_rate": 3.285187461585741e-06,
"loss": 0.2348,
"step": 5440
},
{
"epoch": 1.6749654218533887,
"grad_norm": 2.25,
"learning_rate": 3.254456054087278e-06,
"loss": 0.2552,
"step": 5450
},
{
"epoch": 1.6780390348855079,
"grad_norm": 2.140625,
"learning_rate": 3.223724646588814e-06,
"loss": 0.306,
"step": 5460
},
{
"epoch": 1.6811126479176273,
"grad_norm": 2.328125,
"learning_rate": 3.1929932390903508e-06,
"loss": 0.2652,
"step": 5470
},
{
"epoch": 1.6841862609497464,
"grad_norm": 2.15625,
"learning_rate": 3.1622618315918875e-06,
"loss": 0.2572,
"step": 5480
},
{
"epoch": 1.6872598739818656,
"grad_norm": 2.125,
"learning_rate": 3.1315304240934238e-06,
"loss": 0.2807,
"step": 5490
},
{
"epoch": 1.690333487013985,
"grad_norm": 2.578125,
"learning_rate": 3.1007990165949604e-06,
"loss": 0.2612,
"step": 5500
},
{
"epoch": 1.6934071000461042,
"grad_norm": 2.234375,
"learning_rate": 3.070067609096497e-06,
"loss": 0.2599,
"step": 5510
},
{
"epoch": 1.6964807130782233,
"grad_norm": 2.375,
"learning_rate": 3.0393362015980334e-06,
"loss": 0.2675,
"step": 5520
},
{
"epoch": 1.6995543261103427,
"grad_norm": 1.9140625,
"learning_rate": 3.00860479409957e-06,
"loss": 0.2352,
"step": 5530
},
{
"epoch": 1.702627939142462,
"grad_norm": 1.875,
"learning_rate": 2.977873386601107e-06,
"loss": 0.262,
"step": 5540
},
{
"epoch": 1.705701552174581,
"grad_norm": 2.53125,
"learning_rate": 2.947141979102643e-06,
"loss": 0.2622,
"step": 5550
},
{
"epoch": 1.7087751652067005,
"grad_norm": 2.796875,
"learning_rate": 2.9164105716041798e-06,
"loss": 0.2577,
"step": 5560
},
{
"epoch": 1.7118487782388199,
"grad_norm": 2.25,
"learning_rate": 2.8856791641057165e-06,
"loss": 0.2688,
"step": 5570
},
{
"epoch": 1.714922391270939,
"grad_norm": 3.0,
"learning_rate": 2.854947756607253e-06,
"loss": 0.2794,
"step": 5580
},
{
"epoch": 1.7179960043030582,
"grad_norm": 2.328125,
"learning_rate": 2.8242163491087894e-06,
"loss": 0.2451,
"step": 5590
},
{
"epoch": 1.7210696173351776,
"grad_norm": 2.46875,
"learning_rate": 2.793484941610326e-06,
"loss": 0.2478,
"step": 5600
},
{
"epoch": 1.7241432303672968,
"grad_norm": 2.25,
"learning_rate": 2.762753534111863e-06,
"loss": 0.2533,
"step": 5610
},
{
"epoch": 1.727216843399416,
"grad_norm": 2.359375,
"learning_rate": 2.732022126613399e-06,
"loss": 0.2803,
"step": 5620
},
{
"epoch": 1.7302904564315353,
"grad_norm": 2.4375,
"learning_rate": 2.7012907191149358e-06,
"loss": 0.236,
"step": 5630
},
{
"epoch": 1.7333640694636545,
"grad_norm": 2.765625,
"learning_rate": 2.6705593116164725e-06,
"loss": 0.2735,
"step": 5640
},
{
"epoch": 1.7364376824957737,
"grad_norm": 2.328125,
"learning_rate": 2.6398279041180088e-06,
"loss": 0.2486,
"step": 5650
},
{
"epoch": 1.739511295527893,
"grad_norm": 2.46875,
"learning_rate": 2.6090964966195454e-06,
"loss": 0.2673,
"step": 5660
},
{
"epoch": 1.7425849085600122,
"grad_norm": 2.171875,
"learning_rate": 2.578365089121082e-06,
"loss": 0.2445,
"step": 5670
},
{
"epoch": 1.7456585215921314,
"grad_norm": 2.828125,
"learning_rate": 2.5476336816226184e-06,
"loss": 0.2702,
"step": 5680
},
{
"epoch": 1.7487321346242508,
"grad_norm": 2.3125,
"learning_rate": 2.516902274124155e-06,
"loss": 0.2502,
"step": 5690
},
{
"epoch": 1.7518057476563702,
"grad_norm": 2.296875,
"learning_rate": 2.486170866625692e-06,
"loss": 0.2615,
"step": 5700
},
{
"epoch": 1.7548793606884892,
"grad_norm": 1.8828125,
"learning_rate": 2.455439459127228e-06,
"loss": 0.2696,
"step": 5710
},
{
"epoch": 1.7579529737206085,
"grad_norm": 2.3125,
"learning_rate": 2.4247080516287648e-06,
"loss": 0.2489,
"step": 5720
},
{
"epoch": 1.761026586752728,
"grad_norm": 2.84375,
"learning_rate": 2.3939766441303015e-06,
"loss": 0.2448,
"step": 5730
},
{
"epoch": 1.7641001997848471,
"grad_norm": 2.875,
"learning_rate": 2.3632452366318377e-06,
"loss": 0.2512,
"step": 5740
},
{
"epoch": 1.7671738128169663,
"grad_norm": 2.125,
"learning_rate": 2.3325138291333744e-06,
"loss": 0.2481,
"step": 5750
},
{
"epoch": 1.7702474258490857,
"grad_norm": 2.90625,
"learning_rate": 2.301782421634911e-06,
"loss": 0.2504,
"step": 5760
},
{
"epoch": 1.7733210388812048,
"grad_norm": 2.515625,
"learning_rate": 2.2710510141364474e-06,
"loss": 0.2581,
"step": 5770
},
{
"epoch": 1.776394651913324,
"grad_norm": 2.40625,
"learning_rate": 2.240319606637984e-06,
"loss": 0.2627,
"step": 5780
},
{
"epoch": 1.7794682649454434,
"grad_norm": 2.40625,
"learning_rate": 2.2095881991395208e-06,
"loss": 0.25,
"step": 5790
},
{
"epoch": 1.7825418779775626,
"grad_norm": 2.390625,
"learning_rate": 2.1788567916410575e-06,
"loss": 0.2664,
"step": 5800
},
{
"epoch": 1.7856154910096818,
"grad_norm": 2.671875,
"learning_rate": 2.1481253841425938e-06,
"loss": 0.2627,
"step": 5810
},
{
"epoch": 1.7886891040418011,
"grad_norm": 2.359375,
"learning_rate": 2.1173939766441304e-06,
"loss": 0.2092,
"step": 5820
},
{
"epoch": 1.7917627170739205,
"grad_norm": 2.390625,
"learning_rate": 2.086662569145667e-06,
"loss": 0.2603,
"step": 5830
},
{
"epoch": 1.7948363301060395,
"grad_norm": 1.8984375,
"learning_rate": 2.0559311616472034e-06,
"loss": 0.2244,
"step": 5840
},
{
"epoch": 1.7979099431381589,
"grad_norm": 2.078125,
"learning_rate": 2.02519975414874e-06,
"loss": 0.2542,
"step": 5850
},
{
"epoch": 1.8009835561702783,
"grad_norm": 1.953125,
"learning_rate": 1.994468346650277e-06,
"loss": 0.2478,
"step": 5860
},
{
"epoch": 1.8040571692023974,
"grad_norm": 2.515625,
"learning_rate": 1.963736939151813e-06,
"loss": 0.2253,
"step": 5870
},
{
"epoch": 1.8071307822345166,
"grad_norm": 1.9296875,
"learning_rate": 1.9330055316533498e-06,
"loss": 0.2727,
"step": 5880
},
{
"epoch": 1.810204395266636,
"grad_norm": 2.8125,
"learning_rate": 1.9022741241548865e-06,
"loss": 0.273,
"step": 5890
},
{
"epoch": 1.8132780082987552,
"grad_norm": 1.96875,
"learning_rate": 1.871542716656423e-06,
"loss": 0.2747,
"step": 5900
},
{
"epoch": 1.8163516213308744,
"grad_norm": 2.09375,
"learning_rate": 1.8408113091579596e-06,
"loss": 0.2624,
"step": 5910
},
{
"epoch": 1.8194252343629937,
"grad_norm": 2.71875,
"learning_rate": 1.8100799016594961e-06,
"loss": 0.2392,
"step": 5920
},
{
"epoch": 1.822498847395113,
"grad_norm": 2.375,
"learning_rate": 1.7793484941610328e-06,
"loss": 0.2444,
"step": 5930
},
{
"epoch": 1.825572460427232,
"grad_norm": 2.484375,
"learning_rate": 1.7486170866625693e-06,
"loss": 0.2719,
"step": 5940
},
{
"epoch": 1.8286460734593515,
"grad_norm": 2.671875,
"learning_rate": 1.7178856791641058e-06,
"loss": 0.2604,
"step": 5950
},
{
"epoch": 1.8317196864914709,
"grad_norm": 2.46875,
"learning_rate": 1.6871542716656425e-06,
"loss": 0.2628,
"step": 5960
},
{
"epoch": 1.8347932995235898,
"grad_norm": 2.140625,
"learning_rate": 1.656422864167179e-06,
"loss": 0.2842,
"step": 5970
},
{
"epoch": 1.8378669125557092,
"grad_norm": 2.859375,
"learning_rate": 1.6256914566687157e-06,
"loss": 0.2726,
"step": 5980
},
{
"epoch": 1.8409405255878286,
"grad_norm": 2.21875,
"learning_rate": 1.5949600491702521e-06,
"loss": 0.2798,
"step": 5990
},
{
"epoch": 1.8440141386199478,
"grad_norm": 2.5,
"learning_rate": 1.5642286416717886e-06,
"loss": 0.2865,
"step": 6000
},
{
"epoch": 1.847087751652067,
"grad_norm": 2.71875,
"learning_rate": 1.5334972341733253e-06,
"loss": 0.2594,
"step": 6010
},
{
"epoch": 1.8501613646841863,
"grad_norm": 2.703125,
"learning_rate": 1.5027658266748618e-06,
"loss": 0.2585,
"step": 6020
},
{
"epoch": 1.8532349777163055,
"grad_norm": 2.8125,
"learning_rate": 1.4720344191763983e-06,
"loss": 0.2758,
"step": 6030
},
{
"epoch": 1.8563085907484247,
"grad_norm": 2.34375,
"learning_rate": 1.441303011677935e-06,
"loss": 0.2722,
"step": 6040
},
{
"epoch": 1.859382203780544,
"grad_norm": 2.1875,
"learning_rate": 1.4105716041794715e-06,
"loss": 0.2812,
"step": 6050
},
{
"epoch": 1.8624558168126633,
"grad_norm": 2.796875,
"learning_rate": 1.3798401966810082e-06,
"loss": 0.2263,
"step": 6060
},
{
"epoch": 1.8655294298447824,
"grad_norm": 2.21875,
"learning_rate": 1.3491087891825446e-06,
"loss": 0.2759,
"step": 6070
},
{
"epoch": 1.8686030428769018,
"grad_norm": 2.59375,
"learning_rate": 1.3183773816840811e-06,
"loss": 0.2584,
"step": 6080
},
{
"epoch": 1.8716766559090212,
"grad_norm": 2.484375,
"learning_rate": 1.2876459741856178e-06,
"loss": 0.2728,
"step": 6090
},
{
"epoch": 1.8747502689411402,
"grad_norm": 2.296875,
"learning_rate": 1.2569145666871543e-06,
"loss": 0.2466,
"step": 6100
},
{
"epoch": 1.8778238819732596,
"grad_norm": 2.6875,
"learning_rate": 1.226183159188691e-06,
"loss": 0.2428,
"step": 6110
},
{
"epoch": 1.880897495005379,
"grad_norm": 2.90625,
"learning_rate": 1.1954517516902275e-06,
"loss": 0.2383,
"step": 6120
},
{
"epoch": 1.8839711080374981,
"grad_norm": 2.515625,
"learning_rate": 1.1647203441917642e-06,
"loss": 0.2902,
"step": 6130
},
{
"epoch": 1.8870447210696173,
"grad_norm": 2.796875,
"learning_rate": 1.1339889366933007e-06,
"loss": 0.2328,
"step": 6140
},
{
"epoch": 1.8901183341017367,
"grad_norm": 2.203125,
"learning_rate": 1.1032575291948371e-06,
"loss": 0.2504,
"step": 6150
},
{
"epoch": 1.8931919471338559,
"grad_norm": 2.25,
"learning_rate": 1.0725261216963738e-06,
"loss": 0.2605,
"step": 6160
},
{
"epoch": 1.896265560165975,
"grad_norm": 2.625,
"learning_rate": 1.0417947141979103e-06,
"loss": 0.2615,
"step": 6170
},
{
"epoch": 1.8993391731980944,
"grad_norm": 3.03125,
"learning_rate": 1.0110633066994468e-06,
"loss": 0.2613,
"step": 6180
},
{
"epoch": 1.9024127862302136,
"grad_norm": 2.828125,
"learning_rate": 9.803318992009835e-07,
"loss": 0.2761,
"step": 6190
},
{
"epoch": 1.9054863992623328,
"grad_norm": 2.078125,
"learning_rate": 9.496004917025201e-07,
"loss": 0.2589,
"step": 6200
},
{
"epoch": 1.9085600122944522,
"grad_norm": 2.671875,
"learning_rate": 9.188690842040567e-07,
"loss": 0.2762,
"step": 6210
},
{
"epoch": 1.9116336253265716,
"grad_norm": 2.40625,
"learning_rate": 8.881376767055933e-07,
"loss": 0.2326,
"step": 6220
},
{
"epoch": 1.9147072383586905,
"grad_norm": 2.484375,
"learning_rate": 8.574062692071297e-07,
"loss": 0.2404,
"step": 6230
},
{
"epoch": 1.91778085139081,
"grad_norm": 2.609375,
"learning_rate": 8.266748617086663e-07,
"loss": 0.2708,
"step": 6240
},
{
"epoch": 1.9208544644229293,
"grad_norm": 1.796875,
"learning_rate": 7.959434542102029e-07,
"loss": 0.2478,
"step": 6250
},
{
"epoch": 1.9239280774550485,
"grad_norm": 2.078125,
"learning_rate": 7.652120467117395e-07,
"loss": 0.2615,
"step": 6260
},
{
"epoch": 1.9270016904871676,
"grad_norm": 2.640625,
"learning_rate": 7.34480639213276e-07,
"loss": 0.2484,
"step": 6270
},
{
"epoch": 1.930075303519287,
"grad_norm": 2.296875,
"learning_rate": 7.037492317148126e-07,
"loss": 0.2866,
"step": 6280
},
{
"epoch": 1.9331489165514062,
"grad_norm": 2.328125,
"learning_rate": 6.730178242163492e-07,
"loss": 0.2632,
"step": 6290
},
{
"epoch": 1.9362225295835254,
"grad_norm": 2.8125,
"learning_rate": 6.422864167178858e-07,
"loss": 0.2576,
"step": 6300
},
{
"epoch": 1.9392961426156448,
"grad_norm": 2.125,
"learning_rate": 6.115550092194224e-07,
"loss": 0.2428,
"step": 6310
},
{
"epoch": 1.942369755647764,
"grad_norm": 2.1875,
"learning_rate": 5.808236017209588e-07,
"loss": 0.2435,
"step": 6320
},
{
"epoch": 1.945443368679883,
"grad_norm": 1.8515625,
"learning_rate": 5.500921942224954e-07,
"loss": 0.2537,
"step": 6330
},
{
"epoch": 1.9485169817120025,
"grad_norm": 2.09375,
"learning_rate": 5.19360786724032e-07,
"loss": 0.2701,
"step": 6340
},
{
"epoch": 1.9515905947441217,
"grad_norm": 2.515625,
"learning_rate": 4.886293792255686e-07,
"loss": 0.1977,
"step": 6350
},
{
"epoch": 1.9546642077762408,
"grad_norm": 2.78125,
"learning_rate": 4.5789797172710514e-07,
"loss": 0.2635,
"step": 6360
},
{
"epoch": 1.9577378208083602,
"grad_norm": 2.265625,
"learning_rate": 4.271665642286417e-07,
"loss": 0.2385,
"step": 6370
},
{
"epoch": 1.9608114338404796,
"grad_norm": 2.546875,
"learning_rate": 3.964351567301783e-07,
"loss": 0.2657,
"step": 6380
},
{
"epoch": 1.9638850468725988,
"grad_norm": 2.234375,
"learning_rate": 3.657037492317148e-07,
"loss": 0.2301,
"step": 6390
},
{
"epoch": 1.966958659904718,
"grad_norm": 2.328125,
"learning_rate": 3.3497234173325144e-07,
"loss": 0.2379,
"step": 6400
},
{
"epoch": 1.9700322729368374,
"grad_norm": 2.203125,
"learning_rate": 3.04240934234788e-07,
"loss": 0.2598,
"step": 6410
},
{
"epoch": 1.9731058859689565,
"grad_norm": 2.546875,
"learning_rate": 2.7350952673632457e-07,
"loss": 0.253,
"step": 6420
},
{
"epoch": 1.9761794990010757,
"grad_norm": 2.375,
"learning_rate": 2.427781192378611e-07,
"loss": 0.2614,
"step": 6430
},
{
"epoch": 1.979253112033195,
"grad_norm": 1.8984375,
"learning_rate": 2.120467117393977e-07,
"loss": 0.2474,
"step": 6440
},
{
"epoch": 1.9823267250653143,
"grad_norm": 1.8203125,
"learning_rate": 1.8131530424093426e-07,
"loss": 0.2825,
"step": 6450
},
{
"epoch": 1.9854003380974334,
"grad_norm": 1.9921875,
"learning_rate": 1.5058389674247082e-07,
"loss": 0.2705,
"step": 6460
},
{
"epoch": 1.9884739511295528,
"grad_norm": 1.8203125,
"learning_rate": 1.1985248924400738e-07,
"loss": 0.2716,
"step": 6470
},
{
"epoch": 1.991547564161672,
"grad_norm": 2.25,
"learning_rate": 8.912108174554396e-08,
"loss": 0.2911,
"step": 6480
},
{
"epoch": 1.9946211771937912,
"grad_norm": 2.40625,
"learning_rate": 5.838967424708052e-08,
"loss": 0.2304,
"step": 6490
},
{
"epoch": 1.9976947902259106,
"grad_norm": 2.15625,
"learning_rate": 2.7658266748617086e-08,
"loss": 0.2298,
"step": 6500
}
],
"logging_steps": 10,
"max_steps": 6508,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.600196660472316e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}