{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 5871,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005109862033725089,
"grad_norm": 0.33659857760959316,
"learning_rate": 5e-06,
"loss": 1.2266,
"step": 10
},
{
"epoch": 0.010219724067450179,
"grad_norm": 0.27906848020280245,
"learning_rate": 5e-06,
"loss": 1.1927,
"step": 20
},
{
"epoch": 0.015329586101175269,
"grad_norm": 0.17780612512031585,
"learning_rate": 5e-06,
"loss": 1.1325,
"step": 30
},
{
"epoch": 0.020439448134900357,
"grad_norm": 0.1258477993946921,
"learning_rate": 5e-06,
"loss": 1.0755,
"step": 40
},
{
"epoch": 0.025549310168625446,
"grad_norm": 0.10850896052962487,
"learning_rate": 5e-06,
"loss": 1.0559,
"step": 50
},
{
"epoch": 0.030659172202350538,
"grad_norm": 0.1008214475411604,
"learning_rate": 5e-06,
"loss": 1.0381,
"step": 60
},
{
"epoch": 0.03576903423607562,
"grad_norm": 0.09230130516181131,
"learning_rate": 5e-06,
"loss": 1.0359,
"step": 70
},
{
"epoch": 0.040878896269800714,
"grad_norm": 0.12761405349230925,
"learning_rate": 5e-06,
"loss": 1.02,
"step": 80
},
{
"epoch": 0.045988758303525806,
"grad_norm": 0.09657363359659779,
"learning_rate": 5e-06,
"loss": 1.0108,
"step": 90
},
{
"epoch": 0.05109862033725089,
"grad_norm": 0.07985711883606804,
"learning_rate": 5e-06,
"loss": 1.0112,
"step": 100
},
{
"epoch": 0.05620848237097598,
"grad_norm": 0.3841577988890222,
"learning_rate": 5e-06,
"loss": 1.004,
"step": 110
},
{
"epoch": 0.061318344404701075,
"grad_norm": 0.07880791404523275,
"learning_rate": 5e-06,
"loss": 1.0005,
"step": 120
},
{
"epoch": 0.06642820643842616,
"grad_norm": 0.07684646209763941,
"learning_rate": 5e-06,
"loss": 0.9969,
"step": 130
},
{
"epoch": 0.07153806847215124,
"grad_norm": 0.07364942700770905,
"learning_rate": 5e-06,
"loss": 0.9817,
"step": 140
},
{
"epoch": 0.07664793050587634,
"grad_norm": 0.07307840824030326,
"learning_rate": 5e-06,
"loss": 1.0017,
"step": 150
},
{
"epoch": 0.08175779253960143,
"grad_norm": 0.07899180708739008,
"learning_rate": 5e-06,
"loss": 1.0064,
"step": 160
},
{
"epoch": 0.08686765457332651,
"grad_norm": 0.13990750744492506,
"learning_rate": 5e-06,
"loss": 0.9664,
"step": 170
},
{
"epoch": 0.09197751660705161,
"grad_norm": 0.07623610966974216,
"learning_rate": 5e-06,
"loss": 0.993,
"step": 180
},
{
"epoch": 0.0970873786407767,
"grad_norm": 0.07177184337925457,
"learning_rate": 5e-06,
"loss": 0.9802,
"step": 190
},
{
"epoch": 0.10219724067450178,
"grad_norm": 0.07599595168859198,
"learning_rate": 5e-06,
"loss": 0.9821,
"step": 200
},
{
"epoch": 0.10730710270822688,
"grad_norm": 0.08156619926999734,
"learning_rate": 5e-06,
"loss": 0.9848,
"step": 210
},
{
"epoch": 0.11241696474195197,
"grad_norm": 0.08647559353210667,
"learning_rate": 5e-06,
"loss": 0.9669,
"step": 220
},
{
"epoch": 0.11752682677567705,
"grad_norm": 0.09175368496172068,
"learning_rate": 5e-06,
"loss": 0.9768,
"step": 230
},
{
"epoch": 0.12263668880940215,
"grad_norm": 0.07679212882863759,
"learning_rate": 5e-06,
"loss": 0.9785,
"step": 240
},
{
"epoch": 0.12774655084312725,
"grad_norm": 0.0876157967492165,
"learning_rate": 5e-06,
"loss": 0.97,
"step": 250
},
{
"epoch": 0.13285641287685232,
"grad_norm": 0.07438658820336003,
"learning_rate": 5e-06,
"loss": 0.9642,
"step": 260
},
{
"epoch": 0.13796627491057742,
"grad_norm": 0.11327800060102156,
"learning_rate": 5e-06,
"loss": 0.9566,
"step": 270
},
{
"epoch": 0.1430761369443025,
"grad_norm": 0.07300739672589697,
"learning_rate": 5e-06,
"loss": 0.9641,
"step": 280
},
{
"epoch": 0.1481859989780276,
"grad_norm": 0.0706224800129807,
"learning_rate": 5e-06,
"loss": 0.9608,
"step": 290
},
{
"epoch": 0.1532958610117527,
"grad_norm": 0.10268101288716767,
"learning_rate": 5e-06,
"loss": 0.9594,
"step": 300
},
{
"epoch": 0.15840572304547776,
"grad_norm": 0.10115126588406155,
"learning_rate": 5e-06,
"loss": 0.9497,
"step": 310
},
{
"epoch": 0.16351558507920286,
"grad_norm": 0.07906545142322015,
"learning_rate": 5e-06,
"loss": 0.9708,
"step": 320
},
{
"epoch": 0.16862544711292796,
"grad_norm": 0.08205445152944524,
"learning_rate": 5e-06,
"loss": 0.9578,
"step": 330
},
{
"epoch": 0.17373530914665303,
"grad_norm": 0.08422630569633463,
"learning_rate": 5e-06,
"loss": 0.9596,
"step": 340
},
{
"epoch": 0.17884517118037813,
"grad_norm": 0.08374924134366432,
"learning_rate": 5e-06,
"loss": 0.9538,
"step": 350
},
{
"epoch": 0.18395503321410323,
"grad_norm": 0.07381594898486545,
"learning_rate": 5e-06,
"loss": 0.9632,
"step": 360
},
{
"epoch": 0.1890648952478283,
"grad_norm": 0.0742024858441321,
"learning_rate": 5e-06,
"loss": 0.9574,
"step": 370
},
{
"epoch": 0.1941747572815534,
"grad_norm": 0.07635133337670513,
"learning_rate": 5e-06,
"loss": 0.9642,
"step": 380
},
{
"epoch": 0.1992846193152785,
"grad_norm": 0.07547026097910627,
"learning_rate": 5e-06,
"loss": 0.9559,
"step": 390
},
{
"epoch": 0.20439448134900357,
"grad_norm": 0.08098346259777026,
"learning_rate": 5e-06,
"loss": 0.9608,
"step": 400
},
{
"epoch": 0.20950434338272866,
"grad_norm": 0.0740110860457063,
"learning_rate": 5e-06,
"loss": 0.9626,
"step": 410
},
{
"epoch": 0.21461420541645376,
"grad_norm": 0.06838098261628725,
"learning_rate": 5e-06,
"loss": 0.956,
"step": 420
},
{
"epoch": 0.21972406745017883,
"grad_norm": 0.07627481328859255,
"learning_rate": 5e-06,
"loss": 0.947,
"step": 430
},
{
"epoch": 0.22483392948390393,
"grad_norm": 0.07121602448305268,
"learning_rate": 5e-06,
"loss": 0.936,
"step": 440
},
{
"epoch": 0.22994379151762903,
"grad_norm": 0.07130096221687535,
"learning_rate": 5e-06,
"loss": 0.9489,
"step": 450
},
{
"epoch": 0.2350536535513541,
"grad_norm": 0.0705427638202678,
"learning_rate": 5e-06,
"loss": 0.9488,
"step": 460
},
{
"epoch": 0.2401635155850792,
"grad_norm": 0.07263405103965026,
"learning_rate": 5e-06,
"loss": 0.9577,
"step": 470
},
{
"epoch": 0.2452733776188043,
"grad_norm": 0.11012974887960399,
"learning_rate": 5e-06,
"loss": 0.943,
"step": 480
},
{
"epoch": 0.2503832396525294,
"grad_norm": 0.10355809585077033,
"learning_rate": 5e-06,
"loss": 0.939,
"step": 490
},
{
"epoch": 0.2554931016862545,
"grad_norm": 0.0793194619120587,
"learning_rate": 5e-06,
"loss": 0.9461,
"step": 500
},
{
"epoch": 0.26060296371997954,
"grad_norm": 0.07305857929904291,
"learning_rate": 5e-06,
"loss": 0.94,
"step": 510
},
{
"epoch": 0.26571282575370464,
"grad_norm": 0.07524998598879123,
"learning_rate": 5e-06,
"loss": 0.9346,
"step": 520
},
{
"epoch": 0.27082268778742974,
"grad_norm": 0.07947372908760092,
"learning_rate": 5e-06,
"loss": 0.9469,
"step": 530
},
{
"epoch": 0.27593254982115484,
"grad_norm": 0.1241601221912902,
"learning_rate": 5e-06,
"loss": 0.9439,
"step": 540
},
{
"epoch": 0.28104241185487994,
"grad_norm": 0.09428433932371161,
"learning_rate": 5e-06,
"loss": 0.9364,
"step": 550
},
{
"epoch": 0.286152273888605,
"grad_norm": 0.07643085121646655,
"learning_rate": 5e-06,
"loss": 0.9362,
"step": 560
},
{
"epoch": 0.2912621359223301,
"grad_norm": 0.07053914919464828,
"learning_rate": 5e-06,
"loss": 0.9434,
"step": 570
},
{
"epoch": 0.2963719979560552,
"grad_norm": 0.07770599024749027,
"learning_rate": 5e-06,
"loss": 0.9339,
"step": 580
},
{
"epoch": 0.3014818599897803,
"grad_norm": 0.07178293782557839,
"learning_rate": 5e-06,
"loss": 0.9375,
"step": 590
},
{
"epoch": 0.3065917220235054,
"grad_norm": 0.08584366089279562,
"learning_rate": 5e-06,
"loss": 0.9371,
"step": 600
},
{
"epoch": 0.3117015840572305,
"grad_norm": 0.07944827441074408,
"learning_rate": 5e-06,
"loss": 0.9455,
"step": 610
},
{
"epoch": 0.3168114460909555,
"grad_norm": 0.07201506395499371,
"learning_rate": 5e-06,
"loss": 0.9468,
"step": 620
},
{
"epoch": 0.3219213081246806,
"grad_norm": 0.06802889338099032,
"learning_rate": 5e-06,
"loss": 0.9216,
"step": 630
},
{
"epoch": 0.3270311701584057,
"grad_norm": 0.07330902217526364,
"learning_rate": 5e-06,
"loss": 0.9236,
"step": 640
},
{
"epoch": 0.3321410321921308,
"grad_norm": 0.07252233851918731,
"learning_rate": 5e-06,
"loss": 0.9389,
"step": 650
},
{
"epoch": 0.3372508942258559,
"grad_norm": 0.07496139269293961,
"learning_rate": 5e-06,
"loss": 0.9237,
"step": 660
},
{
"epoch": 0.342360756259581,
"grad_norm": 0.07750656954616246,
"learning_rate": 5e-06,
"loss": 0.9369,
"step": 670
},
{
"epoch": 0.34747061829330605,
"grad_norm": 0.07099579035430438,
"learning_rate": 5e-06,
"loss": 0.9305,
"step": 680
},
{
"epoch": 0.35258048032703115,
"grad_norm": 0.07147696465885814,
"learning_rate": 5e-06,
"loss": 0.9278,
"step": 690
},
{
"epoch": 0.35769034236075625,
"grad_norm": 0.06679094313753112,
"learning_rate": 5e-06,
"loss": 0.9209,
"step": 700
},
{
"epoch": 0.36280020439448135,
"grad_norm": 0.0741205857711921,
"learning_rate": 5e-06,
"loss": 0.9239,
"step": 710
},
{
"epoch": 0.36791006642820645,
"grad_norm": 0.07314565717664778,
"learning_rate": 5e-06,
"loss": 0.9228,
"step": 720
},
{
"epoch": 0.37301992846193155,
"grad_norm": 0.06970796428185207,
"learning_rate": 5e-06,
"loss": 0.9261,
"step": 730
},
{
"epoch": 0.3781297904956566,
"grad_norm": 0.08781289255732734,
"learning_rate": 5e-06,
"loss": 0.9184,
"step": 740
},
{
"epoch": 0.3832396525293817,
"grad_norm": 0.0733298775244439,
"learning_rate": 5e-06,
"loss": 0.9239,
"step": 750
},
{
"epoch": 0.3883495145631068,
"grad_norm": 0.07059494784250205,
"learning_rate": 5e-06,
"loss": 0.9256,
"step": 760
},
{
"epoch": 0.3934593765968319,
"grad_norm": 0.07435764722394751,
"learning_rate": 5e-06,
"loss": 0.9317,
"step": 770
},
{
"epoch": 0.398569238630557,
"grad_norm": 0.08580109544056579,
"learning_rate": 5e-06,
"loss": 0.915,
"step": 780
},
{
"epoch": 0.4036791006642821,
"grad_norm": 0.13857712945248865,
"learning_rate": 5e-06,
"loss": 0.9266,
"step": 790
},
{
"epoch": 0.40878896269800713,
"grad_norm": 0.09252960975773743,
"learning_rate": 5e-06,
"loss": 0.9292,
"step": 800
},
{
"epoch": 0.41389882473173223,
"grad_norm": 0.0786939477268516,
"learning_rate": 5e-06,
"loss": 0.9229,
"step": 810
},
{
"epoch": 0.4190086867654573,
"grad_norm": 0.07870146545651402,
"learning_rate": 5e-06,
"loss": 0.9222,
"step": 820
},
{
"epoch": 0.4241185487991824,
"grad_norm": 0.22636518086425345,
"learning_rate": 5e-06,
"loss": 0.9198,
"step": 830
},
{
"epoch": 0.4292284108329075,
"grad_norm": 0.07019267445206644,
"learning_rate": 5e-06,
"loss": 0.9175,
"step": 840
},
{
"epoch": 0.4343382728666326,
"grad_norm": 0.1807350892295952,
"learning_rate": 5e-06,
"loss": 0.9196,
"step": 850
},
{
"epoch": 0.43944813490035767,
"grad_norm": 0.0725849350335069,
"learning_rate": 5e-06,
"loss": 0.9125,
"step": 860
},
{
"epoch": 0.44455799693408277,
"grad_norm": 0.07593721579585741,
"learning_rate": 5e-06,
"loss": 0.9281,
"step": 870
},
{
"epoch": 0.44966785896780787,
"grad_norm": 0.07560334097743046,
"learning_rate": 5e-06,
"loss": 0.9135,
"step": 880
},
{
"epoch": 0.45477772100153296,
"grad_norm": 0.06971504265848284,
"learning_rate": 5e-06,
"loss": 0.9237,
"step": 890
},
{
"epoch": 0.45988758303525806,
"grad_norm": 0.07156759501755959,
"learning_rate": 5e-06,
"loss": 0.9138,
"step": 900
},
{
"epoch": 0.46499744506898316,
"grad_norm": 0.07895160546618792,
"learning_rate": 5e-06,
"loss": 0.921,
"step": 910
},
{
"epoch": 0.4701073071027082,
"grad_norm": 0.07451564044792575,
"learning_rate": 5e-06,
"loss": 0.9199,
"step": 920
},
{
"epoch": 0.4752171691364333,
"grad_norm": 0.10378845787672986,
"learning_rate": 5e-06,
"loss": 0.9265,
"step": 930
},
{
"epoch": 0.4803270311701584,
"grad_norm": 0.0667143142951708,
"learning_rate": 5e-06,
"loss": 0.9177,
"step": 940
},
{
"epoch": 0.4854368932038835,
"grad_norm": 0.1158465430965177,
"learning_rate": 5e-06,
"loss": 0.9226,
"step": 950
},
{
"epoch": 0.4905467552376086,
"grad_norm": 0.07122640133664004,
"learning_rate": 5e-06,
"loss": 0.9264,
"step": 960
},
{
"epoch": 0.4956566172713337,
"grad_norm": 0.07221992051194193,
"learning_rate": 5e-06,
"loss": 0.9131,
"step": 970
},
{
"epoch": 0.5007664793050588,
"grad_norm": 0.07924509176000209,
"learning_rate": 5e-06,
"loss": 0.9125,
"step": 980
},
{
"epoch": 0.5058763413387839,
"grad_norm": 0.07672190213186483,
"learning_rate": 5e-06,
"loss": 0.9122,
"step": 990
},
{
"epoch": 0.510986203372509,
"grad_norm": 0.07692062751941903,
"learning_rate": 5e-06,
"loss": 0.929,
"step": 1000
},
{
"epoch": 0.516096065406234,
"grad_norm": 0.11741801773911116,
"learning_rate": 5e-06,
"loss": 0.9159,
"step": 1010
},
{
"epoch": 0.5212059274399591,
"grad_norm": 0.07370462072026876,
"learning_rate": 5e-06,
"loss": 0.9297,
"step": 1020
},
{
"epoch": 0.5263157894736842,
"grad_norm": 0.07324740223664235,
"learning_rate": 5e-06,
"loss": 0.9102,
"step": 1030
},
{
"epoch": 0.5314256515074093,
"grad_norm": 0.06719441090965804,
"learning_rate": 5e-06,
"loss": 0.9095,
"step": 1040
},
{
"epoch": 0.5365355135411344,
"grad_norm": 0.0752101579557485,
"learning_rate": 5e-06,
"loss": 0.9118,
"step": 1050
},
{
"epoch": 0.5416453755748595,
"grad_norm": 0.07316438131338852,
"learning_rate": 5e-06,
"loss": 0.909,
"step": 1060
},
{
"epoch": 0.5467552376085846,
"grad_norm": 0.07217762056098373,
"learning_rate": 5e-06,
"loss": 0.9045,
"step": 1070
},
{
"epoch": 0.5518650996423097,
"grad_norm": 0.06882269438361814,
"learning_rate": 5e-06,
"loss": 0.9241,
"step": 1080
},
{
"epoch": 0.5569749616760348,
"grad_norm": 0.18147585225890778,
"learning_rate": 5e-06,
"loss": 0.9167,
"step": 1090
},
{
"epoch": 0.5620848237097599,
"grad_norm": 0.07945137709096421,
"learning_rate": 5e-06,
"loss": 0.9058,
"step": 1100
},
{
"epoch": 0.567194685743485,
"grad_norm": 0.06985941949554184,
"learning_rate": 5e-06,
"loss": 0.9159,
"step": 1110
},
{
"epoch": 0.57230454777721,
"grad_norm": 0.07626155133966736,
"learning_rate": 5e-06,
"loss": 0.9152,
"step": 1120
},
{
"epoch": 0.5774144098109351,
"grad_norm": 0.07367359682427968,
"learning_rate": 5e-06,
"loss": 0.9033,
"step": 1130
},
{
"epoch": 0.5825242718446602,
"grad_norm": 0.07246294215791606,
"learning_rate": 5e-06,
"loss": 0.9056,
"step": 1140
},
{
"epoch": 0.5876341338783853,
"grad_norm": 0.06636935667746374,
"learning_rate": 5e-06,
"loss": 0.9096,
"step": 1150
},
{
"epoch": 0.5927439959121104,
"grad_norm": 0.06914204310614962,
"learning_rate": 5e-06,
"loss": 0.9124,
"step": 1160
},
{
"epoch": 0.5978538579458355,
"grad_norm": 0.06622472549827609,
"learning_rate": 5e-06,
"loss": 0.9042,
"step": 1170
},
{
"epoch": 0.6029637199795606,
"grad_norm": 0.07526422643221314,
"learning_rate": 5e-06,
"loss": 0.9099,
"step": 1180
},
{
"epoch": 0.6080735820132857,
"grad_norm": 0.07865312982736179,
"learning_rate": 5e-06,
"loss": 0.9121,
"step": 1190
},
{
"epoch": 0.6131834440470108,
"grad_norm": 0.08100630197435015,
"learning_rate": 5e-06,
"loss": 0.9108,
"step": 1200
},
{
"epoch": 0.6182933060807358,
"grad_norm": 0.14439251719042823,
"learning_rate": 5e-06,
"loss": 0.9085,
"step": 1210
},
{
"epoch": 0.623403168114461,
"grad_norm": 0.07351842090072493,
"learning_rate": 5e-06,
"loss": 0.9102,
"step": 1220
},
{
"epoch": 0.628513030148186,
"grad_norm": 0.0710785086017881,
"learning_rate": 5e-06,
"loss": 0.9137,
"step": 1230
},
{
"epoch": 0.633622892181911,
"grad_norm": 0.07184247120200028,
"learning_rate": 5e-06,
"loss": 0.8997,
"step": 1240
},
{
"epoch": 0.6387327542156361,
"grad_norm": 0.07131702245604768,
"learning_rate": 5e-06,
"loss": 0.9087,
"step": 1250
},
{
"epoch": 0.6438426162493612,
"grad_norm": 0.0730402054534026,
"learning_rate": 5e-06,
"loss": 0.894,
"step": 1260
},
{
"epoch": 0.6489524782830863,
"grad_norm": 0.07322424351281255,
"learning_rate": 5e-06,
"loss": 0.9127,
"step": 1270
},
{
"epoch": 0.6540623403168114,
"grad_norm": 0.09436026180688377,
"learning_rate": 5e-06,
"loss": 0.919,
"step": 1280
},
{
"epoch": 0.6591722023505365,
"grad_norm": 0.07076855510548616,
"learning_rate": 5e-06,
"loss": 0.9044,
"step": 1290
},
{
"epoch": 0.6642820643842616,
"grad_norm": 0.07409183519590924,
"learning_rate": 5e-06,
"loss": 0.8925,
"step": 1300
},
{
"epoch": 0.6693919264179867,
"grad_norm": 0.3253569122085339,
"learning_rate": 5e-06,
"loss": 0.897,
"step": 1310
},
{
"epoch": 0.6745017884517118,
"grad_norm": 0.07002482080488048,
"learning_rate": 5e-06,
"loss": 0.8905,
"step": 1320
},
{
"epoch": 0.6796116504854369,
"grad_norm": 0.8542494508434101,
"learning_rate": 5e-06,
"loss": 0.9052,
"step": 1330
},
{
"epoch": 0.684721512519162,
"grad_norm": 0.09406257309495393,
"learning_rate": 5e-06,
"loss": 0.91,
"step": 1340
},
{
"epoch": 0.6898313745528871,
"grad_norm": 0.08661096377263379,
"learning_rate": 5e-06,
"loss": 0.9096,
"step": 1350
},
{
"epoch": 0.6949412365866121,
"grad_norm": 0.07544412418690473,
"learning_rate": 5e-06,
"loss": 0.9039,
"step": 1360
},
{
"epoch": 0.7000510986203372,
"grad_norm": 0.07803646465670126,
"learning_rate": 5e-06,
"loss": 0.8892,
"step": 1370
},
{
"epoch": 0.7051609606540623,
"grad_norm": 0.07088553834184458,
"learning_rate": 5e-06,
"loss": 0.8975,
"step": 1380
},
{
"epoch": 0.7102708226877874,
"grad_norm": 0.08931444692520309,
"learning_rate": 5e-06,
"loss": 0.882,
"step": 1390
},
{
"epoch": 0.7153806847215125,
"grad_norm": 0.07261368946056458,
"learning_rate": 5e-06,
"loss": 0.903,
"step": 1400
},
{
"epoch": 0.7204905467552376,
"grad_norm": 0.07364955770557434,
"learning_rate": 5e-06,
"loss": 0.9019,
"step": 1410
},
{
"epoch": 0.7256004087889627,
"grad_norm": 0.07232725672310655,
"learning_rate": 5e-06,
"loss": 0.9047,
"step": 1420
},
{
"epoch": 0.7307102708226878,
"grad_norm": 0.07284538520098455,
"learning_rate": 5e-06,
"loss": 0.8969,
"step": 1430
},
{
"epoch": 0.7358201328564129,
"grad_norm": 0.06804520794145767,
"learning_rate": 5e-06,
"loss": 0.8983,
"step": 1440
},
{
"epoch": 0.740929994890138,
"grad_norm": 0.07080981200832792,
"learning_rate": 5e-06,
"loss": 0.9034,
"step": 1450
},
{
"epoch": 0.7460398569238631,
"grad_norm": 0.07015236628919412,
"learning_rate": 5e-06,
"loss": 0.9033,
"step": 1460
},
{
"epoch": 0.7511497189575882,
"grad_norm": 0.10418032275009997,
"learning_rate": 5e-06,
"loss": 0.9008,
"step": 1470
},
{
"epoch": 0.7562595809913132,
"grad_norm": 0.07009703060030868,
"learning_rate": 5e-06,
"loss": 0.9006,
"step": 1480
},
{
"epoch": 0.7613694430250383,
"grad_norm": 0.07561374961635098,
"learning_rate": 5e-06,
"loss": 0.8992,
"step": 1490
},
{
"epoch": 0.7664793050587634,
"grad_norm": 0.07543087417873817,
"learning_rate": 5e-06,
"loss": 0.9022,
"step": 1500
},
{
"epoch": 0.7715891670924885,
"grad_norm": 0.07303222610905763,
"learning_rate": 5e-06,
"loss": 0.8876,
"step": 1510
},
{
"epoch": 0.7766990291262136,
"grad_norm": 0.12947977214304454,
"learning_rate": 5e-06,
"loss": 0.9017,
"step": 1520
},
{
"epoch": 0.7818088911599387,
"grad_norm": 0.07389484625510247,
"learning_rate": 5e-06,
"loss": 0.8972,
"step": 1530
},
{
"epoch": 0.7869187531936638,
"grad_norm": 0.07577895984597245,
"learning_rate": 5e-06,
"loss": 0.9029,
"step": 1540
},
{
"epoch": 0.7920286152273889,
"grad_norm": 0.07179725920965228,
"learning_rate": 5e-06,
"loss": 0.8973,
"step": 1550
},
{
"epoch": 0.797138477261114,
"grad_norm": 0.09446753813662215,
"learning_rate": 5e-06,
"loss": 0.8884,
"step": 1560
},
{
"epoch": 0.8022483392948391,
"grad_norm": 0.0694040937128912,
"learning_rate": 5e-06,
"loss": 0.8858,
"step": 1570
},
{
"epoch": 0.8073582013285642,
"grad_norm": 0.07119010100651514,
"learning_rate": 5e-06,
"loss": 0.891,
"step": 1580
},
{
"epoch": 0.8124680633622893,
"grad_norm": 0.06814136288038194,
"learning_rate": 5e-06,
"loss": 0.8882,
"step": 1590
},
{
"epoch": 0.8175779253960143,
"grad_norm": 0.14101206207855727,
"learning_rate": 5e-06,
"loss": 0.8901,
"step": 1600
},
{
"epoch": 0.8226877874297394,
"grad_norm": 0.06823593991844153,
"learning_rate": 5e-06,
"loss": 0.8969,
"step": 1610
},
{
"epoch": 0.8277976494634645,
"grad_norm": 0.0781844807225488,
"learning_rate": 5e-06,
"loss": 0.9092,
"step": 1620
},
{
"epoch": 0.8329075114971896,
"grad_norm": 0.07770931010296782,
"learning_rate": 5e-06,
"loss": 0.9099,
"step": 1630
},
{
"epoch": 0.8380173735309147,
"grad_norm": 0.07037044497495822,
"learning_rate": 5e-06,
"loss": 0.8955,
"step": 1640
},
{
"epoch": 0.8431272355646398,
"grad_norm": 0.0765067693249436,
"learning_rate": 5e-06,
"loss": 0.8839,
"step": 1650
},
{
"epoch": 0.8482370975983649,
"grad_norm": 0.09158943006728,
"learning_rate": 5e-06,
"loss": 0.9055,
"step": 1660
},
{
"epoch": 0.85334695963209,
"grad_norm": 0.07388296884455732,
"learning_rate": 5e-06,
"loss": 0.8829,
"step": 1670
},
{
"epoch": 0.858456821665815,
"grad_norm": 0.07364367251534173,
"learning_rate": 5e-06,
"loss": 0.8906,
"step": 1680
},
{
"epoch": 0.8635666836995401,
"grad_norm": 0.10472917075697648,
"learning_rate": 5e-06,
"loss": 0.8998,
"step": 1690
},
{
"epoch": 0.8686765457332652,
"grad_norm": 0.07209129300287077,
"learning_rate": 5e-06,
"loss": 0.9079,
"step": 1700
},
{
"epoch": 0.8737864077669902,
"grad_norm": 0.06988678080238575,
"learning_rate": 5e-06,
"loss": 0.903,
"step": 1710
},
{
"epoch": 0.8788962698007153,
"grad_norm": 0.07515978821834793,
"learning_rate": 5e-06,
"loss": 0.8904,
"step": 1720
},
{
"epoch": 0.8840061318344404,
"grad_norm": 0.07433094849872902,
"learning_rate": 5e-06,
"loss": 0.889,
"step": 1730
},
{
"epoch": 0.8891159938681655,
"grad_norm": 0.08433306671757082,
"learning_rate": 5e-06,
"loss": 0.8986,
"step": 1740
},
{
"epoch": 0.8942258559018906,
"grad_norm": 0.07396884075939439,
"learning_rate": 5e-06,
"loss": 0.8874,
"step": 1750
},
{
"epoch": 0.8993357179356157,
"grad_norm": 0.07751456977281067,
"learning_rate": 5e-06,
"loss": 0.8992,
"step": 1760
},
{
"epoch": 0.9044455799693408,
"grad_norm": 0.0789527788405623,
"learning_rate": 5e-06,
"loss": 0.8969,
"step": 1770
},
{
"epoch": 0.9095554420030659,
"grad_norm": 0.07602904790252976,
"learning_rate": 5e-06,
"loss": 0.8901,
"step": 1780
},
{
"epoch": 0.914665304036791,
"grad_norm": 0.06982248060839691,
"learning_rate": 5e-06,
"loss": 0.8968,
"step": 1790
},
{
"epoch": 0.9197751660705161,
"grad_norm": 0.07169359819101034,
"learning_rate": 5e-06,
"loss": 0.9087,
"step": 1800
},
{
"epoch": 0.9248850281042412,
"grad_norm": 0.07321342917964954,
"learning_rate": 5e-06,
"loss": 0.8969,
"step": 1810
},
{
"epoch": 0.9299948901379663,
"grad_norm": 0.07444618890892986,
"learning_rate": 5e-06,
"loss": 0.9045,
"step": 1820
},
{
"epoch": 0.9351047521716913,
"grad_norm": 0.11365069738490574,
"learning_rate": 5e-06,
"loss": 0.8909,
"step": 1830
},
{
"epoch": 0.9402146142054164,
"grad_norm": 0.380588282512115,
"learning_rate": 5e-06,
"loss": 0.8803,
"step": 1840
},
{
"epoch": 0.9453244762391415,
"grad_norm": 0.07262951461490495,
"learning_rate": 5e-06,
"loss": 0.8718,
"step": 1850
},
{
"epoch": 0.9504343382728666,
"grad_norm": 0.06829017808656876,
"learning_rate": 5e-06,
"loss": 0.8822,
"step": 1860
},
{
"epoch": 0.9555442003065917,
"grad_norm": 0.07878675404968587,
"learning_rate": 5e-06,
"loss": 0.878,
"step": 1870
},
{
"epoch": 0.9606540623403168,
"grad_norm": 0.0760531098736545,
"learning_rate": 5e-06,
"loss": 0.8808,
"step": 1880
},
{
"epoch": 0.9657639243740419,
"grad_norm": 0.12494712249352007,
"learning_rate": 5e-06,
"loss": 0.8829,
"step": 1890
},
{
"epoch": 0.970873786407767,
"grad_norm": 0.07219974763591076,
"learning_rate": 5e-06,
"loss": 0.8875,
"step": 1900
},
{
"epoch": 0.9759836484414921,
"grad_norm": 0.0678705623185834,
"learning_rate": 5e-06,
"loss": 0.9012,
"step": 1910
},
{
"epoch": 0.9810935104752172,
"grad_norm": 0.07156172069177699,
"learning_rate": 5e-06,
"loss": 0.898,
"step": 1920
},
{
"epoch": 0.9862033725089423,
"grad_norm": 0.0732677504535335,
"learning_rate": 5e-06,
"loss": 0.8962,
"step": 1930
},
{
"epoch": 0.9913132345426674,
"grad_norm": 0.07202912622478508,
"learning_rate": 5e-06,
"loss": 0.8951,
"step": 1940
},
{
"epoch": 0.9964230965763924,
"grad_norm": 0.10969747960973296,
"learning_rate": 5e-06,
"loss": 0.9097,
"step": 1950
},
{
"epoch": 1.0015329586101176,
"grad_norm": 0.06912267564528896,
"learning_rate": 5e-06,
"loss": 0.8839,
"step": 1960
},
{
"epoch": 1.0066428206438427,
"grad_norm": 0.07686255349334188,
"learning_rate": 5e-06,
"loss": 0.8895,
"step": 1970
},
{
"epoch": 1.0117526826775678,
"grad_norm": 0.07239494301457147,
"learning_rate": 5e-06,
"loss": 0.8802,
"step": 1980
},
{
"epoch": 1.016862544711293,
"grad_norm": 0.06896146275536877,
"learning_rate": 5e-06,
"loss": 0.8743,
"step": 1990
},
{
"epoch": 1.021972406745018,
"grad_norm": 0.06973937218730168,
"learning_rate": 5e-06,
"loss": 0.8858,
"step": 2000
},
{
"epoch": 1.0270822687787429,
"grad_norm": 0.07590849152549478,
"learning_rate": 5e-06,
"loss": 0.8702,
"step": 2010
},
{
"epoch": 1.032192130812468,
"grad_norm": 0.07727922335579492,
"learning_rate": 5e-06,
"loss": 0.8802,
"step": 2020
},
{
"epoch": 1.037301992846193,
"grad_norm": 0.07317945118774036,
"learning_rate": 5e-06,
"loss": 0.8764,
"step": 2030
},
{
"epoch": 1.0424118548799182,
"grad_norm": 0.0790764962189524,
"learning_rate": 5e-06,
"loss": 0.8675,
"step": 2040
},
{
"epoch": 1.0475217169136433,
"grad_norm": 0.14927111441309496,
"learning_rate": 5e-06,
"loss": 0.8795,
"step": 2050
},
{
"epoch": 1.0526315789473684,
"grad_norm": 0.09144412400190963,
"learning_rate": 5e-06,
"loss": 0.8787,
"step": 2060
},
{
"epoch": 1.0577414409810935,
"grad_norm": 0.07045354181303777,
"learning_rate": 5e-06,
"loss": 0.8797,
"step": 2070
},
{
"epoch": 1.0628513030148186,
"grad_norm": 0.07891724899455275,
"learning_rate": 5e-06,
"loss": 0.8905,
"step": 2080
},
{
"epoch": 1.0679611650485437,
"grad_norm": 0.06772214066866822,
"learning_rate": 5e-06,
"loss": 0.8872,
"step": 2090
},
{
"epoch": 1.0730710270822688,
"grad_norm": 0.0712589563903319,
"learning_rate": 5e-06,
"loss": 0.8855,
"step": 2100
},
{
"epoch": 1.0781808891159939,
"grad_norm": 0.06862698369349315,
"learning_rate": 5e-06,
"loss": 0.881,
"step": 2110
},
{
"epoch": 1.083290751149719,
"grad_norm": 0.0736606231100836,
"learning_rate": 5e-06,
"loss": 0.8674,
"step": 2120
},
{
"epoch": 1.088400613183444,
"grad_norm": 0.07106355054734692,
"learning_rate": 5e-06,
"loss": 0.8946,
"step": 2130
},
{
"epoch": 1.0935104752171692,
"grad_norm": 0.07255234123815231,
"learning_rate": 5e-06,
"loss": 0.882,
"step": 2140
},
{
"epoch": 1.0986203372508943,
"grad_norm": 0.0732742082692386,
"learning_rate": 5e-06,
"loss": 0.8676,
"step": 2150
},
{
"epoch": 1.1037301992846194,
"grad_norm": 0.06964362410343532,
"learning_rate": 5e-06,
"loss": 0.8774,
"step": 2160
},
{
"epoch": 1.1088400613183444,
"grad_norm": 0.06779365708748122,
"learning_rate": 5e-06,
"loss": 0.8746,
"step": 2170
},
{
"epoch": 1.1139499233520695,
"grad_norm": 0.1233845297748753,
"learning_rate": 5e-06,
"loss": 0.8765,
"step": 2180
},
{
"epoch": 1.1190597853857946,
"grad_norm": 0.08906300490745893,
"learning_rate": 5e-06,
"loss": 0.8887,
"step": 2190
},
{
"epoch": 1.1241696474195197,
"grad_norm": 0.09632061303323498,
"learning_rate": 5e-06,
"loss": 0.873,
"step": 2200
},
{
"epoch": 1.1292795094532448,
"grad_norm": 0.08521280891691754,
"learning_rate": 5e-06,
"loss": 0.8898,
"step": 2210
},
{
"epoch": 1.13438937148697,
"grad_norm": 0.08360918869150777,
"learning_rate": 5e-06,
"loss": 0.8744,
"step": 2220
},
{
"epoch": 1.139499233520695,
"grad_norm": 0.07235955845669943,
"learning_rate": 5e-06,
"loss": 0.871,
"step": 2230
},
{
"epoch": 1.14460909555442,
"grad_norm": 0.0730315588992174,
"learning_rate": 5e-06,
"loss": 0.8821,
"step": 2240
},
{
"epoch": 1.1497189575881452,
"grad_norm": 0.11078438063259888,
"learning_rate": 5e-06,
"loss": 0.8769,
"step": 2250
},
{
"epoch": 1.1548288196218701,
"grad_norm": 0.07056038440126992,
"learning_rate": 5e-06,
"loss": 0.8794,
"step": 2260
},
{
"epoch": 1.1599386816555952,
"grad_norm": 0.0708667161304354,
"learning_rate": 5e-06,
"loss": 0.8816,
"step": 2270
},
{
"epoch": 1.1650485436893203,
"grad_norm": 0.07609323526554022,
"learning_rate": 5e-06,
"loss": 0.8819,
"step": 2280
},
{
"epoch": 1.1701584057230454,
"grad_norm": 0.0735028145209297,
"learning_rate": 5e-06,
"loss": 0.8703,
"step": 2290
},
{
"epoch": 1.1752682677567705,
"grad_norm": 0.07069493332631555,
"learning_rate": 5e-06,
"loss": 0.8847,
"step": 2300
},
{
"epoch": 1.1803781297904956,
"grad_norm": 0.072283465693465,
"learning_rate": 5e-06,
"loss": 0.8821,
"step": 2310
},
{
"epoch": 1.1854879918242207,
"grad_norm": 0.0682393777040698,
"learning_rate": 5e-06,
"loss": 0.8783,
"step": 2320
},
{
"epoch": 1.1905978538579458,
"grad_norm": 0.07033706862496379,
"learning_rate": 5e-06,
"loss": 0.8803,
"step": 2330
},
{
"epoch": 1.195707715891671,
"grad_norm": 0.07616585487021542,
"learning_rate": 5e-06,
"loss": 0.8797,
"step": 2340
},
{
"epoch": 1.200817577925396,
"grad_norm": 0.09257368897028359,
"learning_rate": 5e-06,
"loss": 0.8848,
"step": 2350
},
{
"epoch": 1.205927439959121,
"grad_norm": 0.07218601192609732,
"learning_rate": 5e-06,
"loss": 0.88,
"step": 2360
},
{
"epoch": 1.2110373019928462,
"grad_norm": 0.06958174210341682,
"learning_rate": 5e-06,
"loss": 0.8736,
"step": 2370
},
{
"epoch": 1.2161471640265713,
"grad_norm": 0.07752486671959553,
"learning_rate": 5e-06,
"loss": 0.8872,
"step": 2380
},
{
"epoch": 1.2212570260602964,
"grad_norm": 0.06925884038134095,
"learning_rate": 5e-06,
"loss": 0.8723,
"step": 2390
},
{
"epoch": 1.2263668880940215,
"grad_norm": 0.08334609436821461,
"learning_rate": 5e-06,
"loss": 0.8733,
"step": 2400
},
{
"epoch": 1.2314767501277466,
"grad_norm": 0.07385728362174653,
"learning_rate": 5e-06,
"loss": 0.884,
"step": 2410
},
{
"epoch": 1.2365866121614717,
"grad_norm": 0.07450268605349716,
"learning_rate": 5e-06,
"loss": 0.8759,
"step": 2420
},
{
"epoch": 1.2416964741951968,
"grad_norm": 0.07354090310210888,
"learning_rate": 5e-06,
"loss": 0.8688,
"step": 2430
},
{
"epoch": 1.246806336228922,
"grad_norm": 0.07320978260247811,
"learning_rate": 5e-06,
"loss": 0.8714,
"step": 2440
},
{
"epoch": 1.2519161982626468,
"grad_norm": 0.11135661739097093,
"learning_rate": 5e-06,
"loss": 0.8739,
"step": 2450
},
{
"epoch": 1.257026060296372,
"grad_norm": 0.06689839605333785,
"learning_rate": 5e-06,
"loss": 0.8765,
"step": 2460
},
{
"epoch": 1.262135922330097,
"grad_norm": 0.06878455740414403,
"learning_rate": 5e-06,
"loss": 0.8689,
"step": 2470
},
{
"epoch": 1.2672457843638223,
"grad_norm": 0.07939944943381874,
"learning_rate": 5e-06,
"loss": 0.8746,
"step": 2480
},
{
"epoch": 1.2723556463975472,
"grad_norm": 0.0847486139125745,
"learning_rate": 5e-06,
"loss": 0.8559,
"step": 2490
},
{
"epoch": 1.2774655084312725,
"grad_norm": 0.07908001512097204,
"learning_rate": 5e-06,
"loss": 0.8837,
"step": 2500
},
{
"epoch": 1.2825753704649974,
"grad_norm": 0.07179831207428604,
"learning_rate": 5e-06,
"loss": 0.8719,
"step": 2510
},
{
"epoch": 1.2876852324987225,
"grad_norm": 0.06908942799455607,
"learning_rate": 5e-06,
"loss": 0.8719,
"step": 2520
},
{
"epoch": 1.2927950945324476,
"grad_norm": 0.18170225872460127,
"learning_rate": 5e-06,
"loss": 0.8716,
"step": 2530
},
{
"epoch": 1.2979049565661727,
"grad_norm": 0.07191218825096998,
"learning_rate": 5e-06,
"loss": 0.8722,
"step": 2540
},
{
"epoch": 1.3030148185998978,
"grad_norm": 0.07372060023260767,
"learning_rate": 5e-06,
"loss": 0.864,
"step": 2550
},
{
"epoch": 1.3081246806336229,
"grad_norm": 0.06857070499864196,
"learning_rate": 5e-06,
"loss": 0.8737,
"step": 2560
},
{
"epoch": 1.313234542667348,
"grad_norm": 0.07004154529738099,
"learning_rate": 5e-06,
"loss": 0.8813,
"step": 2570
},
{
"epoch": 1.318344404701073,
"grad_norm": 0.07646266858295366,
"learning_rate": 5e-06,
"loss": 0.8673,
"step": 2580
},
{
"epoch": 1.3234542667347982,
"grad_norm": 0.22621512935898874,
"learning_rate": 5e-06,
"loss": 0.874,
"step": 2590
},
{
"epoch": 1.3285641287685233,
"grad_norm": 0.07172127109965361,
"learning_rate": 5e-06,
"loss": 0.8702,
"step": 2600
},
{
"epoch": 1.3336739908022484,
"grad_norm": 0.07919820863816981,
"learning_rate": 5e-06,
"loss": 0.8714,
"step": 2610
},
{
"epoch": 1.3387838528359735,
"grad_norm": 0.0737384119314816,
"learning_rate": 5e-06,
"loss": 0.8596,
"step": 2620
},
{
"epoch": 1.3438937148696986,
"grad_norm": 0.07649678940149303,
"learning_rate": 5e-06,
"loss": 0.8762,
"step": 2630
},
{
"epoch": 1.3490035769034237,
"grad_norm": 0.07616587969512677,
"learning_rate": 5e-06,
"loss": 0.8752,
"step": 2640
},
{
"epoch": 1.3541134389371488,
"grad_norm": 0.07029295823641774,
"learning_rate": 5e-06,
"loss": 0.8873,
"step": 2650
},
{
"epoch": 1.3592233009708738,
"grad_norm": 0.09494349210733874,
"learning_rate": 5e-06,
"loss": 0.8711,
"step": 2660
},
{
"epoch": 1.364333163004599,
"grad_norm": 0.07259460873158792,
"learning_rate": 5e-06,
"loss": 0.8637,
"step": 2670
},
{
"epoch": 1.369443025038324,
"grad_norm": 0.07116222480426102,
"learning_rate": 5e-06,
"loss": 0.871,
"step": 2680
},
{
"epoch": 1.3745528870720491,
"grad_norm": 0.07020506586822424,
"learning_rate": 5e-06,
"loss": 0.8695,
"step": 2690
},
{
"epoch": 1.379662749105774,
"grad_norm": 0.07347815298194012,
"learning_rate": 5e-06,
"loss": 0.8635,
"step": 2700
},
{
"epoch": 1.3847726111394993,
"grad_norm": 0.07534096599250913,
"learning_rate": 5e-06,
"loss": 0.874,
"step": 2710
},
{
"epoch": 1.3898824731732242,
"grad_norm": 0.07312451287982565,
"learning_rate": 5e-06,
"loss": 0.8583,
"step": 2720
},
{
"epoch": 1.3949923352069495,
"grad_norm": 0.07656396202261084,
"learning_rate": 5e-06,
"loss": 0.8757,
"step": 2730
},
{
"epoch": 1.4001021972406744,
"grad_norm": 0.06967035802932788,
"learning_rate": 5e-06,
"loss": 0.8679,
"step": 2740
},
{
"epoch": 1.4052120592743995,
"grad_norm": 0.07040784275347066,
"learning_rate": 5e-06,
"loss": 0.8691,
"step": 2750
},
{
"epoch": 1.4103219213081246,
"grad_norm": 0.07613577722321895,
"learning_rate": 5e-06,
"loss": 0.8622,
"step": 2760
},
{
"epoch": 1.4154317833418497,
"grad_norm": 0.0864741566205661,
"learning_rate": 5e-06,
"loss": 0.8795,
"step": 2770
},
{
"epoch": 1.4205416453755748,
"grad_norm": 0.07310285802451263,
"learning_rate": 5e-06,
"loss": 0.8652,
"step": 2780
},
{
"epoch": 1.4256515074093,
"grad_norm": 0.09973189293248541,
"learning_rate": 5e-06,
"loss": 0.8588,
"step": 2790
},
{
"epoch": 1.430761369443025,
"grad_norm": 0.06701808614361114,
"learning_rate": 5e-06,
"loss": 0.8618,
"step": 2800
},
{
"epoch": 1.43587123147675,
"grad_norm": 0.07371045895798196,
"learning_rate": 5e-06,
"loss": 0.868,
"step": 2810
},
{
"epoch": 1.4409810935104752,
"grad_norm": 0.07317697928871572,
"learning_rate": 5e-06,
"loss": 0.8696,
"step": 2820
},
{
"epoch": 1.4460909555442003,
"grad_norm": 0.07564444911927448,
"learning_rate": 5e-06,
"loss": 0.8752,
"step": 2830
},
{
"epoch": 1.4512008175779254,
"grad_norm": 0.07293430197763458,
"learning_rate": 5e-06,
"loss": 0.8739,
"step": 2840
},
{
"epoch": 1.4563106796116505,
"grad_norm": 0.08105248343525902,
"learning_rate": 5e-06,
"loss": 0.8657,
"step": 2850
},
{
"epoch": 1.4614205416453756,
"grad_norm": 0.06997018312332681,
"learning_rate": 5e-06,
"loss": 0.8687,
"step": 2860
},
{
"epoch": 1.4665304036791007,
"grad_norm": 0.0747583962947945,
"learning_rate": 5e-06,
"loss": 0.876,
"step": 2870
},
{
"epoch": 1.4716402657128258,
"grad_norm": 0.07233657303691476,
"learning_rate": 5e-06,
"loss": 0.8737,
"step": 2880
},
{
"epoch": 1.476750127746551,
"grad_norm": 0.09587028399395618,
"learning_rate": 5e-06,
"loss": 0.8737,
"step": 2890
},
{
"epoch": 1.481859989780276,
"grad_norm": 0.07573225439316397,
"learning_rate": 5e-06,
"loss": 0.8768,
"step": 2900
},
{
"epoch": 1.486969851814001,
"grad_norm": 0.07814308912110395,
"learning_rate": 5e-06,
"loss": 0.8585,
"step": 2910
},
{
"epoch": 1.4920797138477262,
"grad_norm": 0.07563890740598028,
"learning_rate": 5e-06,
"loss": 0.8688,
"step": 2920
},
{
"epoch": 1.497189575881451,
"grad_norm": 0.09576148511380349,
"learning_rate": 5e-06,
"loss": 0.8664,
"step": 2930
},
{
"epoch": 1.5022994379151764,
"grad_norm": 0.07323289783838559,
"learning_rate": 5e-06,
"loss": 0.8635,
"step": 2940
},
{
"epoch": 1.5074092999489013,
"grad_norm": 0.06803171848149765,
"learning_rate": 5e-06,
"loss": 0.87,
"step": 2950
},
{
"epoch": 1.5125191619826266,
"grad_norm": 0.0750090128477362,
"learning_rate": 5e-06,
"loss": 0.8772,
"step": 2960
},
{
"epoch": 1.5176290240163515,
"grad_norm": 0.07137689878588128,
"learning_rate": 5e-06,
"loss": 0.8751,
"step": 2970
},
{
"epoch": 1.5227388860500768,
"grad_norm": 0.07023359972800564,
"learning_rate": 5e-06,
"loss": 0.8643,
"step": 2980
},
{
"epoch": 1.5278487480838017,
"grad_norm": 0.07455037859546688,
"learning_rate": 5e-06,
"loss": 0.861,
"step": 2990
},
{
"epoch": 1.532958610117527,
"grad_norm": 0.25433406248411833,
"learning_rate": 5e-06,
"loss": 0.8647,
"step": 3000
},
{
"epoch": 1.5380684721512519,
"grad_norm": 0.07418037183166513,
"learning_rate": 5e-06,
"loss": 0.8722,
"step": 3010
},
{
"epoch": 1.543178334184977,
"grad_norm": 0.527753296001325,
"learning_rate": 5e-06,
"loss": 0.8487,
"step": 3020
},
{
"epoch": 1.548288196218702,
"grad_norm": 0.08808550920766758,
"learning_rate": 5e-06,
"loss": 0.8496,
"step": 3030
},
{
"epoch": 1.5533980582524272,
"grad_norm": 0.2138159671824102,
"learning_rate": 5e-06,
"loss": 0.8589,
"step": 3040
},
{
"epoch": 1.5585079202861523,
"grad_norm": 0.0756879397945152,
"learning_rate": 5e-06,
"loss": 0.8611,
"step": 3050
},
{
"epoch": 1.5636177823198774,
"grad_norm": 0.07261770215074904,
"learning_rate": 5e-06,
"loss": 0.8653,
"step": 3060
},
{
"epoch": 1.5687276443536025,
"grad_norm": 0.0716298530862272,
"learning_rate": 5e-06,
"loss": 0.8742,
"step": 3070
},
{
"epoch": 1.5738375063873276,
"grad_norm": 0.07104512740332262,
"learning_rate": 5e-06,
"loss": 0.8639,
"step": 3080
},
{
"epoch": 1.5789473684210527,
"grad_norm": 0.0734502304224508,
"learning_rate": 5e-06,
"loss": 0.8654,
"step": 3090
},
{
"epoch": 1.5840572304547778,
"grad_norm": 0.0718119752165925,
"learning_rate": 5e-06,
"loss": 0.8829,
"step": 3100
},
{
"epoch": 1.5891670924885029,
"grad_norm": 0.09084312572124162,
"learning_rate": 5e-06,
"loss": 0.87,
"step": 3110
},
{
"epoch": 1.594276954522228,
"grad_norm": 0.0718386089812322,
"learning_rate": 5e-06,
"loss": 0.8674,
"step": 3120
},
{
"epoch": 1.599386816555953,
"grad_norm": 0.07237170282771108,
"learning_rate": 5e-06,
"loss": 0.8773,
"step": 3130
},
{
"epoch": 1.604496678589678,
"grad_norm": 0.08717441511772783,
"learning_rate": 5e-06,
"loss": 0.8596,
"step": 3140
},
{
"epoch": 1.6096065406234032,
"grad_norm": 0.07543204298441727,
"learning_rate": 5e-06,
"loss": 0.8662,
"step": 3150
},
{
"epoch": 1.6147164026571281,
"grad_norm": 0.16450122906983272,
"learning_rate": 5e-06,
"loss": 0.8521,
"step": 3160
},
{
"epoch": 1.6198262646908534,
"grad_norm": 0.0761301308978785,
"learning_rate": 5e-06,
"loss": 0.8676,
"step": 3170
},
{
"epoch": 1.6249361267245783,
"grad_norm": 0.07342038953957022,
"learning_rate": 5e-06,
"loss": 0.858,
"step": 3180
},
{
"epoch": 1.6300459887583036,
"grad_norm": 0.07763414058632681,
"learning_rate": 5e-06,
"loss": 0.8515,
"step": 3190
},
{
"epoch": 1.6351558507920285,
"grad_norm": 0.07214206206945292,
"learning_rate": 5e-06,
"loss": 0.8552,
"step": 3200
},
{
"epoch": 1.6402657128257538,
"grad_norm": 0.07217350290839333,
"learning_rate": 5e-06,
"loss": 0.8692,
"step": 3210
},
{
"epoch": 1.6453755748594787,
"grad_norm": 0.07049849978950093,
"learning_rate": 5e-06,
"loss": 0.8637,
"step": 3220
},
{
"epoch": 1.650485436893204,
"grad_norm": 0.07280794458006132,
"learning_rate": 5e-06,
"loss": 0.865,
"step": 3230
},
{
"epoch": 1.655595298926929,
"grad_norm": 0.0725642747319213,
"learning_rate": 5e-06,
"loss": 0.8707,
"step": 3240
},
{
"epoch": 1.660705160960654,
"grad_norm": 0.0833892314403624,
"learning_rate": 5e-06,
"loss": 0.8563,
"step": 3250
},
{
"epoch": 1.6658150229943791,
"grad_norm": 0.07584097737400752,
"learning_rate": 5e-06,
"loss": 0.8641,
"step": 3260
},
{
"epoch": 1.6709248850281042,
"grad_norm": 0.07213631314506445,
"learning_rate": 5e-06,
"loss": 0.8481,
"step": 3270
},
{
"epoch": 1.6760347470618293,
"grad_norm": 0.09381953559357656,
"learning_rate": 5e-06,
"loss": 0.8701,
"step": 3280
},
{
"epoch": 1.6811446090955544,
"grad_norm": 0.08666983262995791,
"learning_rate": 5e-06,
"loss": 0.8647,
"step": 3290
},
{
"epoch": 1.6862544711292795,
"grad_norm": 0.07241102137300287,
"learning_rate": 5e-06,
"loss": 0.8659,
"step": 3300
},
{
"epoch": 1.6913643331630046,
"grad_norm": 0.07570259861415368,
"learning_rate": 5e-06,
"loss": 0.857,
"step": 3310
},
{
"epoch": 1.6964741951967297,
"grad_norm": 0.0712132785838072,
"learning_rate": 5e-06,
"loss": 0.8628,
"step": 3320
},
{
"epoch": 1.7015840572304548,
"grad_norm": 0.08048218877183527,
"learning_rate": 5e-06,
"loss": 0.8695,
"step": 3330
},
{
"epoch": 1.70669391926418,
"grad_norm": 0.09318443713082784,
"learning_rate": 5e-06,
"loss": 0.8517,
"step": 3340
},
{
"epoch": 1.711803781297905,
"grad_norm": 0.07475815362258889,
"learning_rate": 5e-06,
"loss": 0.8511,
"step": 3350
},
{
"epoch": 1.71691364333163,
"grad_norm": 0.17675220557910395,
"learning_rate": 5e-06,
"loss": 0.8704,
"step": 3360
},
{
"epoch": 1.722023505365355,
"grad_norm": 0.08155115744528993,
"learning_rate": 5e-06,
"loss": 0.8782,
"step": 3370
},
{
"epoch": 1.7271333673990803,
"grad_norm": 0.0688426924243207,
"learning_rate": 5e-06,
"loss": 0.8745,
"step": 3380
},
{
"epoch": 1.7322432294328052,
"grad_norm": 0.20082535327419024,
"learning_rate": 5e-06,
"loss": 0.8714,
"step": 3390
},
{
"epoch": 1.7373530914665305,
"grad_norm": 0.07080189321410434,
"learning_rate": 5e-06,
"loss": 0.8599,
"step": 3400
},
{
"epoch": 1.7424629535002554,
"grad_norm": 0.07121276144536708,
"learning_rate": 5e-06,
"loss": 0.8559,
"step": 3410
},
{
"epoch": 1.7475728155339807,
"grad_norm": 0.07900066453877426,
"learning_rate": 5e-06,
"loss": 0.8478,
"step": 3420
},
{
"epoch": 1.7526826775677056,
"grad_norm": 0.06739380270330443,
"learning_rate": 5e-06,
"loss": 0.8607,
"step": 3430
},
{
"epoch": 1.757792539601431,
"grad_norm": 0.07034902532215459,
"learning_rate": 5e-06,
"loss": 0.8571,
"step": 3440
},
{
"epoch": 1.7629024016351558,
"grad_norm": 0.12075846844627011,
"learning_rate": 5e-06,
"loss": 0.8656,
"step": 3450
},
{
"epoch": 1.768012263668881,
"grad_norm": 0.06907079017712828,
"learning_rate": 5e-06,
"loss": 0.8639,
"step": 3460
},
{
"epoch": 1.773122125702606,
"grad_norm": 0.07179248413436795,
"learning_rate": 5e-06,
"loss": 0.8576,
"step": 3470
},
{
"epoch": 1.778231987736331,
"grad_norm": 0.07132838373781546,
"learning_rate": 5e-06,
"loss": 0.8491,
"step": 3480
},
{
"epoch": 1.7833418497700562,
"grad_norm": 0.06976172529981421,
"learning_rate": 5e-06,
"loss": 0.862,
"step": 3490
},
{
"epoch": 1.7884517118037813,
"grad_norm": 0.07698270839551638,
"learning_rate": 5e-06,
"loss": 0.8798,
"step": 3500
},
{
"epoch": 1.7935615738375064,
"grad_norm": 0.07221517095149313,
"learning_rate": 5e-06,
"loss": 0.8614,
"step": 3510
},
{
"epoch": 1.7986714358712315,
"grad_norm": 0.06942929153366358,
"learning_rate": 5e-06,
"loss": 0.8516,
"step": 3520
},
{
"epoch": 1.8037812979049566,
"grad_norm": 0.07515778700424314,
"learning_rate": 5e-06,
"loss": 0.8513,
"step": 3530
},
{
"epoch": 1.8088911599386817,
"grad_norm": 0.09189050887868352,
"learning_rate": 5e-06,
"loss": 0.8516,
"step": 3540
},
{
"epoch": 1.8140010219724068,
"grad_norm": 0.0728814328355567,
"learning_rate": 5e-06,
"loss": 0.8621,
"step": 3550
},
{
"epoch": 1.8191108840061319,
"grad_norm": 0.10913457550732027,
"learning_rate": 5e-06,
"loss": 0.8646,
"step": 3560
},
{
"epoch": 1.824220746039857,
"grad_norm": 0.07052348211530159,
"learning_rate": 5e-06,
"loss": 0.8681,
"step": 3570
},
{
"epoch": 1.829330608073582,
"grad_norm": 0.07264597385076586,
"learning_rate": 5e-06,
"loss": 0.8511,
"step": 3580
},
{
"epoch": 1.8344404701073072,
"grad_norm": 0.07018720603142706,
"learning_rate": 5e-06,
"loss": 0.8496,
"step": 3590
},
{
"epoch": 1.839550332141032,
"grad_norm": 0.07405274174709763,
"learning_rate": 5e-06,
"loss": 0.8642,
"step": 3600
},
{
"epoch": 1.8446601941747574,
"grad_norm": 0.06823076228063171,
"learning_rate": 5e-06,
"loss": 0.8688,
"step": 3610
},
{
"epoch": 1.8497700562084822,
"grad_norm": 0.07162073827665592,
"learning_rate": 5e-06,
"loss": 0.8539,
"step": 3620
},
{
"epoch": 1.8548799182422075,
"grad_norm": 0.08920479490208502,
"learning_rate": 5e-06,
"loss": 0.8654,
"step": 3630
},
{
"epoch": 1.8599897802759324,
"grad_norm": 0.06984715874447373,
"learning_rate": 5e-06,
"loss": 0.8398,
"step": 3640
},
{
"epoch": 1.8650996423096577,
"grad_norm": 0.07140680546752168,
"learning_rate": 5e-06,
"loss": 0.8563,
"step": 3650
},
{
"epoch": 1.8702095043433826,
"grad_norm": 0.074558519928601,
"learning_rate": 5e-06,
"loss": 0.8538,
"step": 3660
},
{
"epoch": 1.875319366377108,
"grad_norm": 0.0721153352880791,
"learning_rate": 5e-06,
"loss": 0.8681,
"step": 3670
},
{
"epoch": 1.8804292284108328,
"grad_norm": 0.0758789196857535,
"learning_rate": 5e-06,
"loss": 0.8635,
"step": 3680
},
{
"epoch": 1.8855390904445581,
"grad_norm": 0.14485848629874987,
"learning_rate": 5e-06,
"loss": 0.8481,
"step": 3690
},
{
"epoch": 1.890648952478283,
"grad_norm": 0.0819983215124987,
"learning_rate": 5e-06,
"loss": 0.8629,
"step": 3700
},
{
"epoch": 1.8957588145120083,
"grad_norm": 0.12069351872374186,
"learning_rate": 5e-06,
"loss": 0.8586,
"step": 3710
},
{
"epoch": 1.9008686765457332,
"grad_norm": 0.07900003124096018,
"learning_rate": 5e-06,
"loss": 0.8601,
"step": 3720
},
{
"epoch": 1.9059785385794583,
"grad_norm": 0.07294378619834212,
"learning_rate": 5e-06,
"loss": 0.8587,
"step": 3730
},
{
"epoch": 1.9110884006131834,
"grad_norm": 0.07407840442755602,
"learning_rate": 5e-06,
"loss": 0.8528,
"step": 3740
},
{
"epoch": 1.9161982626469085,
"grad_norm": 0.06874497219883488,
"learning_rate": 5e-06,
"loss": 0.8605,
"step": 3750
},
{
"epoch": 1.9213081246806336,
"grad_norm": 0.07843559919629378,
"learning_rate": 5e-06,
"loss": 0.8527,
"step": 3760
},
{
"epoch": 1.9264179867143587,
"grad_norm": 0.07180996099623314,
"learning_rate": 5e-06,
"loss": 0.8523,
"step": 3770
},
{
"epoch": 1.9315278487480838,
"grad_norm": 0.08056676529863596,
"learning_rate": 5e-06,
"loss": 0.8633,
"step": 3780
},
{
"epoch": 1.936637710781809,
"grad_norm": 0.07275382170948991,
"learning_rate": 5e-06,
"loss": 0.8515,
"step": 3790
},
{
"epoch": 1.941747572815534,
"grad_norm": 0.07098939053342133,
"learning_rate": 5e-06,
"loss": 0.8547,
"step": 3800
},
{
"epoch": 1.946857434849259,
"grad_norm": 0.06701155010213629,
"learning_rate": 5e-06,
"loss": 0.8481,
"step": 3810
},
{
"epoch": 1.9519672968829842,
"grad_norm": 0.07031575089856135,
"learning_rate": 5e-06,
"loss": 0.8438,
"step": 3820
},
{
"epoch": 1.9570771589167093,
"grad_norm": 0.07564841797187823,
"learning_rate": 5e-06,
"loss": 0.8582,
"step": 3830
},
{
"epoch": 1.9621870209504344,
"grad_norm": 0.07024404592017057,
"learning_rate": 5e-06,
"loss": 0.8621,
"step": 3840
},
{
"epoch": 1.9672968829841593,
"grad_norm": 0.06812867527901585,
"learning_rate": 5e-06,
"loss": 0.8535,
"step": 3850
},
{
"epoch": 1.9724067450178846,
"grad_norm": 0.06914782077210797,
"learning_rate": 5e-06,
"loss": 0.8579,
"step": 3860
},
{
"epoch": 1.9775166070516095,
"grad_norm": 0.07264208755659882,
"learning_rate": 5e-06,
"loss": 0.8587,
"step": 3870
},
{
"epoch": 1.9826264690853348,
"grad_norm": 0.07656435460747489,
"learning_rate": 5e-06,
"loss": 0.8569,
"step": 3880
},
{
"epoch": 1.9877363311190597,
"grad_norm": 0.07973389243138154,
"learning_rate": 5e-06,
"loss": 0.8519,
"step": 3890
},
{
"epoch": 1.992846193152785,
"grad_norm": 0.07179271046170288,
"learning_rate": 5e-06,
"loss": 0.8622,
"step": 3900
},
{
"epoch": 1.9979560551865099,
"grad_norm": 0.11365614273983228,
"learning_rate": 5e-06,
"loss": 0.8573,
"step": 3910
},
{
"epoch": 2.003065917220235,
"grad_norm": 0.07141837340160201,
"learning_rate": 5e-06,
"loss": 0.8513,
"step": 3920
},
{
"epoch": 2.00817577925396,
"grad_norm": 0.10505624480952601,
"learning_rate": 5e-06,
"loss": 0.8631,
"step": 3930
},
{
"epoch": 2.0132856412876854,
"grad_norm": 0.07301133387642879,
"learning_rate": 5e-06,
"loss": 0.851,
"step": 3940
},
{
"epoch": 2.0183955033214103,
"grad_norm": 0.08124439773368344,
"learning_rate": 5e-06,
"loss": 0.8556,
"step": 3950
},
{
"epoch": 2.0235053653551356,
"grad_norm": 0.18456635683801864,
"learning_rate": 5e-06,
"loss": 0.8601,
"step": 3960
},
{
"epoch": 2.0286152273888605,
"grad_norm": 0.0839091878240074,
"learning_rate": 5e-06,
"loss": 0.8476,
"step": 3970
},
{
"epoch": 2.033725089422586,
"grad_norm": 0.06923549834655754,
"learning_rate": 5e-06,
"loss": 0.836,
"step": 3980
},
{
"epoch": 2.0388349514563107,
"grad_norm": 0.07270485015207773,
"learning_rate": 5e-06,
"loss": 0.8559,
"step": 3990
},
{
"epoch": 2.043944813490036,
"grad_norm": 0.0855738032972615,
"learning_rate": 5e-06,
"loss": 0.8527,
"step": 4000
},
{
"epoch": 2.049054675523761,
"grad_norm": 0.0737050057656388,
"learning_rate": 5e-06,
"loss": 0.8532,
"step": 4010
},
{
"epoch": 2.0541645375574857,
"grad_norm": 0.10214471523135125,
"learning_rate": 5e-06,
"loss": 0.8428,
"step": 4020
},
{
"epoch": 2.059274399591211,
"grad_norm": 0.0883446336624383,
"learning_rate": 5e-06,
"loss": 0.8451,
"step": 4030
},
{
"epoch": 2.064384261624936,
"grad_norm": 0.07417051518736814,
"learning_rate": 5e-06,
"loss": 0.8536,
"step": 4040
},
{
"epoch": 2.0694941236586613,
"grad_norm": 0.06790782347477319,
"learning_rate": 5e-06,
"loss": 0.8375,
"step": 4050
},
{
"epoch": 2.074603985692386,
"grad_norm": 0.07032456253507346,
"learning_rate": 5e-06,
"loss": 0.8429,
"step": 4060
},
{
"epoch": 2.0797138477261115,
"grad_norm": 0.10296354446960965,
"learning_rate": 5e-06,
"loss": 0.8459,
"step": 4070
},
{
"epoch": 2.0848237097598363,
"grad_norm": 0.11146967299257413,
"learning_rate": 5e-06,
"loss": 0.8368,
"step": 4080
},
{
"epoch": 2.0899335717935617,
"grad_norm": 0.07549742436944092,
"learning_rate": 5e-06,
"loss": 0.8523,
"step": 4090
},
{
"epoch": 2.0950434338272865,
"grad_norm": 0.07340256510209953,
"learning_rate": 5e-06,
"loss": 0.8322,
"step": 4100
},
{
"epoch": 2.100153295861012,
"grad_norm": 0.0757031546197513,
"learning_rate": 5e-06,
"loss": 0.8551,
"step": 4110
},
{
"epoch": 2.1052631578947367,
"grad_norm": 0.15844891302909508,
"learning_rate": 5e-06,
"loss": 0.8361,
"step": 4120
},
{
"epoch": 2.110373019928462,
"grad_norm": 0.07913043954436448,
"learning_rate": 5e-06,
"loss": 0.8478,
"step": 4130
},
{
"epoch": 2.115482881962187,
"grad_norm": 0.07876653993674046,
"learning_rate": 5e-06,
"loss": 0.8491,
"step": 4140
},
{
"epoch": 2.1205927439959122,
"grad_norm": 0.1767146264245717,
"learning_rate": 5e-06,
"loss": 0.8451,
"step": 4150
},
{
"epoch": 2.125702606029637,
"grad_norm": 0.06871322115584581,
"learning_rate": 5e-06,
"loss": 0.8586,
"step": 4160
},
{
"epoch": 2.1308124680633624,
"grad_norm": 0.07906449831613256,
"learning_rate": 5e-06,
"loss": 0.8477,
"step": 4170
},
{
"epoch": 2.1359223300970873,
"grad_norm": 0.09775775034969345,
"learning_rate": 5e-06,
"loss": 0.8522,
"step": 4180
},
{
"epoch": 2.1410321921308126,
"grad_norm": 0.0760441138088996,
"learning_rate": 5e-06,
"loss": 0.8469,
"step": 4190
},
{
"epoch": 2.1461420541645375,
"grad_norm": 0.07225182587888014,
"learning_rate": 5e-06,
"loss": 0.8608,
"step": 4200
},
{
"epoch": 2.151251916198263,
"grad_norm": 0.13154761525803269,
"learning_rate": 5e-06,
"loss": 0.8488,
"step": 4210
},
{
"epoch": 2.1563617782319877,
"grad_norm": 0.07374851045064566,
"learning_rate": 5e-06,
"loss": 0.841,
"step": 4220
},
{
"epoch": 2.161471640265713,
"grad_norm": 0.07031542242629284,
"learning_rate": 5e-06,
"loss": 0.8568,
"step": 4230
},
{
"epoch": 2.166581502299438,
"grad_norm": 0.07651902738251523,
"learning_rate": 5e-06,
"loss": 0.8352,
"step": 4240
},
{
"epoch": 2.171691364333163,
"grad_norm": 0.071926401689458,
"learning_rate": 5e-06,
"loss": 0.8558,
"step": 4250
},
{
"epoch": 2.176801226366888,
"grad_norm": 0.07423622272637805,
"learning_rate": 5e-06,
"loss": 0.8418,
"step": 4260
},
{
"epoch": 2.181911088400613,
"grad_norm": 0.07126311114741704,
"learning_rate": 5e-06,
"loss": 0.8522,
"step": 4270
},
{
"epoch": 2.1870209504343383,
"grad_norm": 0.07377430376003857,
"learning_rate": 5e-06,
"loss": 0.8446,
"step": 4280
},
{
"epoch": 2.192130812468063,
"grad_norm": 0.06949191578937867,
"learning_rate": 5e-06,
"loss": 0.8454,
"step": 4290
},
{
"epoch": 2.1972406745017885,
"grad_norm": 0.07843634966497359,
"learning_rate": 5e-06,
"loss": 0.8495,
"step": 4300
},
{
"epoch": 2.2023505365355134,
"grad_norm": 0.07396166555039149,
"learning_rate": 5e-06,
"loss": 0.8544,
"step": 4310
},
{
"epoch": 2.2074603985692387,
"grad_norm": 0.06906877269519048,
"learning_rate": 5e-06,
"loss": 0.8519,
"step": 4320
},
{
"epoch": 2.2125702606029636,
"grad_norm": 0.07738435381809824,
"learning_rate": 5e-06,
"loss": 0.8612,
"step": 4330
},
{
"epoch": 2.217680122636689,
"grad_norm": 0.07272677909652538,
"learning_rate": 5e-06,
"loss": 0.8483,
"step": 4340
},
{
"epoch": 2.2227899846704138,
"grad_norm": 0.1704602161006234,
"learning_rate": 5e-06,
"loss": 0.8494,
"step": 4350
},
{
"epoch": 2.227899846704139,
"grad_norm": 0.08280430588455355,
"learning_rate": 5e-06,
"loss": 0.8369,
"step": 4360
},
{
"epoch": 2.233009708737864,
"grad_norm": 0.0778136234869108,
"learning_rate": 5e-06,
"loss": 0.8413,
"step": 4370
},
{
"epoch": 2.2381195707715893,
"grad_norm": 0.06902177322002463,
"learning_rate": 5e-06,
"loss": 0.8555,
"step": 4380
},
{
"epoch": 2.243229432805314,
"grad_norm": 0.07436632107648229,
"learning_rate": 5e-06,
"loss": 0.8341,
"step": 4390
},
{
"epoch": 2.2483392948390395,
"grad_norm": 0.08037131223518179,
"learning_rate": 5e-06,
"loss": 0.837,
"step": 4400
},
{
"epoch": 2.2534491568727644,
"grad_norm": 0.12080203046044066,
"learning_rate": 5e-06,
"loss": 0.8484,
"step": 4410
},
{
"epoch": 2.2585590189064897,
"grad_norm": 0.08092569363882715,
"learning_rate": 5e-06,
"loss": 0.8387,
"step": 4420
},
{
"epoch": 2.2636688809402146,
"grad_norm": 0.07901582143607909,
"learning_rate": 5e-06,
"loss": 0.8436,
"step": 4430
},
{
"epoch": 2.26877874297394,
"grad_norm": 0.07599584492537564,
"learning_rate": 5e-06,
"loss": 0.8603,
"step": 4440
},
{
"epoch": 2.2738886050076648,
"grad_norm": 0.07078355889274002,
"learning_rate": 5e-06,
"loss": 0.8354,
"step": 4450
},
{
"epoch": 2.27899846704139,
"grad_norm": 0.10294533024093251,
"learning_rate": 5e-06,
"loss": 0.8515,
"step": 4460
},
{
"epoch": 2.284108329075115,
"grad_norm": 0.07272362249279035,
"learning_rate": 5e-06,
"loss": 0.8448,
"step": 4470
},
{
"epoch": 2.28921819110884,
"grad_norm": 0.07263729571802433,
"learning_rate": 5e-06,
"loss": 0.8457,
"step": 4480
},
{
"epoch": 2.294328053142565,
"grad_norm": 0.07075340939726783,
"learning_rate": 5e-06,
"loss": 0.8479,
"step": 4490
},
{
"epoch": 2.2994379151762905,
"grad_norm": 0.07782417357166051,
"learning_rate": 5e-06,
"loss": 0.8553,
"step": 4500
},
{
"epoch": 2.3045477772100154,
"grad_norm": 0.07531889060674515,
"learning_rate": 5e-06,
"loss": 0.8462,
"step": 4510
},
{
"epoch": 2.3096576392437402,
"grad_norm": 0.07208019084503213,
"learning_rate": 5e-06,
"loss": 0.8525,
"step": 4520
},
{
"epoch": 2.3147675012774656,
"grad_norm": 0.0835242862697962,
"learning_rate": 5e-06,
"loss": 0.8513,
"step": 4530
},
{
"epoch": 2.3198773633111904,
"grad_norm": 0.07119059664069095,
"learning_rate": 5e-06,
"loss": 0.8447,
"step": 4540
},
{
"epoch": 2.3249872253449158,
"grad_norm": 0.07612636819413399,
"learning_rate": 5e-06,
"loss": 0.8392,
"step": 4550
},
{
"epoch": 2.3300970873786406,
"grad_norm": 0.19261221848157942,
"learning_rate": 5e-06,
"loss": 0.8412,
"step": 4560
},
{
"epoch": 2.335206949412366,
"grad_norm": 0.07559214061873046,
"learning_rate": 5e-06,
"loss": 0.8439,
"step": 4570
},
{
"epoch": 2.340316811446091,
"grad_norm": 0.07642749847907093,
"learning_rate": 5e-06,
"loss": 0.8496,
"step": 4580
},
{
"epoch": 2.345426673479816,
"grad_norm": 0.08063004427375733,
"learning_rate": 5e-06,
"loss": 0.8409,
"step": 4590
},
{
"epoch": 2.350536535513541,
"grad_norm": 0.0720684787353252,
"learning_rate": 5e-06,
"loss": 0.8307,
"step": 4600
},
{
"epoch": 2.3556463975472663,
"grad_norm": 0.0694395988603129,
"learning_rate": 5e-06,
"loss": 0.8495,
"step": 4610
},
{
"epoch": 2.3607562595809912,
"grad_norm": 0.12026943156490484,
"learning_rate": 5e-06,
"loss": 0.853,
"step": 4620
},
{
"epoch": 2.3658661216147165,
"grad_norm": 0.07464848880206523,
"learning_rate": 5e-06,
"loss": 0.8504,
"step": 4630
},
{
"epoch": 2.3709759836484414,
"grad_norm": 0.15496993855974875,
"learning_rate": 5e-06,
"loss": 0.8441,
"step": 4640
},
{
"epoch": 2.3760858456821667,
"grad_norm": 0.07134185410989129,
"learning_rate": 5e-06,
"loss": 0.8447,
"step": 4650
},
{
"epoch": 2.3811957077158916,
"grad_norm": 0.07242368272475976,
"learning_rate": 5e-06,
"loss": 0.8406,
"step": 4660
},
{
"epoch": 2.386305569749617,
"grad_norm": 0.0697443617984476,
"learning_rate": 5e-06,
"loss": 0.8329,
"step": 4670
},
{
"epoch": 2.391415431783342,
"grad_norm": 0.07044017928388062,
"learning_rate": 5e-06,
"loss": 0.8462,
"step": 4680
},
{
"epoch": 2.396525293817067,
"grad_norm": 0.06981669634551392,
"learning_rate": 5e-06,
"loss": 0.8348,
"step": 4690
},
{
"epoch": 2.401635155850792,
"grad_norm": 0.07668972004047576,
"learning_rate": 5e-06,
"loss": 0.8398,
"step": 4700
},
{
"epoch": 2.406745017884517,
"grad_norm": 0.07415042030571524,
"learning_rate": 5e-06,
"loss": 0.8443,
"step": 4710
},
{
"epoch": 2.411854879918242,
"grad_norm": 0.07295120303267964,
"learning_rate": 5e-06,
"loss": 0.8507,
"step": 4720
},
{
"epoch": 2.4169647419519675,
"grad_norm": 0.07741393469112542,
"learning_rate": 5e-06,
"loss": 0.8275,
"step": 4730
},
{
"epoch": 2.4220746039856924,
"grad_norm": 0.07693385601963627,
"learning_rate": 5e-06,
"loss": 0.8308,
"step": 4740
},
{
"epoch": 2.4271844660194173,
"grad_norm": 0.0755618799603966,
"learning_rate": 5e-06,
"loss": 0.8405,
"step": 4750
},
{
"epoch": 2.4322943280531426,
"grad_norm": 0.09159998552156758,
"learning_rate": 5e-06,
"loss": 0.8347,
"step": 4760
},
{
"epoch": 2.4374041900868675,
"grad_norm": 0.07341845982606449,
"learning_rate": 5e-06,
"loss": 0.8322,
"step": 4770
},
{
"epoch": 2.442514052120593,
"grad_norm": 0.07237831649311181,
"learning_rate": 5e-06,
"loss": 0.8484,
"step": 4780
},
{
"epoch": 2.4476239141543177,
"grad_norm": 0.07192165535074778,
"learning_rate": 5e-06,
"loss": 0.8408,
"step": 4790
},
{
"epoch": 2.452733776188043,
"grad_norm": 0.06648324207306504,
"learning_rate": 5e-06,
"loss": 0.8425,
"step": 4800
},
{
"epoch": 2.457843638221768,
"grad_norm": 0.07448716190725979,
"learning_rate": 5e-06,
"loss": 0.8425,
"step": 4810
},
{
"epoch": 2.462953500255493,
"grad_norm": 0.07121284030804295,
"learning_rate": 5e-06,
"loss": 0.8375,
"step": 4820
},
{
"epoch": 2.468063362289218,
"grad_norm": 0.06909159105773967,
"learning_rate": 5e-06,
"loss": 0.854,
"step": 4830
},
{
"epoch": 2.4731732243229434,
"grad_norm": 0.13098921577807285,
"learning_rate": 5e-06,
"loss": 0.8384,
"step": 4840
},
{
"epoch": 2.4782830863566683,
"grad_norm": 0.07170492201621687,
"learning_rate": 5e-06,
"loss": 0.8404,
"step": 4850
},
{
"epoch": 2.4833929483903936,
"grad_norm": 0.13089324735228272,
"learning_rate": 5e-06,
"loss": 0.8348,
"step": 4860
},
{
"epoch": 2.4885028104241185,
"grad_norm": 0.08153573679797024,
"learning_rate": 5e-06,
"loss": 0.842,
"step": 4870
},
{
"epoch": 2.493612672457844,
"grad_norm": 0.07186193556818891,
"learning_rate": 5e-06,
"loss": 0.8401,
"step": 4880
},
{
"epoch": 2.4987225344915687,
"grad_norm": 0.06985782726108822,
"learning_rate": 5e-06,
"loss": 0.8358,
"step": 4890
},
{
"epoch": 2.5038323965252935,
"grad_norm": 0.08449885090909025,
"learning_rate": 5e-06,
"loss": 0.832,
"step": 4900
},
{
"epoch": 2.508942258559019,
"grad_norm": 0.07501130238223368,
"learning_rate": 5e-06,
"loss": 0.8418,
"step": 4910
},
{
"epoch": 2.514052120592744,
"grad_norm": 0.07141698939838328,
"learning_rate": 5e-06,
"loss": 0.8446,
"step": 4920
},
{
"epoch": 2.519161982626469,
"grad_norm": 0.08787977387334635,
"learning_rate": 5e-06,
"loss": 0.8452,
"step": 4930
},
{
"epoch": 2.524271844660194,
"grad_norm": 0.07291979045054302,
"learning_rate": 5e-06,
"loss": 0.8272,
"step": 4940
},
{
"epoch": 2.5293817066939193,
"grad_norm": 0.09132127326249193,
"learning_rate": 5e-06,
"loss": 0.8307,
"step": 4950
},
{
"epoch": 2.5344915687276446,
"grad_norm": 0.07043767494061426,
"learning_rate": 5e-06,
"loss": 0.8341,
"step": 4960
},
{
"epoch": 2.5396014307613695,
"grad_norm": 0.06814985809885249,
"learning_rate": 5e-06,
"loss": 0.8434,
"step": 4970
},
{
"epoch": 2.5447112927950943,
"grad_norm": 0.06881812234299259,
"learning_rate": 5e-06,
"loss": 0.8336,
"step": 4980
},
{
"epoch": 2.5498211548288197,
"grad_norm": 0.07479588795644554,
"learning_rate": 5e-06,
"loss": 0.841,
"step": 4990
},
{
"epoch": 2.554931016862545,
"grad_norm": 0.07030708837765214,
"learning_rate": 5e-06,
"loss": 0.8345,
"step": 5000
},
{
"epoch": 2.56004087889627,
"grad_norm": 0.07322531185816425,
"learning_rate": 5e-06,
"loss": 0.8522,
"step": 5010
},
{
"epoch": 2.5651507409299947,
"grad_norm": 0.0773410219834939,
"learning_rate": 5e-06,
"loss": 0.8336,
"step": 5020
},
{
"epoch": 2.57026060296372,
"grad_norm": 0.07112761941798731,
"learning_rate": 5e-06,
"loss": 0.8356,
"step": 5030
},
{
"epoch": 2.575370464997445,
"grad_norm": 0.08092671080816972,
"learning_rate": 5e-06,
"loss": 0.8395,
"step": 5040
},
{
"epoch": 2.5804803270311703,
"grad_norm": 0.0743938817129939,
"learning_rate": 5e-06,
"loss": 0.8436,
"step": 5050
},
{
"epoch": 2.585590189064895,
"grad_norm": 0.09478134737738281,
"learning_rate": 5e-06,
"loss": 0.8346,
"step": 5060
},
{
"epoch": 2.5907000510986204,
"grad_norm": 0.07135602438861294,
"learning_rate": 5e-06,
"loss": 0.8478,
"step": 5070
},
{
"epoch": 2.5958099131323453,
"grad_norm": 0.07891915237359796,
"learning_rate": 5e-06,
"loss": 0.8447,
"step": 5080
},
{
"epoch": 2.6009197751660706,
"grad_norm": 0.0677288319217476,
"learning_rate": 5e-06,
"loss": 0.837,
"step": 5090
},
{
"epoch": 2.6060296371997955,
"grad_norm": 0.07944388902515932,
"learning_rate": 5e-06,
"loss": 0.835,
"step": 5100
},
{
"epoch": 2.611139499233521,
"grad_norm": 0.08111140690525463,
"learning_rate": 5e-06,
"loss": 0.8514,
"step": 5110
},
{
"epoch": 2.6162493612672457,
"grad_norm": 0.07187495630590449,
"learning_rate": 5e-06,
"loss": 0.8218,
"step": 5120
},
{
"epoch": 2.6213592233009706,
"grad_norm": 0.07201247333070886,
"learning_rate": 5e-06,
"loss": 0.8459,
"step": 5130
},
{
"epoch": 2.626469085334696,
"grad_norm": 0.0725250109726293,
"learning_rate": 5e-06,
"loss": 0.8316,
"step": 5140
},
{
"epoch": 2.6315789473684212,
"grad_norm": 0.06989455908125097,
"learning_rate": 5e-06,
"loss": 0.8387,
"step": 5150
},
{
"epoch": 2.636688809402146,
"grad_norm": 0.0732170962484253,
"learning_rate": 5e-06,
"loss": 0.8293,
"step": 5160
},
{
"epoch": 2.641798671435871,
"grad_norm": 0.10306093874083574,
"learning_rate": 5e-06,
"loss": 0.8232,
"step": 5170
},
{
"epoch": 2.6469085334695963,
"grad_norm": 0.08216331881118105,
"learning_rate": 5e-06,
"loss": 0.8313,
"step": 5180
},
{
"epoch": 2.6520183955033216,
"grad_norm": 0.076191518190693,
"learning_rate": 5e-06,
"loss": 0.8421,
"step": 5190
},
{
"epoch": 2.6571282575370465,
"grad_norm": 0.0740134986538764,
"learning_rate": 5e-06,
"loss": 0.8299,
"step": 5200
},
{
"epoch": 2.6622381195707714,
"grad_norm": 0.07496389694090964,
"learning_rate": 5e-06,
"loss": 0.8487,
"step": 5210
},
{
"epoch": 2.6673479816044967,
"grad_norm": 0.07747091248924778,
"learning_rate": 5e-06,
"loss": 0.8347,
"step": 5220
},
{
"epoch": 2.672457843638222,
"grad_norm": 0.07574688789486558,
"learning_rate": 5e-06,
"loss": 0.8425,
"step": 5230
},
{
"epoch": 2.677567705671947,
"grad_norm": 0.6563717615599837,
"learning_rate": 5e-06,
"loss": 0.84,
"step": 5240
},
{
"epoch": 2.682677567705672,
"grad_norm": 0.0686367826525851,
"learning_rate": 5e-06,
"loss": 0.8379,
"step": 5250
},
{
"epoch": 2.687787429739397,
"grad_norm": 0.07371216319372703,
"learning_rate": 5e-06,
"loss": 0.8394,
"step": 5260
},
{
"epoch": 2.692897291773122,
"grad_norm": 0.08012300669491436,
"learning_rate": 5e-06,
"loss": 0.8384,
"step": 5270
},
{
"epoch": 2.6980071538068473,
"grad_norm": 0.07316847913612938,
"learning_rate": 5e-06,
"loss": 0.8282,
"step": 5280
},
{
"epoch": 2.703117015840572,
"grad_norm": 0.07844263026076834,
"learning_rate": 5e-06,
"loss": 0.8436,
"step": 5290
},
{
"epoch": 2.7082268778742975,
"grad_norm": 0.07050662660833308,
"learning_rate": 5e-06,
"loss": 0.8351,
"step": 5300
},
{
"epoch": 2.7133367399080224,
"grad_norm": 0.07050037035095988,
"learning_rate": 5e-06,
"loss": 0.8316,
"step": 5310
},
{
"epoch": 2.7184466019417477,
"grad_norm": 0.08412885461053499,
"learning_rate": 5e-06,
"loss": 0.8222,
"step": 5320
},
{
"epoch": 2.7235564639754726,
"grad_norm": 0.0687520823166467,
"learning_rate": 5e-06,
"loss": 0.8348,
"step": 5330
},
{
"epoch": 2.728666326009198,
"grad_norm": 0.06966312923863605,
"learning_rate": 5e-06,
"loss": 0.8441,
"step": 5340
},
{
"epoch": 2.7337761880429228,
"grad_norm": 0.07226731254674132,
"learning_rate": 5e-06,
"loss": 0.848,
"step": 5350
},
{
"epoch": 2.738886050076648,
"grad_norm": 0.06981884709594542,
"learning_rate": 5e-06,
"loss": 0.8431,
"step": 5360
},
{
"epoch": 2.743995912110373,
"grad_norm": 0.2293923868794652,
"learning_rate": 5e-06,
"loss": 0.8258,
"step": 5370
},
{
"epoch": 2.7491057741440983,
"grad_norm": 0.0711478134486219,
"learning_rate": 5e-06,
"loss": 0.8457,
"step": 5380
},
{
"epoch": 2.754215636177823,
"grad_norm": 0.0748407931600363,
"learning_rate": 5e-06,
"loss": 0.8368,
"step": 5390
},
{
"epoch": 2.759325498211548,
"grad_norm": 0.07392069328246453,
"learning_rate": 5e-06,
"loss": 0.8352,
"step": 5400
},
{
"epoch": 2.7644353602452734,
"grad_norm": 0.0721976398087471,
"learning_rate": 5e-06,
"loss": 0.8393,
"step": 5410
},
{
"epoch": 2.7695452222789987,
"grad_norm": 0.07095428053471639,
"learning_rate": 5e-06,
"loss": 0.8471,
"step": 5420
},
{
"epoch": 2.7746550843127236,
"grad_norm": 0.07005521600898579,
"learning_rate": 5e-06,
"loss": 0.8313,
"step": 5430
},
{
"epoch": 2.7797649463464484,
"grad_norm": 0.07197884091944991,
"learning_rate": 5e-06,
"loss": 0.8344,
"step": 5440
},
{
"epoch": 2.7848748083801738,
"grad_norm": 0.07344997287379904,
"learning_rate": 5e-06,
"loss": 0.8362,
"step": 5450
},
{
"epoch": 2.789984670413899,
"grad_norm": 0.08322662489974626,
"learning_rate": 5e-06,
"loss": 0.8367,
"step": 5460
},
{
"epoch": 2.795094532447624,
"grad_norm": 0.07375885412776004,
"learning_rate": 5e-06,
"loss": 0.8512,
"step": 5470
},
{
"epoch": 2.800204394481349,
"grad_norm": 0.07070472807893792,
"learning_rate": 5e-06,
"loss": 0.8253,
"step": 5480
},
{
"epoch": 2.805314256515074,
"grad_norm": 0.07428848558504005,
"learning_rate": 5e-06,
"loss": 0.8468,
"step": 5490
},
{
"epoch": 2.810424118548799,
"grad_norm": 0.07307806588861744,
"learning_rate": 5e-06,
"loss": 0.8234,
"step": 5500
},
{
"epoch": 2.8155339805825244,
"grad_norm": 0.06945402346011086,
"learning_rate": 5e-06,
"loss": 0.8249,
"step": 5510
},
{
"epoch": 2.8206438426162492,
"grad_norm": 0.07097605102696264,
"learning_rate": 5e-06,
"loss": 0.825,
"step": 5520
},
{
"epoch": 2.8257537046499746,
"grad_norm": 0.06978991852647402,
"learning_rate": 5e-06,
"loss": 0.832,
"step": 5530
},
{
"epoch": 2.8308635666836994,
"grad_norm": 0.06908078380765026,
"learning_rate": 5e-06,
"loss": 0.8399,
"step": 5540
},
{
"epoch": 2.8359734287174247,
"grad_norm": 0.07440005138379917,
"learning_rate": 5e-06,
"loss": 0.8586,
"step": 5550
},
{
"epoch": 2.8410832907511496,
"grad_norm": 0.07215664654991572,
"learning_rate": 5e-06,
"loss": 0.8344,
"step": 5560
},
{
"epoch": 2.846193152784875,
"grad_norm": 0.07245462980842311,
"learning_rate": 5e-06,
"loss": 0.8377,
"step": 5570
},
{
"epoch": 2.8513030148186,
"grad_norm": 0.09751916723568736,
"learning_rate": 5e-06,
"loss": 0.8359,
"step": 5580
},
{
"epoch": 2.856412876852325,
"grad_norm": 0.0855945660262981,
"learning_rate": 5e-06,
"loss": 0.8204,
"step": 5590
},
{
"epoch": 2.86152273888605,
"grad_norm": 0.07362144431075729,
"learning_rate": 5e-06,
"loss": 0.8339,
"step": 5600
},
{
"epoch": 2.8666326009197753,
"grad_norm": 0.13422144896661692,
"learning_rate": 5e-06,
"loss": 0.8156,
"step": 5610
},
{
"epoch": 2.8717424629535,
"grad_norm": 0.07766367283659784,
"learning_rate": 5e-06,
"loss": 0.8366,
"step": 5620
},
{
"epoch": 2.876852324987225,
"grad_norm": 0.07067764533904561,
"learning_rate": 5e-06,
"loss": 0.8319,
"step": 5630
},
{
"epoch": 2.8819621870209504,
"grad_norm": 0.07333934553515849,
"learning_rate": 5e-06,
"loss": 0.8403,
"step": 5640
},
{
"epoch": 2.8870720490546757,
"grad_norm": 0.06944274451784573,
"learning_rate": 5e-06,
"loss": 0.8147,
"step": 5650
},
{
"epoch": 2.8921819110884006,
"grad_norm": 0.07851574427605637,
"learning_rate": 5e-06,
"loss": 0.8324,
"step": 5660
},
{
"epoch": 2.8972917731221255,
"grad_norm": 0.07149894775276078,
"learning_rate": 5e-06,
"loss": 0.8299,
"step": 5670
},
{
"epoch": 2.902401635155851,
"grad_norm": 0.06665250066960869,
"learning_rate": 5e-06,
"loss": 0.8375,
"step": 5680
},
{
"epoch": 2.907511497189576,
"grad_norm": 0.07577406445198871,
"learning_rate": 5e-06,
"loss": 0.8448,
"step": 5690
},
{
"epoch": 2.912621359223301,
"grad_norm": 0.0713958097703238,
"learning_rate": 5e-06,
"loss": 0.8229,
"step": 5700
},
{
"epoch": 2.917731221257026,
"grad_norm": 0.06957330049674462,
"learning_rate": 5e-06,
"loss": 0.8183,
"step": 5710
},
{
"epoch": 2.922841083290751,
"grad_norm": 0.07288163057814427,
"learning_rate": 5e-06,
"loss": 0.8331,
"step": 5720
},
{
"epoch": 2.927950945324476,
"grad_norm": 0.08344782116194745,
"learning_rate": 5e-06,
"loss": 0.8438,
"step": 5730
},
{
"epoch": 2.9330608073582014,
"grad_norm": 0.07775582347773638,
"learning_rate": 5e-06,
"loss": 0.8366,
"step": 5740
},
{
"epoch": 2.9381706693919263,
"grad_norm": 0.07127430773847676,
"learning_rate": 5e-06,
"loss": 0.8292,
"step": 5750
},
{
"epoch": 2.9432805314256516,
"grad_norm": 0.07234002587778945,
"learning_rate": 5e-06,
"loss": 0.8243,
"step": 5760
},
{
"epoch": 2.9483903934593765,
"grad_norm": 0.07200420571093186,
"learning_rate": 5e-06,
"loss": 0.8359,
"step": 5770
},
{
"epoch": 2.953500255493102,
"grad_norm": 0.07108664442621124,
"learning_rate": 5e-06,
"loss": 0.8238,
"step": 5780
},
{
"epoch": 2.9586101175268267,
"grad_norm": 0.07157660179737228,
"learning_rate": 5e-06,
"loss": 0.8396,
"step": 5790
},
{
"epoch": 2.963719979560552,
"grad_norm": 0.0703513442497635,
"learning_rate": 5e-06,
"loss": 0.846,
"step": 5800
},
{
"epoch": 2.968829841594277,
"grad_norm": 0.07279488037158444,
"learning_rate": 5e-06,
"loss": 0.8328,
"step": 5810
},
{
"epoch": 2.973939703628002,
"grad_norm": 0.07319627670370121,
"learning_rate": 5e-06,
"loss": 0.839,
"step": 5820
},
{
"epoch": 2.979049565661727,
"grad_norm": 0.06864454810394688,
"learning_rate": 5e-06,
"loss": 0.838,
"step": 5830
},
{
"epoch": 2.9841594276954524,
"grad_norm": 0.07409945387796224,
"learning_rate": 5e-06,
"loss": 0.8325,
"step": 5840
},
{
"epoch": 2.9892692897291773,
"grad_norm": 0.0827828571554194,
"learning_rate": 5e-06,
"loss": 0.8348,
"step": 5850
},
{
"epoch": 2.994379151762902,
"grad_norm": 0.07176468776677063,
"learning_rate": 5e-06,
"loss": 0.8345,
"step": 5860
},
{
"epoch": 2.9994890137966275,
"grad_norm": 0.07248509371414884,
"learning_rate": 5e-06,
"loss": 0.8471,
"step": 5870
},
{
"epoch": 3.0,
"step": 5871,
"total_flos": 1.808135467381555e+16,
"train_loss": 0.8794001941176538,
"train_runtime": 31816.8802,
"train_samples_per_second": 94.436,
"train_steps_per_second": 0.185
}
],
"logging_steps": 10,
"max_steps": 5871,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.808135467381555e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}