Grogros's picture
Training in progress, step 3000, checkpoint
8c93d7c verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.458882239345328,
"eval_steps": 500,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0015296074644844267,
"grad_norm": 39.202430725097656,
"learning_rate": 4.0000000000000003e-07,
"loss": 2.2544,
"step": 10
},
{
"epoch": 0.0030592149289688534,
"grad_norm": 6.937076568603516,
"learning_rate": 8.000000000000001e-07,
"loss": 1.9815,
"step": 20
},
{
"epoch": 0.00458882239345328,
"grad_norm": 12.800159454345703,
"learning_rate": 1.2000000000000002e-06,
"loss": 1.8989,
"step": 30
},
{
"epoch": 0.006118429857937707,
"grad_norm": 18.019540786743164,
"learning_rate": 1.6000000000000001e-06,
"loss": 1.894,
"step": 40
},
{
"epoch": 0.007648037322422133,
"grad_norm": 19.988229751586914,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.8713,
"step": 50
},
{
"epoch": 0.00917764478690656,
"grad_norm": 21.80278968811035,
"learning_rate": 2.4000000000000003e-06,
"loss": 1.8662,
"step": 60
},
{
"epoch": 0.010707252251390987,
"grad_norm": 23.45534324645996,
"learning_rate": 2.8000000000000003e-06,
"loss": 1.8661,
"step": 70
},
{
"epoch": 0.012236859715875414,
"grad_norm": 23.260648727416992,
"learning_rate": 3.2000000000000003e-06,
"loss": 1.8222,
"step": 80
},
{
"epoch": 0.013766467180359841,
"grad_norm": 23.629636764526367,
"learning_rate": 3.6000000000000003e-06,
"loss": 1.8312,
"step": 90
},
{
"epoch": 0.015296074644844266,
"grad_norm": 25.211572647094727,
"learning_rate": 4.000000000000001e-06,
"loss": 1.8441,
"step": 100
},
{
"epoch": 0.016825682109328693,
"grad_norm": 23.6651668548584,
"learning_rate": 4.4e-06,
"loss": 1.8113,
"step": 110
},
{
"epoch": 0.01835528957381312,
"grad_norm": 24.198888778686523,
"learning_rate": 4.800000000000001e-06,
"loss": 1.8209,
"step": 120
},
{
"epoch": 0.019884897038297548,
"grad_norm": 21.0020694732666,
"learning_rate": 5.2e-06,
"loss": 1.8178,
"step": 130
},
{
"epoch": 0.021414504502781973,
"grad_norm": 22.20139503479004,
"learning_rate": 5.600000000000001e-06,
"loss": 1.8184,
"step": 140
},
{
"epoch": 0.0229441119672664,
"grad_norm": 20.499448776245117,
"learning_rate": 6e-06,
"loss": 1.7967,
"step": 150
},
{
"epoch": 0.024473719431750827,
"grad_norm": 27.421058654785156,
"learning_rate": 6.4000000000000006e-06,
"loss": 1.7887,
"step": 160
},
{
"epoch": 0.026003326896235253,
"grad_norm": 19.557756423950195,
"learning_rate": 6.800000000000001e-06,
"loss": 1.7954,
"step": 170
},
{
"epoch": 0.027532934360719682,
"grad_norm": 27.11305809020996,
"learning_rate": 7.2000000000000005e-06,
"loss": 1.7946,
"step": 180
},
{
"epoch": 0.029062541825204107,
"grad_norm": 19.00750160217285,
"learning_rate": 7.600000000000001e-06,
"loss": 1.7845,
"step": 190
},
{
"epoch": 0.030592149289688533,
"grad_norm": 18.32988929748535,
"learning_rate": 8.000000000000001e-06,
"loss": 1.7817,
"step": 200
},
{
"epoch": 0.03212175675417296,
"grad_norm": 22.121030807495117,
"learning_rate": 8.400000000000001e-06,
"loss": 1.8228,
"step": 210
},
{
"epoch": 0.03365136421865739,
"grad_norm": 30.522912979125977,
"learning_rate": 8.8e-06,
"loss": 1.8359,
"step": 220
},
{
"epoch": 0.03518097168314181,
"grad_norm": 26.51453399658203,
"learning_rate": 9.200000000000002e-06,
"loss": 1.7971,
"step": 230
},
{
"epoch": 0.03671057914762624,
"grad_norm": 28.695058822631836,
"learning_rate": 9.600000000000001e-06,
"loss": 1.8302,
"step": 240
},
{
"epoch": 0.03824018661211067,
"grad_norm": 27.09485626220703,
"learning_rate": 1e-05,
"loss": 1.829,
"step": 250
},
{
"epoch": 0.039769794076595096,
"grad_norm": 19.422021865844727,
"learning_rate": 1.04e-05,
"loss": 1.7814,
"step": 260
},
{
"epoch": 0.04129940154107952,
"grad_norm": 26.004735946655273,
"learning_rate": 1.0800000000000002e-05,
"loss": 1.8323,
"step": 270
},
{
"epoch": 0.042829009005563946,
"grad_norm": 21.66231346130371,
"learning_rate": 1.1200000000000001e-05,
"loss": 1.8503,
"step": 280
},
{
"epoch": 0.04435861647004837,
"grad_norm": 16.89419174194336,
"learning_rate": 1.16e-05,
"loss": 1.8079,
"step": 290
},
{
"epoch": 0.0458882239345328,
"grad_norm": 11.961407661437988,
"learning_rate": 1.2e-05,
"loss": 1.8598,
"step": 300
},
{
"epoch": 0.04741783139901723,
"grad_norm": 22.449304580688477,
"learning_rate": 1.2400000000000002e-05,
"loss": 1.8461,
"step": 310
},
{
"epoch": 0.048947438863501655,
"grad_norm": 17.80685806274414,
"learning_rate": 1.2800000000000001e-05,
"loss": 1.8374,
"step": 320
},
{
"epoch": 0.05047704632798608,
"grad_norm": 20.850351333618164,
"learning_rate": 1.3200000000000002e-05,
"loss": 1.8325,
"step": 330
},
{
"epoch": 0.052006653792470506,
"grad_norm": 18.351491928100586,
"learning_rate": 1.3600000000000002e-05,
"loss": 1.8575,
"step": 340
},
{
"epoch": 0.05353626125695493,
"grad_norm": 23.839378356933594,
"learning_rate": 1.4e-05,
"loss": 1.8521,
"step": 350
},
{
"epoch": 0.055065868721439364,
"grad_norm": 17.616167068481445,
"learning_rate": 1.4400000000000001e-05,
"loss": 1.8278,
"step": 360
},
{
"epoch": 0.05659547618592379,
"grad_norm": 25.248546600341797,
"learning_rate": 1.48e-05,
"loss": 1.8904,
"step": 370
},
{
"epoch": 0.058125083650408214,
"grad_norm": 23.11628532409668,
"learning_rate": 1.5200000000000002e-05,
"loss": 1.8552,
"step": 380
},
{
"epoch": 0.05965469111489264,
"grad_norm": 21.66451072692871,
"learning_rate": 1.5600000000000003e-05,
"loss": 1.8795,
"step": 390
},
{
"epoch": 0.061184298579377065,
"grad_norm": 22.456846237182617,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.8819,
"step": 400
},
{
"epoch": 0.0627139060438615,
"grad_norm": 18.99188232421875,
"learning_rate": 1.64e-05,
"loss": 1.8874,
"step": 410
},
{
"epoch": 0.06424351350834592,
"grad_norm": 20.916900634765625,
"learning_rate": 1.6800000000000002e-05,
"loss": 1.8895,
"step": 420
},
{
"epoch": 0.06577312097283035,
"grad_norm": 25.91555404663086,
"learning_rate": 1.72e-05,
"loss": 1.8769,
"step": 430
},
{
"epoch": 0.06730272843731477,
"grad_norm": 20.888111114501953,
"learning_rate": 1.76e-05,
"loss": 1.8992,
"step": 440
},
{
"epoch": 0.0688323359017992,
"grad_norm": 18.96579933166504,
"learning_rate": 1.8e-05,
"loss": 1.9045,
"step": 450
},
{
"epoch": 0.07036194336628362,
"grad_norm": 27.60817527770996,
"learning_rate": 1.8400000000000003e-05,
"loss": 1.8765,
"step": 460
},
{
"epoch": 0.07189155083076805,
"grad_norm": 21.365825653076172,
"learning_rate": 1.88e-05,
"loss": 1.9315,
"step": 470
},
{
"epoch": 0.07342115829525248,
"grad_norm": 21.732152938842773,
"learning_rate": 1.9200000000000003e-05,
"loss": 1.9313,
"step": 480
},
{
"epoch": 0.0749507657597369,
"grad_norm": 21.766752243041992,
"learning_rate": 1.9600000000000002e-05,
"loss": 1.9151,
"step": 490
},
{
"epoch": 0.07648037322422134,
"grad_norm": 21.943374633789062,
"learning_rate": 2e-05,
"loss": 1.9768,
"step": 500
},
{
"epoch": 0.07800998068870577,
"grad_norm": 20.251033782958984,
"learning_rate": 1.9999756307053947e-05,
"loss": 1.9019,
"step": 510
},
{
"epoch": 0.07953958815319019,
"grad_norm": 26.903671264648438,
"learning_rate": 1.9999025240093045e-05,
"loss": 1.9229,
"step": 520
},
{
"epoch": 0.08106919561767462,
"grad_norm": 17.911582946777344,
"learning_rate": 1.9997806834748455e-05,
"loss": 1.9123,
"step": 530
},
{
"epoch": 0.08259880308215904,
"grad_norm": 17.10667610168457,
"learning_rate": 1.9996101150403543e-05,
"loss": 1.8725,
"step": 540
},
{
"epoch": 0.08412841054664347,
"grad_norm": 23.082595825195312,
"learning_rate": 1.999390827019096e-05,
"loss": 1.9008,
"step": 550
},
{
"epoch": 0.08565801801112789,
"grad_norm": 16.786951065063477,
"learning_rate": 1.9991228300988586e-05,
"loss": 1.9283,
"step": 560
},
{
"epoch": 0.08718762547561232,
"grad_norm": 22.904605865478516,
"learning_rate": 1.9988061373414342e-05,
"loss": 1.886,
"step": 570
},
{
"epoch": 0.08871723294009674,
"grad_norm": 18.19251823425293,
"learning_rate": 1.9984407641819812e-05,
"loss": 1.8846,
"step": 580
},
{
"epoch": 0.09024684040458117,
"grad_norm": 17.656436920166016,
"learning_rate": 1.9980267284282718e-05,
"loss": 1.8561,
"step": 590
},
{
"epoch": 0.0917764478690656,
"grad_norm": 20.029586791992188,
"learning_rate": 1.9975640502598243e-05,
"loss": 1.867,
"step": 600
},
{
"epoch": 0.09330605533355003,
"grad_norm": 15.413036346435547,
"learning_rate": 1.9970527522269204e-05,
"loss": 1.867,
"step": 610
},
{
"epoch": 0.09483566279803446,
"grad_norm": 17.78459930419922,
"learning_rate": 1.9964928592495046e-05,
"loss": 1.8522,
"step": 620
},
{
"epoch": 0.09636527026251888,
"grad_norm": 18.63687515258789,
"learning_rate": 1.9958843986159705e-05,
"loss": 1.8715,
"step": 630
},
{
"epoch": 0.09789487772700331,
"grad_norm": 18.381534576416016,
"learning_rate": 1.9952273999818312e-05,
"loss": 1.8483,
"step": 640
},
{
"epoch": 0.09942448519148774,
"grad_norm": 17.309951782226562,
"learning_rate": 1.9945218953682736e-05,
"loss": 1.8411,
"step": 650
},
{
"epoch": 0.10095409265597216,
"grad_norm": 16.78116798400879,
"learning_rate": 1.9937679191605964e-05,
"loss": 1.8417,
"step": 660
},
{
"epoch": 0.10248370012045659,
"grad_norm": 14.930190086364746,
"learning_rate": 1.992965508106537e-05,
"loss": 1.7848,
"step": 670
},
{
"epoch": 0.10401330758494101,
"grad_norm": 15.964579582214355,
"learning_rate": 1.9921147013144782e-05,
"loss": 1.8235,
"step": 680
},
{
"epoch": 0.10554291504942544,
"grad_norm": 19.765722274780273,
"learning_rate": 1.991215540251542e-05,
"loss": 1.8351,
"step": 690
},
{
"epoch": 0.10707252251390986,
"grad_norm": 22.259653091430664,
"learning_rate": 1.9902680687415704e-05,
"loss": 1.873,
"step": 700
},
{
"epoch": 0.10860212997839429,
"grad_norm": 17.007463455200195,
"learning_rate": 1.9892723329629885e-05,
"loss": 1.7792,
"step": 710
},
{
"epoch": 0.11013173744287873,
"grad_norm": 16.560100555419922,
"learning_rate": 1.988228381446553e-05,
"loss": 1.7872,
"step": 720
},
{
"epoch": 0.11166134490736315,
"grad_norm": 20.133487701416016,
"learning_rate": 1.987136265072988e-05,
"loss": 1.8089,
"step": 730
},
{
"epoch": 0.11319095237184758,
"grad_norm": 13.525500297546387,
"learning_rate": 1.985996037070505e-05,
"loss": 1.8169,
"step": 740
},
{
"epoch": 0.114720559836332,
"grad_norm": 17.305484771728516,
"learning_rate": 1.9848077530122083e-05,
"loss": 1.8002,
"step": 750
},
{
"epoch": 0.11625016730081643,
"grad_norm": 14.32529067993164,
"learning_rate": 1.983571470813386e-05,
"loss": 1.7969,
"step": 760
},
{
"epoch": 0.11777977476530085,
"grad_norm": 16.328943252563477,
"learning_rate": 1.982287250728689e-05,
"loss": 1.7679,
"step": 770
},
{
"epoch": 0.11930938222978528,
"grad_norm": 10.76165771484375,
"learning_rate": 1.9809551553491918e-05,
"loss": 1.8304,
"step": 780
},
{
"epoch": 0.1208389896942697,
"grad_norm": 14.789531707763672,
"learning_rate": 1.979575249599344e-05,
"loss": 1.8215,
"step": 790
},
{
"epoch": 0.12236859715875413,
"grad_norm": 16.026445388793945,
"learning_rate": 1.9781476007338058e-05,
"loss": 1.7581,
"step": 800
},
{
"epoch": 0.12389820462323856,
"grad_norm": 17.013036727905273,
"learning_rate": 1.9766722783341682e-05,
"loss": 1.7698,
"step": 810
},
{
"epoch": 0.125427812087723,
"grad_norm": 24.33189582824707,
"learning_rate": 1.9751493543055634e-05,
"loss": 1.7896,
"step": 820
},
{
"epoch": 0.1269574195522074,
"grad_norm": 18.09388542175293,
"learning_rate": 1.9735789028731603e-05,
"loss": 1.754,
"step": 830
},
{
"epoch": 0.12848702701669185,
"grad_norm": 12.945524215698242,
"learning_rate": 1.9719610005785466e-05,
"loss": 1.7693,
"step": 840
},
{
"epoch": 0.13001663448117626,
"grad_norm": 12.579715728759766,
"learning_rate": 1.9702957262759964e-05,
"loss": 1.7323,
"step": 850
},
{
"epoch": 0.1315462419456607,
"grad_norm": 13.929019927978516,
"learning_rate": 1.9685831611286312e-05,
"loss": 1.7814,
"step": 860
},
{
"epoch": 0.1330758494101451,
"grad_norm": 16.18221664428711,
"learning_rate": 1.9668233886044597e-05,
"loss": 1.7715,
"step": 870
},
{
"epoch": 0.13460545687462955,
"grad_norm": 17.0338191986084,
"learning_rate": 1.9650164944723116e-05,
"loss": 1.7783,
"step": 880
},
{
"epoch": 0.136135064339114,
"grad_norm": 12.997424125671387,
"learning_rate": 1.9631625667976584e-05,
"loss": 1.7303,
"step": 890
},
{
"epoch": 0.1376646718035984,
"grad_norm": 14.024730682373047,
"learning_rate": 1.961261695938319e-05,
"loss": 1.7659,
"step": 900
},
{
"epoch": 0.13919427926808284,
"grad_norm": 15.084492683410645,
"learning_rate": 1.9593139745400575e-05,
"loss": 1.7504,
"step": 910
},
{
"epoch": 0.14072388673256725,
"grad_norm": 13.797721862792969,
"learning_rate": 1.9573194975320672e-05,
"loss": 1.7514,
"step": 920
},
{
"epoch": 0.1422534941970517,
"grad_norm": 12.69803237915039,
"learning_rate": 1.9552783621223437e-05,
"loss": 1.7344,
"step": 930
},
{
"epoch": 0.1437831016615361,
"grad_norm": 11.39012622833252,
"learning_rate": 1.9531906677929472e-05,
"loss": 1.7235,
"step": 940
},
{
"epoch": 0.14531270912602054,
"grad_norm": 15.734513282775879,
"learning_rate": 1.9510565162951538e-05,
"loss": 1.7329,
"step": 950
},
{
"epoch": 0.14684231659050495,
"grad_norm": 11.408859252929688,
"learning_rate": 1.9488760116444966e-05,
"loss": 1.7163,
"step": 960
},
{
"epoch": 0.1483719240549894,
"grad_norm": 11.509827613830566,
"learning_rate": 1.9466492601156964e-05,
"loss": 1.7178,
"step": 970
},
{
"epoch": 0.1499015315194738,
"grad_norm": 6.658020496368408,
"learning_rate": 1.944376370237481e-05,
"loss": 1.7269,
"step": 980
},
{
"epoch": 0.15143113898395824,
"grad_norm": 15.084721565246582,
"learning_rate": 1.942057452787297e-05,
"loss": 1.7002,
"step": 990
},
{
"epoch": 0.15296074644844268,
"grad_norm": 12.061055183410645,
"learning_rate": 1.9396926207859085e-05,
"loss": 1.7466,
"step": 1000
},
{
"epoch": 0.1544903539129271,
"grad_norm": 12.866460800170898,
"learning_rate": 1.937281989491892e-05,
"loss": 1.7213,
"step": 1010
},
{
"epoch": 0.15601996137741153,
"grad_norm": 10.788905143737793,
"learning_rate": 1.9348256763960146e-05,
"loss": 1.7204,
"step": 1020
},
{
"epoch": 0.15754956884189594,
"grad_norm": 14.169967651367188,
"learning_rate": 1.9323238012155125e-05,
"loss": 1.6906,
"step": 1030
},
{
"epoch": 0.15907917630638038,
"grad_norm": 15.22304916381836,
"learning_rate": 1.9297764858882516e-05,
"loss": 1.7012,
"step": 1040
},
{
"epoch": 0.1606087837708648,
"grad_norm": 15.920389175415039,
"learning_rate": 1.9271838545667876e-05,
"loss": 1.7151,
"step": 1050
},
{
"epoch": 0.16213839123534923,
"grad_norm": 14.644919395446777,
"learning_rate": 1.9245460336123136e-05,
"loss": 1.7011,
"step": 1060
},
{
"epoch": 0.16366799869983364,
"grad_norm": 11.079008102416992,
"learning_rate": 1.9218631515885007e-05,
"loss": 1.6773,
"step": 1070
},
{
"epoch": 0.16519760616431808,
"grad_norm": 13.276355743408203,
"learning_rate": 1.9191353392552346e-05,
"loss": 1.6895,
"step": 1080
},
{
"epoch": 0.1667272136288025,
"grad_norm": 12.620210647583008,
"learning_rate": 1.9163627295622397e-05,
"loss": 1.7153,
"step": 1090
},
{
"epoch": 0.16825682109328693,
"grad_norm": 13.213116645812988,
"learning_rate": 1.913545457642601e-05,
"loss": 1.6805,
"step": 1100
},
{
"epoch": 0.16978642855777137,
"grad_norm": 13.620598793029785,
"learning_rate": 1.910683660806177e-05,
"loss": 1.6882,
"step": 1110
},
{
"epoch": 0.17131603602225579,
"grad_norm": 14.6329927444458,
"learning_rate": 1.907777478532909e-05,
"loss": 1.6843,
"step": 1120
},
{
"epoch": 0.17284564348674022,
"grad_norm": 12.677013397216797,
"learning_rate": 1.9048270524660197e-05,
"loss": 1.7041,
"step": 1130
},
{
"epoch": 0.17437525095122464,
"grad_norm": 12.261626243591309,
"learning_rate": 1.901832526405114e-05,
"loss": 1.6819,
"step": 1140
},
{
"epoch": 0.17590485841570908,
"grad_norm": 14.508549690246582,
"learning_rate": 1.8987940462991673e-05,
"loss": 1.6847,
"step": 1150
},
{
"epoch": 0.1774344658801935,
"grad_norm": 14.293961524963379,
"learning_rate": 1.895711760239413e-05,
"loss": 1.6928,
"step": 1160
},
{
"epoch": 0.17896407334467793,
"grad_norm": 13.377256393432617,
"learning_rate": 1.892585818452126e-05,
"loss": 1.6838,
"step": 1170
},
{
"epoch": 0.18049368080916234,
"grad_norm": 12.398565292358398,
"learning_rate": 1.889416373291298e-05,
"loss": 1.6692,
"step": 1180
},
{
"epoch": 0.18202328827364678,
"grad_norm": 11.622918128967285,
"learning_rate": 1.8862035792312148e-05,
"loss": 1.6429,
"step": 1190
},
{
"epoch": 0.1835528957381312,
"grad_norm": 9.335916519165039,
"learning_rate": 1.8829475928589272e-05,
"loss": 1.6535,
"step": 1200
},
{
"epoch": 0.18508250320261563,
"grad_norm": 30.566650390625,
"learning_rate": 1.879648572866617e-05,
"loss": 1.6625,
"step": 1210
},
{
"epoch": 0.18661211066710007,
"grad_norm": 10.248709678649902,
"learning_rate": 1.8763066800438638e-05,
"loss": 1.657,
"step": 1220
},
{
"epoch": 0.18814171813158448,
"grad_norm": 12.403678894042969,
"learning_rate": 1.8729220772698096e-05,
"loss": 1.6588,
"step": 1230
},
{
"epoch": 0.18967132559606892,
"grad_norm": 12.880125999450684,
"learning_rate": 1.869494929505219e-05,
"loss": 1.6782,
"step": 1240
},
{
"epoch": 0.19120093306055333,
"grad_norm": 11.847280502319336,
"learning_rate": 1.866025403784439e-05,
"loss": 1.6612,
"step": 1250
},
{
"epoch": 0.19273054052503777,
"grad_norm": 13.305404663085938,
"learning_rate": 1.8625136692072577e-05,
"loss": 1.6565,
"step": 1260
},
{
"epoch": 0.19426014798952218,
"grad_norm": 14.423601150512695,
"learning_rate": 1.8589598969306646e-05,
"loss": 1.677,
"step": 1270
},
{
"epoch": 0.19578975545400662,
"grad_norm": 15.052698135375977,
"learning_rate": 1.855364260160507e-05,
"loss": 1.6467,
"step": 1280
},
{
"epoch": 0.19731936291849103,
"grad_norm": 11.820367813110352,
"learning_rate": 1.851726934143048e-05,
"loss": 1.6384,
"step": 1290
},
{
"epoch": 0.19884897038297547,
"grad_norm": 15.453312873840332,
"learning_rate": 1.848048096156426e-05,
"loss": 1.637,
"step": 1300
},
{
"epoch": 0.20037857784745988,
"grad_norm": 10.9462308883667,
"learning_rate": 1.8443279255020153e-05,
"loss": 1.6397,
"step": 1310
},
{
"epoch": 0.20190818531194432,
"grad_norm": 12.902162551879883,
"learning_rate": 1.8405666034956842e-05,
"loss": 1.6289,
"step": 1320
},
{
"epoch": 0.20343779277642876,
"grad_norm": 9.770879745483398,
"learning_rate": 1.836764313458962e-05,
"loss": 1.6349,
"step": 1330
},
{
"epoch": 0.20496740024091317,
"grad_norm": 11.91503620147705,
"learning_rate": 1.8329212407100996e-05,
"loss": 1.6101,
"step": 1340
},
{
"epoch": 0.2064970077053976,
"grad_norm": 9.018235206604004,
"learning_rate": 1.8290375725550417e-05,
"loss": 1.6194,
"step": 1350
},
{
"epoch": 0.20802661516988202,
"grad_norm": 13.213310241699219,
"learning_rate": 1.8251134982782952e-05,
"loss": 1.6173,
"step": 1360
},
{
"epoch": 0.20955622263436646,
"grad_norm": 13.184313774108887,
"learning_rate": 1.821149209133704e-05,
"loss": 1.644,
"step": 1370
},
{
"epoch": 0.21108583009885087,
"grad_norm": 11.71191692352295,
"learning_rate": 1.8171448983351284e-05,
"loss": 1.6355,
"step": 1380
},
{
"epoch": 0.2126154375633353,
"grad_norm": 11.976449966430664,
"learning_rate": 1.8131007610470278e-05,
"loss": 1.6196,
"step": 1390
},
{
"epoch": 0.21414504502781972,
"grad_norm": 12.10886287689209,
"learning_rate": 1.8090169943749477e-05,
"loss": 1.637,
"step": 1400
},
{
"epoch": 0.21567465249230416,
"grad_norm": 9.17182731628418,
"learning_rate": 1.804893797355914e-05,
"loss": 1.5883,
"step": 1410
},
{
"epoch": 0.21720425995678858,
"grad_norm": 15.71194076538086,
"learning_rate": 1.8007313709487334e-05,
"loss": 1.6215,
"step": 1420
},
{
"epoch": 0.21873386742127301,
"grad_norm": 10.610137939453125,
"learning_rate": 1.7965299180241963e-05,
"loss": 1.6228,
"step": 1430
},
{
"epoch": 0.22026347488575745,
"grad_norm": 12.10354232788086,
"learning_rate": 1.792289643355191e-05,
"loss": 1.6052,
"step": 1440
},
{
"epoch": 0.22179308235024187,
"grad_norm": 10.069908142089844,
"learning_rate": 1.788010753606722e-05,
"loss": 1.5964,
"step": 1450
},
{
"epoch": 0.2233226898147263,
"grad_norm": 12.154913902282715,
"learning_rate": 1.78369345732584e-05,
"loss": 1.6269,
"step": 1460
},
{
"epoch": 0.22485229727921072,
"grad_norm": 12.322149276733398,
"learning_rate": 1.7793379649314743e-05,
"loss": 1.5855,
"step": 1470
},
{
"epoch": 0.22638190474369516,
"grad_norm": 10.875051498413086,
"learning_rate": 1.7749444887041797e-05,
"loss": 1.6005,
"step": 1480
},
{
"epoch": 0.22791151220817957,
"grad_norm": 11.17204761505127,
"learning_rate": 1.7705132427757895e-05,
"loss": 1.5849,
"step": 1490
},
{
"epoch": 0.229441119672664,
"grad_norm": 10.732039451599121,
"learning_rate": 1.766044443118978e-05,
"loss": 1.5825,
"step": 1500
},
{
"epoch": 0.23097072713714842,
"grad_norm": 13.450652122497559,
"learning_rate": 1.761538307536737e-05,
"loss": 1.6146,
"step": 1510
},
{
"epoch": 0.23250033460163286,
"grad_norm": 11.979947090148926,
"learning_rate": 1.7569950556517566e-05,
"loss": 1.5928,
"step": 1520
},
{
"epoch": 0.23402994206611727,
"grad_norm": 11.305917739868164,
"learning_rate": 1.7524149088957244e-05,
"loss": 1.5866,
"step": 1530
},
{
"epoch": 0.2355595495306017,
"grad_norm": 10.360641479492188,
"learning_rate": 1.747798090498532e-05,
"loss": 1.6028,
"step": 1540
},
{
"epoch": 0.23708915699508615,
"grad_norm": 8.656867027282715,
"learning_rate": 1.7431448254773943e-05,
"loss": 1.6014,
"step": 1550
},
{
"epoch": 0.23861876445957056,
"grad_norm": 11.687288284301758,
"learning_rate": 1.7384553406258842e-05,
"loss": 1.5948,
"step": 1560
},
{
"epoch": 0.240148371924055,
"grad_norm": 10.943402290344238,
"learning_rate": 1.7337298645028764e-05,
"loss": 1.5923,
"step": 1570
},
{
"epoch": 0.2416779793885394,
"grad_norm": 10.023202896118164,
"learning_rate": 1.7289686274214116e-05,
"loss": 1.5719,
"step": 1580
},
{
"epoch": 0.24320758685302385,
"grad_norm": 9.700725555419922,
"learning_rate": 1.7241718614374678e-05,
"loss": 1.5872,
"step": 1590
},
{
"epoch": 0.24473719431750826,
"grad_norm": 11.386409759521484,
"learning_rate": 1.7193398003386514e-05,
"loss": 1.5832,
"step": 1600
},
{
"epoch": 0.2462668017819927,
"grad_norm": 9.78838062286377,
"learning_rate": 1.7144726796328034e-05,
"loss": 1.5904,
"step": 1610
},
{
"epoch": 0.2477964092464771,
"grad_norm": 12.67321491241455,
"learning_rate": 1.709570736536521e-05,
"loss": 1.5779,
"step": 1620
},
{
"epoch": 0.24932601671096155,
"grad_norm": 10.230249404907227,
"learning_rate": 1.7046342099635948e-05,
"loss": 1.5931,
"step": 1630
},
{
"epoch": 0.250855624175446,
"grad_norm": 9.709312438964844,
"learning_rate": 1.6996633405133656e-05,
"loss": 1.5728,
"step": 1640
},
{
"epoch": 0.25238523163993043,
"grad_norm": 10.340200424194336,
"learning_rate": 1.6946583704589973e-05,
"loss": 1.561,
"step": 1650
},
{
"epoch": 0.2539148391044148,
"grad_norm": 9.730533599853516,
"learning_rate": 1.68961954373567e-05,
"loss": 1.5487,
"step": 1660
},
{
"epoch": 0.25544444656889925,
"grad_norm": 8.787372589111328,
"learning_rate": 1.684547105928689e-05,
"loss": 1.5704,
"step": 1670
},
{
"epoch": 0.2569740540333837,
"grad_norm": 9.325871467590332,
"learning_rate": 1.6794413042615168e-05,
"loss": 1.5352,
"step": 1680
},
{
"epoch": 0.25850366149786813,
"grad_norm": 10.094749450683594,
"learning_rate": 1.6743023875837233e-05,
"loss": 1.5718,
"step": 1690
},
{
"epoch": 0.2600332689623525,
"grad_norm": 9.554350852966309,
"learning_rate": 1.6691306063588583e-05,
"loss": 1.5563,
"step": 1700
},
{
"epoch": 0.26156287642683695,
"grad_norm": 10.526246070861816,
"learning_rate": 1.6639262126522417e-05,
"loss": 1.5399,
"step": 1710
},
{
"epoch": 0.2630924838913214,
"grad_norm": 8.69581413269043,
"learning_rate": 1.6586894601186804e-05,
"loss": 1.5456,
"step": 1720
},
{
"epoch": 0.26462209135580583,
"grad_norm": 9.803443908691406,
"learning_rate": 1.6534206039901057e-05,
"loss": 1.54,
"step": 1730
},
{
"epoch": 0.2661516988202902,
"grad_norm": 9.926239013671875,
"learning_rate": 1.6481199010631312e-05,
"loss": 1.5468,
"step": 1740
},
{
"epoch": 0.26768130628477466,
"grad_norm": 11.072277069091797,
"learning_rate": 1.6427876096865394e-05,
"loss": 1.5495,
"step": 1750
},
{
"epoch": 0.2692109137492591,
"grad_norm": 10.02304744720459,
"learning_rate": 1.63742398974869e-05,
"loss": 1.567,
"step": 1760
},
{
"epoch": 0.27074052121374353,
"grad_norm": 9.076205253601074,
"learning_rate": 1.632029302664851e-05,
"loss": 1.553,
"step": 1770
},
{
"epoch": 0.272270128678228,
"grad_norm": 7.942831039428711,
"learning_rate": 1.6266038113644605e-05,
"loss": 1.5408,
"step": 1780
},
{
"epoch": 0.27379973614271236,
"grad_norm": 10.165443420410156,
"learning_rate": 1.6211477802783105e-05,
"loss": 1.5317,
"step": 1790
},
{
"epoch": 0.2753293436071968,
"grad_norm": 10.491219520568848,
"learning_rate": 1.6156614753256583e-05,
"loss": 1.5528,
"step": 1800
},
{
"epoch": 0.27685895107168124,
"grad_norm": 4.210089206695557,
"learning_rate": 1.610145163901268e-05,
"loss": 1.5295,
"step": 1810
},
{
"epoch": 0.2783885585361657,
"grad_norm": 12.827298164367676,
"learning_rate": 1.6045991148623752e-05,
"loss": 1.5366,
"step": 1820
},
{
"epoch": 0.27991816600065006,
"grad_norm": 11.551325798034668,
"learning_rate": 1.599023598515586e-05,
"loss": 1.5226,
"step": 1830
},
{
"epoch": 0.2814477734651345,
"grad_norm": 11.47888469696045,
"learning_rate": 1.5934188866037017e-05,
"loss": 1.5285,
"step": 1840
},
{
"epoch": 0.28297738092961894,
"grad_norm": 9.28005599975586,
"learning_rate": 1.5877852522924733e-05,
"loss": 1.504,
"step": 1850
},
{
"epoch": 0.2845069883941034,
"grad_norm": 10.573029518127441,
"learning_rate": 1.5821229701572897e-05,
"loss": 1.5212,
"step": 1860
},
{
"epoch": 0.2860365958585878,
"grad_norm": 11.156023025512695,
"learning_rate": 1.5764323161697933e-05,
"loss": 1.5257,
"step": 1870
},
{
"epoch": 0.2875662033230722,
"grad_norm": 9.835415840148926,
"learning_rate": 1.570713567684432e-05,
"loss": 1.5119,
"step": 1880
},
{
"epoch": 0.28909581078755664,
"grad_norm": 8.080103874206543,
"learning_rate": 1.564967003424938e-05,
"loss": 1.5048,
"step": 1890
},
{
"epoch": 0.2906254182520411,
"grad_norm": 10.452638626098633,
"learning_rate": 1.5591929034707468e-05,
"loss": 1.5197,
"step": 1900
},
{
"epoch": 0.2921550257165255,
"grad_norm": 10.441474914550781,
"learning_rate": 1.553391549243344e-05,
"loss": 1.5317,
"step": 1910
},
{
"epoch": 0.2936846331810099,
"grad_norm": 9.752622604370117,
"learning_rate": 1.5475632234925505e-05,
"loss": 1.5069,
"step": 1920
},
{
"epoch": 0.29521424064549434,
"grad_norm": 8.496747016906738,
"learning_rate": 1.54170821028274e-05,
"loss": 1.5285,
"step": 1930
},
{
"epoch": 0.2967438481099788,
"grad_norm": 9.566315650939941,
"learning_rate": 1.5358267949789968e-05,
"loss": 1.5162,
"step": 1940
},
{
"epoch": 0.2982734555744632,
"grad_norm": 6.093721389770508,
"learning_rate": 1.529919264233205e-05,
"loss": 1.5008,
"step": 1950
},
{
"epoch": 0.2998030630389476,
"grad_norm": 7.963770866394043,
"learning_rate": 1.5239859059700794e-05,
"loss": 1.4732,
"step": 1960
},
{
"epoch": 0.30133267050343204,
"grad_norm": 8.183985710144043,
"learning_rate": 1.5180270093731305e-05,
"loss": 1.5126,
"step": 1970
},
{
"epoch": 0.3028622779679165,
"grad_norm": 10.149001121520996,
"learning_rate": 1.5120428648705716e-05,
"loss": 1.5145,
"step": 1980
},
{
"epoch": 0.3043918854324009,
"grad_norm": 8.544008255004883,
"learning_rate": 1.5060337641211637e-05,
"loss": 1.5061,
"step": 1990
},
{
"epoch": 0.30592149289688536,
"grad_norm": 9.953203201293945,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.5209,
"step": 2000
},
{
"epoch": 0.30745110036136974,
"grad_norm": 10.6850004196167,
"learning_rate": 1.493941866584231e-05,
"loss": 1.5163,
"step": 2010
},
{
"epoch": 0.3089807078258542,
"grad_norm": 8.49984359741211,
"learning_rate": 1.4878596591387329e-05,
"loss": 1.4868,
"step": 2020
},
{
"epoch": 0.3105103152903386,
"grad_norm": 8.950973510742188,
"learning_rate": 1.4817536741017153e-05,
"loss": 1.481,
"step": 2030
},
{
"epoch": 0.31203992275482306,
"grad_norm": 10.185256958007812,
"learning_rate": 1.4756242090702756e-05,
"loss": 1.49,
"step": 2040
},
{
"epoch": 0.31356953021930745,
"grad_norm": 8.98540210723877,
"learning_rate": 1.469471562785891e-05,
"loss": 1.4905,
"step": 2050
},
{
"epoch": 0.3150991376837919,
"grad_norm": 7.6039299964904785,
"learning_rate": 1.463296035119862e-05,
"loss": 1.4794,
"step": 2060
},
{
"epoch": 0.3166287451482763,
"grad_norm": 8.91348934173584,
"learning_rate": 1.4570979270586944e-05,
"loss": 1.4697,
"step": 2070
},
{
"epoch": 0.31815835261276076,
"grad_norm": 8.245038986206055,
"learning_rate": 1.4508775406894308e-05,
"loss": 1.4827,
"step": 2080
},
{
"epoch": 0.3196879600772452,
"grad_norm": 9.471598625183105,
"learning_rate": 1.4446351791849276e-05,
"loss": 1.4912,
"step": 2090
},
{
"epoch": 0.3212175675417296,
"grad_norm": 5.9019975662231445,
"learning_rate": 1.4383711467890776e-05,
"loss": 1.4708,
"step": 2100
},
{
"epoch": 0.322747175006214,
"grad_norm": 7.020793437957764,
"learning_rate": 1.4320857488019826e-05,
"loss": 1.4702,
"step": 2110
},
{
"epoch": 0.32427678247069847,
"grad_norm": 9.424378395080566,
"learning_rate": 1.4257792915650728e-05,
"loss": 1.4805,
"step": 2120
},
{
"epoch": 0.3258063899351829,
"grad_norm": 7.374673366546631,
"learning_rate": 1.4194520824461773e-05,
"loss": 1.4682,
"step": 2130
},
{
"epoch": 0.3273359973996673,
"grad_norm": 10.07297134399414,
"learning_rate": 1.413104429824542e-05,
"loss": 1.4817,
"step": 2140
},
{
"epoch": 0.32886560486415173,
"grad_norm": 8.525253295898438,
"learning_rate": 1.4067366430758004e-05,
"loss": 1.4686,
"step": 2150
},
{
"epoch": 0.33039521232863617,
"grad_norm": 9.093647956848145,
"learning_rate": 1.4003490325568953e-05,
"loss": 1.4779,
"step": 2160
},
{
"epoch": 0.3319248197931206,
"grad_norm": 7.984882831573486,
"learning_rate": 1.3939419095909513e-05,
"loss": 1.4756,
"step": 2170
},
{
"epoch": 0.333454427257605,
"grad_norm": 7.52358341217041,
"learning_rate": 1.3875155864521031e-05,
"loss": 1.4531,
"step": 2180
},
{
"epoch": 0.33498403472208943,
"grad_norm": 9.23783016204834,
"learning_rate": 1.3810703763502744e-05,
"loss": 1.4685,
"step": 2190
},
{
"epoch": 0.33651364218657387,
"grad_norm": 10.533625602722168,
"learning_rate": 1.3746065934159123e-05,
"loss": 1.4532,
"step": 2200
},
{
"epoch": 0.3380432496510583,
"grad_norm": 10.983015060424805,
"learning_rate": 1.3681245526846782e-05,
"loss": 1.4631,
"step": 2210
},
{
"epoch": 0.33957285711554275,
"grad_norm": 7.50883150100708,
"learning_rate": 1.3616245700820922e-05,
"loss": 1.4627,
"step": 2220
},
{
"epoch": 0.34110246458002713,
"grad_norm": 6.438501834869385,
"learning_rate": 1.3551069624081372e-05,
"loss": 1.4517,
"step": 2230
},
{
"epoch": 0.34263207204451157,
"grad_norm": 8.066999435424805,
"learning_rate": 1.3485720473218153e-05,
"loss": 1.4548,
"step": 2240
},
{
"epoch": 0.344161679508996,
"grad_norm": 8.656118392944336,
"learning_rate": 1.342020143325669e-05,
"loss": 1.4445,
"step": 2250
},
{
"epoch": 0.34569128697348045,
"grad_norm": 7.250131607055664,
"learning_rate": 1.3354515697502552e-05,
"loss": 1.4321,
"step": 2260
},
{
"epoch": 0.34722089443796483,
"grad_norm": 8.546892166137695,
"learning_rate": 1.3288666467385834e-05,
"loss": 1.4464,
"step": 2270
},
{
"epoch": 0.3487505019024493,
"grad_norm": 8.729716300964355,
"learning_rate": 1.3222656952305113e-05,
"loss": 1.4541,
"step": 2280
},
{
"epoch": 0.3502801093669337,
"grad_norm": 8.721868515014648,
"learning_rate": 1.3156490369471026e-05,
"loss": 1.4475,
"step": 2290
},
{
"epoch": 0.35180971683141815,
"grad_norm": 8.831208229064941,
"learning_rate": 1.3090169943749475e-05,
"loss": 1.4302,
"step": 2300
},
{
"epoch": 0.3533393242959026,
"grad_norm": 8.883235931396484,
"learning_rate": 1.3023698907504447e-05,
"loss": 1.4432,
"step": 2310
},
{
"epoch": 0.354868931760387,
"grad_norm": 8.794004440307617,
"learning_rate": 1.2957080500440469e-05,
"loss": 1.4444,
"step": 2320
},
{
"epoch": 0.3563985392248714,
"grad_norm": 4.8828935623168945,
"learning_rate": 1.2890317969444716e-05,
"loss": 1.4384,
"step": 2330
},
{
"epoch": 0.35792814668935585,
"grad_norm": 15.948801040649414,
"learning_rate": 1.2823414568428767e-05,
"loss": 1.4551,
"step": 2340
},
{
"epoch": 0.3594577541538403,
"grad_norm": 7.99323844909668,
"learning_rate": 1.2756373558169992e-05,
"loss": 1.4202,
"step": 2350
},
{
"epoch": 0.3609873616183247,
"grad_norm": 5.047421455383301,
"learning_rate": 1.2689198206152657e-05,
"loss": 1.4278,
"step": 2360
},
{
"epoch": 0.3625169690828091,
"grad_norm": 5.64467191696167,
"learning_rate": 1.2621891786408648e-05,
"loss": 1.4138,
"step": 2370
},
{
"epoch": 0.36404657654729355,
"grad_norm": 8.789236068725586,
"learning_rate": 1.2554457579357906e-05,
"loss": 1.4153,
"step": 2380
},
{
"epoch": 0.365576184011778,
"grad_norm": 8.083710670471191,
"learning_rate": 1.2486898871648552e-05,
"loss": 1.4093,
"step": 2390
},
{
"epoch": 0.3671057914762624,
"grad_norm": 5.929847240447998,
"learning_rate": 1.2419218955996677e-05,
"loss": 1.4153,
"step": 2400
},
{
"epoch": 0.3686353989407468,
"grad_norm": 7.852176666259766,
"learning_rate": 1.23514211310259e-05,
"loss": 1.4145,
"step": 2410
},
{
"epoch": 0.37016500640523126,
"grad_norm": 9.10372257232666,
"learning_rate": 1.2283508701106559e-05,
"loss": 1.414,
"step": 2420
},
{
"epoch": 0.3716946138697157,
"grad_norm": 9.65334415435791,
"learning_rate": 1.2215484976194675e-05,
"loss": 1.4363,
"step": 2430
},
{
"epoch": 0.37322422133420013,
"grad_norm": 7.612096786499023,
"learning_rate": 1.2147353271670634e-05,
"loss": 1.4188,
"step": 2440
},
{
"epoch": 0.3747538287986845,
"grad_norm": 5.171387672424316,
"learning_rate": 1.2079116908177592e-05,
"loss": 1.4108,
"step": 2450
},
{
"epoch": 0.37628343626316896,
"grad_norm": 6.974627494812012,
"learning_rate": 1.2010779211459649e-05,
"loss": 1.4164,
"step": 2460
},
{
"epoch": 0.3778130437276534,
"grad_norm": 7.995100021362305,
"learning_rate": 1.194234351219972e-05,
"loss": 1.4241,
"step": 2470
},
{
"epoch": 0.37934265119213784,
"grad_norm": 9.874503135681152,
"learning_rate": 1.187381314585725e-05,
"loss": 1.3957,
"step": 2480
},
{
"epoch": 0.3808722586566222,
"grad_norm": 9.190123558044434,
"learning_rate": 1.1805191452505602e-05,
"loss": 1.4234,
"step": 2490
},
{
"epoch": 0.38240186612110666,
"grad_norm": 9.317654609680176,
"learning_rate": 1.1736481776669307e-05,
"loss": 1.4251,
"step": 2500
},
{
"epoch": 0.3839314735855911,
"grad_norm": 8.3729829788208,
"learning_rate": 1.1667687467161025e-05,
"loss": 1.4024,
"step": 2510
},
{
"epoch": 0.38546108105007554,
"grad_norm": 8.835628509521484,
"learning_rate": 1.159881187691835e-05,
"loss": 1.4098,
"step": 2520
},
{
"epoch": 0.38699068851456,
"grad_norm": 8.480125427246094,
"learning_rate": 1.1529858362840383e-05,
"loss": 1.4,
"step": 2530
},
{
"epoch": 0.38852029597904436,
"grad_norm": 8.7410306930542,
"learning_rate": 1.1460830285624119e-05,
"loss": 1.4084,
"step": 2540
},
{
"epoch": 0.3900499034435288,
"grad_norm": 5.247309684753418,
"learning_rate": 1.1391731009600655e-05,
"loss": 1.3918,
"step": 2550
},
{
"epoch": 0.39157951090801324,
"grad_norm": 6.82070779800415,
"learning_rate": 1.1322563902571227e-05,
"loss": 1.3829,
"step": 2560
},
{
"epoch": 0.3931091183724977,
"grad_norm": 7.315955638885498,
"learning_rate": 1.1253332335643043e-05,
"loss": 1.3879,
"step": 2570
},
{
"epoch": 0.39463872583698206,
"grad_norm": 8.81851863861084,
"learning_rate": 1.1184039683065014e-05,
"loss": 1.3809,
"step": 2580
},
{
"epoch": 0.3961683333014665,
"grad_norm": 7.224653720855713,
"learning_rate": 1.1114689322063255e-05,
"loss": 1.3958,
"step": 2590
},
{
"epoch": 0.39769794076595094,
"grad_norm": 6.905256748199463,
"learning_rate": 1.1045284632676535e-05,
"loss": 1.3854,
"step": 2600
},
{
"epoch": 0.3992275482304354,
"grad_norm": 7.848160266876221,
"learning_rate": 1.0975828997591496e-05,
"loss": 1.3876,
"step": 2610
},
{
"epoch": 0.40075715569491976,
"grad_norm": 6.962865352630615,
"learning_rate": 1.0906325801977804e-05,
"loss": 1.3909,
"step": 2620
},
{
"epoch": 0.4022867631594042,
"grad_norm": 7.401321887969971,
"learning_rate": 1.083677843332316e-05,
"loss": 1.3825,
"step": 2630
},
{
"epoch": 0.40381637062388864,
"grad_norm": 5.543237209320068,
"learning_rate": 1.0767190281268187e-05,
"loss": 1.3773,
"step": 2640
},
{
"epoch": 0.4053459780883731,
"grad_norm": 7.603894233703613,
"learning_rate": 1.0697564737441254e-05,
"loss": 1.3961,
"step": 2650
},
{
"epoch": 0.4068755855528575,
"grad_norm": 7.263538837432861,
"learning_rate": 1.0627905195293135e-05,
"loss": 1.3696,
"step": 2660
},
{
"epoch": 0.4084051930173419,
"grad_norm": 9.08191967010498,
"learning_rate": 1.055821504993164e-05,
"loss": 1.374,
"step": 2670
},
{
"epoch": 0.40993480048182634,
"grad_norm": 6.848121166229248,
"learning_rate": 1.0488497697956134e-05,
"loss": 1.3765,
"step": 2680
},
{
"epoch": 0.4114644079463108,
"grad_norm": 6.978294849395752,
"learning_rate": 1.0418756537291996e-05,
"loss": 1.375,
"step": 2690
},
{
"epoch": 0.4129940154107952,
"grad_norm": 7.876370429992676,
"learning_rate": 1.0348994967025012e-05,
"loss": 1.3681,
"step": 2700
},
{
"epoch": 0.4145236228752796,
"grad_norm": 5.700248718261719,
"learning_rate": 1.0279216387235691e-05,
"loss": 1.3733,
"step": 2710
},
{
"epoch": 0.41605323033976405,
"grad_norm": 5.646441459655762,
"learning_rate": 1.0209424198833571e-05,
"loss": 1.3487,
"step": 2720
},
{
"epoch": 0.4175828378042485,
"grad_norm": 6.236967086791992,
"learning_rate": 1.0139621803391454e-05,
"loss": 1.3621,
"step": 2730
},
{
"epoch": 0.4191124452687329,
"grad_norm": 6.606427192687988,
"learning_rate": 1.0069812602979617e-05,
"loss": 1.3551,
"step": 2740
},
{
"epoch": 0.42064205273321736,
"grad_norm": 5.275976181030273,
"learning_rate": 1e-05,
"loss": 1.3554,
"step": 2750
},
{
"epoch": 0.42217166019770175,
"grad_norm": 7.677177429199219,
"learning_rate": 9.930187397020385e-06,
"loss": 1.3601,
"step": 2760
},
{
"epoch": 0.4237012676621862,
"grad_norm": 7.980209827423096,
"learning_rate": 9.860378196608549e-06,
"loss": 1.3671,
"step": 2770
},
{
"epoch": 0.4252308751266706,
"grad_norm": 4.75565767288208,
"learning_rate": 9.790575801166432e-06,
"loss": 1.3557,
"step": 2780
},
{
"epoch": 0.42676048259115507,
"grad_norm": 5.737847328186035,
"learning_rate": 9.720783612764314e-06,
"loss": 1.3475,
"step": 2790
},
{
"epoch": 0.42829009005563945,
"grad_norm": 7.28814172744751,
"learning_rate": 9.651005032974994e-06,
"loss": 1.3516,
"step": 2800
},
{
"epoch": 0.4298196975201239,
"grad_norm": 6.400086879730225,
"learning_rate": 9.581243462708007e-06,
"loss": 1.3552,
"step": 2810
},
{
"epoch": 0.43134930498460833,
"grad_norm": 14.871561050415039,
"learning_rate": 9.511502302043867e-06,
"loss": 1.338,
"step": 2820
},
{
"epoch": 0.43287891244909277,
"grad_norm": 7.7634100914001465,
"learning_rate": 9.441784950068362e-06,
"loss": 1.346,
"step": 2830
},
{
"epoch": 0.43440851991357715,
"grad_norm": 7.211244106292725,
"learning_rate": 9.372094804706867e-06,
"loss": 1.3526,
"step": 2840
},
{
"epoch": 0.4359381273780616,
"grad_norm": 6.430041313171387,
"learning_rate": 9.302435262558748e-06,
"loss": 1.3343,
"step": 2850
},
{
"epoch": 0.43746773484254603,
"grad_norm": 5.903066635131836,
"learning_rate": 9.232809718731815e-06,
"loss": 1.3353,
"step": 2860
},
{
"epoch": 0.43899734230703047,
"grad_norm": 7.717010021209717,
"learning_rate": 9.163221566676847e-06,
"loss": 1.3438,
"step": 2870
},
{
"epoch": 0.4405269497715149,
"grad_norm": 7.673276901245117,
"learning_rate": 9.093674198022201e-06,
"loss": 1.3394,
"step": 2880
},
{
"epoch": 0.4420565572359993,
"grad_norm": 6.96506929397583,
"learning_rate": 9.024171002408507e-06,
"loss": 1.35,
"step": 2890
},
{
"epoch": 0.44358616470048373,
"grad_norm": 5.854609966278076,
"learning_rate": 8.954715367323468e-06,
"loss": 1.3376,
"step": 2900
},
{
"epoch": 0.44511577216496817,
"grad_norm": 6.3792572021484375,
"learning_rate": 8.885310677936746e-06,
"loss": 1.3359,
"step": 2910
},
{
"epoch": 0.4466453796294526,
"grad_norm": 6.389842510223389,
"learning_rate": 8.815960316934991e-06,
"loss": 1.3299,
"step": 2920
},
{
"epoch": 0.448174987093937,
"grad_norm": 6.603227138519287,
"learning_rate": 8.746667664356957e-06,
"loss": 1.323,
"step": 2930
},
{
"epoch": 0.44970459455842143,
"grad_norm": 6.4203338623046875,
"learning_rate": 8.677436097428775e-06,
"loss": 1.3459,
"step": 2940
},
{
"epoch": 0.4512342020229059,
"grad_norm": 5.568302154541016,
"learning_rate": 8.60826899039935e-06,
"loss": 1.3434,
"step": 2950
},
{
"epoch": 0.4527638094873903,
"grad_norm": 6.737658977508545,
"learning_rate": 8.539169714375885e-06,
"loss": 1.3146,
"step": 2960
},
{
"epoch": 0.45429341695187475,
"grad_norm": 5.4618940353393555,
"learning_rate": 8.47014163715962e-06,
"loss": 1.3205,
"step": 2970
},
{
"epoch": 0.45582302441635913,
"grad_norm": 6.360799312591553,
"learning_rate": 8.401188123081653e-06,
"loss": 1.3198,
"step": 2980
},
{
"epoch": 0.4573526318808436,
"grad_norm": 6.304644584655762,
"learning_rate": 8.332312532838978e-06,
"loss": 1.3261,
"step": 2990
},
{
"epoch": 0.458882239345328,
"grad_norm": 4.984965801239014,
"learning_rate": 8.263518223330698e-06,
"loss": 1.3181,
"step": 3000
}
],
"logging_steps": 10,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.73984932364288e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}