phi4-mini-rag / checkpoint-13293 /trainer_state.json
JacobLinCool's picture
Super-squash branch 'main' using huggingface_hub
cb02cf4 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 13293,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0022568269013766643,
"grad_norm": 342.0,
"learning_rate": 2.0000000000000003e-06,
"loss": 6.6491,
"step": 10
},
{
"epoch": 0.0045136538027533285,
"grad_norm": 60.75,
"learning_rate": 4.000000000000001e-06,
"loss": 2.1305,
"step": 20
},
{
"epoch": 0.006770480704129994,
"grad_norm": 13.9375,
"learning_rate": 6e-06,
"loss": 0.2525,
"step": 30
},
{
"epoch": 0.009027307605506657,
"grad_norm": 100.0,
"learning_rate": 8.000000000000001e-06,
"loss": 0.2105,
"step": 40
},
{
"epoch": 0.011284134506883321,
"grad_norm": 84.5,
"learning_rate": 1e-05,
"loss": 0.0882,
"step": 50
},
{
"epoch": 0.013540961408259987,
"grad_norm": 0.06591796875,
"learning_rate": 9.992448840897079e-06,
"loss": 0.2672,
"step": 60
},
{
"epoch": 0.01579778830963665,
"grad_norm": 0.00982666015625,
"learning_rate": 9.984897681794156e-06,
"loss": 0.0005,
"step": 70
},
{
"epoch": 0.018054615211013314,
"grad_norm": 0.00927734375,
"learning_rate": 9.977346522691234e-06,
"loss": 0.0073,
"step": 80
},
{
"epoch": 0.020311442112389978,
"grad_norm": 0.0020751953125,
"learning_rate": 9.969795363588312e-06,
"loss": 0.0,
"step": 90
},
{
"epoch": 0.022568269013766643,
"grad_norm": 0.0023345947265625,
"learning_rate": 9.96224420448539e-06,
"loss": 0.0,
"step": 100
},
{
"epoch": 0.02482509591514331,
"grad_norm": 0.0027618408203125,
"learning_rate": 9.954693045382467e-06,
"loss": 0.0,
"step": 110
},
{
"epoch": 0.027081922816519974,
"grad_norm": 0.0198974609375,
"learning_rate": 9.947141886279545e-06,
"loss": 0.0001,
"step": 120
},
{
"epoch": 0.02933874971789664,
"grad_norm": 0.005035400390625,
"learning_rate": 9.939590727176623e-06,
"loss": 0.0,
"step": 130
},
{
"epoch": 0.0315955766192733,
"grad_norm": 0.0927734375,
"learning_rate": 9.9320395680737e-06,
"loss": 0.0,
"step": 140
},
{
"epoch": 0.033852403520649964,
"grad_norm": 0.0025177001953125,
"learning_rate": 9.924488408970778e-06,
"loss": 0.6798,
"step": 150
},
{
"epoch": 0.03610923042202663,
"grad_norm": 0.01239013671875,
"learning_rate": 9.916937249867854e-06,
"loss": 0.216,
"step": 160
},
{
"epoch": 0.03836605732340329,
"grad_norm": 0.00543212890625,
"learning_rate": 9.909386090764932e-06,
"loss": 0.0005,
"step": 170
},
{
"epoch": 0.040622884224779957,
"grad_norm": 0.006072998046875,
"learning_rate": 9.901834931662012e-06,
"loss": 0.4586,
"step": 180
},
{
"epoch": 0.04287971112615662,
"grad_norm": 0.00101470947265625,
"learning_rate": 9.89428377255909e-06,
"loss": 0.0,
"step": 190
},
{
"epoch": 0.045136538027533285,
"grad_norm": 0.000591278076171875,
"learning_rate": 9.886732613456166e-06,
"loss": 0.0001,
"step": 200
},
{
"epoch": 0.04739336492890995,
"grad_norm": 0.042236328125,
"learning_rate": 9.879181454353243e-06,
"loss": 0.4003,
"step": 210
},
{
"epoch": 0.04965019183028662,
"grad_norm": 0.00171661376953125,
"learning_rate": 9.871630295250321e-06,
"loss": 0.0001,
"step": 220
},
{
"epoch": 0.051907018731663285,
"grad_norm": 0.0009918212890625,
"learning_rate": 9.8640791361474e-06,
"loss": 0.38,
"step": 230
},
{
"epoch": 0.05416384563303995,
"grad_norm": 0.1376953125,
"learning_rate": 9.856527977044477e-06,
"loss": 0.6111,
"step": 240
},
{
"epoch": 0.05642067253441661,
"grad_norm": 0.0033721923828125,
"learning_rate": 9.848976817941554e-06,
"loss": 0.3474,
"step": 250
},
{
"epoch": 0.05867749943579328,
"grad_norm": 2.5,
"learning_rate": 9.841425658838632e-06,
"loss": 0.0004,
"step": 260
},
{
"epoch": 0.06093432633716994,
"grad_norm": 0.000698089599609375,
"learning_rate": 9.83387449973571e-06,
"loss": 0.0,
"step": 270
},
{
"epoch": 0.0631911532385466,
"grad_norm": 0.0159912109375,
"learning_rate": 9.82632334063279e-06,
"loss": 0.0001,
"step": 280
},
{
"epoch": 0.06544798013992327,
"grad_norm": 0.01904296875,
"learning_rate": 9.818772181529865e-06,
"loss": 0.0,
"step": 290
},
{
"epoch": 0.06770480704129993,
"grad_norm": 0.00119781494140625,
"learning_rate": 9.811221022426943e-06,
"loss": 0.0,
"step": 300
},
{
"epoch": 0.0699616339426766,
"grad_norm": 0.00106048583984375,
"learning_rate": 9.803669863324021e-06,
"loss": 0.0,
"step": 310
},
{
"epoch": 0.07221846084405326,
"grad_norm": 0.00186920166015625,
"learning_rate": 9.796118704221099e-06,
"loss": 0.2718,
"step": 320
},
{
"epoch": 0.07447528774542993,
"grad_norm": 17.25,
"learning_rate": 9.788567545118176e-06,
"loss": 0.0021,
"step": 330
},
{
"epoch": 0.07673211464680658,
"grad_norm": 0.16796875,
"learning_rate": 9.781016386015254e-06,
"loss": 0.3681,
"step": 340
},
{
"epoch": 0.07898894154818326,
"grad_norm": 0.041015625,
"learning_rate": 9.773465226912332e-06,
"loss": 0.2174,
"step": 350
},
{
"epoch": 0.08124576844955991,
"grad_norm": 0.005401611328125,
"learning_rate": 9.76591406780941e-06,
"loss": 0.0003,
"step": 360
},
{
"epoch": 0.08350259535093658,
"grad_norm": 0.0069580078125,
"learning_rate": 9.758362908706487e-06,
"loss": 0.0,
"step": 370
},
{
"epoch": 0.08575942225231324,
"grad_norm": 0.051513671875,
"learning_rate": 9.750811749603565e-06,
"loss": 0.0,
"step": 380
},
{
"epoch": 0.08801624915368991,
"grad_norm": 0.00099945068359375,
"learning_rate": 9.743260590500643e-06,
"loss": 0.0001,
"step": 390
},
{
"epoch": 0.09027307605506657,
"grad_norm": 69.0,
"learning_rate": 9.73570943139772e-06,
"loss": 0.3307,
"step": 400
},
{
"epoch": 0.09252990295644324,
"grad_norm": 0.047607421875,
"learning_rate": 9.728158272294797e-06,
"loss": 0.0001,
"step": 410
},
{
"epoch": 0.0947867298578199,
"grad_norm": 0.002288818359375,
"learning_rate": 9.720607113191876e-06,
"loss": 0.0,
"step": 420
},
{
"epoch": 0.09704355675919657,
"grad_norm": 0.00250244140625,
"learning_rate": 9.713055954088954e-06,
"loss": 0.0001,
"step": 430
},
{
"epoch": 0.09930038366057324,
"grad_norm": 0.00469970703125,
"learning_rate": 9.705504794986032e-06,
"loss": 0.0,
"step": 440
},
{
"epoch": 0.1015572105619499,
"grad_norm": 0.000396728515625,
"learning_rate": 9.697953635883108e-06,
"loss": 0.0001,
"step": 450
},
{
"epoch": 0.10381403746332657,
"grad_norm": 57.5,
"learning_rate": 9.690402476780186e-06,
"loss": 0.3762,
"step": 460
},
{
"epoch": 0.10607086436470323,
"grad_norm": 0.00113677978515625,
"learning_rate": 9.682851317677265e-06,
"loss": 0.0001,
"step": 470
},
{
"epoch": 0.1083276912660799,
"grad_norm": 0.000865936279296875,
"learning_rate": 9.675300158574343e-06,
"loss": 0.0,
"step": 480
},
{
"epoch": 0.11058451816745656,
"grad_norm": 0.0208740234375,
"learning_rate": 9.66774899947142e-06,
"loss": 0.3327,
"step": 490
},
{
"epoch": 0.11284134506883323,
"grad_norm": 0.00079345703125,
"learning_rate": 9.660197840368497e-06,
"loss": 0.0,
"step": 500
},
{
"epoch": 0.11509817197020988,
"grad_norm": 0.002532958984375,
"learning_rate": 9.652646681265574e-06,
"loss": 0.8702,
"step": 510
},
{
"epoch": 0.11735499887158655,
"grad_norm": 0.029541015625,
"learning_rate": 9.645095522162652e-06,
"loss": 0.0,
"step": 520
},
{
"epoch": 0.11961182577296321,
"grad_norm": 0.0025177001953125,
"learning_rate": 9.637544363059732e-06,
"loss": 0.1008,
"step": 530
},
{
"epoch": 0.12186865267433988,
"grad_norm": 0.00628662109375,
"learning_rate": 9.629993203956808e-06,
"loss": 0.0,
"step": 540
},
{
"epoch": 0.12412547957571654,
"grad_norm": 0.0291748046875,
"learning_rate": 9.622442044853885e-06,
"loss": 0.0002,
"step": 550
},
{
"epoch": 0.1263823064770932,
"grad_norm": 0.26171875,
"learning_rate": 9.614890885750963e-06,
"loss": 0.0001,
"step": 560
},
{
"epoch": 0.12863913337846988,
"grad_norm": 0.0028839111328125,
"learning_rate": 9.607339726648041e-06,
"loss": 0.0,
"step": 570
},
{
"epoch": 0.13089596027984654,
"grad_norm": 0.0004730224609375,
"learning_rate": 9.599788567545119e-06,
"loss": 0.0,
"step": 580
},
{
"epoch": 0.1331527871812232,
"grad_norm": 0.0004100799560546875,
"learning_rate": 9.592237408442197e-06,
"loss": 0.0,
"step": 590
},
{
"epoch": 0.13540961408259986,
"grad_norm": 0.0712890625,
"learning_rate": 9.584686249339274e-06,
"loss": 0.0,
"step": 600
},
{
"epoch": 0.13766644098397654,
"grad_norm": 0.0244140625,
"learning_rate": 9.577135090236352e-06,
"loss": 0.0198,
"step": 610
},
{
"epoch": 0.1399232678853532,
"grad_norm": 16.625,
"learning_rate": 9.56958393113343e-06,
"loss": 0.0008,
"step": 620
},
{
"epoch": 0.14218009478672985,
"grad_norm": 1.6796875,
"learning_rate": 9.562032772030508e-06,
"loss": 0.0001,
"step": 630
},
{
"epoch": 0.1444369216881065,
"grad_norm": 0.00055694580078125,
"learning_rate": 9.554481612927585e-06,
"loss": 0.0,
"step": 640
},
{
"epoch": 0.1466937485894832,
"grad_norm": 0.000457763671875,
"learning_rate": 9.546930453824663e-06,
"loss": 0.0,
"step": 650
},
{
"epoch": 0.14895057549085985,
"grad_norm": 0.001251220703125,
"learning_rate": 9.53937929472174e-06,
"loss": 0.0,
"step": 660
},
{
"epoch": 0.1512074023922365,
"grad_norm": 0.00077056884765625,
"learning_rate": 9.531828135618819e-06,
"loss": 0.3169,
"step": 670
},
{
"epoch": 0.15346422929361317,
"grad_norm": 0.00122833251953125,
"learning_rate": 9.524276976515896e-06,
"loss": 0.7091,
"step": 680
},
{
"epoch": 0.15572105619498985,
"grad_norm": 0.00067901611328125,
"learning_rate": 9.516725817412974e-06,
"loss": 0.0,
"step": 690
},
{
"epoch": 0.1579778830963665,
"grad_norm": 0.00445556640625,
"learning_rate": 9.50917465831005e-06,
"loss": 0.4241,
"step": 700
},
{
"epoch": 0.16023470999774317,
"grad_norm": 0.000370025634765625,
"learning_rate": 9.501623499207128e-06,
"loss": 0.0071,
"step": 710
},
{
"epoch": 0.16249153689911983,
"grad_norm": 0.0016326904296875,
"learning_rate": 9.494072340104207e-06,
"loss": 0.2712,
"step": 720
},
{
"epoch": 0.1647483638004965,
"grad_norm": 0.07470703125,
"learning_rate": 9.486521181001285e-06,
"loss": 0.0,
"step": 730
},
{
"epoch": 0.16700519070187317,
"grad_norm": 0.10302734375,
"learning_rate": 9.478970021898363e-06,
"loss": 0.3055,
"step": 740
},
{
"epoch": 0.16926201760324983,
"grad_norm": 0.032470703125,
"learning_rate": 9.471418862795439e-06,
"loss": 0.0,
"step": 750
},
{
"epoch": 0.17151884450462648,
"grad_norm": 0.0003681182861328125,
"learning_rate": 9.463867703692517e-06,
"loss": 0.0,
"step": 760
},
{
"epoch": 0.17377567140600317,
"grad_norm": 0.0004787445068359375,
"learning_rate": 9.456316544589596e-06,
"loss": 0.0,
"step": 770
},
{
"epoch": 0.17603249830737983,
"grad_norm": 0.0021514892578125,
"learning_rate": 9.448765385486674e-06,
"loss": 0.0003,
"step": 780
},
{
"epoch": 0.17828932520875648,
"grad_norm": 0.00040435791015625,
"learning_rate": 9.44121422638375e-06,
"loss": 0.0,
"step": 790
},
{
"epoch": 0.18054615211013314,
"grad_norm": 0.00135040283203125,
"learning_rate": 9.433663067280828e-06,
"loss": 0.0,
"step": 800
},
{
"epoch": 0.18280297901150983,
"grad_norm": 0.000331878662109375,
"learning_rate": 9.426111908177906e-06,
"loss": 0.0,
"step": 810
},
{
"epoch": 0.18505980591288648,
"grad_norm": 0.00049591064453125,
"learning_rate": 9.418560749074985e-06,
"loss": 0.0,
"step": 820
},
{
"epoch": 0.18731663281426314,
"grad_norm": 0.0003795623779296875,
"learning_rate": 9.411009589972061e-06,
"loss": 0.0,
"step": 830
},
{
"epoch": 0.1895734597156398,
"grad_norm": 0.000629425048828125,
"learning_rate": 9.403458430869139e-06,
"loss": 0.3432,
"step": 840
},
{
"epoch": 0.19183028661701648,
"grad_norm": 0.00048828125,
"learning_rate": 9.395907271766217e-06,
"loss": 0.0,
"step": 850
},
{
"epoch": 0.19408711351839314,
"grad_norm": 0.0004253387451171875,
"learning_rate": 9.388356112663294e-06,
"loss": 0.0,
"step": 860
},
{
"epoch": 0.1963439404197698,
"grad_norm": 0.000579833984375,
"learning_rate": 9.380804953560372e-06,
"loss": 0.0,
"step": 870
},
{
"epoch": 0.19860076732114648,
"grad_norm": 0.007232666015625,
"learning_rate": 9.37325379445745e-06,
"loss": 0.0,
"step": 880
},
{
"epoch": 0.20085759422252314,
"grad_norm": 0.003631591796875,
"learning_rate": 9.365702635354528e-06,
"loss": 0.1363,
"step": 890
},
{
"epoch": 0.2031144211238998,
"grad_norm": 0.003875732421875,
"learning_rate": 9.358151476251605e-06,
"loss": 0.0,
"step": 900
},
{
"epoch": 0.20537124802527645,
"grad_norm": 0.00830078125,
"learning_rate": 9.350600317148683e-06,
"loss": 0.0,
"step": 910
},
{
"epoch": 0.20762807492665314,
"grad_norm": 0.00213623046875,
"learning_rate": 9.343049158045761e-06,
"loss": 0.0,
"step": 920
},
{
"epoch": 0.2098849018280298,
"grad_norm": 0.0003643035888671875,
"learning_rate": 9.335497998942839e-06,
"loss": 0.0929,
"step": 930
},
{
"epoch": 0.21214172872940645,
"grad_norm": 0.67578125,
"learning_rate": 9.327946839839916e-06,
"loss": 0.6867,
"step": 940
},
{
"epoch": 0.2143985556307831,
"grad_norm": 0.0030975341796875,
"learning_rate": 9.320395680736994e-06,
"loss": 0.0005,
"step": 950
},
{
"epoch": 0.2166553825321598,
"grad_norm": 0.0011138916015625,
"learning_rate": 9.312844521634072e-06,
"loss": 0.3669,
"step": 960
},
{
"epoch": 0.21891220943353645,
"grad_norm": 71.5,
"learning_rate": 9.30529336253115e-06,
"loss": 0.2307,
"step": 970
},
{
"epoch": 0.2211690363349131,
"grad_norm": 0.0164794921875,
"learning_rate": 9.297742203428227e-06,
"loss": 0.0213,
"step": 980
},
{
"epoch": 0.22342586323628977,
"grad_norm": 0.003570556640625,
"learning_rate": 9.290191044325305e-06,
"loss": 0.0001,
"step": 990
},
{
"epoch": 0.22568269013766645,
"grad_norm": 0.000316619873046875,
"learning_rate": 9.282639885222381e-06,
"loss": 0.0002,
"step": 1000
},
{
"epoch": 0.2279395170390431,
"grad_norm": 0.0004711151123046875,
"learning_rate": 9.27508872611946e-06,
"loss": 0.0,
"step": 1010
},
{
"epoch": 0.23019634394041977,
"grad_norm": 0.00096893310546875,
"learning_rate": 9.267537567016539e-06,
"loss": 0.0007,
"step": 1020
},
{
"epoch": 0.23245317084179642,
"grad_norm": 0.005950927734375,
"learning_rate": 9.259986407913616e-06,
"loss": 0.4785,
"step": 1030
},
{
"epoch": 0.2347099977431731,
"grad_norm": 0.0087890625,
"learning_rate": 9.252435248810692e-06,
"loss": 0.2708,
"step": 1040
},
{
"epoch": 0.23696682464454977,
"grad_norm": 0.00506591796875,
"learning_rate": 9.24488408970777e-06,
"loss": 0.2259,
"step": 1050
},
{
"epoch": 0.23922365154592642,
"grad_norm": 0.001434326171875,
"learning_rate": 9.237332930604848e-06,
"loss": 0.0045,
"step": 1060
},
{
"epoch": 0.24148047844730308,
"grad_norm": 0.003143310546875,
"learning_rate": 9.229781771501927e-06,
"loss": 0.0,
"step": 1070
},
{
"epoch": 0.24373730534867977,
"grad_norm": 0.0025787353515625,
"learning_rate": 9.222230612399003e-06,
"loss": 0.0,
"step": 1080
},
{
"epoch": 0.24599413225005642,
"grad_norm": 0.0003833770751953125,
"learning_rate": 9.214679453296081e-06,
"loss": 0.0,
"step": 1090
},
{
"epoch": 0.24825095915143308,
"grad_norm": 0.00494384765625,
"learning_rate": 9.207128294193159e-06,
"loss": 0.0,
"step": 1100
},
{
"epoch": 0.25050778605280977,
"grad_norm": 0.0024566650390625,
"learning_rate": 9.199577135090237e-06,
"loss": 0.1667,
"step": 1110
},
{
"epoch": 0.2527646129541864,
"grad_norm": 0.00183868408203125,
"learning_rate": 9.192025975987314e-06,
"loss": 0.0001,
"step": 1120
},
{
"epoch": 0.2550214398555631,
"grad_norm": 0.00396728515625,
"learning_rate": 9.184474816884392e-06,
"loss": 0.0,
"step": 1130
},
{
"epoch": 0.25727826675693977,
"grad_norm": 0.0147705078125,
"learning_rate": 9.17692365778147e-06,
"loss": 0.0,
"step": 1140
},
{
"epoch": 0.2595350936583164,
"grad_norm": 0.00179290771484375,
"learning_rate": 9.169372498678548e-06,
"loss": 0.0,
"step": 1150
},
{
"epoch": 0.2617919205596931,
"grad_norm": 0.0025634765625,
"learning_rate": 9.161821339575625e-06,
"loss": 0.0,
"step": 1160
},
{
"epoch": 0.2640487474610697,
"grad_norm": 0.004486083984375,
"learning_rate": 9.154270180472703e-06,
"loss": 0.0,
"step": 1170
},
{
"epoch": 0.2663055743624464,
"grad_norm": 0.0001277923583984375,
"learning_rate": 9.146719021369781e-06,
"loss": 0.0,
"step": 1180
},
{
"epoch": 0.2685624012638231,
"grad_norm": 0.0001659393310546875,
"learning_rate": 9.139167862266859e-06,
"loss": 0.0,
"step": 1190
},
{
"epoch": 0.2708192281651997,
"grad_norm": 0.0002899169921875,
"learning_rate": 9.131616703163937e-06,
"loss": 0.0412,
"step": 1200
},
{
"epoch": 0.2730760550665764,
"grad_norm": 0.006134033203125,
"learning_rate": 9.124065544061014e-06,
"loss": 0.0,
"step": 1210
},
{
"epoch": 0.2753328819679531,
"grad_norm": 0.03759765625,
"learning_rate": 9.116514384958092e-06,
"loss": 0.2408,
"step": 1220
},
{
"epoch": 0.2775897088693297,
"grad_norm": 0.00067138671875,
"learning_rate": 9.10896322585517e-06,
"loss": 0.0,
"step": 1230
},
{
"epoch": 0.2798465357707064,
"grad_norm": 0.0007171630859375,
"learning_rate": 9.101412066752248e-06,
"loss": 0.0007,
"step": 1240
},
{
"epoch": 0.282103362672083,
"grad_norm": 0.004669189453125,
"learning_rate": 9.093860907649325e-06,
"loss": 0.3364,
"step": 1250
},
{
"epoch": 0.2843601895734597,
"grad_norm": 0.000713348388671875,
"learning_rate": 9.086309748546403e-06,
"loss": 0.0,
"step": 1260
},
{
"epoch": 0.2866170164748364,
"grad_norm": 0.0010223388671875,
"learning_rate": 9.078758589443481e-06,
"loss": 0.0,
"step": 1270
},
{
"epoch": 0.288873843376213,
"grad_norm": 0.002960205078125,
"learning_rate": 9.071207430340559e-06,
"loss": 0.0,
"step": 1280
},
{
"epoch": 0.2911306702775897,
"grad_norm": 0.00034332275390625,
"learning_rate": 9.063656271237635e-06,
"loss": 0.0,
"step": 1290
},
{
"epoch": 0.2933874971789664,
"grad_norm": 0.002349853515625,
"learning_rate": 9.056105112134712e-06,
"loss": 0.0,
"step": 1300
},
{
"epoch": 0.295644324080343,
"grad_norm": 0.00058746337890625,
"learning_rate": 9.048553953031792e-06,
"loss": 0.2488,
"step": 1310
},
{
"epoch": 0.2979011509817197,
"grad_norm": 0.00115966796875,
"learning_rate": 9.04100279392887e-06,
"loss": 0.0,
"step": 1320
},
{
"epoch": 0.3001579778830964,
"grad_norm": 0.004425048828125,
"learning_rate": 9.033451634825946e-06,
"loss": 0.2421,
"step": 1330
},
{
"epoch": 0.302414804784473,
"grad_norm": 0.000965118408203125,
"learning_rate": 9.025900475723023e-06,
"loss": 0.0244,
"step": 1340
},
{
"epoch": 0.3046716316858497,
"grad_norm": 0.0810546875,
"learning_rate": 9.018349316620101e-06,
"loss": 0.0,
"step": 1350
},
{
"epoch": 0.30692845858722634,
"grad_norm": 0.00732421875,
"learning_rate": 9.01079815751718e-06,
"loss": 0.0,
"step": 1360
},
{
"epoch": 0.309185285488603,
"grad_norm": 0.0010223388671875,
"learning_rate": 9.003246998414258e-06,
"loss": 0.0,
"step": 1370
},
{
"epoch": 0.3114421123899797,
"grad_norm": 0.00130462646484375,
"learning_rate": 8.995695839311335e-06,
"loss": 0.0,
"step": 1380
},
{
"epoch": 0.31369893929135634,
"grad_norm": 0.0004634857177734375,
"learning_rate": 8.988144680208412e-06,
"loss": 0.0,
"step": 1390
},
{
"epoch": 0.315955766192733,
"grad_norm": 0.0206298828125,
"learning_rate": 8.98059352110549e-06,
"loss": 0.2664,
"step": 1400
},
{
"epoch": 0.3182125930941097,
"grad_norm": 0.005218505859375,
"learning_rate": 8.973042362002568e-06,
"loss": 0.0,
"step": 1410
},
{
"epoch": 0.32046941999548634,
"grad_norm": 0.00154876708984375,
"learning_rate": 8.965491202899646e-06,
"loss": 0.1572,
"step": 1420
},
{
"epoch": 0.322726246896863,
"grad_norm": 0.000640869140625,
"learning_rate": 8.957940043796723e-06,
"loss": 0.0003,
"step": 1430
},
{
"epoch": 0.32498307379823965,
"grad_norm": 0.00119781494140625,
"learning_rate": 8.950388884693801e-06,
"loss": 0.0,
"step": 1440
},
{
"epoch": 0.32723990069961634,
"grad_norm": 0.0245361328125,
"learning_rate": 8.942837725590879e-06,
"loss": 0.0,
"step": 1450
},
{
"epoch": 0.329496727600993,
"grad_norm": 0.0004062652587890625,
"learning_rate": 8.935286566487957e-06,
"loss": 0.0,
"step": 1460
},
{
"epoch": 0.33175355450236965,
"grad_norm": 0.000385284423828125,
"learning_rate": 8.927735407385034e-06,
"loss": 0.0,
"step": 1470
},
{
"epoch": 0.33401038140374634,
"grad_norm": 0.00616455078125,
"learning_rate": 8.920184248282112e-06,
"loss": 0.0,
"step": 1480
},
{
"epoch": 0.336267208305123,
"grad_norm": 0.002532958984375,
"learning_rate": 8.91263308917919e-06,
"loss": 0.0,
"step": 1490
},
{
"epoch": 0.33852403520649965,
"grad_norm": 0.0011444091796875,
"learning_rate": 8.905081930076268e-06,
"loss": 0.0,
"step": 1500
},
{
"epoch": 0.34078086210787634,
"grad_norm": 0.0004138946533203125,
"learning_rate": 8.897530770973345e-06,
"loss": 0.0,
"step": 1510
},
{
"epoch": 0.34303768900925297,
"grad_norm": 0.000579833984375,
"learning_rate": 8.889979611870423e-06,
"loss": 0.3398,
"step": 1520
},
{
"epoch": 0.34529451591062965,
"grad_norm": 0.0003948211669921875,
"learning_rate": 8.882428452767501e-06,
"loss": 0.0,
"step": 1530
},
{
"epoch": 0.34755134281200634,
"grad_norm": 0.000247955322265625,
"learning_rate": 8.874877293664577e-06,
"loss": 0.0,
"step": 1540
},
{
"epoch": 0.34980816971338297,
"grad_norm": 0.000255584716796875,
"learning_rate": 8.867326134561656e-06,
"loss": 0.2765,
"step": 1550
},
{
"epoch": 0.35206499661475965,
"grad_norm": 0.0001964569091796875,
"learning_rate": 8.859774975458734e-06,
"loss": 0.0,
"step": 1560
},
{
"epoch": 0.35432182351613634,
"grad_norm": 0.00037384033203125,
"learning_rate": 8.852223816355812e-06,
"loss": 0.0,
"step": 1570
},
{
"epoch": 0.35657865041751297,
"grad_norm": 0.00052642822265625,
"learning_rate": 8.84467265725289e-06,
"loss": 0.0,
"step": 1580
},
{
"epoch": 0.35883547731888965,
"grad_norm": 0.00023555755615234375,
"learning_rate": 8.837121498149966e-06,
"loss": 0.4111,
"step": 1590
},
{
"epoch": 0.3610923042202663,
"grad_norm": 0.05419921875,
"learning_rate": 8.829570339047044e-06,
"loss": 0.0007,
"step": 1600
},
{
"epoch": 0.36334913112164297,
"grad_norm": 0.00982666015625,
"learning_rate": 8.822019179944123e-06,
"loss": 0.0,
"step": 1610
},
{
"epoch": 0.36560595802301965,
"grad_norm": 0.0007476806640625,
"learning_rate": 8.8144680208412e-06,
"loss": 0.0,
"step": 1620
},
{
"epoch": 0.3678627849243963,
"grad_norm": 0.0030059814453125,
"learning_rate": 8.806916861738277e-06,
"loss": 0.0,
"step": 1630
},
{
"epoch": 0.37011961182577297,
"grad_norm": 0.005889892578125,
"learning_rate": 8.799365702635355e-06,
"loss": 0.0,
"step": 1640
},
{
"epoch": 0.37237643872714965,
"grad_norm": 0.006927490234375,
"learning_rate": 8.791814543532432e-06,
"loss": 0.0001,
"step": 1650
},
{
"epoch": 0.3746332656285263,
"grad_norm": 0.00494384765625,
"learning_rate": 8.784263384429512e-06,
"loss": 0.0,
"step": 1660
},
{
"epoch": 0.37689009252990296,
"grad_norm": 0.00799560546875,
"learning_rate": 8.776712225326588e-06,
"loss": 0.0,
"step": 1670
},
{
"epoch": 0.3791469194312796,
"grad_norm": 0.0003604888916015625,
"learning_rate": 8.769161066223666e-06,
"loss": 0.0,
"step": 1680
},
{
"epoch": 0.3814037463326563,
"grad_norm": 66.5,
"learning_rate": 8.761609907120743e-06,
"loss": 0.651,
"step": 1690
},
{
"epoch": 0.38366057323403296,
"grad_norm": 0.005645751953125,
"learning_rate": 8.754058748017821e-06,
"loss": 0.0,
"step": 1700
},
{
"epoch": 0.3859174001354096,
"grad_norm": 0.002105712890625,
"learning_rate": 8.746507588914899e-06,
"loss": 0.0,
"step": 1710
},
{
"epoch": 0.3881742270367863,
"grad_norm": 0.0003948211669921875,
"learning_rate": 8.738956429811977e-06,
"loss": 0.0,
"step": 1720
},
{
"epoch": 0.39043105393816296,
"grad_norm": 0.0014801025390625,
"learning_rate": 8.731405270709054e-06,
"loss": 0.0,
"step": 1730
},
{
"epoch": 0.3926878808395396,
"grad_norm": 0.000560760498046875,
"learning_rate": 8.723854111606132e-06,
"loss": 0.0,
"step": 1740
},
{
"epoch": 0.3949447077409163,
"grad_norm": 0.000659942626953125,
"learning_rate": 8.71630295250321e-06,
"loss": 0.0,
"step": 1750
},
{
"epoch": 0.39720153464229296,
"grad_norm": 0.162109375,
"learning_rate": 8.708751793400288e-06,
"loss": 0.0,
"step": 1760
},
{
"epoch": 0.3994583615436696,
"grad_norm": 0.000843048095703125,
"learning_rate": 8.701200634297366e-06,
"loss": 0.1511,
"step": 1770
},
{
"epoch": 0.4017151884450463,
"grad_norm": 0.003631591796875,
"learning_rate": 8.693649475194443e-06,
"loss": 0.8574,
"step": 1780
},
{
"epoch": 0.4039720153464229,
"grad_norm": 0.004425048828125,
"learning_rate": 8.686098316091521e-06,
"loss": 0.0013,
"step": 1790
},
{
"epoch": 0.4062288422477996,
"grad_norm": 0.004486083984375,
"learning_rate": 8.678547156988599e-06,
"loss": 0.0002,
"step": 1800
},
{
"epoch": 0.4084856691491763,
"grad_norm": 0.0019683837890625,
"learning_rate": 8.670995997885677e-06,
"loss": 0.4738,
"step": 1810
},
{
"epoch": 0.4107424960505529,
"grad_norm": 0.01092529296875,
"learning_rate": 8.663444838782754e-06,
"loss": 0.0,
"step": 1820
},
{
"epoch": 0.4129993229519296,
"grad_norm": 0.0013885498046875,
"learning_rate": 8.655893679679832e-06,
"loss": 0.0,
"step": 1830
},
{
"epoch": 0.4152561498533063,
"grad_norm": 0.09326171875,
"learning_rate": 8.648342520576908e-06,
"loss": 0.0,
"step": 1840
},
{
"epoch": 0.4175129767546829,
"grad_norm": 0.01409912109375,
"learning_rate": 8.640791361473988e-06,
"loss": 0.0001,
"step": 1850
},
{
"epoch": 0.4197698036560596,
"grad_norm": 0.004119873046875,
"learning_rate": 8.633240202371065e-06,
"loss": 0.4574,
"step": 1860
},
{
"epoch": 0.4220266305574362,
"grad_norm": 0.00121307373046875,
"learning_rate": 8.625689043268143e-06,
"loss": 0.0,
"step": 1870
},
{
"epoch": 0.4242834574588129,
"grad_norm": 0.006103515625,
"learning_rate": 8.61813788416522e-06,
"loss": 0.0,
"step": 1880
},
{
"epoch": 0.4265402843601896,
"grad_norm": 0.005157470703125,
"learning_rate": 8.610586725062297e-06,
"loss": 0.0,
"step": 1890
},
{
"epoch": 0.4287971112615662,
"grad_norm": 0.00360107421875,
"learning_rate": 8.603035565959376e-06,
"loss": 0.715,
"step": 1900
},
{
"epoch": 0.4310539381629429,
"grad_norm": 0.0028076171875,
"learning_rate": 8.595484406856454e-06,
"loss": 0.0,
"step": 1910
},
{
"epoch": 0.4333107650643196,
"grad_norm": 0.00799560546875,
"learning_rate": 8.58793324775353e-06,
"loss": 0.0001,
"step": 1920
},
{
"epoch": 0.4355675919656962,
"grad_norm": 0.003265380859375,
"learning_rate": 8.580382088650608e-06,
"loss": 0.2658,
"step": 1930
},
{
"epoch": 0.4378244188670729,
"grad_norm": 0.00970458984375,
"learning_rate": 8.572830929547686e-06,
"loss": 0.0,
"step": 1940
},
{
"epoch": 0.44008124576844954,
"grad_norm": 0.1376953125,
"learning_rate": 8.565279770444764e-06,
"loss": 0.0,
"step": 1950
},
{
"epoch": 0.4423380726698262,
"grad_norm": 0.007568359375,
"learning_rate": 8.557728611341841e-06,
"loss": 0.0,
"step": 1960
},
{
"epoch": 0.4445948995712029,
"grad_norm": 0.000518798828125,
"learning_rate": 8.550177452238919e-06,
"loss": 0.0,
"step": 1970
},
{
"epoch": 0.44685172647257954,
"grad_norm": 0.0037841796875,
"learning_rate": 8.542626293135997e-06,
"loss": 0.0,
"step": 1980
},
{
"epoch": 0.4491085533739562,
"grad_norm": 0.000659942626953125,
"learning_rate": 8.535075134033075e-06,
"loss": 0.0,
"step": 1990
},
{
"epoch": 0.4513653802753329,
"grad_norm": 0.000308990478515625,
"learning_rate": 8.527523974930152e-06,
"loss": 0.0,
"step": 2000
},
{
"epoch": 0.45362220717670954,
"grad_norm": 0.00167083740234375,
"learning_rate": 8.51997281582723e-06,
"loss": 0.187,
"step": 2010
},
{
"epoch": 0.4558790340780862,
"grad_norm": 0.001251220703125,
"learning_rate": 8.512421656724308e-06,
"loss": 0.7379,
"step": 2020
},
{
"epoch": 0.45813586097946285,
"grad_norm": 0.003753662109375,
"learning_rate": 8.504870497621386e-06,
"loss": 0.0,
"step": 2030
},
{
"epoch": 0.46039268788083954,
"grad_norm": 0.00179290771484375,
"learning_rate": 8.497319338518463e-06,
"loss": 0.0,
"step": 2040
},
{
"epoch": 0.4626495147822162,
"grad_norm": 0.00034332275390625,
"learning_rate": 8.489768179415541e-06,
"loss": 0.0,
"step": 2050
},
{
"epoch": 0.46490634168359285,
"grad_norm": 0.0003948211669921875,
"learning_rate": 8.482217020312619e-06,
"loss": 0.0,
"step": 2060
},
{
"epoch": 0.46716316858496953,
"grad_norm": 0.0012359619140625,
"learning_rate": 8.474665861209697e-06,
"loss": 0.0,
"step": 2070
},
{
"epoch": 0.4694199954863462,
"grad_norm": 0.000896453857421875,
"learning_rate": 8.467114702106774e-06,
"loss": 0.0,
"step": 2080
},
{
"epoch": 0.47167682238772285,
"grad_norm": 4.8125,
"learning_rate": 8.459563543003852e-06,
"loss": 0.0007,
"step": 2090
},
{
"epoch": 0.47393364928909953,
"grad_norm": 0.00017452239990234375,
"learning_rate": 8.45201238390093e-06,
"loss": 0.0,
"step": 2100
},
{
"epoch": 0.47619047619047616,
"grad_norm": 0.00037384033203125,
"learning_rate": 8.444461224798008e-06,
"loss": 0.0,
"step": 2110
},
{
"epoch": 0.47844730309185285,
"grad_norm": 0.0001983642578125,
"learning_rate": 8.436910065695085e-06,
"loss": 0.0,
"step": 2120
},
{
"epoch": 0.48070412999322953,
"grad_norm": 0.00189971923828125,
"learning_rate": 8.429358906592162e-06,
"loss": 0.0,
"step": 2130
},
{
"epoch": 0.48296095689460616,
"grad_norm": 0.0002422332763671875,
"learning_rate": 8.421807747489241e-06,
"loss": 0.0,
"step": 2140
},
{
"epoch": 0.48521778379598285,
"grad_norm": 0.000522613525390625,
"learning_rate": 8.414256588386319e-06,
"loss": 0.0,
"step": 2150
},
{
"epoch": 0.48747461069735953,
"grad_norm": 0.00021266937255859375,
"learning_rate": 8.406705429283396e-06,
"loss": 0.0,
"step": 2160
},
{
"epoch": 0.48973143759873616,
"grad_norm": 0.000537872314453125,
"learning_rate": 8.399154270180473e-06,
"loss": 0.0,
"step": 2170
},
{
"epoch": 0.49198826450011285,
"grad_norm": 0.001617431640625,
"learning_rate": 8.39160311107755e-06,
"loss": 0.6385,
"step": 2180
},
{
"epoch": 0.4942450914014895,
"grad_norm": 0.0020904541015625,
"learning_rate": 8.384051951974628e-06,
"loss": 0.0643,
"step": 2190
},
{
"epoch": 0.49650191830286616,
"grad_norm": 0.0020904541015625,
"learning_rate": 8.376500792871708e-06,
"loss": 0.0,
"step": 2200
},
{
"epoch": 0.49875874520424285,
"grad_norm": 0.25390625,
"learning_rate": 8.368949633768784e-06,
"loss": 0.0001,
"step": 2210
},
{
"epoch": 0.5010155721056195,
"grad_norm": 0.018310546875,
"learning_rate": 8.361398474665861e-06,
"loss": 0.3428,
"step": 2220
},
{
"epoch": 0.5032723990069962,
"grad_norm": 0.006622314453125,
"learning_rate": 8.353847315562939e-06,
"loss": 0.307,
"step": 2230
},
{
"epoch": 0.5055292259083728,
"grad_norm": 0.03515625,
"learning_rate": 8.346296156460017e-06,
"loss": 0.0007,
"step": 2240
},
{
"epoch": 0.5077860528097495,
"grad_norm": 0.0308837890625,
"learning_rate": 8.338744997357096e-06,
"loss": 0.0308,
"step": 2250
},
{
"epoch": 0.5100428797111262,
"grad_norm": 0.006591796875,
"learning_rate": 8.331193838254172e-06,
"loss": 0.2879,
"step": 2260
},
{
"epoch": 0.5122997066125028,
"grad_norm": 77.0,
"learning_rate": 8.32364267915125e-06,
"loss": 0.4258,
"step": 2270
},
{
"epoch": 0.5145565335138795,
"grad_norm": 0.031005859375,
"learning_rate": 8.316091520048328e-06,
"loss": 0.0502,
"step": 2280
},
{
"epoch": 0.5168133604152562,
"grad_norm": 0.515625,
"learning_rate": 8.308540360945406e-06,
"loss": 0.0001,
"step": 2290
},
{
"epoch": 0.5190701873166328,
"grad_norm": 0.01007080078125,
"learning_rate": 8.300989201842483e-06,
"loss": 0.0,
"step": 2300
},
{
"epoch": 0.5213270142180095,
"grad_norm": 0.01483154296875,
"learning_rate": 8.293438042739561e-06,
"loss": 0.3391,
"step": 2310
},
{
"epoch": 0.5235838411193862,
"grad_norm": 0.150390625,
"learning_rate": 8.285886883636639e-06,
"loss": 0.0,
"step": 2320
},
{
"epoch": 0.5258406680207628,
"grad_norm": 0.002288818359375,
"learning_rate": 8.278335724533717e-06,
"loss": 0.3255,
"step": 2330
},
{
"epoch": 0.5280974949221394,
"grad_norm": 0.00347900390625,
"learning_rate": 8.270784565430794e-06,
"loss": 0.2631,
"step": 2340
},
{
"epoch": 0.5303543218235162,
"grad_norm": 0.038330078125,
"learning_rate": 8.263233406327872e-06,
"loss": 0.0,
"step": 2350
},
{
"epoch": 0.5326111487248928,
"grad_norm": 0.01055908203125,
"learning_rate": 8.25568224722495e-06,
"loss": 0.0001,
"step": 2360
},
{
"epoch": 0.5348679756262694,
"grad_norm": 0.0040283203125,
"learning_rate": 8.248131088122028e-06,
"loss": 0.0,
"step": 2370
},
{
"epoch": 0.5371248025276462,
"grad_norm": 0.00035858154296875,
"learning_rate": 8.240579929019104e-06,
"loss": 0.0,
"step": 2380
},
{
"epoch": 0.5393816294290228,
"grad_norm": 0.001983642578125,
"learning_rate": 8.233028769916183e-06,
"loss": 0.0003,
"step": 2390
},
{
"epoch": 0.5416384563303994,
"grad_norm": 0.0002651214599609375,
"learning_rate": 8.225477610813261e-06,
"loss": 0.0,
"step": 2400
},
{
"epoch": 0.5438952832317762,
"grad_norm": 0.0009918212890625,
"learning_rate": 8.217926451710339e-06,
"loss": 0.0,
"step": 2410
},
{
"epoch": 0.5461521101331528,
"grad_norm": 0.0009613037109375,
"learning_rate": 8.210375292607415e-06,
"loss": 0.0,
"step": 2420
},
{
"epoch": 0.5484089370345294,
"grad_norm": 0.00107574462890625,
"learning_rate": 8.202824133504493e-06,
"loss": 0.0,
"step": 2430
},
{
"epoch": 0.5506657639359062,
"grad_norm": 0.0032958984375,
"learning_rate": 8.195272974401572e-06,
"loss": 0.248,
"step": 2440
},
{
"epoch": 0.5529225908372828,
"grad_norm": 0.0021209716796875,
"learning_rate": 8.18772181529865e-06,
"loss": 0.0,
"step": 2450
},
{
"epoch": 0.5551794177386594,
"grad_norm": 51.75,
"learning_rate": 8.180170656195728e-06,
"loss": 0.4205,
"step": 2460
},
{
"epoch": 0.5574362446400362,
"grad_norm": 0.005767822265625,
"learning_rate": 8.172619497092804e-06,
"loss": 0.0,
"step": 2470
},
{
"epoch": 0.5596930715414128,
"grad_norm": 0.0185546875,
"learning_rate": 8.165068337989881e-06,
"loss": 0.0,
"step": 2480
},
{
"epoch": 0.5619498984427894,
"grad_norm": 0.0118408203125,
"learning_rate": 8.15751717888696e-06,
"loss": 0.13,
"step": 2490
},
{
"epoch": 0.564206725344166,
"grad_norm": 1.375,
"learning_rate": 8.149966019784039e-06,
"loss": 0.0002,
"step": 2500
},
{
"epoch": 0.5664635522455428,
"grad_norm": 0.00433349609375,
"learning_rate": 8.142414860681115e-06,
"loss": 0.0,
"step": 2510
},
{
"epoch": 0.5687203791469194,
"grad_norm": 0.0026702880859375,
"learning_rate": 8.134863701578192e-06,
"loss": 0.0,
"step": 2520
},
{
"epoch": 0.570977206048296,
"grad_norm": 548.0,
"learning_rate": 8.12731254247527e-06,
"loss": 0.6147,
"step": 2530
},
{
"epoch": 0.5732340329496728,
"grad_norm": 0.0020904541015625,
"learning_rate": 8.119761383372348e-06,
"loss": 0.0,
"step": 2540
},
{
"epoch": 0.5754908598510494,
"grad_norm": 0.003387451171875,
"learning_rate": 8.112210224269426e-06,
"loss": 0.0,
"step": 2550
},
{
"epoch": 0.577747686752426,
"grad_norm": 0.0002536773681640625,
"learning_rate": 8.104659065166504e-06,
"loss": 0.0,
"step": 2560
},
{
"epoch": 0.5800045136538028,
"grad_norm": 0.0035552978515625,
"learning_rate": 8.097107906063581e-06,
"loss": 0.3707,
"step": 2570
},
{
"epoch": 0.5822613405551794,
"grad_norm": 0.0018157958984375,
"learning_rate": 8.089556746960659e-06,
"loss": 0.3046,
"step": 2580
},
{
"epoch": 0.584518167456556,
"grad_norm": 0.0078125,
"learning_rate": 8.082005587857737e-06,
"loss": 0.0,
"step": 2590
},
{
"epoch": 0.5867749943579328,
"grad_norm": 0.0037078857421875,
"learning_rate": 8.074454428754815e-06,
"loss": 0.3055,
"step": 2600
},
{
"epoch": 0.5890318212593094,
"grad_norm": 0.0150146484375,
"learning_rate": 8.066903269651892e-06,
"loss": 0.0,
"step": 2610
},
{
"epoch": 0.591288648160686,
"grad_norm": 0.0250244140625,
"learning_rate": 8.05935211054897e-06,
"loss": 0.0001,
"step": 2620
},
{
"epoch": 0.5935454750620628,
"grad_norm": 0.020751953125,
"learning_rate": 8.051800951446048e-06,
"loss": 0.0001,
"step": 2630
},
{
"epoch": 0.5958023019634394,
"grad_norm": 0.00372314453125,
"learning_rate": 8.044249792343126e-06,
"loss": 0.2485,
"step": 2640
},
{
"epoch": 0.598059128864816,
"grad_norm": 0.001800537109375,
"learning_rate": 8.036698633240203e-06,
"loss": 0.0,
"step": 2650
},
{
"epoch": 0.6003159557661928,
"grad_norm": 83.0,
"learning_rate": 8.029147474137281e-06,
"loss": 0.3156,
"step": 2660
},
{
"epoch": 0.6025727826675694,
"grad_norm": 0.007720947265625,
"learning_rate": 8.021596315034359e-06,
"loss": 0.0,
"step": 2670
},
{
"epoch": 0.604829609568946,
"grad_norm": 0.001373291015625,
"learning_rate": 8.014045155931437e-06,
"loss": 0.0002,
"step": 2680
},
{
"epoch": 0.6070864364703227,
"grad_norm": 0.0008087158203125,
"learning_rate": 8.006493996828514e-06,
"loss": 0.0,
"step": 2690
},
{
"epoch": 0.6093432633716994,
"grad_norm": 0.0030059814453125,
"learning_rate": 7.998942837725592e-06,
"loss": 0.0,
"step": 2700
},
{
"epoch": 0.611600090273076,
"grad_norm": 0.0016021728515625,
"learning_rate": 7.99139167862267e-06,
"loss": 0.0,
"step": 2710
},
{
"epoch": 0.6138569171744527,
"grad_norm": 0.0036468505859375,
"learning_rate": 7.983840519519746e-06,
"loss": 0.0014,
"step": 2720
},
{
"epoch": 0.6161137440758294,
"grad_norm": 0.0003185272216796875,
"learning_rate": 7.976289360416824e-06,
"loss": 0.0,
"step": 2730
},
{
"epoch": 0.618370570977206,
"grad_norm": 0.008056640625,
"learning_rate": 7.968738201313903e-06,
"loss": 0.0,
"step": 2740
},
{
"epoch": 0.6206273978785827,
"grad_norm": 0.00103759765625,
"learning_rate": 7.961187042210981e-06,
"loss": 0.0,
"step": 2750
},
{
"epoch": 0.6228842247799594,
"grad_norm": 132.0,
"learning_rate": 7.953635883108057e-06,
"loss": 0.3156,
"step": 2760
},
{
"epoch": 0.625141051681336,
"grad_norm": 0.024658203125,
"learning_rate": 7.946084724005135e-06,
"loss": 0.0,
"step": 2770
},
{
"epoch": 0.6273978785827127,
"grad_norm": 0.000629425048828125,
"learning_rate": 7.938533564902213e-06,
"loss": 0.0,
"step": 2780
},
{
"epoch": 0.6296547054840894,
"grad_norm": 0.00897216796875,
"learning_rate": 7.930982405799292e-06,
"loss": 0.0,
"step": 2790
},
{
"epoch": 0.631911532385466,
"grad_norm": 0.007354736328125,
"learning_rate": 7.923431246696368e-06,
"loss": 0.0,
"step": 2800
},
{
"epoch": 0.6341683592868427,
"grad_norm": 0.0037841796875,
"learning_rate": 7.915880087593446e-06,
"loss": 0.0001,
"step": 2810
},
{
"epoch": 0.6364251861882194,
"grad_norm": 0.0036163330078125,
"learning_rate": 7.908328928490524e-06,
"loss": 0.0,
"step": 2820
},
{
"epoch": 0.638682013089596,
"grad_norm": 0.00250244140625,
"learning_rate": 7.900777769387601e-06,
"loss": 0.0,
"step": 2830
},
{
"epoch": 0.6409388399909727,
"grad_norm": 0.00173187255859375,
"learning_rate": 7.893226610284679e-06,
"loss": 0.0,
"step": 2840
},
{
"epoch": 0.6431956668923493,
"grad_norm": 0.0004520416259765625,
"learning_rate": 7.885675451181757e-06,
"loss": 0.0,
"step": 2850
},
{
"epoch": 0.645452493793726,
"grad_norm": 0.00021457672119140625,
"learning_rate": 7.878124292078835e-06,
"loss": 0.0,
"step": 2860
},
{
"epoch": 0.6477093206951027,
"grad_norm": 0.0004405975341796875,
"learning_rate": 7.870573132975912e-06,
"loss": 0.0,
"step": 2870
},
{
"epoch": 0.6499661475964793,
"grad_norm": 0.00045013427734375,
"learning_rate": 7.86302197387299e-06,
"loss": 0.0,
"step": 2880
},
{
"epoch": 0.652222974497856,
"grad_norm": 0.000522613525390625,
"learning_rate": 7.855470814770068e-06,
"loss": 0.0,
"step": 2890
},
{
"epoch": 0.6544798013992327,
"grad_norm": 0.00029754638671875,
"learning_rate": 7.847919655667146e-06,
"loss": 0.3409,
"step": 2900
},
{
"epoch": 0.6567366283006093,
"grad_norm": 0.00372314453125,
"learning_rate": 7.840368496564223e-06,
"loss": 0.1273,
"step": 2910
},
{
"epoch": 0.658993455201986,
"grad_norm": 0.005401611328125,
"learning_rate": 7.832817337461301e-06,
"loss": 0.2191,
"step": 2920
},
{
"epoch": 0.6612502821033627,
"grad_norm": 47.25,
"learning_rate": 7.825266178358379e-06,
"loss": 0.2522,
"step": 2930
},
{
"epoch": 0.6635071090047393,
"grad_norm": 0.018310546875,
"learning_rate": 7.817715019255457e-06,
"loss": 0.0982,
"step": 2940
},
{
"epoch": 0.665763935906116,
"grad_norm": 0.001220703125,
"learning_rate": 7.810163860152535e-06,
"loss": 0.0001,
"step": 2950
},
{
"epoch": 0.6680207628074927,
"grad_norm": 0.001495361328125,
"learning_rate": 7.802612701049612e-06,
"loss": 0.1024,
"step": 2960
},
{
"epoch": 0.6702775897088693,
"grad_norm": 0.005401611328125,
"learning_rate": 7.795061541946688e-06,
"loss": 0.0001,
"step": 2970
},
{
"epoch": 0.672534416610246,
"grad_norm": 0.0035858154296875,
"learning_rate": 7.787510382843768e-06,
"loss": 0.0,
"step": 2980
},
{
"epoch": 0.6747912435116227,
"grad_norm": 0.0654296875,
"learning_rate": 7.779959223740846e-06,
"loss": 0.0,
"step": 2990
},
{
"epoch": 0.6770480704129993,
"grad_norm": 0.002838134765625,
"learning_rate": 7.772408064637923e-06,
"loss": 0.0,
"step": 3000
},
{
"epoch": 0.6793048973143759,
"grad_norm": 0.001068115234375,
"learning_rate": 7.764856905535e-06,
"loss": 0.1202,
"step": 3010
},
{
"epoch": 0.6815617242157527,
"grad_norm": 0.0078125,
"learning_rate": 7.757305746432077e-06,
"loss": 0.2662,
"step": 3020
},
{
"epoch": 0.6838185511171293,
"grad_norm": 0.002227783203125,
"learning_rate": 7.749754587329157e-06,
"loss": 0.3129,
"step": 3030
},
{
"epoch": 0.6860753780185059,
"grad_norm": 0.00093841552734375,
"learning_rate": 7.742203428226234e-06,
"loss": 0.0,
"step": 3040
},
{
"epoch": 0.6883322049198827,
"grad_norm": 0.00162506103515625,
"learning_rate": 7.73465226912331e-06,
"loss": 0.0,
"step": 3050
},
{
"epoch": 0.6905890318212593,
"grad_norm": 0.0113525390625,
"learning_rate": 7.727101110020388e-06,
"loss": 0.0,
"step": 3060
},
{
"epoch": 0.6928458587226359,
"grad_norm": 0.0042724609375,
"learning_rate": 7.719549950917466e-06,
"loss": 0.0,
"step": 3070
},
{
"epoch": 0.6951026856240127,
"grad_norm": 0.00174713134765625,
"learning_rate": 7.711998791814544e-06,
"loss": 0.0001,
"step": 3080
},
{
"epoch": 0.6973595125253893,
"grad_norm": 0.0021820068359375,
"learning_rate": 7.704447632711621e-06,
"loss": 0.0,
"step": 3090
},
{
"epoch": 0.6996163394267659,
"grad_norm": 0.00016307830810546875,
"learning_rate": 7.6968964736087e-06,
"loss": 0.0,
"step": 3100
},
{
"epoch": 0.7018731663281427,
"grad_norm": 0.0008544921875,
"learning_rate": 7.689345314505777e-06,
"loss": 0.0868,
"step": 3110
},
{
"epoch": 0.7041299932295193,
"grad_norm": 0.00162506103515625,
"learning_rate": 7.681794155402855e-06,
"loss": 0.0,
"step": 3120
},
{
"epoch": 0.7063868201308959,
"grad_norm": 0.83984375,
"learning_rate": 7.674242996299933e-06,
"loss": 0.0001,
"step": 3130
},
{
"epoch": 0.7086436470322727,
"grad_norm": 0.00021266937255859375,
"learning_rate": 7.66669183719701e-06,
"loss": 0.0,
"step": 3140
},
{
"epoch": 0.7109004739336493,
"grad_norm": 0.000926971435546875,
"learning_rate": 7.659140678094088e-06,
"loss": 0.0,
"step": 3150
},
{
"epoch": 0.7131573008350259,
"grad_norm": 2.515625,
"learning_rate": 7.651589518991166e-06,
"loss": 0.0004,
"step": 3160
},
{
"epoch": 0.7154141277364027,
"grad_norm": 0.0029144287109375,
"learning_rate": 7.644038359888244e-06,
"loss": 0.0,
"step": 3170
},
{
"epoch": 0.7176709546377793,
"grad_norm": 62.5,
"learning_rate": 7.636487200785321e-06,
"loss": 0.3018,
"step": 3180
},
{
"epoch": 0.7199277815391559,
"grad_norm": 0.0005950927734375,
"learning_rate": 7.628936041682399e-06,
"loss": 0.0,
"step": 3190
},
{
"epoch": 0.7221846084405326,
"grad_norm": 0.00054168701171875,
"learning_rate": 7.621384882579477e-06,
"loss": 0.0,
"step": 3200
},
{
"epoch": 0.7244414353419093,
"grad_norm": 0.00701904296875,
"learning_rate": 7.613833723476555e-06,
"loss": 0.0,
"step": 3210
},
{
"epoch": 0.7266982622432859,
"grad_norm": 41.75,
"learning_rate": 7.6062825643736315e-06,
"loss": 0.2809,
"step": 3220
},
{
"epoch": 0.7289550891446626,
"grad_norm": 0.0018463134765625,
"learning_rate": 7.598731405270709e-06,
"loss": 0.0,
"step": 3230
},
{
"epoch": 0.7312119160460393,
"grad_norm": 0.0024261474609375,
"learning_rate": 7.591180246167788e-06,
"loss": 0.0,
"step": 3240
},
{
"epoch": 0.7334687429474159,
"grad_norm": 0.0032501220703125,
"learning_rate": 7.583629087064866e-06,
"loss": 0.0,
"step": 3250
},
{
"epoch": 0.7357255698487926,
"grad_norm": 0.0016326904296875,
"learning_rate": 7.5760779279619426e-06,
"loss": 0.0,
"step": 3260
},
{
"epoch": 0.7379823967501693,
"grad_norm": 0.000629425048828125,
"learning_rate": 7.56852676885902e-06,
"loss": 0.0,
"step": 3270
},
{
"epoch": 0.7402392236515459,
"grad_norm": 0.0022735595703125,
"learning_rate": 7.560975609756098e-06,
"loss": 0.0,
"step": 3280
},
{
"epoch": 0.7424960505529226,
"grad_norm": 0.00193023681640625,
"learning_rate": 7.553424450653176e-06,
"loss": 0.0,
"step": 3290
},
{
"epoch": 0.7447528774542993,
"grad_norm": 0.00144195556640625,
"learning_rate": 7.545873291550253e-06,
"loss": 0.4039,
"step": 3300
},
{
"epoch": 0.7470097043556759,
"grad_norm": 0.006134033203125,
"learning_rate": 7.538322132447331e-06,
"loss": 0.0,
"step": 3310
},
{
"epoch": 0.7492665312570526,
"grad_norm": 0.0037384033203125,
"learning_rate": 7.530770973344409e-06,
"loss": 0.0,
"step": 3320
},
{
"epoch": 0.7515233581584293,
"grad_norm": 0.0028228759765625,
"learning_rate": 7.523219814241487e-06,
"loss": 0.0,
"step": 3330
},
{
"epoch": 0.7537801850598059,
"grad_norm": 0.0038604736328125,
"learning_rate": 7.515668655138565e-06,
"loss": 0.0,
"step": 3340
},
{
"epoch": 0.7560370119611826,
"grad_norm": 0.00151824951171875,
"learning_rate": 7.5081174960356416e-06,
"loss": 0.0,
"step": 3350
},
{
"epoch": 0.7582938388625592,
"grad_norm": 0.0002689361572265625,
"learning_rate": 7.500566336932719e-06,
"loss": 0.0,
"step": 3360
},
{
"epoch": 0.7605506657639359,
"grad_norm": 0.000316619873046875,
"learning_rate": 7.493015177829798e-06,
"loss": 0.0,
"step": 3370
},
{
"epoch": 0.7628074926653126,
"grad_norm": 0.00106048583984375,
"learning_rate": 7.485464018726876e-06,
"loss": 0.0,
"step": 3380
},
{
"epoch": 0.7650643195666892,
"grad_norm": 0.00186920166015625,
"learning_rate": 7.477912859623953e-06,
"loss": 0.0,
"step": 3390
},
{
"epoch": 0.7673211464680659,
"grad_norm": 0.00110626220703125,
"learning_rate": 7.47036170052103e-06,
"loss": 0.0,
"step": 3400
},
{
"epoch": 0.7695779733694426,
"grad_norm": 0.0007781982421875,
"learning_rate": 7.462810541418108e-06,
"loss": 0.0,
"step": 3410
},
{
"epoch": 0.7718348002708192,
"grad_norm": 0.000652313232421875,
"learning_rate": 7.455259382315187e-06,
"loss": 0.4836,
"step": 3420
},
{
"epoch": 0.7740916271721959,
"grad_norm": 0.002777099609375,
"learning_rate": 7.447708223212264e-06,
"loss": 0.0,
"step": 3430
},
{
"epoch": 0.7763484540735726,
"grad_norm": 0.002410888671875,
"learning_rate": 7.440157064109341e-06,
"loss": 0.0,
"step": 3440
},
{
"epoch": 0.7786052809749492,
"grad_norm": 0.0023345947265625,
"learning_rate": 7.432605905006419e-06,
"loss": 0.0,
"step": 3450
},
{
"epoch": 0.7808621078763259,
"grad_norm": 0.000885009765625,
"learning_rate": 7.425054745903497e-06,
"loss": 0.0,
"step": 3460
},
{
"epoch": 0.7831189347777026,
"grad_norm": 0.001190185546875,
"learning_rate": 7.417503586800574e-06,
"loss": 0.0,
"step": 3470
},
{
"epoch": 0.7853757616790792,
"grad_norm": 0.00164794921875,
"learning_rate": 7.409952427697652e-06,
"loss": 0.3779,
"step": 3480
},
{
"epoch": 0.7876325885804559,
"grad_norm": 0.0024871826171875,
"learning_rate": 7.40240126859473e-06,
"loss": 0.3797,
"step": 3490
},
{
"epoch": 0.7898894154818326,
"grad_norm": 0.0009002685546875,
"learning_rate": 7.394850109491808e-06,
"loss": 0.0,
"step": 3500
},
{
"epoch": 0.7921462423832092,
"grad_norm": 0.00145721435546875,
"learning_rate": 7.387298950388885e-06,
"loss": 0.0,
"step": 3510
},
{
"epoch": 0.7944030692845859,
"grad_norm": 0.00035858154296875,
"learning_rate": 7.379747791285963e-06,
"loss": 0.3298,
"step": 3520
},
{
"epoch": 0.7966598961859626,
"grad_norm": 0.000583648681640625,
"learning_rate": 7.37219663218304e-06,
"loss": 0.0,
"step": 3530
},
{
"epoch": 0.7989167230873392,
"grad_norm": 0.0028228759765625,
"learning_rate": 7.364645473080119e-06,
"loss": 0.187,
"step": 3540
},
{
"epoch": 0.8011735499887158,
"grad_norm": 0.01263427734375,
"learning_rate": 7.357094313977197e-06,
"loss": 0.2544,
"step": 3550
},
{
"epoch": 0.8034303768900926,
"grad_norm": 0.01226806640625,
"learning_rate": 7.349543154874274e-06,
"loss": 0.0001,
"step": 3560
},
{
"epoch": 0.8056872037914692,
"grad_norm": 0.00079345703125,
"learning_rate": 7.3419919957713514e-06,
"loss": 0.0,
"step": 3570
},
{
"epoch": 0.8079440306928458,
"grad_norm": 0.291015625,
"learning_rate": 7.334440836668429e-06,
"loss": 0.0001,
"step": 3580
},
{
"epoch": 0.8102008575942226,
"grad_norm": 0.0021514892578125,
"learning_rate": 7.326889677565508e-06,
"loss": 0.0,
"step": 3590
},
{
"epoch": 0.8124576844955992,
"grad_norm": 0.0004634857177734375,
"learning_rate": 7.319338518462584e-06,
"loss": 0.0,
"step": 3600
},
{
"epoch": 0.8147145113969758,
"grad_norm": 0.001800537109375,
"learning_rate": 7.3117873593596625e-06,
"loss": 0.0,
"step": 3610
},
{
"epoch": 0.8169713382983526,
"grad_norm": 0.0024871826171875,
"learning_rate": 7.30423620025674e-06,
"loss": 0.0,
"step": 3620
},
{
"epoch": 0.8192281651997292,
"grad_norm": 0.00119781494140625,
"learning_rate": 7.296685041153818e-06,
"loss": 0.0,
"step": 3630
},
{
"epoch": 0.8214849921011058,
"grad_norm": 0.000629425048828125,
"learning_rate": 7.289133882050895e-06,
"loss": 0.0,
"step": 3640
},
{
"epoch": 0.8237418190024826,
"grad_norm": 0.000797271728515625,
"learning_rate": 7.281582722947973e-06,
"loss": 0.3255,
"step": 3650
},
{
"epoch": 0.8259986459038592,
"grad_norm": 0.0008392333984375,
"learning_rate": 7.274031563845051e-06,
"loss": 0.0036,
"step": 3660
},
{
"epoch": 0.8282554728052358,
"grad_norm": 0.00164794921875,
"learning_rate": 7.266480404742129e-06,
"loss": 0.0,
"step": 3670
},
{
"epoch": 0.8305122997066126,
"grad_norm": 0.00012874603271484375,
"learning_rate": 7.258929245639206e-06,
"loss": 0.0,
"step": 3680
},
{
"epoch": 0.8327691266079892,
"grad_norm": 0.000461578369140625,
"learning_rate": 7.251378086536284e-06,
"loss": 0.3065,
"step": 3690
},
{
"epoch": 0.8350259535093658,
"grad_norm": 57.75,
"learning_rate": 7.2438269274333615e-06,
"loss": 0.3243,
"step": 3700
},
{
"epoch": 0.8372827804107424,
"grad_norm": 0.0020599365234375,
"learning_rate": 7.236275768330439e-06,
"loss": 0.0,
"step": 3710
},
{
"epoch": 0.8395396073121192,
"grad_norm": 0.0155029296875,
"learning_rate": 7.228724609227516e-06,
"loss": 0.0,
"step": 3720
},
{
"epoch": 0.8417964342134958,
"grad_norm": 0.005401611328125,
"learning_rate": 7.221173450124595e-06,
"loss": 0.0,
"step": 3730
},
{
"epoch": 0.8440532611148724,
"grad_norm": 0.0025634765625,
"learning_rate": 7.2136222910216725e-06,
"loss": 0.0,
"step": 3740
},
{
"epoch": 0.8463100880162492,
"grad_norm": 0.003143310546875,
"learning_rate": 7.20607113191875e-06,
"loss": 0.0,
"step": 3750
},
{
"epoch": 0.8485669149176258,
"grad_norm": 0.00994873046875,
"learning_rate": 7.198519972815828e-06,
"loss": 0.0,
"step": 3760
},
{
"epoch": 0.8508237418190024,
"grad_norm": 0.0191650390625,
"learning_rate": 7.190968813712905e-06,
"loss": 0.0,
"step": 3770
},
{
"epoch": 0.8530805687203792,
"grad_norm": 0.00024318695068359375,
"learning_rate": 7.1834176546099836e-06,
"loss": 0.0001,
"step": 3780
},
{
"epoch": 0.8553373956217558,
"grad_norm": 0.0035247802734375,
"learning_rate": 7.175866495507061e-06,
"loss": 0.208,
"step": 3790
},
{
"epoch": 0.8575942225231324,
"grad_norm": 0.0002269744873046875,
"learning_rate": 7.168315336404139e-06,
"loss": 0.0,
"step": 3800
},
{
"epoch": 0.8598510494245092,
"grad_norm": 0.00250244140625,
"learning_rate": 7.160764177301216e-06,
"loss": 0.3297,
"step": 3810
},
{
"epoch": 0.8621078763258858,
"grad_norm": 0.00150299072265625,
"learning_rate": 7.153213018198294e-06,
"loss": 0.0,
"step": 3820
},
{
"epoch": 0.8643647032272624,
"grad_norm": 0.00274658203125,
"learning_rate": 7.1456618590953715e-06,
"loss": 0.0,
"step": 3830
},
{
"epoch": 0.8666215301286392,
"grad_norm": 0.00262451171875,
"learning_rate": 7.13811069999245e-06,
"loss": 0.0033,
"step": 3840
},
{
"epoch": 0.8688783570300158,
"grad_norm": 0.002197265625,
"learning_rate": 7.130559540889527e-06,
"loss": 0.0441,
"step": 3850
},
{
"epoch": 0.8711351839313924,
"grad_norm": 0.031982421875,
"learning_rate": 7.123008381786605e-06,
"loss": 0.0,
"step": 3860
},
{
"epoch": 0.8733920108327691,
"grad_norm": 0.00469970703125,
"learning_rate": 7.1154572226836826e-06,
"loss": 0.0,
"step": 3870
},
{
"epoch": 0.8756488377341458,
"grad_norm": 0.00164794921875,
"learning_rate": 7.10790606358076e-06,
"loss": 0.0,
"step": 3880
},
{
"epoch": 0.8779056646355224,
"grad_norm": 0.00125885009765625,
"learning_rate": 7.100354904477837e-06,
"loss": 0.0,
"step": 3890
},
{
"epoch": 0.8801624915368991,
"grad_norm": 0.003814697265625,
"learning_rate": 7.092803745374916e-06,
"loss": 0.0,
"step": 3900
},
{
"epoch": 0.8824193184382758,
"grad_norm": 0.01007080078125,
"learning_rate": 7.085252586271994e-06,
"loss": 0.0,
"step": 3910
},
{
"epoch": 0.8846761453396524,
"grad_norm": 0.000579833984375,
"learning_rate": 7.077701427169071e-06,
"loss": 0.0,
"step": 3920
},
{
"epoch": 0.8869329722410291,
"grad_norm": 0.00848388671875,
"learning_rate": 7.070150268066148e-06,
"loss": 0.0,
"step": 3930
},
{
"epoch": 0.8891897991424058,
"grad_norm": 0.00054931640625,
"learning_rate": 7.062599108963226e-06,
"loss": 0.0,
"step": 3940
},
{
"epoch": 0.8914466260437824,
"grad_norm": 0.000835418701171875,
"learning_rate": 7.055047949860304e-06,
"loss": 0.0,
"step": 3950
},
{
"epoch": 0.8937034529451591,
"grad_norm": 0.00048828125,
"learning_rate": 7.047496790757382e-06,
"loss": 0.0,
"step": 3960
},
{
"epoch": 0.8959602798465358,
"grad_norm": 0.00023365020751953125,
"learning_rate": 7.039945631654459e-06,
"loss": 0.0,
"step": 3970
},
{
"epoch": 0.8982171067479124,
"grad_norm": 0.00335693359375,
"learning_rate": 7.032394472551537e-06,
"loss": 0.0,
"step": 3980
},
{
"epoch": 0.9004739336492891,
"grad_norm": 0.00014400482177734375,
"learning_rate": 7.024843313448615e-06,
"loss": 0.0,
"step": 3990
},
{
"epoch": 0.9027307605506658,
"grad_norm": 0.0002613067626953125,
"learning_rate": 7.017292154345693e-06,
"loss": 0.0,
"step": 4000
},
{
"epoch": 0.9049875874520424,
"grad_norm": 0.00043487548828125,
"learning_rate": 7.009740995242771e-06,
"loss": 0.0,
"step": 4010
},
{
"epoch": 0.9072444143534191,
"grad_norm": 0.318359375,
"learning_rate": 7.002189836139847e-06,
"loss": 0.0,
"step": 4020
},
{
"epoch": 0.9095012412547958,
"grad_norm": 0.00089263916015625,
"learning_rate": 6.994638677036926e-06,
"loss": 0.0,
"step": 4030
},
{
"epoch": 0.9117580681561724,
"grad_norm": 0.000247955322265625,
"learning_rate": 6.987087517934004e-06,
"loss": 0.0,
"step": 4040
},
{
"epoch": 0.9140148950575491,
"grad_norm": 0.00011873245239257812,
"learning_rate": 6.979536358831081e-06,
"loss": 0.0,
"step": 4050
},
{
"epoch": 0.9162717219589257,
"grad_norm": 0.00018978118896484375,
"learning_rate": 6.971985199728158e-06,
"loss": 0.0,
"step": 4060
},
{
"epoch": 0.9185285488603024,
"grad_norm": 0.0001049041748046875,
"learning_rate": 6.964434040625236e-06,
"loss": 0.3925,
"step": 4070
},
{
"epoch": 0.9207853757616791,
"grad_norm": 0.0002918243408203125,
"learning_rate": 6.956882881522315e-06,
"loss": 0.0,
"step": 4080
},
{
"epoch": 0.9230422026630557,
"grad_norm": 0.000255584716796875,
"learning_rate": 6.9493317224193925e-06,
"loss": 0.0,
"step": 4090
},
{
"epoch": 0.9252990295644324,
"grad_norm": 0.0001773834228515625,
"learning_rate": 6.941780563316469e-06,
"loss": 0.0,
"step": 4100
},
{
"epoch": 0.9275558564658091,
"grad_norm": 0.0087890625,
"learning_rate": 6.934229404213547e-06,
"loss": 0.3115,
"step": 4110
},
{
"epoch": 0.9298126833671857,
"grad_norm": 0.00093841552734375,
"learning_rate": 6.926678245110625e-06,
"loss": 0.086,
"step": 4120
},
{
"epoch": 0.9320695102685624,
"grad_norm": 0.000865936279296875,
"learning_rate": 6.9191270860077035e-06,
"loss": 0.0488,
"step": 4130
},
{
"epoch": 0.9343263371699391,
"grad_norm": 0.00144195556640625,
"learning_rate": 6.9115759269047796e-06,
"loss": 0.0,
"step": 4140
},
{
"epoch": 0.9365831640713157,
"grad_norm": 0.00823974609375,
"learning_rate": 6.904024767801858e-06,
"loss": 0.0,
"step": 4150
},
{
"epoch": 0.9388399909726924,
"grad_norm": 0.001922607421875,
"learning_rate": 6.896473608698936e-06,
"loss": 0.0,
"step": 4160
},
{
"epoch": 0.9410968178740691,
"grad_norm": 0.00102996826171875,
"learning_rate": 6.888922449596014e-06,
"loss": 0.0,
"step": 4170
},
{
"epoch": 0.9433536447754457,
"grad_norm": 0.000972747802734375,
"learning_rate": 6.881371290493091e-06,
"loss": 0.0,
"step": 4180
},
{
"epoch": 0.9456104716768224,
"grad_norm": 53.5,
"learning_rate": 6.873820131390168e-06,
"loss": 0.1112,
"step": 4190
},
{
"epoch": 0.9478672985781991,
"grad_norm": 0.00390625,
"learning_rate": 6.866268972287247e-06,
"loss": 0.4043,
"step": 4200
},
{
"epoch": 0.9501241254795757,
"grad_norm": 0.0021514892578125,
"learning_rate": 6.858717813184325e-06,
"loss": 0.0001,
"step": 4210
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.00074005126953125,
"learning_rate": 6.8511666540814025e-06,
"loss": 0.3612,
"step": 4220
},
{
"epoch": 0.9546377792823291,
"grad_norm": 0.002044677734375,
"learning_rate": 6.843615494978479e-06,
"loss": 0.0,
"step": 4230
},
{
"epoch": 0.9568946061837057,
"grad_norm": 0.0120849609375,
"learning_rate": 6.836064335875557e-06,
"loss": 0.2291,
"step": 4240
},
{
"epoch": 0.9591514330850823,
"grad_norm": 0.004669189453125,
"learning_rate": 6.828513176772635e-06,
"loss": 0.0,
"step": 4250
},
{
"epoch": 0.9614082599864591,
"grad_norm": 0.00494384765625,
"learning_rate": 6.8209620176697136e-06,
"loss": 0.0,
"step": 4260
},
{
"epoch": 0.9636650868878357,
"grad_norm": 0.00072479248046875,
"learning_rate": 6.8134108585667905e-06,
"loss": 0.0,
"step": 4270
},
{
"epoch": 0.9659219137892123,
"grad_norm": 0.005584716796875,
"learning_rate": 6.805859699463868e-06,
"loss": 0.0,
"step": 4280
},
{
"epoch": 0.9681787406905891,
"grad_norm": 0.0017242431640625,
"learning_rate": 6.798308540360946e-06,
"loss": 0.0,
"step": 4290
},
{
"epoch": 0.9704355675919657,
"grad_norm": 0.005035400390625,
"learning_rate": 6.790757381258024e-06,
"loss": 0.0,
"step": 4300
},
{
"epoch": 0.9726923944933423,
"grad_norm": 0.001739501953125,
"learning_rate": 6.783206222155101e-06,
"loss": 0.2294,
"step": 4310
},
{
"epoch": 0.9749492213947191,
"grad_norm": 0.000492095947265625,
"learning_rate": 6.775655063052179e-06,
"loss": 0.0,
"step": 4320
},
{
"epoch": 0.9772060482960957,
"grad_norm": 0.004486083984375,
"learning_rate": 6.768103903949257e-06,
"loss": 0.0,
"step": 4330
},
{
"epoch": 0.9794628751974723,
"grad_norm": 0.000896453857421875,
"learning_rate": 6.760552744846335e-06,
"loss": 0.0,
"step": 4340
},
{
"epoch": 0.9817197020988491,
"grad_norm": 0.01055908203125,
"learning_rate": 6.753001585743412e-06,
"loss": 0.0,
"step": 4350
},
{
"epoch": 0.9839765290002257,
"grad_norm": 0.00104522705078125,
"learning_rate": 6.7454504266404895e-06,
"loss": 0.1874,
"step": 4360
},
{
"epoch": 0.9862333559016023,
"grad_norm": 0.00069427490234375,
"learning_rate": 6.737899267537567e-06,
"loss": 0.0005,
"step": 4370
},
{
"epoch": 0.988490182802979,
"grad_norm": 0.030517578125,
"learning_rate": 6.730348108434646e-06,
"loss": 0.0,
"step": 4380
},
{
"epoch": 0.9907470097043557,
"grad_norm": 0.028076171875,
"learning_rate": 6.722796949331723e-06,
"loss": 0.0001,
"step": 4390
},
{
"epoch": 0.9930038366057323,
"grad_norm": 0.007171630859375,
"learning_rate": 6.7152457902288005e-06,
"loss": 0.0,
"step": 4400
},
{
"epoch": 0.995260663507109,
"grad_norm": 0.00023746490478515625,
"learning_rate": 6.707694631125878e-06,
"loss": 0.0,
"step": 4410
},
{
"epoch": 0.9975174904084857,
"grad_norm": 0.00061798095703125,
"learning_rate": 6.700143472022956e-06,
"loss": 0.0,
"step": 4420
},
{
"epoch": 0.9997743173098623,
"grad_norm": 0.01312255859375,
"learning_rate": 6.692592312920035e-06,
"loss": 0.0,
"step": 4430
},
{
"epoch": 1.002031144211239,
"grad_norm": 0.00026702880859375,
"learning_rate": 6.6850411538171116e-06,
"loss": 0.0,
"step": 4440
},
{
"epoch": 1.0042879711126156,
"grad_norm": 0.0025634765625,
"learning_rate": 6.677489994714189e-06,
"loss": 0.0,
"step": 4450
},
{
"epoch": 1.0065447980139923,
"grad_norm": 0.0003719329833984375,
"learning_rate": 6.669938835611267e-06,
"loss": 0.02,
"step": 4460
},
{
"epoch": 1.008801624915369,
"grad_norm": 0.003082275390625,
"learning_rate": 6.662387676508345e-06,
"loss": 0.0,
"step": 4470
},
{
"epoch": 1.0110584518167456,
"grad_norm": 0.00017070770263671875,
"learning_rate": 6.654836517405422e-06,
"loss": 0.0,
"step": 4480
},
{
"epoch": 1.0133152787181223,
"grad_norm": 0.00022220611572265625,
"learning_rate": 6.6472853583024995e-06,
"loss": 0.0,
"step": 4490
},
{
"epoch": 1.015572105619499,
"grad_norm": 0.0004119873046875,
"learning_rate": 6.639734199199578e-06,
"loss": 0.0,
"step": 4500
},
{
"epoch": 1.0178289325208756,
"grad_norm": 0.000156402587890625,
"learning_rate": 6.632183040096656e-06,
"loss": 0.0,
"step": 4510
},
{
"epoch": 1.0200857594222523,
"grad_norm": 152.0,
"learning_rate": 6.624631880993733e-06,
"loss": 0.1668,
"step": 4520
},
{
"epoch": 1.022342586323629,
"grad_norm": 0.0002727508544921875,
"learning_rate": 6.6170807218908106e-06,
"loss": 0.0,
"step": 4530
},
{
"epoch": 1.0245994132250056,
"grad_norm": 0.000118255615234375,
"learning_rate": 6.609529562787888e-06,
"loss": 0.0,
"step": 4540
},
{
"epoch": 1.0268562401263823,
"grad_norm": 0.000286102294921875,
"learning_rate": 6.601978403684967e-06,
"loss": 0.0,
"step": 4550
},
{
"epoch": 1.029113067027759,
"grad_norm": 0.0001544952392578125,
"learning_rate": 6.594427244582044e-06,
"loss": 0.0,
"step": 4560
},
{
"epoch": 1.0313698939291356,
"grad_norm": 0.000286102294921875,
"learning_rate": 6.586876085479122e-06,
"loss": 0.0,
"step": 4570
},
{
"epoch": 1.0336267208305123,
"grad_norm": 0.00054168701171875,
"learning_rate": 6.579324926376199e-06,
"loss": 0.0,
"step": 4580
},
{
"epoch": 1.035883547731889,
"grad_norm": 0.003662109375,
"learning_rate": 6.571773767273277e-06,
"loss": 0.0,
"step": 4590
},
{
"epoch": 1.0381403746332656,
"grad_norm": 0.000576019287109375,
"learning_rate": 6.564222608170354e-06,
"loss": 0.0,
"step": 4600
},
{
"epoch": 1.0403972015346423,
"grad_norm": 0.00018310546875,
"learning_rate": 6.556671449067432e-06,
"loss": 0.0,
"step": 4610
},
{
"epoch": 1.042654028436019,
"grad_norm": 0.00083160400390625,
"learning_rate": 6.54912028996451e-06,
"loss": 0.0,
"step": 4620
},
{
"epoch": 1.0449108553373956,
"grad_norm": 0.0004596710205078125,
"learning_rate": 6.541569130861588e-06,
"loss": 0.0,
"step": 4630
},
{
"epoch": 1.0471676822387723,
"grad_norm": 0.0021820068359375,
"learning_rate": 6.534017971758666e-06,
"loss": 0.0,
"step": 4640
},
{
"epoch": 1.0494245091401488,
"grad_norm": 0.00077056884765625,
"learning_rate": 6.526466812655743e-06,
"loss": 0.0,
"step": 4650
},
{
"epoch": 1.0516813360415256,
"grad_norm": 0.0002460479736328125,
"learning_rate": 6.518915653552821e-06,
"loss": 0.0,
"step": 4660
},
{
"epoch": 1.0539381629429023,
"grad_norm": 48.25,
"learning_rate": 6.511364494449899e-06,
"loss": 0.268,
"step": 4670
},
{
"epoch": 1.0561949898442788,
"grad_norm": 0.00084686279296875,
"learning_rate": 6.503813335346977e-06,
"loss": 0.2907,
"step": 4680
},
{
"epoch": 1.0584518167456556,
"grad_norm": 0.00946044921875,
"learning_rate": 6.496262176244054e-06,
"loss": 0.5606,
"step": 4690
},
{
"epoch": 1.0607086436470323,
"grad_norm": 0.001953125,
"learning_rate": 6.488711017141132e-06,
"loss": 0.0,
"step": 4700
},
{
"epoch": 1.0629654705484088,
"grad_norm": 0.0115966796875,
"learning_rate": 6.481159858038209e-06,
"loss": 0.0,
"step": 4710
},
{
"epoch": 1.0652222974497856,
"grad_norm": 0.05419921875,
"learning_rate": 6.473608698935287e-06,
"loss": 0.0771,
"step": 4720
},
{
"epoch": 1.0674791243511623,
"grad_norm": 0.002838134765625,
"learning_rate": 6.466057539832364e-06,
"loss": 0.0,
"step": 4730
},
{
"epoch": 1.0697359512525388,
"grad_norm": 0.002593994140625,
"learning_rate": 6.458506380729443e-06,
"loss": 0.0,
"step": 4740
},
{
"epoch": 1.0719927781539156,
"grad_norm": 0.0021820068359375,
"learning_rate": 6.4509552216265204e-06,
"loss": 0.0,
"step": 4750
},
{
"epoch": 1.0742496050552923,
"grad_norm": 0.0012664794921875,
"learning_rate": 6.443404062523598e-06,
"loss": 0.0,
"step": 4760
},
{
"epoch": 1.0765064319566688,
"grad_norm": 0.024169921875,
"learning_rate": 6.435852903420675e-06,
"loss": 0.336,
"step": 4770
},
{
"epoch": 1.0787632588580456,
"grad_norm": 0.0072021484375,
"learning_rate": 6.428301744317753e-06,
"loss": 0.1798,
"step": 4780
},
{
"epoch": 1.0810200857594223,
"grad_norm": 0.00128936767578125,
"learning_rate": 6.4207505852148315e-06,
"loss": 0.0,
"step": 4790
},
{
"epoch": 1.0832769126607988,
"grad_norm": 0.0050048828125,
"learning_rate": 6.413199426111909e-06,
"loss": 0.0,
"step": 4800
},
{
"epoch": 1.0855337395621756,
"grad_norm": 0.006744384765625,
"learning_rate": 6.405648267008986e-06,
"loss": 0.0,
"step": 4810
},
{
"epoch": 1.0877905664635523,
"grad_norm": 0.00164031982421875,
"learning_rate": 6.398097107906064e-06,
"loss": 0.0,
"step": 4820
},
{
"epoch": 1.0900473933649288,
"grad_norm": 0.0177001953125,
"learning_rate": 6.390545948803142e-06,
"loss": 0.0,
"step": 4830
},
{
"epoch": 1.0923042202663056,
"grad_norm": 0.00015544891357421875,
"learning_rate": 6.3829947897002194e-06,
"loss": 0.0,
"step": 4840
},
{
"epoch": 1.0945610471676823,
"grad_norm": 0.000179290771484375,
"learning_rate": 6.375443630597298e-06,
"loss": 0.0,
"step": 4850
},
{
"epoch": 1.0968178740690588,
"grad_norm": 0.005828857421875,
"learning_rate": 6.367892471494375e-06,
"loss": 0.0,
"step": 4860
},
{
"epoch": 1.0990747009704356,
"grad_norm": 0.00131988525390625,
"learning_rate": 6.360341312391453e-06,
"loss": 0.0,
"step": 4870
},
{
"epoch": 1.1013315278718123,
"grad_norm": 0.00098419189453125,
"learning_rate": 6.3527901532885305e-06,
"loss": 0.0,
"step": 4880
},
{
"epoch": 1.1035883547731888,
"grad_norm": 0.000156402587890625,
"learning_rate": 6.345238994185608e-06,
"loss": 0.0,
"step": 4890
},
{
"epoch": 1.1058451816745656,
"grad_norm": 0.00141143798828125,
"learning_rate": 6.337687835082685e-06,
"loss": 0.0,
"step": 4900
},
{
"epoch": 1.1081020085759423,
"grad_norm": 0.00164031982421875,
"learning_rate": 6.330136675979763e-06,
"loss": 0.2685,
"step": 4910
},
{
"epoch": 1.1103588354773188,
"grad_norm": 0.0014190673828125,
"learning_rate": 6.3225855168768415e-06,
"loss": 0.0,
"step": 4920
},
{
"epoch": 1.1126156623786956,
"grad_norm": 0.000972747802734375,
"learning_rate": 6.315034357773919e-06,
"loss": 0.1327,
"step": 4930
},
{
"epoch": 1.1148724892800723,
"grad_norm": 0.26171875,
"learning_rate": 6.307483198670996e-06,
"loss": 0.0,
"step": 4940
},
{
"epoch": 1.1171293161814488,
"grad_norm": 0.002227783203125,
"learning_rate": 6.299932039568074e-06,
"loss": 0.0,
"step": 4950
},
{
"epoch": 1.1193861430828256,
"grad_norm": 0.0145263671875,
"learning_rate": 6.292380880465152e-06,
"loss": 0.0,
"step": 4960
},
{
"epoch": 1.1216429699842023,
"grad_norm": 0.00238037109375,
"learning_rate": 6.28482972136223e-06,
"loss": 0.0,
"step": 4970
},
{
"epoch": 1.1238997968855788,
"grad_norm": 0.00031280517578125,
"learning_rate": 6.277278562259307e-06,
"loss": 0.0,
"step": 4980
},
{
"epoch": 1.1261566237869556,
"grad_norm": 0.0038604736328125,
"learning_rate": 6.269727403156385e-06,
"loss": 0.0,
"step": 4990
},
{
"epoch": 1.128413450688332,
"grad_norm": 0.002532958984375,
"learning_rate": 6.262176244053463e-06,
"loss": 0.0,
"step": 5000
},
{
"epoch": 1.1306702775897088,
"grad_norm": 0.000244140625,
"learning_rate": 6.2546250849505405e-06,
"loss": 0.0,
"step": 5010
},
{
"epoch": 1.1329271044910856,
"grad_norm": 0.004119873046875,
"learning_rate": 6.2470739258476174e-06,
"loss": 0.3443,
"step": 5020
},
{
"epoch": 1.1351839313924623,
"grad_norm": 0.002685546875,
"learning_rate": 6.239522766744695e-06,
"loss": 0.0,
"step": 5030
},
{
"epoch": 1.1374407582938388,
"grad_norm": 0.05810546875,
"learning_rate": 6.231971607641774e-06,
"loss": 0.0,
"step": 5040
},
{
"epoch": 1.1396975851952156,
"grad_norm": 0.01239013671875,
"learning_rate": 6.2244204485388516e-06,
"loss": 0.0,
"step": 5050
},
{
"epoch": 1.141954412096592,
"grad_norm": 0.0234375,
"learning_rate": 6.2168692894359285e-06,
"loss": 0.3027,
"step": 5060
},
{
"epoch": 1.1442112389979688,
"grad_norm": 0.00051116943359375,
"learning_rate": 6.209318130333006e-06,
"loss": 0.0,
"step": 5070
},
{
"epoch": 1.1464680658993456,
"grad_norm": 0.000598907470703125,
"learning_rate": 6.201766971230084e-06,
"loss": 0.0,
"step": 5080
},
{
"epoch": 1.148724892800722,
"grad_norm": 0.0147705078125,
"learning_rate": 6.194215812127163e-06,
"loss": 0.0,
"step": 5090
},
{
"epoch": 1.1509817197020988,
"grad_norm": 0.005218505859375,
"learning_rate": 6.18666465302424e-06,
"loss": 0.0,
"step": 5100
},
{
"epoch": 1.1532385466034756,
"grad_norm": 0.00250244140625,
"learning_rate": 6.179113493921317e-06,
"loss": 0.0,
"step": 5110
},
{
"epoch": 1.155495373504852,
"grad_norm": 0.0002765655517578125,
"learning_rate": 6.171562334818395e-06,
"loss": 0.0,
"step": 5120
},
{
"epoch": 1.1577522004062288,
"grad_norm": 0.000728607177734375,
"learning_rate": 6.164011175715473e-06,
"loss": 0.0,
"step": 5130
},
{
"epoch": 1.1600090273076056,
"grad_norm": 0.00830078125,
"learning_rate": 6.1564600166125506e-06,
"loss": 0.1798,
"step": 5140
},
{
"epoch": 1.162265854208982,
"grad_norm": 0.000766754150390625,
"learning_rate": 6.1489088575096275e-06,
"loss": 0.0,
"step": 5150
},
{
"epoch": 1.1645226811103588,
"grad_norm": 0.0004215240478515625,
"learning_rate": 6.141357698406706e-06,
"loss": 0.0,
"step": 5160
},
{
"epoch": 1.1667795080117356,
"grad_norm": 0.0034637451171875,
"learning_rate": 6.133806539303784e-06,
"loss": 0.0,
"step": 5170
},
{
"epoch": 1.169036334913112,
"grad_norm": 8.0108642578125e-05,
"learning_rate": 6.126255380200862e-06,
"loss": 0.0,
"step": 5180
},
{
"epoch": 1.1712931618144888,
"grad_norm": 0.00191497802734375,
"learning_rate": 6.1187042210979385e-06,
"loss": 0.0,
"step": 5190
},
{
"epoch": 1.1735499887158656,
"grad_norm": 0.0003147125244140625,
"learning_rate": 6.111153061995016e-06,
"loss": 0.0,
"step": 5200
},
{
"epoch": 1.175806815617242,
"grad_norm": 0.005126953125,
"learning_rate": 6.103601902892095e-06,
"loss": 0.0,
"step": 5210
},
{
"epoch": 1.1780636425186188,
"grad_norm": 0.00104522705078125,
"learning_rate": 6.096050743789173e-06,
"loss": 0.0,
"step": 5220
},
{
"epoch": 1.1803204694199956,
"grad_norm": 0.00109100341796875,
"learning_rate": 6.0884995846862496e-06,
"loss": 0.098,
"step": 5230
},
{
"epoch": 1.182577296321372,
"grad_norm": 0.0028228759765625,
"learning_rate": 6.080948425583327e-06,
"loss": 0.0,
"step": 5240
},
{
"epoch": 1.1848341232227488,
"grad_norm": 0.0003376007080078125,
"learning_rate": 6.073397266480405e-06,
"loss": 0.0,
"step": 5250
},
{
"epoch": 1.1870909501241256,
"grad_norm": 0.0022430419921875,
"learning_rate": 6.065846107377483e-06,
"loss": 0.0,
"step": 5260
},
{
"epoch": 1.189347777025502,
"grad_norm": 0.003082275390625,
"learning_rate": 6.05829494827456e-06,
"loss": 0.2679,
"step": 5270
},
{
"epoch": 1.1916046039268788,
"grad_norm": 0.0012359619140625,
"learning_rate": 6.050743789171638e-06,
"loss": 0.122,
"step": 5280
},
{
"epoch": 1.1938614308282554,
"grad_norm": 0.000385284423828125,
"learning_rate": 6.043192630068716e-06,
"loss": 0.0,
"step": 5290
},
{
"epoch": 1.196118257729632,
"grad_norm": 0.00018596649169921875,
"learning_rate": 6.035641470965794e-06,
"loss": 0.0,
"step": 5300
},
{
"epoch": 1.1983750846310088,
"grad_norm": 0.002349853515625,
"learning_rate": 6.028090311862872e-06,
"loss": 0.0,
"step": 5310
},
{
"epoch": 1.2006319115323856,
"grad_norm": 0.000621795654296875,
"learning_rate": 6.0205391527599486e-06,
"loss": 0.0,
"step": 5320
},
{
"epoch": 1.202888738433762,
"grad_norm": 0.001739501953125,
"learning_rate": 6.012987993657027e-06,
"loss": 0.0,
"step": 5330
},
{
"epoch": 1.2051455653351388,
"grad_norm": 0.00019931793212890625,
"learning_rate": 6.005436834554105e-06,
"loss": 0.0,
"step": 5340
},
{
"epoch": 1.2074023922365154,
"grad_norm": 0.00186920166015625,
"learning_rate": 5.997885675451183e-06,
"loss": 0.1819,
"step": 5350
},
{
"epoch": 1.209659219137892,
"grad_norm": 0.00061798095703125,
"learning_rate": 5.99033451634826e-06,
"loss": 0.0,
"step": 5360
},
{
"epoch": 1.2119160460392688,
"grad_norm": 0.0018157958984375,
"learning_rate": 5.982783357245337e-06,
"loss": 0.0,
"step": 5370
},
{
"epoch": 1.2141728729406456,
"grad_norm": 0.01904296875,
"learning_rate": 5.975232198142415e-06,
"loss": 0.0,
"step": 5380
},
{
"epoch": 1.216429699842022,
"grad_norm": 0.0014190673828125,
"learning_rate": 5.967681039039494e-06,
"loss": 0.0,
"step": 5390
},
{
"epoch": 1.2186865267433988,
"grad_norm": 0.000553131103515625,
"learning_rate": 5.960129879936571e-06,
"loss": 0.0,
"step": 5400
},
{
"epoch": 1.2209433536447754,
"grad_norm": 0.000858306884765625,
"learning_rate": 5.952578720833648e-06,
"loss": 0.0,
"step": 5410
},
{
"epoch": 1.223200180546152,
"grad_norm": 0.001678466796875,
"learning_rate": 5.945027561730726e-06,
"loss": 0.6368,
"step": 5420
},
{
"epoch": 1.2254570074475288,
"grad_norm": 0.0004119873046875,
"learning_rate": 5.937476402627804e-06,
"loss": 0.0676,
"step": 5430
},
{
"epoch": 1.2277138343489054,
"grad_norm": 0.005126953125,
"learning_rate": 5.929925243524881e-06,
"loss": 0.1247,
"step": 5440
},
{
"epoch": 1.229970661250282,
"grad_norm": 0.000949859619140625,
"learning_rate": 5.9223740844219595e-06,
"loss": 0.0,
"step": 5450
},
{
"epoch": 1.2322274881516588,
"grad_norm": 0.00148773193359375,
"learning_rate": 5.914822925319037e-06,
"loss": 0.0,
"step": 5460
},
{
"epoch": 1.2344843150530354,
"grad_norm": 0.0020904541015625,
"learning_rate": 5.907271766216115e-06,
"loss": 0.0,
"step": 5470
},
{
"epoch": 1.236741141954412,
"grad_norm": 0.000865936279296875,
"learning_rate": 5.899720607113192e-06,
"loss": 0.0,
"step": 5480
},
{
"epoch": 1.2389979688557888,
"grad_norm": 0.0021820068359375,
"learning_rate": 5.89216944801027e-06,
"loss": 0.0,
"step": 5490
},
{
"epoch": 1.2412547957571654,
"grad_norm": 0.003570556640625,
"learning_rate": 5.884618288907347e-06,
"loss": 0.0,
"step": 5500
},
{
"epoch": 1.243511622658542,
"grad_norm": 0.000560760498046875,
"learning_rate": 5.877067129804426e-06,
"loss": 0.2991,
"step": 5510
},
{
"epoch": 1.2457684495599188,
"grad_norm": 0.0027008056640625,
"learning_rate": 5.869515970701504e-06,
"loss": 0.0,
"step": 5520
},
{
"epoch": 1.2480252764612954,
"grad_norm": 0.003997802734375,
"learning_rate": 5.861964811598581e-06,
"loss": 0.2871,
"step": 5530
},
{
"epoch": 1.250282103362672,
"grad_norm": 0.0020751953125,
"learning_rate": 5.8544136524956585e-06,
"loss": 0.0,
"step": 5540
},
{
"epoch": 1.2525389302640488,
"grad_norm": 0.000579833984375,
"learning_rate": 5.846862493392736e-06,
"loss": 0.0,
"step": 5550
},
{
"epoch": 1.2547957571654254,
"grad_norm": 0.004119873046875,
"learning_rate": 5.839311334289815e-06,
"loss": 0.0,
"step": 5560
},
{
"epoch": 1.257052584066802,
"grad_norm": 0.00054931640625,
"learning_rate": 5.831760175186891e-06,
"loss": 0.0001,
"step": 5570
},
{
"epoch": 1.2593094109681786,
"grad_norm": 0.00170135498046875,
"learning_rate": 5.8242090160839695e-06,
"loss": 0.1847,
"step": 5580
},
{
"epoch": 1.2615662378695554,
"grad_norm": 0.000247955322265625,
"learning_rate": 5.816657856981047e-06,
"loss": 0.0,
"step": 5590
},
{
"epoch": 1.263823064770932,
"grad_norm": 0.0002956390380859375,
"learning_rate": 5.809106697878125e-06,
"loss": 0.0,
"step": 5600
},
{
"epoch": 1.2660798916723088,
"grad_norm": 0.001983642578125,
"learning_rate": 5.801555538775202e-06,
"loss": 0.0,
"step": 5610
},
{
"epoch": 1.2683367185736854,
"grad_norm": 0.00022792816162109375,
"learning_rate": 5.79400437967228e-06,
"loss": 0.0008,
"step": 5620
},
{
"epoch": 1.270593545475062,
"grad_norm": 0.000247955322265625,
"learning_rate": 5.786453220569358e-06,
"loss": 0.0,
"step": 5630
},
{
"epoch": 1.2728503723764386,
"grad_norm": 0.00080108642578125,
"learning_rate": 5.778902061466436e-06,
"loss": 0.0,
"step": 5640
},
{
"epoch": 1.2751071992778154,
"grad_norm": 0.0022430419921875,
"learning_rate": 5.771350902363513e-06,
"loss": 0.2461,
"step": 5650
},
{
"epoch": 1.277364026179192,
"grad_norm": 0.000720977783203125,
"learning_rate": 5.763799743260591e-06,
"loss": 0.0,
"step": 5660
},
{
"epoch": 1.2796208530805688,
"grad_norm": 0.0001697540283203125,
"learning_rate": 5.7562485841576685e-06,
"loss": 0.0,
"step": 5670
},
{
"epoch": 1.2818776799819454,
"grad_norm": 0.0003223419189453125,
"learning_rate": 5.748697425054747e-06,
"loss": 0.0,
"step": 5680
},
{
"epoch": 1.284134506883322,
"grad_norm": 0.00067138671875,
"learning_rate": 5.741146265951823e-06,
"loss": 0.0,
"step": 5690
},
{
"epoch": 1.2863913337846986,
"grad_norm": 0.000133514404296875,
"learning_rate": 5.733595106848902e-06,
"loss": 0.0,
"step": 5700
},
{
"epoch": 1.2886481606860754,
"grad_norm": 0.0006103515625,
"learning_rate": 5.7260439477459796e-06,
"loss": 0.0,
"step": 5710
},
{
"epoch": 1.290904987587452,
"grad_norm": 0.01220703125,
"learning_rate": 5.718492788643057e-06,
"loss": 0.0,
"step": 5720
},
{
"epoch": 1.2931618144888288,
"grad_norm": 0.0005645751953125,
"learning_rate": 5.710941629540135e-06,
"loss": 0.0,
"step": 5730
},
{
"epoch": 1.2954186413902053,
"grad_norm": 0.00152587890625,
"learning_rate": 5.703390470437212e-06,
"loss": 0.0,
"step": 5740
},
{
"epoch": 1.297675468291582,
"grad_norm": 0.00170135498046875,
"learning_rate": 5.695839311334291e-06,
"loss": 0.0,
"step": 5750
},
{
"epoch": 1.2999322951929586,
"grad_norm": 0.0004100799560546875,
"learning_rate": 5.688288152231368e-06,
"loss": 0.3257,
"step": 5760
},
{
"epoch": 1.3021891220943353,
"grad_norm": 0.000667572021484375,
"learning_rate": 5.680736993128446e-06,
"loss": 0.0,
"step": 5770
},
{
"epoch": 1.304445948995712,
"grad_norm": 0.0002956390380859375,
"learning_rate": 5.673185834025523e-06,
"loss": 0.0,
"step": 5780
},
{
"epoch": 1.3067027758970888,
"grad_norm": 9.918212890625e-05,
"learning_rate": 5.665634674922601e-06,
"loss": 0.0,
"step": 5790
},
{
"epoch": 1.3089596027984653,
"grad_norm": 0.000293731689453125,
"learning_rate": 5.6580835158196786e-06,
"loss": 0.0,
"step": 5800
},
{
"epoch": 1.311216429699842,
"grad_norm": 70.5,
"learning_rate": 5.650532356716757e-06,
"loss": 0.1552,
"step": 5810
},
{
"epoch": 1.3134732566012186,
"grad_norm": 0.00022029876708984375,
"learning_rate": 5.642981197613834e-06,
"loss": 0.2409,
"step": 5820
},
{
"epoch": 1.3157300835025953,
"grad_norm": 0.0005340576171875,
"learning_rate": 5.635430038510912e-06,
"loss": 0.0,
"step": 5830
},
{
"epoch": 1.317986910403972,
"grad_norm": 0.00165557861328125,
"learning_rate": 5.62787887940799e-06,
"loss": 0.4549,
"step": 5840
},
{
"epoch": 1.3202437373053486,
"grad_norm": 0.000545501708984375,
"learning_rate": 5.620327720305067e-06,
"loss": 0.0,
"step": 5850
},
{
"epoch": 1.3225005642067253,
"grad_norm": 0.000202178955078125,
"learning_rate": 5.612776561202144e-06,
"loss": 0.1561,
"step": 5860
},
{
"epoch": 1.324757391108102,
"grad_norm": 0.0022125244140625,
"learning_rate": 5.605225402099223e-06,
"loss": 0.0,
"step": 5870
},
{
"epoch": 1.3270142180094786,
"grad_norm": 0.001617431640625,
"learning_rate": 5.597674242996301e-06,
"loss": 0.0,
"step": 5880
},
{
"epoch": 1.3292710449108553,
"grad_norm": 0.0001316070556640625,
"learning_rate": 5.590123083893378e-06,
"loss": 0.0,
"step": 5890
},
{
"epoch": 1.331527871812232,
"grad_norm": 0.00150299072265625,
"learning_rate": 5.582571924790455e-06,
"loss": 0.0,
"step": 5900
},
{
"epoch": 1.3337846987136086,
"grad_norm": 0.0024261474609375,
"learning_rate": 5.575020765687533e-06,
"loss": 0.0,
"step": 5910
},
{
"epoch": 1.3360415256149853,
"grad_norm": 0.000370025634765625,
"learning_rate": 5.567469606584611e-06,
"loss": 0.1382,
"step": 5920
},
{
"epoch": 1.3382983525163619,
"grad_norm": 0.002410888671875,
"learning_rate": 5.5599184474816894e-06,
"loss": 0.0,
"step": 5930
},
{
"epoch": 1.3405551794177386,
"grad_norm": 0.00010395050048828125,
"learning_rate": 5.552367288378766e-06,
"loss": 0.0,
"step": 5940
},
{
"epoch": 1.3428120063191153,
"grad_norm": 0.00049591064453125,
"learning_rate": 5.544816129275844e-06,
"loss": 0.0,
"step": 5950
},
{
"epoch": 1.345068833220492,
"grad_norm": 0.0003814697265625,
"learning_rate": 5.537264970172922e-06,
"loss": 0.0,
"step": 5960
},
{
"epoch": 1.3473256601218686,
"grad_norm": 0.0228271484375,
"learning_rate": 5.52971381107e-06,
"loss": 0.0036,
"step": 5970
},
{
"epoch": 1.3495824870232453,
"grad_norm": 0.000568389892578125,
"learning_rate": 5.522162651967078e-06,
"loss": 0.3318,
"step": 5980
},
{
"epoch": 1.3518393139246219,
"grad_norm": 0.000804901123046875,
"learning_rate": 5.514611492864155e-06,
"loss": 0.1936,
"step": 5990
},
{
"epoch": 1.3540961408259986,
"grad_norm": 0.0015869140625,
"learning_rate": 5.507060333761233e-06,
"loss": 0.0,
"step": 6000
},
{
"epoch": 1.3563529677273753,
"grad_norm": 0.00189971923828125,
"learning_rate": 5.499509174658311e-06,
"loss": 0.0,
"step": 6010
},
{
"epoch": 1.358609794628752,
"grad_norm": 0.000518798828125,
"learning_rate": 5.4919580155553884e-06,
"loss": 0.2387,
"step": 6020
},
{
"epoch": 1.3608666215301286,
"grad_norm": 0.00323486328125,
"learning_rate": 5.484406856452465e-06,
"loss": 0.0003,
"step": 6030
},
{
"epoch": 1.3631234484315053,
"grad_norm": 0.0181884765625,
"learning_rate": 5.476855697349543e-06,
"loss": 0.0,
"step": 6040
},
{
"epoch": 1.3653802753328819,
"grad_norm": 0.000217437744140625,
"learning_rate": 5.469304538246622e-06,
"loss": 0.0,
"step": 6050
},
{
"epoch": 1.3676371022342586,
"grad_norm": 0.0008087158203125,
"learning_rate": 5.4617533791436995e-06,
"loss": 0.0,
"step": 6060
},
{
"epoch": 1.3698939291356353,
"grad_norm": 0.001922607421875,
"learning_rate": 5.454202220040776e-06,
"loss": 0.0,
"step": 6070
},
{
"epoch": 1.372150756037012,
"grad_norm": 0.00183868408203125,
"learning_rate": 5.446651060937854e-06,
"loss": 0.0,
"step": 6080
},
{
"epoch": 1.3744075829383886,
"grad_norm": 0.00193023681640625,
"learning_rate": 5.439099901834932e-06,
"loss": 0.0,
"step": 6090
},
{
"epoch": 1.3766644098397653,
"grad_norm": 0.007476806640625,
"learning_rate": 5.4315487427320105e-06,
"loss": 0.0,
"step": 6100
},
{
"epoch": 1.3789212367411419,
"grad_norm": 0.000171661376953125,
"learning_rate": 5.423997583629087e-06,
"loss": 0.0082,
"step": 6110
},
{
"epoch": 1.3811780636425186,
"grad_norm": 0.00201416015625,
"learning_rate": 5.416446424526165e-06,
"loss": 0.0,
"step": 6120
},
{
"epoch": 1.3834348905438953,
"grad_norm": 0.0010223388671875,
"learning_rate": 5.408895265423243e-06,
"loss": 0.0,
"step": 6130
},
{
"epoch": 1.3856917174452719,
"grad_norm": 0.0030517578125,
"learning_rate": 5.401344106320321e-06,
"loss": 0.0,
"step": 6140
},
{
"epoch": 1.3879485443466486,
"grad_norm": 0.00070953369140625,
"learning_rate": 5.393792947217398e-06,
"loss": 0.0,
"step": 6150
},
{
"epoch": 1.3902053712480253,
"grad_norm": 0.0004482269287109375,
"learning_rate": 5.386241788114475e-06,
"loss": 0.0,
"step": 6160
},
{
"epoch": 1.3924621981494019,
"grad_norm": 0.00118255615234375,
"learning_rate": 5.378690629011554e-06,
"loss": 0.3508,
"step": 6170
},
{
"epoch": 1.3947190250507786,
"grad_norm": 0.0012664794921875,
"learning_rate": 5.371139469908632e-06,
"loss": 0.2107,
"step": 6180
},
{
"epoch": 1.3969758519521553,
"grad_norm": 0.00125885009765625,
"learning_rate": 5.3635883108057095e-06,
"loss": 0.0,
"step": 6190
},
{
"epoch": 1.3992326788535319,
"grad_norm": 0.000713348388671875,
"learning_rate": 5.3560371517027864e-06,
"loss": 0.2159,
"step": 6200
},
{
"epoch": 1.4014895057549086,
"grad_norm": 0.00090789794921875,
"learning_rate": 5.348485992599864e-06,
"loss": 0.0,
"step": 6210
},
{
"epoch": 1.4037463326562853,
"grad_norm": 0.00982666015625,
"learning_rate": 5.340934833496943e-06,
"loss": 0.2689,
"step": 6220
},
{
"epoch": 1.4060031595576619,
"grad_norm": 0.01019287109375,
"learning_rate": 5.3333836743940206e-06,
"loss": 0.0,
"step": 6230
},
{
"epoch": 1.4082599864590386,
"grad_norm": 0.02392578125,
"learning_rate": 5.3258325152910975e-06,
"loss": 0.0001,
"step": 6240
},
{
"epoch": 1.4105168133604153,
"grad_norm": 0.0023345947265625,
"learning_rate": 5.318281356188175e-06,
"loss": 0.0001,
"step": 6250
},
{
"epoch": 1.4127736402617919,
"grad_norm": 0.000885009765625,
"learning_rate": 5.310730197085253e-06,
"loss": 0.1802,
"step": 6260
},
{
"epoch": 1.4150304671631686,
"grad_norm": 0.0024566650390625,
"learning_rate": 5.303179037982331e-06,
"loss": 0.0,
"step": 6270
},
{
"epoch": 1.4172872940645451,
"grad_norm": 0.00579833984375,
"learning_rate": 5.295627878879408e-06,
"loss": 0.0,
"step": 6280
},
{
"epoch": 1.4195441209659219,
"grad_norm": 0.0103759765625,
"learning_rate": 5.288076719776486e-06,
"loss": 0.0,
"step": 6290
},
{
"epoch": 1.4218009478672986,
"grad_norm": 0.000530242919921875,
"learning_rate": 5.280525560673564e-06,
"loss": 0.0,
"step": 6300
},
{
"epoch": 1.4240577747686753,
"grad_norm": 0.000827789306640625,
"learning_rate": 5.272974401570642e-06,
"loss": 0.0,
"step": 6310
},
{
"epoch": 1.4263146016700519,
"grad_norm": 0.00836181640625,
"learning_rate": 5.265423242467719e-06,
"loss": 0.0,
"step": 6320
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.0013427734375,
"learning_rate": 5.2578720833647965e-06,
"loss": 0.0,
"step": 6330
},
{
"epoch": 1.4308282554728051,
"grad_norm": 0.001434326171875,
"learning_rate": 5.250320924261875e-06,
"loss": 0.0,
"step": 6340
},
{
"epoch": 1.4330850823741819,
"grad_norm": 0.00025177001953125,
"learning_rate": 5.242769765158953e-06,
"loss": 0.0,
"step": 6350
},
{
"epoch": 1.4353419092755586,
"grad_norm": 78.0,
"learning_rate": 5.23521860605603e-06,
"loss": 0.4103,
"step": 6360
},
{
"epoch": 1.4375987361769353,
"grad_norm": 0.0024871826171875,
"learning_rate": 5.2276674469531075e-06,
"loss": 0.0,
"step": 6370
},
{
"epoch": 1.4398555630783119,
"grad_norm": 0.000316619873046875,
"learning_rate": 5.220116287850185e-06,
"loss": 0.0,
"step": 6380
},
{
"epoch": 1.4421123899796886,
"grad_norm": 0.002777099609375,
"learning_rate": 5.212565128747263e-06,
"loss": 0.0,
"step": 6390
},
{
"epoch": 1.4443692168810651,
"grad_norm": 0.006439208984375,
"learning_rate": 5.205013969644342e-06,
"loss": 0.0,
"step": 6400
},
{
"epoch": 1.4466260437824419,
"grad_norm": 0.00030517578125,
"learning_rate": 5.1974628105414186e-06,
"loss": 0.0,
"step": 6410
},
{
"epoch": 1.4488828706838186,
"grad_norm": 0.003997802734375,
"learning_rate": 5.189911651438496e-06,
"loss": 0.0,
"step": 6420
},
{
"epoch": 1.4511396975851953,
"grad_norm": 0.000629425048828125,
"learning_rate": 5.182360492335574e-06,
"loss": 0.0002,
"step": 6430
},
{
"epoch": 1.4533965244865719,
"grad_norm": 0.00159454345703125,
"learning_rate": 5.174809333232652e-06,
"loss": 0.2868,
"step": 6440
},
{
"epoch": 1.4556533513879486,
"grad_norm": 0.00095367431640625,
"learning_rate": 5.167258174129729e-06,
"loss": 0.0,
"step": 6450
},
{
"epoch": 1.4579101782893251,
"grad_norm": 0.0003032684326171875,
"learning_rate": 5.1597070150268065e-06,
"loss": 0.0,
"step": 6460
},
{
"epoch": 1.4601670051907019,
"grad_norm": 0.00159454345703125,
"learning_rate": 5.152155855923885e-06,
"loss": 0.0,
"step": 6470
},
{
"epoch": 1.4624238320920786,
"grad_norm": 0.00213623046875,
"learning_rate": 5.144604696820963e-06,
"loss": 0.0653,
"step": 6480
},
{
"epoch": 1.4646806589934551,
"grad_norm": 0.0011749267578125,
"learning_rate": 5.13705353771804e-06,
"loss": 0.0,
"step": 6490
},
{
"epoch": 1.4669374858948319,
"grad_norm": 0.01177978515625,
"learning_rate": 5.1295023786151176e-06,
"loss": 0.0,
"step": 6500
},
{
"epoch": 1.4691943127962086,
"grad_norm": 0.0001354217529296875,
"learning_rate": 5.121951219512195e-06,
"loss": 0.0,
"step": 6510
},
{
"epoch": 1.4714511396975851,
"grad_norm": 0.00017833709716796875,
"learning_rate": 5.114400060409274e-06,
"loss": 0.0,
"step": 6520
},
{
"epoch": 1.4737079665989619,
"grad_norm": 0.0006561279296875,
"learning_rate": 5.106848901306351e-06,
"loss": 0.0,
"step": 6530
},
{
"epoch": 1.4759647935003386,
"grad_norm": 0.000286102294921875,
"learning_rate": 5.099297742203429e-06,
"loss": 0.0,
"step": 6540
},
{
"epoch": 1.4782216204017151,
"grad_norm": 0.00026702880859375,
"learning_rate": 5.091746583100506e-06,
"loss": 0.0,
"step": 6550
},
{
"epoch": 1.4804784473030919,
"grad_norm": 0.01361083984375,
"learning_rate": 5.084195423997584e-06,
"loss": 0.0,
"step": 6560
},
{
"epoch": 1.4827352742044684,
"grad_norm": 0.0001583099365234375,
"learning_rate": 5.076644264894661e-06,
"loss": 0.0592,
"step": 6570
},
{
"epoch": 1.4849921011058451,
"grad_norm": 0.00131988525390625,
"learning_rate": 5.069093105791739e-06,
"loss": 0.0,
"step": 6580
},
{
"epoch": 1.4872489280072219,
"grad_norm": 0.000667572021484375,
"learning_rate": 5.061541946688817e-06,
"loss": 0.0,
"step": 6590
},
{
"epoch": 1.4895057549085986,
"grad_norm": 0.000759124755859375,
"learning_rate": 5.053990787585895e-06,
"loss": 0.0,
"step": 6600
},
{
"epoch": 1.4917625818099751,
"grad_norm": 0.000255584716796875,
"learning_rate": 5.046439628482973e-06,
"loss": 0.0364,
"step": 6610
},
{
"epoch": 1.4940194087113519,
"grad_norm": 0.00023365020751953125,
"learning_rate": 5.03888846938005e-06,
"loss": 0.0,
"step": 6620
},
{
"epoch": 1.4962762356127284,
"grad_norm": 8.153915405273438e-05,
"learning_rate": 5.031337310277128e-06,
"loss": 0.0,
"step": 6630
},
{
"epoch": 1.4985330625141051,
"grad_norm": 0.0028228759765625,
"learning_rate": 5.023786151174206e-06,
"loss": 0.0,
"step": 6640
},
{
"epoch": 1.5007898894154819,
"grad_norm": 9.775161743164062e-05,
"learning_rate": 5.016234992071284e-06,
"loss": 0.0,
"step": 6650
},
{
"epoch": 1.5030467163168586,
"grad_norm": 0.000316619873046875,
"learning_rate": 5.008683832968361e-06,
"loss": 0.2175,
"step": 6660
},
{
"epoch": 1.5053035432182351,
"grad_norm": 0.0003604888916015625,
"learning_rate": 5.001132673865439e-06,
"loss": 0.0,
"step": 6670
},
{
"epoch": 1.5075603701196119,
"grad_norm": 0.000179290771484375,
"learning_rate": 4.993581514762516e-06,
"loss": 0.0,
"step": 6680
},
{
"epoch": 1.5098171970209884,
"grad_norm": 0.0028839111328125,
"learning_rate": 4.986030355659594e-06,
"loss": 0.0,
"step": 6690
},
{
"epoch": 1.5120740239223651,
"grad_norm": 0.00125885009765625,
"learning_rate": 4.978479196556672e-06,
"loss": 0.0,
"step": 6700
},
{
"epoch": 1.5143308508237419,
"grad_norm": 0.000331878662109375,
"learning_rate": 4.97092803745375e-06,
"loss": 0.0,
"step": 6710
},
{
"epoch": 1.5165876777251186,
"grad_norm": 0.000675201416015625,
"learning_rate": 4.9633768783508275e-06,
"loss": 0.0,
"step": 6720
},
{
"epoch": 1.5188445046264951,
"grad_norm": 0.00015735626220703125,
"learning_rate": 4.955825719247904e-06,
"loss": 0.0897,
"step": 6730
},
{
"epoch": 1.5211013315278719,
"grad_norm": 0.00061798095703125,
"learning_rate": 4.948274560144983e-06,
"loss": 0.0,
"step": 6740
},
{
"epoch": 1.5233581584292484,
"grad_norm": 0.0009765625,
"learning_rate": 4.94072340104206e-06,
"loss": 0.0,
"step": 6750
},
{
"epoch": 1.5256149853306251,
"grad_norm": 0.00010585784912109375,
"learning_rate": 4.9331722419391385e-06,
"loss": 0.0,
"step": 6760
},
{
"epoch": 1.5278718122320019,
"grad_norm": 0.0001087188720703125,
"learning_rate": 4.925621082836215e-06,
"loss": 0.0,
"step": 6770
},
{
"epoch": 1.5301286391333786,
"grad_norm": 0.0009307861328125,
"learning_rate": 4.918069923733293e-06,
"loss": 0.0,
"step": 6780
},
{
"epoch": 1.5323854660347551,
"grad_norm": 0.002593994140625,
"learning_rate": 4.910518764630371e-06,
"loss": 0.0,
"step": 6790
},
{
"epoch": 1.5346422929361316,
"grad_norm": 0.00048828125,
"learning_rate": 4.902967605527449e-06,
"loss": 0.0,
"step": 6800
},
{
"epoch": 1.5368991198375084,
"grad_norm": 0.00016021728515625,
"learning_rate": 4.8954164464245265e-06,
"loss": 0.0,
"step": 6810
},
{
"epoch": 1.5391559467388851,
"grad_norm": 0.00019168853759765625,
"learning_rate": 4.887865287321604e-06,
"loss": 0.6172,
"step": 6820
},
{
"epoch": 1.5414127736402619,
"grad_norm": 0.00167083740234375,
"learning_rate": 4.880314128218682e-06,
"loss": 0.0,
"step": 6830
},
{
"epoch": 1.5436696005416386,
"grad_norm": 0.0029754638671875,
"learning_rate": 4.87276296911576e-06,
"loss": 0.0,
"step": 6840
},
{
"epoch": 1.5459264274430151,
"grad_norm": 0.0010833740234375,
"learning_rate": 4.8652118100128375e-06,
"loss": 0.0,
"step": 6850
},
{
"epoch": 1.5481832543443916,
"grad_norm": 0.00012683868408203125,
"learning_rate": 4.857660650909915e-06,
"loss": 0.0,
"step": 6860
},
{
"epoch": 1.5504400812457684,
"grad_norm": 0.0010223388671875,
"learning_rate": 4.850109491806993e-06,
"loss": 0.2717,
"step": 6870
},
{
"epoch": 1.5526969081471451,
"grad_norm": 0.00341796875,
"learning_rate": 4.842558332704071e-06,
"loss": 0.0,
"step": 6880
},
{
"epoch": 1.5549537350485219,
"grad_norm": 0.002685546875,
"learning_rate": 4.8350071736011486e-06,
"loss": 0.0,
"step": 6890
},
{
"epoch": 1.5572105619498986,
"grad_norm": 9.5367431640625e-05,
"learning_rate": 4.8274560144982255e-06,
"loss": 0.0,
"step": 6900
},
{
"epoch": 1.5594673888512751,
"grad_norm": 0.0003795623779296875,
"learning_rate": 4.819904855395304e-06,
"loss": 0.0,
"step": 6910
},
{
"epoch": 1.5617242157526516,
"grad_norm": 0.00022220611572265625,
"learning_rate": 4.812353696292381e-06,
"loss": 0.0,
"step": 6920
},
{
"epoch": 1.5639810426540284,
"grad_norm": 0.00040435791015625,
"learning_rate": 4.804802537189459e-06,
"loss": 0.0,
"step": 6930
},
{
"epoch": 1.5662378695554051,
"grad_norm": 0.000690460205078125,
"learning_rate": 4.7972513780865365e-06,
"loss": 0.0,
"step": 6940
},
{
"epoch": 1.5684946964567819,
"grad_norm": 0.001983642578125,
"learning_rate": 4.789700218983614e-06,
"loss": 0.0,
"step": 6950
},
{
"epoch": 1.5707515233581584,
"grad_norm": 0.00018787384033203125,
"learning_rate": 4.782149059880692e-06,
"loss": 0.0,
"step": 6960
},
{
"epoch": 1.5730083502595351,
"grad_norm": 0.000152587890625,
"learning_rate": 4.77459790077777e-06,
"loss": 0.0,
"step": 6970
},
{
"epoch": 1.5752651771609116,
"grad_norm": 0.00014591217041015625,
"learning_rate": 4.7670467416748476e-06,
"loss": 0.0,
"step": 6980
},
{
"epoch": 1.5775220040622884,
"grad_norm": 0.00018978118896484375,
"learning_rate": 4.759495582571925e-06,
"loss": 0.0,
"step": 6990
},
{
"epoch": 1.5797788309636651,
"grad_norm": 0.000286102294921875,
"learning_rate": 4.751944423469002e-06,
"loss": 0.0,
"step": 7000
},
{
"epoch": 1.5820356578650419,
"grad_norm": 0.000293731689453125,
"learning_rate": 4.744393264366081e-06,
"loss": 0.2482,
"step": 7010
},
{
"epoch": 1.5842924847664184,
"grad_norm": 0.0005035400390625,
"learning_rate": 4.736842105263158e-06,
"loss": 0.3,
"step": 7020
},
{
"epoch": 1.5865493116677951,
"grad_norm": 0.00110626220703125,
"learning_rate": 4.729290946160236e-06,
"loss": 0.0,
"step": 7030
},
{
"epoch": 1.5888061385691716,
"grad_norm": 0.00040435791015625,
"learning_rate": 4.721739787057314e-06,
"loss": 0.0,
"step": 7040
},
{
"epoch": 1.5910629654705484,
"grad_norm": 0.00555419921875,
"learning_rate": 4.714188627954391e-06,
"loss": 0.0,
"step": 7050
},
{
"epoch": 1.5933197923719251,
"grad_norm": 0.00018215179443359375,
"learning_rate": 4.70663746885147e-06,
"loss": 0.0,
"step": 7060
},
{
"epoch": 1.5955766192733019,
"grad_norm": 10.125,
"learning_rate": 4.6990863097485466e-06,
"loss": 0.0016,
"step": 7070
},
{
"epoch": 1.5978334461746784,
"grad_norm": 0.000713348388671875,
"learning_rate": 4.691535150645624e-06,
"loss": 0.0,
"step": 7080
},
{
"epoch": 1.6000902730760551,
"grad_norm": 0.0004825592041015625,
"learning_rate": 4.683983991542702e-06,
"loss": 0.0002,
"step": 7090
},
{
"epoch": 1.6023470999774316,
"grad_norm": 0.00099945068359375,
"learning_rate": 4.67643283243978e-06,
"loss": 0.3182,
"step": 7100
},
{
"epoch": 1.6046039268788084,
"grad_norm": 0.004852294921875,
"learning_rate": 4.668881673336858e-06,
"loss": 0.0,
"step": 7110
},
{
"epoch": 1.6068607537801851,
"grad_norm": 0.000957489013671875,
"learning_rate": 4.661330514233935e-06,
"loss": 0.0001,
"step": 7120
},
{
"epoch": 1.6091175806815619,
"grad_norm": 0.0024566650390625,
"learning_rate": 4.653779355131013e-06,
"loss": 0.2413,
"step": 7130
},
{
"epoch": 1.6113744075829384,
"grad_norm": 44.25,
"learning_rate": 4.646228196028091e-06,
"loss": 0.287,
"step": 7140
},
{
"epoch": 1.613631234484315,
"grad_norm": 0.00049591064453125,
"learning_rate": 4.638677036925169e-06,
"loss": 0.2296,
"step": 7150
},
{
"epoch": 1.6158880613856916,
"grad_norm": 0.000514984130859375,
"learning_rate": 4.631125877822246e-06,
"loss": 0.0,
"step": 7160
},
{
"epoch": 1.6181448882870684,
"grad_norm": 0.0059814453125,
"learning_rate": 4.623574718719323e-06,
"loss": 0.0,
"step": 7170
},
{
"epoch": 1.6204017151884451,
"grad_norm": 0.00185394287109375,
"learning_rate": 4.616023559616402e-06,
"loss": 0.0002,
"step": 7180
},
{
"epoch": 1.6226585420898219,
"grad_norm": 0.00064849853515625,
"learning_rate": 4.608472400513479e-06,
"loss": 0.0,
"step": 7190
},
{
"epoch": 1.6249153689911984,
"grad_norm": 0.00125885009765625,
"learning_rate": 4.600921241410557e-06,
"loss": 0.0,
"step": 7200
},
{
"epoch": 1.627172195892575,
"grad_norm": 0.0007171630859375,
"learning_rate": 4.593370082307634e-06,
"loss": 0.0,
"step": 7210
},
{
"epoch": 1.6294290227939516,
"grad_norm": 0.00335693359375,
"learning_rate": 4.585818923204712e-06,
"loss": 0.0,
"step": 7220
},
{
"epoch": 1.6316858496953284,
"grad_norm": 0.0038909912109375,
"learning_rate": 4.57826776410179e-06,
"loss": 0.2573,
"step": 7230
},
{
"epoch": 1.6339426765967051,
"grad_norm": 0.0015869140625,
"learning_rate": 4.570716604998868e-06,
"loss": 0.0,
"step": 7240
},
{
"epoch": 1.6361995034980819,
"grad_norm": 0.0031585693359375,
"learning_rate": 4.563165445895945e-06,
"loss": 0.2589,
"step": 7250
},
{
"epoch": 1.6384563303994584,
"grad_norm": 0.0035552978515625,
"learning_rate": 4.555614286793023e-06,
"loss": 0.0,
"step": 7260
},
{
"epoch": 1.640713157300835,
"grad_norm": 0.020751953125,
"learning_rate": 4.548063127690101e-06,
"loss": 0.2702,
"step": 7270
},
{
"epoch": 1.6429699842022116,
"grad_norm": 0.00457763671875,
"learning_rate": 4.540511968587179e-06,
"loss": 0.0,
"step": 7280
},
{
"epoch": 1.6452268111035884,
"grad_norm": 0.0078125,
"learning_rate": 4.5329608094842564e-06,
"loss": 0.0,
"step": 7290
},
{
"epoch": 1.6474836380049651,
"grad_norm": 0.00579833984375,
"learning_rate": 4.525409650381334e-06,
"loss": 0.0,
"step": 7300
},
{
"epoch": 1.6497404649063416,
"grad_norm": 0.0003147125244140625,
"learning_rate": 4.517858491278412e-06,
"loss": 0.0,
"step": 7310
},
{
"epoch": 1.6519972918077184,
"grad_norm": 0.00286865234375,
"learning_rate": 4.510307332175489e-06,
"loss": 0.1944,
"step": 7320
},
{
"epoch": 1.654254118709095,
"grad_norm": 0.004638671875,
"learning_rate": 4.5027561730725675e-06,
"loss": 0.0,
"step": 7330
},
{
"epoch": 1.6565109456104716,
"grad_norm": 0.002685546875,
"learning_rate": 4.495205013969644e-06,
"loss": 0.0,
"step": 7340
},
{
"epoch": 1.6587677725118484,
"grad_norm": 0.00156402587890625,
"learning_rate": 4.487653854866722e-06,
"loss": 0.0,
"step": 7350
},
{
"epoch": 1.6610245994132251,
"grad_norm": 0.000568389892578125,
"learning_rate": 4.4801026957638e-06,
"loss": 0.0,
"step": 7360
},
{
"epoch": 1.6632814263146016,
"grad_norm": 0.01422119140625,
"learning_rate": 4.472551536660878e-06,
"loss": 0.0,
"step": 7370
},
{
"epoch": 1.6655382532159784,
"grad_norm": 0.0009613037109375,
"learning_rate": 4.4650003775579554e-06,
"loss": 0.0,
"step": 7380
},
{
"epoch": 1.6677950801173549,
"grad_norm": 0.00055694580078125,
"learning_rate": 4.457449218455033e-06,
"loss": 0.2759,
"step": 7390
},
{
"epoch": 1.6700519070187316,
"grad_norm": 28.0,
"learning_rate": 4.449898059352111e-06,
"loss": 0.0055,
"step": 7400
},
{
"epoch": 1.6723087339201084,
"grad_norm": 0.0010528564453125,
"learning_rate": 4.442346900249189e-06,
"loss": 0.0,
"step": 7410
},
{
"epoch": 1.674565560821485,
"grad_norm": 0.00034332275390625,
"learning_rate": 4.4347957411462665e-06,
"loss": 0.0,
"step": 7420
},
{
"epoch": 1.6768223877228616,
"grad_norm": 0.0086669921875,
"learning_rate": 4.427244582043344e-06,
"loss": 0.0757,
"step": 7430
},
{
"epoch": 1.6790792146242384,
"grad_norm": 0.00023651123046875,
"learning_rate": 4.419693422940421e-06,
"loss": 0.0,
"step": 7440
},
{
"epoch": 1.6813360415256149,
"grad_norm": 0.0011138916015625,
"learning_rate": 4.4121422638375e-06,
"loss": 0.0,
"step": 7450
},
{
"epoch": 1.6835928684269916,
"grad_norm": 0.0016021728515625,
"learning_rate": 4.404591104734577e-06,
"loss": 0.0,
"step": 7460
},
{
"epoch": 1.6858496953283684,
"grad_norm": 0.004730224609375,
"learning_rate": 4.3970399456316544e-06,
"loss": 0.2891,
"step": 7470
},
{
"epoch": 1.688106522229745,
"grad_norm": 0.00030517578125,
"learning_rate": 4.389488786528733e-06,
"loss": 0.0,
"step": 7480
},
{
"epoch": 1.6903633491311216,
"grad_norm": 0.000568389892578125,
"learning_rate": 4.38193762742581e-06,
"loss": 0.0004,
"step": 7490
},
{
"epoch": 1.6926201760324981,
"grad_norm": 0.0004596710205078125,
"learning_rate": 4.3743864683228886e-06,
"loss": 0.0,
"step": 7500
},
{
"epoch": 1.6948770029338749,
"grad_norm": 0.0003757476806640625,
"learning_rate": 4.3668353092199655e-06,
"loss": 0.6468,
"step": 7510
},
{
"epoch": 1.6971338298352516,
"grad_norm": 0.00109100341796875,
"learning_rate": 4.359284150117043e-06,
"loss": 0.0,
"step": 7520
},
{
"epoch": 1.6993906567366284,
"grad_norm": 0.000308990478515625,
"learning_rate": 4.351732991014121e-06,
"loss": 0.0,
"step": 7530
},
{
"epoch": 1.701647483638005,
"grad_norm": 0.000705718994140625,
"learning_rate": 4.344181831911199e-06,
"loss": 0.0,
"step": 7540
},
{
"epoch": 1.7039043105393816,
"grad_norm": 0.00109100341796875,
"learning_rate": 4.3366306728082765e-06,
"loss": 0.195,
"step": 7550
},
{
"epoch": 1.7061611374407581,
"grad_norm": 0.00023937225341796875,
"learning_rate": 4.329079513705354e-06,
"loss": 0.2883,
"step": 7560
},
{
"epoch": 1.7084179643421349,
"grad_norm": 0.005950927734375,
"learning_rate": 4.321528354602432e-06,
"loss": 0.0,
"step": 7570
},
{
"epoch": 1.7106747912435116,
"grad_norm": 0.001373291015625,
"learning_rate": 4.31397719549951e-06,
"loss": 0.0,
"step": 7580
},
{
"epoch": 1.7129316181448884,
"grad_norm": 0.00046539306640625,
"learning_rate": 4.306426036396587e-06,
"loss": 0.0,
"step": 7590
},
{
"epoch": 1.715188445046265,
"grad_norm": 8.6875,
"learning_rate": 4.298874877293665e-06,
"loss": 0.0005,
"step": 7600
},
{
"epoch": 1.7174452719476416,
"grad_norm": 0.004058837890625,
"learning_rate": 4.291323718190742e-06,
"loss": 0.2448,
"step": 7610
},
{
"epoch": 1.7197020988490181,
"grad_norm": 0.001678466796875,
"learning_rate": 4.28377255908782e-06,
"loss": 0.0,
"step": 7620
},
{
"epoch": 1.7219589257503949,
"grad_norm": 9.012222290039062e-05,
"learning_rate": 4.276221399984898e-06,
"loss": 0.0,
"step": 7630
},
{
"epoch": 1.7242157526517716,
"grad_norm": 0.007354736328125,
"learning_rate": 4.2686702408819755e-06,
"loss": 0.0,
"step": 7640
},
{
"epoch": 1.7264725795531484,
"grad_norm": 0.00131988525390625,
"learning_rate": 4.261119081779053e-06,
"loss": 0.0,
"step": 7650
},
{
"epoch": 1.7287294064545249,
"grad_norm": 0.0031890869140625,
"learning_rate": 4.253567922676131e-06,
"loss": 0.0,
"step": 7660
},
{
"epoch": 1.7309862333559016,
"grad_norm": 0.00174713134765625,
"learning_rate": 4.246016763573209e-06,
"loss": 0.0,
"step": 7670
},
{
"epoch": 1.7332430602572781,
"grad_norm": 0.00029754638671875,
"learning_rate": 4.2384656044702866e-06,
"loss": 0.0,
"step": 7680
},
{
"epoch": 1.7354998871586549,
"grad_norm": 0.0001964569091796875,
"learning_rate": 4.230914445367364e-06,
"loss": 0.0,
"step": 7690
},
{
"epoch": 1.7377567140600316,
"grad_norm": 0.0036163330078125,
"learning_rate": 4.223363286264442e-06,
"loss": 0.1878,
"step": 7700
},
{
"epoch": 1.7400135409614084,
"grad_norm": 0.0152587890625,
"learning_rate": 4.21581212716152e-06,
"loss": 0.0,
"step": 7710
},
{
"epoch": 1.7422703678627849,
"grad_norm": 0.041015625,
"learning_rate": 4.208260968058598e-06,
"loss": 0.0,
"step": 7720
},
{
"epoch": 1.7445271947641616,
"grad_norm": 0.000701904296875,
"learning_rate": 4.200709808955675e-06,
"loss": 0.0,
"step": 7730
},
{
"epoch": 1.7467840216655381,
"grad_norm": 0.000568389892578125,
"learning_rate": 4.193158649852752e-06,
"loss": 0.24,
"step": 7740
},
{
"epoch": 1.7490408485669149,
"grad_norm": 0.01068115234375,
"learning_rate": 4.185607490749831e-06,
"loss": 0.0,
"step": 7750
},
{
"epoch": 1.7512976754682916,
"grad_norm": 0.001220703125,
"learning_rate": 4.178056331646908e-06,
"loss": 0.2857,
"step": 7760
},
{
"epoch": 1.7535545023696684,
"grad_norm": 0.0022430419921875,
"learning_rate": 4.170505172543986e-06,
"loss": 0.0,
"step": 7770
},
{
"epoch": 1.7558113292710449,
"grad_norm": 0.00173187255859375,
"learning_rate": 4.162954013441063e-06,
"loss": 0.0001,
"step": 7780
},
{
"epoch": 1.7580681561724216,
"grad_norm": 0.002044677734375,
"learning_rate": 4.155402854338141e-06,
"loss": 0.0,
"step": 7790
},
{
"epoch": 1.7603249830737981,
"grad_norm": 0.005859375,
"learning_rate": 4.147851695235219e-06,
"loss": 0.0,
"step": 7800
},
{
"epoch": 1.7625818099751749,
"grad_norm": 0.000335693359375,
"learning_rate": 4.140300536132297e-06,
"loss": 0.0,
"step": 7810
},
{
"epoch": 1.7648386368765516,
"grad_norm": 0.003814697265625,
"learning_rate": 4.132749377029374e-06,
"loss": 0.0,
"step": 7820
},
{
"epoch": 1.7670954637779284,
"grad_norm": 0.002349853515625,
"learning_rate": 4.125198217926452e-06,
"loss": 0.0,
"step": 7830
},
{
"epoch": 1.7693522906793049,
"grad_norm": 0.00189208984375,
"learning_rate": 4.11764705882353e-06,
"loss": 0.0,
"step": 7840
},
{
"epoch": 1.7716091175806814,
"grad_norm": 0.0025482177734375,
"learning_rate": 4.110095899720608e-06,
"loss": 0.2085,
"step": 7850
},
{
"epoch": 1.7738659444820581,
"grad_norm": 0.001220703125,
"learning_rate": 4.1025447406176846e-06,
"loss": 0.0,
"step": 7860
},
{
"epoch": 1.7761227713834349,
"grad_norm": 0.000881195068359375,
"learning_rate": 4.094993581514763e-06,
"loss": 0.0,
"step": 7870
},
{
"epoch": 1.7783795982848116,
"grad_norm": 0.000736236572265625,
"learning_rate": 4.08744242241184e-06,
"loss": 0.0,
"step": 7880
},
{
"epoch": 1.7806364251861884,
"grad_norm": 0.0113525390625,
"learning_rate": 4.079891263308918e-06,
"loss": 0.0,
"step": 7890
},
{
"epoch": 1.7828932520875649,
"grad_norm": 0.00067901611328125,
"learning_rate": 4.072340104205996e-06,
"loss": 0.0,
"step": 7900
},
{
"epoch": 1.7851500789889414,
"grad_norm": 0.00138092041015625,
"learning_rate": 4.064788945103073e-06,
"loss": 0.0,
"step": 7910
},
{
"epoch": 1.7874069058903181,
"grad_norm": 0.0015106201171875,
"learning_rate": 4.057237786000152e-06,
"loss": 0.0,
"step": 7920
},
{
"epoch": 1.7896637327916949,
"grad_norm": 0.00390625,
"learning_rate": 4.049686626897229e-06,
"loss": 0.0,
"step": 7930
},
{
"epoch": 1.7919205596930716,
"grad_norm": 0.004547119140625,
"learning_rate": 4.042135467794307e-06,
"loss": 0.0,
"step": 7940
},
{
"epoch": 1.7941773865944484,
"grad_norm": 0.000423431396484375,
"learning_rate": 4.034584308691384e-06,
"loss": 0.0,
"step": 7950
},
{
"epoch": 1.7964342134958249,
"grad_norm": 0.0036468505859375,
"learning_rate": 4.027033149588462e-06,
"loss": 0.0,
"step": 7960
},
{
"epoch": 1.7986910403972014,
"grad_norm": 0.00157928466796875,
"learning_rate": 4.01948199048554e-06,
"loss": 0.051,
"step": 7970
},
{
"epoch": 1.8009478672985781,
"grad_norm": 0.0004138946533203125,
"learning_rate": 4.011930831382618e-06,
"loss": 0.0,
"step": 7980
},
{
"epoch": 1.8032046941999549,
"grad_norm": 0.0005035400390625,
"learning_rate": 4.0043796722796955e-06,
"loss": 0.0,
"step": 7990
},
{
"epoch": 1.8054615211013316,
"grad_norm": 0.0003814697265625,
"learning_rate": 3.996828513176773e-06,
"loss": 0.1119,
"step": 8000
},
{
"epoch": 1.8077183480027081,
"grad_norm": 0.0019073486328125,
"learning_rate": 3.98927735407385e-06,
"loss": 0.0,
"step": 8010
},
{
"epoch": 1.8099751749040849,
"grad_norm": 0.001129150390625,
"learning_rate": 3.981726194970929e-06,
"loss": 0.0,
"step": 8020
},
{
"epoch": 1.8122320018054614,
"grad_norm": 0.00016307830810546875,
"learning_rate": 3.974175035868006e-06,
"loss": 0.0,
"step": 8030
},
{
"epoch": 1.8144888287068381,
"grad_norm": 0.00106048583984375,
"learning_rate": 3.966623876765084e-06,
"loss": 0.0,
"step": 8040
},
{
"epoch": 1.8167456556082149,
"grad_norm": 0.001220703125,
"learning_rate": 3.959072717662161e-06,
"loss": 0.0,
"step": 8050
},
{
"epoch": 1.8190024825095916,
"grad_norm": 0.0002155303955078125,
"learning_rate": 3.951521558559239e-06,
"loss": 0.0332,
"step": 8060
},
{
"epoch": 1.8212593094109681,
"grad_norm": 0.00019741058349609375,
"learning_rate": 3.943970399456317e-06,
"loss": 0.0,
"step": 8070
},
{
"epoch": 1.8235161363123449,
"grad_norm": 0.00012063980102539062,
"learning_rate": 3.9364192403533945e-06,
"loss": 0.0,
"step": 8080
},
{
"epoch": 1.8257729632137214,
"grad_norm": 0.004180908203125,
"learning_rate": 3.928868081250472e-06,
"loss": 0.0,
"step": 8090
},
{
"epoch": 1.8280297901150981,
"grad_norm": 0.0003299713134765625,
"learning_rate": 3.92131692214755e-06,
"loss": 0.0,
"step": 8100
},
{
"epoch": 1.8302866170164749,
"grad_norm": 0.000904083251953125,
"learning_rate": 3.913765763044628e-06,
"loss": 0.0,
"step": 8110
},
{
"epoch": 1.8325434439178516,
"grad_norm": 0.0008544921875,
"learning_rate": 3.9062146039417055e-06,
"loss": 0.0,
"step": 8120
},
{
"epoch": 1.8348002708192281,
"grad_norm": 0.00022792816162109375,
"learning_rate": 3.898663444838783e-06,
"loss": 0.0,
"step": 8130
},
{
"epoch": 1.8370570977206049,
"grad_norm": 0.000637054443359375,
"learning_rate": 3.891112285735861e-06,
"loss": 0.3226,
"step": 8140
},
{
"epoch": 1.8393139246219814,
"grad_norm": 0.00665283203125,
"learning_rate": 3.883561126632939e-06,
"loss": 0.0,
"step": 8150
},
{
"epoch": 1.8415707515233581,
"grad_norm": 0.0030975341796875,
"learning_rate": 3.8760099675300165e-06,
"loss": 0.303,
"step": 8160
},
{
"epoch": 1.8438275784247349,
"grad_norm": 0.0003681182861328125,
"learning_rate": 3.868458808427094e-06,
"loss": 0.0,
"step": 8170
},
{
"epoch": 1.8460844053261116,
"grad_norm": 0.0133056640625,
"learning_rate": 3.860907649324171e-06,
"loss": 0.6187,
"step": 8180
},
{
"epoch": 1.8483412322274881,
"grad_norm": 0.0015411376953125,
"learning_rate": 3.85335649022125e-06,
"loss": 0.0,
"step": 8190
},
{
"epoch": 1.8505980591288647,
"grad_norm": 0.001708984375,
"learning_rate": 3.845805331118327e-06,
"loss": 0.0025,
"step": 8200
},
{
"epoch": 1.8528548860302414,
"grad_norm": 0.00830078125,
"learning_rate": 3.8382541720154045e-06,
"loss": 0.0,
"step": 8210
},
{
"epoch": 1.8551117129316181,
"grad_norm": 0.00390625,
"learning_rate": 3.830703012912482e-06,
"loss": 0.0,
"step": 8220
},
{
"epoch": 1.8573685398329949,
"grad_norm": 0.00098419189453125,
"learning_rate": 3.82315185380956e-06,
"loss": 0.3487,
"step": 8230
},
{
"epoch": 1.8596253667343716,
"grad_norm": 0.007049560546875,
"learning_rate": 3.815600694706638e-06,
"loss": 0.2516,
"step": 8240
},
{
"epoch": 1.8618821936357481,
"grad_norm": 0.00165557861328125,
"learning_rate": 3.8080495356037155e-06,
"loss": 0.0,
"step": 8250
},
{
"epoch": 1.8641390205371247,
"grad_norm": 0.005218505859375,
"learning_rate": 3.800498376500793e-06,
"loss": 0.0,
"step": 8260
},
{
"epoch": 1.8663958474385014,
"grad_norm": 0.00144195556640625,
"learning_rate": 3.792947217397871e-06,
"loss": 0.0305,
"step": 8270
},
{
"epoch": 1.8686526743398781,
"grad_norm": 0.00064849853515625,
"learning_rate": 3.7853960582949484e-06,
"loss": 0.1983,
"step": 8280
},
{
"epoch": 1.8709095012412549,
"grad_norm": 0.00897216796875,
"learning_rate": 3.777844899192026e-06,
"loss": 0.0,
"step": 8290
},
{
"epoch": 1.8731663281426316,
"grad_norm": 0.0009918212890625,
"learning_rate": 3.770293740089104e-06,
"loss": 0.0,
"step": 8300
},
{
"epoch": 1.8754231550440081,
"grad_norm": 0.002105712890625,
"learning_rate": 3.7627425809861817e-06,
"loss": 0.0,
"step": 8310
},
{
"epoch": 1.8776799819453847,
"grad_norm": 0.01226806640625,
"learning_rate": 3.755191421883259e-06,
"loss": 0.0,
"step": 8320
},
{
"epoch": 1.8799368088467614,
"grad_norm": 0.00830078125,
"learning_rate": 3.7476402627803372e-06,
"loss": 0.0,
"step": 8330
},
{
"epoch": 1.8821936357481381,
"grad_norm": 0.01190185546875,
"learning_rate": 3.7400891036774146e-06,
"loss": 0.0,
"step": 8340
},
{
"epoch": 1.8844504626495149,
"grad_norm": 0.0004444122314453125,
"learning_rate": 3.7325379445744923e-06,
"loss": 0.1238,
"step": 8350
},
{
"epoch": 1.8867072895508914,
"grad_norm": 0.00946044921875,
"learning_rate": 3.7249867854715705e-06,
"loss": 0.0,
"step": 8360
},
{
"epoch": 1.8889641164522681,
"grad_norm": 0.004608154296875,
"learning_rate": 3.717435626368648e-06,
"loss": 0.0,
"step": 8370
},
{
"epoch": 1.8912209433536447,
"grad_norm": 0.000576019287109375,
"learning_rate": 3.709884467265726e-06,
"loss": 0.0,
"step": 8380
},
{
"epoch": 1.8934777702550214,
"grad_norm": 0.019775390625,
"learning_rate": 3.7023333081628034e-06,
"loss": 0.0,
"step": 8390
},
{
"epoch": 1.8957345971563981,
"grad_norm": 0.001556396484375,
"learning_rate": 3.694782149059881e-06,
"loss": 0.1019,
"step": 8400
},
{
"epoch": 1.8979914240577749,
"grad_norm": 0.00089263916015625,
"learning_rate": 3.6872309899569585e-06,
"loss": 0.6805,
"step": 8410
},
{
"epoch": 1.9002482509591514,
"grad_norm": 0.0027618408203125,
"learning_rate": 3.6796798308540366e-06,
"loss": 0.2843,
"step": 8420
},
{
"epoch": 1.9025050778605281,
"grad_norm": 0.002288818359375,
"learning_rate": 3.672128671751114e-06,
"loss": 0.0,
"step": 8430
},
{
"epoch": 1.9047619047619047,
"grad_norm": 0.0033111572265625,
"learning_rate": 3.664577512648192e-06,
"loss": 0.0191,
"step": 8440
},
{
"epoch": 1.9070187316632814,
"grad_norm": 0.001129150390625,
"learning_rate": 3.6570263535452695e-06,
"loss": 0.3397,
"step": 8450
},
{
"epoch": 1.9092755585646581,
"grad_norm": 0.001190185546875,
"learning_rate": 3.6494751944423473e-06,
"loss": 0.0,
"step": 8460
},
{
"epoch": 1.9115323854660349,
"grad_norm": 0.0184326171875,
"learning_rate": 3.6419240353394246e-06,
"loss": 0.0,
"step": 8470
},
{
"epoch": 1.9137892123674114,
"grad_norm": 0.0087890625,
"learning_rate": 3.6343728762365028e-06,
"loss": 0.0,
"step": 8480
},
{
"epoch": 1.9160460392687881,
"grad_norm": 0.00193023681640625,
"learning_rate": 3.62682171713358e-06,
"loss": 0.0,
"step": 8490
},
{
"epoch": 1.9183028661701647,
"grad_norm": 0.006805419921875,
"learning_rate": 3.619270558030658e-06,
"loss": 0.0,
"step": 8500
},
{
"epoch": 1.9205596930715414,
"grad_norm": 0.0019683837890625,
"learning_rate": 3.6117193989277356e-06,
"loss": 0.0,
"step": 8510
},
{
"epoch": 1.9228165199729181,
"grad_norm": 0.0028533935546875,
"learning_rate": 3.6041682398248134e-06,
"loss": 0.0,
"step": 8520
},
{
"epoch": 1.9250733468742949,
"grad_norm": 37.5,
"learning_rate": 3.5966170807218907e-06,
"loss": 0.0852,
"step": 8530
},
{
"epoch": 1.9273301737756714,
"grad_norm": 0.00604248046875,
"learning_rate": 3.589065921618969e-06,
"loss": 0.0012,
"step": 8540
},
{
"epoch": 1.929587000677048,
"grad_norm": 0.0025787353515625,
"learning_rate": 3.5815147625160463e-06,
"loss": 0.0,
"step": 8550
},
{
"epoch": 1.9318438275784247,
"grad_norm": 0.00112152099609375,
"learning_rate": 3.573963603413124e-06,
"loss": 0.0,
"step": 8560
},
{
"epoch": 1.9341006544798014,
"grad_norm": 0.0019683837890625,
"learning_rate": 3.566412444310202e-06,
"loss": 0.0,
"step": 8570
},
{
"epoch": 1.9363574813811781,
"grad_norm": 0.0019683837890625,
"learning_rate": 3.5588612852072795e-06,
"loss": 0.3208,
"step": 8580
},
{
"epoch": 1.9386143082825549,
"grad_norm": 0.00160980224609375,
"learning_rate": 3.5513101261043577e-06,
"loss": 0.0,
"step": 8590
},
{
"epoch": 1.9408711351839314,
"grad_norm": 0.0021514892578125,
"learning_rate": 3.543758967001435e-06,
"loss": 0.0,
"step": 8600
},
{
"epoch": 1.943127962085308,
"grad_norm": 0.0024261474609375,
"learning_rate": 3.536207807898513e-06,
"loss": 0.0,
"step": 8610
},
{
"epoch": 1.9453847889866847,
"grad_norm": 0.00347900390625,
"learning_rate": 3.52865664879559e-06,
"loss": 0.0,
"step": 8620
},
{
"epoch": 1.9476416158880614,
"grad_norm": 0.002838134765625,
"learning_rate": 3.5211054896926683e-06,
"loss": 0.0,
"step": 8630
},
{
"epoch": 1.9498984427894381,
"grad_norm": 0.000637054443359375,
"learning_rate": 3.5135543305897457e-06,
"loss": 0.0343,
"step": 8640
},
{
"epoch": 1.9521552696908149,
"grad_norm": 0.006072998046875,
"learning_rate": 3.506003171486824e-06,
"loss": 0.0,
"step": 8650
},
{
"epoch": 1.9544120965921914,
"grad_norm": 0.0037994384765625,
"learning_rate": 3.498452012383901e-06,
"loss": 0.0,
"step": 8660
},
{
"epoch": 1.956668923493568,
"grad_norm": 0.04638671875,
"learning_rate": 3.490900853280979e-06,
"loss": 0.2184,
"step": 8670
},
{
"epoch": 1.9589257503949447,
"grad_norm": 0.000858306884765625,
"learning_rate": 3.4833496941780563e-06,
"loss": 0.0,
"step": 8680
},
{
"epoch": 1.9611825772963214,
"grad_norm": 0.0023040771484375,
"learning_rate": 3.4757985350751345e-06,
"loss": 0.0,
"step": 8690
},
{
"epoch": 1.9634394041976981,
"grad_norm": 0.0012054443359375,
"learning_rate": 3.468247375972212e-06,
"loss": 0.0,
"step": 8700
},
{
"epoch": 1.9656962310990747,
"grad_norm": 0.0004787445068359375,
"learning_rate": 3.46069621686929e-06,
"loss": 0.0,
"step": 8710
},
{
"epoch": 1.9679530580004514,
"grad_norm": 0.00020313262939453125,
"learning_rate": 3.4531450577663673e-06,
"loss": 0.0,
"step": 8720
},
{
"epoch": 1.970209884901828,
"grad_norm": 131.0,
"learning_rate": 3.445593898663445e-06,
"loss": 0.2685,
"step": 8730
},
{
"epoch": 1.9724667118032047,
"grad_norm": 0.0003910064697265625,
"learning_rate": 3.4380427395605224e-06,
"loss": 0.0,
"step": 8740
},
{
"epoch": 1.9747235387045814,
"grad_norm": 0.0023345947265625,
"learning_rate": 3.4304915804576006e-06,
"loss": 0.0,
"step": 8750
},
{
"epoch": 1.9769803656059581,
"grad_norm": 0.00064849853515625,
"learning_rate": 3.422940421354678e-06,
"loss": 0.2698,
"step": 8760
},
{
"epoch": 1.9792371925073347,
"grad_norm": 0.000713348388671875,
"learning_rate": 3.4153892622517557e-06,
"loss": 0.2504,
"step": 8770
},
{
"epoch": 1.9814940194087114,
"grad_norm": 0.000518798828125,
"learning_rate": 3.407838103148834e-06,
"loss": 0.0,
"step": 8780
},
{
"epoch": 1.983750846310088,
"grad_norm": 0.004425048828125,
"learning_rate": 3.4002869440459112e-06,
"loss": 0.0555,
"step": 8790
},
{
"epoch": 1.9860076732114647,
"grad_norm": 0.000774383544921875,
"learning_rate": 3.3927357849429894e-06,
"loss": 0.1858,
"step": 8800
},
{
"epoch": 1.9882645001128414,
"grad_norm": 0.0018768310546875,
"learning_rate": 3.3851846258400668e-06,
"loss": 0.0,
"step": 8810
},
{
"epoch": 1.9905213270142181,
"grad_norm": 0.01324462890625,
"learning_rate": 3.3776334667371445e-06,
"loss": 0.1943,
"step": 8820
},
{
"epoch": 1.9927781539155947,
"grad_norm": 49.0,
"learning_rate": 3.370082307634222e-06,
"loss": 0.1281,
"step": 8830
},
{
"epoch": 1.9950349808169714,
"grad_norm": 0.00034332275390625,
"learning_rate": 3.3625311485313e-06,
"loss": 0.3255,
"step": 8840
},
{
"epoch": 1.997291807718348,
"grad_norm": 0.00057220458984375,
"learning_rate": 3.3549799894283774e-06,
"loss": 0.0,
"step": 8850
},
{
"epoch": 1.9995486346197247,
"grad_norm": 0.00177001953125,
"learning_rate": 3.3474288303254556e-06,
"loss": 0.0,
"step": 8860
},
{
"epoch": 2.0018054615211014,
"grad_norm": 0.003814697265625,
"learning_rate": 3.339877671222533e-06,
"loss": 0.0095,
"step": 8870
},
{
"epoch": 2.004062288422478,
"grad_norm": 0.0002956390380859375,
"learning_rate": 3.3323265121196107e-06,
"loss": 0.0,
"step": 8880
},
{
"epoch": 2.006319115323855,
"grad_norm": 0.0022125244140625,
"learning_rate": 3.324775353016688e-06,
"loss": 0.0587,
"step": 8890
},
{
"epoch": 2.008575942225231,
"grad_norm": 0.00084686279296875,
"learning_rate": 3.317224193913766e-06,
"loss": 0.0,
"step": 8900
},
{
"epoch": 2.010832769126608,
"grad_norm": 0.004150390625,
"learning_rate": 3.3096730348108435e-06,
"loss": 0.0,
"step": 8910
},
{
"epoch": 2.0130895960279847,
"grad_norm": 0.00162506103515625,
"learning_rate": 3.3021218757079217e-06,
"loss": 0.0,
"step": 8920
},
{
"epoch": 2.0153464229293614,
"grad_norm": 50.0,
"learning_rate": 3.294570716604999e-06,
"loss": 0.2458,
"step": 8930
},
{
"epoch": 2.017603249830738,
"grad_norm": 0.0091552734375,
"learning_rate": 3.287019557502077e-06,
"loss": 0.0,
"step": 8940
},
{
"epoch": 2.0198600767321144,
"grad_norm": 0.01055908203125,
"learning_rate": 3.279468398399154e-06,
"loss": 0.0,
"step": 8950
},
{
"epoch": 2.022116903633491,
"grad_norm": 0.00022029876708984375,
"learning_rate": 3.2719172392962323e-06,
"loss": 0.1657,
"step": 8960
},
{
"epoch": 2.024373730534868,
"grad_norm": 0.0032806396484375,
"learning_rate": 3.2643660801933097e-06,
"loss": 0.0002,
"step": 8970
},
{
"epoch": 2.0266305574362447,
"grad_norm": 0.0032196044921875,
"learning_rate": 3.256814921090388e-06,
"loss": 0.0,
"step": 8980
},
{
"epoch": 2.0288873843376214,
"grad_norm": 0.00020885467529296875,
"learning_rate": 3.249263761987465e-06,
"loss": 0.0,
"step": 8990
},
{
"epoch": 2.031144211238998,
"grad_norm": 0.004425048828125,
"learning_rate": 3.241712602884543e-06,
"loss": 0.1935,
"step": 9000
},
{
"epoch": 2.0334010381403744,
"grad_norm": 0.000484466552734375,
"learning_rate": 3.234161443781621e-06,
"loss": 0.0,
"step": 9010
},
{
"epoch": 2.035657865041751,
"grad_norm": 0.00032806396484375,
"learning_rate": 3.2266102846786985e-06,
"loss": 0.0,
"step": 9020
},
{
"epoch": 2.037914691943128,
"grad_norm": 0.022216796875,
"learning_rate": 3.2190591255757762e-06,
"loss": 0.0,
"step": 9030
},
{
"epoch": 2.0401715188445047,
"grad_norm": 0.0016021728515625,
"learning_rate": 3.211507966472854e-06,
"loss": 0.1482,
"step": 9040
},
{
"epoch": 2.0424283457458814,
"grad_norm": 0.00244140625,
"learning_rate": 3.2039568073699318e-06,
"loss": 0.0,
"step": 9050
},
{
"epoch": 2.044685172647258,
"grad_norm": 0.0157470703125,
"learning_rate": 3.196405648267009e-06,
"loss": 0.2466,
"step": 9060
},
{
"epoch": 2.0469419995486344,
"grad_norm": 0.001495361328125,
"learning_rate": 3.1888544891640873e-06,
"loss": 0.0,
"step": 9070
},
{
"epoch": 2.049198826450011,
"grad_norm": 0.000934600830078125,
"learning_rate": 3.1813033300611646e-06,
"loss": 0.0,
"step": 9080
},
{
"epoch": 2.051455653351388,
"grad_norm": 39.0,
"learning_rate": 3.1737521709582424e-06,
"loss": 0.1776,
"step": 9090
},
{
"epoch": 2.0537124802527646,
"grad_norm": 0.0022125244140625,
"learning_rate": 3.1662010118553197e-06,
"loss": 0.0,
"step": 9100
},
{
"epoch": 2.0559693071541414,
"grad_norm": 0.00016880035400390625,
"learning_rate": 3.158649852752398e-06,
"loss": 0.0,
"step": 9110
},
{
"epoch": 2.058226134055518,
"grad_norm": 0.006927490234375,
"learning_rate": 3.1510986936494752e-06,
"loss": 0.0,
"step": 9120
},
{
"epoch": 2.0604829609568944,
"grad_norm": 0.004730224609375,
"learning_rate": 3.1435475345465534e-06,
"loss": 0.0,
"step": 9130
},
{
"epoch": 2.062739787858271,
"grad_norm": 0.000957489013671875,
"learning_rate": 3.1359963754436308e-06,
"loss": 0.0,
"step": 9140
},
{
"epoch": 2.064996614759648,
"grad_norm": 0.0004863739013671875,
"learning_rate": 3.1284452163407085e-06,
"loss": 0.0,
"step": 9150
},
{
"epoch": 2.0672534416610246,
"grad_norm": 0.0003566741943359375,
"learning_rate": 3.120894057237786e-06,
"loss": 0.0,
"step": 9160
},
{
"epoch": 2.0695102685624014,
"grad_norm": 0.0024261474609375,
"learning_rate": 3.113342898134864e-06,
"loss": 0.0538,
"step": 9170
},
{
"epoch": 2.071767095463778,
"grad_norm": 0.005462646484375,
"learning_rate": 3.1057917390319414e-06,
"loss": 0.0,
"step": 9180
},
{
"epoch": 2.0740239223651544,
"grad_norm": 0.001190185546875,
"learning_rate": 3.0982405799290196e-06,
"loss": 0.0,
"step": 9190
},
{
"epoch": 2.076280749266531,
"grad_norm": 0.000926971435546875,
"learning_rate": 3.090689420826097e-06,
"loss": 0.0,
"step": 9200
},
{
"epoch": 2.078537576167908,
"grad_norm": 0.00518798828125,
"learning_rate": 3.0831382617231747e-06,
"loss": 0.0,
"step": 9210
},
{
"epoch": 2.0807944030692846,
"grad_norm": 0.00011110305786132812,
"learning_rate": 3.075587102620253e-06,
"loss": 0.0,
"step": 9220
},
{
"epoch": 2.0830512299706614,
"grad_norm": 0.001312255859375,
"learning_rate": 3.06803594351733e-06,
"loss": 0.0,
"step": 9230
},
{
"epoch": 2.085308056872038,
"grad_norm": 0.0027313232421875,
"learning_rate": 3.060484784414408e-06,
"loss": 0.0,
"step": 9240
},
{
"epoch": 2.0875648837734144,
"grad_norm": 0.00107574462890625,
"learning_rate": 3.0529336253114857e-06,
"loss": 0.0,
"step": 9250
},
{
"epoch": 2.089821710674791,
"grad_norm": 0.0025634765625,
"learning_rate": 3.0453824662085635e-06,
"loss": 0.0,
"step": 9260
},
{
"epoch": 2.092078537576168,
"grad_norm": 0.0002613067626953125,
"learning_rate": 3.037831307105641e-06,
"loss": 0.0,
"step": 9270
},
{
"epoch": 2.0943353644775446,
"grad_norm": 0.009521484375,
"learning_rate": 3.030280148002719e-06,
"loss": 0.2726,
"step": 9280
},
{
"epoch": 2.0965921913789214,
"grad_norm": 0.000850677490234375,
"learning_rate": 3.0227289888997963e-06,
"loss": 0.1702,
"step": 9290
},
{
"epoch": 2.0988490182802977,
"grad_norm": 0.0023193359375,
"learning_rate": 3.015177829796874e-06,
"loss": 0.0,
"step": 9300
},
{
"epoch": 2.1011058451816744,
"grad_norm": 0.002685546875,
"learning_rate": 3.007626670693952e-06,
"loss": 0.2532,
"step": 9310
},
{
"epoch": 2.103362672083051,
"grad_norm": 0.000301361083984375,
"learning_rate": 3.0000755115910296e-06,
"loss": 0.2287,
"step": 9320
},
{
"epoch": 2.105619498984428,
"grad_norm": 0.003021240234375,
"learning_rate": 2.992524352488107e-06,
"loss": 0.0,
"step": 9330
},
{
"epoch": 2.1078763258858046,
"grad_norm": 0.0054931640625,
"learning_rate": 2.984973193385185e-06,
"loss": 0.1921,
"step": 9340
},
{
"epoch": 2.1101331527871814,
"grad_norm": 0.004974365234375,
"learning_rate": 2.9774220342822625e-06,
"loss": 0.0,
"step": 9350
},
{
"epoch": 2.1123899796885577,
"grad_norm": 0.00909423828125,
"learning_rate": 2.9698708751793402e-06,
"loss": 0.0,
"step": 9360
},
{
"epoch": 2.1146468065899344,
"grad_norm": 0.00469970703125,
"learning_rate": 2.962319716076418e-06,
"loss": 0.0727,
"step": 9370
},
{
"epoch": 2.116903633491311,
"grad_norm": 7.03125,
"learning_rate": 2.9547685569734957e-06,
"loss": 0.0003,
"step": 9380
},
{
"epoch": 2.119160460392688,
"grad_norm": 0.002288818359375,
"learning_rate": 2.947217397870573e-06,
"loss": 0.0001,
"step": 9390
},
{
"epoch": 2.1214172872940646,
"grad_norm": 0.000705718994140625,
"learning_rate": 2.9396662387676513e-06,
"loss": 0.0,
"step": 9400
},
{
"epoch": 2.1236741141954414,
"grad_norm": 113.0,
"learning_rate": 2.9321150796647286e-06,
"loss": 0.2171,
"step": 9410
},
{
"epoch": 2.1259309410968177,
"grad_norm": 0.00238037109375,
"learning_rate": 2.9245639205618064e-06,
"loss": 0.0,
"step": 9420
},
{
"epoch": 2.1281877679981944,
"grad_norm": 0.00445556640625,
"learning_rate": 2.9170127614588837e-06,
"loss": 0.0,
"step": 9430
},
{
"epoch": 2.130444594899571,
"grad_norm": 0.00634765625,
"learning_rate": 2.909461602355962e-06,
"loss": 0.0,
"step": 9440
},
{
"epoch": 2.132701421800948,
"grad_norm": 0.0035552978515625,
"learning_rate": 2.9019104432530396e-06,
"loss": 0.0,
"step": 9450
},
{
"epoch": 2.1349582487023246,
"grad_norm": 41.5,
"learning_rate": 2.8943592841501174e-06,
"loss": 0.2743,
"step": 9460
},
{
"epoch": 2.1372150756037014,
"grad_norm": 0.0028076171875,
"learning_rate": 2.886808125047195e-06,
"loss": 0.0,
"step": 9470
},
{
"epoch": 2.1394719025050777,
"grad_norm": 0.000965118408203125,
"learning_rate": 2.8792569659442725e-06,
"loss": 0.0,
"step": 9480
},
{
"epoch": 2.1417287294064544,
"grad_norm": 0.000728607177734375,
"learning_rate": 2.8717058068413507e-06,
"loss": 0.0,
"step": 9490
},
{
"epoch": 2.143985556307831,
"grad_norm": 0.00048065185546875,
"learning_rate": 2.864154647738428e-06,
"loss": 0.0,
"step": 9500
},
{
"epoch": 2.146242383209208,
"grad_norm": 0.000591278076171875,
"learning_rate": 2.8566034886355058e-06,
"loss": 0.0,
"step": 9510
},
{
"epoch": 2.1484992101105846,
"grad_norm": 0.0022735595703125,
"learning_rate": 2.8490523295325835e-06,
"loss": 0.0,
"step": 9520
},
{
"epoch": 2.1507560370119614,
"grad_norm": 0.00022602081298828125,
"learning_rate": 2.8415011704296613e-06,
"loss": 0.0,
"step": 9530
},
{
"epoch": 2.1530128639133377,
"grad_norm": 0.0015716552734375,
"learning_rate": 2.8339500113267386e-06,
"loss": 0.1853,
"step": 9540
},
{
"epoch": 2.1552696908147144,
"grad_norm": 0.0012054443359375,
"learning_rate": 2.826398852223817e-06,
"loss": 0.0,
"step": 9550
},
{
"epoch": 2.157526517716091,
"grad_norm": 0.0006103515625,
"learning_rate": 2.818847693120894e-06,
"loss": 0.0,
"step": 9560
},
{
"epoch": 2.159783344617468,
"grad_norm": 0.0010223388671875,
"learning_rate": 2.811296534017972e-06,
"loss": 0.0,
"step": 9570
},
{
"epoch": 2.1620401715188446,
"grad_norm": 0.0003719329833984375,
"learning_rate": 2.8037453749150497e-06,
"loss": 0.0,
"step": 9580
},
{
"epoch": 2.1642969984202214,
"grad_norm": 0.01708984375,
"learning_rate": 2.7961942158121275e-06,
"loss": 0.0,
"step": 9590
},
{
"epoch": 2.1665538253215977,
"grad_norm": 7.915496826171875e-05,
"learning_rate": 2.788643056709205e-06,
"loss": 0.0641,
"step": 9600
},
{
"epoch": 2.1688106522229744,
"grad_norm": 121.5,
"learning_rate": 2.781091897606283e-06,
"loss": 0.5332,
"step": 9610
},
{
"epoch": 2.171067479124351,
"grad_norm": 0.0010528564453125,
"learning_rate": 2.7735407385033603e-06,
"loss": 0.2714,
"step": 9620
},
{
"epoch": 2.173324306025728,
"grad_norm": 0.00051116943359375,
"learning_rate": 2.765989579400438e-06,
"loss": 0.0,
"step": 9630
},
{
"epoch": 2.1755811329271046,
"grad_norm": 0.00026702880859375,
"learning_rate": 2.758438420297516e-06,
"loss": 0.0,
"step": 9640
},
{
"epoch": 2.177837959828481,
"grad_norm": 0.00012969970703125,
"learning_rate": 2.7508872611945936e-06,
"loss": 0.0,
"step": 9650
},
{
"epoch": 2.1800947867298577,
"grad_norm": 0.001953125,
"learning_rate": 2.7433361020916714e-06,
"loss": 0.0,
"step": 9660
},
{
"epoch": 2.1823516136312344,
"grad_norm": 0.000766754150390625,
"learning_rate": 2.735784942988749e-06,
"loss": 0.5437,
"step": 9670
},
{
"epoch": 2.184608440532611,
"grad_norm": 0.0069580078125,
"learning_rate": 2.728233783885827e-06,
"loss": 0.0,
"step": 9680
},
{
"epoch": 2.186865267433988,
"grad_norm": 0.00225830078125,
"learning_rate": 2.7206826247829042e-06,
"loss": 0.0,
"step": 9690
},
{
"epoch": 2.1891220943353646,
"grad_norm": 0.00022792816162109375,
"learning_rate": 2.7131314656799824e-06,
"loss": 0.0,
"step": 9700
},
{
"epoch": 2.191378921236741,
"grad_norm": 0.0004405975341796875,
"learning_rate": 2.7055803065770597e-06,
"loss": 0.0,
"step": 9710
},
{
"epoch": 2.1936357481381177,
"grad_norm": 0.00151824951171875,
"learning_rate": 2.6980291474741375e-06,
"loss": 0.0,
"step": 9720
},
{
"epoch": 2.1958925750394944,
"grad_norm": 0.0003032684326171875,
"learning_rate": 2.6904779883712153e-06,
"loss": 0.0,
"step": 9730
},
{
"epoch": 2.198149401940871,
"grad_norm": 0.00084686279296875,
"learning_rate": 2.682926829268293e-06,
"loss": 0.0,
"step": 9740
},
{
"epoch": 2.200406228842248,
"grad_norm": 0.005340576171875,
"learning_rate": 2.6753756701653704e-06,
"loss": 0.0887,
"step": 9750
},
{
"epoch": 2.2026630557436246,
"grad_norm": 0.005218505859375,
"learning_rate": 2.6678245110624485e-06,
"loss": 0.0451,
"step": 9760
},
{
"epoch": 2.204919882645001,
"grad_norm": 0.0027008056640625,
"learning_rate": 2.660273351959526e-06,
"loss": 0.0,
"step": 9770
},
{
"epoch": 2.2071767095463777,
"grad_norm": 0.002044677734375,
"learning_rate": 2.6527221928566036e-06,
"loss": 0.0,
"step": 9780
},
{
"epoch": 2.2094335364477544,
"grad_norm": 0.000885009765625,
"learning_rate": 2.6451710337536814e-06,
"loss": 0.0,
"step": 9790
},
{
"epoch": 2.211690363349131,
"grad_norm": 0.0003299713134765625,
"learning_rate": 2.637619874650759e-06,
"loss": 0.0,
"step": 9800
},
{
"epoch": 2.213947190250508,
"grad_norm": 0.0036773681640625,
"learning_rate": 2.6300687155478365e-06,
"loss": 0.0,
"step": 9810
},
{
"epoch": 2.2162040171518846,
"grad_norm": 0.017578125,
"learning_rate": 2.6225175564449147e-06,
"loss": 0.1628,
"step": 9820
},
{
"epoch": 2.218460844053261,
"grad_norm": 0.001251220703125,
"learning_rate": 2.614966397341992e-06,
"loss": 0.0,
"step": 9830
},
{
"epoch": 2.2207176709546377,
"grad_norm": 0.0003490447998046875,
"learning_rate": 2.6074152382390698e-06,
"loss": 0.0,
"step": 9840
},
{
"epoch": 2.2229744978560144,
"grad_norm": 0.0020294189453125,
"learning_rate": 2.5998640791361475e-06,
"loss": 0.0,
"step": 9850
},
{
"epoch": 2.225231324757391,
"grad_norm": 0.00151824951171875,
"learning_rate": 2.5923129200332253e-06,
"loss": 0.0,
"step": 9860
},
{
"epoch": 2.227488151658768,
"grad_norm": 0.001373291015625,
"learning_rate": 2.5847617609303035e-06,
"loss": 0.0,
"step": 9870
},
{
"epoch": 2.2297449785601446,
"grad_norm": 0.010986328125,
"learning_rate": 2.577210601827381e-06,
"loss": 0.2653,
"step": 9880
},
{
"epoch": 2.232001805461521,
"grad_norm": 0.000423431396484375,
"learning_rate": 2.5696594427244586e-06,
"loss": 0.0,
"step": 9890
},
{
"epoch": 2.2342586323628977,
"grad_norm": 0.0172119140625,
"learning_rate": 2.562108283621536e-06,
"loss": 0.0,
"step": 9900
},
{
"epoch": 2.2365154592642744,
"grad_norm": 0.0008087158203125,
"learning_rate": 2.554557124518614e-06,
"loss": 0.0438,
"step": 9910
},
{
"epoch": 2.238772286165651,
"grad_norm": 0.00173187255859375,
"learning_rate": 2.5470059654156914e-06,
"loss": 0.0,
"step": 9920
},
{
"epoch": 2.241029113067028,
"grad_norm": 0.0001888275146484375,
"learning_rate": 2.5394548063127696e-06,
"loss": 0.0,
"step": 9930
},
{
"epoch": 2.2432859399684046,
"grad_norm": 0.007354736328125,
"learning_rate": 2.531903647209847e-06,
"loss": 0.0,
"step": 9940
},
{
"epoch": 2.245542766869781,
"grad_norm": 0.000286102294921875,
"learning_rate": 2.5243524881069247e-06,
"loss": 0.1885,
"step": 9950
},
{
"epoch": 2.2477995937711577,
"grad_norm": 0.000972747802734375,
"learning_rate": 2.516801329004002e-06,
"loss": 0.0,
"step": 9960
},
{
"epoch": 2.2500564206725344,
"grad_norm": 0.00150299072265625,
"learning_rate": 2.5092501699010802e-06,
"loss": 0.0,
"step": 9970
},
{
"epoch": 2.252313247573911,
"grad_norm": 0.005889892578125,
"learning_rate": 2.5016990107981576e-06,
"loss": 0.0,
"step": 9980
},
{
"epoch": 2.254570074475288,
"grad_norm": 0.00531005859375,
"learning_rate": 2.4941478516952353e-06,
"loss": 0.0114,
"step": 9990
},
{
"epoch": 2.256826901376664,
"grad_norm": 0.0054931640625,
"learning_rate": 2.486596692592313e-06,
"loss": 0.0,
"step": 10000
},
{
"epoch": 2.259083728278041,
"grad_norm": 0.000782012939453125,
"learning_rate": 2.479045533489391e-06,
"loss": 0.206,
"step": 10010
},
{
"epoch": 2.2613405551794177,
"grad_norm": 0.0026702880859375,
"learning_rate": 2.4714943743864686e-06,
"loss": 0.0,
"step": 10020
},
{
"epoch": 2.2635973820807944,
"grad_norm": 0.000499725341796875,
"learning_rate": 2.4639432152835464e-06,
"loss": 0.0,
"step": 10030
},
{
"epoch": 2.265854208982171,
"grad_norm": 37.5,
"learning_rate": 2.456392056180624e-06,
"loss": 0.1826,
"step": 10040
},
{
"epoch": 2.268111035883548,
"grad_norm": 0.00078582763671875,
"learning_rate": 2.4488408970777015e-06,
"loss": 0.0,
"step": 10050
},
{
"epoch": 2.2703678627849246,
"grad_norm": 0.00029754638671875,
"learning_rate": 2.4412897379747792e-06,
"loss": 0.021,
"step": 10060
},
{
"epoch": 2.272624689686301,
"grad_norm": 0.01092529296875,
"learning_rate": 2.433738578871857e-06,
"loss": 0.0,
"step": 10070
},
{
"epoch": 2.2748815165876777,
"grad_norm": 0.00101470947265625,
"learning_rate": 2.4261874197689348e-06,
"loss": 0.2556,
"step": 10080
},
{
"epoch": 2.2771383434890544,
"grad_norm": 0.00022792816162109375,
"learning_rate": 2.4186362606660125e-06,
"loss": 0.0,
"step": 10090
},
{
"epoch": 2.279395170390431,
"grad_norm": 0.00445556640625,
"learning_rate": 2.4110851015630903e-06,
"loss": 0.0,
"step": 10100
},
{
"epoch": 2.281651997291808,
"grad_norm": 0.00020503997802734375,
"learning_rate": 2.4035339424601676e-06,
"loss": 0.0,
"step": 10110
},
{
"epoch": 2.283908824193184,
"grad_norm": 0.0004329681396484375,
"learning_rate": 2.3959827833572454e-06,
"loss": 0.0,
"step": 10120
},
{
"epoch": 2.286165651094561,
"grad_norm": 0.0032501220703125,
"learning_rate": 2.388431624254323e-06,
"loss": 0.1042,
"step": 10130
},
{
"epoch": 2.2884224779959377,
"grad_norm": 0.0035400390625,
"learning_rate": 2.380880465151401e-06,
"loss": 0.0,
"step": 10140
},
{
"epoch": 2.2906793048973144,
"grad_norm": 0.0005645751953125,
"learning_rate": 2.3733293060484787e-06,
"loss": 0.0,
"step": 10150
},
{
"epoch": 2.292936131798691,
"grad_norm": 0.0020751953125,
"learning_rate": 2.3657781469455564e-06,
"loss": 0.0,
"step": 10160
},
{
"epoch": 2.295192958700068,
"grad_norm": 0.00010776519775390625,
"learning_rate": 2.3582269878426338e-06,
"loss": 0.0,
"step": 10170
},
{
"epoch": 2.297449785601444,
"grad_norm": 0.0028533935546875,
"learning_rate": 2.3506758287397115e-06,
"loss": 0.0,
"step": 10180
},
{
"epoch": 2.299706612502821,
"grad_norm": 0.0031890869140625,
"learning_rate": 2.3431246696367893e-06,
"loss": 0.2192,
"step": 10190
},
{
"epoch": 2.3019634394041977,
"grad_norm": 0.0047607421875,
"learning_rate": 2.3355735105338675e-06,
"loss": 0.0,
"step": 10200
},
{
"epoch": 2.3042202663055744,
"grad_norm": 0.0025177001953125,
"learning_rate": 2.328022351430945e-06,
"loss": 0.0,
"step": 10210
},
{
"epoch": 2.306477093206951,
"grad_norm": 0.0034942626953125,
"learning_rate": 2.3204711923280226e-06,
"loss": 0.0,
"step": 10220
},
{
"epoch": 2.3087339201083275,
"grad_norm": 0.000568389892578125,
"learning_rate": 2.3129200332251003e-06,
"loss": 0.0,
"step": 10230
},
{
"epoch": 2.310990747009704,
"grad_norm": 0.0030517578125,
"learning_rate": 2.305368874122178e-06,
"loss": 0.0,
"step": 10240
},
{
"epoch": 2.313247573911081,
"grad_norm": 0.000644683837890625,
"learning_rate": 2.297817715019256e-06,
"loss": 0.0,
"step": 10250
},
{
"epoch": 2.3155044008124577,
"grad_norm": 0.00107574462890625,
"learning_rate": 2.290266555916333e-06,
"loss": 0.0,
"step": 10260
},
{
"epoch": 2.3177612277138344,
"grad_norm": 0.0004329681396484375,
"learning_rate": 2.282715396813411e-06,
"loss": 0.103,
"step": 10270
},
{
"epoch": 2.320018054615211,
"grad_norm": 0.0025787353515625,
"learning_rate": 2.2751642377104887e-06,
"loss": 0.0,
"step": 10280
},
{
"epoch": 2.322274881516588,
"grad_norm": 0.004730224609375,
"learning_rate": 2.2676130786075665e-06,
"loss": 0.0,
"step": 10290
},
{
"epoch": 2.324531708417964,
"grad_norm": 0.0009002685546875,
"learning_rate": 2.2600619195046442e-06,
"loss": 0.0,
"step": 10300
},
{
"epoch": 2.326788535319341,
"grad_norm": 0.00010013580322265625,
"learning_rate": 2.252510760401722e-06,
"loss": 0.0,
"step": 10310
},
{
"epoch": 2.3290453622207177,
"grad_norm": 0.00032806396484375,
"learning_rate": 2.2449596012987993e-06,
"loss": 0.0,
"step": 10320
},
{
"epoch": 2.3313021891220944,
"grad_norm": 0.0036163330078125,
"learning_rate": 2.237408442195877e-06,
"loss": 0.003,
"step": 10330
},
{
"epoch": 2.333559016023471,
"grad_norm": 0.0005645751953125,
"learning_rate": 2.229857283092955e-06,
"loss": 0.0,
"step": 10340
},
{
"epoch": 2.3358158429248475,
"grad_norm": 0.000354766845703125,
"learning_rate": 2.2223061239900326e-06,
"loss": 0.0,
"step": 10350
},
{
"epoch": 2.338072669826224,
"grad_norm": 0.0011749267578125,
"learning_rate": 2.2147549648871104e-06,
"loss": 0.0,
"step": 10360
},
{
"epoch": 2.340329496727601,
"grad_norm": 0.00518798828125,
"learning_rate": 2.207203805784188e-06,
"loss": 0.0,
"step": 10370
},
{
"epoch": 2.3425863236289777,
"grad_norm": 0.000255584716796875,
"learning_rate": 2.1996526466812655e-06,
"loss": 0.0,
"step": 10380
},
{
"epoch": 2.3448431505303544,
"grad_norm": 0.000667572021484375,
"learning_rate": 2.1921014875783432e-06,
"loss": 0.0,
"step": 10390
},
{
"epoch": 2.347099977431731,
"grad_norm": 0.0008087158203125,
"learning_rate": 2.184550328475421e-06,
"loss": 0.0,
"step": 10400
},
{
"epoch": 2.349356804333108,
"grad_norm": 0.00067901611328125,
"learning_rate": 2.1769991693724988e-06,
"loss": 0.0,
"step": 10410
},
{
"epoch": 2.351613631234484,
"grad_norm": 0.001312255859375,
"learning_rate": 2.1694480102695765e-06,
"loss": 0.0,
"step": 10420
},
{
"epoch": 2.353870458135861,
"grad_norm": 0.00144195556640625,
"learning_rate": 2.1618968511666543e-06,
"loss": 0.0,
"step": 10430
},
{
"epoch": 2.3561272850372377,
"grad_norm": 0.00023174285888671875,
"learning_rate": 2.154345692063732e-06,
"loss": 0.0,
"step": 10440
},
{
"epoch": 2.3583841119386144,
"grad_norm": 0.00274658203125,
"learning_rate": 2.14679453296081e-06,
"loss": 0.0,
"step": 10450
},
{
"epoch": 2.360640938839991,
"grad_norm": 0.000858306884765625,
"learning_rate": 2.1392433738578876e-06,
"loss": 0.3298,
"step": 10460
},
{
"epoch": 2.3628977657413675,
"grad_norm": 0.00128936767578125,
"learning_rate": 2.1316922147549653e-06,
"loss": 0.1187,
"step": 10470
},
{
"epoch": 2.365154592642744,
"grad_norm": 0.024169921875,
"learning_rate": 2.1241410556520427e-06,
"loss": 0.317,
"step": 10480
},
{
"epoch": 2.367411419544121,
"grad_norm": 0.000843048095703125,
"learning_rate": 2.1165898965491204e-06,
"loss": 0.0,
"step": 10490
},
{
"epoch": 2.3696682464454977,
"grad_norm": 0.00058746337890625,
"learning_rate": 2.109038737446198e-06,
"loss": 0.0,
"step": 10500
},
{
"epoch": 2.3719250733468744,
"grad_norm": 0.00063323974609375,
"learning_rate": 2.101487578343276e-06,
"loss": 0.0,
"step": 10510
},
{
"epoch": 2.374181900248251,
"grad_norm": 0.00125885009765625,
"learning_rate": 2.0939364192403537e-06,
"loss": 0.0,
"step": 10520
},
{
"epoch": 2.3764387271496275,
"grad_norm": 54.5,
"learning_rate": 2.0863852601374315e-06,
"loss": 0.246,
"step": 10530
},
{
"epoch": 2.378695554051004,
"grad_norm": 0.00054168701171875,
"learning_rate": 2.078834101034509e-06,
"loss": 0.0,
"step": 10540
},
{
"epoch": 2.380952380952381,
"grad_norm": 0.00034332275390625,
"learning_rate": 2.0712829419315866e-06,
"loss": 0.0,
"step": 10550
},
{
"epoch": 2.3832092078537577,
"grad_norm": 0.00019073486328125,
"learning_rate": 2.0637317828286643e-06,
"loss": 0.0,
"step": 10560
},
{
"epoch": 2.3854660347551344,
"grad_norm": 0.00141143798828125,
"learning_rate": 2.056180623725742e-06,
"loss": 0.0,
"step": 10570
},
{
"epoch": 2.3877228616565107,
"grad_norm": 0.000926971435546875,
"learning_rate": 2.04862946462282e-06,
"loss": 0.0,
"step": 10580
},
{
"epoch": 2.3899796885578874,
"grad_norm": 0.00750732421875,
"learning_rate": 2.041078305519897e-06,
"loss": 0.0,
"step": 10590
},
{
"epoch": 2.392236515459264,
"grad_norm": 0.00628662109375,
"learning_rate": 2.033527146416975e-06,
"loss": 0.0,
"step": 10600
},
{
"epoch": 2.394493342360641,
"grad_norm": 0.00616455078125,
"learning_rate": 2.0259759873140527e-06,
"loss": 0.2714,
"step": 10610
},
{
"epoch": 2.3967501692620177,
"grad_norm": 0.003875732421875,
"learning_rate": 2.0184248282111305e-06,
"loss": 0.0,
"step": 10620
},
{
"epoch": 2.3990069961633944,
"grad_norm": 0.000865936279296875,
"learning_rate": 2.0108736691082082e-06,
"loss": 0.0,
"step": 10630
},
{
"epoch": 2.401263823064771,
"grad_norm": 0.001800537109375,
"learning_rate": 2.003322510005286e-06,
"loss": 0.0,
"step": 10640
},
{
"epoch": 2.4035206499661474,
"grad_norm": 0.00014400482177734375,
"learning_rate": 1.9957713509023637e-06,
"loss": 0.0,
"step": 10650
},
{
"epoch": 2.405777476867524,
"grad_norm": 0.000606536865234375,
"learning_rate": 1.9882201917994415e-06,
"loss": 0.0,
"step": 10660
},
{
"epoch": 2.408034303768901,
"grad_norm": 0.0020751953125,
"learning_rate": 1.9806690326965193e-06,
"loss": 0.3892,
"step": 10670
},
{
"epoch": 2.4102911306702777,
"grad_norm": 0.0001983642578125,
"learning_rate": 1.973117873593597e-06,
"loss": 0.0,
"step": 10680
},
{
"epoch": 2.4125479575716544,
"grad_norm": 0.005767822265625,
"learning_rate": 1.9655667144906744e-06,
"loss": 0.0,
"step": 10690
},
{
"epoch": 2.4148047844730307,
"grad_norm": 0.00101470947265625,
"learning_rate": 1.958015555387752e-06,
"loss": 0.0,
"step": 10700
},
{
"epoch": 2.4170616113744074,
"grad_norm": 0.00142669677734375,
"learning_rate": 1.95046439628483e-06,
"loss": 0.0,
"step": 10710
},
{
"epoch": 2.419318438275784,
"grad_norm": 0.00084686279296875,
"learning_rate": 1.9429132371819076e-06,
"loss": 0.0,
"step": 10720
},
{
"epoch": 2.421575265177161,
"grad_norm": 0.000934600830078125,
"learning_rate": 1.9353620780789854e-06,
"loss": 0.0,
"step": 10730
},
{
"epoch": 2.4238320920785377,
"grad_norm": 0.0048828125,
"learning_rate": 1.927810918976063e-06,
"loss": 0.0,
"step": 10740
},
{
"epoch": 2.4260889189799144,
"grad_norm": 0.050537109375,
"learning_rate": 1.9202597598731405e-06,
"loss": 0.0,
"step": 10750
},
{
"epoch": 2.428345745881291,
"grad_norm": 0.0023193359375,
"learning_rate": 1.9127086007702183e-06,
"loss": 0.0,
"step": 10760
},
{
"epoch": 2.4306025727826674,
"grad_norm": 0.0002231597900390625,
"learning_rate": 1.905157441667296e-06,
"loss": 0.0384,
"step": 10770
},
{
"epoch": 2.432859399684044,
"grad_norm": 0.000553131103515625,
"learning_rate": 1.8976062825643738e-06,
"loss": 0.0,
"step": 10780
},
{
"epoch": 2.435116226585421,
"grad_norm": 0.0029449462890625,
"learning_rate": 1.8900551234614513e-06,
"loss": 0.0,
"step": 10790
},
{
"epoch": 2.4373730534867977,
"grad_norm": 0.0002536773681640625,
"learning_rate": 1.882503964358529e-06,
"loss": 0.0,
"step": 10800
},
{
"epoch": 2.4396298803881744,
"grad_norm": 0.00262451171875,
"learning_rate": 1.8749528052556069e-06,
"loss": 0.0,
"step": 10810
},
{
"epoch": 2.4418867072895507,
"grad_norm": 0.01361083984375,
"learning_rate": 1.8674016461526844e-06,
"loss": 0.0,
"step": 10820
},
{
"epoch": 2.4441435341909274,
"grad_norm": 0.0029296875,
"learning_rate": 1.8598504870497622e-06,
"loss": 0.0,
"step": 10830
},
{
"epoch": 2.446400361092304,
"grad_norm": 0.00031280517578125,
"learning_rate": 1.85229932794684e-06,
"loss": 0.0,
"step": 10840
},
{
"epoch": 2.448657187993681,
"grad_norm": 0.0004711151123046875,
"learning_rate": 1.844748168843918e-06,
"loss": 0.0,
"step": 10850
},
{
"epoch": 2.4509140148950577,
"grad_norm": 0.01434326171875,
"learning_rate": 1.8371970097409954e-06,
"loss": 0.0,
"step": 10860
},
{
"epoch": 2.4531708417964344,
"grad_norm": 0.0006866455078125,
"learning_rate": 1.8296458506380732e-06,
"loss": 0.0,
"step": 10870
},
{
"epoch": 2.4554276686978107,
"grad_norm": 0.0003509521484375,
"learning_rate": 1.822094691535151e-06,
"loss": 0.0,
"step": 10880
},
{
"epoch": 2.4576844955991874,
"grad_norm": 0.00022983551025390625,
"learning_rate": 1.8145435324322285e-06,
"loss": 0.0,
"step": 10890
},
{
"epoch": 2.459941322500564,
"grad_norm": 9.059906005859375e-05,
"learning_rate": 1.8069923733293063e-06,
"loss": 0.0,
"step": 10900
},
{
"epoch": 2.462198149401941,
"grad_norm": 9.441375732421875e-05,
"learning_rate": 1.799441214226384e-06,
"loss": 0.0,
"step": 10910
},
{
"epoch": 2.4644549763033177,
"grad_norm": 0.00091552734375,
"learning_rate": 1.7918900551234616e-06,
"loss": 0.0,
"step": 10920
},
{
"epoch": 2.466711803204694,
"grad_norm": 0.0269775390625,
"learning_rate": 1.7843388960205394e-06,
"loss": 0.0,
"step": 10930
},
{
"epoch": 2.4689686301060707,
"grad_norm": 0.001708984375,
"learning_rate": 1.7767877369176171e-06,
"loss": 0.0,
"step": 10940
},
{
"epoch": 2.4712254570074474,
"grad_norm": 0.007537841796875,
"learning_rate": 1.7692365778146947e-06,
"loss": 0.0,
"step": 10950
},
{
"epoch": 2.473482283908824,
"grad_norm": 0.004364013671875,
"learning_rate": 1.7616854187117724e-06,
"loss": 0.0,
"step": 10960
},
{
"epoch": 2.475739110810201,
"grad_norm": 0.0003871917724609375,
"learning_rate": 1.7541342596088502e-06,
"loss": 0.0,
"step": 10970
},
{
"epoch": 2.4779959377115777,
"grad_norm": 0.00142669677734375,
"learning_rate": 1.7465831005059277e-06,
"loss": 0.0,
"step": 10980
},
{
"epoch": 2.4802527646129544,
"grad_norm": 0.00066375732421875,
"learning_rate": 1.7390319414030055e-06,
"loss": 0.0,
"step": 10990
},
{
"epoch": 2.4825095915143307,
"grad_norm": 0.0002689361572265625,
"learning_rate": 1.7314807823000833e-06,
"loss": 0.0,
"step": 11000
},
{
"epoch": 2.4847664184157074,
"grad_norm": 0.0002536773681640625,
"learning_rate": 1.7239296231971608e-06,
"loss": 0.0,
"step": 11010
},
{
"epoch": 2.487023245317084,
"grad_norm": 0.00115203857421875,
"learning_rate": 1.7163784640942386e-06,
"loss": 0.0,
"step": 11020
},
{
"epoch": 2.489280072218461,
"grad_norm": 0.00089263916015625,
"learning_rate": 1.7088273049913161e-06,
"loss": 0.0,
"step": 11030
},
{
"epoch": 2.4915368991198377,
"grad_norm": 0.0001964569091796875,
"learning_rate": 1.7012761458883939e-06,
"loss": 0.0,
"step": 11040
},
{
"epoch": 2.493793726021214,
"grad_norm": 0.0014190673828125,
"learning_rate": 1.6937249867854716e-06,
"loss": 0.0,
"step": 11050
},
{
"epoch": 2.4960505529225907,
"grad_norm": 0.0002155303955078125,
"learning_rate": 1.6861738276825492e-06,
"loss": 0.0,
"step": 11060
},
{
"epoch": 2.4983073798239674,
"grad_norm": 0.000537872314453125,
"learning_rate": 1.6786226685796272e-06,
"loss": 0.0,
"step": 11070
},
{
"epoch": 2.500564206725344,
"grad_norm": 0.003204345703125,
"learning_rate": 1.671071509476705e-06,
"loss": 0.0,
"step": 11080
},
{
"epoch": 2.502821033626721,
"grad_norm": 0.0002727508544921875,
"learning_rate": 1.6635203503737827e-06,
"loss": 0.0,
"step": 11090
},
{
"epoch": 2.5050778605280977,
"grad_norm": 0.001373291015625,
"learning_rate": 1.6559691912708602e-06,
"loss": 0.0,
"step": 11100
},
{
"epoch": 2.5073346874294744,
"grad_norm": 0.000644683837890625,
"learning_rate": 1.648418032167938e-06,
"loss": 0.0,
"step": 11110
},
{
"epoch": 2.5095915143308507,
"grad_norm": 0.0001316070556640625,
"learning_rate": 1.6408668730650157e-06,
"loss": 0.1207,
"step": 11120
},
{
"epoch": 2.5118483412322274,
"grad_norm": 0.00013256072998046875,
"learning_rate": 1.6333157139620933e-06,
"loss": 0.0,
"step": 11130
},
{
"epoch": 2.514105168133604,
"grad_norm": 0.001129150390625,
"learning_rate": 1.625764554859171e-06,
"loss": 0.0,
"step": 11140
},
{
"epoch": 2.516361995034981,
"grad_norm": 0.01123046875,
"learning_rate": 1.6182133957562488e-06,
"loss": 0.0,
"step": 11150
},
{
"epoch": 2.518618821936357,
"grad_norm": 0.0007171630859375,
"learning_rate": 1.6106622366533264e-06,
"loss": 0.0,
"step": 11160
},
{
"epoch": 2.520875648837734,
"grad_norm": 45.5,
"learning_rate": 1.6031110775504041e-06,
"loss": 0.2039,
"step": 11170
},
{
"epoch": 2.5231324757391107,
"grad_norm": 0.000759124755859375,
"learning_rate": 1.5955599184474819e-06,
"loss": 0.0,
"step": 11180
},
{
"epoch": 2.5253893026404874,
"grad_norm": 7.104873657226562e-05,
"learning_rate": 1.5880087593445594e-06,
"loss": 0.0,
"step": 11190
},
{
"epoch": 2.527646129541864,
"grad_norm": 0.0001220703125,
"learning_rate": 1.5804576002416372e-06,
"loss": 0.0,
"step": 11200
},
{
"epoch": 2.529902956443241,
"grad_norm": 0.00103759765625,
"learning_rate": 1.572906441138715e-06,
"loss": 0.0,
"step": 11210
},
{
"epoch": 2.5321597833446177,
"grad_norm": 0.006256103515625,
"learning_rate": 1.5653552820357925e-06,
"loss": 0.2662,
"step": 11220
},
{
"epoch": 2.5344166102459944,
"grad_norm": 0.00012493133544921875,
"learning_rate": 1.5578041229328703e-06,
"loss": 0.0,
"step": 11230
},
{
"epoch": 2.5366734371473707,
"grad_norm": 0.0274658203125,
"learning_rate": 1.550252963829948e-06,
"loss": 0.0,
"step": 11240
},
{
"epoch": 2.5389302640487474,
"grad_norm": 8.440017700195312e-05,
"learning_rate": 1.5427018047270256e-06,
"loss": 0.204,
"step": 11250
},
{
"epoch": 2.541187090950124,
"grad_norm": 0.00013446807861328125,
"learning_rate": 1.5351506456241033e-06,
"loss": 0.0,
"step": 11260
},
{
"epoch": 2.543443917851501,
"grad_norm": 0.0023040771484375,
"learning_rate": 1.527599486521181e-06,
"loss": 0.0608,
"step": 11270
},
{
"epoch": 2.545700744752877,
"grad_norm": 0.00023651123046875,
"learning_rate": 1.5200483274182587e-06,
"loss": 0.0,
"step": 11280
},
{
"epoch": 2.547957571654254,
"grad_norm": 0.013916015625,
"learning_rate": 1.5124971683153366e-06,
"loss": 0.0,
"step": 11290
},
{
"epoch": 2.5502143985556307,
"grad_norm": 45.5,
"learning_rate": 1.5049460092124144e-06,
"loss": 0.1629,
"step": 11300
},
{
"epoch": 2.5524712254570074,
"grad_norm": 0.005950927734375,
"learning_rate": 1.497394850109492e-06,
"loss": 0.0,
"step": 11310
},
{
"epoch": 2.554728052358384,
"grad_norm": 0.000423431396484375,
"learning_rate": 1.4898436910065697e-06,
"loss": 0.0,
"step": 11320
},
{
"epoch": 2.556984879259761,
"grad_norm": 0.00113677978515625,
"learning_rate": 1.4822925319036475e-06,
"loss": 0.0,
"step": 11330
},
{
"epoch": 2.5592417061611377,
"grad_norm": 0.0002803802490234375,
"learning_rate": 1.474741372800725e-06,
"loss": 0.0357,
"step": 11340
},
{
"epoch": 2.561498533062514,
"grad_norm": 0.0002651214599609375,
"learning_rate": 1.4671902136978028e-06,
"loss": 0.0,
"step": 11350
},
{
"epoch": 2.5637553599638907,
"grad_norm": 0.00058746337890625,
"learning_rate": 1.4596390545948805e-06,
"loss": 0.5908,
"step": 11360
},
{
"epoch": 2.5660121868652674,
"grad_norm": 0.000148773193359375,
"learning_rate": 1.452087895491958e-06,
"loss": 0.3479,
"step": 11370
},
{
"epoch": 2.568269013766644,
"grad_norm": 0.000675201416015625,
"learning_rate": 1.4445367363890358e-06,
"loss": 0.0,
"step": 11380
},
{
"epoch": 2.570525840668021,
"grad_norm": 0.00017547607421875,
"learning_rate": 1.4369855772861136e-06,
"loss": 0.0,
"step": 11390
},
{
"epoch": 2.572782667569397,
"grad_norm": 0.000469207763671875,
"learning_rate": 1.4294344181831911e-06,
"loss": 0.0,
"step": 11400
},
{
"epoch": 2.575039494470774,
"grad_norm": 0.00061798095703125,
"learning_rate": 1.421883259080269e-06,
"loss": 0.0,
"step": 11410
},
{
"epoch": 2.5772963213721507,
"grad_norm": 0.00087738037109375,
"learning_rate": 1.4143320999773467e-06,
"loss": 0.0,
"step": 11420
},
{
"epoch": 2.5795531482735274,
"grad_norm": 0.0004367828369140625,
"learning_rate": 1.4067809408744242e-06,
"loss": 0.0,
"step": 11430
},
{
"epoch": 2.581809975174904,
"grad_norm": 0.01251220703125,
"learning_rate": 1.399229781771502e-06,
"loss": 0.0,
"step": 11440
},
{
"epoch": 2.584066802076281,
"grad_norm": 0.000194549560546875,
"learning_rate": 1.3916786226685797e-06,
"loss": 0.0,
"step": 11450
},
{
"epoch": 2.5863236289776577,
"grad_norm": 0.000835418701171875,
"learning_rate": 1.3841274635656573e-06,
"loss": 0.0,
"step": 11460
},
{
"epoch": 2.588580455879034,
"grad_norm": 0.000904083251953125,
"learning_rate": 1.376576304462735e-06,
"loss": 0.0,
"step": 11470
},
{
"epoch": 2.5908372827804107,
"grad_norm": 0.05029296875,
"learning_rate": 1.3690251453598128e-06,
"loss": 0.0,
"step": 11480
},
{
"epoch": 2.5930941096817874,
"grad_norm": 0.006317138671875,
"learning_rate": 1.3614739862568904e-06,
"loss": 0.0,
"step": 11490
},
{
"epoch": 2.595350936583164,
"grad_norm": 0.00122833251953125,
"learning_rate": 1.3539228271539681e-06,
"loss": 0.0,
"step": 11500
},
{
"epoch": 2.5976077634845405,
"grad_norm": 0.005828857421875,
"learning_rate": 1.346371668051046e-06,
"loss": 0.0339,
"step": 11510
},
{
"epoch": 2.599864590385917,
"grad_norm": 0.00177001953125,
"learning_rate": 1.3388205089481239e-06,
"loss": 0.0,
"step": 11520
},
{
"epoch": 2.602121417287294,
"grad_norm": 0.004241943359375,
"learning_rate": 1.3312693498452014e-06,
"loss": 0.0,
"step": 11530
},
{
"epoch": 2.6043782441886707,
"grad_norm": 0.0003204345703125,
"learning_rate": 1.3237181907422792e-06,
"loss": 0.0,
"step": 11540
},
{
"epoch": 2.6066350710900474,
"grad_norm": 0.00025177001953125,
"learning_rate": 1.316167031639357e-06,
"loss": 0.0,
"step": 11550
},
{
"epoch": 2.608891897991424,
"grad_norm": 0.009033203125,
"learning_rate": 1.3086158725364345e-06,
"loss": 0.0,
"step": 11560
},
{
"epoch": 2.611148724892801,
"grad_norm": 0.0010833740234375,
"learning_rate": 1.3010647134335122e-06,
"loss": 0.0,
"step": 11570
},
{
"epoch": 2.6134055517941777,
"grad_norm": 0.00012159347534179688,
"learning_rate": 1.29351355433059e-06,
"loss": 0.0,
"step": 11580
},
{
"epoch": 2.615662378695554,
"grad_norm": 105.0,
"learning_rate": 1.2859623952276675e-06,
"loss": 0.1517,
"step": 11590
},
{
"epoch": 2.6179192055969307,
"grad_norm": 78.0,
"learning_rate": 1.2784112361247453e-06,
"loss": 0.271,
"step": 11600
},
{
"epoch": 2.6201760324983074,
"grad_norm": 0.00087738037109375,
"learning_rate": 1.270860077021823e-06,
"loss": 0.0,
"step": 11610
},
{
"epoch": 2.622432859399684,
"grad_norm": 7.677078247070312e-05,
"learning_rate": 1.2633089179189006e-06,
"loss": 0.0,
"step": 11620
},
{
"epoch": 2.6246896863010605,
"grad_norm": 0.026123046875,
"learning_rate": 1.2557577588159784e-06,
"loss": 0.0,
"step": 11630
},
{
"epoch": 2.626946513202437,
"grad_norm": 0.00341796875,
"learning_rate": 1.248206599713056e-06,
"loss": 0.0,
"step": 11640
},
{
"epoch": 2.629203340103814,
"grad_norm": 0.0004825592041015625,
"learning_rate": 1.2406554406101337e-06,
"loss": 0.0,
"step": 11650
},
{
"epoch": 2.6314601670051907,
"grad_norm": 0.0016632080078125,
"learning_rate": 1.2331042815072114e-06,
"loss": 0.0,
"step": 11660
},
{
"epoch": 2.6337169939065674,
"grad_norm": 0.00013446807861328125,
"learning_rate": 1.2255531224042892e-06,
"loss": 0.0,
"step": 11670
},
{
"epoch": 2.635973820807944,
"grad_norm": 0.006805419921875,
"learning_rate": 1.218001963301367e-06,
"loss": 0.0,
"step": 11680
},
{
"epoch": 2.638230647709321,
"grad_norm": 0.001617431640625,
"learning_rate": 1.2104508041984445e-06,
"loss": 0.4549,
"step": 11690
},
{
"epoch": 2.640487474610697,
"grad_norm": 0.0001430511474609375,
"learning_rate": 1.2028996450955223e-06,
"loss": 0.0,
"step": 11700
},
{
"epoch": 2.642744301512074,
"grad_norm": 0.000576019287109375,
"learning_rate": 1.1953484859926e-06,
"loss": 0.3013,
"step": 11710
},
{
"epoch": 2.6450011284134507,
"grad_norm": 0.00457763671875,
"learning_rate": 1.1877973268896776e-06,
"loss": 0.0,
"step": 11720
},
{
"epoch": 2.6472579553148274,
"grad_norm": 0.0021514892578125,
"learning_rate": 1.1802461677867553e-06,
"loss": 0.0,
"step": 11730
},
{
"epoch": 2.649514782216204,
"grad_norm": 0.0004119873046875,
"learning_rate": 1.1726950086838331e-06,
"loss": 0.0,
"step": 11740
},
{
"epoch": 2.6517716091175805,
"grad_norm": 0.0026397705078125,
"learning_rate": 1.1651438495809107e-06,
"loss": 0.3461,
"step": 11750
},
{
"epoch": 2.654028436018957,
"grad_norm": 0.0023040771484375,
"learning_rate": 1.1575926904779884e-06,
"loss": 0.0968,
"step": 11760
},
{
"epoch": 2.656285262920334,
"grad_norm": 13.0625,
"learning_rate": 1.1500415313750662e-06,
"loss": 0.0107,
"step": 11770
},
{
"epoch": 2.6585420898217107,
"grad_norm": 0.00035858154296875,
"learning_rate": 1.142490372272144e-06,
"loss": 0.0,
"step": 11780
},
{
"epoch": 2.6607989167230874,
"grad_norm": 0.0002498626708984375,
"learning_rate": 1.1349392131692217e-06,
"loss": 0.0,
"step": 11790
},
{
"epoch": 2.663055743624464,
"grad_norm": 0.006622314453125,
"learning_rate": 1.1273880540662992e-06,
"loss": 0.0973,
"step": 11800
},
{
"epoch": 2.665312570525841,
"grad_norm": 0.004791259765625,
"learning_rate": 1.119836894963377e-06,
"loss": 0.0,
"step": 11810
},
{
"epoch": 2.667569397427217,
"grad_norm": 0.000263214111328125,
"learning_rate": 1.1122857358604548e-06,
"loss": 0.0,
"step": 11820
},
{
"epoch": 2.669826224328594,
"grad_norm": 0.007110595703125,
"learning_rate": 1.1047345767575323e-06,
"loss": 0.0,
"step": 11830
},
{
"epoch": 2.6720830512299707,
"grad_norm": 0.00098419189453125,
"learning_rate": 1.09718341765461e-06,
"loss": 0.2316,
"step": 11840
},
{
"epoch": 2.6743398781313474,
"grad_norm": 0.0002899169921875,
"learning_rate": 1.0896322585516878e-06,
"loss": 0.0,
"step": 11850
},
{
"epoch": 2.6765967050327237,
"grad_norm": 0.03955078125,
"learning_rate": 1.0820810994487654e-06,
"loss": 0.0,
"step": 11860
},
{
"epoch": 2.6788535319341005,
"grad_norm": 0.002777099609375,
"learning_rate": 1.0745299403458432e-06,
"loss": 0.155,
"step": 11870
},
{
"epoch": 2.681110358835477,
"grad_norm": 0.00238037109375,
"learning_rate": 1.066978781242921e-06,
"loss": 0.0,
"step": 11880
},
{
"epoch": 2.683367185736854,
"grad_norm": 0.00091552734375,
"learning_rate": 1.0594276221399987e-06,
"loss": 0.0262,
"step": 11890
},
{
"epoch": 2.6856240126382307,
"grad_norm": 0.00023937225341796875,
"learning_rate": 1.0518764630370764e-06,
"loss": 0.0,
"step": 11900
},
{
"epoch": 2.6878808395396074,
"grad_norm": 0.015380859375,
"learning_rate": 1.044325303934154e-06,
"loss": 0.6079,
"step": 11910
},
{
"epoch": 2.690137666440984,
"grad_norm": 0.00113677978515625,
"learning_rate": 1.0367741448312317e-06,
"loss": 0.0,
"step": 11920
},
{
"epoch": 2.692394493342361,
"grad_norm": 0.0021820068359375,
"learning_rate": 1.0292229857283093e-06,
"loss": 0.0,
"step": 11930
},
{
"epoch": 2.694651320243737,
"grad_norm": 0.00023555755615234375,
"learning_rate": 1.021671826625387e-06,
"loss": 0.0,
"step": 11940
},
{
"epoch": 2.696908147145114,
"grad_norm": 0.0002536773681640625,
"learning_rate": 1.0141206675224648e-06,
"loss": 0.0,
"step": 11950
},
{
"epoch": 2.6991649740464907,
"grad_norm": 7.295608520507812e-05,
"learning_rate": 1.0065695084195424e-06,
"loss": 0.0002,
"step": 11960
},
{
"epoch": 2.7014218009478674,
"grad_norm": 0.0014801025390625,
"learning_rate": 9.990183493166201e-07,
"loss": 0.0,
"step": 11970
},
{
"epoch": 2.7036786278492437,
"grad_norm": 0.00017547607421875,
"learning_rate": 9.914671902136979e-07,
"loss": 0.2184,
"step": 11980
},
{
"epoch": 2.7059354547506205,
"grad_norm": 99.0,
"learning_rate": 9.839160311107754e-07,
"loss": 0.3833,
"step": 11990
},
{
"epoch": 2.708192281651997,
"grad_norm": 68.0,
"learning_rate": 9.763648720078534e-07,
"loss": 0.6392,
"step": 12000
},
{
"epoch": 2.710449108553374,
"grad_norm": 0.001983642578125,
"learning_rate": 9.68813712904931e-07,
"loss": 0.2191,
"step": 12010
},
{
"epoch": 2.7127059354547507,
"grad_norm": 0.000545501708984375,
"learning_rate": 9.612625538020087e-07,
"loss": 0.0,
"step": 12020
},
{
"epoch": 2.7149627623561274,
"grad_norm": 0.002227783203125,
"learning_rate": 9.537113946990865e-07,
"loss": 0.0,
"step": 12030
},
{
"epoch": 2.717219589257504,
"grad_norm": 0.0010223388671875,
"learning_rate": 9.461602355961641e-07,
"loss": 0.0,
"step": 12040
},
{
"epoch": 2.7194764161588805,
"grad_norm": 0.006378173828125,
"learning_rate": 9.386090764932418e-07,
"loss": 0.0,
"step": 12050
},
{
"epoch": 2.721733243060257,
"grad_norm": 0.0024261474609375,
"learning_rate": 9.310579173903194e-07,
"loss": 0.0,
"step": 12060
},
{
"epoch": 2.723990069961634,
"grad_norm": 0.0008392333984375,
"learning_rate": 9.235067582873972e-07,
"loss": 0.0,
"step": 12070
},
{
"epoch": 2.7262468968630107,
"grad_norm": 0.000827789306640625,
"learning_rate": 9.159555991844749e-07,
"loss": 0.0,
"step": 12080
},
{
"epoch": 2.7285037237643874,
"grad_norm": 0.00121307373046875,
"learning_rate": 9.084044400815525e-07,
"loss": 0.2787,
"step": 12090
},
{
"epoch": 2.7307605506657637,
"grad_norm": 0.00060272216796875,
"learning_rate": 9.008532809786303e-07,
"loss": 0.0,
"step": 12100
},
{
"epoch": 2.7330173775671405,
"grad_norm": 0.0034332275390625,
"learning_rate": 8.93302121875708e-07,
"loss": 0.0,
"step": 12110
},
{
"epoch": 2.735274204468517,
"grad_norm": 0.010498046875,
"learning_rate": 8.857509627727858e-07,
"loss": 0.0,
"step": 12120
},
{
"epoch": 2.737531031369894,
"grad_norm": 0.01104736328125,
"learning_rate": 8.781998036698634e-07,
"loss": 0.0,
"step": 12130
},
{
"epoch": 2.7397878582712707,
"grad_norm": 0.000850677490234375,
"learning_rate": 8.706486445669411e-07,
"loss": 0.2278,
"step": 12140
},
{
"epoch": 2.7420446851726474,
"grad_norm": 0.0011444091796875,
"learning_rate": 8.630974854640188e-07,
"loss": 0.0,
"step": 12150
},
{
"epoch": 2.744301512074024,
"grad_norm": 0.000820159912109375,
"learning_rate": 8.555463263610965e-07,
"loss": 0.0,
"step": 12160
},
{
"epoch": 2.7465583389754005,
"grad_norm": 0.00191497802734375,
"learning_rate": 8.479951672581742e-07,
"loss": 0.0,
"step": 12170
},
{
"epoch": 2.748815165876777,
"grad_norm": 0.003143310546875,
"learning_rate": 8.404440081552518e-07,
"loss": 0.0,
"step": 12180
},
{
"epoch": 2.751071992778154,
"grad_norm": 0.00032806396484375,
"learning_rate": 8.328928490523296e-07,
"loss": 0.0,
"step": 12190
},
{
"epoch": 2.7533288196795307,
"grad_norm": 0.0010223388671875,
"learning_rate": 8.253416899494072e-07,
"loss": 0.0,
"step": 12200
},
{
"epoch": 2.755585646580907,
"grad_norm": 0.0019378662109375,
"learning_rate": 8.177905308464849e-07,
"loss": 0.0,
"step": 12210
},
{
"epoch": 2.7578424734822837,
"grad_norm": 0.00063323974609375,
"learning_rate": 8.102393717435628e-07,
"loss": 0.2411,
"step": 12220
},
{
"epoch": 2.7600993003836605,
"grad_norm": 0.000118255615234375,
"learning_rate": 8.026882126406404e-07,
"loss": 0.0,
"step": 12230
},
{
"epoch": 2.762356127285037,
"grad_norm": 0.0002593994140625,
"learning_rate": 7.951370535377182e-07,
"loss": 0.2326,
"step": 12240
},
{
"epoch": 2.764612954186414,
"grad_norm": 0.0002460479736328125,
"learning_rate": 7.875858944347958e-07,
"loss": 0.0,
"step": 12250
},
{
"epoch": 2.7668697810877907,
"grad_norm": 0.00093841552734375,
"learning_rate": 7.800347353318735e-07,
"loss": 0.0,
"step": 12260
},
{
"epoch": 2.7691266079891674,
"grad_norm": 0.01611328125,
"learning_rate": 7.724835762289513e-07,
"loss": 0.0,
"step": 12270
},
{
"epoch": 2.7713834348905437,
"grad_norm": 0.00102996826171875,
"learning_rate": 7.649324171260289e-07,
"loss": 0.0,
"step": 12280
},
{
"epoch": 2.7736402617919205,
"grad_norm": 0.0002574920654296875,
"learning_rate": 7.573812580231066e-07,
"loss": 0.0,
"step": 12290
},
{
"epoch": 2.775897088693297,
"grad_norm": 0.0003337860107421875,
"learning_rate": 7.498300989201842e-07,
"loss": 0.0,
"step": 12300
},
{
"epoch": 2.778153915594674,
"grad_norm": 0.000606536865234375,
"learning_rate": 7.42278939817262e-07,
"loss": 0.0,
"step": 12310
},
{
"epoch": 2.7804107424960507,
"grad_norm": 0.00141143798828125,
"learning_rate": 7.347277807143396e-07,
"loss": 0.0,
"step": 12320
},
{
"epoch": 2.782667569397427,
"grad_norm": 0.004241943359375,
"learning_rate": 7.271766216114175e-07,
"loss": 0.0,
"step": 12330
},
{
"epoch": 2.7849243962988037,
"grad_norm": 0.0006103515625,
"learning_rate": 7.196254625084952e-07,
"loss": 0.0,
"step": 12340
},
{
"epoch": 2.7871812232001805,
"grad_norm": 0.0021209716796875,
"learning_rate": 7.120743034055728e-07,
"loss": 0.0,
"step": 12350
},
{
"epoch": 2.789438050101557,
"grad_norm": 0.00023365020751953125,
"learning_rate": 7.045231443026506e-07,
"loss": 0.0,
"step": 12360
},
{
"epoch": 2.791694877002934,
"grad_norm": 0.00010156631469726562,
"learning_rate": 6.969719851997282e-07,
"loss": 0.0,
"step": 12370
},
{
"epoch": 2.7939517039043107,
"grad_norm": 0.00021266937255859375,
"learning_rate": 6.894208260968059e-07,
"loss": 0.0,
"step": 12380
},
{
"epoch": 2.7962085308056874,
"grad_norm": 0.00020503997802734375,
"learning_rate": 6.818696669938836e-07,
"loss": 0.0479,
"step": 12390
},
{
"epoch": 2.7984653577070637,
"grad_norm": 0.0013580322265625,
"learning_rate": 6.743185078909613e-07,
"loss": 0.0,
"step": 12400
},
{
"epoch": 2.8007221846084405,
"grad_norm": 0.000537872314453125,
"learning_rate": 6.66767348788039e-07,
"loss": 0.2429,
"step": 12410
},
{
"epoch": 2.802979011509817,
"grad_norm": 0.00421142578125,
"learning_rate": 6.592161896851167e-07,
"loss": 0.0,
"step": 12420
},
{
"epoch": 2.805235838411194,
"grad_norm": 0.00775146484375,
"learning_rate": 6.516650305821944e-07,
"loss": 0.0,
"step": 12430
},
{
"epoch": 2.8074926653125707,
"grad_norm": 0.006072998046875,
"learning_rate": 6.441138714792721e-07,
"loss": 0.0,
"step": 12440
},
{
"epoch": 2.809749492213947,
"grad_norm": 0.00013637542724609375,
"learning_rate": 6.365627123763499e-07,
"loss": 0.0,
"step": 12450
},
{
"epoch": 2.8120063191153237,
"grad_norm": 0.00020122528076171875,
"learning_rate": 6.290115532734275e-07,
"loss": 0.0,
"step": 12460
},
{
"epoch": 2.8142631460167005,
"grad_norm": 0.00025177001953125,
"learning_rate": 6.214603941705052e-07,
"loss": 0.3119,
"step": 12470
},
{
"epoch": 2.816519972918077,
"grad_norm": 0.0004367828369140625,
"learning_rate": 6.13909235067583e-07,
"loss": 0.0,
"step": 12480
},
{
"epoch": 2.818776799819454,
"grad_norm": 0.103515625,
"learning_rate": 6.063580759646606e-07,
"loss": 0.0,
"step": 12490
},
{
"epoch": 2.8210336267208307,
"grad_norm": 0.00016307830810546875,
"learning_rate": 5.988069168617383e-07,
"loss": 0.0,
"step": 12500
},
{
"epoch": 2.8232904536222074,
"grad_norm": 0.0003147125244140625,
"learning_rate": 5.91255757758816e-07,
"loss": 0.0,
"step": 12510
},
{
"epoch": 2.8255472805235837,
"grad_norm": 0.000823974609375,
"learning_rate": 5.837045986558938e-07,
"loss": 0.0018,
"step": 12520
},
{
"epoch": 2.8278041074249605,
"grad_norm": 0.0012359619140625,
"learning_rate": 5.761534395529714e-07,
"loss": 0.2192,
"step": 12530
},
{
"epoch": 2.830060934326337,
"grad_norm": 0.000232696533203125,
"learning_rate": 5.686022804500491e-07,
"loss": 0.0,
"step": 12540
},
{
"epoch": 2.832317761227714,
"grad_norm": 0.00031280517578125,
"learning_rate": 5.610511213471269e-07,
"loss": 0.0,
"step": 12550
},
{
"epoch": 2.8345745881290902,
"grad_norm": 0.010498046875,
"learning_rate": 5.534999622442045e-07,
"loss": 0.0909,
"step": 12560
},
{
"epoch": 2.836831415030467,
"grad_norm": 0.00016880035400390625,
"learning_rate": 5.459488031412823e-07,
"loss": 0.0,
"step": 12570
},
{
"epoch": 2.8390882419318437,
"grad_norm": 0.0004482269287109375,
"learning_rate": 5.383976440383599e-07,
"loss": 0.0,
"step": 12580
},
{
"epoch": 2.8413450688332205,
"grad_norm": 0.006591796875,
"learning_rate": 5.308464849354376e-07,
"loss": 0.0001,
"step": 12590
},
{
"epoch": 2.843601895734597,
"grad_norm": 0.0032806396484375,
"learning_rate": 5.232953258325153e-07,
"loss": 0.0,
"step": 12600
},
{
"epoch": 2.845858722635974,
"grad_norm": 0.003326416015625,
"learning_rate": 5.15744166729593e-07,
"loss": 0.0,
"step": 12610
},
{
"epoch": 2.8481155495373507,
"grad_norm": 0.0005645751953125,
"learning_rate": 5.081930076266707e-07,
"loss": 0.0,
"step": 12620
},
{
"epoch": 2.850372376438727,
"grad_norm": 0.00010633468627929688,
"learning_rate": 5.006418485237484e-07,
"loss": 0.0,
"step": 12630
},
{
"epoch": 2.8526292033401037,
"grad_norm": 0.00022411346435546875,
"learning_rate": 4.930906894208262e-07,
"loss": 0.0,
"step": 12640
},
{
"epoch": 2.8548860302414805,
"grad_norm": 0.00022411346435546875,
"learning_rate": 4.855395303179038e-07,
"loss": 0.0,
"step": 12650
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.000301361083984375,
"learning_rate": 4.779883712149815e-07,
"loss": 0.0,
"step": 12660
},
{
"epoch": 2.859399684044234,
"grad_norm": 0.00048065185546875,
"learning_rate": 4.704372121120592e-07,
"loss": 0.0,
"step": 12670
},
{
"epoch": 2.8616565109456102,
"grad_norm": 0.00174713134765625,
"learning_rate": 4.6288605300913696e-07,
"loss": 0.0,
"step": 12680
},
{
"epoch": 2.863913337846987,
"grad_norm": 0.02001953125,
"learning_rate": 4.5533489390621467e-07,
"loss": 0.1636,
"step": 12690
},
{
"epoch": 2.8661701647483637,
"grad_norm": 0.000553131103515625,
"learning_rate": 4.477837348032923e-07,
"loss": 0.0,
"step": 12700
},
{
"epoch": 2.8684269916497405,
"grad_norm": 0.00110626220703125,
"learning_rate": 4.4023257570037003e-07,
"loss": 0.0,
"step": 12710
},
{
"epoch": 2.870683818551117,
"grad_norm": 0.00015926361083984375,
"learning_rate": 4.3268141659744774e-07,
"loss": 0.0,
"step": 12720
},
{
"epoch": 2.872940645452494,
"grad_norm": 0.0028228759765625,
"learning_rate": 4.251302574945254e-07,
"loss": 0.0,
"step": 12730
},
{
"epoch": 2.8751974723538707,
"grad_norm": 0.000568389892578125,
"learning_rate": 4.1757909839160315e-07,
"loss": 0.0,
"step": 12740
},
{
"epoch": 2.877454299255247,
"grad_norm": 0.0211181640625,
"learning_rate": 4.1002793928868086e-07,
"loss": 0.0,
"step": 12750
},
{
"epoch": 2.8797111261566237,
"grad_norm": 0.0003528594970703125,
"learning_rate": 4.0247678018575857e-07,
"loss": 0.0,
"step": 12760
},
{
"epoch": 2.8819679530580005,
"grad_norm": 0.001312255859375,
"learning_rate": 3.949256210828362e-07,
"loss": 0.0,
"step": 12770
},
{
"epoch": 2.884224779959377,
"grad_norm": 0.00194549560546875,
"learning_rate": 3.8737446197991393e-07,
"loss": 0.0,
"step": 12780
},
{
"epoch": 2.886481606860754,
"grad_norm": 0.0035858154296875,
"learning_rate": 3.798233028769917e-07,
"loss": 0.0,
"step": 12790
},
{
"epoch": 2.8887384337621302,
"grad_norm": 0.00087738037109375,
"learning_rate": 3.7227214377406935e-07,
"loss": 0.0,
"step": 12800
},
{
"epoch": 2.890995260663507,
"grad_norm": 0.0003910064697265625,
"learning_rate": 3.6472098467114705e-07,
"loss": 0.0,
"step": 12810
},
{
"epoch": 2.8932520875648837,
"grad_norm": 0.00167083740234375,
"learning_rate": 3.5716982556822476e-07,
"loss": 0.0,
"step": 12820
},
{
"epoch": 2.8955089144662605,
"grad_norm": 0.001190185546875,
"learning_rate": 3.496186664653024e-07,
"loss": 0.2909,
"step": 12830
},
{
"epoch": 2.897765741367637,
"grad_norm": 0.000522613525390625,
"learning_rate": 3.420675073623801e-07,
"loss": 0.0,
"step": 12840
},
{
"epoch": 2.900022568269014,
"grad_norm": 0.00013828277587890625,
"learning_rate": 3.345163482594579e-07,
"loss": 0.0,
"step": 12850
},
{
"epoch": 2.9022793951703907,
"grad_norm": 58.5,
"learning_rate": 3.2696518915653554e-07,
"loss": 0.4576,
"step": 12860
},
{
"epoch": 2.904536222071767,
"grad_norm": 0.00213623046875,
"learning_rate": 3.1941403005361325e-07,
"loss": 0.0,
"step": 12870
},
{
"epoch": 2.9067930489731437,
"grad_norm": 0.000522613525390625,
"learning_rate": 3.1186287095069096e-07,
"loss": 0.0001,
"step": 12880
},
{
"epoch": 2.9090498758745205,
"grad_norm": 0.005035400390625,
"learning_rate": 3.0431171184776866e-07,
"loss": 0.0,
"step": 12890
},
{
"epoch": 2.911306702775897,
"grad_norm": 0.0010223388671875,
"learning_rate": 2.9676055274484637e-07,
"loss": 0.0,
"step": 12900
},
{
"epoch": 2.9135635296772735,
"grad_norm": 0.00113677978515625,
"learning_rate": 2.892093936419241e-07,
"loss": 0.0,
"step": 12910
},
{
"epoch": 2.9158203565786502,
"grad_norm": 0.00145721435546875,
"learning_rate": 2.816582345390018e-07,
"loss": 0.0,
"step": 12920
},
{
"epoch": 2.918077183480027,
"grad_norm": 0.000316619873046875,
"learning_rate": 2.7410707543607944e-07,
"loss": 0.2586,
"step": 12930
},
{
"epoch": 2.9203340103814037,
"grad_norm": 0.0020599365234375,
"learning_rate": 2.6655591633315715e-07,
"loss": 0.0,
"step": 12940
},
{
"epoch": 2.9225908372827805,
"grad_norm": 0.00087738037109375,
"learning_rate": 2.5900475723023486e-07,
"loss": 0.0,
"step": 12950
},
{
"epoch": 2.924847664184157,
"grad_norm": 0.000518798828125,
"learning_rate": 2.5145359812731257e-07,
"loss": 0.0,
"step": 12960
},
{
"epoch": 2.927104491085534,
"grad_norm": 6.437301635742188e-05,
"learning_rate": 2.439024390243903e-07,
"loss": 0.0,
"step": 12970
},
{
"epoch": 2.9293613179869102,
"grad_norm": 54.5,
"learning_rate": 2.3635127992146796e-07,
"loss": 0.1558,
"step": 12980
},
{
"epoch": 2.931618144888287,
"grad_norm": 0.0007781982421875,
"learning_rate": 2.2880012081854564e-07,
"loss": 0.0,
"step": 12990
},
{
"epoch": 2.9338749717896637,
"grad_norm": 0.0004444122314453125,
"learning_rate": 2.2124896171562337e-07,
"loss": 0.0,
"step": 13000
},
{
"epoch": 2.9361317986910405,
"grad_norm": 124.5,
"learning_rate": 2.1369780261270105e-07,
"loss": 0.2421,
"step": 13010
},
{
"epoch": 2.938388625592417,
"grad_norm": 0.0010528564453125,
"learning_rate": 2.061466435097788e-07,
"loss": 0.0,
"step": 13020
},
{
"epoch": 2.9406454524937935,
"grad_norm": 0.00022411346435546875,
"learning_rate": 1.9859548440685647e-07,
"loss": 0.0,
"step": 13030
},
{
"epoch": 2.9429022793951702,
"grad_norm": 0.00018215179443359375,
"learning_rate": 1.9104432530393415e-07,
"loss": 0.221,
"step": 13040
},
{
"epoch": 2.945159106296547,
"grad_norm": 0.006195068359375,
"learning_rate": 1.8349316620101188e-07,
"loss": 0.0,
"step": 13050
},
{
"epoch": 2.9474159331979237,
"grad_norm": 0.00017070770263671875,
"learning_rate": 1.7594200709808957e-07,
"loss": 0.0,
"step": 13060
},
{
"epoch": 2.9496727600993005,
"grad_norm": 0.00012302398681640625,
"learning_rate": 1.6839084799516725e-07,
"loss": 0.0,
"step": 13070
},
{
"epoch": 2.951929587000677,
"grad_norm": 0.0031585693359375,
"learning_rate": 1.6083968889224498e-07,
"loss": 0.0,
"step": 13080
},
{
"epoch": 2.954186413902054,
"grad_norm": 0.000843048095703125,
"learning_rate": 1.5328852978932266e-07,
"loss": 0.0,
"step": 13090
},
{
"epoch": 2.9564432408034302,
"grad_norm": 0.0026397705078125,
"learning_rate": 1.4573737068640037e-07,
"loss": 0.0,
"step": 13100
},
{
"epoch": 2.958700067704807,
"grad_norm": 0.00093841552734375,
"learning_rate": 1.3818621158347808e-07,
"loss": 0.0,
"step": 13110
},
{
"epoch": 2.9609568946061837,
"grad_norm": 0.0037384033203125,
"learning_rate": 1.3063505248055579e-07,
"loss": 0.0,
"step": 13120
},
{
"epoch": 2.9632137215075605,
"grad_norm": 0.000659942626953125,
"learning_rate": 1.2308389337763347e-07,
"loss": 0.0,
"step": 13130
},
{
"epoch": 2.9654705484089368,
"grad_norm": 0.0002613067626953125,
"learning_rate": 1.1553273427471118e-07,
"loss": 0.0,
"step": 13140
},
{
"epoch": 2.9677273753103135,
"grad_norm": 0.000972747802734375,
"learning_rate": 1.0798157517178888e-07,
"loss": 0.0,
"step": 13150
},
{
"epoch": 2.9699842022116902,
"grad_norm": 0.004058837890625,
"learning_rate": 1.0043041606886658e-07,
"loss": 0.0,
"step": 13160
},
{
"epoch": 2.972241029113067,
"grad_norm": 0.0020599365234375,
"learning_rate": 9.287925696594427e-08,
"loss": 0.3007,
"step": 13170
},
{
"epoch": 2.9744978560144437,
"grad_norm": 164.0,
"learning_rate": 8.532809786302198e-08,
"loss": 0.1846,
"step": 13180
},
{
"epoch": 2.9767546829158205,
"grad_norm": 0.00124359130859375,
"learning_rate": 7.777693876009968e-08,
"loss": 0.0,
"step": 13190
},
{
"epoch": 2.979011509817197,
"grad_norm": 0.028076171875,
"learning_rate": 7.022577965717738e-08,
"loss": 0.0,
"step": 13200
},
{
"epoch": 2.981268336718574,
"grad_norm": 0.0002155303955078125,
"learning_rate": 6.267462055425508e-08,
"loss": 0.0353,
"step": 13210
},
{
"epoch": 2.9835251636199502,
"grad_norm": 0.00018978118896484375,
"learning_rate": 5.5123461451332786e-08,
"loss": 0.1542,
"step": 13220
},
{
"epoch": 2.985781990521327,
"grad_norm": 0.004119873046875,
"learning_rate": 4.757230234841049e-08,
"loss": 0.0,
"step": 13230
},
{
"epoch": 2.9880388174227037,
"grad_norm": 0.0001163482666015625,
"learning_rate": 4.002114324548819e-08,
"loss": 0.0,
"step": 13240
},
{
"epoch": 2.9902956443240805,
"grad_norm": 0.000827789306640625,
"learning_rate": 3.246998414256588e-08,
"loss": 0.0,
"step": 13250
},
{
"epoch": 2.9925524712254568,
"grad_norm": 28.0,
"learning_rate": 2.4918825039643587e-08,
"loss": 0.6274,
"step": 13260
},
{
"epoch": 2.9948092981268335,
"grad_norm": 0.00372314453125,
"learning_rate": 1.736766593672129e-08,
"loss": 0.0,
"step": 13270
},
{
"epoch": 2.9970661250282102,
"grad_norm": 0.00156402587890625,
"learning_rate": 9.81650683379899e-09,
"loss": 0.0,
"step": 13280
},
{
"epoch": 2.999322951929587,
"grad_norm": 0.0002536773681640625,
"learning_rate": 2.2653477308766897e-09,
"loss": 0.0,
"step": 13290
}
],
"logging_steps": 10,
"max_steps": 13293,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.091760756705075e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}