vit-face-universal / checkpoint-12904 /trainer_state.json
michaelgathara's picture
Upload folder using huggingface_hub
09e0768 verified
{
"best_global_step": 12904,
"best_metric": 0.3503767491926803,
"best_model_checkpoint": "models/combined_finetuned\\checkpoint-12904",
"epoch": 2.0,
"eval_steps": 500,
"global_step": 12904,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0015499070055796653,
"grad_norm": 18.449249267578125,
"learning_rate": 1.9986050836949784e-05,
"loss": 2.3857,
"step": 10
},
{
"epoch": 0.0030998140111593306,
"grad_norm": 17.223068237304688,
"learning_rate": 1.997055176689399e-05,
"loss": 1.8583,
"step": 20
},
{
"epoch": 0.0046497210167389955,
"grad_norm": 6.169708251953125,
"learning_rate": 1.995505269683819e-05,
"loss": 1.4451,
"step": 30
},
{
"epoch": 0.006199628022318661,
"grad_norm": 14.2859525680542,
"learning_rate": 1.9939553626782395e-05,
"loss": 1.3793,
"step": 40
},
{
"epoch": 0.007749535027898326,
"grad_norm": 10.35885238647461,
"learning_rate": 1.9924054556726596e-05,
"loss": 1.3316,
"step": 50
},
{
"epoch": 0.009299442033477991,
"grad_norm": 15.786426544189453,
"learning_rate": 1.99085554866708e-05,
"loss": 1.5881,
"step": 60
},
{
"epoch": 0.010849349039057656,
"grad_norm": 8.541648864746094,
"learning_rate": 1.9893056416615006e-05,
"loss": 1.1178,
"step": 70
},
{
"epoch": 0.012399256044637322,
"grad_norm": 12.486669540405273,
"learning_rate": 1.9877557346559207e-05,
"loss": 1.2329,
"step": 80
},
{
"epoch": 0.013949163050216987,
"grad_norm": 18.372262954711914,
"learning_rate": 1.9862058276503412e-05,
"loss": 1.1821,
"step": 90
},
{
"epoch": 0.015499070055796652,
"grad_norm": 14.894574165344238,
"learning_rate": 1.9846559206447617e-05,
"loss": 1.154,
"step": 100
},
{
"epoch": 0.017048977061376317,
"grad_norm": 10.745121002197266,
"learning_rate": 1.9831060136391818e-05,
"loss": 1.2915,
"step": 110
},
{
"epoch": 0.018598884066955982,
"grad_norm": 9.22031307220459,
"learning_rate": 1.9815561066336023e-05,
"loss": 1.3442,
"step": 120
},
{
"epoch": 0.020148791072535647,
"grad_norm": 7.619173526763916,
"learning_rate": 1.9800061996280224e-05,
"loss": 1.2266,
"step": 130
},
{
"epoch": 0.02169869807811531,
"grad_norm": 7.492969512939453,
"learning_rate": 1.978456292622443e-05,
"loss": 1.2723,
"step": 140
},
{
"epoch": 0.023248605083694977,
"grad_norm": 8.811915397644043,
"learning_rate": 1.976906385616863e-05,
"loss": 1.0317,
"step": 150
},
{
"epoch": 0.024798512089274645,
"grad_norm": 8.983158111572266,
"learning_rate": 1.9753564786112835e-05,
"loss": 0.9404,
"step": 160
},
{
"epoch": 0.02634841909485431,
"grad_norm": 7.216524600982666,
"learning_rate": 1.9738065716057036e-05,
"loss": 1.1478,
"step": 170
},
{
"epoch": 0.027898326100433975,
"grad_norm": 7.921091079711914,
"learning_rate": 1.972256664600124e-05,
"loss": 1.3151,
"step": 180
},
{
"epoch": 0.02944823310601364,
"grad_norm": 9.79519271850586,
"learning_rate": 1.9707067575945446e-05,
"loss": 1.0722,
"step": 190
},
{
"epoch": 0.030998140111593304,
"grad_norm": 6.854232311248779,
"learning_rate": 1.9691568505889647e-05,
"loss": 1.021,
"step": 200
},
{
"epoch": 0.03254804711717297,
"grad_norm": 5.456936836242676,
"learning_rate": 1.9676069435833852e-05,
"loss": 0.9743,
"step": 210
},
{
"epoch": 0.034097954122752634,
"grad_norm": 9.073031425476074,
"learning_rate": 1.9660570365778057e-05,
"loss": 1.1088,
"step": 220
},
{
"epoch": 0.0356478611283323,
"grad_norm": 11.155451774597168,
"learning_rate": 1.9645071295722258e-05,
"loss": 1.1379,
"step": 230
},
{
"epoch": 0.037197768133911964,
"grad_norm": 6.259587287902832,
"learning_rate": 1.9629572225666463e-05,
"loss": 1.2134,
"step": 240
},
{
"epoch": 0.03874767513949163,
"grad_norm": 9.289825439453125,
"learning_rate": 1.9614073155610664e-05,
"loss": 0.8888,
"step": 250
},
{
"epoch": 0.040297582145071294,
"grad_norm": 10.56016731262207,
"learning_rate": 1.959857408555487e-05,
"loss": 0.866,
"step": 260
},
{
"epoch": 0.04184748915065096,
"grad_norm": 9.807143211364746,
"learning_rate": 1.958307501549907e-05,
"loss": 1.0504,
"step": 270
},
{
"epoch": 0.04339739615623062,
"grad_norm": 8.206954956054688,
"learning_rate": 1.9567575945443275e-05,
"loss": 0.9964,
"step": 280
},
{
"epoch": 0.04494730316181029,
"grad_norm": 11.40170955657959,
"learning_rate": 1.9552076875387477e-05,
"loss": 0.9297,
"step": 290
},
{
"epoch": 0.04649721016738995,
"grad_norm": 12.768455505371094,
"learning_rate": 1.953657780533168e-05,
"loss": 0.9266,
"step": 300
},
{
"epoch": 0.048047117172969625,
"grad_norm": 7.376617908477783,
"learning_rate": 1.9521078735275886e-05,
"loss": 0.9087,
"step": 310
},
{
"epoch": 0.04959702417854929,
"grad_norm": 7.627091884613037,
"learning_rate": 1.9505579665220087e-05,
"loss": 0.8817,
"step": 320
},
{
"epoch": 0.051146931184128955,
"grad_norm": 8.248144149780273,
"learning_rate": 1.9490080595164292e-05,
"loss": 0.8539,
"step": 330
},
{
"epoch": 0.05269683818970862,
"grad_norm": 8.01970386505127,
"learning_rate": 1.9474581525108497e-05,
"loss": 1.1062,
"step": 340
},
{
"epoch": 0.054246745195288285,
"grad_norm": 6.326109409332275,
"learning_rate": 1.94590824550527e-05,
"loss": 0.9872,
"step": 350
},
{
"epoch": 0.05579665220086795,
"grad_norm": 6.3588128089904785,
"learning_rate": 1.9443583384996903e-05,
"loss": 1.0147,
"step": 360
},
{
"epoch": 0.057346559206447614,
"grad_norm": 5.58894157409668,
"learning_rate": 1.9428084314941104e-05,
"loss": 0.9058,
"step": 370
},
{
"epoch": 0.05889646621202728,
"grad_norm": 7.436977863311768,
"learning_rate": 1.941258524488531e-05,
"loss": 1.0007,
"step": 380
},
{
"epoch": 0.060446373217606944,
"grad_norm": 9.496243476867676,
"learning_rate": 1.939708617482951e-05,
"loss": 1.0589,
"step": 390
},
{
"epoch": 0.06199628022318661,
"grad_norm": 8.498618125915527,
"learning_rate": 1.9381587104773715e-05,
"loss": 1.0345,
"step": 400
},
{
"epoch": 0.06354618722876627,
"grad_norm": 9.962313652038574,
"learning_rate": 1.9366088034717917e-05,
"loss": 0.889,
"step": 410
},
{
"epoch": 0.06509609423434594,
"grad_norm": 10.519235610961914,
"learning_rate": 1.935058896466212e-05,
"loss": 1.1431,
"step": 420
},
{
"epoch": 0.0666460012399256,
"grad_norm": 9.293874740600586,
"learning_rate": 1.9335089894606326e-05,
"loss": 0.842,
"step": 430
},
{
"epoch": 0.06819590824550527,
"grad_norm": 13.305548667907715,
"learning_rate": 1.9319590824550528e-05,
"loss": 1.0341,
"step": 440
},
{
"epoch": 0.06974581525108493,
"grad_norm": 10.887805938720703,
"learning_rate": 1.9304091754494732e-05,
"loss": 0.9853,
"step": 450
},
{
"epoch": 0.0712957222566646,
"grad_norm": 9.665596008300781,
"learning_rate": 1.9288592684438937e-05,
"loss": 0.8416,
"step": 460
},
{
"epoch": 0.07284562926224426,
"grad_norm": 7.394255638122559,
"learning_rate": 1.927309361438314e-05,
"loss": 0.8716,
"step": 470
},
{
"epoch": 0.07439553626782393,
"grad_norm": 9.392470359802246,
"learning_rate": 1.9257594544327343e-05,
"loss": 0.8584,
"step": 480
},
{
"epoch": 0.07594544327340359,
"grad_norm": 7.491386413574219,
"learning_rate": 1.9242095474271545e-05,
"loss": 0.8534,
"step": 490
},
{
"epoch": 0.07749535027898326,
"grad_norm": 9.354204177856445,
"learning_rate": 1.922659640421575e-05,
"loss": 0.9783,
"step": 500
},
{
"epoch": 0.07904525728456292,
"grad_norm": 7.941482067108154,
"learning_rate": 1.921109733415995e-05,
"loss": 0.9655,
"step": 510
},
{
"epoch": 0.08059516429014259,
"grad_norm": 7.988227367401123,
"learning_rate": 1.9195598264104156e-05,
"loss": 0.818,
"step": 520
},
{
"epoch": 0.08214507129572225,
"grad_norm": 10.701931953430176,
"learning_rate": 1.9180099194048357e-05,
"loss": 0.9184,
"step": 530
},
{
"epoch": 0.08369497830130192,
"grad_norm": 7.293822765350342,
"learning_rate": 1.916460012399256e-05,
"loss": 0.5997,
"step": 540
},
{
"epoch": 0.08524488530688158,
"grad_norm": 6.722202301025391,
"learning_rate": 1.9149101053936766e-05,
"loss": 0.8596,
"step": 550
},
{
"epoch": 0.08679479231246125,
"grad_norm": 12.55391788482666,
"learning_rate": 1.9133601983880968e-05,
"loss": 1.0486,
"step": 560
},
{
"epoch": 0.08834469931804091,
"grad_norm": 4.803437232971191,
"learning_rate": 1.9118102913825173e-05,
"loss": 0.7763,
"step": 570
},
{
"epoch": 0.08989460632362058,
"grad_norm": 4.649928569793701,
"learning_rate": 1.9102603843769377e-05,
"loss": 1.0312,
"step": 580
},
{
"epoch": 0.09144451332920024,
"grad_norm": 10.55102825164795,
"learning_rate": 1.908710477371358e-05,
"loss": 0.9301,
"step": 590
},
{
"epoch": 0.0929944203347799,
"grad_norm": 5.860504150390625,
"learning_rate": 1.9071605703657783e-05,
"loss": 0.8281,
"step": 600
},
{
"epoch": 0.09454432734035959,
"grad_norm": 4.4910101890563965,
"learning_rate": 1.9056106633601985e-05,
"loss": 0.8093,
"step": 610
},
{
"epoch": 0.09609423434593925,
"grad_norm": 16.84994888305664,
"learning_rate": 1.904060756354619e-05,
"loss": 0.9299,
"step": 620
},
{
"epoch": 0.09764414135151891,
"grad_norm": 12.170949935913086,
"learning_rate": 1.902510849349039e-05,
"loss": 0.6854,
"step": 630
},
{
"epoch": 0.09919404835709858,
"grad_norm": 10.96451473236084,
"learning_rate": 1.9009609423434596e-05,
"loss": 0.9306,
"step": 640
},
{
"epoch": 0.10074395536267824,
"grad_norm": 10.7017822265625,
"learning_rate": 1.8994110353378797e-05,
"loss": 0.9565,
"step": 650
},
{
"epoch": 0.10229386236825791,
"grad_norm": 7.391804218292236,
"learning_rate": 1.8978611283323002e-05,
"loss": 0.7833,
"step": 660
},
{
"epoch": 0.10384376937383757,
"grad_norm": 12.865453720092773,
"learning_rate": 1.8963112213267207e-05,
"loss": 0.9807,
"step": 670
},
{
"epoch": 0.10539367637941724,
"grad_norm": 10.577310562133789,
"learning_rate": 1.8947613143211408e-05,
"loss": 0.8834,
"step": 680
},
{
"epoch": 0.1069435833849969,
"grad_norm": 11.489950180053711,
"learning_rate": 1.8932114073155613e-05,
"loss": 1.0228,
"step": 690
},
{
"epoch": 0.10849349039057657,
"grad_norm": 2.870493173599243,
"learning_rate": 1.8916615003099818e-05,
"loss": 0.7706,
"step": 700
},
{
"epoch": 0.11004339739615623,
"grad_norm": 5.9892706871032715,
"learning_rate": 1.890111593304402e-05,
"loss": 0.8552,
"step": 710
},
{
"epoch": 0.1115933044017359,
"grad_norm": 3.6785945892333984,
"learning_rate": 1.8885616862988224e-05,
"loss": 0.7112,
"step": 720
},
{
"epoch": 0.11314321140731556,
"grad_norm": 6.439176559448242,
"learning_rate": 1.8870117792932425e-05,
"loss": 0.8313,
"step": 730
},
{
"epoch": 0.11469311841289523,
"grad_norm": 15.108148574829102,
"learning_rate": 1.885461872287663e-05,
"loss": 0.8033,
"step": 740
},
{
"epoch": 0.1162430254184749,
"grad_norm": 6.084428310394287,
"learning_rate": 1.883911965282083e-05,
"loss": 0.7643,
"step": 750
},
{
"epoch": 0.11779293242405456,
"grad_norm": 10.377224922180176,
"learning_rate": 1.8823620582765036e-05,
"loss": 1.0162,
"step": 760
},
{
"epoch": 0.11934283942963422,
"grad_norm": 9.730420112609863,
"learning_rate": 1.8808121512709237e-05,
"loss": 0.8065,
"step": 770
},
{
"epoch": 0.12089274643521389,
"grad_norm": 9.038290023803711,
"learning_rate": 1.8792622442653442e-05,
"loss": 0.8185,
"step": 780
},
{
"epoch": 0.12244265344079355,
"grad_norm": 9.10308837890625,
"learning_rate": 1.8777123372597647e-05,
"loss": 0.9643,
"step": 790
},
{
"epoch": 0.12399256044637322,
"grad_norm": 7.978387832641602,
"learning_rate": 1.8761624302541848e-05,
"loss": 0.9773,
"step": 800
},
{
"epoch": 0.12554246745195288,
"grad_norm": 9.240801811218262,
"learning_rate": 1.8746125232486053e-05,
"loss": 1.0035,
"step": 810
},
{
"epoch": 0.12709237445753255,
"grad_norm": 7.459136962890625,
"learning_rate": 1.8730626162430258e-05,
"loss": 0.898,
"step": 820
},
{
"epoch": 0.1286422814631122,
"grad_norm": 7.9399027824401855,
"learning_rate": 1.871512709237446e-05,
"loss": 1.061,
"step": 830
},
{
"epoch": 0.13019218846869188,
"grad_norm": 5.291181564331055,
"learning_rate": 1.8699628022318664e-05,
"loss": 0.8294,
"step": 840
},
{
"epoch": 0.13174209547427154,
"grad_norm": 5.9574079513549805,
"learning_rate": 1.8684128952262865e-05,
"loss": 1.1741,
"step": 850
},
{
"epoch": 0.1332920024798512,
"grad_norm": 8.97977352142334,
"learning_rate": 1.866862988220707e-05,
"loss": 1.024,
"step": 860
},
{
"epoch": 0.13484190948543087,
"grad_norm": 7.710363388061523,
"learning_rate": 1.865313081215127e-05,
"loss": 0.862,
"step": 870
},
{
"epoch": 0.13639181649101054,
"grad_norm": 12.134010314941406,
"learning_rate": 1.8637631742095476e-05,
"loss": 1.0665,
"step": 880
},
{
"epoch": 0.1379417234965902,
"grad_norm": 7.517919063568115,
"learning_rate": 1.8622132672039677e-05,
"loss": 0.9495,
"step": 890
},
{
"epoch": 0.13949163050216987,
"grad_norm": 7.245954990386963,
"learning_rate": 1.8606633601983882e-05,
"loss": 0.7532,
"step": 900
},
{
"epoch": 0.14104153750774953,
"grad_norm": 7.372646808624268,
"learning_rate": 1.8591134531928087e-05,
"loss": 0.8079,
"step": 910
},
{
"epoch": 0.1425914445133292,
"grad_norm": 7.447389125823975,
"learning_rate": 1.857563546187229e-05,
"loss": 0.7802,
"step": 920
},
{
"epoch": 0.14414135151890886,
"grad_norm": 5.043920993804932,
"learning_rate": 1.8560136391816493e-05,
"loss": 0.8062,
"step": 930
},
{
"epoch": 0.14569125852448853,
"grad_norm": 9.097290992736816,
"learning_rate": 1.8544637321760698e-05,
"loss": 0.8234,
"step": 940
},
{
"epoch": 0.1472411655300682,
"grad_norm": 6.955564498901367,
"learning_rate": 1.85291382517049e-05,
"loss": 0.951,
"step": 950
},
{
"epoch": 0.14879107253564786,
"grad_norm": 8.295125961303711,
"learning_rate": 1.8513639181649104e-05,
"loss": 0.6106,
"step": 960
},
{
"epoch": 0.15034097954122752,
"grad_norm": 6.4834513664245605,
"learning_rate": 1.8498140111593305e-05,
"loss": 0.8087,
"step": 970
},
{
"epoch": 0.15189088654680719,
"grad_norm": 8.002385139465332,
"learning_rate": 1.848264104153751e-05,
"loss": 0.7515,
"step": 980
},
{
"epoch": 0.15344079355238685,
"grad_norm": 8.667396545410156,
"learning_rate": 1.846714197148171e-05,
"loss": 0.8659,
"step": 990
},
{
"epoch": 0.15499070055796652,
"grad_norm": 7.546538352966309,
"learning_rate": 1.8451642901425916e-05,
"loss": 0.9391,
"step": 1000
},
{
"epoch": 0.15654060756354618,
"grad_norm": 10.719348907470703,
"learning_rate": 1.8436143831370118e-05,
"loss": 1.0736,
"step": 1010
},
{
"epoch": 0.15809051456912585,
"grad_norm": 4.889281749725342,
"learning_rate": 1.8420644761314322e-05,
"loss": 0.6875,
"step": 1020
},
{
"epoch": 0.1596404215747055,
"grad_norm": 7.394768238067627,
"learning_rate": 1.8405145691258527e-05,
"loss": 0.8181,
"step": 1030
},
{
"epoch": 0.16119032858028517,
"grad_norm": 8.632201194763184,
"learning_rate": 1.8389646621202732e-05,
"loss": 1.0494,
"step": 1040
},
{
"epoch": 0.16274023558586484,
"grad_norm": 7.193940162658691,
"learning_rate": 1.8374147551146933e-05,
"loss": 0.7158,
"step": 1050
},
{
"epoch": 0.1642901425914445,
"grad_norm": 7.70432710647583,
"learning_rate": 1.8358648481091138e-05,
"loss": 1.0215,
"step": 1060
},
{
"epoch": 0.16584004959702417,
"grad_norm": 5.057873249053955,
"learning_rate": 1.834314941103534e-05,
"loss": 0.6548,
"step": 1070
},
{
"epoch": 0.16738995660260383,
"grad_norm": 6.86895751953125,
"learning_rate": 1.8327650340979544e-05,
"loss": 0.8226,
"step": 1080
},
{
"epoch": 0.1689398636081835,
"grad_norm": 3.7493245601654053,
"learning_rate": 1.8312151270923746e-05,
"loss": 0.8125,
"step": 1090
},
{
"epoch": 0.17048977061376316,
"grad_norm": 6.053397178649902,
"learning_rate": 1.829665220086795e-05,
"loss": 0.7817,
"step": 1100
},
{
"epoch": 0.17203967761934283,
"grad_norm": 7.619532585144043,
"learning_rate": 1.8281153130812152e-05,
"loss": 0.6368,
"step": 1110
},
{
"epoch": 0.1735895846249225,
"grad_norm": 10.370380401611328,
"learning_rate": 1.8265654060756356e-05,
"loss": 0.899,
"step": 1120
},
{
"epoch": 0.17513949163050216,
"grad_norm": 3.4213171005249023,
"learning_rate": 1.8250154990700558e-05,
"loss": 0.5833,
"step": 1130
},
{
"epoch": 0.17668939863608182,
"grad_norm": 8.931166648864746,
"learning_rate": 1.8234655920644763e-05,
"loss": 0.7682,
"step": 1140
},
{
"epoch": 0.1782393056416615,
"grad_norm": 4.954645156860352,
"learning_rate": 1.8219156850588967e-05,
"loss": 0.8337,
"step": 1150
},
{
"epoch": 0.17978921264724115,
"grad_norm": 3.9182803630828857,
"learning_rate": 1.8203657780533172e-05,
"loss": 0.9461,
"step": 1160
},
{
"epoch": 0.18133911965282082,
"grad_norm": 10.957509994506836,
"learning_rate": 1.8188158710477373e-05,
"loss": 1.0823,
"step": 1170
},
{
"epoch": 0.18288902665840048,
"grad_norm": 5.406250476837158,
"learning_rate": 1.8172659640421578e-05,
"loss": 0.8319,
"step": 1180
},
{
"epoch": 0.18443893366398015,
"grad_norm": 9.316621780395508,
"learning_rate": 1.815716057036578e-05,
"loss": 0.8848,
"step": 1190
},
{
"epoch": 0.1859888406695598,
"grad_norm": 7.41352653503418,
"learning_rate": 1.8141661500309984e-05,
"loss": 0.8053,
"step": 1200
},
{
"epoch": 0.1875387476751395,
"grad_norm": 8.243196487426758,
"learning_rate": 1.8126162430254186e-05,
"loss": 0.9792,
"step": 1210
},
{
"epoch": 0.18908865468071917,
"grad_norm": 6.000314712524414,
"learning_rate": 1.811066336019839e-05,
"loss": 0.7051,
"step": 1220
},
{
"epoch": 0.19063856168629884,
"grad_norm": 9.19047737121582,
"learning_rate": 1.8095164290142592e-05,
"loss": 0.7739,
"step": 1230
},
{
"epoch": 0.1921884686918785,
"grad_norm": 4.234779357910156,
"learning_rate": 1.8079665220086797e-05,
"loss": 1.0071,
"step": 1240
},
{
"epoch": 0.19373837569745817,
"grad_norm": 5.595962047576904,
"learning_rate": 1.8064166150030998e-05,
"loss": 0.9207,
"step": 1250
},
{
"epoch": 0.19528828270303783,
"grad_norm": 9.883673667907715,
"learning_rate": 1.8048667079975203e-05,
"loss": 0.7329,
"step": 1260
},
{
"epoch": 0.1968381897086175,
"grad_norm": 4.593166828155518,
"learning_rate": 1.8033168009919408e-05,
"loss": 0.6768,
"step": 1270
},
{
"epoch": 0.19838809671419716,
"grad_norm": 11.609228134155273,
"learning_rate": 1.8017668939863612e-05,
"loss": 1.1065,
"step": 1280
},
{
"epoch": 0.19993800371977682,
"grad_norm": 4.635692596435547,
"learning_rate": 1.8002169869807814e-05,
"loss": 0.568,
"step": 1290
},
{
"epoch": 0.2014879107253565,
"grad_norm": 10.727137565612793,
"learning_rate": 1.798667079975202e-05,
"loss": 0.8751,
"step": 1300
},
{
"epoch": 0.20303781773093615,
"grad_norm": 6.74473237991333,
"learning_rate": 1.797117172969622e-05,
"loss": 0.7213,
"step": 1310
},
{
"epoch": 0.20458772473651582,
"grad_norm": 0.40561366081237793,
"learning_rate": 1.7955672659640425e-05,
"loss": 0.7377,
"step": 1320
},
{
"epoch": 0.20613763174209548,
"grad_norm": 9.337224006652832,
"learning_rate": 1.7940173589584626e-05,
"loss": 0.8402,
"step": 1330
},
{
"epoch": 0.20768753874767515,
"grad_norm": 5.331075668334961,
"learning_rate": 1.792467451952883e-05,
"loss": 0.7356,
"step": 1340
},
{
"epoch": 0.2092374457532548,
"grad_norm": 10.293356895446777,
"learning_rate": 1.7909175449473032e-05,
"loss": 0.8442,
"step": 1350
},
{
"epoch": 0.21078735275883448,
"grad_norm": 3.94999098777771,
"learning_rate": 1.7893676379417237e-05,
"loss": 0.7805,
"step": 1360
},
{
"epoch": 0.21233725976441414,
"grad_norm": 13.504864692687988,
"learning_rate": 1.7878177309361438e-05,
"loss": 0.7697,
"step": 1370
},
{
"epoch": 0.2138871667699938,
"grad_norm": 8.761556625366211,
"learning_rate": 1.7862678239305643e-05,
"loss": 0.8553,
"step": 1380
},
{
"epoch": 0.21543707377557347,
"grad_norm": 9.55759334564209,
"learning_rate": 1.7847179169249848e-05,
"loss": 0.7005,
"step": 1390
},
{
"epoch": 0.21698698078115314,
"grad_norm": 9.412562370300293,
"learning_rate": 1.7831680099194053e-05,
"loss": 0.8092,
"step": 1400
},
{
"epoch": 0.2185368877867328,
"grad_norm": 3.6494250297546387,
"learning_rate": 1.7816181029138254e-05,
"loss": 0.7754,
"step": 1410
},
{
"epoch": 0.22008679479231247,
"grad_norm": 7.303593158721924,
"learning_rate": 1.780068195908246e-05,
"loss": 0.6011,
"step": 1420
},
{
"epoch": 0.22163670179789213,
"grad_norm": 4.854985237121582,
"learning_rate": 1.778518288902666e-05,
"loss": 0.8983,
"step": 1430
},
{
"epoch": 0.2231866088034718,
"grad_norm": 4.855501174926758,
"learning_rate": 1.7769683818970865e-05,
"loss": 0.6847,
"step": 1440
},
{
"epoch": 0.22473651580905146,
"grad_norm": 8.637642860412598,
"learning_rate": 1.7754184748915066e-05,
"loss": 0.6741,
"step": 1450
},
{
"epoch": 0.22628642281463113,
"grad_norm": 11.863778114318848,
"learning_rate": 1.773868567885927e-05,
"loss": 0.9925,
"step": 1460
},
{
"epoch": 0.2278363298202108,
"grad_norm": 7.329986095428467,
"learning_rate": 1.7723186608803472e-05,
"loss": 0.9337,
"step": 1470
},
{
"epoch": 0.22938623682579046,
"grad_norm": 10.348973274230957,
"learning_rate": 1.7707687538747677e-05,
"loss": 0.7293,
"step": 1480
},
{
"epoch": 0.23093614383137012,
"grad_norm": 10.60584831237793,
"learning_rate": 1.769218846869188e-05,
"loss": 0.9099,
"step": 1490
},
{
"epoch": 0.2324860508369498,
"grad_norm": 10.066960334777832,
"learning_rate": 1.7676689398636083e-05,
"loss": 0.8515,
"step": 1500
},
{
"epoch": 0.23403595784252945,
"grad_norm": 8.389772415161133,
"learning_rate": 1.7661190328580288e-05,
"loss": 0.8219,
"step": 1510
},
{
"epoch": 0.23558586484810912,
"grad_norm": 9.143502235412598,
"learning_rate": 1.7645691258524493e-05,
"loss": 0.7401,
"step": 1520
},
{
"epoch": 0.23713577185368878,
"grad_norm": 3.5140132904052734,
"learning_rate": 1.7630192188468694e-05,
"loss": 0.8184,
"step": 1530
},
{
"epoch": 0.23868567885926845,
"grad_norm": 7.025918483734131,
"learning_rate": 1.76146931184129e-05,
"loss": 0.5758,
"step": 1540
},
{
"epoch": 0.2402355858648481,
"grad_norm": 4.971466541290283,
"learning_rate": 1.75991940483571e-05,
"loss": 0.974,
"step": 1550
},
{
"epoch": 0.24178549287042778,
"grad_norm": 3.7477047443389893,
"learning_rate": 1.7583694978301305e-05,
"loss": 0.5627,
"step": 1560
},
{
"epoch": 0.24333539987600744,
"grad_norm": 8.450775146484375,
"learning_rate": 1.7568195908245506e-05,
"loss": 0.5772,
"step": 1570
},
{
"epoch": 0.2448853068815871,
"grad_norm": 5.886590480804443,
"learning_rate": 1.755269683818971e-05,
"loss": 0.6206,
"step": 1580
},
{
"epoch": 0.24643521388716677,
"grad_norm": 8.515731811523438,
"learning_rate": 1.7537197768133912e-05,
"loss": 0.8716,
"step": 1590
},
{
"epoch": 0.24798512089274644,
"grad_norm": 6.910757541656494,
"learning_rate": 1.7521698698078117e-05,
"loss": 0.8909,
"step": 1600
},
{
"epoch": 0.2495350278983261,
"grad_norm": 5.422664642333984,
"learning_rate": 1.750619962802232e-05,
"loss": 0.7547,
"step": 1610
},
{
"epoch": 0.25108493490390577,
"grad_norm": 9.602860450744629,
"learning_rate": 1.7490700557966523e-05,
"loss": 0.8456,
"step": 1620
},
{
"epoch": 0.25263484190948543,
"grad_norm": 6.122979164123535,
"learning_rate": 1.7475201487910728e-05,
"loss": 0.6478,
"step": 1630
},
{
"epoch": 0.2541847489150651,
"grad_norm": 9.506874084472656,
"learning_rate": 1.745970241785493e-05,
"loss": 0.8572,
"step": 1640
},
{
"epoch": 0.25573465592064476,
"grad_norm": 4.676219940185547,
"learning_rate": 1.7444203347799134e-05,
"loss": 0.9653,
"step": 1650
},
{
"epoch": 0.2572845629262244,
"grad_norm": 7.991693496704102,
"learning_rate": 1.7428704277743336e-05,
"loss": 0.5923,
"step": 1660
},
{
"epoch": 0.2588344699318041,
"grad_norm": 4.9383063316345215,
"learning_rate": 1.741320520768754e-05,
"loss": 0.7942,
"step": 1670
},
{
"epoch": 0.26038437693738375,
"grad_norm": 5.993865489959717,
"learning_rate": 1.7397706137631742e-05,
"loss": 0.7397,
"step": 1680
},
{
"epoch": 0.2619342839429634,
"grad_norm": 8.85261344909668,
"learning_rate": 1.7382207067575947e-05,
"loss": 0.8507,
"step": 1690
},
{
"epoch": 0.2634841909485431,
"grad_norm": 7.463364601135254,
"learning_rate": 1.7366707997520148e-05,
"loss": 0.7555,
"step": 1700
},
{
"epoch": 0.26503409795412275,
"grad_norm": 5.885605812072754,
"learning_rate": 1.7351208927464353e-05,
"loss": 0.6855,
"step": 1710
},
{
"epoch": 0.2665840049597024,
"grad_norm": 5.121423244476318,
"learning_rate": 1.7335709857408557e-05,
"loss": 0.6555,
"step": 1720
},
{
"epoch": 0.2681339119652821,
"grad_norm": 10.616006851196289,
"learning_rate": 1.732021078735276e-05,
"loss": 0.7157,
"step": 1730
},
{
"epoch": 0.26968381897086174,
"grad_norm": 6.397147178649902,
"learning_rate": 1.7304711717296964e-05,
"loss": 0.5973,
"step": 1740
},
{
"epoch": 0.2712337259764414,
"grad_norm": 6.465205669403076,
"learning_rate": 1.7289212647241168e-05,
"loss": 0.7021,
"step": 1750
},
{
"epoch": 0.2727836329820211,
"grad_norm": 12.990324974060059,
"learning_rate": 1.727371357718537e-05,
"loss": 1.0169,
"step": 1760
},
{
"epoch": 0.27433353998760074,
"grad_norm": 7.413804054260254,
"learning_rate": 1.7258214507129574e-05,
"loss": 0.67,
"step": 1770
},
{
"epoch": 0.2758834469931804,
"grad_norm": 9.471087455749512,
"learning_rate": 1.7242715437073776e-05,
"loss": 0.6722,
"step": 1780
},
{
"epoch": 0.27743335399876007,
"grad_norm": 4.511841297149658,
"learning_rate": 1.722721636701798e-05,
"loss": 0.6935,
"step": 1790
},
{
"epoch": 0.27898326100433973,
"grad_norm": 3.657198429107666,
"learning_rate": 1.7211717296962182e-05,
"loss": 0.6261,
"step": 1800
},
{
"epoch": 0.2805331680099194,
"grad_norm": 6.290339946746826,
"learning_rate": 1.7196218226906387e-05,
"loss": 0.7307,
"step": 1810
},
{
"epoch": 0.28208307501549906,
"grad_norm": 3.8453187942504883,
"learning_rate": 1.7180719156850588e-05,
"loss": 0.6748,
"step": 1820
},
{
"epoch": 0.28363298202107873,
"grad_norm": 8.411739349365234,
"learning_rate": 1.7165220086794793e-05,
"loss": 0.9805,
"step": 1830
},
{
"epoch": 0.2851828890266584,
"grad_norm": 7.1236653327941895,
"learning_rate": 1.7149721016738998e-05,
"loss": 0.8837,
"step": 1840
},
{
"epoch": 0.28673279603223806,
"grad_norm": 8.25506591796875,
"learning_rate": 1.71342219466832e-05,
"loss": 0.658,
"step": 1850
},
{
"epoch": 0.2882827030378177,
"grad_norm": 6.947543144226074,
"learning_rate": 1.7118722876627404e-05,
"loss": 0.6613,
"step": 1860
},
{
"epoch": 0.2898326100433974,
"grad_norm": 9.180550575256348,
"learning_rate": 1.710322380657161e-05,
"loss": 0.6232,
"step": 1870
},
{
"epoch": 0.29138251704897705,
"grad_norm": 2.687608003616333,
"learning_rate": 1.708772473651581e-05,
"loss": 0.5759,
"step": 1880
},
{
"epoch": 0.2929324240545567,
"grad_norm": 5.574268341064453,
"learning_rate": 1.7072225666460015e-05,
"loss": 0.686,
"step": 1890
},
{
"epoch": 0.2944823310601364,
"grad_norm": 5.5589070320129395,
"learning_rate": 1.7056726596404216e-05,
"loss": 0.7817,
"step": 1900
},
{
"epoch": 0.29603223806571605,
"grad_norm": 8.487772941589355,
"learning_rate": 1.704122752634842e-05,
"loss": 0.9106,
"step": 1910
},
{
"epoch": 0.2975821450712957,
"grad_norm": 5.565657615661621,
"learning_rate": 1.7025728456292622e-05,
"loss": 0.5631,
"step": 1920
},
{
"epoch": 0.2991320520768754,
"grad_norm": 3.6714425086975098,
"learning_rate": 1.7010229386236827e-05,
"loss": 0.6945,
"step": 1930
},
{
"epoch": 0.30068195908245504,
"grad_norm": 3.192857265472412,
"learning_rate": 1.6994730316181028e-05,
"loss": 0.6953,
"step": 1940
},
{
"epoch": 0.3022318660880347,
"grad_norm": 6.31390905380249,
"learning_rate": 1.6979231246125233e-05,
"loss": 0.9058,
"step": 1950
},
{
"epoch": 0.30378177309361437,
"grad_norm": 3.161252498626709,
"learning_rate": 1.6963732176069438e-05,
"loss": 0.9399,
"step": 1960
},
{
"epoch": 0.30533168009919404,
"grad_norm": 6.567419528961182,
"learning_rate": 1.694823310601364e-05,
"loss": 0.7066,
"step": 1970
},
{
"epoch": 0.3068815871047737,
"grad_norm": 10.786821365356445,
"learning_rate": 1.6932734035957844e-05,
"loss": 0.58,
"step": 1980
},
{
"epoch": 0.30843149411035337,
"grad_norm": 4.073357105255127,
"learning_rate": 1.691723496590205e-05,
"loss": 0.8259,
"step": 1990
},
{
"epoch": 0.30998140111593303,
"grad_norm": 5.362609386444092,
"learning_rate": 1.690173589584625e-05,
"loss": 0.7092,
"step": 2000
},
{
"epoch": 0.3115313081215127,
"grad_norm": 4.431297302246094,
"learning_rate": 1.6886236825790455e-05,
"loss": 0.8349,
"step": 2010
},
{
"epoch": 0.31308121512709236,
"grad_norm": 4.878475666046143,
"learning_rate": 1.6870737755734656e-05,
"loss": 0.6687,
"step": 2020
},
{
"epoch": 0.314631122132672,
"grad_norm": 7.905298233032227,
"learning_rate": 1.685523868567886e-05,
"loss": 0.7769,
"step": 2030
},
{
"epoch": 0.3161810291382517,
"grad_norm": 7.189042568206787,
"learning_rate": 1.6839739615623062e-05,
"loss": 0.7288,
"step": 2040
},
{
"epoch": 0.31773093614383136,
"grad_norm": 9.089221000671387,
"learning_rate": 1.6824240545567267e-05,
"loss": 0.6814,
"step": 2050
},
{
"epoch": 0.319280843149411,
"grad_norm": 8.003725051879883,
"learning_rate": 1.680874147551147e-05,
"loss": 0.8992,
"step": 2060
},
{
"epoch": 0.3208307501549907,
"grad_norm": 5.8466410636901855,
"learning_rate": 1.6793242405455673e-05,
"loss": 0.7694,
"step": 2070
},
{
"epoch": 0.32238065716057035,
"grad_norm": 5.350661277770996,
"learning_rate": 1.6777743335399878e-05,
"loss": 0.8208,
"step": 2080
},
{
"epoch": 0.32393056416615,
"grad_norm": 6.502786636352539,
"learning_rate": 1.676224426534408e-05,
"loss": 0.6959,
"step": 2090
},
{
"epoch": 0.3254804711717297,
"grad_norm": 11.890753746032715,
"learning_rate": 1.6746745195288284e-05,
"loss": 1.0159,
"step": 2100
},
{
"epoch": 0.32703037817730934,
"grad_norm": 6.462416648864746,
"learning_rate": 1.673124612523249e-05,
"loss": 0.8057,
"step": 2110
},
{
"epoch": 0.328580285182889,
"grad_norm": 8.766998291015625,
"learning_rate": 1.671574705517669e-05,
"loss": 0.9381,
"step": 2120
},
{
"epoch": 0.3301301921884687,
"grad_norm": 4.663023471832275,
"learning_rate": 1.6700247985120895e-05,
"loss": 0.9093,
"step": 2130
},
{
"epoch": 0.33168009919404834,
"grad_norm": 8.117950439453125,
"learning_rate": 1.6684748915065096e-05,
"loss": 0.6841,
"step": 2140
},
{
"epoch": 0.333230006199628,
"grad_norm": 3.851203441619873,
"learning_rate": 1.66692498450093e-05,
"loss": 0.6287,
"step": 2150
},
{
"epoch": 0.33477991320520767,
"grad_norm": 10.521380424499512,
"learning_rate": 1.6653750774953503e-05,
"loss": 0.6752,
"step": 2160
},
{
"epoch": 0.33632982021078733,
"grad_norm": 3.793071746826172,
"learning_rate": 1.6638251704897707e-05,
"loss": 0.6795,
"step": 2170
},
{
"epoch": 0.337879727216367,
"grad_norm": 8.640475273132324,
"learning_rate": 1.662275263484191e-05,
"loss": 0.7289,
"step": 2180
},
{
"epoch": 0.33942963422194666,
"grad_norm": 7.897937774658203,
"learning_rate": 1.6607253564786113e-05,
"loss": 0.5091,
"step": 2190
},
{
"epoch": 0.34097954122752633,
"grad_norm": 6.605180740356445,
"learning_rate": 1.6591754494730318e-05,
"loss": 0.7188,
"step": 2200
},
{
"epoch": 0.342529448233106,
"grad_norm": 10.198129653930664,
"learning_rate": 1.657625542467452e-05,
"loss": 0.7225,
"step": 2210
},
{
"epoch": 0.34407935523868566,
"grad_norm": 3.2186694145202637,
"learning_rate": 1.6560756354618724e-05,
"loss": 0.5876,
"step": 2220
},
{
"epoch": 0.3456292622442653,
"grad_norm": 11.494028091430664,
"learning_rate": 1.654525728456293e-05,
"loss": 0.7043,
"step": 2230
},
{
"epoch": 0.347179169249845,
"grad_norm": 9.092860221862793,
"learning_rate": 1.652975821450713e-05,
"loss": 0.8232,
"step": 2240
},
{
"epoch": 0.34872907625542465,
"grad_norm": 5.422155857086182,
"learning_rate": 1.6514259144451335e-05,
"loss": 0.6459,
"step": 2250
},
{
"epoch": 0.3502789832610043,
"grad_norm": 8.467617988586426,
"learning_rate": 1.6498760074395537e-05,
"loss": 0.6452,
"step": 2260
},
{
"epoch": 0.351828890266584,
"grad_norm": 8.067217826843262,
"learning_rate": 1.648326100433974e-05,
"loss": 0.8663,
"step": 2270
},
{
"epoch": 0.35337879727216365,
"grad_norm": 6.379934310913086,
"learning_rate": 1.6467761934283943e-05,
"loss": 0.6867,
"step": 2280
},
{
"epoch": 0.3549287042777433,
"grad_norm": 4.482844352722168,
"learning_rate": 1.6452262864228147e-05,
"loss": 0.6077,
"step": 2290
},
{
"epoch": 0.356478611283323,
"grad_norm": 5.696823596954346,
"learning_rate": 1.643676379417235e-05,
"loss": 0.7574,
"step": 2300
},
{
"epoch": 0.35802851828890264,
"grad_norm": 8.586214065551758,
"learning_rate": 1.6421264724116554e-05,
"loss": 0.8223,
"step": 2310
},
{
"epoch": 0.3595784252944823,
"grad_norm": 4.69325065612793,
"learning_rate": 1.640576565406076e-05,
"loss": 0.9626,
"step": 2320
},
{
"epoch": 0.36112833230006197,
"grad_norm": 7.331809043884277,
"learning_rate": 1.639026658400496e-05,
"loss": 0.6907,
"step": 2330
},
{
"epoch": 0.36267823930564164,
"grad_norm": 6.298605442047119,
"learning_rate": 1.6374767513949164e-05,
"loss": 0.8718,
"step": 2340
},
{
"epoch": 0.3642281463112213,
"grad_norm": 4.11399507522583,
"learning_rate": 1.635926844389337e-05,
"loss": 0.7102,
"step": 2350
},
{
"epoch": 0.36577805331680097,
"grad_norm": 4.657794952392578,
"learning_rate": 1.634376937383757e-05,
"loss": 0.6324,
"step": 2360
},
{
"epoch": 0.36732796032238063,
"grad_norm": 9.156194686889648,
"learning_rate": 1.6328270303781775e-05,
"loss": 0.7338,
"step": 2370
},
{
"epoch": 0.3688778673279603,
"grad_norm": 16.996049880981445,
"learning_rate": 1.6312771233725977e-05,
"loss": 0.8453,
"step": 2380
},
{
"epoch": 0.37042777433353996,
"grad_norm": 8.539373397827148,
"learning_rate": 1.629727216367018e-05,
"loss": 0.8521,
"step": 2390
},
{
"epoch": 0.3719776813391196,
"grad_norm": 3.439394474029541,
"learning_rate": 1.6281773093614383e-05,
"loss": 0.7142,
"step": 2400
},
{
"epoch": 0.3735275883446993,
"grad_norm": 2.998128652572632,
"learning_rate": 1.6266274023558588e-05,
"loss": 0.7817,
"step": 2410
},
{
"epoch": 0.375077495350279,
"grad_norm": 7.721405506134033,
"learning_rate": 1.625077495350279e-05,
"loss": 0.7564,
"step": 2420
},
{
"epoch": 0.3766274023558587,
"grad_norm": 4.268898963928223,
"learning_rate": 1.6235275883446994e-05,
"loss": 0.6392,
"step": 2430
},
{
"epoch": 0.37817730936143834,
"grad_norm": 8.477879524230957,
"learning_rate": 1.62197768133912e-05,
"loss": 0.7102,
"step": 2440
},
{
"epoch": 0.379727216367018,
"grad_norm": 6.268753528594971,
"learning_rate": 1.62042777433354e-05,
"loss": 0.8036,
"step": 2450
},
{
"epoch": 0.38127712337259767,
"grad_norm": 8.156237602233887,
"learning_rate": 1.6188778673279605e-05,
"loss": 0.798,
"step": 2460
},
{
"epoch": 0.38282703037817734,
"grad_norm": 8.959247589111328,
"learning_rate": 1.617327960322381e-05,
"loss": 0.8801,
"step": 2470
},
{
"epoch": 0.384376937383757,
"grad_norm": 4.165592670440674,
"learning_rate": 1.615778053316801e-05,
"loss": 0.7829,
"step": 2480
},
{
"epoch": 0.38592684438933667,
"grad_norm": 6.727826118469238,
"learning_rate": 1.6142281463112216e-05,
"loss": 0.9334,
"step": 2490
},
{
"epoch": 0.38747675139491633,
"grad_norm": 5.1142497062683105,
"learning_rate": 1.6126782393056417e-05,
"loss": 0.9171,
"step": 2500
},
{
"epoch": 0.389026658400496,
"grad_norm": 4.020603179931641,
"learning_rate": 1.6111283323000622e-05,
"loss": 0.5681,
"step": 2510
},
{
"epoch": 0.39057656540607566,
"grad_norm": 5.856823921203613,
"learning_rate": 1.6095784252944823e-05,
"loss": 0.6705,
"step": 2520
},
{
"epoch": 0.3921264724116553,
"grad_norm": 8.811408042907715,
"learning_rate": 1.6080285182889028e-05,
"loss": 0.8041,
"step": 2530
},
{
"epoch": 0.393676379417235,
"grad_norm": 2.1637704372406006,
"learning_rate": 1.606478611283323e-05,
"loss": 0.7588,
"step": 2540
},
{
"epoch": 0.39522628642281465,
"grad_norm": 5.179090976715088,
"learning_rate": 1.6049287042777434e-05,
"loss": 0.5119,
"step": 2550
},
{
"epoch": 0.3967761934283943,
"grad_norm": 9.014410018920898,
"learning_rate": 1.603378797272164e-05,
"loss": 0.743,
"step": 2560
},
{
"epoch": 0.398326100433974,
"grad_norm": 5.311403751373291,
"learning_rate": 1.601828890266584e-05,
"loss": 0.7806,
"step": 2570
},
{
"epoch": 0.39987600743955365,
"grad_norm": 5.875848770141602,
"learning_rate": 1.6002789832610045e-05,
"loss": 0.866,
"step": 2580
},
{
"epoch": 0.4014259144451333,
"grad_norm": 5.602693557739258,
"learning_rate": 1.598729076255425e-05,
"loss": 0.7083,
"step": 2590
},
{
"epoch": 0.402975821450713,
"grad_norm": 10.609829902648926,
"learning_rate": 1.597179169249845e-05,
"loss": 0.7104,
"step": 2600
},
{
"epoch": 0.40452572845629264,
"grad_norm": 5.588577747344971,
"learning_rate": 1.5956292622442656e-05,
"loss": 0.8263,
"step": 2610
},
{
"epoch": 0.4060756354618723,
"grad_norm": 3.9749042987823486,
"learning_rate": 1.5940793552386857e-05,
"loss": 0.7349,
"step": 2620
},
{
"epoch": 0.407625542467452,
"grad_norm": 5.295260906219482,
"learning_rate": 1.5925294482331062e-05,
"loss": 0.6873,
"step": 2630
},
{
"epoch": 0.40917544947303164,
"grad_norm": 3.9901695251464844,
"learning_rate": 1.5909795412275263e-05,
"loss": 0.6693,
"step": 2640
},
{
"epoch": 0.4107253564786113,
"grad_norm": 6.513910293579102,
"learning_rate": 1.5894296342219468e-05,
"loss": 0.7173,
"step": 2650
},
{
"epoch": 0.41227526348419097,
"grad_norm": 5.145935535430908,
"learning_rate": 1.587879727216367e-05,
"loss": 0.7877,
"step": 2660
},
{
"epoch": 0.41382517048977063,
"grad_norm": 8.055877685546875,
"learning_rate": 1.5863298202107874e-05,
"loss": 0.7365,
"step": 2670
},
{
"epoch": 0.4153750774953503,
"grad_norm": 14.172343254089355,
"learning_rate": 1.584779913205208e-05,
"loss": 1.0304,
"step": 2680
},
{
"epoch": 0.41692498450092996,
"grad_norm": 5.074191093444824,
"learning_rate": 1.583230006199628e-05,
"loss": 0.83,
"step": 2690
},
{
"epoch": 0.4184748915065096,
"grad_norm": 6.958271026611328,
"learning_rate": 1.5816800991940485e-05,
"loss": 0.5557,
"step": 2700
},
{
"epoch": 0.4200247985120893,
"grad_norm": 5.674747943878174,
"learning_rate": 1.580130192188469e-05,
"loss": 0.546,
"step": 2710
},
{
"epoch": 0.42157470551766896,
"grad_norm": 6.318327903747559,
"learning_rate": 1.578580285182889e-05,
"loss": 0.7078,
"step": 2720
},
{
"epoch": 0.4231246125232486,
"grad_norm": 8.800183296203613,
"learning_rate": 1.5770303781773096e-05,
"loss": 0.7918,
"step": 2730
},
{
"epoch": 0.4246745195288283,
"grad_norm": 7.619904518127441,
"learning_rate": 1.5754804711717297e-05,
"loss": 0.9327,
"step": 2740
},
{
"epoch": 0.42622442653440795,
"grad_norm": 8.662442207336426,
"learning_rate": 1.5739305641661502e-05,
"loss": 0.816,
"step": 2750
},
{
"epoch": 0.4277743335399876,
"grad_norm": 7.784162998199463,
"learning_rate": 1.5723806571605703e-05,
"loss": 0.8073,
"step": 2760
},
{
"epoch": 0.4293242405455673,
"grad_norm": 5.800148963928223,
"learning_rate": 1.5708307501549908e-05,
"loss": 0.7166,
"step": 2770
},
{
"epoch": 0.43087414755114695,
"grad_norm": 6.138567924499512,
"learning_rate": 1.569280843149411e-05,
"loss": 0.8677,
"step": 2780
},
{
"epoch": 0.4324240545567266,
"grad_norm": 8.85258674621582,
"learning_rate": 1.5677309361438314e-05,
"loss": 0.7602,
"step": 2790
},
{
"epoch": 0.4339739615623063,
"grad_norm": 5.599133014678955,
"learning_rate": 1.566181029138252e-05,
"loss": 0.8947,
"step": 2800
},
{
"epoch": 0.43552386856788594,
"grad_norm": 7.680168628692627,
"learning_rate": 1.564631122132672e-05,
"loss": 0.6632,
"step": 2810
},
{
"epoch": 0.4370737755734656,
"grad_norm": 9.464559555053711,
"learning_rate": 1.5630812151270925e-05,
"loss": 0.6559,
"step": 2820
},
{
"epoch": 0.43862368257904527,
"grad_norm": 1.4966700077056885,
"learning_rate": 1.561531308121513e-05,
"loss": 0.5048,
"step": 2830
},
{
"epoch": 0.44017358958462494,
"grad_norm": 10.32955551147461,
"learning_rate": 1.559981401115933e-05,
"loss": 0.627,
"step": 2840
},
{
"epoch": 0.4417234965902046,
"grad_norm": 8.718482971191406,
"learning_rate": 1.5584314941103536e-05,
"loss": 0.7869,
"step": 2850
},
{
"epoch": 0.44327340359578427,
"grad_norm": 7.256460666656494,
"learning_rate": 1.5568815871047738e-05,
"loss": 0.6825,
"step": 2860
},
{
"epoch": 0.44482331060136393,
"grad_norm": 6.733129978179932,
"learning_rate": 1.5553316800991942e-05,
"loss": 0.7388,
"step": 2870
},
{
"epoch": 0.4463732176069436,
"grad_norm": 10.719423294067383,
"learning_rate": 1.5537817730936144e-05,
"loss": 0.6301,
"step": 2880
},
{
"epoch": 0.44792312461252326,
"grad_norm": 5.897913932800293,
"learning_rate": 1.552231866088035e-05,
"loss": 0.9147,
"step": 2890
},
{
"epoch": 0.4494730316181029,
"grad_norm": 3.1488702297210693,
"learning_rate": 1.550681959082455e-05,
"loss": 0.5445,
"step": 2900
},
{
"epoch": 0.4510229386236826,
"grad_norm": 10.640909194946289,
"learning_rate": 1.5491320520768755e-05,
"loss": 0.7616,
"step": 2910
},
{
"epoch": 0.45257284562926225,
"grad_norm": 7.557347297668457,
"learning_rate": 1.547582145071296e-05,
"loss": 0.6965,
"step": 2920
},
{
"epoch": 0.4541227526348419,
"grad_norm": 5.1408843994140625,
"learning_rate": 1.546032238065716e-05,
"loss": 0.6548,
"step": 2930
},
{
"epoch": 0.4556726596404216,
"grad_norm": 8.21218204498291,
"learning_rate": 1.5444823310601365e-05,
"loss": 1.013,
"step": 2940
},
{
"epoch": 0.45722256664600125,
"grad_norm": 11.295351028442383,
"learning_rate": 1.542932424054557e-05,
"loss": 0.9081,
"step": 2950
},
{
"epoch": 0.4587724736515809,
"grad_norm": 7.3191046714782715,
"learning_rate": 1.541382517048977e-05,
"loss": 0.6628,
"step": 2960
},
{
"epoch": 0.4603223806571606,
"grad_norm": 2.974071741104126,
"learning_rate": 1.5398326100433976e-05,
"loss": 0.8033,
"step": 2970
},
{
"epoch": 0.46187228766274024,
"grad_norm": 8.998639106750488,
"learning_rate": 1.5382827030378178e-05,
"loss": 0.6918,
"step": 2980
},
{
"epoch": 0.4634221946683199,
"grad_norm": 7.389767169952393,
"learning_rate": 1.5367327960322382e-05,
"loss": 0.713,
"step": 2990
},
{
"epoch": 0.4649721016738996,
"grad_norm": 3.8665876388549805,
"learning_rate": 1.5351828890266584e-05,
"loss": 0.6958,
"step": 3000
},
{
"epoch": 0.46652200867947924,
"grad_norm": 7.674843788146973,
"learning_rate": 1.533632982021079e-05,
"loss": 1.0091,
"step": 3010
},
{
"epoch": 0.4680719156850589,
"grad_norm": 12.13735294342041,
"learning_rate": 1.532083075015499e-05,
"loss": 0.9314,
"step": 3020
},
{
"epoch": 0.46962182269063857,
"grad_norm": 4.446971416473389,
"learning_rate": 1.5305331680099195e-05,
"loss": 0.4494,
"step": 3030
},
{
"epoch": 0.47117172969621823,
"grad_norm": 9.129325866699219,
"learning_rate": 1.52898326100434e-05,
"loss": 0.8304,
"step": 3040
},
{
"epoch": 0.4727216367017979,
"grad_norm": 5.487653732299805,
"learning_rate": 1.52743335399876e-05,
"loss": 0.661,
"step": 3050
},
{
"epoch": 0.47427154370737756,
"grad_norm": 6.226546764373779,
"learning_rate": 1.5258834469931806e-05,
"loss": 0.8503,
"step": 3060
},
{
"epoch": 0.47582145071295723,
"grad_norm": 9.435007095336914,
"learning_rate": 1.5243335399876009e-05,
"loss": 0.6498,
"step": 3070
},
{
"epoch": 0.4773713577185369,
"grad_norm": 7.831610679626465,
"learning_rate": 1.5227836329820212e-05,
"loss": 0.6945,
"step": 3080
},
{
"epoch": 0.47892126472411656,
"grad_norm": 7.432961940765381,
"learning_rate": 1.5212337259764417e-05,
"loss": 0.6282,
"step": 3090
},
{
"epoch": 0.4804711717296962,
"grad_norm": 9.168987274169922,
"learning_rate": 1.5196838189708618e-05,
"loss": 0.707,
"step": 3100
},
{
"epoch": 0.4820210787352759,
"grad_norm": 5.86536169052124,
"learning_rate": 1.5181339119652823e-05,
"loss": 0.898,
"step": 3110
},
{
"epoch": 0.48357098574085555,
"grad_norm": 7.2908172607421875,
"learning_rate": 1.5165840049597024e-05,
"loss": 0.8231,
"step": 3120
},
{
"epoch": 0.4851208927464352,
"grad_norm": 4.543672561645508,
"learning_rate": 1.5150340979541229e-05,
"loss": 0.7451,
"step": 3130
},
{
"epoch": 0.4866707997520149,
"grad_norm": 3.355882167816162,
"learning_rate": 1.5134841909485432e-05,
"loss": 0.8017,
"step": 3140
},
{
"epoch": 0.48822070675759455,
"grad_norm": 3.6283857822418213,
"learning_rate": 1.5119342839429637e-05,
"loss": 0.7989,
"step": 3150
},
{
"epoch": 0.4897706137631742,
"grad_norm": 8.624027252197266,
"learning_rate": 1.5103843769373838e-05,
"loss": 0.8073,
"step": 3160
},
{
"epoch": 0.4913205207687539,
"grad_norm": 5.2577595710754395,
"learning_rate": 1.5088344699318043e-05,
"loss": 0.66,
"step": 3170
},
{
"epoch": 0.49287042777433354,
"grad_norm": 4.758912563323975,
"learning_rate": 1.5072845629262244e-05,
"loss": 0.6226,
"step": 3180
},
{
"epoch": 0.4944203347799132,
"grad_norm": 5.687361240386963,
"learning_rate": 1.5057346559206449e-05,
"loss": 0.792,
"step": 3190
},
{
"epoch": 0.49597024178549287,
"grad_norm": 5.457574844360352,
"learning_rate": 1.5041847489150652e-05,
"loss": 0.958,
"step": 3200
},
{
"epoch": 0.49752014879107254,
"grad_norm": 4.327171802520752,
"learning_rate": 1.5026348419094857e-05,
"loss": 0.6538,
"step": 3210
},
{
"epoch": 0.4990700557966522,
"grad_norm": 3.7483696937561035,
"learning_rate": 1.5010849349039058e-05,
"loss": 0.5517,
"step": 3220
},
{
"epoch": 0.5006199628022319,
"grad_norm": 7.629466533660889,
"learning_rate": 1.4995350278983263e-05,
"loss": 0.8133,
"step": 3230
},
{
"epoch": 0.5021698698078115,
"grad_norm": 6.833787441253662,
"learning_rate": 1.4979851208927464e-05,
"loss": 0.5979,
"step": 3240
},
{
"epoch": 0.5037197768133912,
"grad_norm": 5.130917549133301,
"learning_rate": 1.4964352138871669e-05,
"loss": 0.6538,
"step": 3250
},
{
"epoch": 0.5052696838189709,
"grad_norm": 5.909976005554199,
"learning_rate": 1.4948853068815872e-05,
"loss": 0.7074,
"step": 3260
},
{
"epoch": 0.5068195908245505,
"grad_norm": 7.032032489776611,
"learning_rate": 1.4933353998760077e-05,
"loss": 0.8226,
"step": 3270
},
{
"epoch": 0.5083694978301302,
"grad_norm": 5.627322196960449,
"learning_rate": 1.4917854928704278e-05,
"loss": 0.6371,
"step": 3280
},
{
"epoch": 0.5099194048357099,
"grad_norm": 8.351363182067871,
"learning_rate": 1.4902355858648483e-05,
"loss": 0.9501,
"step": 3290
},
{
"epoch": 0.5114693118412895,
"grad_norm": 7.641820907592773,
"learning_rate": 1.4886856788592684e-05,
"loss": 0.846,
"step": 3300
},
{
"epoch": 0.5130192188468692,
"grad_norm": 5.8080010414123535,
"learning_rate": 1.4871357718536889e-05,
"loss": 0.6714,
"step": 3310
},
{
"epoch": 0.5145691258524488,
"grad_norm": 3.774705648422241,
"learning_rate": 1.4855858648481092e-05,
"loss": 0.7114,
"step": 3320
},
{
"epoch": 0.5161190328580285,
"grad_norm": 3.194676160812378,
"learning_rate": 1.4840359578425297e-05,
"loss": 0.6901,
"step": 3330
},
{
"epoch": 0.5176689398636082,
"grad_norm": 3.790651559829712,
"learning_rate": 1.4824860508369498e-05,
"loss": 0.6808,
"step": 3340
},
{
"epoch": 0.5192188468691878,
"grad_norm": 5.215169429779053,
"learning_rate": 1.4809361438313703e-05,
"loss": 0.6263,
"step": 3350
},
{
"epoch": 0.5207687538747675,
"grad_norm": 6.934802055358887,
"learning_rate": 1.4793862368257904e-05,
"loss": 0.839,
"step": 3360
},
{
"epoch": 0.5223186608803472,
"grad_norm": 9.833372116088867,
"learning_rate": 1.4778363298202109e-05,
"loss": 0.8676,
"step": 3370
},
{
"epoch": 0.5238685678859268,
"grad_norm": 8.891237258911133,
"learning_rate": 1.4762864228146312e-05,
"loss": 0.7184,
"step": 3380
},
{
"epoch": 0.5254184748915065,
"grad_norm": 2.9913341999053955,
"learning_rate": 1.4747365158090517e-05,
"loss": 0.4414,
"step": 3390
},
{
"epoch": 0.5269683818970862,
"grad_norm": 11.538298606872559,
"learning_rate": 1.4731866088034718e-05,
"loss": 0.4681,
"step": 3400
},
{
"epoch": 0.5285182889026658,
"grad_norm": 6.064094543457031,
"learning_rate": 1.4716367017978923e-05,
"loss": 0.9835,
"step": 3410
},
{
"epoch": 0.5300681959082455,
"grad_norm": 4.357783317565918,
"learning_rate": 1.4700867947923126e-05,
"loss": 0.4394,
"step": 3420
},
{
"epoch": 0.5316181029138252,
"grad_norm": 3.527278423309326,
"learning_rate": 1.468536887786733e-05,
"loss": 0.5394,
"step": 3430
},
{
"epoch": 0.5331680099194048,
"grad_norm": 5.441196918487549,
"learning_rate": 1.4669869807811532e-05,
"loss": 0.5942,
"step": 3440
},
{
"epoch": 0.5347179169249845,
"grad_norm": 7.565889358520508,
"learning_rate": 1.4654370737755737e-05,
"loss": 0.6188,
"step": 3450
},
{
"epoch": 0.5362678239305642,
"grad_norm": 5.547229290008545,
"learning_rate": 1.4638871667699938e-05,
"loss": 0.7393,
"step": 3460
},
{
"epoch": 0.5378177309361438,
"grad_norm": 13.453187942504883,
"learning_rate": 1.4623372597644143e-05,
"loss": 0.8353,
"step": 3470
},
{
"epoch": 0.5393676379417235,
"grad_norm": 9.306943893432617,
"learning_rate": 1.4607873527588346e-05,
"loss": 0.9389,
"step": 3480
},
{
"epoch": 0.5409175449473032,
"grad_norm": 5.817188739776611,
"learning_rate": 1.459237445753255e-05,
"loss": 0.7419,
"step": 3490
},
{
"epoch": 0.5424674519528828,
"grad_norm": 4.312830448150635,
"learning_rate": 1.4576875387476752e-05,
"loss": 0.6188,
"step": 3500
},
{
"epoch": 0.5440173589584625,
"grad_norm": 8.189769744873047,
"learning_rate": 1.4561376317420957e-05,
"loss": 0.8339,
"step": 3510
},
{
"epoch": 0.5455672659640421,
"grad_norm": 5.929519176483154,
"learning_rate": 1.4545877247365159e-05,
"loss": 0.7464,
"step": 3520
},
{
"epoch": 0.5471171729696218,
"grad_norm": 6.465437412261963,
"learning_rate": 1.4530378177309363e-05,
"loss": 0.6996,
"step": 3530
},
{
"epoch": 0.5486670799752015,
"grad_norm": 5.720170974731445,
"learning_rate": 1.4514879107253566e-05,
"loss": 0.9782,
"step": 3540
},
{
"epoch": 0.5502169869807811,
"grad_norm": 8.770698547363281,
"learning_rate": 1.449938003719777e-05,
"loss": 0.6206,
"step": 3550
},
{
"epoch": 0.5517668939863608,
"grad_norm": 5.114345073699951,
"learning_rate": 1.4483880967141972e-05,
"loss": 0.5781,
"step": 3560
},
{
"epoch": 0.5533168009919405,
"grad_norm": 8.168386459350586,
"learning_rate": 1.4468381897086177e-05,
"loss": 0.8869,
"step": 3570
},
{
"epoch": 0.5548667079975201,
"grad_norm": 7.974008560180664,
"learning_rate": 1.4452882827030379e-05,
"loss": 0.8623,
"step": 3580
},
{
"epoch": 0.5564166150030998,
"grad_norm": 5.2523274421691895,
"learning_rate": 1.4437383756974583e-05,
"loss": 0.7054,
"step": 3590
},
{
"epoch": 0.5579665220086795,
"grad_norm": 5.542336463928223,
"learning_rate": 1.4421884686918786e-05,
"loss": 0.6853,
"step": 3600
},
{
"epoch": 0.5595164290142591,
"grad_norm": 3.6195218563079834,
"learning_rate": 1.440638561686299e-05,
"loss": 0.6817,
"step": 3610
},
{
"epoch": 0.5610663360198388,
"grad_norm": 4.10429573059082,
"learning_rate": 1.4390886546807193e-05,
"loss": 0.5063,
"step": 3620
},
{
"epoch": 0.5626162430254185,
"grad_norm": 9.308690071105957,
"learning_rate": 1.4375387476751397e-05,
"loss": 0.745,
"step": 3630
},
{
"epoch": 0.5641661500309981,
"grad_norm": 6.351543426513672,
"learning_rate": 1.4359888406695599e-05,
"loss": 0.6918,
"step": 3640
},
{
"epoch": 0.5657160570365778,
"grad_norm": 7.3764495849609375,
"learning_rate": 1.4344389336639803e-05,
"loss": 0.5581,
"step": 3650
},
{
"epoch": 0.5672659640421575,
"grad_norm": 6.400803565979004,
"learning_rate": 1.4328890266584007e-05,
"loss": 0.6851,
"step": 3660
},
{
"epoch": 0.5688158710477371,
"grad_norm": 5.576094627380371,
"learning_rate": 1.431339119652821e-05,
"loss": 0.7973,
"step": 3670
},
{
"epoch": 0.5703657780533168,
"grad_norm": 5.8568902015686035,
"learning_rate": 1.4297892126472413e-05,
"loss": 0.7913,
"step": 3680
},
{
"epoch": 0.5719156850588965,
"grad_norm": 6.588540077209473,
"learning_rate": 1.4282393056416617e-05,
"loss": 0.6668,
"step": 3690
},
{
"epoch": 0.5734655920644761,
"grad_norm": 4.362242221832275,
"learning_rate": 1.4266893986360819e-05,
"loss": 0.4781,
"step": 3700
},
{
"epoch": 0.5750154990700558,
"grad_norm": 6.261458396911621,
"learning_rate": 1.4251394916305024e-05,
"loss": 0.6587,
"step": 3710
},
{
"epoch": 0.5765654060756354,
"grad_norm": 4.799465179443359,
"learning_rate": 1.4235895846249227e-05,
"loss": 0.7028,
"step": 3720
},
{
"epoch": 0.5781153130812151,
"grad_norm": 6.386651039123535,
"learning_rate": 1.422039677619343e-05,
"loss": 0.6336,
"step": 3730
},
{
"epoch": 0.5796652200867948,
"grad_norm": 7.835630893707275,
"learning_rate": 1.4204897706137633e-05,
"loss": 0.6865,
"step": 3740
},
{
"epoch": 0.5812151270923744,
"grad_norm": 3.549483060836792,
"learning_rate": 1.4189398636081838e-05,
"loss": 0.8948,
"step": 3750
},
{
"epoch": 0.5827650340979541,
"grad_norm": 6.352856636047363,
"learning_rate": 1.4173899566026039e-05,
"loss": 0.6209,
"step": 3760
},
{
"epoch": 0.5843149411035338,
"grad_norm": 2.8066344261169434,
"learning_rate": 1.4158400495970244e-05,
"loss": 0.6885,
"step": 3770
},
{
"epoch": 0.5858648481091134,
"grad_norm": 4.101128578186035,
"learning_rate": 1.4142901425914447e-05,
"loss": 0.6328,
"step": 3780
},
{
"epoch": 0.5874147551146931,
"grad_norm": 6.97265100479126,
"learning_rate": 1.412740235585865e-05,
"loss": 0.5702,
"step": 3790
},
{
"epoch": 0.5889646621202728,
"grad_norm": 7.036039352416992,
"learning_rate": 1.4111903285802853e-05,
"loss": 0.8447,
"step": 3800
},
{
"epoch": 0.5905145691258524,
"grad_norm": 7.859857082366943,
"learning_rate": 1.4096404215747058e-05,
"loss": 0.6214,
"step": 3810
},
{
"epoch": 0.5920644761314321,
"grad_norm": 7.97307825088501,
"learning_rate": 1.4080905145691259e-05,
"loss": 0.6224,
"step": 3820
},
{
"epoch": 0.5936143831370118,
"grad_norm": 7.060767650604248,
"learning_rate": 1.4065406075635464e-05,
"loss": 1.2209,
"step": 3830
},
{
"epoch": 0.5951642901425914,
"grad_norm": 3.851486921310425,
"learning_rate": 1.4049907005579667e-05,
"loss": 0.6759,
"step": 3840
},
{
"epoch": 0.5967141971481711,
"grad_norm": 4.023894309997559,
"learning_rate": 1.403440793552387e-05,
"loss": 0.8202,
"step": 3850
},
{
"epoch": 0.5982641041537508,
"grad_norm": 6.258404731750488,
"learning_rate": 1.4018908865468073e-05,
"loss": 0.7032,
"step": 3860
},
{
"epoch": 0.5998140111593304,
"grad_norm": 9.688002586364746,
"learning_rate": 1.4003409795412278e-05,
"loss": 0.6889,
"step": 3870
},
{
"epoch": 0.6013639181649101,
"grad_norm": 4.189449787139893,
"learning_rate": 1.3987910725356479e-05,
"loss": 0.7047,
"step": 3880
},
{
"epoch": 0.6029138251704897,
"grad_norm": 2.062530040740967,
"learning_rate": 1.3972411655300684e-05,
"loss": 0.5757,
"step": 3890
},
{
"epoch": 0.6044637321760694,
"grad_norm": 7.18925666809082,
"learning_rate": 1.3956912585244887e-05,
"loss": 0.7404,
"step": 3900
},
{
"epoch": 0.6060136391816491,
"grad_norm": 11.522746086120605,
"learning_rate": 1.394141351518909e-05,
"loss": 0.911,
"step": 3910
},
{
"epoch": 0.6075635461872287,
"grad_norm": 7.289572715759277,
"learning_rate": 1.3925914445133293e-05,
"loss": 0.7325,
"step": 3920
},
{
"epoch": 0.6091134531928084,
"grad_norm": 5.386538505554199,
"learning_rate": 1.3910415375077498e-05,
"loss": 0.513,
"step": 3930
},
{
"epoch": 0.6106633601983881,
"grad_norm": 5.218569278717041,
"learning_rate": 1.38949163050217e-05,
"loss": 0.7397,
"step": 3940
},
{
"epoch": 0.6122132672039677,
"grad_norm": 8.259590148925781,
"learning_rate": 1.3879417234965904e-05,
"loss": 0.6696,
"step": 3950
},
{
"epoch": 0.6137631742095474,
"grad_norm": 2.930074691772461,
"learning_rate": 1.3863918164910107e-05,
"loss": 0.5533,
"step": 3960
},
{
"epoch": 0.6153130812151271,
"grad_norm": 5.612064838409424,
"learning_rate": 1.384841909485431e-05,
"loss": 0.6844,
"step": 3970
},
{
"epoch": 0.6168629882207067,
"grad_norm": 8.284967422485352,
"learning_rate": 1.3832920024798513e-05,
"loss": 0.713,
"step": 3980
},
{
"epoch": 0.6184128952262864,
"grad_norm": 5.959351062774658,
"learning_rate": 1.3817420954742718e-05,
"loss": 0.785,
"step": 3990
},
{
"epoch": 0.6199628022318661,
"grad_norm": 5.368082523345947,
"learning_rate": 1.380192188468692e-05,
"loss": 0.6414,
"step": 4000
},
{
"epoch": 0.6215127092374457,
"grad_norm": 6.045334815979004,
"learning_rate": 1.3786422814631124e-05,
"loss": 0.7842,
"step": 4010
},
{
"epoch": 0.6230626162430254,
"grad_norm": 7.259751319885254,
"learning_rate": 1.3770923744575327e-05,
"loss": 0.691,
"step": 4020
},
{
"epoch": 0.6246125232486051,
"grad_norm": 3.850944995880127,
"learning_rate": 1.375542467451953e-05,
"loss": 0.549,
"step": 4030
},
{
"epoch": 0.6261624302541847,
"grad_norm": 2.9327070713043213,
"learning_rate": 1.3739925604463733e-05,
"loss": 0.5583,
"step": 4040
},
{
"epoch": 0.6277123372597644,
"grad_norm": 3.157712697982788,
"learning_rate": 1.3724426534407938e-05,
"loss": 0.508,
"step": 4050
},
{
"epoch": 0.629262244265344,
"grad_norm": 5.871835708618164,
"learning_rate": 1.370892746435214e-05,
"loss": 0.7169,
"step": 4060
},
{
"epoch": 0.6308121512709237,
"grad_norm": 2.6925415992736816,
"learning_rate": 1.3693428394296344e-05,
"loss": 0.5738,
"step": 4070
},
{
"epoch": 0.6323620582765034,
"grad_norm": 4.072033882141113,
"learning_rate": 1.3677929324240547e-05,
"loss": 0.7167,
"step": 4080
},
{
"epoch": 0.633911965282083,
"grad_norm": 2.6646270751953125,
"learning_rate": 1.366243025418475e-05,
"loss": 0.5226,
"step": 4090
},
{
"epoch": 0.6354618722876627,
"grad_norm": 1.837414264678955,
"learning_rate": 1.3646931184128953e-05,
"loss": 0.5607,
"step": 4100
},
{
"epoch": 0.6370117792932424,
"grad_norm": 5.846659183502197,
"learning_rate": 1.3631432114073158e-05,
"loss": 0.5539,
"step": 4110
},
{
"epoch": 0.638561686298822,
"grad_norm": 4.042922496795654,
"learning_rate": 1.361593304401736e-05,
"loss": 0.7586,
"step": 4120
},
{
"epoch": 0.6401115933044017,
"grad_norm": 4.464644908905029,
"learning_rate": 1.3600433973961564e-05,
"loss": 0.7025,
"step": 4130
},
{
"epoch": 0.6416615003099814,
"grad_norm": 5.151641845703125,
"learning_rate": 1.3584934903905767e-05,
"loss": 0.7848,
"step": 4140
},
{
"epoch": 0.643211407315561,
"grad_norm": 8.169122695922852,
"learning_rate": 1.356943583384997e-05,
"loss": 0.7885,
"step": 4150
},
{
"epoch": 0.6447613143211407,
"grad_norm": 5.91544246673584,
"learning_rate": 1.3553936763794173e-05,
"loss": 0.6493,
"step": 4160
},
{
"epoch": 0.6463112213267204,
"grad_norm": 3.980699062347412,
"learning_rate": 1.3538437693738378e-05,
"loss": 0.5557,
"step": 4170
},
{
"epoch": 0.6478611283323,
"grad_norm": 4.185513496398926,
"learning_rate": 1.352293862368258e-05,
"loss": 0.724,
"step": 4180
},
{
"epoch": 0.6494110353378797,
"grad_norm": 5.239704608917236,
"learning_rate": 1.3507439553626784e-05,
"loss": 0.6691,
"step": 4190
},
{
"epoch": 0.6509609423434594,
"grad_norm": 6.143891334533691,
"learning_rate": 1.3491940483570987e-05,
"loss": 0.6889,
"step": 4200
},
{
"epoch": 0.652510849349039,
"grad_norm": 3.6569466590881348,
"learning_rate": 1.347644141351519e-05,
"loss": 0.5583,
"step": 4210
},
{
"epoch": 0.6540607563546187,
"grad_norm": 4.665682315826416,
"learning_rate": 1.3460942343459394e-05,
"loss": 0.7787,
"step": 4220
},
{
"epoch": 0.6556106633601984,
"grad_norm": 7.364247798919678,
"learning_rate": 1.3445443273403598e-05,
"loss": 0.7035,
"step": 4230
},
{
"epoch": 0.657160570365778,
"grad_norm": 11.255681991577148,
"learning_rate": 1.34299442033478e-05,
"loss": 0.711,
"step": 4240
},
{
"epoch": 0.6587104773713577,
"grad_norm": 5.959405899047852,
"learning_rate": 1.3414445133292004e-05,
"loss": 0.6408,
"step": 4250
},
{
"epoch": 0.6602603843769373,
"grad_norm": 7.016552925109863,
"learning_rate": 1.3398946063236207e-05,
"loss": 0.6437,
"step": 4260
},
{
"epoch": 0.661810291382517,
"grad_norm": 8.809164047241211,
"learning_rate": 1.338344699318041e-05,
"loss": 0.8967,
"step": 4270
},
{
"epoch": 0.6633601983880967,
"grad_norm": 6.494601249694824,
"learning_rate": 1.3367947923124614e-05,
"loss": 0.9146,
"step": 4280
},
{
"epoch": 0.6649101053936763,
"grad_norm": 7.576080799102783,
"learning_rate": 1.3352448853068818e-05,
"loss": 0.6699,
"step": 4290
},
{
"epoch": 0.666460012399256,
"grad_norm": 6.122068881988525,
"learning_rate": 1.333694978301302e-05,
"loss": 0.6826,
"step": 4300
},
{
"epoch": 0.6680099194048357,
"grad_norm": 7.942433834075928,
"learning_rate": 1.3321450712957225e-05,
"loss": 0.8486,
"step": 4310
},
{
"epoch": 0.6695598264104153,
"grad_norm": 6.053321838378906,
"learning_rate": 1.3305951642901428e-05,
"loss": 0.5775,
"step": 4320
},
{
"epoch": 0.671109733415995,
"grad_norm": 9.208568572998047,
"learning_rate": 1.329045257284563e-05,
"loss": 0.8334,
"step": 4330
},
{
"epoch": 0.6726596404215747,
"grad_norm": 6.217433929443359,
"learning_rate": 1.3274953502789834e-05,
"loss": 0.7451,
"step": 4340
},
{
"epoch": 0.6742095474271543,
"grad_norm": 5.161596775054932,
"learning_rate": 1.3259454432734038e-05,
"loss": 0.7425,
"step": 4350
},
{
"epoch": 0.675759454432734,
"grad_norm": 11.464788436889648,
"learning_rate": 1.324395536267824e-05,
"loss": 0.7977,
"step": 4360
},
{
"epoch": 0.6773093614383137,
"grad_norm": 4.669081687927246,
"learning_rate": 1.3228456292622445e-05,
"loss": 0.5158,
"step": 4370
},
{
"epoch": 0.6788592684438933,
"grad_norm": 6.2955756187438965,
"learning_rate": 1.3212957222566648e-05,
"loss": 0.7647,
"step": 4380
},
{
"epoch": 0.680409175449473,
"grad_norm": 8.309978485107422,
"learning_rate": 1.319745815251085e-05,
"loss": 0.6655,
"step": 4390
},
{
"epoch": 0.6819590824550527,
"grad_norm": 5.269478797912598,
"learning_rate": 1.3181959082455054e-05,
"loss": 0.7487,
"step": 4400
},
{
"epoch": 0.6835089894606323,
"grad_norm": 6.342654705047607,
"learning_rate": 1.3166460012399259e-05,
"loss": 0.7688,
"step": 4410
},
{
"epoch": 0.685058896466212,
"grad_norm": 3.4577929973602295,
"learning_rate": 1.315096094234346e-05,
"loss": 0.5437,
"step": 4420
},
{
"epoch": 0.6866088034717917,
"grad_norm": 3.24524188041687,
"learning_rate": 1.3135461872287665e-05,
"loss": 0.7577,
"step": 4430
},
{
"epoch": 0.6881587104773713,
"grad_norm": 6.469138145446777,
"learning_rate": 1.3119962802231868e-05,
"loss": 0.5899,
"step": 4440
},
{
"epoch": 0.689708617482951,
"grad_norm": 2.1648213863372803,
"learning_rate": 1.310446373217607e-05,
"loss": 0.6514,
"step": 4450
},
{
"epoch": 0.6912585244885306,
"grad_norm": 5.5625762939453125,
"learning_rate": 1.3088964662120274e-05,
"loss": 0.8065,
"step": 4460
},
{
"epoch": 0.6928084314941103,
"grad_norm": 9.689412117004395,
"learning_rate": 1.3073465592064479e-05,
"loss": 0.6607,
"step": 4470
},
{
"epoch": 0.69435833849969,
"grad_norm": 3.1396737098693848,
"learning_rate": 1.305796652200868e-05,
"loss": 0.6567,
"step": 4480
},
{
"epoch": 0.6959082455052696,
"grad_norm": 4.68833589553833,
"learning_rate": 1.3042467451952885e-05,
"loss": 0.5374,
"step": 4490
},
{
"epoch": 0.6974581525108493,
"grad_norm": 3.161306619644165,
"learning_rate": 1.3026968381897088e-05,
"loss": 0.6759,
"step": 4500
},
{
"epoch": 0.699008059516429,
"grad_norm": 5.765093803405762,
"learning_rate": 1.3011469311841291e-05,
"loss": 0.6232,
"step": 4510
},
{
"epoch": 0.7005579665220086,
"grad_norm": 2.2126271724700928,
"learning_rate": 1.2995970241785494e-05,
"loss": 0.567,
"step": 4520
},
{
"epoch": 0.7021078735275883,
"grad_norm": 6.046818256378174,
"learning_rate": 1.2980471171729699e-05,
"loss": 0.8203,
"step": 4530
},
{
"epoch": 0.703657780533168,
"grad_norm": 9.789670944213867,
"learning_rate": 1.29649721016739e-05,
"loss": 0.7596,
"step": 4540
},
{
"epoch": 0.7052076875387476,
"grad_norm": 5.565737724304199,
"learning_rate": 1.2949473031618105e-05,
"loss": 0.7163,
"step": 4550
},
{
"epoch": 0.7067575945443273,
"grad_norm": 5.0460591316223145,
"learning_rate": 1.2933973961562308e-05,
"loss": 0.464,
"step": 4560
},
{
"epoch": 0.708307501549907,
"grad_norm": 3.645411252975464,
"learning_rate": 1.2918474891506511e-05,
"loss": 0.9075,
"step": 4570
},
{
"epoch": 0.7098574085554866,
"grad_norm": 12.882902145385742,
"learning_rate": 1.2902975821450714e-05,
"loss": 0.7601,
"step": 4580
},
{
"epoch": 0.7114073155610663,
"grad_norm": 7.9908881187438965,
"learning_rate": 1.2887476751394919e-05,
"loss": 0.3679,
"step": 4590
},
{
"epoch": 0.712957222566646,
"grad_norm": 5.731966972351074,
"learning_rate": 1.287197768133912e-05,
"loss": 0.5739,
"step": 4600
},
{
"epoch": 0.7145071295722256,
"grad_norm": 7.825245380401611,
"learning_rate": 1.2856478611283325e-05,
"loss": 0.8259,
"step": 4610
},
{
"epoch": 0.7160570365778053,
"grad_norm": 4.170419216156006,
"learning_rate": 1.2840979541227528e-05,
"loss": 0.6523,
"step": 4620
},
{
"epoch": 0.717606943583385,
"grad_norm": 6.501009464263916,
"learning_rate": 1.2825480471171731e-05,
"loss": 0.7955,
"step": 4630
},
{
"epoch": 0.7191568505889646,
"grad_norm": 5.350160598754883,
"learning_rate": 1.2809981401115934e-05,
"loss": 0.506,
"step": 4640
},
{
"epoch": 0.7207067575945443,
"grad_norm": 5.508749485015869,
"learning_rate": 1.2794482331060139e-05,
"loss": 0.6531,
"step": 4650
},
{
"epoch": 0.7222566646001239,
"grad_norm": 3.685305595397949,
"learning_rate": 1.277898326100434e-05,
"loss": 0.7643,
"step": 4660
},
{
"epoch": 0.7238065716057036,
"grad_norm": 4.835994720458984,
"learning_rate": 1.2763484190948545e-05,
"loss": 0.7906,
"step": 4670
},
{
"epoch": 0.7253564786112833,
"grad_norm": 4.646899700164795,
"learning_rate": 1.2747985120892748e-05,
"loss": 0.8304,
"step": 4680
},
{
"epoch": 0.7269063856168629,
"grad_norm": 8.185405731201172,
"learning_rate": 1.2732486050836951e-05,
"loss": 0.8615,
"step": 4690
},
{
"epoch": 0.7284562926224426,
"grad_norm": 4.894402980804443,
"learning_rate": 1.2716986980781154e-05,
"loss": 0.7577,
"step": 4700
},
{
"epoch": 0.7300061996280223,
"grad_norm": 6.026183128356934,
"learning_rate": 1.2701487910725359e-05,
"loss": 0.6623,
"step": 4710
},
{
"epoch": 0.7315561066336019,
"grad_norm": 8.507851600646973,
"learning_rate": 1.268598884066956e-05,
"loss": 0.6662,
"step": 4720
},
{
"epoch": 0.7331060136391816,
"grad_norm": 6.706274032592773,
"learning_rate": 1.2670489770613765e-05,
"loss": 0.8706,
"step": 4730
},
{
"epoch": 0.7346559206447613,
"grad_norm": 9.545132637023926,
"learning_rate": 1.2654990700557968e-05,
"loss": 0.4978,
"step": 4740
},
{
"epoch": 0.7362058276503409,
"grad_norm": 8.161598205566406,
"learning_rate": 1.2639491630502171e-05,
"loss": 0.7005,
"step": 4750
},
{
"epoch": 0.7377557346559206,
"grad_norm": 7.0445356369018555,
"learning_rate": 1.2623992560446374e-05,
"loss": 0.8165,
"step": 4760
},
{
"epoch": 0.7393056416615003,
"grad_norm": 4.3169965744018555,
"learning_rate": 1.2608493490390579e-05,
"loss": 0.6663,
"step": 4770
},
{
"epoch": 0.7408555486670799,
"grad_norm": 3.8627138137817383,
"learning_rate": 1.259299442033478e-05,
"loss": 0.7779,
"step": 4780
},
{
"epoch": 0.7424054556726596,
"grad_norm": 7.226226806640625,
"learning_rate": 1.2577495350278985e-05,
"loss": 0.7438,
"step": 4790
},
{
"epoch": 0.7439553626782393,
"grad_norm": 5.355091094970703,
"learning_rate": 1.2561996280223188e-05,
"loss": 0.4383,
"step": 4800
},
{
"epoch": 0.7455052696838189,
"grad_norm": 9.102039337158203,
"learning_rate": 1.2546497210167391e-05,
"loss": 0.6852,
"step": 4810
},
{
"epoch": 0.7470551766893986,
"grad_norm": 3.6771786212921143,
"learning_rate": 1.2530998140111594e-05,
"loss": 0.5601,
"step": 4820
},
{
"epoch": 0.7486050836949782,
"grad_norm": 7.723818302154541,
"learning_rate": 1.25154990700558e-05,
"loss": 0.7773,
"step": 4830
},
{
"epoch": 0.750154990700558,
"grad_norm": 11.004088401794434,
"learning_rate": 1.25e-05,
"loss": 0.7228,
"step": 4840
},
{
"epoch": 0.7517048977061377,
"grad_norm": 6.969561576843262,
"learning_rate": 1.2484500929944204e-05,
"loss": 0.8263,
"step": 4850
},
{
"epoch": 0.7532548047117174,
"grad_norm": 7.78557014465332,
"learning_rate": 1.2469001859888408e-05,
"loss": 0.5446,
"step": 4860
},
{
"epoch": 0.754804711717297,
"grad_norm": 5.880771636962891,
"learning_rate": 1.245350278983261e-05,
"loss": 0.581,
"step": 4870
},
{
"epoch": 0.7563546187228767,
"grad_norm": 3.1848342418670654,
"learning_rate": 1.2438003719776815e-05,
"loss": 0.7058,
"step": 4880
},
{
"epoch": 0.7579045257284563,
"grad_norm": 3.6056909561157227,
"learning_rate": 1.2422504649721016e-05,
"loss": 0.6554,
"step": 4890
},
{
"epoch": 0.759454432734036,
"grad_norm": 7.92929220199585,
"learning_rate": 1.240700557966522e-05,
"loss": 0.821,
"step": 4900
},
{
"epoch": 0.7610043397396157,
"grad_norm": 9.471344947814941,
"learning_rate": 1.2391506509609424e-05,
"loss": 0.7723,
"step": 4910
},
{
"epoch": 0.7625542467451953,
"grad_norm": 2.8073582649230957,
"learning_rate": 1.2376007439553629e-05,
"loss": 0.6256,
"step": 4920
},
{
"epoch": 0.764104153750775,
"grad_norm": 4.1729416847229,
"learning_rate": 1.236050836949783e-05,
"loss": 0.6186,
"step": 4930
},
{
"epoch": 0.7656540607563547,
"grad_norm": 9.117597579956055,
"learning_rate": 1.2345009299442035e-05,
"loss": 0.6793,
"step": 4940
},
{
"epoch": 0.7672039677619343,
"grad_norm": 6.196779251098633,
"learning_rate": 1.2329510229386236e-05,
"loss": 0.6492,
"step": 4950
},
{
"epoch": 0.768753874767514,
"grad_norm": 7.226120948791504,
"learning_rate": 1.231401115933044e-05,
"loss": 0.5859,
"step": 4960
},
{
"epoch": 0.7703037817730937,
"grad_norm": 4.2936811447143555,
"learning_rate": 1.2298512089274644e-05,
"loss": 0.535,
"step": 4970
},
{
"epoch": 0.7718536887786733,
"grad_norm": 8.011672973632812,
"learning_rate": 1.2283013019218849e-05,
"loss": 0.5563,
"step": 4980
},
{
"epoch": 0.773403595784253,
"grad_norm": 6.809650421142578,
"learning_rate": 1.226751394916305e-05,
"loss": 0.8006,
"step": 4990
},
{
"epoch": 0.7749535027898327,
"grad_norm": 3.524944305419922,
"learning_rate": 1.2252014879107255e-05,
"loss": 0.8,
"step": 5000
},
{
"epoch": 0.7765034097954123,
"grad_norm": 6.521675109863281,
"learning_rate": 1.2236515809051456e-05,
"loss": 0.6234,
"step": 5010
},
{
"epoch": 0.778053316800992,
"grad_norm": 4.332557201385498,
"learning_rate": 1.2221016738995661e-05,
"loss": 0.6875,
"step": 5020
},
{
"epoch": 0.7796032238065717,
"grad_norm": 6.0071001052856445,
"learning_rate": 1.2205517668939864e-05,
"loss": 0.6597,
"step": 5030
},
{
"epoch": 0.7811531308121513,
"grad_norm": 7.195069789886475,
"learning_rate": 1.2190018598884069e-05,
"loss": 0.58,
"step": 5040
},
{
"epoch": 0.782703037817731,
"grad_norm": 5.996945858001709,
"learning_rate": 1.217451952882827e-05,
"loss": 0.6546,
"step": 5050
},
{
"epoch": 0.7842529448233106,
"grad_norm": 5.592831134796143,
"learning_rate": 1.2159020458772475e-05,
"loss": 0.4924,
"step": 5060
},
{
"epoch": 0.7858028518288903,
"grad_norm": 6.841360569000244,
"learning_rate": 1.2143521388716676e-05,
"loss": 0.4969,
"step": 5070
},
{
"epoch": 0.78735275883447,
"grad_norm": 4.553008556365967,
"learning_rate": 1.2128022318660881e-05,
"loss": 0.6957,
"step": 5080
},
{
"epoch": 0.7889026658400496,
"grad_norm": 6.283394813537598,
"learning_rate": 1.2112523248605084e-05,
"loss": 0.8347,
"step": 5090
},
{
"epoch": 0.7904525728456293,
"grad_norm": 10.103006362915039,
"learning_rate": 1.2097024178549289e-05,
"loss": 0.7692,
"step": 5100
},
{
"epoch": 0.792002479851209,
"grad_norm": 7.552977085113525,
"learning_rate": 1.208152510849349e-05,
"loss": 0.5701,
"step": 5110
},
{
"epoch": 0.7935523868567886,
"grad_norm": 5.834062099456787,
"learning_rate": 1.2066026038437695e-05,
"loss": 0.5668,
"step": 5120
},
{
"epoch": 0.7951022938623683,
"grad_norm": 13.34677791595459,
"learning_rate": 1.2050526968381896e-05,
"loss": 0.8795,
"step": 5130
},
{
"epoch": 0.796652200867948,
"grad_norm": 3.1590523719787598,
"learning_rate": 1.2035027898326101e-05,
"loss": 0.681,
"step": 5140
},
{
"epoch": 0.7982021078735276,
"grad_norm": 10.1262845993042,
"learning_rate": 1.2019528828270304e-05,
"loss": 0.6726,
"step": 5150
},
{
"epoch": 0.7997520148791073,
"grad_norm": 6.833702564239502,
"learning_rate": 1.2004029758214509e-05,
"loss": 0.6204,
"step": 5160
},
{
"epoch": 0.801301921884687,
"grad_norm": 3.4104177951812744,
"learning_rate": 1.198853068815871e-05,
"loss": 0.4738,
"step": 5170
},
{
"epoch": 0.8028518288902666,
"grad_norm": 3.8799777030944824,
"learning_rate": 1.1973031618102915e-05,
"loss": 0.5993,
"step": 5180
},
{
"epoch": 0.8044017358958463,
"grad_norm": 3.472444772720337,
"learning_rate": 1.1957532548047116e-05,
"loss": 0.6978,
"step": 5190
},
{
"epoch": 0.805951642901426,
"grad_norm": 9.026004791259766,
"learning_rate": 1.1942033477991321e-05,
"loss": 0.864,
"step": 5200
},
{
"epoch": 0.8075015499070056,
"grad_norm": 5.485747814178467,
"learning_rate": 1.1926534407935524e-05,
"loss": 0.6341,
"step": 5210
},
{
"epoch": 0.8090514569125853,
"grad_norm": 7.51614236831665,
"learning_rate": 1.1911035337879729e-05,
"loss": 0.8633,
"step": 5220
},
{
"epoch": 0.810601363918165,
"grad_norm": 4.139168739318848,
"learning_rate": 1.189553626782393e-05,
"loss": 0.6028,
"step": 5230
},
{
"epoch": 0.8121512709237446,
"grad_norm": 6.211816310882568,
"learning_rate": 1.1880037197768135e-05,
"loss": 0.6971,
"step": 5240
},
{
"epoch": 0.8137011779293243,
"grad_norm": 5.013968467712402,
"learning_rate": 1.1864538127712336e-05,
"loss": 0.7027,
"step": 5250
},
{
"epoch": 0.815251084934904,
"grad_norm": 5.569365978240967,
"learning_rate": 1.1849039057656541e-05,
"loss": 0.6832,
"step": 5260
},
{
"epoch": 0.8168009919404836,
"grad_norm": 10.435909271240234,
"learning_rate": 1.1833539987600744e-05,
"loss": 0.6241,
"step": 5270
},
{
"epoch": 0.8183508989460633,
"grad_norm": 5.697261333465576,
"learning_rate": 1.1818040917544949e-05,
"loss": 0.747,
"step": 5280
},
{
"epoch": 0.8199008059516429,
"grad_norm": 1.861268401145935,
"learning_rate": 1.180254184748915e-05,
"loss": 0.4611,
"step": 5290
},
{
"epoch": 0.8214507129572226,
"grad_norm": 4.731881141662598,
"learning_rate": 1.1787042777433355e-05,
"loss": 0.747,
"step": 5300
},
{
"epoch": 0.8230006199628023,
"grad_norm": 3.6312947273254395,
"learning_rate": 1.1771543707377557e-05,
"loss": 0.7208,
"step": 5310
},
{
"epoch": 0.8245505269683819,
"grad_norm": 4.972537517547607,
"learning_rate": 1.1756044637321761e-05,
"loss": 0.6674,
"step": 5320
},
{
"epoch": 0.8261004339739616,
"grad_norm": 7.811915874481201,
"learning_rate": 1.1740545567265964e-05,
"loss": 0.8836,
"step": 5330
},
{
"epoch": 0.8276503409795413,
"grad_norm": 6.216588497161865,
"learning_rate": 1.172504649721017e-05,
"loss": 0.5343,
"step": 5340
},
{
"epoch": 0.8292002479851209,
"grad_norm": 4.911030292510986,
"learning_rate": 1.170954742715437e-05,
"loss": 0.8264,
"step": 5350
},
{
"epoch": 0.8307501549907006,
"grad_norm": 1.2918230295181274,
"learning_rate": 1.1694048357098575e-05,
"loss": 0.5548,
"step": 5360
},
{
"epoch": 0.8323000619962803,
"grad_norm": 6.884954929351807,
"learning_rate": 1.1678549287042777e-05,
"loss": 0.536,
"step": 5370
},
{
"epoch": 0.8338499690018599,
"grad_norm": 9.084111213684082,
"learning_rate": 1.1663050216986981e-05,
"loss": 0.5337,
"step": 5380
},
{
"epoch": 0.8353998760074396,
"grad_norm": 5.208802700042725,
"learning_rate": 1.1647551146931185e-05,
"loss": 0.5921,
"step": 5390
},
{
"epoch": 0.8369497830130193,
"grad_norm": 9.151023864746094,
"learning_rate": 1.163205207687539e-05,
"loss": 0.708,
"step": 5400
},
{
"epoch": 0.8384996900185989,
"grad_norm": 5.111368656158447,
"learning_rate": 1.161655300681959e-05,
"loss": 0.6403,
"step": 5410
},
{
"epoch": 0.8400495970241786,
"grad_norm": 10.290874481201172,
"learning_rate": 1.1601053936763795e-05,
"loss": 0.8278,
"step": 5420
},
{
"epoch": 0.8415995040297582,
"grad_norm": 7.0446577072143555,
"learning_rate": 1.1585554866707997e-05,
"loss": 0.6219,
"step": 5430
},
{
"epoch": 0.8431494110353379,
"grad_norm": 3.362149238586426,
"learning_rate": 1.1570055796652202e-05,
"loss": 0.4145,
"step": 5440
},
{
"epoch": 0.8446993180409176,
"grad_norm": 10.284631729125977,
"learning_rate": 1.1554556726596405e-05,
"loss": 0.7615,
"step": 5450
},
{
"epoch": 0.8462492250464972,
"grad_norm": 8.443507194519043,
"learning_rate": 1.153905765654061e-05,
"loss": 0.5863,
"step": 5460
},
{
"epoch": 0.8477991320520769,
"grad_norm": 10.936263084411621,
"learning_rate": 1.152355858648481e-05,
"loss": 0.5876,
"step": 5470
},
{
"epoch": 0.8493490390576566,
"grad_norm": 3.9119393825531006,
"learning_rate": 1.1508059516429015e-05,
"loss": 0.5987,
"step": 5480
},
{
"epoch": 0.8508989460632362,
"grad_norm": 12.641061782836914,
"learning_rate": 1.1492560446373217e-05,
"loss": 0.7522,
"step": 5490
},
{
"epoch": 0.8524488530688159,
"grad_norm": 5.811850070953369,
"learning_rate": 1.1477061376317422e-05,
"loss": 0.5882,
"step": 5500
},
{
"epoch": 0.8539987600743956,
"grad_norm": 12.124834060668945,
"learning_rate": 1.1461562306261625e-05,
"loss": 0.7449,
"step": 5510
},
{
"epoch": 0.8555486670799752,
"grad_norm": 5.387477874755859,
"learning_rate": 1.144606323620583e-05,
"loss": 0.6186,
"step": 5520
},
{
"epoch": 0.8570985740855549,
"grad_norm": 7.7974677085876465,
"learning_rate": 1.143056416615003e-05,
"loss": 0.6465,
"step": 5530
},
{
"epoch": 0.8586484810911346,
"grad_norm": 7.694963455200195,
"learning_rate": 1.1415065096094236e-05,
"loss": 0.6354,
"step": 5540
},
{
"epoch": 0.8601983880967142,
"grad_norm": 2.7126150131225586,
"learning_rate": 1.1399566026038437e-05,
"loss": 0.648,
"step": 5550
},
{
"epoch": 0.8617482951022939,
"grad_norm": 5.220517635345459,
"learning_rate": 1.1384066955982642e-05,
"loss": 0.7159,
"step": 5560
},
{
"epoch": 0.8632982021078736,
"grad_norm": 3.2614328861236572,
"learning_rate": 1.1368567885926845e-05,
"loss": 0.6143,
"step": 5570
},
{
"epoch": 0.8648481091134532,
"grad_norm": 7.20950174331665,
"learning_rate": 1.135306881587105e-05,
"loss": 0.6482,
"step": 5580
},
{
"epoch": 0.8663980161190329,
"grad_norm": 5.191160678863525,
"learning_rate": 1.1337569745815251e-05,
"loss": 0.6001,
"step": 5590
},
{
"epoch": 0.8679479231246126,
"grad_norm": 4.177618026733398,
"learning_rate": 1.1322070675759456e-05,
"loss": 0.5531,
"step": 5600
},
{
"epoch": 0.8694978301301922,
"grad_norm": 9.273628234863281,
"learning_rate": 1.1306571605703657e-05,
"loss": 0.7717,
"step": 5610
},
{
"epoch": 0.8710477371357719,
"grad_norm": 8.972816467285156,
"learning_rate": 1.1291072535647862e-05,
"loss": 0.6219,
"step": 5620
},
{
"epoch": 0.8725976441413515,
"grad_norm": 9.480229377746582,
"learning_rate": 1.1275573465592065e-05,
"loss": 0.5561,
"step": 5630
},
{
"epoch": 0.8741475511469312,
"grad_norm": 5.781515121459961,
"learning_rate": 1.126007439553627e-05,
"loss": 0.688,
"step": 5640
},
{
"epoch": 0.8756974581525109,
"grad_norm": 1.9353567361831665,
"learning_rate": 1.1244575325480471e-05,
"loss": 0.8956,
"step": 5650
},
{
"epoch": 0.8772473651580905,
"grad_norm": 4.833968639373779,
"learning_rate": 1.1229076255424676e-05,
"loss": 0.8678,
"step": 5660
},
{
"epoch": 0.8787972721636702,
"grad_norm": 4.125862121582031,
"learning_rate": 1.1213577185368877e-05,
"loss": 0.6416,
"step": 5670
},
{
"epoch": 0.8803471791692499,
"grad_norm": 5.829898834228516,
"learning_rate": 1.1198078115313082e-05,
"loss": 0.7404,
"step": 5680
},
{
"epoch": 0.8818970861748295,
"grad_norm": 3.0239686965942383,
"learning_rate": 1.1182579045257285e-05,
"loss": 0.612,
"step": 5690
},
{
"epoch": 0.8834469931804092,
"grad_norm": 6.91836404800415,
"learning_rate": 1.116707997520149e-05,
"loss": 0.7713,
"step": 5700
},
{
"epoch": 0.8849969001859889,
"grad_norm": 3.822946548461914,
"learning_rate": 1.1151580905145691e-05,
"loss": 0.5584,
"step": 5710
},
{
"epoch": 0.8865468071915685,
"grad_norm": 7.173694610595703,
"learning_rate": 1.1136081835089896e-05,
"loss": 0.6347,
"step": 5720
},
{
"epoch": 0.8880967141971482,
"grad_norm": 5.30385160446167,
"learning_rate": 1.1120582765034097e-05,
"loss": 0.652,
"step": 5730
},
{
"epoch": 0.8896466212027279,
"grad_norm": 5.35252571105957,
"learning_rate": 1.1105083694978302e-05,
"loss": 0.718,
"step": 5740
},
{
"epoch": 0.8911965282083075,
"grad_norm": 5.5714850425720215,
"learning_rate": 1.1089584624922505e-05,
"loss": 0.7949,
"step": 5750
},
{
"epoch": 0.8927464352138872,
"grad_norm": 1.9642515182495117,
"learning_rate": 1.107408555486671e-05,
"loss": 0.5296,
"step": 5760
},
{
"epoch": 0.8942963422194669,
"grad_norm": 5.072615623474121,
"learning_rate": 1.1058586484810911e-05,
"loss": 0.5537,
"step": 5770
},
{
"epoch": 0.8958462492250465,
"grad_norm": 3.3164658546447754,
"learning_rate": 1.1043087414755116e-05,
"loss": 0.6519,
"step": 5780
},
{
"epoch": 0.8973961562306262,
"grad_norm": 7.427783489227295,
"learning_rate": 1.1027588344699317e-05,
"loss": 0.7408,
"step": 5790
},
{
"epoch": 0.8989460632362059,
"grad_norm": 5.606306076049805,
"learning_rate": 1.1012089274643522e-05,
"loss": 0.7276,
"step": 5800
},
{
"epoch": 0.9004959702417855,
"grad_norm": 10.554597854614258,
"learning_rate": 1.0996590204587725e-05,
"loss": 0.8125,
"step": 5810
},
{
"epoch": 0.9020458772473652,
"grad_norm": 4.86166524887085,
"learning_rate": 1.098109113453193e-05,
"loss": 0.5918,
"step": 5820
},
{
"epoch": 0.9035957842529448,
"grad_norm": 7.691864013671875,
"learning_rate": 1.0965592064476131e-05,
"loss": 0.7419,
"step": 5830
},
{
"epoch": 0.9051456912585245,
"grad_norm": 7.993322849273682,
"learning_rate": 1.0950092994420336e-05,
"loss": 0.7957,
"step": 5840
},
{
"epoch": 0.9066955982641042,
"grad_norm": 5.241565227508545,
"learning_rate": 1.0934593924364537e-05,
"loss": 0.603,
"step": 5850
},
{
"epoch": 0.9082455052696838,
"grad_norm": 3.7309389114379883,
"learning_rate": 1.0919094854308742e-05,
"loss": 0.5684,
"step": 5860
},
{
"epoch": 0.9097954122752635,
"grad_norm": 6.568719387054443,
"learning_rate": 1.0903595784252945e-05,
"loss": 0.5626,
"step": 5870
},
{
"epoch": 0.9113453192808432,
"grad_norm": 12.020549774169922,
"learning_rate": 1.088809671419715e-05,
"loss": 0.6653,
"step": 5880
},
{
"epoch": 0.9128952262864228,
"grad_norm": 5.109013557434082,
"learning_rate": 1.0872597644141351e-05,
"loss": 0.6933,
"step": 5890
},
{
"epoch": 0.9144451332920025,
"grad_norm": 5.643796920776367,
"learning_rate": 1.0857098574085556e-05,
"loss": 0.5707,
"step": 5900
},
{
"epoch": 0.9159950402975822,
"grad_norm": 8.789315223693848,
"learning_rate": 1.0841599504029758e-05,
"loss": 0.5212,
"step": 5910
},
{
"epoch": 0.9175449473031618,
"grad_norm": 7.760068893432617,
"learning_rate": 1.0826100433973962e-05,
"loss": 0.618,
"step": 5920
},
{
"epoch": 0.9190948543087415,
"grad_norm": 6.435039520263672,
"learning_rate": 1.0810601363918165e-05,
"loss": 0.622,
"step": 5930
},
{
"epoch": 0.9206447613143212,
"grad_norm": 8.1256685256958,
"learning_rate": 1.079510229386237e-05,
"loss": 0.7558,
"step": 5940
},
{
"epoch": 0.9221946683199008,
"grad_norm": 6.529952049255371,
"learning_rate": 1.0779603223806571e-05,
"loss": 0.6672,
"step": 5950
},
{
"epoch": 0.9237445753254805,
"grad_norm": 7.354854583740234,
"learning_rate": 1.0764104153750776e-05,
"loss": 0.7919,
"step": 5960
},
{
"epoch": 0.9252944823310602,
"grad_norm": 4.898510456085205,
"learning_rate": 1.0748605083694978e-05,
"loss": 0.6045,
"step": 5970
},
{
"epoch": 0.9268443893366398,
"grad_norm": 13.176275253295898,
"learning_rate": 1.0733106013639182e-05,
"loss": 0.6485,
"step": 5980
},
{
"epoch": 0.9283942963422195,
"grad_norm": 8.560967445373535,
"learning_rate": 1.0717606943583385e-05,
"loss": 0.8658,
"step": 5990
},
{
"epoch": 0.9299442033477991,
"grad_norm": 5.788320541381836,
"learning_rate": 1.070210787352759e-05,
"loss": 0.7069,
"step": 6000
},
{
"epoch": 0.9314941103533788,
"grad_norm": 9.707194328308105,
"learning_rate": 1.0686608803471792e-05,
"loss": 0.5805,
"step": 6010
},
{
"epoch": 0.9330440173589585,
"grad_norm": 4.44306755065918,
"learning_rate": 1.0671109733415996e-05,
"loss": 0.6697,
"step": 6020
},
{
"epoch": 0.9345939243645381,
"grad_norm": 10.93558120727539,
"learning_rate": 1.0655610663360198e-05,
"loss": 0.5436,
"step": 6030
},
{
"epoch": 0.9361438313701178,
"grad_norm": 5.45318603515625,
"learning_rate": 1.0640111593304402e-05,
"loss": 0.4397,
"step": 6040
},
{
"epoch": 0.9376937383756975,
"grad_norm": 7.893631458282471,
"learning_rate": 1.0624612523248606e-05,
"loss": 0.6742,
"step": 6050
},
{
"epoch": 0.9392436453812771,
"grad_norm": 3.9682693481445312,
"learning_rate": 1.060911345319281e-05,
"loss": 0.75,
"step": 6060
},
{
"epoch": 0.9407935523868568,
"grad_norm": 9.17393684387207,
"learning_rate": 1.0593614383137012e-05,
"loss": 0.6765,
"step": 6070
},
{
"epoch": 0.9423434593924365,
"grad_norm": 6.118794918060303,
"learning_rate": 1.0578115313081216e-05,
"loss": 0.794,
"step": 6080
},
{
"epoch": 0.9438933663980161,
"grad_norm": 5.624363422393799,
"learning_rate": 1.056261624302542e-05,
"loss": 0.7256,
"step": 6090
},
{
"epoch": 0.9454432734035958,
"grad_norm": 4.654599666595459,
"learning_rate": 1.0547117172969623e-05,
"loss": 0.5472,
"step": 6100
},
{
"epoch": 0.9469931804091755,
"grad_norm": 3.4627890586853027,
"learning_rate": 1.0531618102913826e-05,
"loss": 0.787,
"step": 6110
},
{
"epoch": 0.9485430874147551,
"grad_norm": 4.292537212371826,
"learning_rate": 1.051611903285803e-05,
"loss": 0.719,
"step": 6120
},
{
"epoch": 0.9500929944203348,
"grad_norm": 8.54881477355957,
"learning_rate": 1.0500619962802232e-05,
"loss": 0.5448,
"step": 6130
},
{
"epoch": 0.9516429014259145,
"grad_norm": 6.626066207885742,
"learning_rate": 1.0485120892746437e-05,
"loss": 0.7148,
"step": 6140
},
{
"epoch": 0.9531928084314941,
"grad_norm": 5.619740009307861,
"learning_rate": 1.046962182269064e-05,
"loss": 0.9223,
"step": 6150
},
{
"epoch": 0.9547427154370738,
"grad_norm": 5.610943794250488,
"learning_rate": 1.0454122752634843e-05,
"loss": 0.7121,
"step": 6160
},
{
"epoch": 0.9562926224426535,
"grad_norm": 4.307107925415039,
"learning_rate": 1.0438623682579046e-05,
"loss": 0.7286,
"step": 6170
},
{
"epoch": 0.9578425294482331,
"grad_norm": 9.234688758850098,
"learning_rate": 1.042312461252325e-05,
"loss": 0.5873,
"step": 6180
},
{
"epoch": 0.9593924364538128,
"grad_norm": 4.530324935913086,
"learning_rate": 1.0407625542467452e-05,
"loss": 0.941,
"step": 6190
},
{
"epoch": 0.9609423434593924,
"grad_norm": 3.6115729808807373,
"learning_rate": 1.0392126472411657e-05,
"loss": 0.655,
"step": 6200
},
{
"epoch": 0.9624922504649721,
"grad_norm": 1.2821598052978516,
"learning_rate": 1.037662740235586e-05,
"loss": 0.6786,
"step": 6210
},
{
"epoch": 0.9640421574705518,
"grad_norm": 6.109910011291504,
"learning_rate": 1.0361128332300063e-05,
"loss": 0.7574,
"step": 6220
},
{
"epoch": 0.9655920644761314,
"grad_norm": 3.653637409210205,
"learning_rate": 1.0345629262244266e-05,
"loss": 0.5425,
"step": 6230
},
{
"epoch": 0.9671419714817111,
"grad_norm": 10.726699829101562,
"learning_rate": 1.033013019218847e-05,
"loss": 0.7053,
"step": 6240
},
{
"epoch": 0.9686918784872908,
"grad_norm": 4.3392863273620605,
"learning_rate": 1.0314631122132672e-05,
"loss": 0.5929,
"step": 6250
},
{
"epoch": 0.9702417854928704,
"grad_norm": 4.076502323150635,
"learning_rate": 1.0299132052076877e-05,
"loss": 0.6403,
"step": 6260
},
{
"epoch": 0.9717916924984501,
"grad_norm": 2.983644485473633,
"learning_rate": 1.028363298202108e-05,
"loss": 0.4164,
"step": 6270
},
{
"epoch": 0.9733415995040298,
"grad_norm": 3.6795578002929688,
"learning_rate": 1.0268133911965283e-05,
"loss": 0.7013,
"step": 6280
},
{
"epoch": 0.9748915065096094,
"grad_norm": 5.60479736328125,
"learning_rate": 1.0252634841909486e-05,
"loss": 1.0202,
"step": 6290
},
{
"epoch": 0.9764414135151891,
"grad_norm": 5.775755882263184,
"learning_rate": 1.023713577185369e-05,
"loss": 0.6361,
"step": 6300
},
{
"epoch": 0.9779913205207688,
"grad_norm": 5.529523849487305,
"learning_rate": 1.0221636701797892e-05,
"loss": 0.7928,
"step": 6310
},
{
"epoch": 0.9795412275263484,
"grad_norm": 6.792111396789551,
"learning_rate": 1.0206137631742097e-05,
"loss": 0.5417,
"step": 6320
},
{
"epoch": 0.9810911345319281,
"grad_norm": 8.089174270629883,
"learning_rate": 1.01906385616863e-05,
"loss": 0.4928,
"step": 6330
},
{
"epoch": 0.9826410415375078,
"grad_norm": 4.665356636047363,
"learning_rate": 1.0175139491630503e-05,
"loss": 0.6717,
"step": 6340
},
{
"epoch": 0.9841909485430874,
"grad_norm": 6.834210395812988,
"learning_rate": 1.0159640421574706e-05,
"loss": 0.9058,
"step": 6350
},
{
"epoch": 0.9857408555486671,
"grad_norm": 3.173269748687744,
"learning_rate": 1.014414135151891e-05,
"loss": 0.5223,
"step": 6360
},
{
"epoch": 0.9872907625542467,
"grad_norm": 5.208651065826416,
"learning_rate": 1.0128642281463112e-05,
"loss": 0.6229,
"step": 6370
},
{
"epoch": 0.9888406695598264,
"grad_norm": 4.102089881896973,
"learning_rate": 1.0113143211407317e-05,
"loss": 0.7312,
"step": 6380
},
{
"epoch": 0.9903905765654061,
"grad_norm": 3.437283754348755,
"learning_rate": 1.009764414135152e-05,
"loss": 0.6693,
"step": 6390
},
{
"epoch": 0.9919404835709857,
"grad_norm": 2.8786211013793945,
"learning_rate": 1.0082145071295723e-05,
"loss": 0.5546,
"step": 6400
},
{
"epoch": 0.9934903905765654,
"grad_norm": 4.383023262023926,
"learning_rate": 1.0066646001239926e-05,
"loss": 0.6212,
"step": 6410
},
{
"epoch": 0.9950402975821451,
"grad_norm": 9.466085433959961,
"learning_rate": 1.0051146931184131e-05,
"loss": 0.7268,
"step": 6420
},
{
"epoch": 0.9965902045877247,
"grad_norm": 6.23153018951416,
"learning_rate": 1.0035647861128332e-05,
"loss": 0.6134,
"step": 6430
},
{
"epoch": 0.9981401115933044,
"grad_norm": 5.540073394775391,
"learning_rate": 1.0020148791072537e-05,
"loss": 0.5933,
"step": 6440
},
{
"epoch": 0.9996900185988841,
"grad_norm": 3.962766170501709,
"learning_rate": 1.000464972101674e-05,
"loss": 0.5341,
"step": 6450
},
{
"epoch": 1.0,
"eval_accuracy": 0.3304628632938644,
"eval_loss": 1.8274908065795898,
"eval_runtime": 74.0511,
"eval_samples_per_second": 25.091,
"eval_steps_per_second": 3.146,
"step": 6452
},
{
"epoch": 1.0012399256044637,
"grad_norm": 3.318366050720215,
"learning_rate": 9.989150650960943e-06,
"loss": 0.6346,
"step": 6460
},
{
"epoch": 1.0027898326100435,
"grad_norm": 7.832874298095703,
"learning_rate": 9.973651580905146e-06,
"loss": 0.6954,
"step": 6470
},
{
"epoch": 1.004339739615623,
"grad_norm": 1.6306357383728027,
"learning_rate": 9.95815251084935e-06,
"loss": 0.5244,
"step": 6480
},
{
"epoch": 1.0058896466212028,
"grad_norm": 6.60695743560791,
"learning_rate": 9.942653440793552e-06,
"loss": 0.5221,
"step": 6490
},
{
"epoch": 1.0074395536267824,
"grad_norm": 7.101430416107178,
"learning_rate": 9.927154370737755e-06,
"loss": 0.8363,
"step": 6500
},
{
"epoch": 1.0089894606323622,
"grad_norm": 2.7308666706085205,
"learning_rate": 9.91165530068196e-06,
"loss": 0.497,
"step": 6510
},
{
"epoch": 1.0105393676379417,
"grad_norm": 7.153020858764648,
"learning_rate": 9.896156230626163e-06,
"loss": 0.733,
"step": 6520
},
{
"epoch": 1.0120892746435215,
"grad_norm": 8.50242805480957,
"learning_rate": 9.880657160570366e-06,
"loss": 0.6382,
"step": 6530
},
{
"epoch": 1.013639181649101,
"grad_norm": 3.3164360523223877,
"learning_rate": 9.86515809051457e-06,
"loss": 0.4422,
"step": 6540
},
{
"epoch": 1.0151890886546808,
"grad_norm": 9.223011016845703,
"learning_rate": 9.849659020458772e-06,
"loss": 0.7345,
"step": 6550
},
{
"epoch": 1.0167389956602604,
"grad_norm": 2.469008207321167,
"learning_rate": 9.834159950402975e-06,
"loss": 0.5223,
"step": 6560
},
{
"epoch": 1.0182889026658402,
"grad_norm": 8.593854904174805,
"learning_rate": 9.81866088034718e-06,
"loss": 0.6986,
"step": 6570
},
{
"epoch": 1.0198388096714197,
"grad_norm": 2.7528738975524902,
"learning_rate": 9.803161810291383e-06,
"loss": 0.5359,
"step": 6580
},
{
"epoch": 1.0213887166769995,
"grad_norm": 8.594300270080566,
"learning_rate": 9.787662740235586e-06,
"loss": 0.489,
"step": 6590
},
{
"epoch": 1.022938623682579,
"grad_norm": 10.482670783996582,
"learning_rate": 9.77216367017979e-06,
"loss": 0.5833,
"step": 6600
},
{
"epoch": 1.0244885306881588,
"grad_norm": 7.439445495605469,
"learning_rate": 9.756664600123993e-06,
"loss": 0.4675,
"step": 6610
},
{
"epoch": 1.0260384376937384,
"grad_norm": 5.366421699523926,
"learning_rate": 9.741165530068196e-06,
"loss": 0.5947,
"step": 6620
},
{
"epoch": 1.0275883446993181,
"grad_norm": 7.488732814788818,
"learning_rate": 9.7256664600124e-06,
"loss": 0.609,
"step": 6630
},
{
"epoch": 1.0291382517048977,
"grad_norm": 6.010669231414795,
"learning_rate": 9.710167389956603e-06,
"loss": 0.7328,
"step": 6640
},
{
"epoch": 1.0306881587104775,
"grad_norm": 5.864341735839844,
"learning_rate": 9.694668319900806e-06,
"loss": 0.4796,
"step": 6650
},
{
"epoch": 1.032238065716057,
"grad_norm": 9.411959648132324,
"learning_rate": 9.67916924984501e-06,
"loss": 0.4709,
"step": 6660
},
{
"epoch": 1.0337879727216368,
"grad_norm": 3.1275575160980225,
"learning_rate": 9.663670179789213e-06,
"loss": 0.4787,
"step": 6670
},
{
"epoch": 1.0353378797272164,
"grad_norm": 6.15601110458374,
"learning_rate": 9.648171109733416e-06,
"loss": 0.5337,
"step": 6680
},
{
"epoch": 1.0368877867327961,
"grad_norm": 5.146254062652588,
"learning_rate": 9.63267203967762e-06,
"loss": 0.5042,
"step": 6690
},
{
"epoch": 1.0384376937383757,
"grad_norm": 7.776717662811279,
"learning_rate": 9.617172969621824e-06,
"loss": 0.8024,
"step": 6700
},
{
"epoch": 1.0399876007439555,
"grad_norm": 7.797553539276123,
"learning_rate": 9.601673899566027e-06,
"loss": 0.5992,
"step": 6710
},
{
"epoch": 1.041537507749535,
"grad_norm": 6.587271213531494,
"learning_rate": 9.58617482951023e-06,
"loss": 0.5811,
"step": 6720
},
{
"epoch": 1.0430874147551148,
"grad_norm": 8.45751667022705,
"learning_rate": 9.570675759454433e-06,
"loss": 0.7937,
"step": 6730
},
{
"epoch": 1.0446373217606943,
"grad_norm": 6.197624206542969,
"learning_rate": 9.555176689398636e-06,
"loss": 0.5891,
"step": 6740
},
{
"epoch": 1.0461872287662741,
"grad_norm": 5.508509159088135,
"learning_rate": 9.53967761934284e-06,
"loss": 0.4753,
"step": 6750
},
{
"epoch": 1.0477371357718537,
"grad_norm": 4.740791320800781,
"learning_rate": 9.524178549287044e-06,
"loss": 0.6337,
"step": 6760
},
{
"epoch": 1.0492870427774335,
"grad_norm": 5.38060188293457,
"learning_rate": 9.508679479231247e-06,
"loss": 0.4552,
"step": 6770
},
{
"epoch": 1.050836949783013,
"grad_norm": 11.015870094299316,
"learning_rate": 9.49318040917545e-06,
"loss": 0.6587,
"step": 6780
},
{
"epoch": 1.0523868567885928,
"grad_norm": 6.224707126617432,
"learning_rate": 9.477681339119653e-06,
"loss": 0.5701,
"step": 6790
},
{
"epoch": 1.0539367637941723,
"grad_norm": 5.798990249633789,
"learning_rate": 9.462182269063856e-06,
"loss": 0.6514,
"step": 6800
},
{
"epoch": 1.0554866707997521,
"grad_norm": 4.468217372894287,
"learning_rate": 9.44668319900806e-06,
"loss": 0.6995,
"step": 6810
},
{
"epoch": 1.0570365778053317,
"grad_norm": 9.877872467041016,
"learning_rate": 9.431184128952264e-06,
"loss": 0.4892,
"step": 6820
},
{
"epoch": 1.0585864848109114,
"grad_norm": 1.780200481414795,
"learning_rate": 9.415685058896467e-06,
"loss": 0.498,
"step": 6830
},
{
"epoch": 1.060136391816491,
"grad_norm": 6.5555195808410645,
"learning_rate": 9.40018598884067e-06,
"loss": 0.4536,
"step": 6840
},
{
"epoch": 1.0616862988220708,
"grad_norm": 6.608431816101074,
"learning_rate": 9.384686918784873e-06,
"loss": 0.5125,
"step": 6850
},
{
"epoch": 1.0632362058276503,
"grad_norm": 2.031834125518799,
"learning_rate": 9.369187848729076e-06,
"loss": 0.4829,
"step": 6860
},
{
"epoch": 1.06478611283323,
"grad_norm": 2.5182507038116455,
"learning_rate": 9.35368877867328e-06,
"loss": 0.5326,
"step": 6870
},
{
"epoch": 1.0663360198388097,
"grad_norm": 5.3375396728515625,
"learning_rate": 9.338189708617484e-06,
"loss": 0.6348,
"step": 6880
},
{
"epoch": 1.0678859268443894,
"grad_norm": 3.0548906326293945,
"learning_rate": 9.322690638561687e-06,
"loss": 0.6643,
"step": 6890
},
{
"epoch": 1.069435833849969,
"grad_norm": 7.555820465087891,
"learning_rate": 9.30719156850589e-06,
"loss": 0.7388,
"step": 6900
},
{
"epoch": 1.0709857408555488,
"grad_norm": 6.28872537612915,
"learning_rate": 9.291692498450093e-06,
"loss": 0.5822,
"step": 6910
},
{
"epoch": 1.0725356478611283,
"grad_norm": 7.888534069061279,
"learning_rate": 9.276193428394296e-06,
"loss": 0.7131,
"step": 6920
},
{
"epoch": 1.074085554866708,
"grad_norm": 5.015232563018799,
"learning_rate": 9.2606943583385e-06,
"loss": 0.5975,
"step": 6930
},
{
"epoch": 1.0756354618722876,
"grad_norm": 3.5587122440338135,
"learning_rate": 9.245195288282704e-06,
"loss": 0.4989,
"step": 6940
},
{
"epoch": 1.0771853688778674,
"grad_norm": 6.01540994644165,
"learning_rate": 9.229696218226907e-06,
"loss": 0.7066,
"step": 6950
},
{
"epoch": 1.078735275883447,
"grad_norm": 4.600519180297852,
"learning_rate": 9.21419714817111e-06,
"loss": 0.4749,
"step": 6960
},
{
"epoch": 1.0802851828890268,
"grad_norm": 3.1370503902435303,
"learning_rate": 9.198698078115313e-06,
"loss": 0.4503,
"step": 6970
},
{
"epoch": 1.0818350898946063,
"grad_norm": 3.2662289142608643,
"learning_rate": 9.183199008059516e-06,
"loss": 0.4964,
"step": 6980
},
{
"epoch": 1.083384996900186,
"grad_norm": 5.026987075805664,
"learning_rate": 9.167699938003721e-06,
"loss": 0.7086,
"step": 6990
},
{
"epoch": 1.0849349039057656,
"grad_norm": 4.068531036376953,
"learning_rate": 9.152200867947924e-06,
"loss": 0.5591,
"step": 7000
},
{
"epoch": 1.0864848109113454,
"grad_norm": 6.460653781890869,
"learning_rate": 9.136701797892127e-06,
"loss": 0.5162,
"step": 7010
},
{
"epoch": 1.088034717916925,
"grad_norm": 4.980759620666504,
"learning_rate": 9.12120272783633e-06,
"loss": 0.6463,
"step": 7020
},
{
"epoch": 1.0895846249225047,
"grad_norm": 10.006918907165527,
"learning_rate": 9.105703657780533e-06,
"loss": 0.5328,
"step": 7030
},
{
"epoch": 1.0911345319280843,
"grad_norm": 8.877737998962402,
"learning_rate": 9.090204587724736e-06,
"loss": 0.7657,
"step": 7040
},
{
"epoch": 1.092684438933664,
"grad_norm": 5.694168567657471,
"learning_rate": 9.074705517668941e-06,
"loss": 0.562,
"step": 7050
},
{
"epoch": 1.0942343459392436,
"grad_norm": 3.7370598316192627,
"learning_rate": 9.059206447613144e-06,
"loss": 0.5905,
"step": 7060
},
{
"epoch": 1.0957842529448234,
"grad_norm": 4.786223411560059,
"learning_rate": 9.043707377557347e-06,
"loss": 0.5709,
"step": 7070
},
{
"epoch": 1.097334159950403,
"grad_norm": 3.233438014984131,
"learning_rate": 9.02820830750155e-06,
"loss": 0.5833,
"step": 7080
},
{
"epoch": 1.0988840669559827,
"grad_norm": 4.648927688598633,
"learning_rate": 9.012709237445753e-06,
"loss": 0.618,
"step": 7090
},
{
"epoch": 1.1004339739615623,
"grad_norm": 4.235779762268066,
"learning_rate": 8.997210167389956e-06,
"loss": 0.5516,
"step": 7100
},
{
"epoch": 1.101983880967142,
"grad_norm": 12.350091934204102,
"learning_rate": 8.981711097334161e-06,
"loss": 0.6225,
"step": 7110
},
{
"epoch": 1.1035337879727216,
"grad_norm": 2.811981678009033,
"learning_rate": 8.966212027278364e-06,
"loss": 0.6682,
"step": 7120
},
{
"epoch": 1.1050836949783014,
"grad_norm": 6.797405242919922,
"learning_rate": 8.950712957222567e-06,
"loss": 0.4458,
"step": 7130
},
{
"epoch": 1.106633601983881,
"grad_norm": 5.518606662750244,
"learning_rate": 8.93521388716677e-06,
"loss": 0.7404,
"step": 7140
},
{
"epoch": 1.1081835089894607,
"grad_norm": 5.039639472961426,
"learning_rate": 8.919714817110973e-06,
"loss": 0.5223,
"step": 7150
},
{
"epoch": 1.1097334159950403,
"grad_norm": 5.415022373199463,
"learning_rate": 8.904215747055176e-06,
"loss": 0.4564,
"step": 7160
},
{
"epoch": 1.11128332300062,
"grad_norm": 8.877534866333008,
"learning_rate": 8.888716676999381e-06,
"loss": 0.5022,
"step": 7170
},
{
"epoch": 1.1128332300061996,
"grad_norm": 2.6606900691986084,
"learning_rate": 8.873217606943584e-06,
"loss": 0.4871,
"step": 7180
},
{
"epoch": 1.1143831370117794,
"grad_norm": 2.137470006942749,
"learning_rate": 8.857718536887787e-06,
"loss": 0.433,
"step": 7190
},
{
"epoch": 1.115933044017359,
"grad_norm": 4.714048385620117,
"learning_rate": 8.84221946683199e-06,
"loss": 0.6005,
"step": 7200
},
{
"epoch": 1.1174829510229387,
"grad_norm": 0.6257525086402893,
"learning_rate": 8.826720396776193e-06,
"loss": 0.6731,
"step": 7210
},
{
"epoch": 1.1190328580285183,
"grad_norm": 6.286426544189453,
"learning_rate": 8.811221326720397e-06,
"loss": 0.6277,
"step": 7220
},
{
"epoch": 1.120582765034098,
"grad_norm": 2.719097137451172,
"learning_rate": 8.795722256664601e-06,
"loss": 0.4603,
"step": 7230
},
{
"epoch": 1.1221326720396776,
"grad_norm": 6.69702672958374,
"learning_rate": 8.780223186608804e-06,
"loss": 0.4442,
"step": 7240
},
{
"epoch": 1.1236825790452574,
"grad_norm": 8.40479850769043,
"learning_rate": 8.764724116553007e-06,
"loss": 0.5393,
"step": 7250
},
{
"epoch": 1.125232486050837,
"grad_norm": 1.8190903663635254,
"learning_rate": 8.74922504649721e-06,
"loss": 0.5887,
"step": 7260
},
{
"epoch": 1.1267823930564167,
"grad_norm": 6.128231048583984,
"learning_rate": 8.733725976441414e-06,
"loss": 0.815,
"step": 7270
},
{
"epoch": 1.1283323000619963,
"grad_norm": 4.16558313369751,
"learning_rate": 8.718226906385617e-06,
"loss": 0.374,
"step": 7280
},
{
"epoch": 1.129882207067576,
"grad_norm": 3.754733085632324,
"learning_rate": 8.702727836329821e-06,
"loss": 0.5951,
"step": 7290
},
{
"epoch": 1.1314321140731556,
"grad_norm": 4.329035758972168,
"learning_rate": 8.687228766274024e-06,
"loss": 0.5914,
"step": 7300
},
{
"epoch": 1.1329820210787354,
"grad_norm": 8.114266395568848,
"learning_rate": 8.671729696218228e-06,
"loss": 0.5051,
"step": 7310
},
{
"epoch": 1.134531928084315,
"grad_norm": 4.3097310066223145,
"learning_rate": 8.65623062616243e-06,
"loss": 0.649,
"step": 7320
},
{
"epoch": 1.1360818350898947,
"grad_norm": 8.02452278137207,
"learning_rate": 8.640731556106634e-06,
"loss": 0.583,
"step": 7330
},
{
"epoch": 1.1376317420954742,
"grad_norm": 9.19294261932373,
"learning_rate": 8.625232486050837e-06,
"loss": 0.611,
"step": 7340
},
{
"epoch": 1.139181649101054,
"grad_norm": 4.483541965484619,
"learning_rate": 8.609733415995041e-06,
"loss": 0.6056,
"step": 7350
},
{
"epoch": 1.1407315561066336,
"grad_norm": 7.466578960418701,
"learning_rate": 8.594234345939245e-06,
"loss": 0.5618,
"step": 7360
},
{
"epoch": 1.1422814631122133,
"grad_norm": 7.183987617492676,
"learning_rate": 8.578735275883448e-06,
"loss": 0.6829,
"step": 7370
},
{
"epoch": 1.143831370117793,
"grad_norm": 10.142129898071289,
"learning_rate": 8.56323620582765e-06,
"loss": 0.5294,
"step": 7380
},
{
"epoch": 1.1453812771233727,
"grad_norm": 6.052461624145508,
"learning_rate": 8.547737135771854e-06,
"loss": 0.6231,
"step": 7390
},
{
"epoch": 1.1469311841289522,
"grad_norm": 6.29191255569458,
"learning_rate": 8.532238065716057e-06,
"loss": 0.7254,
"step": 7400
},
{
"epoch": 1.148481091134532,
"grad_norm": 3.439829111099243,
"learning_rate": 8.516738995660262e-06,
"loss": 0.475,
"step": 7410
},
{
"epoch": 1.1500309981401116,
"grad_norm": 6.537298679351807,
"learning_rate": 8.501239925604465e-06,
"loss": 0.5389,
"step": 7420
},
{
"epoch": 1.1515809051456913,
"grad_norm": 6.487965106964111,
"learning_rate": 8.485740855548668e-06,
"loss": 0.7397,
"step": 7430
},
{
"epoch": 1.153130812151271,
"grad_norm": 8.069173812866211,
"learning_rate": 8.47024178549287e-06,
"loss": 0.6333,
"step": 7440
},
{
"epoch": 1.1546807191568507,
"grad_norm": 3.9428892135620117,
"learning_rate": 8.454742715437074e-06,
"loss": 0.5218,
"step": 7450
},
{
"epoch": 1.1562306261624302,
"grad_norm": 4.833446502685547,
"learning_rate": 8.439243645381277e-06,
"loss": 0.6174,
"step": 7460
},
{
"epoch": 1.15778053316801,
"grad_norm": 3.746244192123413,
"learning_rate": 8.423744575325482e-06,
"loss": 0.6375,
"step": 7470
},
{
"epoch": 1.1593304401735895,
"grad_norm": 11.369379043579102,
"learning_rate": 8.408245505269685e-06,
"loss": 0.5214,
"step": 7480
},
{
"epoch": 1.1608803471791693,
"grad_norm": 4.5933146476745605,
"learning_rate": 8.392746435213888e-06,
"loss": 0.5573,
"step": 7490
},
{
"epoch": 1.1624302541847489,
"grad_norm": 11.557657241821289,
"learning_rate": 8.377247365158091e-06,
"loss": 0.5721,
"step": 7500
},
{
"epoch": 1.1639801611903287,
"grad_norm": 8.945573806762695,
"learning_rate": 8.361748295102294e-06,
"loss": 0.6176,
"step": 7510
},
{
"epoch": 1.1655300681959082,
"grad_norm": 3.769106864929199,
"learning_rate": 8.346249225046497e-06,
"loss": 0.4116,
"step": 7520
},
{
"epoch": 1.167079975201488,
"grad_norm": 1.9121352434158325,
"learning_rate": 8.330750154990702e-06,
"loss": 0.4073,
"step": 7530
},
{
"epoch": 1.1686298822070675,
"grad_norm": 5.85322380065918,
"learning_rate": 8.315251084934905e-06,
"loss": 0.6172,
"step": 7540
},
{
"epoch": 1.1701797892126473,
"grad_norm": 5.601438045501709,
"learning_rate": 8.299752014879108e-06,
"loss": 0.6072,
"step": 7550
},
{
"epoch": 1.1717296962182269,
"grad_norm": 10.24169921875,
"learning_rate": 8.284252944823311e-06,
"loss": 0.7041,
"step": 7560
},
{
"epoch": 1.1732796032238066,
"grad_norm": 5.549180030822754,
"learning_rate": 8.268753874767514e-06,
"loss": 0.7634,
"step": 7570
},
{
"epoch": 1.1748295102293862,
"grad_norm": 9.62266731262207,
"learning_rate": 8.253254804711717e-06,
"loss": 0.6324,
"step": 7580
},
{
"epoch": 1.176379417234966,
"grad_norm": 10.882791519165039,
"learning_rate": 8.237755734655922e-06,
"loss": 0.5471,
"step": 7590
},
{
"epoch": 1.1779293242405455,
"grad_norm": 4.671418190002441,
"learning_rate": 8.222256664600125e-06,
"loss": 0.5437,
"step": 7600
},
{
"epoch": 1.1794792312461253,
"grad_norm": 4.53378963470459,
"learning_rate": 8.206757594544328e-06,
"loss": 0.4634,
"step": 7610
},
{
"epoch": 1.1810291382517049,
"grad_norm": 7.487468719482422,
"learning_rate": 8.191258524488531e-06,
"loss": 0.5741,
"step": 7620
},
{
"epoch": 1.1825790452572846,
"grad_norm": 2.953185796737671,
"learning_rate": 8.175759454432734e-06,
"loss": 0.4571,
"step": 7630
},
{
"epoch": 1.1841289522628642,
"grad_norm": 4.020585536956787,
"learning_rate": 8.160260384376937e-06,
"loss": 0.5718,
"step": 7640
},
{
"epoch": 1.185678859268444,
"grad_norm": 15.922945976257324,
"learning_rate": 8.144761314321142e-06,
"loss": 0.5363,
"step": 7650
},
{
"epoch": 1.1872287662740235,
"grad_norm": 12.803021430969238,
"learning_rate": 8.129262244265345e-06,
"loss": 0.5454,
"step": 7660
},
{
"epoch": 1.1887786732796033,
"grad_norm": 7.86555814743042,
"learning_rate": 8.113763174209548e-06,
"loss": 0.6119,
"step": 7670
},
{
"epoch": 1.1903285802851828,
"grad_norm": 0.6643087267875671,
"learning_rate": 8.098264104153751e-06,
"loss": 0.5261,
"step": 7680
},
{
"epoch": 1.1918784872907626,
"grad_norm": 6.745521545410156,
"learning_rate": 8.082765034097954e-06,
"loss": 0.55,
"step": 7690
},
{
"epoch": 1.1934283942963422,
"grad_norm": 9.781978607177734,
"learning_rate": 8.067265964042157e-06,
"loss": 0.838,
"step": 7700
},
{
"epoch": 1.194978301301922,
"grad_norm": 8.02038288116455,
"learning_rate": 8.051766893986362e-06,
"loss": 0.6207,
"step": 7710
},
{
"epoch": 1.1965282083075015,
"grad_norm": 12.820682525634766,
"learning_rate": 8.036267823930565e-06,
"loss": 0.7424,
"step": 7720
},
{
"epoch": 1.1980781153130813,
"grad_norm": 4.991171836853027,
"learning_rate": 8.020768753874768e-06,
"loss": 0.5592,
"step": 7730
},
{
"epoch": 1.1996280223186608,
"grad_norm": 4.925070762634277,
"learning_rate": 8.005269683818971e-06,
"loss": 0.5062,
"step": 7740
},
{
"epoch": 1.2011779293242406,
"grad_norm": 7.093862533569336,
"learning_rate": 7.989770613763174e-06,
"loss": 0.5812,
"step": 7750
},
{
"epoch": 1.2027278363298202,
"grad_norm": 4.071722507476807,
"learning_rate": 7.974271543707377e-06,
"loss": 0.5521,
"step": 7760
},
{
"epoch": 1.2042777433354,
"grad_norm": 3.5116024017333984,
"learning_rate": 7.958772473651582e-06,
"loss": 0.7707,
"step": 7770
},
{
"epoch": 1.2058276503409795,
"grad_norm": 2.9257960319519043,
"learning_rate": 7.943273403595785e-06,
"loss": 0.5412,
"step": 7780
},
{
"epoch": 1.2073775573465593,
"grad_norm": 10.328827857971191,
"learning_rate": 7.927774333539988e-06,
"loss": 0.5527,
"step": 7790
},
{
"epoch": 1.2089274643521388,
"grad_norm": 5.835555076599121,
"learning_rate": 7.912275263484191e-06,
"loss": 0.5774,
"step": 7800
},
{
"epoch": 1.2104773713577186,
"grad_norm": 16.687910079956055,
"learning_rate": 7.896776193428394e-06,
"loss": 0.7163,
"step": 7810
},
{
"epoch": 1.2120272783632982,
"grad_norm": 4.149749755859375,
"learning_rate": 7.881277123372597e-06,
"loss": 0.4435,
"step": 7820
},
{
"epoch": 1.213577185368878,
"grad_norm": 2.3621935844421387,
"learning_rate": 7.865778053316802e-06,
"loss": 0.4906,
"step": 7830
},
{
"epoch": 1.2151270923744575,
"grad_norm": 3.1414833068847656,
"learning_rate": 7.850278983261005e-06,
"loss": 0.4805,
"step": 7840
},
{
"epoch": 1.2166769993800373,
"grad_norm": 9.7128267288208,
"learning_rate": 7.834779913205208e-06,
"loss": 0.664,
"step": 7850
},
{
"epoch": 1.2182269063856168,
"grad_norm": 3.530634641647339,
"learning_rate": 7.819280843149411e-06,
"loss": 0.7153,
"step": 7860
},
{
"epoch": 1.2197768133911966,
"grad_norm": 7.077465534210205,
"learning_rate": 7.803781773093614e-06,
"loss": 0.5632,
"step": 7870
},
{
"epoch": 1.2213267203967761,
"grad_norm": 3.9259166717529297,
"learning_rate": 7.788282703037818e-06,
"loss": 0.6019,
"step": 7880
},
{
"epoch": 1.222876627402356,
"grad_norm": 8.732478141784668,
"learning_rate": 7.772783632982022e-06,
"loss": 0.6607,
"step": 7890
},
{
"epoch": 1.2244265344079355,
"grad_norm": 9.4745512008667,
"learning_rate": 7.757284562926225e-06,
"loss": 0.5651,
"step": 7900
},
{
"epoch": 1.2259764414135152,
"grad_norm": 3.427568197250366,
"learning_rate": 7.741785492870428e-06,
"loss": 0.5922,
"step": 7910
},
{
"epoch": 1.2275263484190948,
"grad_norm": 6.197938919067383,
"learning_rate": 7.726286422814632e-06,
"loss": 0.5734,
"step": 7920
},
{
"epoch": 1.2290762554246746,
"grad_norm": 5.879312038421631,
"learning_rate": 7.710787352758835e-06,
"loss": 0.7146,
"step": 7930
},
{
"epoch": 1.2306261624302541,
"grad_norm": 4.559432506561279,
"learning_rate": 7.695288282703038e-06,
"loss": 0.4991,
"step": 7940
},
{
"epoch": 1.232176069435834,
"grad_norm": 5.727523326873779,
"learning_rate": 7.679789212647242e-06,
"loss": 0.6537,
"step": 7950
},
{
"epoch": 1.2337259764414135,
"grad_norm": 7.712017059326172,
"learning_rate": 7.664290142591445e-06,
"loss": 0.6054,
"step": 7960
},
{
"epoch": 1.2352758834469932,
"grad_norm": 3.195868968963623,
"learning_rate": 7.648791072535649e-06,
"loss": 0.8091,
"step": 7970
},
{
"epoch": 1.2368257904525728,
"grad_norm": 9.342826843261719,
"learning_rate": 7.633292002479852e-06,
"loss": 0.5558,
"step": 7980
},
{
"epoch": 1.2383756974581526,
"grad_norm": 4.6453375816345215,
"learning_rate": 7.6177929324240555e-06,
"loss": 0.4514,
"step": 7990
},
{
"epoch": 1.2399256044637321,
"grad_norm": 8.93087100982666,
"learning_rate": 7.602293862368259e-06,
"loss": 0.6491,
"step": 8000
},
{
"epoch": 1.241475511469312,
"grad_norm": 3.648250102996826,
"learning_rate": 7.586794792312462e-06,
"loss": 0.6337,
"step": 8010
},
{
"epoch": 1.2430254184748915,
"grad_norm": 8.233920097351074,
"learning_rate": 7.5712957222566656e-06,
"loss": 0.5982,
"step": 8020
},
{
"epoch": 1.2445753254804712,
"grad_norm": 7.656149864196777,
"learning_rate": 7.555796652200869e-06,
"loss": 0.4163,
"step": 8030
},
{
"epoch": 1.2461252324860508,
"grad_norm": 3.028169631958008,
"learning_rate": 7.540297582145072e-06,
"loss": 0.5173,
"step": 8040
},
{
"epoch": 1.2476751394916306,
"grad_norm": 10.122218132019043,
"learning_rate": 7.524798512089276e-06,
"loss": 0.389,
"step": 8050
},
{
"epoch": 1.2492250464972101,
"grad_norm": 5.4110236167907715,
"learning_rate": 7.509299442033479e-06,
"loss": 0.5307,
"step": 8060
},
{
"epoch": 1.2507749535027899,
"grad_norm": 11.099316596984863,
"learning_rate": 7.493800371977682e-06,
"loss": 0.62,
"step": 8070
},
{
"epoch": 1.2523248605083694,
"grad_norm": 5.948646545410156,
"learning_rate": 7.478301301921886e-06,
"loss": 0.676,
"step": 8080
},
{
"epoch": 1.2538747675139492,
"grad_norm": 13.428414344787598,
"learning_rate": 7.462802231866089e-06,
"loss": 0.5017,
"step": 8090
},
{
"epoch": 1.2554246745195288,
"grad_norm": 7.999103546142578,
"learning_rate": 7.447303161810292e-06,
"loss": 0.511,
"step": 8100
},
{
"epoch": 1.2569745815251085,
"grad_norm": 3.18576717376709,
"learning_rate": 7.431804091754496e-06,
"loss": 0.4457,
"step": 8110
},
{
"epoch": 1.258524488530688,
"grad_norm": 6.90123176574707,
"learning_rate": 7.416305021698699e-06,
"loss": 0.4758,
"step": 8120
},
{
"epoch": 1.2600743955362679,
"grad_norm": 6.826712608337402,
"learning_rate": 7.400805951642902e-06,
"loss": 0.596,
"step": 8130
},
{
"epoch": 1.2616243025418474,
"grad_norm": 1.5102648735046387,
"learning_rate": 7.385306881587106e-06,
"loss": 0.5432,
"step": 8140
},
{
"epoch": 1.2631742095474272,
"grad_norm": 6.042043209075928,
"learning_rate": 7.369807811531309e-06,
"loss": 0.7783,
"step": 8150
},
{
"epoch": 1.2647241165530068,
"grad_norm": 4.189500331878662,
"learning_rate": 7.354308741475512e-06,
"loss": 0.4225,
"step": 8160
},
{
"epoch": 1.2662740235585865,
"grad_norm": 3.8514328002929688,
"learning_rate": 7.338809671419716e-06,
"loss": 0.7909,
"step": 8170
},
{
"epoch": 1.267823930564166,
"grad_norm": 10.183280944824219,
"learning_rate": 7.323310601363919e-06,
"loss": 0.7079,
"step": 8180
},
{
"epoch": 1.2693738375697459,
"grad_norm": 7.852726936340332,
"learning_rate": 7.307811531308122e-06,
"loss": 0.5453,
"step": 8190
},
{
"epoch": 1.2709237445753254,
"grad_norm": 6.670209884643555,
"learning_rate": 7.292312461252326e-06,
"loss": 0.534,
"step": 8200
},
{
"epoch": 1.2724736515809052,
"grad_norm": 14.171867370605469,
"learning_rate": 7.276813391196529e-06,
"loss": 0.6302,
"step": 8210
},
{
"epoch": 1.2740235585864848,
"grad_norm": 3.3877339363098145,
"learning_rate": 7.261314321140732e-06,
"loss": 0.5812,
"step": 8220
},
{
"epoch": 1.2755734655920645,
"grad_norm": 5.41657018661499,
"learning_rate": 7.245815251084936e-06,
"loss": 0.5913,
"step": 8230
},
{
"epoch": 1.277123372597644,
"grad_norm": 7.353067874908447,
"learning_rate": 7.230316181029139e-06,
"loss": 0.5484,
"step": 8240
},
{
"epoch": 1.2786732796032239,
"grad_norm": 4.998973369598389,
"learning_rate": 7.214817110973342e-06,
"loss": 0.6293,
"step": 8250
},
{
"epoch": 1.2802231866088034,
"grad_norm": 7.433141708374023,
"learning_rate": 7.199318040917546e-06,
"loss": 0.5088,
"step": 8260
},
{
"epoch": 1.2817730936143832,
"grad_norm": 10.44473934173584,
"learning_rate": 7.183818970861749e-06,
"loss": 0.7561,
"step": 8270
},
{
"epoch": 1.2833230006199627,
"grad_norm": 11.532959938049316,
"learning_rate": 7.168319900805952e-06,
"loss": 0.5057,
"step": 8280
},
{
"epoch": 1.2848729076255425,
"grad_norm": 0.80668044090271,
"learning_rate": 7.152820830750156e-06,
"loss": 0.5564,
"step": 8290
},
{
"epoch": 1.286422814631122,
"grad_norm": 5.980687141418457,
"learning_rate": 7.137321760694359e-06,
"loss": 0.4906,
"step": 8300
},
{
"epoch": 1.2879727216367018,
"grad_norm": 6.244668006896973,
"learning_rate": 7.121822690638562e-06,
"loss": 0.4905,
"step": 8310
},
{
"epoch": 1.2895226286422814,
"grad_norm": 6.673967361450195,
"learning_rate": 7.106323620582766e-06,
"loss": 0.5872,
"step": 8320
},
{
"epoch": 1.2910725356478612,
"grad_norm": 4.508692264556885,
"learning_rate": 7.090824550526969e-06,
"loss": 0.4922,
"step": 8330
},
{
"epoch": 1.2926224426534407,
"grad_norm": 9.3928804397583,
"learning_rate": 7.075325480471172e-06,
"loss": 0.4992,
"step": 8340
},
{
"epoch": 1.2941723496590205,
"grad_norm": 11.144902229309082,
"learning_rate": 7.059826410415376e-06,
"loss": 0.5935,
"step": 8350
},
{
"epoch": 1.2957222566646,
"grad_norm": 7.52712345123291,
"learning_rate": 7.044327340359579e-06,
"loss": 0.4821,
"step": 8360
},
{
"epoch": 1.2972721636701798,
"grad_norm": 5.781824588775635,
"learning_rate": 7.028828270303782e-06,
"loss": 0.4129,
"step": 8370
},
{
"epoch": 1.2988220706757594,
"grad_norm": 5.450551509857178,
"learning_rate": 7.013329200247986e-06,
"loss": 0.3729,
"step": 8380
},
{
"epoch": 1.3003719776813392,
"grad_norm": 8.195019721984863,
"learning_rate": 6.997830130192189e-06,
"loss": 0.5217,
"step": 8390
},
{
"epoch": 1.3019218846869187,
"grad_norm": 3.340574264526367,
"learning_rate": 6.982331060136392e-06,
"loss": 0.4438,
"step": 8400
},
{
"epoch": 1.3034717916924985,
"grad_norm": 8.651931762695312,
"learning_rate": 6.966831990080596e-06,
"loss": 0.4275,
"step": 8410
},
{
"epoch": 1.305021698698078,
"grad_norm": 7.959225654602051,
"learning_rate": 6.951332920024799e-06,
"loss": 0.6702,
"step": 8420
},
{
"epoch": 1.3065716057036578,
"grad_norm": 2.9623653888702393,
"learning_rate": 6.935833849969002e-06,
"loss": 0.596,
"step": 8430
},
{
"epoch": 1.3081215127092374,
"grad_norm": 4.792098522186279,
"learning_rate": 6.920334779913206e-06,
"loss": 0.5806,
"step": 8440
},
{
"epoch": 1.3096714197148172,
"grad_norm": 6.721735000610352,
"learning_rate": 6.904835709857409e-06,
"loss": 0.452,
"step": 8450
},
{
"epoch": 1.3112213267203967,
"grad_norm": 4.370045185089111,
"learning_rate": 6.889336639801612e-06,
"loss": 0.5525,
"step": 8460
},
{
"epoch": 1.3127712337259765,
"grad_norm": 8.37392520904541,
"learning_rate": 6.873837569745816e-06,
"loss": 0.3949,
"step": 8470
},
{
"epoch": 1.314321140731556,
"grad_norm": 3.662019968032837,
"learning_rate": 6.858338499690019e-06,
"loss": 0.6239,
"step": 8480
},
{
"epoch": 1.3158710477371358,
"grad_norm": 8.094034194946289,
"learning_rate": 6.842839429634222e-06,
"loss": 0.6033,
"step": 8490
},
{
"epoch": 1.3174209547427154,
"grad_norm": 4.170316696166992,
"learning_rate": 6.827340359578426e-06,
"loss": 0.7513,
"step": 8500
},
{
"epoch": 1.3189708617482951,
"grad_norm": 6.951697826385498,
"learning_rate": 6.811841289522629e-06,
"loss": 0.6167,
"step": 8510
},
{
"epoch": 1.3205207687538747,
"grad_norm": 8.492193222045898,
"learning_rate": 6.7963422194668325e-06,
"loss": 0.4893,
"step": 8520
},
{
"epoch": 1.3220706757594545,
"grad_norm": 1.626686692237854,
"learning_rate": 6.780843149411036e-06,
"loss": 0.5387,
"step": 8530
},
{
"epoch": 1.323620582765034,
"grad_norm": 4.959685802459717,
"learning_rate": 6.7653440793552394e-06,
"loss": 0.6753,
"step": 8540
},
{
"epoch": 1.3251704897706138,
"grad_norm": 5.371678352355957,
"learning_rate": 6.7498450092994425e-06,
"loss": 0.5722,
"step": 8550
},
{
"epoch": 1.3267203967761934,
"grad_norm": 5.171933650970459,
"learning_rate": 6.734345939243646e-06,
"loss": 0.4267,
"step": 8560
},
{
"epoch": 1.3282703037817731,
"grad_norm": 5.8038201332092285,
"learning_rate": 6.7188468691878495e-06,
"loss": 0.4323,
"step": 8570
},
{
"epoch": 1.3298202107873527,
"grad_norm": 7.018991470336914,
"learning_rate": 6.7033477991320526e-06,
"loss": 0.5106,
"step": 8580
},
{
"epoch": 1.3313701177929325,
"grad_norm": 6.064144611358643,
"learning_rate": 6.6878487290762565e-06,
"loss": 0.4794,
"step": 8590
},
{
"epoch": 1.332920024798512,
"grad_norm": 12.810579299926758,
"learning_rate": 6.6723496590204595e-06,
"loss": 0.4787,
"step": 8600
},
{
"epoch": 1.3344699318040918,
"grad_norm": 7.543432235717773,
"learning_rate": 6.656850588964663e-06,
"loss": 0.6821,
"step": 8610
},
{
"epoch": 1.3360198388096713,
"grad_norm": 6.412388801574707,
"learning_rate": 6.6413515189088665e-06,
"loss": 0.6768,
"step": 8620
},
{
"epoch": 1.3375697458152511,
"grad_norm": 7.301428318023682,
"learning_rate": 6.62585244885307e-06,
"loss": 0.3714,
"step": 8630
},
{
"epoch": 1.3391196528208307,
"grad_norm": 5.536515712738037,
"learning_rate": 6.610353378797273e-06,
"loss": 0.6862,
"step": 8640
},
{
"epoch": 1.3406695598264105,
"grad_norm": 7.190949440002441,
"learning_rate": 6.5948543087414766e-06,
"loss": 0.6415,
"step": 8650
},
{
"epoch": 1.34221946683199,
"grad_norm": 4.371210098266602,
"learning_rate": 6.57935523868568e-06,
"loss": 0.4792,
"step": 8660
},
{
"epoch": 1.3437693738375698,
"grad_norm": 3.757610559463501,
"learning_rate": 6.563856168629883e-06,
"loss": 0.6153,
"step": 8670
},
{
"epoch": 1.3453192808431493,
"grad_norm": 7.589756488800049,
"learning_rate": 6.548357098574087e-06,
"loss": 0.5786,
"step": 8680
},
{
"epoch": 1.346869187848729,
"grad_norm": 3.139828681945801,
"learning_rate": 6.53285802851829e-06,
"loss": 0.8185,
"step": 8690
},
{
"epoch": 1.3484190948543087,
"grad_norm": 10.385107040405273,
"learning_rate": 6.517358958462493e-06,
"loss": 0.6463,
"step": 8700
},
{
"epoch": 1.3499690018598884,
"grad_norm": 9.19307804107666,
"learning_rate": 6.501859888406697e-06,
"loss": 0.5454,
"step": 8710
},
{
"epoch": 1.351518908865468,
"grad_norm": 7.461082935333252,
"learning_rate": 6.4863608183509e-06,
"loss": 0.5529,
"step": 8720
},
{
"epoch": 1.3530688158710478,
"grad_norm": 8.469099998474121,
"learning_rate": 6.470861748295103e-06,
"loss": 0.521,
"step": 8730
},
{
"epoch": 1.3546187228766273,
"grad_norm": 5.376317501068115,
"learning_rate": 6.455362678239307e-06,
"loss": 0.4741,
"step": 8740
},
{
"epoch": 1.356168629882207,
"grad_norm": 7.034505367279053,
"learning_rate": 6.43986360818351e-06,
"loss": 0.5807,
"step": 8750
},
{
"epoch": 1.3577185368877867,
"grad_norm": 6.779096603393555,
"learning_rate": 6.424364538127713e-06,
"loss": 0.6814,
"step": 8760
},
{
"epoch": 1.3592684438933664,
"grad_norm": 7.2761077880859375,
"learning_rate": 6.408865468071917e-06,
"loss": 0.6118,
"step": 8770
},
{
"epoch": 1.360818350898946,
"grad_norm": 5.111270427703857,
"learning_rate": 6.39336639801612e-06,
"loss": 0.4242,
"step": 8780
},
{
"epoch": 1.3623682579045258,
"grad_norm": 7.900925159454346,
"learning_rate": 6.377867327960323e-06,
"loss": 0.6002,
"step": 8790
},
{
"epoch": 1.3639181649101053,
"grad_norm": 4.93954610824585,
"learning_rate": 6.362368257904527e-06,
"loss": 0.6001,
"step": 8800
},
{
"epoch": 1.365468071915685,
"grad_norm": 4.872836589813232,
"learning_rate": 6.34686918784873e-06,
"loss": 0.6633,
"step": 8810
},
{
"epoch": 1.3670179789212646,
"grad_norm": 6.094171047210693,
"learning_rate": 6.331370117792933e-06,
"loss": 0.485,
"step": 8820
},
{
"epoch": 1.3685678859268444,
"grad_norm": 6.762317180633545,
"learning_rate": 6.315871047737137e-06,
"loss": 0.5592,
"step": 8830
},
{
"epoch": 1.370117792932424,
"grad_norm": 6.48795747756958,
"learning_rate": 6.30037197768134e-06,
"loss": 0.5515,
"step": 8840
},
{
"epoch": 1.3716676999380037,
"grad_norm": 6.643194675445557,
"learning_rate": 6.284872907625543e-06,
"loss": 0.6364,
"step": 8850
},
{
"epoch": 1.3732176069435833,
"grad_norm": 10.059488296508789,
"learning_rate": 6.269373837569747e-06,
"loss": 0.6145,
"step": 8860
},
{
"epoch": 1.374767513949163,
"grad_norm": 6.285107612609863,
"learning_rate": 6.25387476751395e-06,
"loss": 0.571,
"step": 8870
},
{
"epoch": 1.3763174209547429,
"grad_norm": 9.381699562072754,
"learning_rate": 6.238375697458153e-06,
"loss": 0.5779,
"step": 8880
},
{
"epoch": 1.3778673279603224,
"grad_norm": 6.602116107940674,
"learning_rate": 6.222876627402357e-06,
"loss": 0.5559,
"step": 8890
},
{
"epoch": 1.379417234965902,
"grad_norm": 5.42208194732666,
"learning_rate": 6.20737755734656e-06,
"loss": 0.5229,
"step": 8900
},
{
"epoch": 1.3809671419714817,
"grad_norm": 5.746293067932129,
"learning_rate": 6.191878487290763e-06,
"loss": 0.4386,
"step": 8910
},
{
"epoch": 1.3825170489770615,
"grad_norm": 8.15822982788086,
"learning_rate": 6.176379417234967e-06,
"loss": 0.6447,
"step": 8920
},
{
"epoch": 1.384066955982641,
"grad_norm": 7.664370536804199,
"learning_rate": 6.16088034717917e-06,
"loss": 0.5102,
"step": 8930
},
{
"epoch": 1.3856168629882206,
"grad_norm": 8.099589347839355,
"learning_rate": 6.145381277123373e-06,
"loss": 0.4791,
"step": 8940
},
{
"epoch": 1.3871667699938004,
"grad_norm": 9.306650161743164,
"learning_rate": 6.129882207067577e-06,
"loss": 0.5595,
"step": 8950
},
{
"epoch": 1.3887166769993802,
"grad_norm": 8.718387603759766,
"learning_rate": 6.11438313701178e-06,
"loss": 0.3803,
"step": 8960
},
{
"epoch": 1.3902665840049597,
"grad_norm": 3.83396315574646,
"learning_rate": 6.098884066955983e-06,
"loss": 0.3988,
"step": 8970
},
{
"epoch": 1.3918164910105393,
"grad_norm": 8.484441757202148,
"learning_rate": 6.083384996900187e-06,
"loss": 0.5799,
"step": 8980
},
{
"epoch": 1.393366398016119,
"grad_norm": 5.386404037475586,
"learning_rate": 6.06788592684439e-06,
"loss": 0.3893,
"step": 8990
},
{
"epoch": 1.3949163050216988,
"grad_norm": 4.610734939575195,
"learning_rate": 6.052386856788593e-06,
"loss": 0.5469,
"step": 9000
},
{
"epoch": 1.3964662120272784,
"grad_norm": 11.3955717086792,
"learning_rate": 6.036887786732797e-06,
"loss": 0.4739,
"step": 9010
},
{
"epoch": 1.398016119032858,
"grad_norm": 9.982507705688477,
"learning_rate": 6.021388716677e-06,
"loss": 0.5295,
"step": 9020
},
{
"epoch": 1.3995660260384377,
"grad_norm": 6.788435459136963,
"learning_rate": 6.005889646621203e-06,
"loss": 0.5864,
"step": 9030
},
{
"epoch": 1.4011159330440175,
"grad_norm": 8.976727485656738,
"learning_rate": 5.990390576565407e-06,
"loss": 0.6665,
"step": 9040
},
{
"epoch": 1.402665840049597,
"grad_norm": 4.205535888671875,
"learning_rate": 5.97489150650961e-06,
"loss": 0.3614,
"step": 9050
},
{
"epoch": 1.4042157470551766,
"grad_norm": 6.974489212036133,
"learning_rate": 5.959392436453813e-06,
"loss": 0.5822,
"step": 9060
},
{
"epoch": 1.4057656540607564,
"grad_norm": 3.7475290298461914,
"learning_rate": 5.943893366398017e-06,
"loss": 0.2944,
"step": 9070
},
{
"epoch": 1.4073155610663362,
"grad_norm": 6.615023612976074,
"learning_rate": 5.92839429634222e-06,
"loss": 0.6612,
"step": 9080
},
{
"epoch": 1.4088654680719157,
"grad_norm": 3.359515905380249,
"learning_rate": 5.912895226286423e-06,
"loss": 0.6356,
"step": 9090
},
{
"epoch": 1.4104153750774953,
"grad_norm": 3.1997714042663574,
"learning_rate": 5.897396156230627e-06,
"loss": 0.6412,
"step": 9100
},
{
"epoch": 1.411965282083075,
"grad_norm": 8.04691219329834,
"learning_rate": 5.88189708617483e-06,
"loss": 0.4778,
"step": 9110
},
{
"epoch": 1.4135151890886548,
"grad_norm": 5.734005451202393,
"learning_rate": 5.866398016119033e-06,
"loss": 0.5637,
"step": 9120
},
{
"epoch": 1.4150650960942344,
"grad_norm": 8.435890197753906,
"learning_rate": 5.850898946063237e-06,
"loss": 0.6423,
"step": 9130
},
{
"epoch": 1.416615003099814,
"grad_norm": 9.176706314086914,
"learning_rate": 5.83539987600744e-06,
"loss": 0.4785,
"step": 9140
},
{
"epoch": 1.4181649101053937,
"grad_norm": 5.92742395401001,
"learning_rate": 5.8199008059516434e-06,
"loss": 0.5509,
"step": 9150
},
{
"epoch": 1.4197148171109735,
"grad_norm": 5.5970869064331055,
"learning_rate": 5.804401735895847e-06,
"loss": 0.7684,
"step": 9160
},
{
"epoch": 1.421264724116553,
"grad_norm": 7.216648578643799,
"learning_rate": 5.7889026658400504e-06,
"loss": 0.4177,
"step": 9170
},
{
"epoch": 1.4228146311221326,
"grad_norm": 4.122847557067871,
"learning_rate": 5.7734035957842535e-06,
"loss": 0.519,
"step": 9180
},
{
"epoch": 1.4243645381277124,
"grad_norm": 6.326296329498291,
"learning_rate": 5.757904525728457e-06,
"loss": 0.5741,
"step": 9190
},
{
"epoch": 1.4259144451332921,
"grad_norm": 3.2332475185394287,
"learning_rate": 5.7424054556726605e-06,
"loss": 0.5284,
"step": 9200
},
{
"epoch": 1.4274643521388717,
"grad_norm": 3.224771499633789,
"learning_rate": 5.7269063856168635e-06,
"loss": 0.5077,
"step": 9210
},
{
"epoch": 1.4290142591444512,
"grad_norm": 3.3739428520202637,
"learning_rate": 5.7114073155610675e-06,
"loss": 0.4111,
"step": 9220
},
{
"epoch": 1.430564166150031,
"grad_norm": 5.780101776123047,
"learning_rate": 5.6959082455052705e-06,
"loss": 0.4889,
"step": 9230
},
{
"epoch": 1.4321140731556108,
"grad_norm": 10.54281234741211,
"learning_rate": 5.680409175449474e-06,
"loss": 0.5395,
"step": 9240
},
{
"epoch": 1.4336639801611903,
"grad_norm": 4.650584697723389,
"learning_rate": 5.6649101053936775e-06,
"loss": 0.5086,
"step": 9250
},
{
"epoch": 1.43521388716677,
"grad_norm": 3.0759034156799316,
"learning_rate": 5.6494110353378806e-06,
"loss": 0.7147,
"step": 9260
},
{
"epoch": 1.4367637941723497,
"grad_norm": 5.136111259460449,
"learning_rate": 5.633911965282084e-06,
"loss": 0.6591,
"step": 9270
},
{
"epoch": 1.4383137011779294,
"grad_norm": 9.995561599731445,
"learning_rate": 5.6184128952262875e-06,
"loss": 0.5049,
"step": 9280
},
{
"epoch": 1.439863608183509,
"grad_norm": 5.644338130950928,
"learning_rate": 5.602913825170491e-06,
"loss": 0.4928,
"step": 9290
},
{
"epoch": 1.4414135151890886,
"grad_norm": 13.791778564453125,
"learning_rate": 5.587414755114694e-06,
"loss": 0.6969,
"step": 9300
},
{
"epoch": 1.4429634221946683,
"grad_norm": 3.6634018421173096,
"learning_rate": 5.571915685058898e-06,
"loss": 0.5763,
"step": 9310
},
{
"epoch": 1.444513329200248,
"grad_norm": 11.883929252624512,
"learning_rate": 5.556416615003101e-06,
"loss": 0.6579,
"step": 9320
},
{
"epoch": 1.4460632362058277,
"grad_norm": 10.741708755493164,
"learning_rate": 5.540917544947304e-06,
"loss": 0.6272,
"step": 9330
},
{
"epoch": 1.4476131432114072,
"grad_norm": 4.298150539398193,
"learning_rate": 5.525418474891508e-06,
"loss": 0.5472,
"step": 9340
},
{
"epoch": 1.449163050216987,
"grad_norm": 2.4050536155700684,
"learning_rate": 5.509919404835711e-06,
"loss": 0.6278,
"step": 9350
},
{
"epoch": 1.4507129572225668,
"grad_norm": 4.59657096862793,
"learning_rate": 5.494420334779914e-06,
"loss": 0.5389,
"step": 9360
},
{
"epoch": 1.4522628642281463,
"grad_norm": 2.8281540870666504,
"learning_rate": 5.478921264724118e-06,
"loss": 0.4964,
"step": 9370
},
{
"epoch": 1.4538127712337259,
"grad_norm": 8.060725212097168,
"learning_rate": 5.463422194668321e-06,
"loss": 0.5654,
"step": 9380
},
{
"epoch": 1.4553626782393057,
"grad_norm": 6.732635021209717,
"learning_rate": 5.447923124612524e-06,
"loss": 0.6084,
"step": 9390
},
{
"epoch": 1.4569125852448854,
"grad_norm": 17.33676528930664,
"learning_rate": 5.432424054556728e-06,
"loss": 0.4096,
"step": 9400
},
{
"epoch": 1.458462492250465,
"grad_norm": 8.642203330993652,
"learning_rate": 5.416924984500931e-06,
"loss": 0.6075,
"step": 9410
},
{
"epoch": 1.4600123992560445,
"grad_norm": 2.123789072036743,
"learning_rate": 5.401425914445134e-06,
"loss": 0.3668,
"step": 9420
},
{
"epoch": 1.4615623062616243,
"grad_norm": 7.111140251159668,
"learning_rate": 5.385926844389338e-06,
"loss": 0.5203,
"step": 9430
},
{
"epoch": 1.463112213267204,
"grad_norm": 8.846921920776367,
"learning_rate": 5.370427774333541e-06,
"loss": 0.428,
"step": 9440
},
{
"epoch": 1.4646621202727836,
"grad_norm": 4.978935718536377,
"learning_rate": 5.354928704277744e-06,
"loss": 0.5781,
"step": 9450
},
{
"epoch": 1.4662120272783632,
"grad_norm": 14.11387825012207,
"learning_rate": 5.339429634221948e-06,
"loss": 0.6948,
"step": 9460
},
{
"epoch": 1.467761934283943,
"grad_norm": 3.0608415603637695,
"learning_rate": 5.323930564166151e-06,
"loss": 0.535,
"step": 9470
},
{
"epoch": 1.4693118412895227,
"grad_norm": 7.584042072296143,
"learning_rate": 5.308431494110354e-06,
"loss": 0.4748,
"step": 9480
},
{
"epoch": 1.4708617482951023,
"grad_norm": 6.784172534942627,
"learning_rate": 5.292932424054558e-06,
"loss": 0.7326,
"step": 9490
},
{
"epoch": 1.4724116553006819,
"grad_norm": 9.559980392456055,
"learning_rate": 5.277433353998761e-06,
"loss": 0.4411,
"step": 9500
},
{
"epoch": 1.4739615623062616,
"grad_norm": 6.2408127784729,
"learning_rate": 5.261934283942964e-06,
"loss": 0.5753,
"step": 9510
},
{
"epoch": 1.4755114693118414,
"grad_norm": 4.772825241088867,
"learning_rate": 5.246435213887168e-06,
"loss": 0.5637,
"step": 9520
},
{
"epoch": 1.477061376317421,
"grad_norm": 4.578171730041504,
"learning_rate": 5.230936143831371e-06,
"loss": 0.5267,
"step": 9530
},
{
"epoch": 1.4786112833230005,
"grad_norm": 3.323122024536133,
"learning_rate": 5.215437073775574e-06,
"loss": 0.6315,
"step": 9540
},
{
"epoch": 1.4801611903285803,
"grad_norm": 4.294838905334473,
"learning_rate": 5.199938003719778e-06,
"loss": 0.6561,
"step": 9550
},
{
"epoch": 1.48171109733416,
"grad_norm": 5.863995552062988,
"learning_rate": 5.184438933663981e-06,
"loss": 0.437,
"step": 9560
},
{
"epoch": 1.4832610043397396,
"grad_norm": 8.996912002563477,
"learning_rate": 5.168939863608184e-06,
"loss": 0.5727,
"step": 9570
},
{
"epoch": 1.4848109113453192,
"grad_norm": 6.797782897949219,
"learning_rate": 5.153440793552388e-06,
"loss": 0.616,
"step": 9580
},
{
"epoch": 1.486360818350899,
"grad_norm": 3.5306284427642822,
"learning_rate": 5.137941723496591e-06,
"loss": 0.5054,
"step": 9590
},
{
"epoch": 1.4879107253564787,
"grad_norm": 0.761473536491394,
"learning_rate": 5.122442653440794e-06,
"loss": 0.5933,
"step": 9600
},
{
"epoch": 1.4894606323620583,
"grad_norm": 2.775047779083252,
"learning_rate": 5.106943583384998e-06,
"loss": 0.5478,
"step": 9610
},
{
"epoch": 1.4910105393676378,
"grad_norm": 5.2198028564453125,
"learning_rate": 5.091444513329201e-06,
"loss": 0.6019,
"step": 9620
},
{
"epoch": 1.4925604463732176,
"grad_norm": 5.083917140960693,
"learning_rate": 5.075945443273404e-06,
"loss": 0.3794,
"step": 9630
},
{
"epoch": 1.4941103533787974,
"grad_norm": 8.67032527923584,
"learning_rate": 5.060446373217608e-06,
"loss": 0.6841,
"step": 9640
},
{
"epoch": 1.495660260384377,
"grad_norm": 4.733889102935791,
"learning_rate": 5.044947303161811e-06,
"loss": 0.6018,
"step": 9650
},
{
"epoch": 1.4972101673899565,
"grad_norm": 9.206461906433105,
"learning_rate": 5.029448233106014e-06,
"loss": 0.5624,
"step": 9660
},
{
"epoch": 1.4987600743955363,
"grad_norm": 4.749617099761963,
"learning_rate": 5.013949163050218e-06,
"loss": 0.5591,
"step": 9670
},
{
"epoch": 1.500309981401116,
"grad_norm": 7.325601100921631,
"learning_rate": 4.99845009299442e-06,
"loss": 0.5467,
"step": 9680
},
{
"epoch": 1.5018598884066956,
"grad_norm": 7.182321071624756,
"learning_rate": 4.982951022938624e-06,
"loss": 0.769,
"step": 9690
},
{
"epoch": 1.5034097954122752,
"grad_norm": 7.016432285308838,
"learning_rate": 4.967451952882827e-06,
"loss": 0.7083,
"step": 9700
},
{
"epoch": 1.504959702417855,
"grad_norm": 1.7481181621551514,
"learning_rate": 4.9519528828270304e-06,
"loss": 0.4016,
"step": 9710
},
{
"epoch": 1.5065096094234347,
"grad_norm": 6.876219272613525,
"learning_rate": 4.936453812771234e-06,
"loss": 0.3005,
"step": 9720
},
{
"epoch": 1.5080595164290143,
"grad_norm": 17.568605422973633,
"learning_rate": 4.920954742715437e-06,
"loss": 0.483,
"step": 9730
},
{
"epoch": 1.5096094234345938,
"grad_norm": 7.156910419464111,
"learning_rate": 4.9054556726596405e-06,
"loss": 0.5408,
"step": 9740
},
{
"epoch": 1.5111593304401736,
"grad_norm": 2.5366811752319336,
"learning_rate": 4.889956602603844e-06,
"loss": 0.7988,
"step": 9750
},
{
"epoch": 1.5127092374457534,
"grad_norm": 10.356922149658203,
"learning_rate": 4.8744575325480475e-06,
"loss": 0.6228,
"step": 9760
},
{
"epoch": 1.514259144451333,
"grad_norm": 11.088748931884766,
"learning_rate": 4.8589584624922505e-06,
"loss": 0.639,
"step": 9770
},
{
"epoch": 1.5158090514569125,
"grad_norm": 10.523143768310547,
"learning_rate": 4.8434593924364544e-06,
"loss": 0.8511,
"step": 9780
},
{
"epoch": 1.5173589584624922,
"grad_norm": 6.977451324462891,
"learning_rate": 4.8279603223806575e-06,
"loss": 0.5901,
"step": 9790
},
{
"epoch": 1.518908865468072,
"grad_norm": 2.387295961380005,
"learning_rate": 4.8124612523248606e-06,
"loss": 0.5298,
"step": 9800
},
{
"epoch": 1.5204587724736516,
"grad_norm": 11.128771781921387,
"learning_rate": 4.7969621822690645e-06,
"loss": 0.4541,
"step": 9810
},
{
"epoch": 1.5220086794792311,
"grad_norm": 11.29007625579834,
"learning_rate": 4.7814631122132675e-06,
"loss": 0.564,
"step": 9820
},
{
"epoch": 1.523558586484811,
"grad_norm": 3.435899019241333,
"learning_rate": 4.765964042157471e-06,
"loss": 0.5353,
"step": 9830
},
{
"epoch": 1.5251084934903907,
"grad_norm": 6.636521816253662,
"learning_rate": 4.7504649721016745e-06,
"loss": 0.6093,
"step": 9840
},
{
"epoch": 1.5266584004959702,
"grad_norm": 7.338555335998535,
"learning_rate": 4.734965902045878e-06,
"loss": 0.4354,
"step": 9850
},
{
"epoch": 1.5282083075015498,
"grad_norm": 12.102242469787598,
"learning_rate": 4.719466831990081e-06,
"loss": 0.6617,
"step": 9860
},
{
"epoch": 1.5297582145071296,
"grad_norm": 6.004841327667236,
"learning_rate": 4.7039677619342846e-06,
"loss": 0.6266,
"step": 9870
},
{
"epoch": 1.5313081215127093,
"grad_norm": 9.303327560424805,
"learning_rate": 4.688468691878488e-06,
"loss": 0.5878,
"step": 9880
},
{
"epoch": 1.532858028518289,
"grad_norm": 5.710174083709717,
"learning_rate": 4.672969621822691e-06,
"loss": 0.4985,
"step": 9890
},
{
"epoch": 1.5344079355238684,
"grad_norm": 6.517455101013184,
"learning_rate": 4.657470551766895e-06,
"loss": 0.5533,
"step": 9900
},
{
"epoch": 1.5359578425294482,
"grad_norm": 8.90954875946045,
"learning_rate": 4.641971481711098e-06,
"loss": 0.7229,
"step": 9910
},
{
"epoch": 1.537507749535028,
"grad_norm": 5.908111572265625,
"learning_rate": 4.626472411655301e-06,
"loss": 0.5185,
"step": 9920
},
{
"epoch": 1.5390576565406076,
"grad_norm": 10.088567733764648,
"learning_rate": 4.610973341599505e-06,
"loss": 0.5258,
"step": 9930
},
{
"epoch": 1.540607563546187,
"grad_norm": 6.859228610992432,
"learning_rate": 4.595474271543708e-06,
"loss": 0.4885,
"step": 9940
},
{
"epoch": 1.5421574705517669,
"grad_norm": 7.489620208740234,
"learning_rate": 4.579975201487911e-06,
"loss": 0.453,
"step": 9950
},
{
"epoch": 1.5437073775573467,
"grad_norm": 8.25145435333252,
"learning_rate": 4.564476131432115e-06,
"loss": 0.5938,
"step": 9960
},
{
"epoch": 1.5452572845629262,
"grad_norm": 5.98444938659668,
"learning_rate": 4.548977061376318e-06,
"loss": 0.5637,
"step": 9970
},
{
"epoch": 1.5468071915685058,
"grad_norm": 4.948822498321533,
"learning_rate": 4.533477991320521e-06,
"loss": 0.4356,
"step": 9980
},
{
"epoch": 1.5483570985740855,
"grad_norm": 4.464200973510742,
"learning_rate": 4.517978921264725e-06,
"loss": 0.5302,
"step": 9990
},
{
"epoch": 1.5499070055796653,
"grad_norm": 9.048888206481934,
"learning_rate": 4.502479851208928e-06,
"loss": 0.5684,
"step": 10000
},
{
"epoch": 1.5514569125852449,
"grad_norm": 4.196654319763184,
"learning_rate": 4.486980781153131e-06,
"loss": 0.5561,
"step": 10010
},
{
"epoch": 1.5530068195908244,
"grad_norm": 5.6792144775390625,
"learning_rate": 4.471481711097335e-06,
"loss": 0.6369,
"step": 10020
},
{
"epoch": 1.5545567265964042,
"grad_norm": 4.345642566680908,
"learning_rate": 4.455982641041538e-06,
"loss": 0.4938,
"step": 10030
},
{
"epoch": 1.556106633601984,
"grad_norm": 5.05332612991333,
"learning_rate": 4.440483570985741e-06,
"loss": 0.4505,
"step": 10040
},
{
"epoch": 1.5576565406075635,
"grad_norm": 7.0360870361328125,
"learning_rate": 4.424984500929945e-06,
"loss": 0.4599,
"step": 10050
},
{
"epoch": 1.559206447613143,
"grad_norm": 11.911025047302246,
"learning_rate": 4.409485430874148e-06,
"loss": 0.5653,
"step": 10060
},
{
"epoch": 1.5607563546187229,
"grad_norm": 9.599674224853516,
"learning_rate": 4.393986360818351e-06,
"loss": 0.5005,
"step": 10070
},
{
"epoch": 1.5623062616243026,
"grad_norm": 6.970069885253906,
"learning_rate": 4.378487290762555e-06,
"loss": 0.7005,
"step": 10080
},
{
"epoch": 1.5638561686298822,
"grad_norm": 5.077493667602539,
"learning_rate": 4.362988220706758e-06,
"loss": 0.5908,
"step": 10090
},
{
"epoch": 1.5654060756354617,
"grad_norm": 10.840348243713379,
"learning_rate": 4.347489150650961e-06,
"loss": 0.394,
"step": 10100
},
{
"epoch": 1.5669559826410415,
"grad_norm": 4.2803120613098145,
"learning_rate": 4.331990080595165e-06,
"loss": 0.7187,
"step": 10110
},
{
"epoch": 1.5685058896466213,
"grad_norm": 5.350848197937012,
"learning_rate": 4.316491010539368e-06,
"loss": 0.622,
"step": 10120
},
{
"epoch": 1.5700557966522009,
"grad_norm": 4.250338554382324,
"learning_rate": 4.300991940483571e-06,
"loss": 0.4338,
"step": 10130
},
{
"epoch": 1.5716057036577804,
"grad_norm": 10.858535766601562,
"learning_rate": 4.285492870427775e-06,
"loss": 0.7283,
"step": 10140
},
{
"epoch": 1.5731556106633602,
"grad_norm": 6.90775728225708,
"learning_rate": 4.269993800371978e-06,
"loss": 0.4537,
"step": 10150
},
{
"epoch": 1.57470551766894,
"grad_norm": 3.502438545227051,
"learning_rate": 4.254494730316181e-06,
"loss": 0.4618,
"step": 10160
},
{
"epoch": 1.5762554246745195,
"grad_norm": 7.183608531951904,
"learning_rate": 4.238995660260385e-06,
"loss": 0.5586,
"step": 10170
},
{
"epoch": 1.577805331680099,
"grad_norm": 3.889638662338257,
"learning_rate": 4.223496590204588e-06,
"loss": 0.4065,
"step": 10180
},
{
"epoch": 1.5793552386856788,
"grad_norm": 6.343893527984619,
"learning_rate": 4.207997520148791e-06,
"loss": 0.4437,
"step": 10190
},
{
"epoch": 1.5809051456912586,
"grad_norm": 18.638702392578125,
"learning_rate": 4.192498450092995e-06,
"loss": 0.6633,
"step": 10200
},
{
"epoch": 1.5824550526968382,
"grad_norm": 11.82913875579834,
"learning_rate": 4.176999380037198e-06,
"loss": 0.8167,
"step": 10210
},
{
"epoch": 1.5840049597024177,
"grad_norm": 4.050469875335693,
"learning_rate": 4.161500309981401e-06,
"loss": 0.6837,
"step": 10220
},
{
"epoch": 1.5855548667079975,
"grad_norm": 13.781719207763672,
"learning_rate": 4.146001239925605e-06,
"loss": 0.6684,
"step": 10230
},
{
"epoch": 1.5871047737135773,
"grad_norm": 3.5966341495513916,
"learning_rate": 4.130502169869808e-06,
"loss": 0.5273,
"step": 10240
},
{
"epoch": 1.5886546807191568,
"grad_norm": 6.0468926429748535,
"learning_rate": 4.115003099814012e-06,
"loss": 0.4655,
"step": 10250
},
{
"epoch": 1.5902045877247364,
"grad_norm": 9.262929916381836,
"learning_rate": 4.099504029758215e-06,
"loss": 0.5839,
"step": 10260
},
{
"epoch": 1.5917544947303162,
"grad_norm": 8.276415824890137,
"learning_rate": 4.084004959702418e-06,
"loss": 0.5158,
"step": 10270
},
{
"epoch": 1.593304401735896,
"grad_norm": 7.415068626403809,
"learning_rate": 4.068505889646622e-06,
"loss": 0.4451,
"step": 10280
},
{
"epoch": 1.5948543087414755,
"grad_norm": 6.222273826599121,
"learning_rate": 4.053006819590825e-06,
"loss": 0.5222,
"step": 10290
},
{
"epoch": 1.596404215747055,
"grad_norm": 7.769200801849365,
"learning_rate": 4.037507749535028e-06,
"loss": 0.585,
"step": 10300
},
{
"epoch": 1.5979541227526348,
"grad_norm": 8.023396492004395,
"learning_rate": 4.022008679479232e-06,
"loss": 0.4002,
"step": 10310
},
{
"epoch": 1.5995040297582146,
"grad_norm": 10.689138412475586,
"learning_rate": 4.006509609423435e-06,
"loss": 0.7155,
"step": 10320
},
{
"epoch": 1.6010539367637941,
"grad_norm": 7.788012504577637,
"learning_rate": 3.991010539367638e-06,
"loss": 0.7031,
"step": 10330
},
{
"epoch": 1.6026038437693737,
"grad_norm": 6.007388114929199,
"learning_rate": 3.975511469311842e-06,
"loss": 0.4373,
"step": 10340
},
{
"epoch": 1.6041537507749535,
"grad_norm": 10.727594375610352,
"learning_rate": 3.960012399256045e-06,
"loss": 0.6134,
"step": 10350
},
{
"epoch": 1.6057036577805333,
"grad_norm": 4.113864898681641,
"learning_rate": 3.944513329200248e-06,
"loss": 0.5774,
"step": 10360
},
{
"epoch": 1.6072535647861128,
"grad_norm": 6.425714015960693,
"learning_rate": 3.929014259144452e-06,
"loss": 0.6391,
"step": 10370
},
{
"epoch": 1.6088034717916924,
"grad_norm": 4.605624675750732,
"learning_rate": 3.913515189088655e-06,
"loss": 0.5495,
"step": 10380
},
{
"epoch": 1.6103533787972721,
"grad_norm": 3.2867257595062256,
"learning_rate": 3.8980161190328584e-06,
"loss": 0.5751,
"step": 10390
},
{
"epoch": 1.611903285802852,
"grad_norm": 5.345250129699707,
"learning_rate": 3.882517048977062e-06,
"loss": 0.5068,
"step": 10400
},
{
"epoch": 1.6134531928084315,
"grad_norm": 3.655217409133911,
"learning_rate": 3.867017978921265e-06,
"loss": 0.679,
"step": 10410
},
{
"epoch": 1.615003099814011,
"grad_norm": 8.845053672790527,
"learning_rate": 3.8515189088654685e-06,
"loss": 0.5818,
"step": 10420
},
{
"epoch": 1.6165530068195908,
"grad_norm": 10.273194313049316,
"learning_rate": 3.836019838809672e-06,
"loss": 0.6227,
"step": 10430
},
{
"epoch": 1.6181029138251706,
"grad_norm": 2.203364849090576,
"learning_rate": 3.8205207687538755e-06,
"loss": 0.4314,
"step": 10440
},
{
"epoch": 1.6196528208307501,
"grad_norm": 5.975243091583252,
"learning_rate": 3.8050216986980785e-06,
"loss": 0.3087,
"step": 10450
},
{
"epoch": 1.6212027278363297,
"grad_norm": 4.337436676025391,
"learning_rate": 3.789522628642282e-06,
"loss": 0.6271,
"step": 10460
},
{
"epoch": 1.6227526348419095,
"grad_norm": 6.358315467834473,
"learning_rate": 3.7740235585864855e-06,
"loss": 0.6318,
"step": 10470
},
{
"epoch": 1.6243025418474892,
"grad_norm": 12.044097900390625,
"learning_rate": 3.7585244885306886e-06,
"loss": 0.3999,
"step": 10480
},
{
"epoch": 1.6258524488530688,
"grad_norm": 8.828185081481934,
"learning_rate": 3.7430254184748916e-06,
"loss": 0.5629,
"step": 10490
},
{
"epoch": 1.6274023558586483,
"grad_norm": 6.134864330291748,
"learning_rate": 3.7275263484190947e-06,
"loss": 0.5611,
"step": 10500
},
{
"epoch": 1.6289522628642281,
"grad_norm": 11.847331047058105,
"learning_rate": 3.712027278363298e-06,
"loss": 0.7173,
"step": 10510
},
{
"epoch": 1.630502169869808,
"grad_norm": 4.5125226974487305,
"learning_rate": 3.6965282083075017e-06,
"loss": 0.3994,
"step": 10520
},
{
"epoch": 1.6320520768753874,
"grad_norm": 12.239794731140137,
"learning_rate": 3.6810291382517048e-06,
"loss": 0.5533,
"step": 10530
},
{
"epoch": 1.633601983880967,
"grad_norm": 3.501194953918457,
"learning_rate": 3.6655300681959082e-06,
"loss": 0.6103,
"step": 10540
},
{
"epoch": 1.6351518908865468,
"grad_norm": 9.775247573852539,
"learning_rate": 3.6500309981401117e-06,
"loss": 0.8142,
"step": 10550
},
{
"epoch": 1.6367017978921266,
"grad_norm": 6.553702354431152,
"learning_rate": 3.634531928084315e-06,
"loss": 0.5786,
"step": 10560
},
{
"epoch": 1.638251704897706,
"grad_norm": 3.682405471801758,
"learning_rate": 3.6190328580285183e-06,
"loss": 0.4615,
"step": 10570
},
{
"epoch": 1.6398016119032857,
"grad_norm": 3.184781789779663,
"learning_rate": 3.6035337879727218e-06,
"loss": 0.4539,
"step": 10580
},
{
"epoch": 1.6413515189088654,
"grad_norm": 3.740588426589966,
"learning_rate": 3.588034717916925e-06,
"loss": 0.493,
"step": 10590
},
{
"epoch": 1.6429014259144452,
"grad_norm": 8.489069938659668,
"learning_rate": 3.5725356478611283e-06,
"loss": 0.4983,
"step": 10600
},
{
"epoch": 1.6444513329200248,
"grad_norm": 6.311090469360352,
"learning_rate": 3.557036577805332e-06,
"loss": 0.4644,
"step": 10610
},
{
"epoch": 1.6460012399256043,
"grad_norm": 14.392030715942383,
"learning_rate": 3.541537507749535e-06,
"loss": 0.503,
"step": 10620
},
{
"epoch": 1.647551146931184,
"grad_norm": 5.263167858123779,
"learning_rate": 3.5260384376937384e-06,
"loss": 0.4371,
"step": 10630
},
{
"epoch": 1.6491010539367639,
"grad_norm": 5.845542907714844,
"learning_rate": 3.510539367637942e-06,
"loss": 0.571,
"step": 10640
},
{
"epoch": 1.6506509609423434,
"grad_norm": 4.867492198944092,
"learning_rate": 3.495040297582145e-06,
"loss": 0.5561,
"step": 10650
},
{
"epoch": 1.652200867947923,
"grad_norm": 4.042713165283203,
"learning_rate": 3.4795412275263484e-06,
"loss": 0.3367,
"step": 10660
},
{
"epoch": 1.6537507749535028,
"grad_norm": 7.001228332519531,
"learning_rate": 3.464042157470552e-06,
"loss": 0.4961,
"step": 10670
},
{
"epoch": 1.6553006819590825,
"grad_norm": 5.529093265533447,
"learning_rate": 3.448543087414755e-06,
"loss": 0.536,
"step": 10680
},
{
"epoch": 1.656850588964662,
"grad_norm": 1.5025368928909302,
"learning_rate": 3.4330440173589585e-06,
"loss": 0.537,
"step": 10690
},
{
"epoch": 1.6584004959702416,
"grad_norm": 10.480669975280762,
"learning_rate": 3.417544947303162e-06,
"loss": 0.6538,
"step": 10700
},
{
"epoch": 1.6599504029758214,
"grad_norm": 6.886585712432861,
"learning_rate": 3.402045877247365e-06,
"loss": 0.6508,
"step": 10710
},
{
"epoch": 1.6615003099814012,
"grad_norm": 7.004822254180908,
"learning_rate": 3.3865468071915685e-06,
"loss": 0.5135,
"step": 10720
},
{
"epoch": 1.6630502169869807,
"grad_norm": 6.180261135101318,
"learning_rate": 3.371047737135772e-06,
"loss": 0.4208,
"step": 10730
},
{
"epoch": 1.6646001239925603,
"grad_norm": 12.792762756347656,
"learning_rate": 3.355548667079975e-06,
"loss": 0.5253,
"step": 10740
},
{
"epoch": 1.66615003099814,
"grad_norm": 8.852483749389648,
"learning_rate": 3.3400495970241786e-06,
"loss": 0.5145,
"step": 10750
},
{
"epoch": 1.6676999380037199,
"grad_norm": 7.686612129211426,
"learning_rate": 3.324550526968382e-06,
"loss": 0.4965,
"step": 10760
},
{
"epoch": 1.6692498450092994,
"grad_norm": 5.723041534423828,
"learning_rate": 3.309051456912585e-06,
"loss": 0.5359,
"step": 10770
},
{
"epoch": 1.670799752014879,
"grad_norm": 11.530805587768555,
"learning_rate": 3.2935523868567886e-06,
"loss": 0.5656,
"step": 10780
},
{
"epoch": 1.6723496590204587,
"grad_norm": 8.491327285766602,
"learning_rate": 3.278053316800992e-06,
"loss": 0.5958,
"step": 10790
},
{
"epoch": 1.6738995660260385,
"grad_norm": 4.736944198608398,
"learning_rate": 3.262554246745195e-06,
"loss": 0.5544,
"step": 10800
},
{
"epoch": 1.675449473031618,
"grad_norm": 11.811408042907715,
"learning_rate": 3.2470551766893987e-06,
"loss": 0.6796,
"step": 10810
},
{
"epoch": 1.6769993800371976,
"grad_norm": 6.621671199798584,
"learning_rate": 3.231556106633602e-06,
"loss": 0.664,
"step": 10820
},
{
"epoch": 1.6785492870427774,
"grad_norm": 5.999893665313721,
"learning_rate": 3.2160570365778052e-06,
"loss": 0.5491,
"step": 10830
},
{
"epoch": 1.6800991940483572,
"grad_norm": 7.547499179840088,
"learning_rate": 3.2005579665220087e-06,
"loss": 0.6801,
"step": 10840
},
{
"epoch": 1.6816491010539367,
"grad_norm": 3.695556402206421,
"learning_rate": 3.185058896466212e-06,
"loss": 0.6289,
"step": 10850
},
{
"epoch": 1.6831990080595163,
"grad_norm": 5.461249351501465,
"learning_rate": 3.1695598264104153e-06,
"loss": 0.5556,
"step": 10860
},
{
"epoch": 1.684748915065096,
"grad_norm": 1.9557623863220215,
"learning_rate": 3.1540607563546188e-06,
"loss": 0.3568,
"step": 10870
},
{
"epoch": 1.6862988220706758,
"grad_norm": 10.149141311645508,
"learning_rate": 3.1385616862988223e-06,
"loss": 0.6667,
"step": 10880
},
{
"epoch": 1.6878487290762554,
"grad_norm": 4.037074565887451,
"learning_rate": 3.1230626162430257e-06,
"loss": 0.4963,
"step": 10890
},
{
"epoch": 1.689398636081835,
"grad_norm": 3.6165964603424072,
"learning_rate": 3.107563546187229e-06,
"loss": 0.2717,
"step": 10900
},
{
"epoch": 1.6909485430874147,
"grad_norm": 10.908464431762695,
"learning_rate": 3.0920644761314323e-06,
"loss": 0.5929,
"step": 10910
},
{
"epoch": 1.6924984500929945,
"grad_norm": 7.784704208374023,
"learning_rate": 3.076565406075636e-06,
"loss": 0.4904,
"step": 10920
},
{
"epoch": 1.694048357098574,
"grad_norm": 3.434318780899048,
"learning_rate": 3.061066336019839e-06,
"loss": 0.4701,
"step": 10930
},
{
"epoch": 1.6955982641041536,
"grad_norm": 6.939949989318848,
"learning_rate": 3.0455672659640423e-06,
"loss": 0.5454,
"step": 10940
},
{
"epoch": 1.6971481711097334,
"grad_norm": 6.944652080535889,
"learning_rate": 3.030068195908246e-06,
"loss": 0.4567,
"step": 10950
},
{
"epoch": 1.6986980781153131,
"grad_norm": 6.0426788330078125,
"learning_rate": 3.014569125852449e-06,
"loss": 0.6604,
"step": 10960
},
{
"epoch": 1.7002479851208927,
"grad_norm": 3.7648766040802,
"learning_rate": 2.9990700557966524e-06,
"loss": 0.5468,
"step": 10970
},
{
"epoch": 1.7017978921264723,
"grad_norm": 5.345854759216309,
"learning_rate": 2.983570985740856e-06,
"loss": 0.6152,
"step": 10980
},
{
"epoch": 1.703347799132052,
"grad_norm": 5.971490383148193,
"learning_rate": 2.968071915685059e-06,
"loss": 0.7541,
"step": 10990
},
{
"epoch": 1.7048977061376318,
"grad_norm": 8.548897743225098,
"learning_rate": 2.9525728456292624e-06,
"loss": 0.5634,
"step": 11000
},
{
"epoch": 1.7064476131432114,
"grad_norm": 1.5280627012252808,
"learning_rate": 2.937073775573466e-06,
"loss": 0.4575,
"step": 11010
},
{
"epoch": 1.707997520148791,
"grad_norm": 10.257830619812012,
"learning_rate": 2.921574705517669e-06,
"loss": 0.444,
"step": 11020
},
{
"epoch": 1.7095474271543707,
"grad_norm": 3.970982313156128,
"learning_rate": 2.9060756354618725e-06,
"loss": 0.4747,
"step": 11030
},
{
"epoch": 1.7110973341599505,
"grad_norm": 4.753783702850342,
"learning_rate": 2.890576565406076e-06,
"loss": 0.6571,
"step": 11040
},
{
"epoch": 1.71264724116553,
"grad_norm": 8.187065124511719,
"learning_rate": 2.875077495350279e-06,
"loss": 0.5535,
"step": 11050
},
{
"epoch": 1.7141971481711096,
"grad_norm": 3.5233545303344727,
"learning_rate": 2.8595784252944825e-06,
"loss": 0.366,
"step": 11060
},
{
"epoch": 1.7157470551766894,
"grad_norm": 13.607765197753906,
"learning_rate": 2.844079355238686e-06,
"loss": 0.4742,
"step": 11070
},
{
"epoch": 1.7172969621822691,
"grad_norm": 14.473544120788574,
"learning_rate": 2.828580285182889e-06,
"loss": 0.3733,
"step": 11080
},
{
"epoch": 1.7188468691878487,
"grad_norm": 3.89443302154541,
"learning_rate": 2.8130812151270926e-06,
"loss": 0.7239,
"step": 11090
},
{
"epoch": 1.7203967761934282,
"grad_norm": 13.638595581054688,
"learning_rate": 2.797582145071296e-06,
"loss": 0.6501,
"step": 11100
},
{
"epoch": 1.721946683199008,
"grad_norm": 2.860119581222534,
"learning_rate": 2.782083075015499e-06,
"loss": 0.5003,
"step": 11110
},
{
"epoch": 1.7234965902045878,
"grad_norm": 5.019274711608887,
"learning_rate": 2.7665840049597026e-06,
"loss": 0.4175,
"step": 11120
},
{
"epoch": 1.7250464972101673,
"grad_norm": 10.342511177062988,
"learning_rate": 2.751084934903906e-06,
"loss": 0.6349,
"step": 11130
},
{
"epoch": 1.726596404215747,
"grad_norm": 11.173345565795898,
"learning_rate": 2.735585864848109e-06,
"loss": 0.6483,
"step": 11140
},
{
"epoch": 1.7281463112213267,
"grad_norm": 8.214896202087402,
"learning_rate": 2.7200867947923127e-06,
"loss": 0.5219,
"step": 11150
},
{
"epoch": 1.7296962182269064,
"grad_norm": 5.524158477783203,
"learning_rate": 2.704587724736516e-06,
"loss": 0.5078,
"step": 11160
},
{
"epoch": 1.731246125232486,
"grad_norm": 2.100583791732788,
"learning_rate": 2.6890886546807192e-06,
"loss": 0.7127,
"step": 11170
},
{
"epoch": 1.7327960322380656,
"grad_norm": 7.552937030792236,
"learning_rate": 2.6735895846249227e-06,
"loss": 0.3605,
"step": 11180
},
{
"epoch": 1.7343459392436453,
"grad_norm": 8.49806022644043,
"learning_rate": 2.6580905145691262e-06,
"loss": 0.4601,
"step": 11190
},
{
"epoch": 1.735895846249225,
"grad_norm": 7.082111835479736,
"learning_rate": 2.6425914445133293e-06,
"loss": 0.7208,
"step": 11200
},
{
"epoch": 1.7374457532548047,
"grad_norm": 3.600315809249878,
"learning_rate": 2.6270923744575328e-06,
"loss": 0.4493,
"step": 11210
},
{
"epoch": 1.7389956602603842,
"grad_norm": 1.5405011177062988,
"learning_rate": 2.6115933044017363e-06,
"loss": 0.4534,
"step": 11220
},
{
"epoch": 1.740545567265964,
"grad_norm": 9.279645919799805,
"learning_rate": 2.5960942343459393e-06,
"loss": 0.7194,
"step": 11230
},
{
"epoch": 1.7420954742715438,
"grad_norm": 5.783839702606201,
"learning_rate": 2.580595164290143e-06,
"loss": 0.5796,
"step": 11240
},
{
"epoch": 1.7436453812771233,
"grad_norm": 9.030817985534668,
"learning_rate": 2.5650960942343463e-06,
"loss": 0.5363,
"step": 11250
},
{
"epoch": 1.7451952882827029,
"grad_norm": 5.028315544128418,
"learning_rate": 2.5495970241785494e-06,
"loss": 0.4579,
"step": 11260
},
{
"epoch": 1.7467451952882826,
"grad_norm": 3.2063095569610596,
"learning_rate": 2.534097954122753e-06,
"loss": 0.3426,
"step": 11270
},
{
"epoch": 1.7482951022938624,
"grad_norm": 8.42487621307373,
"learning_rate": 2.5185988840669564e-06,
"loss": 0.4996,
"step": 11280
},
{
"epoch": 1.749845009299442,
"grad_norm": 5.428744792938232,
"learning_rate": 2.5030998140111594e-06,
"loss": 0.5806,
"step": 11290
},
{
"epoch": 1.7513949163050218,
"grad_norm": 14.892745971679688,
"learning_rate": 2.487600743955363e-06,
"loss": 0.5464,
"step": 11300
},
{
"epoch": 1.7529448233106013,
"grad_norm": 6.309720039367676,
"learning_rate": 2.4721016738995664e-06,
"loss": 0.5558,
"step": 11310
},
{
"epoch": 1.754494730316181,
"grad_norm": 4.976221084594727,
"learning_rate": 2.4566026038437695e-06,
"loss": 0.4486,
"step": 11320
},
{
"epoch": 1.7560446373217609,
"grad_norm": 5.923755645751953,
"learning_rate": 2.441103533787973e-06,
"loss": 0.4702,
"step": 11330
},
{
"epoch": 1.7575945443273404,
"grad_norm": 9.575067520141602,
"learning_rate": 2.4256044637321764e-06,
"loss": 0.5632,
"step": 11340
},
{
"epoch": 1.75914445133292,
"grad_norm": 12.65174388885498,
"learning_rate": 2.4101053936763795e-06,
"loss": 0.4884,
"step": 11350
},
{
"epoch": 1.7606943583384997,
"grad_norm": 10.048745155334473,
"learning_rate": 2.394606323620583e-06,
"loss": 0.4622,
"step": 11360
},
{
"epoch": 1.7622442653440795,
"grad_norm": 10.388143539428711,
"learning_rate": 2.3791072535647865e-06,
"loss": 0.6734,
"step": 11370
},
{
"epoch": 1.763794172349659,
"grad_norm": 7.223123073577881,
"learning_rate": 2.3636081835089896e-06,
"loss": 0.5858,
"step": 11380
},
{
"epoch": 1.7653440793552386,
"grad_norm": 14.674286842346191,
"learning_rate": 2.348109113453193e-06,
"loss": 0.589,
"step": 11390
},
{
"epoch": 1.7668939863608184,
"grad_norm": 10.186848640441895,
"learning_rate": 2.3326100433973965e-06,
"loss": 0.5389,
"step": 11400
},
{
"epoch": 1.7684438933663982,
"grad_norm": 2.5273337364196777,
"learning_rate": 2.3171109733415996e-06,
"loss": 0.4664,
"step": 11410
},
{
"epoch": 1.7699938003719777,
"grad_norm": 7.200957775115967,
"learning_rate": 2.301611903285803e-06,
"loss": 0.496,
"step": 11420
},
{
"epoch": 1.7715437073775573,
"grad_norm": 5.758810997009277,
"learning_rate": 2.2861128332300066e-06,
"loss": 0.5685,
"step": 11430
},
{
"epoch": 1.773093614383137,
"grad_norm": 4.028495788574219,
"learning_rate": 2.2706137631742097e-06,
"loss": 0.5332,
"step": 11440
},
{
"epoch": 1.7746435213887168,
"grad_norm": 6.298703193664551,
"learning_rate": 2.255114693118413e-06,
"loss": 0.5661,
"step": 11450
},
{
"epoch": 1.7761934283942964,
"grad_norm": 6.164602756500244,
"learning_rate": 2.2396156230626166e-06,
"loss": 0.519,
"step": 11460
},
{
"epoch": 1.777743335399876,
"grad_norm": 5.306787490844727,
"learning_rate": 2.2241165530068197e-06,
"loss": 0.6083,
"step": 11470
},
{
"epoch": 1.7792932424054557,
"grad_norm": 9.495424270629883,
"learning_rate": 2.208617482951023e-06,
"loss": 0.4819,
"step": 11480
},
{
"epoch": 1.7808431494110355,
"grad_norm": 2.389009475708008,
"learning_rate": 2.1931184128952267e-06,
"loss": 0.387,
"step": 11490
},
{
"epoch": 1.782393056416615,
"grad_norm": 8.894001007080078,
"learning_rate": 2.1776193428394298e-06,
"loss": 0.4082,
"step": 11500
},
{
"epoch": 1.7839429634221946,
"grad_norm": 8.785544395446777,
"learning_rate": 2.1621202727836332e-06,
"loss": 0.3576,
"step": 11510
},
{
"epoch": 1.7854928704277744,
"grad_norm": 12.12935733795166,
"learning_rate": 2.1466212027278367e-06,
"loss": 0.5719,
"step": 11520
},
{
"epoch": 1.7870427774333542,
"grad_norm": 7.645413398742676,
"learning_rate": 2.13112213267204e-06,
"loss": 0.9,
"step": 11530
},
{
"epoch": 1.7885926844389337,
"grad_norm": 6.584036827087402,
"learning_rate": 2.1156230626162433e-06,
"loss": 0.4937,
"step": 11540
},
{
"epoch": 1.7901425914445133,
"grad_norm": 9.352211952209473,
"learning_rate": 2.1001239925604468e-06,
"loss": 0.5416,
"step": 11550
},
{
"epoch": 1.791692498450093,
"grad_norm": 3.5377469062805176,
"learning_rate": 2.08462492250465e-06,
"loss": 0.4514,
"step": 11560
},
{
"epoch": 1.7932424054556728,
"grad_norm": 8.766286849975586,
"learning_rate": 2.0691258524488533e-06,
"loss": 0.4314,
"step": 11570
},
{
"epoch": 1.7947923124612524,
"grad_norm": 9.854792594909668,
"learning_rate": 2.053626782393057e-06,
"loss": 0.6012,
"step": 11580
},
{
"epoch": 1.796342219466832,
"grad_norm": 8.957939147949219,
"learning_rate": 2.03812771233726e-06,
"loss": 0.5714,
"step": 11590
},
{
"epoch": 1.7978921264724117,
"grad_norm": 5.054236888885498,
"learning_rate": 2.0226286422814634e-06,
"loss": 0.6261,
"step": 11600
},
{
"epoch": 1.7994420334779915,
"grad_norm": 6.757805824279785,
"learning_rate": 2.007129572225667e-06,
"loss": 0.5811,
"step": 11610
},
{
"epoch": 1.800991940483571,
"grad_norm": 4.554663181304932,
"learning_rate": 1.99163050216987e-06,
"loss": 0.5902,
"step": 11620
},
{
"epoch": 1.8025418474891506,
"grad_norm": 5.606589317321777,
"learning_rate": 1.9761314321140734e-06,
"loss": 0.4664,
"step": 11630
},
{
"epoch": 1.8040917544947304,
"grad_norm": 9.500225067138672,
"learning_rate": 1.960632362058277e-06,
"loss": 0.6353,
"step": 11640
},
{
"epoch": 1.8056416615003101,
"grad_norm": 9.842873573303223,
"learning_rate": 1.94513329200248e-06,
"loss": 0.4525,
"step": 11650
},
{
"epoch": 1.8071915685058897,
"grad_norm": 6.289135456085205,
"learning_rate": 1.9296342219466835e-06,
"loss": 0.486,
"step": 11660
},
{
"epoch": 1.8087414755114692,
"grad_norm": 5.714487075805664,
"learning_rate": 1.914135151890887e-06,
"loss": 0.4664,
"step": 11670
},
{
"epoch": 1.810291382517049,
"grad_norm": 6.768747806549072,
"learning_rate": 1.8986360818350902e-06,
"loss": 0.6371,
"step": 11680
},
{
"epoch": 1.8118412895226288,
"grad_norm": 6.78218412399292,
"learning_rate": 1.8831370117792935e-06,
"loss": 0.473,
"step": 11690
},
{
"epoch": 1.8133911965282083,
"grad_norm": 5.904227256774902,
"learning_rate": 1.8676379417234966e-06,
"loss": 0.5261,
"step": 11700
},
{
"epoch": 1.814941103533788,
"grad_norm": 10.271391868591309,
"learning_rate": 1.8521388716676999e-06,
"loss": 0.4882,
"step": 11710
},
{
"epoch": 1.8164910105393677,
"grad_norm": 3.0156373977661133,
"learning_rate": 1.8366398016119034e-06,
"loss": 0.5023,
"step": 11720
},
{
"epoch": 1.8180409175449475,
"grad_norm": 1.9927630424499512,
"learning_rate": 1.8211407315561066e-06,
"loss": 0.3889,
"step": 11730
},
{
"epoch": 1.819590824550527,
"grad_norm": 7.668587684631348,
"learning_rate": 1.80564166150031e-06,
"loss": 0.6783,
"step": 11740
},
{
"epoch": 1.8211407315561066,
"grad_norm": 6.309301853179932,
"learning_rate": 1.7901425914445134e-06,
"loss": 0.6409,
"step": 11750
},
{
"epoch": 1.8226906385616863,
"grad_norm": 10.230514526367188,
"learning_rate": 1.7746435213887167e-06,
"loss": 0.621,
"step": 11760
},
{
"epoch": 1.8242405455672661,
"grad_norm": 4.287505149841309,
"learning_rate": 1.75914445133292e-06,
"loss": 0.5267,
"step": 11770
},
{
"epoch": 1.8257904525728457,
"grad_norm": 10.235265731811523,
"learning_rate": 1.7436453812771235e-06,
"loss": 0.5312,
"step": 11780
},
{
"epoch": 1.8273403595784252,
"grad_norm": 4.249007701873779,
"learning_rate": 1.7281463112213267e-06,
"loss": 0.5245,
"step": 11790
},
{
"epoch": 1.828890266584005,
"grad_norm": 8.38654613494873,
"learning_rate": 1.71264724116553e-06,
"loss": 0.4768,
"step": 11800
},
{
"epoch": 1.8304401735895848,
"grad_norm": 6.889254093170166,
"learning_rate": 1.6971481711097335e-06,
"loss": 0.5,
"step": 11810
},
{
"epoch": 1.8319900805951643,
"grad_norm": 10.057848930358887,
"learning_rate": 1.6816491010539368e-06,
"loss": 0.5757,
"step": 11820
},
{
"epoch": 1.8335399876007439,
"grad_norm": 10.05346965789795,
"learning_rate": 1.66615003099814e-06,
"loss": 0.6162,
"step": 11830
},
{
"epoch": 1.8350898946063237,
"grad_norm": 11.503827095031738,
"learning_rate": 1.6506509609423435e-06,
"loss": 0.505,
"step": 11840
},
{
"epoch": 1.8366398016119034,
"grad_norm": 7.171069145202637,
"learning_rate": 1.6351518908865468e-06,
"loss": 0.6695,
"step": 11850
},
{
"epoch": 1.838189708617483,
"grad_norm": 4.7285075187683105,
"learning_rate": 1.61965282083075e-06,
"loss": 0.5785,
"step": 11860
},
{
"epoch": 1.8397396156230625,
"grad_norm": 4.173981666564941,
"learning_rate": 1.6041537507749536e-06,
"loss": 0.4007,
"step": 11870
},
{
"epoch": 1.8412895226286423,
"grad_norm": 7.87106466293335,
"learning_rate": 1.5886546807191569e-06,
"loss": 0.5988,
"step": 11880
},
{
"epoch": 1.842839429634222,
"grad_norm": 5.423832416534424,
"learning_rate": 1.5731556106633604e-06,
"loss": 0.4983,
"step": 11890
},
{
"epoch": 1.8443893366398016,
"grad_norm": 12.526191711425781,
"learning_rate": 1.5576565406075636e-06,
"loss": 0.4422,
"step": 11900
},
{
"epoch": 1.8459392436453812,
"grad_norm": 0.24000059068202972,
"learning_rate": 1.542157470551767e-06,
"loss": 0.4408,
"step": 11910
},
{
"epoch": 1.847489150650961,
"grad_norm": 5.664487838745117,
"learning_rate": 1.5266584004959704e-06,
"loss": 0.3943,
"step": 11920
},
{
"epoch": 1.8490390576565408,
"grad_norm": 4.590766906738281,
"learning_rate": 1.5111593304401737e-06,
"loss": 0.4653,
"step": 11930
},
{
"epoch": 1.8505889646621203,
"grad_norm": 8.723493576049805,
"learning_rate": 1.495660260384377e-06,
"loss": 0.5275,
"step": 11940
},
{
"epoch": 1.8521388716676999,
"grad_norm": 3.8995718955993652,
"learning_rate": 1.4801611903285805e-06,
"loss": 0.4465,
"step": 11950
},
{
"epoch": 1.8536887786732796,
"grad_norm": 9.986695289611816,
"learning_rate": 1.4646621202727837e-06,
"loss": 0.3274,
"step": 11960
},
{
"epoch": 1.8552386856788594,
"grad_norm": 9.503478050231934,
"learning_rate": 1.449163050216987e-06,
"loss": 0.6006,
"step": 11970
},
{
"epoch": 1.856788592684439,
"grad_norm": 11.161650657653809,
"learning_rate": 1.4336639801611905e-06,
"loss": 0.6187,
"step": 11980
},
{
"epoch": 1.8583384996900185,
"grad_norm": 4.863800048828125,
"learning_rate": 1.4181649101053938e-06,
"loss": 0.4654,
"step": 11990
},
{
"epoch": 1.8598884066955983,
"grad_norm": 11.651344299316406,
"learning_rate": 1.402665840049597e-06,
"loss": 0.7257,
"step": 12000
},
{
"epoch": 1.861438313701178,
"grad_norm": 6.425150394439697,
"learning_rate": 1.3871667699938005e-06,
"loss": 0.5763,
"step": 12010
},
{
"epoch": 1.8629882207067576,
"grad_norm": 4.594536781311035,
"learning_rate": 1.3716676999380038e-06,
"loss": 0.5372,
"step": 12020
},
{
"epoch": 1.8645381277123372,
"grad_norm": 5.005278587341309,
"learning_rate": 1.356168629882207e-06,
"loss": 0.4594,
"step": 12030
},
{
"epoch": 1.866088034717917,
"grad_norm": 6.88616943359375,
"learning_rate": 1.3406695598264106e-06,
"loss": 0.4703,
"step": 12040
},
{
"epoch": 1.8676379417234967,
"grad_norm": 4.090301513671875,
"learning_rate": 1.3251704897706139e-06,
"loss": 0.4165,
"step": 12050
},
{
"epoch": 1.8691878487290763,
"grad_norm": 3.528439521789551,
"learning_rate": 1.3096714197148172e-06,
"loss": 0.7157,
"step": 12060
},
{
"epoch": 1.8707377557346558,
"grad_norm": 5.5028533935546875,
"learning_rate": 1.2941723496590206e-06,
"loss": 0.5242,
"step": 12070
},
{
"epoch": 1.8722876627402356,
"grad_norm": 7.8318657875061035,
"learning_rate": 1.278673279603224e-06,
"loss": 0.648,
"step": 12080
},
{
"epoch": 1.8738375697458154,
"grad_norm": 6.823987007141113,
"learning_rate": 1.2631742095474272e-06,
"loss": 0.7727,
"step": 12090
},
{
"epoch": 1.875387476751395,
"grad_norm": 7.5707526206970215,
"learning_rate": 1.2476751394916307e-06,
"loss": 0.552,
"step": 12100
},
{
"epoch": 1.8769373837569745,
"grad_norm": 5.46683931350708,
"learning_rate": 1.232176069435834e-06,
"loss": 0.3831,
"step": 12110
},
{
"epoch": 1.8784872907625543,
"grad_norm": 9.674421310424805,
"learning_rate": 1.2166769993800372e-06,
"loss": 0.6312,
"step": 12120
},
{
"epoch": 1.880037197768134,
"grad_norm": 4.260156154632568,
"learning_rate": 1.2011779293242407e-06,
"loss": 0.5856,
"step": 12130
},
{
"epoch": 1.8815871047737136,
"grad_norm": 3.7952609062194824,
"learning_rate": 1.185678859268444e-06,
"loss": 0.4176,
"step": 12140
},
{
"epoch": 1.8831370117792932,
"grad_norm": 10.493324279785156,
"learning_rate": 1.1701797892126473e-06,
"loss": 0.578,
"step": 12150
},
{
"epoch": 1.884686918784873,
"grad_norm": 1.78457772731781,
"learning_rate": 1.1546807191568508e-06,
"loss": 0.4001,
"step": 12160
},
{
"epoch": 1.8862368257904527,
"grad_norm": 12.227059364318848,
"learning_rate": 1.139181649101054e-06,
"loss": 0.5989,
"step": 12170
},
{
"epoch": 1.8877867327960323,
"grad_norm": 7.4753241539001465,
"learning_rate": 1.1236825790452573e-06,
"loss": 0.3872,
"step": 12180
},
{
"epoch": 1.8893366398016118,
"grad_norm": 8.94823169708252,
"learning_rate": 1.1081835089894608e-06,
"loss": 0.3952,
"step": 12190
},
{
"epoch": 1.8908865468071916,
"grad_norm": 9.308780670166016,
"learning_rate": 1.0926844389336641e-06,
"loss": 0.6887,
"step": 12200
},
{
"epoch": 1.8924364538127714,
"grad_norm": 4.786229610443115,
"learning_rate": 1.0771853688778674e-06,
"loss": 0.4441,
"step": 12210
},
{
"epoch": 1.893986360818351,
"grad_norm": 5.059142112731934,
"learning_rate": 1.0616862988220709e-06,
"loss": 0.6729,
"step": 12220
},
{
"epoch": 1.8955362678239305,
"grad_norm": 2.991612195968628,
"learning_rate": 1.0461872287662742e-06,
"loss": 0.4542,
"step": 12230
},
{
"epoch": 1.8970861748295103,
"grad_norm": 2.8348984718322754,
"learning_rate": 1.0306881587104774e-06,
"loss": 0.3195,
"step": 12240
},
{
"epoch": 1.89863608183509,
"grad_norm": 10.625432014465332,
"learning_rate": 1.015189088654681e-06,
"loss": 0.5155,
"step": 12250
},
{
"epoch": 1.9001859888406696,
"grad_norm": 4.587387561798096,
"learning_rate": 9.996900185988842e-07,
"loss": 0.6146,
"step": 12260
},
{
"epoch": 1.9017358958462491,
"grad_norm": 5.292594909667969,
"learning_rate": 9.841909485430875e-07,
"loss": 0.4085,
"step": 12270
},
{
"epoch": 1.903285802851829,
"grad_norm": 2.4424188137054443,
"learning_rate": 9.68691878487291e-07,
"loss": 0.403,
"step": 12280
},
{
"epoch": 1.9048357098574087,
"grad_norm": 4.393378734588623,
"learning_rate": 9.531928084314942e-07,
"loss": 0.4688,
"step": 12290
},
{
"epoch": 1.9063856168629882,
"grad_norm": 8.843658447265625,
"learning_rate": 9.376937383756976e-07,
"loss": 0.5858,
"step": 12300
},
{
"epoch": 1.9079355238685678,
"grad_norm": 4.157933235168457,
"learning_rate": 9.221946683199008e-07,
"loss": 0.4303,
"step": 12310
},
{
"epoch": 1.9094854308741476,
"grad_norm": 4.35939884185791,
"learning_rate": 9.066955982641042e-07,
"loss": 0.3844,
"step": 12320
},
{
"epoch": 1.9110353378797273,
"grad_norm": 3.711383581161499,
"learning_rate": 8.911965282083075e-07,
"loss": 0.4983,
"step": 12330
},
{
"epoch": 1.912585244885307,
"grad_norm": 10.645997047424316,
"learning_rate": 8.756974581525109e-07,
"loss": 0.5123,
"step": 12340
},
{
"epoch": 1.9141351518908865,
"grad_norm": 11.995467185974121,
"learning_rate": 8.601983880967142e-07,
"loss": 0.5307,
"step": 12350
},
{
"epoch": 1.9156850588964662,
"grad_norm": 10.4423828125,
"learning_rate": 8.446993180409175e-07,
"loss": 0.5145,
"step": 12360
},
{
"epoch": 1.917234965902046,
"grad_norm": 2.7148125171661377,
"learning_rate": 8.292002479851209e-07,
"loss": 0.6484,
"step": 12370
},
{
"epoch": 1.9187848729076256,
"grad_norm": 3.2615230083465576,
"learning_rate": 8.137011779293243e-07,
"loss": 0.5534,
"step": 12380
},
{
"epoch": 1.9203347799132051,
"grad_norm": 5.097367286682129,
"learning_rate": 7.982021078735276e-07,
"loss": 0.475,
"step": 12390
},
{
"epoch": 1.921884686918785,
"grad_norm": 4.083049774169922,
"learning_rate": 7.827030378177309e-07,
"loss": 0.4121,
"step": 12400
},
{
"epoch": 1.9234345939243647,
"grad_norm": 7.365451335906982,
"learning_rate": 7.672039677619343e-07,
"loss": 0.518,
"step": 12410
},
{
"epoch": 1.9249845009299442,
"grad_norm": 5.423826217651367,
"learning_rate": 7.517048977061377e-07,
"loss": 0.5673,
"step": 12420
},
{
"epoch": 1.9265344079355238,
"grad_norm": 1.642021656036377,
"learning_rate": 7.36205827650341e-07,
"loss": 0.5057,
"step": 12430
},
{
"epoch": 1.9280843149411035,
"grad_norm": 8.40982437133789,
"learning_rate": 7.207067575945444e-07,
"loss": 0.3904,
"step": 12440
},
{
"epoch": 1.9296342219466833,
"grad_norm": 5.199375152587891,
"learning_rate": 7.052076875387478e-07,
"loss": 0.5082,
"step": 12450
},
{
"epoch": 1.9311841289522629,
"grad_norm": 2.0132346153259277,
"learning_rate": 6.89708617482951e-07,
"loss": 0.3823,
"step": 12460
},
{
"epoch": 1.9327340359578424,
"grad_norm": 7.0812201499938965,
"learning_rate": 6.742095474271544e-07,
"loss": 0.5928,
"step": 12470
},
{
"epoch": 1.9342839429634222,
"grad_norm": 8.322930335998535,
"learning_rate": 6.587104773713578e-07,
"loss": 0.6861,
"step": 12480
},
{
"epoch": 1.935833849969002,
"grad_norm": 3.1925439834594727,
"learning_rate": 6.432114073155611e-07,
"loss": 0.485,
"step": 12490
},
{
"epoch": 1.9373837569745815,
"grad_norm": 8.817286491394043,
"learning_rate": 6.277123372597645e-07,
"loss": 0.5448,
"step": 12500
},
{
"epoch": 1.938933663980161,
"grad_norm": 5.955672740936279,
"learning_rate": 6.122132672039679e-07,
"loss": 0.5664,
"step": 12510
},
{
"epoch": 1.9404835709857409,
"grad_norm": 4.034615516662598,
"learning_rate": 5.967141971481711e-07,
"loss": 0.5256,
"step": 12520
},
{
"epoch": 1.9420334779913206,
"grad_norm": 5.02003288269043,
"learning_rate": 5.812151270923745e-07,
"loss": 0.5397,
"step": 12530
},
{
"epoch": 1.9435833849969002,
"grad_norm": 4.505518913269043,
"learning_rate": 5.657160570365779e-07,
"loss": 0.5262,
"step": 12540
},
{
"epoch": 1.9451332920024798,
"grad_norm": 8.628135681152344,
"learning_rate": 5.502169869807812e-07,
"loss": 0.5993,
"step": 12550
},
{
"epoch": 1.9466831990080595,
"grad_norm": 4.474428653717041,
"learning_rate": 5.347179169249846e-07,
"loss": 0.6789,
"step": 12560
},
{
"epoch": 1.9482331060136393,
"grad_norm": 4.82116174697876,
"learning_rate": 5.19218846869188e-07,
"loss": 0.4887,
"step": 12570
},
{
"epoch": 1.9497830130192189,
"grad_norm": 8.986303329467773,
"learning_rate": 5.037197768133912e-07,
"loss": 0.8879,
"step": 12580
},
{
"epoch": 1.9513329200247984,
"grad_norm": 4.798040866851807,
"learning_rate": 4.882207067575946e-07,
"loss": 0.4352,
"step": 12590
},
{
"epoch": 1.9528828270303782,
"grad_norm": 3.8391406536102295,
"learning_rate": 4.7272163670179794e-07,
"loss": 0.5088,
"step": 12600
},
{
"epoch": 1.954432734035958,
"grad_norm": 0.2043791562318802,
"learning_rate": 4.572225666460012e-07,
"loss": 0.5993,
"step": 12610
},
{
"epoch": 1.9559826410415375,
"grad_norm": 6.672250747680664,
"learning_rate": 4.417234965902046e-07,
"loss": 0.6768,
"step": 12620
},
{
"epoch": 1.957532548047117,
"grad_norm": 12.16546630859375,
"learning_rate": 4.2622442653440794e-07,
"loss": 0.6934,
"step": 12630
},
{
"epoch": 1.9590824550526968,
"grad_norm": 10.988899230957031,
"learning_rate": 4.1072535647861127e-07,
"loss": 0.4547,
"step": 12640
},
{
"epoch": 1.9606323620582766,
"grad_norm": 10.418371200561523,
"learning_rate": 3.9522628642281465e-07,
"loss": 0.4585,
"step": 12650
},
{
"epoch": 1.9621822690638562,
"grad_norm": 9.15796184539795,
"learning_rate": 3.79727216367018e-07,
"loss": 0.473,
"step": 12660
},
{
"epoch": 1.9637321760694357,
"grad_norm": 5.8964762687683105,
"learning_rate": 3.6422814631122137e-07,
"loss": 0.4932,
"step": 12670
},
{
"epoch": 1.9652820830750155,
"grad_norm": 6.89042329788208,
"learning_rate": 3.487290762554247e-07,
"loss": 0.5725,
"step": 12680
},
{
"epoch": 1.9668319900805953,
"grad_norm": 5.677920818328857,
"learning_rate": 3.3323000619962803e-07,
"loss": 0.4091,
"step": 12690
},
{
"epoch": 1.9683818970861748,
"grad_norm": 7.661967754364014,
"learning_rate": 3.177309361438314e-07,
"loss": 0.4824,
"step": 12700
},
{
"epoch": 1.9699318040917544,
"grad_norm": 4.617781162261963,
"learning_rate": 3.0223186608803475e-07,
"loss": 0.588,
"step": 12710
},
{
"epoch": 1.9714817110973342,
"grad_norm": 4.782810688018799,
"learning_rate": 2.867327960322381e-07,
"loss": 0.5242,
"step": 12720
},
{
"epoch": 1.973031618102914,
"grad_norm": 8.13632583618164,
"learning_rate": 2.7123372597644146e-07,
"loss": 0.4487,
"step": 12730
},
{
"epoch": 1.9745815251084935,
"grad_norm": 3.317437171936035,
"learning_rate": 2.557346559206448e-07,
"loss": 0.4886,
"step": 12740
},
{
"epoch": 1.976131432114073,
"grad_norm": 12.30063533782959,
"learning_rate": 2.402355858648481e-07,
"loss": 0.6661,
"step": 12750
},
{
"epoch": 1.9776813391196528,
"grad_norm": 5.912576675415039,
"learning_rate": 2.2473651580905146e-07,
"loss": 0.407,
"step": 12760
},
{
"epoch": 1.9792312461252326,
"grad_norm": 5.114182472229004,
"learning_rate": 2.0923744575325482e-07,
"loss": 0.494,
"step": 12770
},
{
"epoch": 1.9807811531308122,
"grad_norm": 11.338470458984375,
"learning_rate": 1.9373837569745817e-07,
"loss": 0.5046,
"step": 12780
},
{
"epoch": 1.9823310601363917,
"grad_norm": 2.7713358402252197,
"learning_rate": 1.782393056416615e-07,
"loss": 0.555,
"step": 12790
},
{
"epoch": 1.9838809671419715,
"grad_norm": 4.620921611785889,
"learning_rate": 1.6274023558586486e-07,
"loss": 0.4252,
"step": 12800
},
{
"epoch": 1.9854308741475513,
"grad_norm": 4.4270806312561035,
"learning_rate": 1.4724116553006822e-07,
"loss": 0.4965,
"step": 12810
},
{
"epoch": 1.9869807811531308,
"grad_norm": 9.595443725585938,
"learning_rate": 1.3174209547427155e-07,
"loss": 0.6023,
"step": 12820
},
{
"epoch": 1.9885306881587104,
"grad_norm": 6.762941837310791,
"learning_rate": 1.162430254184749e-07,
"loss": 0.5376,
"step": 12830
},
{
"epoch": 1.9900805951642901,
"grad_norm": 7.066582679748535,
"learning_rate": 1.0074395536267824e-07,
"loss": 0.4774,
"step": 12840
},
{
"epoch": 1.99163050216987,
"grad_norm": 5.059195041656494,
"learning_rate": 8.52448853068816e-08,
"loss": 0.639,
"step": 12850
},
{
"epoch": 1.9931804091754495,
"grad_norm": 9.503751754760742,
"learning_rate": 6.974581525108494e-08,
"loss": 0.5106,
"step": 12860
},
{
"epoch": 1.994730316181029,
"grad_norm": 7.847515106201172,
"learning_rate": 5.424674519528828e-08,
"loss": 0.4999,
"step": 12870
},
{
"epoch": 1.9962802231866088,
"grad_norm": 4.666672706604004,
"learning_rate": 3.874767513949163e-08,
"loss": 0.4127,
"step": 12880
},
{
"epoch": 1.9978301301921886,
"grad_norm": 0.40573740005493164,
"learning_rate": 2.324860508369498e-08,
"loss": 0.5357,
"step": 12890
},
{
"epoch": 1.9993800371977681,
"grad_norm": 15.386001586914062,
"learning_rate": 7.749535027898328e-09,
"loss": 0.4906,
"step": 12900
},
{
"epoch": 2.0,
"eval_accuracy": 0.3503767491926803,
"eval_loss": 1.858621597290039,
"eval_runtime": 26.7923,
"eval_samples_per_second": 69.348,
"eval_steps_per_second": 8.697,
"step": 12904
}
],
"logging_steps": 10,
"max_steps": 12904,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.999081618146877e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}