9b-85 / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
9242e51 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 1804,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004434589800443459,
"grad_norm": 8.667221069335938,
"learning_rate": 2.197802197802198e-07,
"loss": 1.8642287254333496,
"step": 2
},
{
"epoch": 0.008869179600886918,
"grad_norm": 8.565436363220215,
"learning_rate": 6.593406593406594e-07,
"loss": 2.1231369972229004,
"step": 4
},
{
"epoch": 0.013303769401330377,
"grad_norm": 3.455594301223755,
"learning_rate": 1.098901098901099e-06,
"loss": 1.89163339138031,
"step": 6
},
{
"epoch": 0.017738359201773836,
"grad_norm": 1.2133512496948242,
"learning_rate": 1.5384615384615387e-06,
"loss": 1.7869961261749268,
"step": 8
},
{
"epoch": 0.022172949002217297,
"grad_norm": 4.369198799133301,
"learning_rate": 1.9780219780219782e-06,
"loss": 1.5530983209609985,
"step": 10
},
{
"epoch": 0.026607538802660754,
"grad_norm": 6.942768096923828,
"learning_rate": 2.4175824175824177e-06,
"loss": 1.5389991998672485,
"step": 12
},
{
"epoch": 0.031042128603104215,
"grad_norm": 2.6731338500976562,
"learning_rate": 2.8571428571428573e-06,
"loss": 1.016729474067688,
"step": 14
},
{
"epoch": 0.03547671840354767,
"grad_norm": 6.184485912322998,
"learning_rate": 3.2967032967032968e-06,
"loss": 0.9146304130554199,
"step": 16
},
{
"epoch": 0.03991130820399113,
"grad_norm": 1.2878278493881226,
"learning_rate": 3.7362637362637367e-06,
"loss": 0.9207720756530762,
"step": 18
},
{
"epoch": 0.04434589800443459,
"grad_norm": 3.814846992492676,
"learning_rate": 4.175824175824177e-06,
"loss": 1.1863677501678467,
"step": 20
},
{
"epoch": 0.04878048780487805,
"grad_norm": 6.385776042938232,
"learning_rate": 4.615384615384616e-06,
"loss": 1.320576548576355,
"step": 22
},
{
"epoch": 0.05321507760532151,
"grad_norm": 1.8087103366851807,
"learning_rate": 5.054945054945055e-06,
"loss": 1.4690011739730835,
"step": 24
},
{
"epoch": 0.057649667405764965,
"grad_norm": 1.2201143503189087,
"learning_rate": 5.494505494505495e-06,
"loss": 1.2042573690414429,
"step": 26
},
{
"epoch": 0.06208425720620843,
"grad_norm": 1.1334809064865112,
"learning_rate": 5.934065934065935e-06,
"loss": 1.040234923362732,
"step": 28
},
{
"epoch": 0.06651884700665188,
"grad_norm": 1.8232964277267456,
"learning_rate": 6.373626373626373e-06,
"loss": 1.169386386871338,
"step": 30
},
{
"epoch": 0.07095343680709534,
"grad_norm": 2.845280885696411,
"learning_rate": 6.813186813186814e-06,
"loss": 1.0687059164047241,
"step": 32
},
{
"epoch": 0.07538802660753881,
"grad_norm": 0.9850583672523499,
"learning_rate": 7.252747252747253e-06,
"loss": 1.4395848512649536,
"step": 34
},
{
"epoch": 0.07982261640798226,
"grad_norm": 3.9745352268218994,
"learning_rate": 7.692307692307694e-06,
"loss": 1.1598668098449707,
"step": 36
},
{
"epoch": 0.08425720620842572,
"grad_norm": 4.545168399810791,
"learning_rate": 8.131868131868132e-06,
"loss": 1.515150785446167,
"step": 38
},
{
"epoch": 0.08869179600886919,
"grad_norm": 2.1682798862457275,
"learning_rate": 8.571428571428571e-06,
"loss": 0.7324872612953186,
"step": 40
},
{
"epoch": 0.09312638580931264,
"grad_norm": 0.8849499225616455,
"learning_rate": 9.010989010989011e-06,
"loss": 1.3759689331054688,
"step": 42
},
{
"epoch": 0.0975609756097561,
"grad_norm": 1.3699463605880737,
"learning_rate": 9.450549450549452e-06,
"loss": 1.3197706937789917,
"step": 44
},
{
"epoch": 0.10199556541019955,
"grad_norm": 1.9112352132797241,
"learning_rate": 9.890109890109892e-06,
"loss": 1.355299472808838,
"step": 46
},
{
"epoch": 0.10643015521064302,
"grad_norm": 4.099548816680908,
"learning_rate": 1.0329670329670332e-05,
"loss": 1.583066701889038,
"step": 48
},
{
"epoch": 0.11086474501108648,
"grad_norm": 1.642897605895996,
"learning_rate": 1.076923076923077e-05,
"loss": 1.4291114807128906,
"step": 50
},
{
"epoch": 0.11529933481152993,
"grad_norm": 2.4580774307250977,
"learning_rate": 1.120879120879121e-05,
"loss": 1.3548572063446045,
"step": 52
},
{
"epoch": 0.1197339246119734,
"grad_norm": 1.3751822710037231,
"learning_rate": 1.164835164835165e-05,
"loss": 1.367175579071045,
"step": 54
},
{
"epoch": 0.12416851441241686,
"grad_norm": 5.466642379760742,
"learning_rate": 1.2087912087912089e-05,
"loss": 1.136663794517517,
"step": 56
},
{
"epoch": 0.1286031042128603,
"grad_norm": 1.1212538480758667,
"learning_rate": 1.2527472527472529e-05,
"loss": 1.3549809455871582,
"step": 58
},
{
"epoch": 0.13303769401330376,
"grad_norm": 0.9152220487594604,
"learning_rate": 1.296703296703297e-05,
"loss": 1.4315998554229736,
"step": 60
},
{
"epoch": 0.13747228381374724,
"grad_norm": 1.2018588781356812,
"learning_rate": 1.3406593406593406e-05,
"loss": 1.3350356817245483,
"step": 62
},
{
"epoch": 0.1419068736141907,
"grad_norm": 1.993096947669983,
"learning_rate": 1.3846153846153847e-05,
"loss": 1.3968464136123657,
"step": 64
},
{
"epoch": 0.14634146341463414,
"grad_norm": 1.467322826385498,
"learning_rate": 1.4285714285714287e-05,
"loss": 1.4635425806045532,
"step": 66
},
{
"epoch": 0.15077605321507762,
"grad_norm": 0.7607141137123108,
"learning_rate": 1.4725274725274727e-05,
"loss": 1.3317251205444336,
"step": 68
},
{
"epoch": 0.15521064301552107,
"grad_norm": 7.009274959564209,
"learning_rate": 1.5164835164835166e-05,
"loss": 1.3160146474838257,
"step": 70
},
{
"epoch": 0.15964523281596452,
"grad_norm": 1.0283435583114624,
"learning_rate": 1.5604395604395605e-05,
"loss": 1.3538073301315308,
"step": 72
},
{
"epoch": 0.164079822616408,
"grad_norm": 0.7895150184631348,
"learning_rate": 1.6043956043956047e-05,
"loss": 1.2550619840621948,
"step": 74
},
{
"epoch": 0.16851441241685144,
"grad_norm": 0.7530434131622314,
"learning_rate": 1.6483516483516486e-05,
"loss": 1.355035424232483,
"step": 76
},
{
"epoch": 0.1729490022172949,
"grad_norm": 0.6738516688346863,
"learning_rate": 1.6923076923076924e-05,
"loss": 1.396584391593933,
"step": 78
},
{
"epoch": 0.17738359201773837,
"grad_norm": 1.0755456686019897,
"learning_rate": 1.7362637362637363e-05,
"loss": 1.3568543195724487,
"step": 80
},
{
"epoch": 0.18181818181818182,
"grad_norm": 1.3478541374206543,
"learning_rate": 1.78021978021978e-05,
"loss": 0.8403951525688171,
"step": 82
},
{
"epoch": 0.18625277161862527,
"grad_norm": 0.7471117973327637,
"learning_rate": 1.8241758241758244e-05,
"loss": 1.0819566249847412,
"step": 84
},
{
"epoch": 0.19068736141906872,
"grad_norm": 3.1562721729278564,
"learning_rate": 1.8681318681318682e-05,
"loss": 1.0565105676651,
"step": 86
},
{
"epoch": 0.1951219512195122,
"grad_norm": 0.9117481708526611,
"learning_rate": 1.9120879120879124e-05,
"loss": 1.336931586265564,
"step": 88
},
{
"epoch": 0.19955654101995565,
"grad_norm": 1.8324049711227417,
"learning_rate": 1.9560439560439563e-05,
"loss": 1.4609527587890625,
"step": 90
},
{
"epoch": 0.2039911308203991,
"grad_norm": 0.8476412892341614,
"learning_rate": 2e-05,
"loss": 1.317558765411377,
"step": 92
},
{
"epoch": 0.20842572062084258,
"grad_norm": 0.6812918782234192,
"learning_rate": 1.999993945796182e-05,
"loss": 1.309884786605835,
"step": 94
},
{
"epoch": 0.21286031042128603,
"grad_norm": 1.9555091857910156,
"learning_rate": 1.9999757832661787e-05,
"loss": 1.8222039937973022,
"step": 96
},
{
"epoch": 0.21729490022172948,
"grad_norm": 1.6802914142608643,
"learning_rate": 1.9999455126543454e-05,
"loss": 1.0341295003890991,
"step": 98
},
{
"epoch": 0.22172949002217296,
"grad_norm": 0.9253756403923035,
"learning_rate": 1.9999031343679364e-05,
"loss": 1.2889328002929688,
"step": 100
},
{
"epoch": 0.2261640798226164,
"grad_norm": 0.9691144824028015,
"learning_rate": 1.9998486489770998e-05,
"loss": 1.4229637384414673,
"step": 102
},
{
"epoch": 0.23059866962305986,
"grad_norm": 0.7583999037742615,
"learning_rate": 1.999782057214871e-05,
"loss": 1.1750223636627197,
"step": 104
},
{
"epoch": 0.23503325942350334,
"grad_norm": 0.7559353709220886,
"learning_rate": 1.999703359977161e-05,
"loss": 1.3722642660140991,
"step": 106
},
{
"epoch": 0.2394678492239468,
"grad_norm": 1.8747915029525757,
"learning_rate": 1.9996125583227458e-05,
"loss": 1.5751910209655762,
"step": 108
},
{
"epoch": 0.24390243902439024,
"grad_norm": 0.7324615120887756,
"learning_rate": 1.999509653473251e-05,
"loss": 1.1686367988586426,
"step": 110
},
{
"epoch": 0.24833702882483372,
"grad_norm": 5.023177623748779,
"learning_rate": 1.999394646813137e-05,
"loss": 1.368462324142456,
"step": 112
},
{
"epoch": 0.25277161862527714,
"grad_norm": 2.301079750061035,
"learning_rate": 1.9992675398896784e-05,
"loss": 0.8811516761779785,
"step": 114
},
{
"epoch": 0.2572062084257206,
"grad_norm": 0.6491958498954773,
"learning_rate": 1.9991283344129452e-05,
"loss": 1.4907201528549194,
"step": 116
},
{
"epoch": 0.2616407982261641,
"grad_norm": 0.6563892364501953,
"learning_rate": 1.998977032255777e-05,
"loss": 1.224129557609558,
"step": 118
},
{
"epoch": 0.2660753880266075,
"grad_norm": 0.708153486251831,
"learning_rate": 1.9988136354537615e-05,
"loss": 1.3663833141326904,
"step": 120
},
{
"epoch": 0.270509977827051,
"grad_norm": 1.3739961385726929,
"learning_rate": 1.9986381462052048e-05,
"loss": 1.2798233032226562,
"step": 122
},
{
"epoch": 0.2749445676274945,
"grad_norm": 1.1927521228790283,
"learning_rate": 1.9984505668711006e-05,
"loss": 1.6487520933151245,
"step": 124
},
{
"epoch": 0.2793791574279379,
"grad_norm": 1.0914132595062256,
"learning_rate": 1.998250899975102e-05,
"loss": 0.9563515186309814,
"step": 126
},
{
"epoch": 0.2838137472283814,
"grad_norm": 0.6142106056213379,
"learning_rate": 1.9980391482034844e-05,
"loss": 1.2922307252883911,
"step": 128
},
{
"epoch": 0.28824833702882485,
"grad_norm": 0.9818975925445557,
"learning_rate": 1.9978153144051108e-05,
"loss": 1.0446155071258545,
"step": 130
},
{
"epoch": 0.2926829268292683,
"grad_norm": 1.9593212604522705,
"learning_rate": 1.9975794015913936e-05,
"loss": 1.0657705068588257,
"step": 132
},
{
"epoch": 0.29711751662971175,
"grad_norm": 2.4713385105133057,
"learning_rate": 1.9973314129362533e-05,
"loss": 1.0481352806091309,
"step": 134
},
{
"epoch": 0.30155210643015523,
"grad_norm": 9.34296703338623,
"learning_rate": 1.997071351776076e-05,
"loss": 1.2620774507522583,
"step": 136
},
{
"epoch": 0.30598669623059865,
"grad_norm": 2.3951597213745117,
"learning_rate": 1.996799221609669e-05,
"loss": 0.8199646472930908,
"step": 138
},
{
"epoch": 0.31042128603104213,
"grad_norm": 1.0207390785217285,
"learning_rate": 1.9965150260982137e-05,
"loss": 1.2821062803268433,
"step": 140
},
{
"epoch": 0.3148558758314856,
"grad_norm": 0.808794379234314,
"learning_rate": 1.9962187690652157e-05,
"loss": 1.0488530397415161,
"step": 142
},
{
"epoch": 0.31929046563192903,
"grad_norm": 1.9180113077163696,
"learning_rate": 1.9959104544964536e-05,
"loss": 1.0375815629959106,
"step": 144
},
{
"epoch": 0.3237250554323725,
"grad_norm": 1.6617244482040405,
"learning_rate": 1.9955900865399257e-05,
"loss": 1.0013810396194458,
"step": 146
},
{
"epoch": 0.328159645232816,
"grad_norm": 0.9878438711166382,
"learning_rate": 1.9952576695057944e-05,
"loss": 1.4907773733139038,
"step": 148
},
{
"epoch": 0.3325942350332594,
"grad_norm": 1.895961046218872,
"learning_rate": 1.9949132078663268e-05,
"loss": 1.254366397857666,
"step": 150
},
{
"epoch": 0.3370288248337029,
"grad_norm": 1.2166792154312134,
"learning_rate": 1.9945567062558368e-05,
"loss": 1.1661312580108643,
"step": 152
},
{
"epoch": 0.34146341463414637,
"grad_norm": 0.8109827637672424,
"learning_rate": 1.9941881694706206e-05,
"loss": 1.3392776250839233,
"step": 154
},
{
"epoch": 0.3458980044345898,
"grad_norm": 2.391664505004883,
"learning_rate": 1.993807602468893e-05,
"loss": 1.3111441135406494,
"step": 156
},
{
"epoch": 0.35033259423503327,
"grad_norm": 0.941863477230072,
"learning_rate": 1.9934150103707217e-05,
"loss": 1.3535107374191284,
"step": 158
},
{
"epoch": 0.35476718403547675,
"grad_norm": 0.6483902335166931,
"learning_rate": 1.9930103984579564e-05,
"loss": 1.3064088821411133,
"step": 160
},
{
"epoch": 0.35920177383592017,
"grad_norm": 0.682521641254425,
"learning_rate": 1.9925937721741595e-05,
"loss": 0.9179922938346863,
"step": 162
},
{
"epoch": 0.36363636363636365,
"grad_norm": 1.1320902109146118,
"learning_rate": 1.992165137124532e-05,
"loss": 1.0206555128097534,
"step": 164
},
{
"epoch": 0.36807095343680707,
"grad_norm": 0.8146328926086426,
"learning_rate": 1.9917244990758385e-05,
"loss": 1.3475308418273926,
"step": 166
},
{
"epoch": 0.37250554323725055,
"grad_norm": 1.6250571012496948,
"learning_rate": 1.9912718639563285e-05,
"loss": 1.31868577003479,
"step": 168
},
{
"epoch": 0.376940133037694,
"grad_norm": 0.8682546615600586,
"learning_rate": 1.9908072378556585e-05,
"loss": 1.2381749153137207,
"step": 170
},
{
"epoch": 0.38137472283813745,
"grad_norm": 2.6199824810028076,
"learning_rate": 1.990330627024809e-05,
"loss": 0.8625264167785645,
"step": 172
},
{
"epoch": 0.3858093126385809,
"grad_norm": 1.7685837745666504,
"learning_rate": 1.989842037876e-05,
"loss": 1.7184687852859497,
"step": 174
},
{
"epoch": 0.3902439024390244,
"grad_norm": 0.9849699139595032,
"learning_rate": 1.9893414769826053e-05,
"loss": 1.369092583656311,
"step": 176
},
{
"epoch": 0.3946784922394678,
"grad_norm": 1.686566948890686,
"learning_rate": 1.9888289510790643e-05,
"loss": 1.383589744567871,
"step": 178
},
{
"epoch": 0.3991130820399113,
"grad_norm": 0.793823778629303,
"learning_rate": 1.988304467060791e-05,
"loss": 1.1963413953781128,
"step": 180
},
{
"epoch": 0.4035476718403548,
"grad_norm": 0.6959115266799927,
"learning_rate": 1.9877680319840813e-05,
"loss": 1.335618257522583,
"step": 182
},
{
"epoch": 0.4079822616407982,
"grad_norm": 1.3807117938995361,
"learning_rate": 1.987219653066018e-05,
"loss": 0.8666111826896667,
"step": 184
},
{
"epoch": 0.4124168514412417,
"grad_norm": 1.2673057317733765,
"learning_rate": 1.9866593376843743e-05,
"loss": 1.0503551959991455,
"step": 186
},
{
"epoch": 0.41685144124168516,
"grad_norm": 0.8807701468467712,
"learning_rate": 1.9860870933775128e-05,
"loss": 1.0260038375854492,
"step": 188
},
{
"epoch": 0.4212860310421286,
"grad_norm": 2.0024898052215576,
"learning_rate": 1.9855029278442865e-05,
"loss": 1.1095020771026611,
"step": 190
},
{
"epoch": 0.42572062084257206,
"grad_norm": 2.057466745376587,
"learning_rate": 1.984906848943934e-05,
"loss": 1.100471019744873,
"step": 192
},
{
"epoch": 0.43015521064301554,
"grad_norm": 1.057753324508667,
"learning_rate": 1.9842988646959723e-05,
"loss": 1.3441250324249268,
"step": 194
},
{
"epoch": 0.43458980044345896,
"grad_norm": 1.4172452688217163,
"learning_rate": 1.983678983280093e-05,
"loss": 1.6131374835968018,
"step": 196
},
{
"epoch": 0.43902439024390244,
"grad_norm": 1.8611360788345337,
"learning_rate": 1.983047213036047e-05,
"loss": 1.3363574743270874,
"step": 198
},
{
"epoch": 0.4434589800443459,
"grad_norm": 1.018568992614746,
"learning_rate": 1.9824035624635368e-05,
"loss": 1.2478539943695068,
"step": 200
},
{
"epoch": 0.44789356984478934,
"grad_norm": 1.5161771774291992,
"learning_rate": 1.9817480402220995e-05,
"loss": 1.3159914016723633,
"step": 202
},
{
"epoch": 0.4523281596452328,
"grad_norm": 1.3039575815200806,
"learning_rate": 1.9810806551309903e-05,
"loss": 1.2693634033203125,
"step": 204
},
{
"epoch": 0.4567627494456763,
"grad_norm": 1.2496814727783203,
"learning_rate": 1.9804014161690672e-05,
"loss": 1.1507153511047363,
"step": 206
},
{
"epoch": 0.4611973392461197,
"grad_norm": 4.592546463012695,
"learning_rate": 1.979710332474665e-05,
"loss": 1.1844661235809326,
"step": 208
},
{
"epoch": 0.4656319290465632,
"grad_norm": 0.856142520904541,
"learning_rate": 1.9790074133454765e-05,
"loss": 0.7224380970001221,
"step": 210
},
{
"epoch": 0.4700665188470067,
"grad_norm": 10.285343170166016,
"learning_rate": 1.9782926682384248e-05,
"loss": 0.8978222012519836,
"step": 212
},
{
"epoch": 0.4745011086474501,
"grad_norm": 0.5395671129226685,
"learning_rate": 1.977566106769538e-05,
"loss": 1.1894056797027588,
"step": 214
},
{
"epoch": 0.4789356984478936,
"grad_norm": 0.6740292310714722,
"learning_rate": 1.976827738713819e-05,
"loss": 1.2027900218963623,
"step": 216
},
{
"epoch": 0.48337028824833705,
"grad_norm": 0.9326871037483215,
"learning_rate": 1.976077574005114e-05,
"loss": 1.1885857582092285,
"step": 218
},
{
"epoch": 0.4878048780487805,
"grad_norm": 2.4017117023468018,
"learning_rate": 1.9753156227359783e-05,
"loss": 1.32407546043396,
"step": 220
},
{
"epoch": 0.49223946784922396,
"grad_norm": 1.3293203115463257,
"learning_rate": 1.9745418951575415e-05,
"loss": 1.2708196640014648,
"step": 222
},
{
"epoch": 0.49667405764966743,
"grad_norm": 0.804009199142456,
"learning_rate": 1.9737564016793696e-05,
"loss": 1.2493350505828857,
"step": 224
},
{
"epoch": 0.5011086474501109,
"grad_norm": 0.6624335050582886,
"learning_rate": 1.972959152869323e-05,
"loss": 1.236510992050171,
"step": 226
},
{
"epoch": 0.5055432372505543,
"grad_norm": 1.1144077777862549,
"learning_rate": 1.972150159453417e-05,
"loss": 1.2882966995239258,
"step": 228
},
{
"epoch": 0.5099778270509978,
"grad_norm": 2.013320207595825,
"learning_rate": 1.9713294323156768e-05,
"loss": 1.8960356712341309,
"step": 230
},
{
"epoch": 0.5144124168514412,
"grad_norm": 0.9120582342147827,
"learning_rate": 1.9704969824979893e-05,
"loss": 1.0289053916931152,
"step": 232
},
{
"epoch": 0.5188470066518847,
"grad_norm": 1.22536301612854,
"learning_rate": 1.9696528211999567e-05,
"loss": 1.3444561958312988,
"step": 234
},
{
"epoch": 0.5232815964523282,
"grad_norm": 0.7821425199508667,
"learning_rate": 1.9687969597787445e-05,
"loss": 1.1790920495986938,
"step": 236
},
{
"epoch": 0.5277161862527716,
"grad_norm": 0.8863709568977356,
"learning_rate": 1.967929409748929e-05,
"loss": 1.0798450708389282,
"step": 238
},
{
"epoch": 0.532150776053215,
"grad_norm": 0.5844965577125549,
"learning_rate": 1.967050182782344e-05,
"loss": 1.3156877756118774,
"step": 240
},
{
"epoch": 0.5365853658536586,
"grad_norm": 0.46499544382095337,
"learning_rate": 1.96615929070792e-05,
"loss": 1.3678405284881592,
"step": 242
},
{
"epoch": 0.541019955654102,
"grad_norm": 12.881924629211426,
"learning_rate": 1.9652567455115287e-05,
"loss": 1.0557224750518799,
"step": 244
},
{
"epoch": 0.5454545454545454,
"grad_norm": 1.112845778465271,
"learning_rate": 1.9643425593358212e-05,
"loss": 1.308203101158142,
"step": 246
},
{
"epoch": 0.549889135254989,
"grad_norm": 1.1576392650604248,
"learning_rate": 1.9634167444800618e-05,
"loss": 1.5463697910308838,
"step": 248
},
{
"epoch": 0.5543237250554324,
"grad_norm": 1.7358508110046387,
"learning_rate": 1.9624793133999663e-05,
"loss": 1.3133127689361572,
"step": 250
},
{
"epoch": 0.5587583148558758,
"grad_norm": 1.8306182622909546,
"learning_rate": 1.9615302787075317e-05,
"loss": 0.7901706695556641,
"step": 252
},
{
"epoch": 0.5631929046563193,
"grad_norm": 1.574388861656189,
"learning_rate": 1.9605696531708687e-05,
"loss": 1.5300947427749634,
"step": 254
},
{
"epoch": 0.5676274944567627,
"grad_norm": 0.6506041884422302,
"learning_rate": 1.9595974497140275e-05,
"loss": 1.3747804164886475,
"step": 256
},
{
"epoch": 0.5720620842572062,
"grad_norm": 0.5146905779838562,
"learning_rate": 1.958613681416825e-05,
"loss": 1.3938028812408447,
"step": 258
},
{
"epoch": 0.5764966740576497,
"grad_norm": 0.540286123752594,
"learning_rate": 1.95761836151467e-05,
"loss": 1.3356225490570068,
"step": 260
},
{
"epoch": 0.5809312638580931,
"grad_norm": 1.7904235124588013,
"learning_rate": 1.9566115033983843e-05,
"loss": 0.817384660243988,
"step": 262
},
{
"epoch": 0.5853658536585366,
"grad_norm": 1.602072834968567,
"learning_rate": 1.955593120614021e-05,
"loss": 1.4035075902938843,
"step": 264
},
{
"epoch": 0.5898004434589801,
"grad_norm": 0.951567530632019,
"learning_rate": 1.954563226862685e-05,
"loss": 1.164678692817688,
"step": 266
},
{
"epoch": 0.5942350332594235,
"grad_norm": 0.9935126900672913,
"learning_rate": 1.953521836000346e-05,
"loss": 1.6089775562286377,
"step": 268
},
{
"epoch": 0.5986696230598669,
"grad_norm": 1.1680865287780762,
"learning_rate": 1.9524689620376552e-05,
"loss": 1.2622849941253662,
"step": 270
},
{
"epoch": 0.6031042128603105,
"grad_norm": 0.8502325415611267,
"learning_rate": 1.9514046191397532e-05,
"loss": 1.2814254760742188,
"step": 272
},
{
"epoch": 0.6075388026607539,
"grad_norm": 0.822547972202301,
"learning_rate": 1.950328821626081e-05,
"loss": 1.278984785079956,
"step": 274
},
{
"epoch": 0.6119733924611973,
"grad_norm": 0.9921445250511169,
"learning_rate": 1.9492415839701902e-05,
"loss": 1.2716035842895508,
"step": 276
},
{
"epoch": 0.6164079822616408,
"grad_norm": 2.4694504737854004,
"learning_rate": 1.9481429207995424e-05,
"loss": 1.2899194955825806,
"step": 278
},
{
"epoch": 0.6208425720620843,
"grad_norm": 0.6362584829330444,
"learning_rate": 1.9470328468953176e-05,
"loss": 1.3732231855392456,
"step": 280
},
{
"epoch": 0.6252771618625277,
"grad_norm": 1.02562415599823,
"learning_rate": 1.9459113771922128e-05,
"loss": 1.0229641199111938,
"step": 282
},
{
"epoch": 0.6297117516629712,
"grad_norm": 0.6536508798599243,
"learning_rate": 1.944778526778242e-05,
"loss": 1.315395474433899,
"step": 284
},
{
"epoch": 0.6341463414634146,
"grad_norm": 0.7477055788040161,
"learning_rate": 1.9436343108945323e-05,
"loss": 1.3944462537765503,
"step": 286
},
{
"epoch": 0.6385809312638581,
"grad_norm": 0.5282856822013855,
"learning_rate": 1.9424787449351194e-05,
"loss": 1.3006008863449097,
"step": 288
},
{
"epoch": 0.6430155210643016,
"grad_norm": 0.4401320219039917,
"learning_rate": 1.9413118444467408e-05,
"loss": 1.2911877632141113,
"step": 290
},
{
"epoch": 0.647450110864745,
"grad_norm": 1.1002235412597656,
"learning_rate": 1.9401336251286264e-05,
"loss": 1.43943190574646,
"step": 292
},
{
"epoch": 0.6518847006651884,
"grad_norm": 0.5872219204902649,
"learning_rate": 1.9389441028322874e-05,
"loss": 1.026016116142273,
"step": 294
},
{
"epoch": 0.656319290465632,
"grad_norm": 0.5707578659057617,
"learning_rate": 1.9377432935613016e-05,
"loss": 1.0756226778030396,
"step": 296
},
{
"epoch": 0.6607538802660754,
"grad_norm": 0.6579997539520264,
"learning_rate": 1.936531213471101e-05,
"loss": 1.2744524478912354,
"step": 298
},
{
"epoch": 0.6651884700665188,
"grad_norm": 4.350220680236816,
"learning_rate": 1.935307878868752e-05,
"loss": 1.3224852085113525,
"step": 300
},
{
"epoch": 0.6696230598669624,
"grad_norm": 0.5770370364189148,
"learning_rate": 1.9340733062127373e-05,
"loss": 1.2629750967025757,
"step": 302
},
{
"epoch": 0.6740576496674058,
"grad_norm": 0.7492507696151733,
"learning_rate": 1.9328275121127325e-05,
"loss": 1.3276405334472656,
"step": 304
},
{
"epoch": 0.6784922394678492,
"grad_norm": 0.9730760455131531,
"learning_rate": 1.9315705133293857e-05,
"loss": 1.2819868326187134,
"step": 306
},
{
"epoch": 0.6829268292682927,
"grad_norm": 0.6775749921798706,
"learning_rate": 1.9303023267740902e-05,
"loss": 1.0328669548034668,
"step": 308
},
{
"epoch": 0.6873614190687362,
"grad_norm": 0.6441645622253418,
"learning_rate": 1.9290229695087562e-05,
"loss": 1.2884297370910645,
"step": 310
},
{
"epoch": 0.6917960088691796,
"grad_norm": 0.6459354162216187,
"learning_rate": 1.9277324587455833e-05,
"loss": 1.3426930904388428,
"step": 312
},
{
"epoch": 0.6962305986696231,
"grad_norm": 0.715065598487854,
"learning_rate": 1.9264308118468274e-05,
"loss": 1.2753427028656006,
"step": 314
},
{
"epoch": 0.7006651884700665,
"grad_norm": 0.7442984580993652,
"learning_rate": 1.9251180463245675e-05,
"loss": 1.4162836074829102,
"step": 316
},
{
"epoch": 0.70509977827051,
"grad_norm": 0.6542792916297913,
"learning_rate": 1.9237941798404708e-05,
"loss": 1.1363985538482666,
"step": 318
},
{
"epoch": 0.7095343680709535,
"grad_norm": 1.2111639976501465,
"learning_rate": 1.922459230205553e-05,
"loss": 1.0583592653274536,
"step": 320
},
{
"epoch": 0.7139689578713969,
"grad_norm": 1.301080346107483,
"learning_rate": 1.921113215379943e-05,
"loss": 1.300571322441101,
"step": 322
},
{
"epoch": 0.7184035476718403,
"grad_norm": 0.6510736346244812,
"learning_rate": 1.9197561534726347e-05,
"loss": 1.1844992637634277,
"step": 324
},
{
"epoch": 0.7228381374722838,
"grad_norm": 2.18395733833313,
"learning_rate": 1.9183880627412496e-05,
"loss": 1.2481743097305298,
"step": 326
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.287090003490448,
"learning_rate": 1.9170089615917884e-05,
"loss": 0.9507350325584412,
"step": 328
},
{
"epoch": 0.7317073170731707,
"grad_norm": 1.0061886310577393,
"learning_rate": 1.915618868578383e-05,
"loss": 0.961956799030304,
"step": 330
},
{
"epoch": 0.7361419068736141,
"grad_norm": 1.3770501613616943,
"learning_rate": 1.9142178024030475e-05,
"loss": 1.4702783823013306,
"step": 332
},
{
"epoch": 0.7405764966740577,
"grad_norm": 4.349529266357422,
"learning_rate": 1.9128057819154264e-05,
"loss": 1.3034319877624512,
"step": 334
},
{
"epoch": 0.7450110864745011,
"grad_norm": 0.5903530716896057,
"learning_rate": 1.911382826112542e-05,
"loss": 1.2915682792663574,
"step": 336
},
{
"epoch": 0.7494456762749445,
"grad_norm": 1.9257632493972778,
"learning_rate": 1.909948954138538e-05,
"loss": 0.859005868434906,
"step": 338
},
{
"epoch": 0.753880266075388,
"grad_norm": 4.127213001251221,
"learning_rate": 1.908504185284421e-05,
"loss": 0.4267387092113495,
"step": 340
},
{
"epoch": 0.7583148558758315,
"grad_norm": 0.7010712027549744,
"learning_rate": 1.9070485389878023e-05,
"loss": 0.9848529696464539,
"step": 342
},
{
"epoch": 0.7627494456762749,
"grad_norm": 0.5236337780952454,
"learning_rate": 1.9055820348326358e-05,
"loss": 1.400795340538025,
"step": 344
},
{
"epoch": 0.7671840354767184,
"grad_norm": 0.49078261852264404,
"learning_rate": 1.9041046925489552e-05,
"loss": 1.304659128189087,
"step": 346
},
{
"epoch": 0.7716186252771619,
"grad_norm": 0.8199257850646973,
"learning_rate": 1.902616532012608e-05,
"loss": 1.1828995943069458,
"step": 348
},
{
"epoch": 0.7760532150776053,
"grad_norm": 0.66054368019104,
"learning_rate": 1.9011175732449878e-05,
"loss": 1.2884124517440796,
"step": 350
},
{
"epoch": 0.7804878048780488,
"grad_norm": 1.6785452365875244,
"learning_rate": 1.8996078364127655e-05,
"loss": 1.2245346307754517,
"step": 352
},
{
"epoch": 0.7849223946784922,
"grad_norm": 1.8945343494415283,
"learning_rate": 1.898087341827618e-05,
"loss": 1.0871098041534424,
"step": 354
},
{
"epoch": 0.7893569844789357,
"grad_norm": 1.0700933933258057,
"learning_rate": 1.896556109945954e-05,
"loss": 1.2871757745742798,
"step": 356
},
{
"epoch": 0.7937915742793792,
"grad_norm": 1.8673183917999268,
"learning_rate": 1.8950141613686404e-05,
"loss": 1.358439564704895,
"step": 358
},
{
"epoch": 0.7982261640798226,
"grad_norm": 0.603571891784668,
"learning_rate": 1.8934615168407237e-05,
"loss": 1.295249104499817,
"step": 360
},
{
"epoch": 0.802660753880266,
"grad_norm": 1.2410091161727905,
"learning_rate": 1.891898197251151e-05,
"loss": 0.8056436777114868,
"step": 362
},
{
"epoch": 0.8070953436807096,
"grad_norm": 1.200040578842163,
"learning_rate": 1.8903242236324907e-05,
"loss": 1.4234434366226196,
"step": 364
},
{
"epoch": 0.811529933481153,
"grad_norm": 0.47995078563690186,
"learning_rate": 1.888739617160647e-05,
"loss": 1.261313557624817,
"step": 366
},
{
"epoch": 0.8159645232815964,
"grad_norm": 1.1774096488952637,
"learning_rate": 1.8871443991545768e-05,
"loss": 1.0709372758865356,
"step": 368
},
{
"epoch": 0.8203991130820399,
"grad_norm": 1.2842013835906982,
"learning_rate": 1.885538591076002e-05,
"loss": 0.9137963652610779,
"step": 370
},
{
"epoch": 0.8248337028824834,
"grad_norm": 0.7302650809288025,
"learning_rate": 1.8839222145291217e-05,
"loss": 1.29634690284729,
"step": 372
},
{
"epoch": 0.8292682926829268,
"grad_norm": 0.611242413520813,
"learning_rate": 1.88229529126032e-05,
"loss": 1.2931721210479736,
"step": 374
},
{
"epoch": 0.8337028824833703,
"grad_norm": 0.4736635684967041,
"learning_rate": 1.8806578431578747e-05,
"loss": 1.2644020318984985,
"step": 376
},
{
"epoch": 0.8381374722838137,
"grad_norm": 1.342694640159607,
"learning_rate": 1.8790098922516637e-05,
"loss": 1.3544180393218994,
"step": 378
},
{
"epoch": 0.8425720620842572,
"grad_norm": 1.1542247533798218,
"learning_rate": 1.8773514607128647e-05,
"loss": 0.9301992654800415,
"step": 380
},
{
"epoch": 0.8470066518847007,
"grad_norm": 0.5767038464546204,
"learning_rate": 1.875682570853662e-05,
"loss": 1.3983073234558105,
"step": 382
},
{
"epoch": 0.8514412416851441,
"grad_norm": 0.7877940535545349,
"learning_rate": 1.8740032451269438e-05,
"loss": 1.195070743560791,
"step": 384
},
{
"epoch": 0.8558758314855875,
"grad_norm": 0.44601938128471375,
"learning_rate": 1.8723135061259977e-05,
"loss": 1.3003090620040894,
"step": 386
},
{
"epoch": 0.8603104212860311,
"grad_norm": 0.35469481348991394,
"learning_rate": 1.8706133765842126e-05,
"loss": 1.2766008377075195,
"step": 388
},
{
"epoch": 0.8647450110864745,
"grad_norm": 0.4653916358947754,
"learning_rate": 1.8689028793747673e-05,
"loss": 1.3040666580200195,
"step": 390
},
{
"epoch": 0.8691796008869179,
"grad_norm": 0.6527778506278992,
"learning_rate": 1.8671820375103256e-05,
"loss": 1.0266871452331543,
"step": 392
},
{
"epoch": 0.8736141906873615,
"grad_norm": 0.5240263938903809,
"learning_rate": 1.8654508741427272e-05,
"loss": 1.2564506530761719,
"step": 394
},
{
"epoch": 0.8780487804878049,
"grad_norm": 0.437155157327652,
"learning_rate": 1.863709412562672e-05,
"loss": 1.246124505996704,
"step": 396
},
{
"epoch": 0.8824833702882483,
"grad_norm": 0.8538821935653687,
"learning_rate": 1.8619576761994137e-05,
"loss": 1.2513529062271118,
"step": 398
},
{
"epoch": 0.8869179600886918,
"grad_norm": 0.49160391092300415,
"learning_rate": 1.860195688620438e-05,
"loss": 0.6274079084396362,
"step": 400
},
{
"epoch": 0.8913525498891353,
"grad_norm": 0.4428112506866455,
"learning_rate": 1.8584234735311497e-05,
"loss": 1.119248390197754,
"step": 402
},
{
"epoch": 0.8957871396895787,
"grad_norm": 0.24119903147220612,
"learning_rate": 1.8566410547745514e-05,
"loss": 1.0662287473678589,
"step": 404
},
{
"epoch": 0.9002217294900222,
"grad_norm": 1.1826022863388062,
"learning_rate": 1.8548484563309243e-05,
"loss": 1.3069649934768677,
"step": 406
},
{
"epoch": 0.9046563192904656,
"grad_norm": 2.6790738105773926,
"learning_rate": 1.853045702317505e-05,
"loss": 1.210648536682129,
"step": 408
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.8086345195770264,
"learning_rate": 1.85123281698816e-05,
"loss": 1.22344172000885,
"step": 410
},
{
"epoch": 0.9135254988913526,
"grad_norm": 0.46482929587364197,
"learning_rate": 1.8494098247330613e-05,
"loss": 1.2734506130218506,
"step": 412
},
{
"epoch": 0.917960088691796,
"grad_norm": 0.6504107117652893,
"learning_rate": 1.847576750078357e-05,
"loss": 1.2879432439804077,
"step": 414
},
{
"epoch": 0.9223946784922394,
"grad_norm": 2.2455458641052246,
"learning_rate": 1.8457336176858425e-05,
"loss": 1.043541431427002,
"step": 416
},
{
"epoch": 0.926829268292683,
"grad_norm": 0.47505319118499756,
"learning_rate": 1.8438804523526258e-05,
"loss": 1.339963674545288,
"step": 418
},
{
"epoch": 0.9312638580931264,
"grad_norm": 1.4694486856460571,
"learning_rate": 1.8420172790107983e-05,
"loss": 0.8636243939399719,
"step": 420
},
{
"epoch": 0.9356984478935698,
"grad_norm": 1.0414270162582397,
"learning_rate": 1.8401441227270953e-05,
"loss": 1.5467491149902344,
"step": 422
},
{
"epoch": 0.9401330376940134,
"grad_norm": 1.4648017883300781,
"learning_rate": 1.838261008702561e-05,
"loss": 1.1460201740264893,
"step": 424
},
{
"epoch": 0.9445676274944568,
"grad_norm": 0.5038813352584839,
"learning_rate": 1.8363679622722096e-05,
"loss": 1.2603991031646729,
"step": 426
},
{
"epoch": 0.9490022172949002,
"grad_norm": 0.6404750347137451,
"learning_rate": 1.8344650089046826e-05,
"loss": 1.1844969987869263,
"step": 428
},
{
"epoch": 0.9534368070953437,
"grad_norm": 2.21321439743042,
"learning_rate": 1.832552174201908e-05,
"loss": 0.8131325840950012,
"step": 430
},
{
"epoch": 0.9578713968957872,
"grad_norm": 0.49369490146636963,
"learning_rate": 1.830629483898755e-05,
"loss": 1.2790230512619019,
"step": 432
},
{
"epoch": 0.9623059866962306,
"grad_norm": 6.766321659088135,
"learning_rate": 1.8286969638626882e-05,
"loss": 1.2089905738830566,
"step": 434
},
{
"epoch": 0.9667405764966741,
"grad_norm": 0.7347844839096069,
"learning_rate": 1.826754640093419e-05,
"loss": 1.3173238039016724,
"step": 436
},
{
"epoch": 0.9711751662971175,
"grad_norm": 0.564915657043457,
"learning_rate": 1.824802538722556e-05,
"loss": 1.2954607009887695,
"step": 438
},
{
"epoch": 0.975609756097561,
"grad_norm": 2.1599206924438477,
"learning_rate": 1.8228406860132545e-05,
"loss": 0.8611724376678467,
"step": 440
},
{
"epoch": 0.9800443458980045,
"grad_norm": 0.5106037259101868,
"learning_rate": 1.8208691083598607e-05,
"loss": 1.1488136053085327,
"step": 442
},
{
"epoch": 0.9844789356984479,
"grad_norm": 0.9815554618835449,
"learning_rate": 1.8188878322875594e-05,
"loss": 1.3558589220046997,
"step": 444
},
{
"epoch": 0.9889135254988913,
"grad_norm": 0.6858358979225159,
"learning_rate": 1.8168968844520157e-05,
"loss": 1.2466365098953247,
"step": 446
},
{
"epoch": 0.9933481152993349,
"grad_norm": 1.2758557796478271,
"learning_rate": 1.8148962916390154e-05,
"loss": 1.2831544876098633,
"step": 448
},
{
"epoch": 0.9977827050997783,
"grad_norm": 0.47892308235168457,
"learning_rate": 1.8128860807641076e-05,
"loss": 1.1054222583770752,
"step": 450
},
{
"epoch": 1.0022172949002217,
"grad_norm": 1.9382197856903076,
"learning_rate": 1.810866278872239e-05,
"loss": 1.0697418451309204,
"step": 452
},
{
"epoch": 1.0066518847006651,
"grad_norm": 1.1436439752578735,
"learning_rate": 1.8088369131373925e-05,
"loss": 1.2170673608779907,
"step": 454
},
{
"epoch": 1.0110864745011086,
"grad_norm": 0.7328348159790039,
"learning_rate": 1.8067980108622217e-05,
"loss": 1.1548501253128052,
"step": 456
},
{
"epoch": 1.0155210643015522,
"grad_norm": 0.8741162419319153,
"learning_rate": 1.8047495994776817e-05,
"loss": 0.7017601132392883,
"step": 458
},
{
"epoch": 1.0199556541019956,
"grad_norm": 0.3321545124053955,
"learning_rate": 1.8026917065426605e-05,
"loss": 0.7321120500564575,
"step": 460
},
{
"epoch": 1.024390243902439,
"grad_norm": 1.3679202795028687,
"learning_rate": 1.800624359743611e-05,
"loss": 0.5792034864425659,
"step": 462
},
{
"epoch": 1.0288248337028825,
"grad_norm": 0.7569698095321655,
"learning_rate": 1.798547586894175e-05,
"loss": 0.7689359188079834,
"step": 464
},
{
"epoch": 1.033259423503326,
"grad_norm": 0.7644620537757874,
"learning_rate": 1.7964614159348103e-05,
"loss": 0.698060154914856,
"step": 466
},
{
"epoch": 1.0376940133037693,
"grad_norm": 1.2388887405395508,
"learning_rate": 1.794365874932415e-05,
"loss": 0.8797460198402405,
"step": 468
},
{
"epoch": 1.042128603104213,
"grad_norm": 0.9471485018730164,
"learning_rate": 1.7922609920799493e-05,
"loss": 0.6286487579345703,
"step": 470
},
{
"epoch": 1.0465631929046564,
"grad_norm": 2.5266878604888916,
"learning_rate": 1.790146795696059e-05,
"loss": 1.0638426542282104,
"step": 472
},
{
"epoch": 1.0509977827050998,
"grad_norm": 0.6257596015930176,
"learning_rate": 1.7880233142246884e-05,
"loss": 1.0050872564315796,
"step": 474
},
{
"epoch": 1.0554323725055432,
"grad_norm": 0.5379915237426758,
"learning_rate": 1.7858905762347044e-05,
"loss": 0.9805111289024353,
"step": 476
},
{
"epoch": 1.0598669623059866,
"grad_norm": 0.8328865170478821,
"learning_rate": 1.783748610419508e-05,
"loss": 1.1784859895706177,
"step": 478
},
{
"epoch": 1.06430155210643,
"grad_norm": 0.44074714183807373,
"learning_rate": 1.7815974455966488e-05,
"loss": 0.6814610958099365,
"step": 480
},
{
"epoch": 1.0687361419068737,
"grad_norm": 1.1742632389068604,
"learning_rate": 1.7794371107074398e-05,
"loss": 1.1012016534805298,
"step": 482
},
{
"epoch": 1.0731707317073171,
"grad_norm": 0.37530067563056946,
"learning_rate": 1.7772676348165637e-05,
"loss": 0.9307145476341248,
"step": 484
},
{
"epoch": 1.0776053215077606,
"grad_norm": 0.42450839281082153,
"learning_rate": 1.7750890471116858e-05,
"loss": 0.963620662689209,
"step": 486
},
{
"epoch": 1.082039911308204,
"grad_norm": 0.47807762026786804,
"learning_rate": 1.7729013769030596e-05,
"loss": 0.7537004351615906,
"step": 488
},
{
"epoch": 1.0864745011086474,
"grad_norm": 0.4078989028930664,
"learning_rate": 1.7707046536231325e-05,
"loss": 0.854632556438446,
"step": 490
},
{
"epoch": 1.0909090909090908,
"grad_norm": 0.6203530430793762,
"learning_rate": 1.76849890682615e-05,
"loss": 0.9603514671325684,
"step": 492
},
{
"epoch": 1.0953436807095343,
"grad_norm": 1.7032476663589478,
"learning_rate": 1.7662841661877574e-05,
"loss": 1.0737708806991577,
"step": 494
},
{
"epoch": 1.099778270509978,
"grad_norm": 1.1234840154647827,
"learning_rate": 1.7640604615046025e-05,
"loss": 0.9386560320854187,
"step": 496
},
{
"epoch": 1.1042128603104213,
"grad_norm": 0.427051842212677,
"learning_rate": 1.7618278226939327e-05,
"loss": 0.9625406265258789,
"step": 498
},
{
"epoch": 1.1086474501108647,
"grad_norm": 0.7077636122703552,
"learning_rate": 1.7595862797931936e-05,
"loss": 0.6286700367927551,
"step": 500
},
{
"epoch": 1.1130820399113082,
"grad_norm": 0.5965766310691833,
"learning_rate": 1.757335862959624e-05,
"loss": 0.9419457316398621,
"step": 502
},
{
"epoch": 1.1175166297117516,
"grad_norm": 0.7379962801933289,
"learning_rate": 1.755076602469851e-05,
"loss": 0.8069853186607361,
"step": 504
},
{
"epoch": 1.1219512195121952,
"grad_norm": 1.0986132621765137,
"learning_rate": 1.7528085287194827e-05,
"loss": 0.8290332555770874,
"step": 506
},
{
"epoch": 1.1263858093126387,
"grad_norm": 1.4528342485427856,
"learning_rate": 1.750531672222698e-05,
"loss": 0.6308746933937073,
"step": 508
},
{
"epoch": 1.130820399113082,
"grad_norm": 0.7668278217315674,
"learning_rate": 1.7482460636118377e-05,
"loss": 1.0766762495040894,
"step": 510
},
{
"epoch": 1.1352549889135255,
"grad_norm": 1.3378920555114746,
"learning_rate": 1.745951733636992e-05,
"loss": 0.5383997559547424,
"step": 512
},
{
"epoch": 1.139689578713969,
"grad_norm": 1.2324367761611938,
"learning_rate": 1.7436487131655855e-05,
"loss": 0.4129646420478821,
"step": 514
},
{
"epoch": 1.1441241685144123,
"grad_norm": 0.6832541823387146,
"learning_rate": 1.7413370331819634e-05,
"loss": 0.8020773530006409,
"step": 516
},
{
"epoch": 1.1485587583148558,
"grad_norm": 1.0301239490509033,
"learning_rate": 1.7390167247869743e-05,
"loss": 0.9460446238517761,
"step": 518
},
{
"epoch": 1.1529933481152994,
"grad_norm": 1.7787998914718628,
"learning_rate": 1.7366878191975516e-05,
"loss": 1.080168604850769,
"step": 520
},
{
"epoch": 1.1574279379157428,
"grad_norm": 1.1747550964355469,
"learning_rate": 1.7343503477462927e-05,
"loss": 0.534135639667511,
"step": 522
},
{
"epoch": 1.1618625277161863,
"grad_norm": 0.5435235500335693,
"learning_rate": 1.7320043418810394e-05,
"loss": 0.9134470820426941,
"step": 524
},
{
"epoch": 1.1662971175166297,
"grad_norm": 0.5852527022361755,
"learning_rate": 1.729649833164453e-05,
"loss": 1.0747884511947632,
"step": 526
},
{
"epoch": 1.170731707317073,
"grad_norm": 0.5314655900001526,
"learning_rate": 1.727286853273591e-05,
"loss": 0.6440135836601257,
"step": 528
},
{
"epoch": 1.1751662971175167,
"grad_norm": 0.5095431208610535,
"learning_rate": 1.7249154339994788e-05,
"loss": 0.8419979810714722,
"step": 530
},
{
"epoch": 1.1796008869179602,
"grad_norm": 0.4051227569580078,
"learning_rate": 1.7225356072466856e-05,
"loss": 0.8316261768341064,
"step": 532
},
{
"epoch": 1.1840354767184036,
"grad_norm": 0.3643783628940582,
"learning_rate": 1.720147405032891e-05,
"loss": 0.9231957197189331,
"step": 534
},
{
"epoch": 1.188470066518847,
"grad_norm": 0.32051339745521545,
"learning_rate": 1.7177508594884576e-05,
"loss": 0.6917131543159485,
"step": 536
},
{
"epoch": 1.1929046563192904,
"grad_norm": 0.6921893358230591,
"learning_rate": 1.7153460028559964e-05,
"loss": 1.00527024269104,
"step": 538
},
{
"epoch": 1.1973392461197339,
"grad_norm": 0.6226311922073364,
"learning_rate": 1.7129328674899354e-05,
"loss": 0.7679756879806519,
"step": 540
},
{
"epoch": 1.2017738359201773,
"grad_norm": 1.1230734586715698,
"learning_rate": 1.7105114858560813e-05,
"loss": 0.6591505408287048,
"step": 542
},
{
"epoch": 1.206208425720621,
"grad_norm": 0.9631316661834717,
"learning_rate": 1.7080818905311853e-05,
"loss": 0.9413385987281799,
"step": 544
},
{
"epoch": 1.2106430155210643,
"grad_norm": 0.3299412727355957,
"learning_rate": 1.7056441142025037e-05,
"loss": 0.7805101275444031,
"step": 546
},
{
"epoch": 1.2150776053215078,
"grad_norm": 0.6347978115081787,
"learning_rate": 1.703198189667358e-05,
"loss": 1.2124230861663818,
"step": 548
},
{
"epoch": 1.2195121951219512,
"grad_norm": 1.2306925058364868,
"learning_rate": 1.7007441498326943e-05,
"loss": 0.6341520547866821,
"step": 550
},
{
"epoch": 1.2239467849223946,
"grad_norm": 0.6283694505691528,
"learning_rate": 1.6982820277146403e-05,
"loss": 0.971120297908783,
"step": 552
},
{
"epoch": 1.2283813747228383,
"grad_norm": 2.13574481010437,
"learning_rate": 1.6958118564380596e-05,
"loss": 0.7344387173652649,
"step": 554
},
{
"epoch": 1.2328159645232817,
"grad_norm": 0.3253254294395447,
"learning_rate": 1.6933336692361097e-05,
"loss": 0.7349171042442322,
"step": 556
},
{
"epoch": 1.237250554323725,
"grad_norm": 2.8170223236083984,
"learning_rate": 1.6908474994497912e-05,
"loss": 0.588421106338501,
"step": 558
},
{
"epoch": 1.2416851441241685,
"grad_norm": 1.3332557678222656,
"learning_rate": 1.688353380527501e-05,
"loss": 1.1083375215530396,
"step": 560
},
{
"epoch": 1.246119733924612,
"grad_norm": 1.18131685256958,
"learning_rate": 1.6858513460245818e-05,
"loss": 0.8837442398071289,
"step": 562
},
{
"epoch": 1.2505543237250554,
"grad_norm": 0.6048891544342041,
"learning_rate": 1.6833414296028717e-05,
"loss": 0.6526999473571777,
"step": 564
},
{
"epoch": 1.2549889135254988,
"grad_norm": 0.5266470909118652,
"learning_rate": 1.680823665030249e-05,
"loss": 0.8695023655891418,
"step": 566
},
{
"epoch": 1.2594235033259422,
"grad_norm": 0.5137091279029846,
"learning_rate": 1.6782980861801804e-05,
"loss": 0.8212327361106873,
"step": 568
},
{
"epoch": 1.2638580931263859,
"grad_norm": 0.5950433015823364,
"learning_rate": 1.6757647270312637e-05,
"loss": 1.1734381914138794,
"step": 570
},
{
"epoch": 1.2682926829268293,
"grad_norm": 0.4560319185256958,
"learning_rate": 1.6732236216667722e-05,
"loss": 0.739474892616272,
"step": 572
},
{
"epoch": 1.2727272727272727,
"grad_norm": 0.6213061809539795,
"learning_rate": 1.6706748042741935e-05,
"loss": 1.2839826345443726,
"step": 574
},
{
"epoch": 1.2771618625277161,
"grad_norm": 0.5989497900009155,
"learning_rate": 1.6681183091447722e-05,
"loss": 0.9160253405570984,
"step": 576
},
{
"epoch": 1.2815964523281598,
"grad_norm": 1.3319306373596191,
"learning_rate": 1.6655541706730476e-05,
"loss": 1.093945860862732,
"step": 578
},
{
"epoch": 1.2860310421286032,
"grad_norm": 0.5771936774253845,
"learning_rate": 1.6629824233563908e-05,
"loss": 1.0052553415298462,
"step": 580
},
{
"epoch": 1.2904656319290466,
"grad_norm": 2.056089401245117,
"learning_rate": 1.6604031017945403e-05,
"loss": 1.3277779817581177,
"step": 582
},
{
"epoch": 1.29490022172949,
"grad_norm": 0.4700315594673157,
"learning_rate": 1.657816240689137e-05,
"loss": 0.7094478607177734,
"step": 584
},
{
"epoch": 1.2993348115299335,
"grad_norm": 0.4772210419178009,
"learning_rate": 1.6552218748432572e-05,
"loss": 0.7443241477012634,
"step": 586
},
{
"epoch": 1.3037694013303769,
"grad_norm": 1.3316142559051514,
"learning_rate": 1.6526200391609445e-05,
"loss": 0.5478697419166565,
"step": 588
},
{
"epoch": 1.3082039911308203,
"grad_norm": 2.8271443843841553,
"learning_rate": 1.6500107686467407e-05,
"loss": 1.0316827297210693,
"step": 590
},
{
"epoch": 1.3126385809312637,
"grad_norm": 0.5958804488182068,
"learning_rate": 1.6473940984052125e-05,
"loss": 0.9526193141937256,
"step": 592
},
{
"epoch": 1.3170731707317074,
"grad_norm": 0.8103643655776978,
"learning_rate": 1.644770063640483e-05,
"loss": 0.956438422203064,
"step": 594
},
{
"epoch": 1.3215077605321508,
"grad_norm": 0.49165335297584534,
"learning_rate": 1.6421386996557546e-05,
"loss": 1.1481645107269287,
"step": 596
},
{
"epoch": 1.3259423503325942,
"grad_norm": 0.7782723903656006,
"learning_rate": 1.6395000418528362e-05,
"loss": 0.9521985650062561,
"step": 598
},
{
"epoch": 1.3303769401330376,
"grad_norm": 0.4783051609992981,
"learning_rate": 1.636854125731666e-05,
"loss": 0.47762957215309143,
"step": 600
},
{
"epoch": 1.3348115299334813,
"grad_norm": 0.8502888679504395,
"learning_rate": 1.6342009868898332e-05,
"loss": 0.7853302955627441,
"step": 602
},
{
"epoch": 1.3392461197339247,
"grad_norm": 0.7362395524978638,
"learning_rate": 1.6315406610221017e-05,
"loss": 0.842612087726593,
"step": 604
},
{
"epoch": 1.3436807095343681,
"grad_norm": 0.31031566858291626,
"learning_rate": 1.6288731839199265e-05,
"loss": 0.8278278708457947,
"step": 606
},
{
"epoch": 1.3481152993348116,
"grad_norm": 0.6640880703926086,
"learning_rate": 1.6261985914709745e-05,
"loss": 1.028430461883545,
"step": 608
},
{
"epoch": 1.352549889135255,
"grad_norm": 1.618883490562439,
"learning_rate": 1.6235169196586408e-05,
"loss": 1.1671243906021118,
"step": 610
},
{
"epoch": 1.3569844789356984,
"grad_norm": 0.8194751739501953,
"learning_rate": 1.6208282045615648e-05,
"loss": 0.717631459236145,
"step": 612
},
{
"epoch": 1.3614190687361418,
"grad_norm": 0.5236591100692749,
"learning_rate": 1.618132482353145e-05,
"loss": 1.0824005603790283,
"step": 614
},
{
"epoch": 1.3658536585365852,
"grad_norm": 0.30997705459594727,
"learning_rate": 1.6154297893010516e-05,
"loss": 0.705600917339325,
"step": 616
},
{
"epoch": 1.370288248337029,
"grad_norm": 0.5286486744880676,
"learning_rate": 1.6127201617667396e-05,
"loss": 0.8719974756240845,
"step": 618
},
{
"epoch": 1.3747228381374723,
"grad_norm": 0.5527012348175049,
"learning_rate": 1.6100036362049576e-05,
"loss": 0.10983101278543472,
"step": 620
},
{
"epoch": 1.3791574279379157,
"grad_norm": 0.4935061037540436,
"learning_rate": 1.6072802491632612e-05,
"loss": 0.9561376571655273,
"step": 622
},
{
"epoch": 1.3835920177383592,
"grad_norm": 0.8581332564353943,
"learning_rate": 1.6045500372815173e-05,
"loss": 0.9489790201187134,
"step": 624
},
{
"epoch": 1.3880266075388026,
"grad_norm": 1.1202986240386963,
"learning_rate": 1.6018130372914123e-05,
"loss": 0.9768886566162109,
"step": 626
},
{
"epoch": 1.3924611973392462,
"grad_norm": 0.5203282833099365,
"learning_rate": 1.5990692860159597e-05,
"loss": 0.8944608569145203,
"step": 628
},
{
"epoch": 1.3968957871396896,
"grad_norm": 0.44260817766189575,
"learning_rate": 1.5963188203690025e-05,
"loss": 1.0010405778884888,
"step": 630
},
{
"epoch": 1.401330376940133,
"grad_norm": 0.5329799652099609,
"learning_rate": 1.5935616773547182e-05,
"loss": 0.8816275000572205,
"step": 632
},
{
"epoch": 1.4057649667405765,
"grad_norm": 0.8102928400039673,
"learning_rate": 1.5907978940671183e-05,
"loss": 0.9644457101821899,
"step": 634
},
{
"epoch": 1.41019955654102,
"grad_norm": 0.551501989364624,
"learning_rate": 1.5880275076895537e-05,
"loss": 0.9486368894577026,
"step": 636
},
{
"epoch": 1.4146341463414633,
"grad_norm": 4.090445041656494,
"learning_rate": 1.58525055549421e-05,
"loss": 0.6854583024978638,
"step": 638
},
{
"epoch": 1.4190687361419068,
"grad_norm": 0.5645637512207031,
"learning_rate": 1.5824670748416085e-05,
"loss": 0.900244414806366,
"step": 640
},
{
"epoch": 1.4235033259423504,
"grad_norm": 0.7116575837135315,
"learning_rate": 1.5796771031801034e-05,
"loss": 0.8295862674713135,
"step": 642
},
{
"epoch": 1.4279379157427938,
"grad_norm": 0.7264999747276306,
"learning_rate": 1.5768806780453766e-05,
"loss": 0.6157872676849365,
"step": 644
},
{
"epoch": 1.4323725055432373,
"grad_norm": 0.608518123626709,
"learning_rate": 1.5740778370599344e-05,
"loss": 1.0620026588439941,
"step": 646
},
{
"epoch": 1.4368070953436807,
"grad_norm": 0.5453920364379883,
"learning_rate": 1.5712686179326004e-05,
"loss": 1.2050490379333496,
"step": 648
},
{
"epoch": 1.441241685144124,
"grad_norm": 0.42610880732536316,
"learning_rate": 1.5684530584580077e-05,
"loss": 1.1291793584823608,
"step": 650
},
{
"epoch": 1.4456762749445677,
"grad_norm": 2.327178716659546,
"learning_rate": 1.565631196516093e-05,
"loss": 0.8947151899337769,
"step": 652
},
{
"epoch": 1.4501108647450112,
"grad_norm": 0.7120440602302551,
"learning_rate": 1.5628030700715824e-05,
"loss": 0.8887991905212402,
"step": 654
},
{
"epoch": 1.4545454545454546,
"grad_norm": 1.0359218120574951,
"learning_rate": 1.5599687171734853e-05,
"loss": 0.7058618664741516,
"step": 656
},
{
"epoch": 1.458980044345898,
"grad_norm": 0.5742671489715576,
"learning_rate": 1.5571281759545793e-05,
"loss": 0.7722383141517639,
"step": 658
},
{
"epoch": 1.4634146341463414,
"grad_norm": 0.6867632865905762,
"learning_rate": 1.5542814846308996e-05,
"loss": 0.9778433442115784,
"step": 660
},
{
"epoch": 1.4678492239467849,
"grad_norm": 0.42144981026649475,
"learning_rate": 1.5514286815012222e-05,
"loss": 0.9305572509765625,
"step": 662
},
{
"epoch": 1.4722838137472283,
"grad_norm": 0.5244068503379822,
"learning_rate": 1.548569804946551e-05,
"loss": 0.7543381452560425,
"step": 664
},
{
"epoch": 1.476718403547672,
"grad_norm": 0.4360713064670563,
"learning_rate": 1.5457048934296e-05,
"loss": 0.4798527956008911,
"step": 666
},
{
"epoch": 1.4811529933481153,
"grad_norm": 0.905125081539154,
"learning_rate": 1.5428339854942757e-05,
"loss": 0.5245689749717712,
"step": 668
},
{
"epoch": 1.4855875831485588,
"grad_norm": 0.6136901378631592,
"learning_rate": 1.539957119765161e-05,
"loss": 0.9089503884315491,
"step": 670
},
{
"epoch": 1.4900221729490022,
"grad_norm": 0.4613928496837616,
"learning_rate": 1.537074334946992e-05,
"loss": 0.9715514779090881,
"step": 672
},
{
"epoch": 1.4944567627494456,
"grad_norm": 0.6848336458206177,
"learning_rate": 1.5341856698241397e-05,
"loss": 0.6604840755462646,
"step": 674
},
{
"epoch": 1.4988913525498893,
"grad_norm": 0.7074861526489258,
"learning_rate": 1.531291163260087e-05,
"loss": 0.6721962094306946,
"step": 676
},
{
"epoch": 1.5033259423503327,
"grad_norm": 0.7671158909797668,
"learning_rate": 1.5283908541969064e-05,
"loss": 1.0287514925003052,
"step": 678
},
{
"epoch": 1.507760532150776,
"grad_norm": 0.46018627285957336,
"learning_rate": 1.5254847816547366e-05,
"loss": 0.5789790153503418,
"step": 680
},
{
"epoch": 1.5121951219512195,
"grad_norm": 0.5391964316368103,
"learning_rate": 1.522572984731256e-05,
"loss": 0.5692949295043945,
"step": 682
},
{
"epoch": 1.516629711751663,
"grad_norm": 0.523459792137146,
"learning_rate": 1.5196555026011585e-05,
"loss": 0.934548556804657,
"step": 684
},
{
"epoch": 1.5210643015521064,
"grad_norm": 0.328876793384552,
"learning_rate": 1.5167323745156248e-05,
"loss": 0.9151366949081421,
"step": 686
},
{
"epoch": 1.5254988913525498,
"grad_norm": 0.5242407321929932,
"learning_rate": 1.5138036398017953e-05,
"loss": 0.5513712763786316,
"step": 688
},
{
"epoch": 1.5299334811529932,
"grad_norm": 0.38611844182014465,
"learning_rate": 1.510869337862241e-05,
"loss": 0.281048059463501,
"step": 690
},
{
"epoch": 1.5343680709534369,
"grad_norm": 1.463240146636963,
"learning_rate": 1.507929508174433e-05,
"loss": 0.8684556484222412,
"step": 692
},
{
"epoch": 1.5388026607538803,
"grad_norm": 1.3095505237579346,
"learning_rate": 1.5049841902902119e-05,
"loss": 0.8829594254493713,
"step": 694
},
{
"epoch": 1.5432372505543237,
"grad_norm": 1.3540315628051758,
"learning_rate": 1.5020334238352546e-05,
"loss": 0.5511650443077087,
"step": 696
},
{
"epoch": 1.5476718403547673,
"grad_norm": 0.36952298879623413,
"learning_rate": 1.499077248508542e-05,
"loss": 1.02639639377594,
"step": 698
},
{
"epoch": 1.5521064301552108,
"grad_norm": 1.0932236909866333,
"learning_rate": 1.496115704081826e-05,
"loss": 1.0058600902557373,
"step": 700
},
{
"epoch": 1.5565410199556542,
"grad_norm": 0.49011874198913574,
"learning_rate": 1.4931488303990916e-05,
"loss": 1.0263029336929321,
"step": 702
},
{
"epoch": 1.5609756097560976,
"grad_norm": 1.3680771589279175,
"learning_rate": 1.4901766673760232e-05,
"loss": 0.824455738067627,
"step": 704
},
{
"epoch": 1.565410199556541,
"grad_norm": 0.5223835110664368,
"learning_rate": 1.4871992549994673e-05,
"loss": 0.4502509832382202,
"step": 706
},
{
"epoch": 1.5698447893569845,
"grad_norm": 0.5144345164299011,
"learning_rate": 1.4842166333268932e-05,
"loss": 1.0360265970230103,
"step": 708
},
{
"epoch": 1.5742793791574279,
"grad_norm": 1.030713438987732,
"learning_rate": 1.481228842485856e-05,
"loss": 0.8033937215805054,
"step": 710
},
{
"epoch": 1.5787139689578713,
"grad_norm": 0.8714462518692017,
"learning_rate": 1.4782359226734544e-05,
"loss": 0.6804985404014587,
"step": 712
},
{
"epoch": 1.5831485587583147,
"grad_norm": 0.4418451488018036,
"learning_rate": 1.475237914155792e-05,
"loss": 0.9747523665428162,
"step": 714
},
{
"epoch": 1.5875831485587582,
"grad_norm": 0.4844651520252228,
"learning_rate": 1.472234857267435e-05,
"loss": 0.9988541603088379,
"step": 716
},
{
"epoch": 1.5920177383592018,
"grad_norm": 1.146903395652771,
"learning_rate": 1.4692267924108683e-05,
"loss": 1.0589611530303955,
"step": 718
},
{
"epoch": 1.5964523281596452,
"grad_norm": 1.1565581560134888,
"learning_rate": 1.466213760055954e-05,
"loss": 0.5897700786590576,
"step": 720
},
{
"epoch": 1.6008869179600886,
"grad_norm": 0.23559361696243286,
"learning_rate": 1.4631958007393854e-05,
"loss": 0.4846925735473633,
"step": 722
},
{
"epoch": 1.6053215077605323,
"grad_norm": 0.4940757751464844,
"learning_rate": 1.4601729550641417e-05,
"loss": 1.0242489576339722,
"step": 724
},
{
"epoch": 1.6097560975609757,
"grad_norm": 1.7630901336669922,
"learning_rate": 1.4571452636989433e-05,
"loss": 1.0372512340545654,
"step": 726
},
{
"epoch": 1.6141906873614191,
"grad_norm": 0.36424028873443604,
"learning_rate": 1.4541127673777021e-05,
"loss": 0.7359429001808167,
"step": 728
},
{
"epoch": 1.6186252771618626,
"grad_norm": 0.4631586968898773,
"learning_rate": 1.451075506898975e-05,
"loss": 0.9926391839981079,
"step": 730
},
{
"epoch": 1.623059866962306,
"grad_norm": 0.43977200984954834,
"learning_rate": 1.4480335231254164e-05,
"loss": 0.9845470786094666,
"step": 732
},
{
"epoch": 1.6274944567627494,
"grad_norm": 0.5064222812652588,
"learning_rate": 1.4449868569832253e-05,
"loss": 0.9982655048370361,
"step": 734
},
{
"epoch": 1.6319290465631928,
"grad_norm": 0.2603287994861603,
"learning_rate": 1.4419355494615963e-05,
"loss": 0.45653244853019714,
"step": 736
},
{
"epoch": 1.6363636363636362,
"grad_norm": 0.5068104863166809,
"learning_rate": 1.4388796416121696e-05,
"loss": 1.2514511346817017,
"step": 738
},
{
"epoch": 1.6407982261640797,
"grad_norm": 0.39673784375190735,
"learning_rate": 1.4358191745484755e-05,
"loss": 0.9661815166473389,
"step": 740
},
{
"epoch": 1.6452328159645233,
"grad_norm": 0.9892500638961792,
"learning_rate": 1.432754189445384e-05,
"loss": 1.1122088432312012,
"step": 742
},
{
"epoch": 1.6496674057649667,
"grad_norm": 0.6944724917411804,
"learning_rate": 1.4296847275385495e-05,
"loss": 0.7954747080802917,
"step": 744
},
{
"epoch": 1.6541019955654102,
"grad_norm": 1.078669548034668,
"learning_rate": 1.4266108301238564e-05,
"loss": 0.856575071811676,
"step": 746
},
{
"epoch": 1.6585365853658538,
"grad_norm": 0.7615432143211365,
"learning_rate": 1.4235325385568636e-05,
"loss": 0.6531709432601929,
"step": 748
},
{
"epoch": 1.6629711751662972,
"grad_norm": 0.47316062450408936,
"learning_rate": 1.4204498942522482e-05,
"loss": 0.971373975276947,
"step": 750
},
{
"epoch": 1.6674057649667406,
"grad_norm": 0.4431406259536743,
"learning_rate": 1.4173629386832473e-05,
"loss": 0.7244459390640259,
"step": 752
},
{
"epoch": 1.671840354767184,
"grad_norm": 0.5017882585525513,
"learning_rate": 1.4142717133811013e-05,
"loss": 0.5894262790679932,
"step": 754
},
{
"epoch": 1.6762749445676275,
"grad_norm": 0.7016173005104065,
"learning_rate": 1.4111762599344952e-05,
"loss": 1.006710171699524,
"step": 756
},
{
"epoch": 1.680709534368071,
"grad_norm": 0.8765194416046143,
"learning_rate": 1.4080766199889976e-05,
"loss": 0.9072303771972656,
"step": 758
},
{
"epoch": 1.6851441241685143,
"grad_norm": 1.2686158418655396,
"learning_rate": 1.404972835246502e-05,
"loss": 0.8974109292030334,
"step": 760
},
{
"epoch": 1.6895787139689578,
"grad_norm": 0.8306912183761597,
"learning_rate": 1.401864947464665e-05,
"loss": 0.8825592994689941,
"step": 762
},
{
"epoch": 1.6940133037694012,
"grad_norm": 1.107991337776184,
"learning_rate": 1.3987529984563444e-05,
"loss": 0.9357943534851074,
"step": 764
},
{
"epoch": 1.6984478935698448,
"grad_norm": 1.4103295803070068,
"learning_rate": 1.3956370300890374e-05,
"loss": 1.0407212972640991,
"step": 766
},
{
"epoch": 1.7028824833702882,
"grad_norm": 1.0025876760482788,
"learning_rate": 1.392517084284316e-05,
"loss": 0.6954239010810852,
"step": 768
},
{
"epoch": 1.7073170731707317,
"grad_norm": 0.5951728224754333,
"learning_rate": 1.3893932030172642e-05,
"loss": 0.9474072456359863,
"step": 770
},
{
"epoch": 1.7117516629711753,
"grad_norm": 1.6196831464767456,
"learning_rate": 1.386265428315913e-05,
"loss": 0.9979518055915833,
"step": 772
},
{
"epoch": 1.7161862527716187,
"grad_norm": 0.4795306622982025,
"learning_rate": 1.3831338022606748e-05,
"loss": 0.8625308275222778,
"step": 774
},
{
"epoch": 1.7206208425720622,
"grad_norm": 0.9456951022148132,
"learning_rate": 1.3799983669837768e-05,
"loss": 0.9803452491760254,
"step": 776
},
{
"epoch": 1.7250554323725056,
"grad_norm": 0.46205422282218933,
"learning_rate": 1.3768591646686957e-05,
"loss": 1.0163923501968384,
"step": 778
},
{
"epoch": 1.729490022172949,
"grad_norm": 0.6149927377700806,
"learning_rate": 1.3737162375495883e-05,
"loss": 0.5648257732391357,
"step": 780
},
{
"epoch": 1.7339246119733924,
"grad_norm": 0.35180729627609253,
"learning_rate": 1.3705696279107238e-05,
"loss": 0.9397526979446411,
"step": 782
},
{
"epoch": 1.7383592017738358,
"grad_norm": 0.3703164756298065,
"learning_rate": 1.3674193780859163e-05,
"loss": 0.6409098505973816,
"step": 784
},
{
"epoch": 1.7427937915742793,
"grad_norm": 0.4282858371734619,
"learning_rate": 1.3642655304579535e-05,
"loss": 0.7513792514801025,
"step": 786
},
{
"epoch": 1.7472283813747227,
"grad_norm": 0.3827633857727051,
"learning_rate": 1.3611081274580269e-05,
"loss": 0.6845064759254456,
"step": 788
},
{
"epoch": 1.7516629711751663,
"grad_norm": 1.2396421432495117,
"learning_rate": 1.3579472115651623e-05,
"loss": 0.6268539428710938,
"step": 790
},
{
"epoch": 1.7560975609756098,
"grad_norm": 0.40521490573883057,
"learning_rate": 1.354782825305646e-05,
"loss": 0.6478447914123535,
"step": 792
},
{
"epoch": 1.7605321507760532,
"grad_norm": 0.32460105419158936,
"learning_rate": 1.3516150112524542e-05,
"loss": 0.8190337419509888,
"step": 794
},
{
"epoch": 1.7649667405764968,
"grad_norm": 0.8050366640090942,
"learning_rate": 1.3484438120246806e-05,
"loss": 0.8022271394729614,
"step": 796
},
{
"epoch": 1.7694013303769403,
"grad_norm": 0.4470427930355072,
"learning_rate": 1.3452692702869619e-05,
"loss": 0.9513342380523682,
"step": 798
},
{
"epoch": 1.7738359201773837,
"grad_norm": 0.43522873520851135,
"learning_rate": 1.3420914287489037e-05,
"loss": 0.9605931043624878,
"step": 800
},
{
"epoch": 1.778270509977827,
"grad_norm": 0.6569511890411377,
"learning_rate": 1.3389103301645065e-05,
"loss": 0.9895227551460266,
"step": 802
},
{
"epoch": 1.7827050997782705,
"grad_norm": 0.5629826188087463,
"learning_rate": 1.3357260173315918e-05,
"loss": 1.1033282279968262,
"step": 804
},
{
"epoch": 1.787139689578714,
"grad_norm": 0.4114173352718353,
"learning_rate": 1.332538533091223e-05,
"loss": 0.74909508228302,
"step": 806
},
{
"epoch": 1.7915742793791574,
"grad_norm": 0.39374831318855286,
"learning_rate": 1.3293479203271322e-05,
"loss": 0.9650196433067322,
"step": 808
},
{
"epoch": 1.7960088691796008,
"grad_norm": 1.316881537437439,
"learning_rate": 1.3261542219651415e-05,
"loss": 0.5823323130607605,
"step": 810
},
{
"epoch": 1.8004434589800442,
"grad_norm": 0.8751013278961182,
"learning_rate": 1.3229574809725859e-05,
"loss": 0.5940043926239014,
"step": 812
},
{
"epoch": 1.8048780487804879,
"grad_norm": 1.2625625133514404,
"learning_rate": 1.3197577403577355e-05,
"loss": 0.9879517555236816,
"step": 814
},
{
"epoch": 1.8093126385809313,
"grad_norm": 2.798226833343506,
"learning_rate": 1.3165550431692164e-05,
"loss": 0.8953067064285278,
"step": 816
},
{
"epoch": 1.8137472283813747,
"grad_norm": 0.4607974588871002,
"learning_rate": 1.3133494324954328e-05,
"loss": 0.4630458652973175,
"step": 818
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.7473888993263245,
"learning_rate": 1.3101409514639847e-05,
"loss": 1.0197738409042358,
"step": 820
},
{
"epoch": 1.8226164079822618,
"grad_norm": 0.7188895344734192,
"learning_rate": 1.3069296432410905e-05,
"loss": 1.0835227966308594,
"step": 822
},
{
"epoch": 1.8270509977827052,
"grad_norm": 0.7948015928268433,
"learning_rate": 1.3037155510310047e-05,
"loss": 1.1620758771896362,
"step": 824
},
{
"epoch": 1.8314855875831486,
"grad_norm": 2.9718968868255615,
"learning_rate": 1.3004987180754367e-05,
"loss": 0.9052017331123352,
"step": 826
},
{
"epoch": 1.835920177383592,
"grad_norm": 2.999119281768799,
"learning_rate": 1.29727918765297e-05,
"loss": 0.8258069753646851,
"step": 828
},
{
"epoch": 1.8403547671840355,
"grad_norm": 0.6131216287612915,
"learning_rate": 1.2940570030784783e-05,
"loss": 0.9284101128578186,
"step": 830
},
{
"epoch": 1.8447893569844789,
"grad_norm": 1.4488681554794312,
"learning_rate": 1.290832207702544e-05,
"loss": 0.9328111410140991,
"step": 832
},
{
"epoch": 1.8492239467849223,
"grad_norm": 0.4498242139816284,
"learning_rate": 1.2876048449108756e-05,
"loss": 0.9122157096862793,
"step": 834
},
{
"epoch": 1.8536585365853657,
"grad_norm": 0.4527730643749237,
"learning_rate": 1.2843749581237216e-05,
"loss": 0.951221227645874,
"step": 836
},
{
"epoch": 1.8580931263858091,
"grad_norm": 0.5404245257377625,
"learning_rate": 1.2811425907952887e-05,
"loss": 0.904753565788269,
"step": 838
},
{
"epoch": 1.8625277161862528,
"grad_norm": 0.6924629807472229,
"learning_rate": 1.2779077864131566e-05,
"loss": 1.0605340003967285,
"step": 840
},
{
"epoch": 1.8669623059866962,
"grad_norm": 0.4970324635505676,
"learning_rate": 1.274670588497691e-05,
"loss": 0.5903202295303345,
"step": 842
},
{
"epoch": 1.8713968957871396,
"grad_norm": 0.793752133846283,
"learning_rate": 1.2714310406014613e-05,
"loss": 0.7120020389556885,
"step": 844
},
{
"epoch": 1.8758314855875833,
"grad_norm": 0.8532220721244812,
"learning_rate": 1.2681891863086526e-05,
"loss": 0.7570974230766296,
"step": 846
},
{
"epoch": 1.8802660753880267,
"grad_norm": 0.6667500734329224,
"learning_rate": 1.2649450692344798e-05,
"loss": 1.010290265083313,
"step": 848
},
{
"epoch": 1.8847006651884701,
"grad_norm": 0.5184866786003113,
"learning_rate": 1.2616987330246e-05,
"loss": 0.9949779510498047,
"step": 850
},
{
"epoch": 1.8891352549889135,
"grad_norm": 0.41842129826545715,
"learning_rate": 1.2584502213545273e-05,
"loss": 0.6566750407218933,
"step": 852
},
{
"epoch": 1.893569844789357,
"grad_norm": 0.47411566972732544,
"learning_rate": 1.2551995779290431e-05,
"loss": 0.9789588451385498,
"step": 854
},
{
"epoch": 1.8980044345898004,
"grad_norm": 0.41428887844085693,
"learning_rate": 1.2519468464816094e-05,
"loss": 0.8622305989265442,
"step": 856
},
{
"epoch": 1.9024390243902438,
"grad_norm": 0.5540589094161987,
"learning_rate": 1.2486920707737795e-05,
"loss": 0.7378232479095459,
"step": 858
},
{
"epoch": 1.9068736141906872,
"grad_norm": 0.9826019406318665,
"learning_rate": 1.2454352945946105e-05,
"loss": 0.7468891143798828,
"step": 860
},
{
"epoch": 1.9113082039911307,
"grad_norm": 1.3631356954574585,
"learning_rate": 1.2421765617600732e-05,
"loss": 0.9804845452308655,
"step": 862
},
{
"epoch": 1.9157427937915743,
"grad_norm": 0.6108648777008057,
"learning_rate": 1.238915916112462e-05,
"loss": 0.7339483499526978,
"step": 864
},
{
"epoch": 1.9201773835920177,
"grad_norm": 1.0804190635681152,
"learning_rate": 1.2356534015198067e-05,
"loss": 0.6702901721000671,
"step": 866
},
{
"epoch": 1.9246119733924612,
"grad_norm": 0.8905138373374939,
"learning_rate": 1.2323890618752818e-05,
"loss": 1.140580415725708,
"step": 868
},
{
"epoch": 1.9290465631929048,
"grad_norm": 0.4676206409931183,
"learning_rate": 1.229122941096615e-05,
"loss": 0.9294151663780212,
"step": 870
},
{
"epoch": 1.9334811529933482,
"grad_norm": 0.30312380194664,
"learning_rate": 1.225855083125497e-05,
"loss": 0.6089338660240173,
"step": 872
},
{
"epoch": 1.9379157427937916,
"grad_norm": 0.8847364783287048,
"learning_rate": 1.22258553192699e-05,
"loss": 0.645588219165802,
"step": 874
},
{
"epoch": 1.942350332594235,
"grad_norm": 0.6345183253288269,
"learning_rate": 1.219314331488938e-05,
"loss": 0.6743212938308716,
"step": 876
},
{
"epoch": 1.9467849223946785,
"grad_norm": 1.4533907175064087,
"learning_rate": 1.2160415258213719e-05,
"loss": 0.8229029774665833,
"step": 878
},
{
"epoch": 1.951219512195122,
"grad_norm": 0.656122624874115,
"learning_rate": 1.2127671589559195e-05,
"loss": 0.8455672860145569,
"step": 880
},
{
"epoch": 1.9556541019955653,
"grad_norm": 1.9663106203079224,
"learning_rate": 1.2094912749452134e-05,
"loss": 0.6619812846183777,
"step": 882
},
{
"epoch": 1.9600886917960088,
"grad_norm": 0.43535202741622925,
"learning_rate": 1.2062139178622963e-05,
"loss": 0.81618332862854,
"step": 884
},
{
"epoch": 1.9645232815964522,
"grad_norm": 0.42277711629867554,
"learning_rate": 1.20293513180003e-05,
"loss": 0.9992027878761292,
"step": 886
},
{
"epoch": 1.9689578713968958,
"grad_norm": 0.40196138620376587,
"learning_rate": 1.199654960870502e-05,
"loss": 0.9606343507766724,
"step": 888
},
{
"epoch": 1.9733924611973392,
"grad_norm": 0.42394229769706726,
"learning_rate": 1.1963734492044299e-05,
"loss": 0.9592314958572388,
"step": 890
},
{
"epoch": 1.9778270509977827,
"grad_norm": 0.549923300743103,
"learning_rate": 1.193090640950571e-05,
"loss": 1.0462260246276855,
"step": 892
},
{
"epoch": 1.9822616407982263,
"grad_norm": 0.6976901292800903,
"learning_rate": 1.1898065802751254e-05,
"loss": 0.9654414653778076,
"step": 894
},
{
"epoch": 1.9866962305986697,
"grad_norm": 1.4644861221313477,
"learning_rate": 1.1865213113611438e-05,
"loss": 0.8772508502006531,
"step": 896
},
{
"epoch": 1.9911308203991132,
"grad_norm": 0.6265084147453308,
"learning_rate": 1.1832348784079319e-05,
"loss": 0.9136525988578796,
"step": 898
},
{
"epoch": 1.9955654101995566,
"grad_norm": 0.4937969148159027,
"learning_rate": 1.1799473256304567e-05,
"loss": 0.7895318269729614,
"step": 900
},
{
"epoch": 2.0,
"grad_norm": 0.5102665424346924,
"learning_rate": 1.17665869725875e-05,
"loss": 0.9466162919998169,
"step": 902
},
{
"epoch": 2.0044345898004434,
"grad_norm": 0.4070099890232086,
"learning_rate": 1.1733690375373147e-05,
"loss": 0.715006411075592,
"step": 904
},
{
"epoch": 2.008869179600887,
"grad_norm": 0.5904584527015686,
"learning_rate": 1.1700783907245304e-05,
"loss": 0.6284165978431702,
"step": 906
},
{
"epoch": 2.0133037694013303,
"grad_norm": 0.4084486961364746,
"learning_rate": 1.1667868010920555e-05,
"loss": 0.4244351387023926,
"step": 908
},
{
"epoch": 2.0177383592017737,
"grad_norm": 0.8332369923591614,
"learning_rate": 1.1634943129242337e-05,
"loss": 0.5955982208251953,
"step": 910
},
{
"epoch": 2.022172949002217,
"grad_norm": 0.8778854012489319,
"learning_rate": 1.160200970517497e-05,
"loss": 0.50541752576828,
"step": 912
},
{
"epoch": 2.0266075388026605,
"grad_norm": 4.370595932006836,
"learning_rate": 1.1569068181797699e-05,
"loss": 0.5145138502120972,
"step": 914
},
{
"epoch": 2.0310421286031044,
"grad_norm": 1.4196687936782837,
"learning_rate": 1.1536119002298737e-05,
"loss": 0.47636979818344116,
"step": 916
},
{
"epoch": 2.035476718403548,
"grad_norm": 0.7198065519332886,
"learning_rate": 1.1503162609969314e-05,
"loss": 0.5563622713088989,
"step": 918
},
{
"epoch": 2.0399113082039912,
"grad_norm": 0.43456801772117615,
"learning_rate": 1.1470199448197677e-05,
"loss": 0.5351572632789612,
"step": 920
},
{
"epoch": 2.0443458980044347,
"grad_norm": 0.5137150287628174,
"learning_rate": 1.1437229960463163e-05,
"loss": 0.5629701614379883,
"step": 922
},
{
"epoch": 2.048780487804878,
"grad_norm": 0.3429313004016876,
"learning_rate": 1.1404254590330213e-05,
"loss": 0.15150287747383118,
"step": 924
},
{
"epoch": 2.0532150776053215,
"grad_norm": 0.5494690537452698,
"learning_rate": 1.137127378144241e-05,
"loss": 0.5665069222450256,
"step": 926
},
{
"epoch": 2.057649667405765,
"grad_norm": 1.4760738611221313,
"learning_rate": 1.1338287977516507e-05,
"loss": 0.23657920956611633,
"step": 928
},
{
"epoch": 2.0620842572062084,
"grad_norm": 0.3918812870979309,
"learning_rate": 1.1305297622336457e-05,
"loss": 0.3985291123390198,
"step": 930
},
{
"epoch": 2.066518847006652,
"grad_norm": 3.808762788772583,
"learning_rate": 1.1272303159747451e-05,
"loss": 0.46506452560424805,
"step": 932
},
{
"epoch": 2.070953436807095,
"grad_norm": 0.577021062374115,
"learning_rate": 1.1239305033649934e-05,
"loss": 0.5112553834915161,
"step": 934
},
{
"epoch": 2.0753880266075386,
"grad_norm": 0.7988712787628174,
"learning_rate": 1.1206303687993644e-05,
"loss": 0.7404617071151733,
"step": 936
},
{
"epoch": 2.079822616407982,
"grad_norm": 0.4242592751979828,
"learning_rate": 1.1173299566771626e-05,
"loss": 0.33282893896102905,
"step": 938
},
{
"epoch": 2.084257206208426,
"grad_norm": 0.46631020307540894,
"learning_rate": 1.1140293114014282e-05,
"loss": 0.4563349485397339,
"step": 940
},
{
"epoch": 2.0886917960088693,
"grad_norm": 1.1207689046859741,
"learning_rate": 1.1107284773783367e-05,
"loss": 0.5358268022537231,
"step": 942
},
{
"epoch": 2.0931263858093128,
"grad_norm": 0.6466286182403564,
"learning_rate": 1.1074274990166036e-05,
"loss": 0.406946063041687,
"step": 944
},
{
"epoch": 2.097560975609756,
"grad_norm": 0.6163548827171326,
"learning_rate": 1.1041264207268861e-05,
"loss": 0.5453028678894043,
"step": 946
},
{
"epoch": 2.1019955654101996,
"grad_norm": 0.7833722233772278,
"learning_rate": 1.1008252869211864e-05,
"loss": 0.5683756470680237,
"step": 948
},
{
"epoch": 2.106430155210643,
"grad_norm": 0.8931224942207336,
"learning_rate": 1.0975241420122524e-05,
"loss": 0.4366806149482727,
"step": 950
},
{
"epoch": 2.1108647450110865,
"grad_norm": 0.5928601026535034,
"learning_rate": 1.0942230304129831e-05,
"loss": 0.4392179846763611,
"step": 952
},
{
"epoch": 2.11529933481153,
"grad_norm": 1.2183982133865356,
"learning_rate": 1.0909219965358275e-05,
"loss": 0.49065983295440674,
"step": 954
},
{
"epoch": 2.1197339246119733,
"grad_norm": 0.6264125108718872,
"learning_rate": 1.0876210847921905e-05,
"loss": 0.5899641513824463,
"step": 956
},
{
"epoch": 2.1241685144124167,
"grad_norm": 0.6409426927566528,
"learning_rate": 1.0843203395918327e-05,
"loss": 0.4045730531215668,
"step": 958
},
{
"epoch": 2.12860310421286,
"grad_norm": 1.60128653049469,
"learning_rate": 1.0810198053422747e-05,
"loss": 0.22457213699817657,
"step": 960
},
{
"epoch": 2.1330376940133036,
"grad_norm": 1.197357177734375,
"learning_rate": 1.0777195264481988e-05,
"loss": 0.3387850224971771,
"step": 962
},
{
"epoch": 2.1374722838137474,
"grad_norm": 0.8524078130722046,
"learning_rate": 1.0744195473108522e-05,
"loss": 0.44860363006591797,
"step": 964
},
{
"epoch": 2.141906873614191,
"grad_norm": 0.548141598701477,
"learning_rate": 1.071119912327448e-05,
"loss": 0.7017822861671448,
"step": 966
},
{
"epoch": 2.1463414634146343,
"grad_norm": 0.4515199363231659,
"learning_rate": 1.0678206658905712e-05,
"loss": 0.3781665563583374,
"step": 968
},
{
"epoch": 2.1507760532150777,
"grad_norm": 0.6646062731742859,
"learning_rate": 1.0645218523875773e-05,
"loss": 0.51128089427948,
"step": 970
},
{
"epoch": 2.155210643015521,
"grad_norm": 0.5504773855209351,
"learning_rate": 1.0612235161999987e-05,
"loss": 0.3802485764026642,
"step": 972
},
{
"epoch": 2.1596452328159645,
"grad_norm": 0.527137279510498,
"learning_rate": 1.057925701702945e-05,
"loss": 0.6255434155464172,
"step": 974
},
{
"epoch": 2.164079822616408,
"grad_norm": 0.8251080513000488,
"learning_rate": 1.0546284532645077e-05,
"loss": 0.49471452832221985,
"step": 976
},
{
"epoch": 2.1685144124168514,
"grad_norm": 1.3507685661315918,
"learning_rate": 1.0513318152451627e-05,
"loss": 0.3210045397281647,
"step": 978
},
{
"epoch": 2.172949002217295,
"grad_norm": 0.6633515357971191,
"learning_rate": 1.0480358319971731e-05,
"loss": 0.6007053852081299,
"step": 980
},
{
"epoch": 2.1773835920177382,
"grad_norm": 0.36952632665634155,
"learning_rate": 1.0447405478639929e-05,
"loss": 0.2838934361934662,
"step": 982
},
{
"epoch": 2.1818181818181817,
"grad_norm": 0.5688261985778809,
"learning_rate": 1.0414460071796712e-05,
"loss": 0.18350011110305786,
"step": 984
},
{
"epoch": 2.186252771618625,
"grad_norm": 1.1831949949264526,
"learning_rate": 1.0381522542682536e-05,
"loss": 0.40068039298057556,
"step": 986
},
{
"epoch": 2.1906873614190685,
"grad_norm": 1.4388840198516846,
"learning_rate": 1.0348593334431878e-05,
"loss": 0.23880073428153992,
"step": 988
},
{
"epoch": 2.1951219512195124,
"grad_norm": 0.6307854652404785,
"learning_rate": 1.0315672890067271e-05,
"loss": 0.5894753932952881,
"step": 990
},
{
"epoch": 2.199556541019956,
"grad_norm": 2.421830415725708,
"learning_rate": 1.0282761652493334e-05,
"loss": 0.4432171583175659,
"step": 992
},
{
"epoch": 2.203991130820399,
"grad_norm": 0.5128687620162964,
"learning_rate": 1.024986006449083e-05,
"loss": 0.48450496792793274,
"step": 994
},
{
"epoch": 2.2084257206208426,
"grad_norm": 0.5676178932189941,
"learning_rate": 1.0216968568710679e-05,
"loss": 0.5746522545814514,
"step": 996
},
{
"epoch": 2.212860310421286,
"grad_norm": 0.5976463556289673,
"learning_rate": 1.0184087607668039e-05,
"loss": 0.5264995694160461,
"step": 998
},
{
"epoch": 2.2172949002217295,
"grad_norm": 0.7051799297332764,
"learning_rate": 1.0151217623736338e-05,
"loss": 0.46825850009918213,
"step": 1000
},
{
"epoch": 2.221729490022173,
"grad_norm": 0.8515892624855042,
"learning_rate": 1.0118359059141313e-05,
"loss": 0.27047228813171387,
"step": 1002
},
{
"epoch": 2.2261640798226163,
"grad_norm": 0.4068000316619873,
"learning_rate": 1.0085512355955067e-05,
"loss": 0.5676589608192444,
"step": 1004
},
{
"epoch": 2.2305986696230597,
"grad_norm": 0.8601819276809692,
"learning_rate": 1.0052677956090125e-05,
"loss": 0.46005040407180786,
"step": 1006
},
{
"epoch": 2.235033259423503,
"grad_norm": 0.7253012657165527,
"learning_rate": 1.0019856301293482e-05,
"loss": 0.5689443945884705,
"step": 1008
},
{
"epoch": 2.2394678492239466,
"grad_norm": 0.46540704369544983,
"learning_rate": 9.987047833140668e-06,
"loss": 0.3451939523220062,
"step": 1010
},
{
"epoch": 2.2439024390243905,
"grad_norm": 2.0232350826263428,
"learning_rate": 9.954252993029803e-06,
"loss": 0.5826783776283264,
"step": 1012
},
{
"epoch": 2.248337028824834,
"grad_norm": 3.809951066970825,
"learning_rate": 9.921472222175654e-06,
"loss": 0.5647210478782654,
"step": 1014
},
{
"epoch": 2.2527716186252773,
"grad_norm": 1.0120117664337158,
"learning_rate": 9.888705961603709e-06,
"loss": 0.6450280547142029,
"step": 1016
},
{
"epoch": 2.2572062084257207,
"grad_norm": 0.6231004595756531,
"learning_rate": 9.85595465214423e-06,
"loss": 0.24749194085597992,
"step": 1018
},
{
"epoch": 2.261640798226164,
"grad_norm": 0.5251925587654114,
"learning_rate": 9.823218734426336e-06,
"loss": 0.5488971471786499,
"step": 1020
},
{
"epoch": 2.2660753880266076,
"grad_norm": 0.22870703041553497,
"learning_rate": 9.79049864887207e-06,
"loss": 0.39323848485946655,
"step": 1022
},
{
"epoch": 2.270509977827051,
"grad_norm": 1.2425155639648438,
"learning_rate": 9.757794835690463e-06,
"loss": 0.8195447325706482,
"step": 1024
},
{
"epoch": 2.2749445676274944,
"grad_norm": 1.2200350761413574,
"learning_rate": 9.72510773487164e-06,
"loss": 0.39812397956848145,
"step": 1026
},
{
"epoch": 2.279379157427938,
"grad_norm": 0.5721977353096008,
"learning_rate": 9.692437786180852e-06,
"loss": 0.5707634687423706,
"step": 1028
},
{
"epoch": 2.2838137472283813,
"grad_norm": 0.47224897146224976,
"learning_rate": 9.659785429152615e-06,
"loss": 0.6199125051498413,
"step": 1030
},
{
"epoch": 2.2882483370288247,
"grad_norm": 1.0250192880630493,
"learning_rate": 9.627151103084763e-06,
"loss": 0.41856324672698975,
"step": 1032
},
{
"epoch": 2.292682926829268,
"grad_norm": 0.947811484336853,
"learning_rate": 9.594535247032543e-06,
"loss": 0.32791462540626526,
"step": 1034
},
{
"epoch": 2.2971175166297115,
"grad_norm": 0.6266341805458069,
"learning_rate": 9.561938299802709e-06,
"loss": 0.5352550745010376,
"step": 1036
},
{
"epoch": 2.3015521064301554,
"grad_norm": 4.217014789581299,
"learning_rate": 9.529360699947624e-06,
"loss": 0.6385710835456848,
"step": 1038
},
{
"epoch": 2.305986696230599,
"grad_norm": 0.8212743401527405,
"learning_rate": 9.496802885759349e-06,
"loss": 0.4557139277458191,
"step": 1040
},
{
"epoch": 2.3104212860310422,
"grad_norm": 1.0060659646987915,
"learning_rate": 9.464265295263762e-06,
"loss": 0.7039799690246582,
"step": 1042
},
{
"epoch": 2.3148558758314857,
"grad_norm": 12.946681022644043,
"learning_rate": 9.431748366214648e-06,
"loss": 0.4291222095489502,
"step": 1044
},
{
"epoch": 2.319290465631929,
"grad_norm": 0.5580220222473145,
"learning_rate": 9.399252536087822e-06,
"loss": 0.6024729013442993,
"step": 1046
},
{
"epoch": 2.3237250554323725,
"grad_norm": 0.607992947101593,
"learning_rate": 9.366778242075236e-06,
"loss": 0.5440095663070679,
"step": 1048
},
{
"epoch": 2.328159645232816,
"grad_norm": 0.6783135533332825,
"learning_rate": 9.334325921079104e-06,
"loss": 0.6058806777000427,
"step": 1050
},
{
"epoch": 2.3325942350332594,
"grad_norm": 0.6938934922218323,
"learning_rate": 9.301896009706012e-06,
"loss": 0.4494543671607971,
"step": 1052
},
{
"epoch": 2.337028824833703,
"grad_norm": 0.477782279253006,
"learning_rate": 9.269488944261058e-06,
"loss": 0.4361210763454437,
"step": 1054
},
{
"epoch": 2.341463414634146,
"grad_norm": 0.5728092193603516,
"learning_rate": 9.237105160741976e-06,
"loss": 0.5449360609054565,
"step": 1056
},
{
"epoch": 2.3458980044345896,
"grad_norm": 0.18092034757137299,
"learning_rate": 9.204745094833265e-06,
"loss": 0.3745296895503998,
"step": 1058
},
{
"epoch": 2.3503325942350335,
"grad_norm": 0.5357985496520996,
"learning_rate": 9.172409181900337e-06,
"loss": 0.6852684020996094,
"step": 1060
},
{
"epoch": 2.354767184035477,
"grad_norm": 0.790863037109375,
"learning_rate": 9.140097856983647e-06,
"loss": 0.2813524603843689,
"step": 1062
},
{
"epoch": 2.3592017738359203,
"grad_norm": 0.2192503809928894,
"learning_rate": 9.107811554792863e-06,
"loss": 0.3573903739452362,
"step": 1064
},
{
"epoch": 2.3636363636363638,
"grad_norm": 1.4538520574569702,
"learning_rate": 9.075550709700992e-06,
"loss": 0.5834711790084839,
"step": 1066
},
{
"epoch": 2.368070953436807,
"grad_norm": 0.641722559928894,
"learning_rate": 9.043315755738545e-06,
"loss": 0.5266854763031006,
"step": 1068
},
{
"epoch": 2.3725055432372506,
"grad_norm": 0.6017807126045227,
"learning_rate": 9.011107126587705e-06,
"loss": 0.5866771936416626,
"step": 1070
},
{
"epoch": 2.376940133037694,
"grad_norm": 0.707431435585022,
"learning_rate": 8.978925255576484e-06,
"loss": 0.4829937517642975,
"step": 1072
},
{
"epoch": 2.3813747228381374,
"grad_norm": 0.2395654022693634,
"learning_rate": 8.946770575672897e-06,
"loss": 0.04968187212944031,
"step": 1074
},
{
"epoch": 2.385809312638581,
"grad_norm": 0.5818225741386414,
"learning_rate": 8.914643519479134e-06,
"loss": 0.3766881227493286,
"step": 1076
},
{
"epoch": 2.3902439024390243,
"grad_norm": 0.4298112094402313,
"learning_rate": 8.882544519225737e-06,
"loss": 0.1799193024635315,
"step": 1078
},
{
"epoch": 2.3946784922394677,
"grad_norm": 1.3011754751205444,
"learning_rate": 8.850474006765806e-06,
"loss": 0.5404252409934998,
"step": 1080
},
{
"epoch": 2.399113082039911,
"grad_norm": 0.6072801351547241,
"learning_rate": 8.818432413569153e-06,
"loss": 0.42710888385772705,
"step": 1082
},
{
"epoch": 2.4035476718403546,
"grad_norm": 0.8172256350517273,
"learning_rate": 8.78642017071653e-06,
"loss": 0.4754990339279175,
"step": 1084
},
{
"epoch": 2.4079822616407984,
"grad_norm": 0.4423505961894989,
"learning_rate": 8.754437708893803e-06,
"loss": 0.5498704314231873,
"step": 1086
},
{
"epoch": 2.412416851441242,
"grad_norm": 0.292689710855484,
"learning_rate": 8.722485458386183e-06,
"loss": 0.14969071745872498,
"step": 1088
},
{
"epoch": 2.4168514412416853,
"grad_norm": 0.5658117532730103,
"learning_rate": 8.690563849072416e-06,
"loss": 0.593338131904602,
"step": 1090
},
{
"epoch": 2.4212860310421287,
"grad_norm": 1.8885061740875244,
"learning_rate": 8.65867331041901e-06,
"loss": 0.3968830704689026,
"step": 1092
},
{
"epoch": 2.425720620842572,
"grad_norm": 1.8343939781188965,
"learning_rate": 8.62681427147446e-06,
"loss": 0.28023120760917664,
"step": 1094
},
{
"epoch": 2.4301552106430155,
"grad_norm": 1.2832564115524292,
"learning_rate": 8.594987160863464e-06,
"loss": 0.3517853617668152,
"step": 1096
},
{
"epoch": 2.434589800443459,
"grad_norm": 0.32917505502700806,
"learning_rate": 8.563192406781164e-06,
"loss": 0.3207606077194214,
"step": 1098
},
{
"epoch": 2.4390243902439024,
"grad_norm": 0.9043774008750916,
"learning_rate": 8.53143043698739e-06,
"loss": 0.4255558252334595,
"step": 1100
},
{
"epoch": 2.443458980044346,
"grad_norm": 0.5287153124809265,
"learning_rate": 8.499701678800891e-06,
"loss": 0.6775237917900085,
"step": 1102
},
{
"epoch": 2.4478935698447892,
"grad_norm": 1.211562991142273,
"learning_rate": 8.4680065590936e-06,
"loss": 0.28972724080085754,
"step": 1104
},
{
"epoch": 2.4523281596452327,
"grad_norm": 0.5662131309509277,
"learning_rate": 8.436345504284884e-06,
"loss": 0.685287594795227,
"step": 1106
},
{
"epoch": 2.4567627494456765,
"grad_norm": 1.0978025197982788,
"learning_rate": 8.404718940335805e-06,
"loss": 0.647050142288208,
"step": 1108
},
{
"epoch": 2.4611973392461195,
"grad_norm": 0.48306140303611755,
"learning_rate": 8.373127292743392e-06,
"loss": 0.7415695190429688,
"step": 1110
},
{
"epoch": 2.4656319290465634,
"grad_norm": 0.4147641360759735,
"learning_rate": 8.341570986534926e-06,
"loss": 0.47963038086891174,
"step": 1112
},
{
"epoch": 2.470066518847007,
"grad_norm": 0.6168814301490784,
"learning_rate": 8.310050446262204e-06,
"loss": 0.5705453157424927,
"step": 1114
},
{
"epoch": 2.47450110864745,
"grad_norm": 0.8609782457351685,
"learning_rate": 8.278566095995837e-06,
"loss": 0.24776363372802734,
"step": 1116
},
{
"epoch": 2.4789356984478936,
"grad_norm": 0.41248947381973267,
"learning_rate": 8.247118359319542e-06,
"loss": 0.573097825050354,
"step": 1118
},
{
"epoch": 2.483370288248337,
"grad_norm": 0.5210030674934387,
"learning_rate": 8.215707659324448e-06,
"loss": 0.45975643396377563,
"step": 1120
},
{
"epoch": 2.4878048780487805,
"grad_norm": 0.48813074827194214,
"learning_rate": 8.1843344186034e-06,
"loss": 0.5684525370597839,
"step": 1122
},
{
"epoch": 2.492239467849224,
"grad_norm": 1.437232494354248,
"learning_rate": 8.152999059245273e-06,
"loss": 0.6159149408340454,
"step": 1124
},
{
"epoch": 2.4966740576496673,
"grad_norm": 0.6437961459159851,
"learning_rate": 8.121702002829291e-06,
"loss": 0.6514344811439514,
"step": 1126
},
{
"epoch": 2.5011086474501107,
"grad_norm": 0.4339181184768677,
"learning_rate": 8.090443670419368e-06,
"loss": 0.3893609642982483,
"step": 1128
},
{
"epoch": 2.505543237250554,
"grad_norm": 0.9250460863113403,
"learning_rate": 8.05922448255842e-06,
"loss": 0.5106027722358704,
"step": 1130
},
{
"epoch": 2.5099778270509976,
"grad_norm": 0.7213279008865356,
"learning_rate": 8.028044859262736e-06,
"loss": 0.5997860431671143,
"step": 1132
},
{
"epoch": 2.5144124168514415,
"grad_norm": 0.5925162434577942,
"learning_rate": 7.996905220016295e-06,
"loss": 0.37115636467933655,
"step": 1134
},
{
"epoch": 2.5188470066518844,
"grad_norm": 0.4195973575115204,
"learning_rate": 7.965805983765156e-06,
"loss": 0.6658072471618652,
"step": 1136
},
{
"epoch": 2.5232815964523283,
"grad_norm": 0.3894807994365692,
"learning_rate": 7.934747568911792e-06,
"loss": 0.48177286982536316,
"step": 1138
},
{
"epoch": 2.5277161862527717,
"grad_norm": 0.4482039213180542,
"learning_rate": 7.903730393309475e-06,
"loss": 0.5770375728607178,
"step": 1140
},
{
"epoch": 2.532150776053215,
"grad_norm": 1.4334046840667725,
"learning_rate": 7.872754874256658e-06,
"loss": 0.37715059518814087,
"step": 1142
},
{
"epoch": 2.5365853658536586,
"grad_norm": 0.18956467509269714,
"learning_rate": 7.841821428491358e-06,
"loss": 0.3323401212692261,
"step": 1144
},
{
"epoch": 2.541019955654102,
"grad_norm": 0.9211217761039734,
"learning_rate": 7.810930472185542e-06,
"loss": 0.7031457424163818,
"step": 1146
},
{
"epoch": 2.5454545454545454,
"grad_norm": 0.5245496034622192,
"learning_rate": 7.78008242093953e-06,
"loss": 0.6004937887191772,
"step": 1148
},
{
"epoch": 2.549889135254989,
"grad_norm": 0.4028185307979584,
"learning_rate": 7.749277689776411e-06,
"loss": 0.496783971786499,
"step": 1150
},
{
"epoch": 2.5543237250554323,
"grad_norm": 0.5988771915435791,
"learning_rate": 7.718516693136455e-06,
"loss": 0.38715416193008423,
"step": 1152
},
{
"epoch": 2.5587583148558757,
"grad_norm": 0.1802368313074112,
"learning_rate": 7.687799844871534e-06,
"loss": 0.14051398634910583,
"step": 1154
},
{
"epoch": 2.5631929046563195,
"grad_norm": 0.41661515831947327,
"learning_rate": 7.657127558239563e-06,
"loss": 0.3350878059864044,
"step": 1156
},
{
"epoch": 2.5676274944567625,
"grad_norm": 1.085957646369934,
"learning_rate": 7.626500245898927e-06,
"loss": 0.3848508596420288,
"step": 1158
},
{
"epoch": 2.5720620842572064,
"grad_norm": 0.8385711908340454,
"learning_rate": 7.595918319902939e-06,
"loss": 0.26338139176368713,
"step": 1160
},
{
"epoch": 2.57649667405765,
"grad_norm": 0.716660737991333,
"learning_rate": 7.565382191694302e-06,
"loss": 0.6448018550872803,
"step": 1162
},
{
"epoch": 2.5809312638580932,
"grad_norm": 0.6266429424285889,
"learning_rate": 7.53489227209955e-06,
"loss": 0.7049829363822937,
"step": 1164
},
{
"epoch": 2.5853658536585367,
"grad_norm": 0.4636557996273041,
"learning_rate": 7.50444897132355e-06,
"loss": 0.38826262950897217,
"step": 1166
},
{
"epoch": 2.58980044345898,
"grad_norm": 0.44733473658561707,
"learning_rate": 7.474052698943961e-06,
"loss": 0.5173879265785217,
"step": 1168
},
{
"epoch": 2.5942350332594235,
"grad_norm": 0.5354277491569519,
"learning_rate": 7.443703863905738e-06,
"loss": 0.5096431374549866,
"step": 1170
},
{
"epoch": 2.598669623059867,
"grad_norm": 1.4914095401763916,
"learning_rate": 7.413402874515616e-06,
"loss": 0.21273551881313324,
"step": 1172
},
{
"epoch": 2.6031042128603104,
"grad_norm": 0.40523943305015564,
"learning_rate": 7.383150138436628e-06,
"loss": 0.49439945816993713,
"step": 1174
},
{
"epoch": 2.6075388026607538,
"grad_norm": 0.5631287693977356,
"learning_rate": 7.352946062682626e-06,
"loss": 0.49207258224487305,
"step": 1176
},
{
"epoch": 2.611973392461197,
"grad_norm": 0.5385340452194214,
"learning_rate": 7.32279105361279e-06,
"loss": 0.3323192000389099,
"step": 1178
},
{
"epoch": 2.6164079822616406,
"grad_norm": 1.356742024421692,
"learning_rate": 7.292685516926161e-06,
"loss": 0.5721710324287415,
"step": 1180
},
{
"epoch": 2.6208425720620845,
"grad_norm": 0.4816894829273224,
"learning_rate": 7.262629857656198e-06,
"loss": 0.5175535082817078,
"step": 1182
},
{
"epoch": 2.6252771618625275,
"grad_norm": 0.4633226990699768,
"learning_rate": 7.232624480165318e-06,
"loss": 0.6447592973709106,
"step": 1184
},
{
"epoch": 2.6297117516629713,
"grad_norm": 0.6813458800315857,
"learning_rate": 7.202669788139456e-06,
"loss": 0.5713311433792114,
"step": 1186
},
{
"epoch": 2.6341463414634148,
"grad_norm": 2.180230140686035,
"learning_rate": 7.172766184582629e-06,
"loss": 0.6713429093360901,
"step": 1188
},
{
"epoch": 2.638580931263858,
"grad_norm": 0.5426626801490784,
"learning_rate": 7.142914071811535e-06,
"loss": 0.37241318821907043,
"step": 1190
},
{
"epoch": 2.6430155210643016,
"grad_norm": 1.2816245555877686,
"learning_rate": 7.113113851450122e-06,
"loss": 0.49532002210617065,
"step": 1192
},
{
"epoch": 2.647450110864745,
"grad_norm": 1.509843111038208,
"learning_rate": 7.083365924424175e-06,
"loss": 0.40875858068466187,
"step": 1194
},
{
"epoch": 2.6518847006651884,
"grad_norm": 0.5089661478996277,
"learning_rate": 7.053670690955956e-06,
"loss": 0.4947509467601776,
"step": 1196
},
{
"epoch": 2.656319290465632,
"grad_norm": 0.381073921918869,
"learning_rate": 7.024028550558781e-06,
"loss": 0.2214895784854889,
"step": 1198
},
{
"epoch": 2.6607538802660753,
"grad_norm": 0.521045446395874,
"learning_rate": 6.994439902031679e-06,
"loss": 0.6109291911125183,
"step": 1200
},
{
"epoch": 2.6651884700665187,
"grad_norm": 1.0478029251098633,
"learning_rate": 6.964905143453995e-06,
"loss": 0.6086549162864685,
"step": 1202
},
{
"epoch": 2.6696230598669626,
"grad_norm": 0.4481736719608307,
"learning_rate": 6.9354246721800685e-06,
"loss": 0.29336196184158325,
"step": 1204
},
{
"epoch": 2.6740576496674056,
"grad_norm": 1.675062894821167,
"learning_rate": 6.9059988848338466e-06,
"loss": 0.48426881432533264,
"step": 1206
},
{
"epoch": 2.6784922394678494,
"grad_norm": 0.8172009587287903,
"learning_rate": 6.8766281773035906e-06,
"loss": 0.4322719871997833,
"step": 1208
},
{
"epoch": 2.682926829268293,
"grad_norm": 0.5452362298965454,
"learning_rate": 6.847312944736524e-06,
"loss": 0.3221188187599182,
"step": 1210
},
{
"epoch": 2.6873614190687363,
"grad_norm": 1.4369398355484009,
"learning_rate": 6.818053581533512e-06,
"loss": 0.20389345288276672,
"step": 1212
},
{
"epoch": 2.6917960088691797,
"grad_norm": 0.5867207050323486,
"learning_rate": 6.788850481343782e-06,
"loss": 0.42180705070495605,
"step": 1214
},
{
"epoch": 2.696230598669623,
"grad_norm": 2.326925754547119,
"learning_rate": 6.759704037059598e-06,
"loss": 0.36190155148506165,
"step": 1216
},
{
"epoch": 2.7006651884700665,
"grad_norm": 1.7214257717132568,
"learning_rate": 6.7306146408109885e-06,
"loss": 0.34991076588630676,
"step": 1218
},
{
"epoch": 2.70509977827051,
"grad_norm": 0.5046329498291016,
"learning_rate": 6.701582683960481e-06,
"loss": 0.6116279363632202,
"step": 1220
},
{
"epoch": 2.7095343680709534,
"grad_norm": 0.8512217998504639,
"learning_rate": 6.672608557097806e-06,
"loss": 0.37688618898391724,
"step": 1222
},
{
"epoch": 2.713968957871397,
"grad_norm": 1.6093370914459229,
"learning_rate": 6.643692650034684e-06,
"loss": 0.7054269909858704,
"step": 1224
},
{
"epoch": 2.7184035476718402,
"grad_norm": 3.110217809677124,
"learning_rate": 6.614835351799549e-06,
"loss": 0.31694677472114563,
"step": 1226
},
{
"epoch": 2.7228381374722836,
"grad_norm": 0.5730735659599304,
"learning_rate": 6.586037050632315e-06,
"loss": 0.8013717532157898,
"step": 1228
},
{
"epoch": 2.7272727272727275,
"grad_norm": 0.7116084098815918,
"learning_rate": 6.557298133979177e-06,
"loss": 0.45755088329315186,
"step": 1230
},
{
"epoch": 2.7317073170731705,
"grad_norm": 0.4136090874671936,
"learning_rate": 6.528618988487373e-06,
"loss": 0.48779523372650146,
"step": 1232
},
{
"epoch": 2.7361419068736144,
"grad_norm": 0.9168877601623535,
"learning_rate": 6.500000000000003e-06,
"loss": 0.2947143614292145,
"step": 1234
},
{
"epoch": 2.740576496674058,
"grad_norm": 0.6739610433578491,
"learning_rate": 6.471441553550813e-06,
"loss": 0.6185624599456787,
"step": 1236
},
{
"epoch": 2.745011086474501,
"grad_norm": 0.5895893573760986,
"learning_rate": 6.442944033359042e-06,
"loss": 0.35551586747169495,
"step": 1238
},
{
"epoch": 2.7494456762749446,
"grad_norm": 0.37865594029426575,
"learning_rate": 6.4145078228242375e-06,
"loss": 0.3368171751499176,
"step": 1240
},
{
"epoch": 2.753880266075388,
"grad_norm": 0.45283424854278564,
"learning_rate": 6.386133304521094e-06,
"loss": 0.5998995304107666,
"step": 1242
},
{
"epoch": 2.7583148558758315,
"grad_norm": 0.7602055668830872,
"learning_rate": 6.357820860194321e-06,
"loss": 0.7485865354537964,
"step": 1244
},
{
"epoch": 2.762749445676275,
"grad_norm": 0.12720580399036407,
"learning_rate": 6.32957087075349e-06,
"loss": 0.18481549620628357,
"step": 1246
},
{
"epoch": 2.7671840354767183,
"grad_norm": 1.2511968612670898,
"learning_rate": 6.301383716267917e-06,
"loss": 0.3667486011981964,
"step": 1248
},
{
"epoch": 2.7716186252771617,
"grad_norm": 0.6795738339424133,
"learning_rate": 6.273259775961562e-06,
"loss": 0.43524369597435,
"step": 1250
},
{
"epoch": 2.776053215077605,
"grad_norm": 0.4668692946434021,
"learning_rate": 6.245199428207898e-06,
"loss": 0.7469791173934937,
"step": 1252
},
{
"epoch": 2.7804878048780486,
"grad_norm": 0.4733211100101471,
"learning_rate": 6.2172030505248515e-06,
"loss": 0.6893079876899719,
"step": 1254
},
{
"epoch": 2.7849223946784925,
"grad_norm": 0.4810378849506378,
"learning_rate": 6.189271019569707e-06,
"loss": 0.6243588328361511,
"step": 1256
},
{
"epoch": 2.7893569844789354,
"grad_norm": 0.21061930060386658,
"learning_rate": 6.161403711134031e-06,
"loss": 0.09384872019290924,
"step": 1258
},
{
"epoch": 2.7937915742793793,
"grad_norm": 0.4916951358318329,
"learning_rate": 6.133601500138643e-06,
"loss": 0.5685229301452637,
"step": 1260
},
{
"epoch": 2.7982261640798227,
"grad_norm": 0.8098857402801514,
"learning_rate": 6.1058647606285394e-06,
"loss": 0.3363065719604492,
"step": 1262
},
{
"epoch": 2.802660753880266,
"grad_norm": 0.5222221612930298,
"learning_rate": 6.078193865767893e-06,
"loss": 0.36431118845939636,
"step": 1264
},
{
"epoch": 2.8070953436807096,
"grad_norm": 0.48917877674102783,
"learning_rate": 6.050589187835001e-06,
"loss": 0.48057618737220764,
"step": 1266
},
{
"epoch": 2.811529933481153,
"grad_norm": 1.3627451658248901,
"learning_rate": 6.023051098217307e-06,
"loss": 0.4955880343914032,
"step": 1268
},
{
"epoch": 2.8159645232815964,
"grad_norm": 0.5931581854820251,
"learning_rate": 5.995579967406379e-06,
"loss": 0.5985972881317139,
"step": 1270
},
{
"epoch": 2.82039911308204,
"grad_norm": 1.0736427307128906,
"learning_rate": 5.968176164992938e-06,
"loss": 0.24213649332523346,
"step": 1272
},
{
"epoch": 2.8248337028824833,
"grad_norm": 0.6388216614723206,
"learning_rate": 5.940840059661892e-06,
"loss": 0.41631895303726196,
"step": 1274
},
{
"epoch": 2.8292682926829267,
"grad_norm": 0.49787789583206177,
"learning_rate": 5.913572019187355e-06,
"loss": 0.6338592171669006,
"step": 1276
},
{
"epoch": 2.8337028824833705,
"grad_norm": 0.4130885601043701,
"learning_rate": 5.886372410427709e-06,
"loss": 0.5558915734291077,
"step": 1278
},
{
"epoch": 2.8381374722838135,
"grad_norm": 0.4531559944152832,
"learning_rate": 5.859241599320686e-06,
"loss": 0.24562785029411316,
"step": 1280
},
{
"epoch": 2.8425720620842574,
"grad_norm": 1.1224136352539062,
"learning_rate": 5.832179950878414e-06,
"loss": 0.38200998306274414,
"step": 1282
},
{
"epoch": 2.847006651884701,
"grad_norm": 0.5757291913032532,
"learning_rate": 5.805187829182531e-06,
"loss": 0.40263280272483826,
"step": 1284
},
{
"epoch": 2.8514412416851442,
"grad_norm": 0.4876343607902527,
"learning_rate": 5.778265597379269e-06,
"loss": 0.5635562539100647,
"step": 1286
},
{
"epoch": 2.8558758314855877,
"grad_norm": 1.429746150970459,
"learning_rate": 5.751413617674584e-06,
"loss": 0.13587771356105804,
"step": 1288
},
{
"epoch": 2.860310421286031,
"grad_norm": 0.43107762932777405,
"learning_rate": 5.724632251329272e-06,
"loss": 0.5738257765769958,
"step": 1290
},
{
"epoch": 2.8647450110864745,
"grad_norm": 1.0720781087875366,
"learning_rate": 5.697921858654106e-06,
"loss": 0.36557459831237793,
"step": 1292
},
{
"epoch": 2.869179600886918,
"grad_norm": 0.4924733638763428,
"learning_rate": 5.671282799005009e-06,
"loss": 0.5723231434822083,
"step": 1294
},
{
"epoch": 2.8736141906873613,
"grad_norm": 0.4669732451438904,
"learning_rate": 5.644715430778187e-06,
"loss": 0.5587807893753052,
"step": 1296
},
{
"epoch": 2.8780487804878048,
"grad_norm": 0.8375265598297119,
"learning_rate": 5.6182201114053405e-06,
"loss": 0.407155841588974,
"step": 1298
},
{
"epoch": 2.882483370288248,
"grad_norm": 0.6367316246032715,
"learning_rate": 5.59179719734883e-06,
"loss": 0.581174373626709,
"step": 1300
},
{
"epoch": 2.8869179600886916,
"grad_norm": 1.9464964866638184,
"learning_rate": 5.565447044096888e-06,
"loss": 0.23274049162864685,
"step": 1302
},
{
"epoch": 2.8913525498891355,
"grad_norm": 0.4807678461074829,
"learning_rate": 5.539170006158859e-06,
"loss": 0.5287979245185852,
"step": 1304
},
{
"epoch": 2.8957871396895785,
"grad_norm": 0.5676413774490356,
"learning_rate": 5.512966437060383e-06,
"loss": 0.4669223129749298,
"step": 1306
},
{
"epoch": 2.9002217294900223,
"grad_norm": 0.19804784655570984,
"learning_rate": 5.4868366893386795e-06,
"loss": 0.1954198181629181,
"step": 1308
},
{
"epoch": 2.9046563192904657,
"grad_norm": 0.5282815098762512,
"learning_rate": 5.460781114537794e-06,
"loss": 0.3124288320541382,
"step": 1310
},
{
"epoch": 2.909090909090909,
"grad_norm": 0.6704612374305725,
"learning_rate": 5.434800063203855e-06,
"loss": 0.5746976733207703,
"step": 1312
},
{
"epoch": 2.9135254988913526,
"grad_norm": 0.48029983043670654,
"learning_rate": 5.408893884880382e-06,
"loss": 0.5503944158554077,
"step": 1314
},
{
"epoch": 2.917960088691796,
"grad_norm": 1.208801031112671,
"learning_rate": 5.383062928103551e-06,
"loss": 0.4464556872844696,
"step": 1316
},
{
"epoch": 2.9223946784922394,
"grad_norm": 0.5504411458969116,
"learning_rate": 5.357307540397541e-06,
"loss": 0.6808157563209534,
"step": 1318
},
{
"epoch": 2.926829268292683,
"grad_norm": 0.4721316397190094,
"learning_rate": 5.331628068269832e-06,
"loss": 0.3994528353214264,
"step": 1320
},
{
"epoch": 2.9312638580931263,
"grad_norm": 0.40078234672546387,
"learning_rate": 5.306024857206551e-06,
"loss": 0.589479386806488,
"step": 1322
},
{
"epoch": 2.9356984478935697,
"grad_norm": 0.4144805073738098,
"learning_rate": 5.28049825166783e-06,
"loss": 0.6008284687995911,
"step": 1324
},
{
"epoch": 2.9401330376940136,
"grad_norm": 0.4621680676937103,
"learning_rate": 5.255048595083161e-06,
"loss": 0.48713505268096924,
"step": 1326
},
{
"epoch": 2.9445676274944566,
"grad_norm": 0.6959161758422852,
"learning_rate": 5.229676229846788e-06,
"loss": 0.5818562507629395,
"step": 1328
},
{
"epoch": 2.9490022172949004,
"grad_norm": 0.8349772095680237,
"learning_rate": 5.204381497313089e-06,
"loss": 0.6031002402305603,
"step": 1330
},
{
"epoch": 2.953436807095344,
"grad_norm": 0.5815767645835876,
"learning_rate": 5.179164737791984e-06,
"loss": 0.6579894423484802,
"step": 1332
},
{
"epoch": 2.9578713968957873,
"grad_norm": 0.5155860781669617,
"learning_rate": 5.15402629054437e-06,
"loss": 0.3109511137008667,
"step": 1334
},
{
"epoch": 2.9623059866962307,
"grad_norm": 0.5490220189094543,
"learning_rate": 5.128966493777544e-06,
"loss": 0.5789236426353455,
"step": 1336
},
{
"epoch": 2.966740576496674,
"grad_norm": 0.5740970969200134,
"learning_rate": 5.103985684640653e-06,
"loss": 0.5203069448471069,
"step": 1338
},
{
"epoch": 2.9711751662971175,
"grad_norm": 0.5606107711791992,
"learning_rate": 5.079084199220168e-06,
"loss": 0.4374566376209259,
"step": 1340
},
{
"epoch": 2.975609756097561,
"grad_norm": 1.1846078634262085,
"learning_rate": 5.0542623725353455e-06,
"loss": 0.42820480465888977,
"step": 1342
},
{
"epoch": 2.9800443458980044,
"grad_norm": 0.19243869185447693,
"learning_rate": 5.029520538533742e-06,
"loss": 0.125463604927063,
"step": 1344
},
{
"epoch": 2.984478935698448,
"grad_norm": 0.4858459532260895,
"learning_rate": 5.0048590300867e-06,
"loss": 0.37778711318969727,
"step": 1346
},
{
"epoch": 2.988913525498891,
"grad_norm": 0.4838855564594269,
"learning_rate": 4.980278178984886e-06,
"loss": 0.33112236857414246,
"step": 1348
},
{
"epoch": 2.9933481152993346,
"grad_norm": 1.0332651138305664,
"learning_rate": 4.9557783159338134e-06,
"loss": 0.28946980834007263,
"step": 1350
},
{
"epoch": 2.9977827050997785,
"grad_norm": 1.0827792882919312,
"learning_rate": 4.9313597705494045e-06,
"loss": 0.44148802757263184,
"step": 1352
},
{
"epoch": 3.002217294900222,
"grad_norm": 0.3786047399044037,
"learning_rate": 4.907022871353554e-06,
"loss": 0.42598864436149597,
"step": 1354
},
{
"epoch": 3.0066518847006654,
"grad_norm": 0.35562005639076233,
"learning_rate": 4.882767945769696e-06,
"loss": 0.1402987688779831,
"step": 1356
},
{
"epoch": 3.011086474501109,
"grad_norm": 1.162191390991211,
"learning_rate": 4.858595320118419e-06,
"loss": 0.2594584822654724,
"step": 1358
},
{
"epoch": 3.015521064301552,
"grad_norm": 0.3751342296600342,
"learning_rate": 4.834505319613061e-06,
"loss": 0.3178204894065857,
"step": 1360
},
{
"epoch": 3.0199556541019956,
"grad_norm": 0.3661974370479584,
"learning_rate": 4.810498268355337e-06,
"loss": 0.2332019954919815,
"step": 1362
},
{
"epoch": 3.024390243902439,
"grad_norm": 0.5547940135002136,
"learning_rate": 4.786574489330988e-06,
"loss": 0.2809712886810303,
"step": 1364
},
{
"epoch": 3.0288248337028825,
"grad_norm": 0.08006221801042557,
"learning_rate": 4.762734304405419e-06,
"loss": 0.1403912454843521,
"step": 1366
},
{
"epoch": 3.033259423503326,
"grad_norm": 0.5086005926132202,
"learning_rate": 4.738978034319384e-06,
"loss": 0.13945481181144714,
"step": 1368
},
{
"epoch": 3.0376940133037693,
"grad_norm": 0.6609373688697815,
"learning_rate": 4.715305998684668e-06,
"loss": 0.14236144721508026,
"step": 1370
},
{
"epoch": 3.0421286031042127,
"grad_norm": 0.7926512956619263,
"learning_rate": 4.691718515979772e-06,
"loss": 0.2316332459449768,
"step": 1372
},
{
"epoch": 3.046563192904656,
"grad_norm": 0.6564216613769531,
"learning_rate": 4.668215903545652e-06,
"loss": 0.1165812611579895,
"step": 1374
},
{
"epoch": 3.0509977827050996,
"grad_norm": 1.1338090896606445,
"learning_rate": 4.644798477581427e-06,
"loss": 0.13446903228759766,
"step": 1376
},
{
"epoch": 3.0554323725055434,
"grad_norm": 0.34968799352645874,
"learning_rate": 4.6214665531401465e-06,
"loss": 0.0695309042930603,
"step": 1378
},
{
"epoch": 3.059866962305987,
"grad_norm": 0.15553732216358185,
"learning_rate": 4.5982204441245294e-06,
"loss": 0.1173941045999527,
"step": 1380
},
{
"epoch": 3.0643015521064303,
"grad_norm": 1.247266411781311,
"learning_rate": 4.5750604632827615e-06,
"loss": 0.05880206078290939,
"step": 1382
},
{
"epoch": 3.0687361419068737,
"grad_norm": 0.9541630744934082,
"learning_rate": 4.551986922204276e-06,
"loss": 0.11438459157943726,
"step": 1384
},
{
"epoch": 3.073170731707317,
"grad_norm": 0.11932838708162308,
"learning_rate": 4.529000131315559e-06,
"loss": 0.05259817838668823,
"step": 1386
},
{
"epoch": 3.0776053215077606,
"grad_norm": 0.3025910258293152,
"learning_rate": 4.5061003998759864e-06,
"loss": 0.0788898915052414,
"step": 1388
},
{
"epoch": 3.082039911308204,
"grad_norm": 0.41884443163871765,
"learning_rate": 4.483288035973647e-06,
"loss": 0.18548215925693512,
"step": 1390
},
{
"epoch": 3.0864745011086474,
"grad_norm": 0.69329434633255,
"learning_rate": 4.46056334652121e-06,
"loss": 0.07898163050413132,
"step": 1392
},
{
"epoch": 3.090909090909091,
"grad_norm": 1.9537714719772339,
"learning_rate": 4.43792663725179e-06,
"loss": 0.1453198343515396,
"step": 1394
},
{
"epoch": 3.0953436807095343,
"grad_norm": 0.5684086084365845,
"learning_rate": 4.415378212714833e-06,
"loss": 0.2133360058069229,
"step": 1396
},
{
"epoch": 3.0997782705099777,
"grad_norm": 0.4299287497997284,
"learning_rate": 4.392918376272028e-06,
"loss": 0.18916372954845428,
"step": 1398
},
{
"epoch": 3.104212860310421,
"grad_norm": 0.2804919481277466,
"learning_rate": 4.370547430093213e-06,
"loss": 0.15570159256458282,
"step": 1400
},
{
"epoch": 3.1086474501108645,
"grad_norm": 0.8112667798995972,
"learning_rate": 4.348265675152312e-06,
"loss": 0.05692750960588455,
"step": 1402
},
{
"epoch": 3.1130820399113084,
"grad_norm": 1.0895768404006958,
"learning_rate": 4.326073411223299e-06,
"loss": 0.072386234998703,
"step": 1404
},
{
"epoch": 3.117516629711752,
"grad_norm": 1.3162689208984375,
"learning_rate": 4.303970936876145e-06,
"loss": 0.2204161435365677,
"step": 1406
},
{
"epoch": 3.1219512195121952,
"grad_norm": 0.4283730983734131,
"learning_rate": 4.281958549472821e-06,
"loss": 0.24357332289218903,
"step": 1408
},
{
"epoch": 3.1263858093126387,
"grad_norm": 0.5136526226997375,
"learning_rate": 4.2600365451632755e-06,
"loss": 0.1705726683139801,
"step": 1410
},
{
"epoch": 3.130820399113082,
"grad_norm": 0.5153740644454956,
"learning_rate": 4.238205218881477e-06,
"loss": 0.1938788741827011,
"step": 1412
},
{
"epoch": 3.1352549889135255,
"grad_norm": 0.3389737606048584,
"learning_rate": 4.216464864341415e-06,
"loss": 0.1461533159017563,
"step": 1414
},
{
"epoch": 3.139689578713969,
"grad_norm": 0.24095015227794647,
"learning_rate": 4.1948157740331765e-06,
"loss": 0.016989566385746002,
"step": 1416
},
{
"epoch": 3.1441241685144123,
"grad_norm": 0.4946073591709137,
"learning_rate": 4.173258239218998e-06,
"loss": 0.16947562992572784,
"step": 1418
},
{
"epoch": 3.1485587583148558,
"grad_norm": 1.0035178661346436,
"learning_rate": 4.151792549929343e-06,
"loss": 0.17151474952697754,
"step": 1420
},
{
"epoch": 3.152993348115299,
"grad_norm": 0.925403356552124,
"learning_rate": 4.130418994959004e-06,
"loss": 0.12084448337554932,
"step": 1422
},
{
"epoch": 3.1574279379157426,
"grad_norm": 0.30737417936325073,
"learning_rate": 4.1091378618632276e-06,
"loss": 0.03554686903953552,
"step": 1424
},
{
"epoch": 3.1618625277161865,
"grad_norm": 0.9840001463890076,
"learning_rate": 4.087949436953822e-06,
"loss": 0.17049196362495422,
"step": 1426
},
{
"epoch": 3.16629711751663,
"grad_norm": 1.108886957168579,
"learning_rate": 4.066854005295336e-06,
"loss": 0.12697622179985046,
"step": 1428
},
{
"epoch": 3.1707317073170733,
"grad_norm": 0.6791403293609619,
"learning_rate": 4.045851850701189e-06,
"loss": 0.10053610801696777,
"step": 1430
},
{
"epoch": 3.1751662971175167,
"grad_norm": 0.23437856137752533,
"learning_rate": 4.024943255729886e-06,
"loss": 0.1366463154554367,
"step": 1432
},
{
"epoch": 3.17960088691796,
"grad_norm": 0.5337254405021667,
"learning_rate": 4.004128501681197e-06,
"loss": 0.1613321751356125,
"step": 1434
},
{
"epoch": 3.1840354767184036,
"grad_norm": 0.6539866924285889,
"learning_rate": 3.983407868592367e-06,
"loss": 0.03396349772810936,
"step": 1436
},
{
"epoch": 3.188470066518847,
"grad_norm": 0.5891013145446777,
"learning_rate": 3.9627816352343714e-06,
"loss": 0.1685631275177002,
"step": 1438
},
{
"epoch": 3.1929046563192904,
"grad_norm": 0.8137240409851074,
"learning_rate": 3.94225007910814e-06,
"loss": 0.16547633707523346,
"step": 1440
},
{
"epoch": 3.197339246119734,
"grad_norm": 0.4780210852622986,
"learning_rate": 3.921813476440845e-06,
"loss": 0.2140340805053711,
"step": 1442
},
{
"epoch": 3.2017738359201773,
"grad_norm": 0.7639121413230896,
"learning_rate": 3.901472102182168e-06,
"loss": 0.2164526730775833,
"step": 1444
},
{
"epoch": 3.2062084257206207,
"grad_norm": 0.44395381212234497,
"learning_rate": 3.881226230000607e-06,
"loss": 0.18533624708652496,
"step": 1446
},
{
"epoch": 3.210643015521064,
"grad_norm": 0.5062630772590637,
"learning_rate": 3.861076132279808e-06,
"loss": 0.053058087825775146,
"step": 1448
},
{
"epoch": 3.2150776053215075,
"grad_norm": 0.4987446069717407,
"learning_rate": 3.8410220801148735e-06,
"loss": 0.21477347612380981,
"step": 1450
},
{
"epoch": 3.2195121951219514,
"grad_norm": 4.220211029052734,
"learning_rate": 3.821064343308734e-06,
"loss": 0.04978083446621895,
"step": 1452
},
{
"epoch": 3.223946784922395,
"grad_norm": 0.555292010307312,
"learning_rate": 3.8012031903685174e-06,
"loss": 0.19708330929279327,
"step": 1454
},
{
"epoch": 3.2283813747228383,
"grad_norm": 0.9038100838661194,
"learning_rate": 3.7814388885019284e-06,
"loss": 0.16057579219341278,
"step": 1456
},
{
"epoch": 3.2328159645232817,
"grad_norm": 0.3948892652988434,
"learning_rate": 3.7617717036136623e-06,
"loss": 0.1567579060792923,
"step": 1458
},
{
"epoch": 3.237250554323725,
"grad_norm": 0.6105815768241882,
"learning_rate": 3.7422019003018174e-06,
"loss": 0.15115660429000854,
"step": 1460
},
{
"epoch": 3.2416851441241685,
"grad_norm": 0.7068625688552856,
"learning_rate": 3.7227297418543464e-06,
"loss": 0.17774607241153717,
"step": 1462
},
{
"epoch": 3.246119733924612,
"grad_norm": 1.291515588760376,
"learning_rate": 3.7033554902455105e-06,
"loss": 0.20271697640419006,
"step": 1464
},
{
"epoch": 3.2505543237250554,
"grad_norm": 0.4515579342842102,
"learning_rate": 3.684079406132344e-06,
"loss": 0.23176366090774536,
"step": 1466
},
{
"epoch": 3.254988913525499,
"grad_norm": 0.17358291149139404,
"learning_rate": 3.6649017488511684e-06,
"loss": 0.035076484084129333,
"step": 1468
},
{
"epoch": 3.259423503325942,
"grad_norm": 0.7106318473815918,
"learning_rate": 3.6458227764140796e-06,
"loss": 0.11743002384901047,
"step": 1470
},
{
"epoch": 3.2638580931263856,
"grad_norm": 0.524408221244812,
"learning_rate": 3.626842745505501e-06,
"loss": 0.2437806874513626,
"step": 1472
},
{
"epoch": 3.2682926829268295,
"grad_norm": 0.37512272596359253,
"learning_rate": 3.607961911478708e-06,
"loss": 0.03446941822767258,
"step": 1474
},
{
"epoch": 3.2727272727272725,
"grad_norm": 0.48498690128326416,
"learning_rate": 3.5891805283524055e-06,
"loss": 0.15878258645534515,
"step": 1476
},
{
"epoch": 3.2771618625277164,
"grad_norm": 0.1239403486251831,
"learning_rate": 3.570498848807308e-06,
"loss": 0.11845864355564117,
"step": 1478
},
{
"epoch": 3.2815964523281598,
"grad_norm": 0.23787540197372437,
"learning_rate": 3.5519171241827445e-06,
"loss": 0.13304200768470764,
"step": 1480
},
{
"epoch": 3.286031042128603,
"grad_norm": 0.46581289172172546,
"learning_rate": 3.533435604473259e-06,
"loss": 0.20721173286437988,
"step": 1482
},
{
"epoch": 3.2904656319290466,
"grad_norm": 0.6229859590530396,
"learning_rate": 3.515054538325272e-06,
"loss": 0.19322358071804047,
"step": 1484
},
{
"epoch": 3.29490022172949,
"grad_norm": 0.4470021426677704,
"learning_rate": 3.496774173033717e-06,
"loss": 0.17478328943252563,
"step": 1486
},
{
"epoch": 3.2993348115299335,
"grad_norm": 1.0204616785049438,
"learning_rate": 3.478594754538722e-06,
"loss": 0.10508938133716583,
"step": 1488
},
{
"epoch": 3.303769401330377,
"grad_norm": 0.4292312264442444,
"learning_rate": 3.460516527422298e-06,
"loss": 0.05400429666042328,
"step": 1490
},
{
"epoch": 3.3082039911308203,
"grad_norm": 0.514301061630249,
"learning_rate": 3.442539734905049e-06,
"loss": 0.15547773241996765,
"step": 1492
},
{
"epoch": 3.3126385809312637,
"grad_norm": 0.8231419920921326,
"learning_rate": 3.424664618842897e-06,
"loss": 0.1262798309326172,
"step": 1494
},
{
"epoch": 3.317073170731707,
"grad_norm": 0.6278258562088013,
"learning_rate": 3.4068914197238352e-06,
"loss": 0.17141902446746826,
"step": 1496
},
{
"epoch": 3.3215077605321506,
"grad_norm": 0.7143641710281372,
"learning_rate": 3.389220376664687e-06,
"loss": 0.2325032353401184,
"step": 1498
},
{
"epoch": 3.3259423503325944,
"grad_norm": 0.6291862726211548,
"learning_rate": 3.3716517274078842e-06,
"loss": 0.1395445019006729,
"step": 1500
},
{
"epoch": 3.330376940133038,
"grad_norm": 1.111968994140625,
"learning_rate": 3.354185708318284e-06,
"loss": 0.19360409677028656,
"step": 1502
},
{
"epoch": 3.3348115299334813,
"grad_norm": 0.4316374659538269,
"learning_rate": 3.3368225543799716e-06,
"loss": 0.19091464579105377,
"step": 1504
},
{
"epoch": 3.3392461197339247,
"grad_norm": 0.07719559222459793,
"learning_rate": 3.3195624991931074e-06,
"loss": 0.0855455994606018,
"step": 1506
},
{
"epoch": 3.343680709534368,
"grad_norm": 0.48246321082115173,
"learning_rate": 3.302405774970788e-06,
"loss": 0.08791041374206543,
"step": 1508
},
{
"epoch": 3.3481152993348116,
"grad_norm": 0.36730292439460754,
"learning_rate": 3.2853526125359105e-06,
"loss": 0.12776361405849457,
"step": 1510
},
{
"epoch": 3.352549889135255,
"grad_norm": 0.09562593698501587,
"learning_rate": 3.26840324131808e-06,
"loss": 0.0983489602804184,
"step": 1512
},
{
"epoch": 3.3569844789356984,
"grad_norm": 0.7086212038993835,
"learning_rate": 3.251557889350514e-06,
"loss": 0.23420387506484985,
"step": 1514
},
{
"epoch": 3.361419068736142,
"grad_norm": 5.378333568572998,
"learning_rate": 3.2348167832669754e-06,
"loss": 0.10752184689044952,
"step": 1516
},
{
"epoch": 3.3658536585365852,
"grad_norm": 0.5152938961982727,
"learning_rate": 3.218180148298732e-06,
"loss": 0.21186313033103943,
"step": 1518
},
{
"epoch": 3.3702882483370287,
"grad_norm": 1.4693471193313599,
"learning_rate": 3.201648208271507e-06,
"loss": 0.19114084541797638,
"step": 1520
},
{
"epoch": 3.374722838137472,
"grad_norm": 0.12920020520687103,
"learning_rate": 3.185221185602497e-06,
"loss": 0.12129313498735428,
"step": 1522
},
{
"epoch": 3.3791574279379155,
"grad_norm": 0.8857243061065674,
"learning_rate": 3.168899301297347e-06,
"loss": 0.21523553133010864,
"step": 1524
},
{
"epoch": 3.3835920177383594,
"grad_norm": 0.7426590919494629,
"learning_rate": 3.152682774947202e-06,
"loss": 0.1364864557981491,
"step": 1526
},
{
"epoch": 3.388026607538803,
"grad_norm": 0.7999682426452637,
"learning_rate": 3.136571824725744e-06,
"loss": 0.0897040143609047,
"step": 1528
},
{
"epoch": 3.3924611973392462,
"grad_norm": 0.6461058855056763,
"learning_rate": 3.1205666673862484e-06,
"loss": 0.09447822719812393,
"step": 1530
},
{
"epoch": 3.3968957871396896,
"grad_norm": 0.3650994300842285,
"learning_rate": 3.104667518258688e-06,
"loss": 0.041886042803525925,
"step": 1532
},
{
"epoch": 3.401330376940133,
"grad_norm": 1.1809720993041992,
"learning_rate": 3.0888745912468123e-06,
"loss": 0.13893677294254303,
"step": 1534
},
{
"epoch": 3.4057649667405765,
"grad_norm": 0.5130560398101807,
"learning_rate": 3.073188098825285e-06,
"loss": 0.19634631276130676,
"step": 1536
},
{
"epoch": 3.41019955654102,
"grad_norm": 0.7646129131317139,
"learning_rate": 3.0576082520368265e-06,
"loss": 0.11035222560167313,
"step": 1538
},
{
"epoch": 3.4146341463414633,
"grad_norm": 1.119156837463379,
"learning_rate": 3.0421352604893602e-06,
"loss": 0.23807543516159058,
"step": 1540
},
{
"epoch": 3.4190687361419068,
"grad_norm": 0.4573220908641815,
"learning_rate": 3.0267693323532116e-06,
"loss": 0.14719665050506592,
"step": 1542
},
{
"epoch": 3.42350332594235,
"grad_norm": 0.683412492275238,
"learning_rate": 3.0115106743582922e-06,
"loss": 0.21427640318870544,
"step": 1544
},
{
"epoch": 3.4279379157427936,
"grad_norm": 0.5579946637153625,
"learning_rate": 2.9963594917913248e-06,
"loss": 0.02915109321475029,
"step": 1546
},
{
"epoch": 3.4323725055432375,
"grad_norm": 0.10574361681938171,
"learning_rate": 2.981315988493084e-06,
"loss": 0.04074406251311302,
"step": 1548
},
{
"epoch": 3.436807095343681,
"grad_norm": 0.366202175617218,
"learning_rate": 2.9663803668556424e-06,
"loss": 0.22145552933216095,
"step": 1550
},
{
"epoch": 3.4412416851441243,
"grad_norm": 0.5682427287101746,
"learning_rate": 2.9515528278196665e-06,
"loss": 0.25287312269210815,
"step": 1552
},
{
"epoch": 3.4456762749445677,
"grad_norm": 0.10395639389753342,
"learning_rate": 2.936833570871694e-06,
"loss": 0.11668358743190765,
"step": 1554
},
{
"epoch": 3.450110864745011,
"grad_norm": 0.631152868270874,
"learning_rate": 2.922222794041464e-06,
"loss": 0.23132863640785217,
"step": 1556
},
{
"epoch": 3.4545454545454546,
"grad_norm": 0.881669282913208,
"learning_rate": 2.907720693899243e-06,
"loss": 0.330628901720047,
"step": 1558
},
{
"epoch": 3.458980044345898,
"grad_norm": 0.462612122297287,
"learning_rate": 2.8933274655531874e-06,
"loss": 0.25399714708328247,
"step": 1560
},
{
"epoch": 3.4634146341463414,
"grad_norm": 0.5779225826263428,
"learning_rate": 2.879043302646717e-06,
"loss": 0.039646755903959274,
"step": 1562
},
{
"epoch": 3.467849223946785,
"grad_norm": 0.32095006108283997,
"learning_rate": 2.8648683973559054e-06,
"loss": 0.23187652230262756,
"step": 1564
},
{
"epoch": 3.4722838137472283,
"grad_norm": 0.3223656415939331,
"learning_rate": 2.8508029403868962e-06,
"loss": 0.09090401232242584,
"step": 1566
},
{
"epoch": 3.4767184035476717,
"grad_norm": 0.5520133376121521,
"learning_rate": 2.836847120973345e-06,
"loss": 0.15556883811950684,
"step": 1568
},
{
"epoch": 3.481152993348115,
"grad_norm": 0.47338053584098816,
"learning_rate": 2.8230011268738593e-06,
"loss": 0.09746363013982773,
"step": 1570
},
{
"epoch": 3.4855875831485585,
"grad_norm": 0.1202714741230011,
"learning_rate": 2.8092651443694886e-06,
"loss": 0.13933829963207245,
"step": 1572
},
{
"epoch": 3.4900221729490024,
"grad_norm": 0.6928906440734863,
"learning_rate": 2.795639358261202e-06,
"loss": 0.43705928325653076,
"step": 1574
},
{
"epoch": 3.494456762749446,
"grad_norm": 0.22218959033489227,
"learning_rate": 2.782123951867415e-06,
"loss": 0.12843255698680878,
"step": 1576
},
{
"epoch": 3.4988913525498893,
"grad_norm": 0.4401395618915558,
"learning_rate": 2.7687191070215174e-06,
"loss": 0.11058890074491501,
"step": 1578
},
{
"epoch": 3.5033259423503327,
"grad_norm": 0.4982577860355377,
"learning_rate": 2.755425004069424e-06,
"loss": 0.20767910778522491,
"step": 1580
},
{
"epoch": 3.507760532150776,
"grad_norm": 0.5209600925445557,
"learning_rate": 2.7422418218671586e-06,
"loss": 0.3028036952018738,
"step": 1582
},
{
"epoch": 3.5121951219512195,
"grad_norm": 0.6526494026184082,
"learning_rate": 2.7291697377784325e-06,
"loss": 0.13182812929153442,
"step": 1584
},
{
"epoch": 3.516629711751663,
"grad_norm": 0.5955665707588196,
"learning_rate": 2.7162089276722746e-06,
"loss": 0.11612501740455627,
"step": 1586
},
{
"epoch": 3.5210643015521064,
"grad_norm": 0.5240582227706909,
"learning_rate": 2.703359565920651e-06,
"loss": 0.19106577336788177,
"step": 1588
},
{
"epoch": 3.52549889135255,
"grad_norm": 0.5816933512687683,
"learning_rate": 2.6906218253961285e-06,
"loss": 0.052692461758852005,
"step": 1590
},
{
"epoch": 3.529933481152993,
"grad_norm": 1.794288992881775,
"learning_rate": 2.6779958774695487e-06,
"loss": 0.15381264686584473,
"step": 1592
},
{
"epoch": 3.5343680709534366,
"grad_norm": 0.6399196982383728,
"learning_rate": 2.665481892007714e-06,
"loss": 0.25606346130371094,
"step": 1594
},
{
"epoch": 3.5388026607538805,
"grad_norm": 0.4062730371952057,
"learning_rate": 2.6530800373711097e-06,
"loss": 0.021856600418686867,
"step": 1596
},
{
"epoch": 3.5432372505543235,
"grad_norm": 0.5443702936172485,
"learning_rate": 2.640790480411638e-06,
"loss": 0.08779677748680115,
"step": 1598
},
{
"epoch": 3.5476718403547673,
"grad_norm": 1.7016083002090454,
"learning_rate": 2.628613386470371e-06,
"loss": 0.1265704333782196,
"step": 1600
},
{
"epoch": 3.5521064301552108,
"grad_norm": 0.5498143434524536,
"learning_rate": 2.61654891937533e-06,
"loss": 0.19086270034313202,
"step": 1602
},
{
"epoch": 3.556541019955654,
"grad_norm": 0.5192769765853882,
"learning_rate": 2.6045972414392735e-06,
"loss": 0.3860751688480377,
"step": 1604
},
{
"epoch": 3.5609756097560976,
"grad_norm": 1.643974781036377,
"learning_rate": 2.5927585134575233e-06,
"loss": 0.2832165062427521,
"step": 1606
},
{
"epoch": 3.565410199556541,
"grad_norm": 0.16696669161319733,
"learning_rate": 2.581032894705798e-06,
"loss": 0.013047085143625736,
"step": 1608
},
{
"epoch": 3.5698447893569845,
"grad_norm": 0.5006920099258423,
"learning_rate": 2.5694205429380616e-06,
"loss": 0.17075103521347046,
"step": 1610
},
{
"epoch": 3.574279379157428,
"grad_norm": 0.4067634642124176,
"learning_rate": 2.5579216143844153e-06,
"loss": 0.049309611320495605,
"step": 1612
},
{
"epoch": 3.5787139689578713,
"grad_norm": 0.8766622543334961,
"learning_rate": 2.5465362637489847e-06,
"loss": 0.1669972687959671,
"step": 1614
},
{
"epoch": 3.5831485587583147,
"grad_norm": 0.7486819624900818,
"learning_rate": 2.5352646442078472e-06,
"loss": 0.20184892416000366,
"step": 1616
},
{
"epoch": 3.587583148558758,
"grad_norm": 0.6373207569122314,
"learning_rate": 2.524106907406959e-06,
"loss": 0.1479307860136032,
"step": 1618
},
{
"epoch": 3.5920177383592016,
"grad_norm": 1.1294218301773071,
"learning_rate": 2.513063203460127e-06,
"loss": 0.15324336290359497,
"step": 1620
},
{
"epoch": 3.5964523281596454,
"grad_norm": 0.4940034747123718,
"learning_rate": 2.502133680946985e-06,
"loss": 0.260329931974411,
"step": 1622
},
{
"epoch": 3.6008869179600884,
"grad_norm": 0.5072565674781799,
"learning_rate": 2.4913184869109925e-06,
"loss": 0.14236906170845032,
"step": 1624
},
{
"epoch": 3.6053215077605323,
"grad_norm": 0.14243106544017792,
"learning_rate": 2.4806177668574564e-06,
"loss": 0.03839609771966934,
"step": 1626
},
{
"epoch": 3.6097560975609757,
"grad_norm": 0.462656706571579,
"learning_rate": 2.4700316647515805e-06,
"loss": 0.1687300205230713,
"step": 1628
},
{
"epoch": 3.614190687361419,
"grad_norm": 2.395517587661743,
"learning_rate": 2.459560323016518e-06,
"loss": 0.11912352591753006,
"step": 1630
},
{
"epoch": 3.6186252771618626,
"grad_norm": 0.4201103746891022,
"learning_rate": 2.4492038825314637e-06,
"loss": 0.148905947804451,
"step": 1632
},
{
"epoch": 3.623059866962306,
"grad_norm": 1.4418302774429321,
"learning_rate": 2.438962482629751e-06,
"loss": 0.19345171749591827,
"step": 1634
},
{
"epoch": 3.6274944567627494,
"grad_norm": 0.47817596793174744,
"learning_rate": 2.42883626109699e-06,
"loss": 0.12935222685337067,
"step": 1636
},
{
"epoch": 3.631929046563193,
"grad_norm": 0.543739914894104,
"learning_rate": 2.4188253541691973e-06,
"loss": 0.1430729478597641,
"step": 1638
},
{
"epoch": 3.6363636363636362,
"grad_norm": 0.43734827637672424,
"learning_rate": 2.4089298965309753e-06,
"loss": 0.19318100810050964,
"step": 1640
},
{
"epoch": 3.6407982261640797,
"grad_norm": 0.2320224642753601,
"learning_rate": 2.399150021313699e-06,
"loss": 0.0949181392788887,
"step": 1642
},
{
"epoch": 3.6452328159645235,
"grad_norm": 0.6401042938232422,
"learning_rate": 2.389485860093715e-06,
"loss": 0.2700011730194092,
"step": 1644
},
{
"epoch": 3.6496674057649665,
"grad_norm": 0.12314002215862274,
"learning_rate": 2.3799375428905864e-06,
"loss": 0.07954643666744232,
"step": 1646
},
{
"epoch": 3.6541019955654104,
"grad_norm": 0.7883126735687256,
"learning_rate": 2.3705051981653315e-06,
"loss": 0.07769718766212463,
"step": 1648
},
{
"epoch": 3.658536585365854,
"grad_norm": 0.7627129554748535,
"learning_rate": 2.361188952818697e-06,
"loss": 0.2676461338996887,
"step": 1650
},
{
"epoch": 3.662971175166297,
"grad_norm": 0.8268294334411621,
"learning_rate": 2.3519889321894603e-06,
"loss": 0.4033682346343994,
"step": 1652
},
{
"epoch": 3.6674057649667406,
"grad_norm": 2.1596076488494873,
"learning_rate": 2.34290526005273e-06,
"loss": 0.09330250322818756,
"step": 1654
},
{
"epoch": 3.671840354767184,
"grad_norm": 0.6786802411079407,
"learning_rate": 2.3339380586182904e-06,
"loss": 0.23048776388168335,
"step": 1656
},
{
"epoch": 3.6762749445676275,
"grad_norm": 0.8763942718505859,
"learning_rate": 2.3250874485289545e-06,
"loss": 0.13142776489257812,
"step": 1658
},
{
"epoch": 3.680709534368071,
"grad_norm": 0.49550583958625793,
"learning_rate": 2.3163535488589363e-06,
"loss": 0.17957837879657745,
"step": 1660
},
{
"epoch": 3.6851441241685143,
"grad_norm": 0.08660886436700821,
"learning_rate": 2.3077364771122573e-06,
"loss": 0.12105847150087357,
"step": 1662
},
{
"epoch": 3.6895787139689578,
"grad_norm": 0.2725079655647278,
"learning_rate": 2.299236349221157e-06,
"loss": 0.06378458440303802,
"step": 1664
},
{
"epoch": 3.694013303769401,
"grad_norm": 0.40256035327911377,
"learning_rate": 2.2908532795445414e-06,
"loss": 0.187424436211586,
"step": 1666
},
{
"epoch": 3.6984478935698446,
"grad_norm": 0.4576587975025177,
"learning_rate": 2.2825873808664363e-06,
"loss": 0.25221118330955505,
"step": 1668
},
{
"epoch": 3.7028824833702885,
"grad_norm": 0.5043409466743469,
"learning_rate": 2.2744387643944757e-06,
"loss": 0.1796739250421524,
"step": 1670
},
{
"epoch": 3.7073170731707314,
"grad_norm": 0.5289079546928406,
"learning_rate": 2.2664075397584066e-06,
"loss": 0.15418490767478943,
"step": 1672
},
{
"epoch": 3.7117516629711753,
"grad_norm": 0.5016271471977234,
"learning_rate": 2.258493815008605e-06,
"loss": 0.23040637373924255,
"step": 1674
},
{
"epoch": 3.7161862527716187,
"grad_norm": 0.5144860744476318,
"learning_rate": 2.2506976966146355e-06,
"loss": 0.21655163168907166,
"step": 1676
},
{
"epoch": 3.720620842572062,
"grad_norm": 0.5468173027038574,
"learning_rate": 2.2430192894638077e-06,
"loss": 0.19511225819587708,
"step": 1678
},
{
"epoch": 3.7250554323725056,
"grad_norm": 0.6539567112922668,
"learning_rate": 2.235458696859768e-06,
"loss": 0.05055548995733261,
"step": 1680
},
{
"epoch": 3.729490022172949,
"grad_norm": 0.5066478848457336,
"learning_rate": 2.228016020521116e-06,
"loss": 0.17900614440441132,
"step": 1682
},
{
"epoch": 3.7339246119733924,
"grad_norm": 0.5024972558021545,
"learning_rate": 2.2206913605800267e-06,
"loss": 0.12050139158964157,
"step": 1684
},
{
"epoch": 3.738359201773836,
"grad_norm": 0.5398685932159424,
"learning_rate": 2.213484815580911e-06,
"loss": 0.12008091807365417,
"step": 1686
},
{
"epoch": 3.7427937915742793,
"grad_norm": 0.10834494233131409,
"learning_rate": 2.206396482479084e-06,
"loss": 0.03123791143298149,
"step": 1688
},
{
"epoch": 3.7472283813747227,
"grad_norm": 0.6228474974632263,
"learning_rate": 2.199426456639465e-06,
"loss": 0.22591347992420197,
"step": 1690
},
{
"epoch": 3.7516629711751666,
"grad_norm": 0.8757428526878357,
"learning_rate": 2.192574831835291e-06,
"loss": 0.1378636211156845,
"step": 1692
},
{
"epoch": 3.7560975609756095,
"grad_norm": 0.5694209933280945,
"learning_rate": 2.185841700246857e-06,
"loss": 0.24412274360656738,
"step": 1694
},
{
"epoch": 3.7605321507760534,
"grad_norm": 0.494783490896225,
"learning_rate": 2.1792271524602786e-06,
"loss": 0.23211520910263062,
"step": 1696
},
{
"epoch": 3.764966740576497,
"grad_norm": 0.5232568979263306,
"learning_rate": 2.1727312774662656e-06,
"loss": 0.12440581619739532,
"step": 1698
},
{
"epoch": 3.7694013303769403,
"grad_norm": 0.4039710462093353,
"learning_rate": 2.1663541626589337e-06,
"loss": 0.11090154200792313,
"step": 1700
},
{
"epoch": 3.7738359201773837,
"grad_norm": 0.48914211988449097,
"learning_rate": 2.1600958938346202e-06,
"loss": 0.5025262832641602,
"step": 1702
},
{
"epoch": 3.778270509977827,
"grad_norm": 0.18319113552570343,
"learning_rate": 2.153956555190738e-06,
"loss": 0.02325468324124813,
"step": 1704
},
{
"epoch": 3.7827050997782705,
"grad_norm": 0.10409087687730789,
"learning_rate": 2.147936229324637e-06,
"loss": 0.1210860013961792,
"step": 1706
},
{
"epoch": 3.787139689578714,
"grad_norm": 0.5911566615104675,
"learning_rate": 2.1420349972324942e-06,
"loss": 0.11488822847604752,
"step": 1708
},
{
"epoch": 3.7915742793791574,
"grad_norm": 0.5132036209106445,
"learning_rate": 2.1362529383082255e-06,
"loss": 0.30858707427978516,
"step": 1710
},
{
"epoch": 3.796008869179601,
"grad_norm": 0.28792333602905273,
"learning_rate": 2.1305901303424143e-06,
"loss": 0.1212579756975174,
"step": 1712
},
{
"epoch": 3.800443458980044,
"grad_norm": 0.7928282618522644,
"learning_rate": 2.1250466495212697e-06,
"loss": 0.1450139433145523,
"step": 1714
},
{
"epoch": 3.8048780487804876,
"grad_norm": 2.0321249961853027,
"learning_rate": 2.119622570425598e-06,
"loss": 0.19779935479164124,
"step": 1716
},
{
"epoch": 3.8093126385809315,
"grad_norm": 0.3824866712093353,
"learning_rate": 2.1143179660298e-06,
"loss": 0.1265445351600647,
"step": 1718
},
{
"epoch": 3.8137472283813745,
"grad_norm": 0.5937016606330872,
"learning_rate": 2.109132907700888e-06,
"loss": 0.11517294496297836,
"step": 1720
},
{
"epoch": 3.8181818181818183,
"grad_norm": 0.7648696899414062,
"learning_rate": 2.1040674651975297e-06,
"loss": 0.21361251175403595,
"step": 1722
},
{
"epoch": 3.8226164079822618,
"grad_norm": 0.8102192282676697,
"learning_rate": 2.099121706669106e-06,
"loss": 0.24782630801200867,
"step": 1724
},
{
"epoch": 3.827050997782705,
"grad_norm": 0.5768070220947266,
"learning_rate": 2.0942956986547953e-06,
"loss": 0.3066186010837555,
"step": 1726
},
{
"epoch": 3.8314855875831486,
"grad_norm": 0.4514220356941223,
"learning_rate": 2.0895895060826777e-06,
"loss": 0.08890463411808014,
"step": 1728
},
{
"epoch": 3.835920177383592,
"grad_norm": 0.387746661901474,
"learning_rate": 2.085003192268862e-06,
"loss": 0.11902990192174911,
"step": 1730
},
{
"epoch": 3.8403547671840355,
"grad_norm": 0.5934492349624634,
"learning_rate": 2.0805368189166347e-06,
"loss": 0.26432839035987854,
"step": 1732
},
{
"epoch": 3.844789356984479,
"grad_norm": 0.8290284276008606,
"learning_rate": 2.076190446115625e-06,
"loss": 0.13800962269306183,
"step": 1734
},
{
"epoch": 3.8492239467849223,
"grad_norm": 0.42487943172454834,
"learning_rate": 2.0719641323410084e-06,
"loss": 0.1366715282201767,
"step": 1736
},
{
"epoch": 3.8536585365853657,
"grad_norm": 0.5068730711936951,
"learning_rate": 2.0678579344527038e-06,
"loss": 0.18744944036006927,
"step": 1738
},
{
"epoch": 3.858093126385809,
"grad_norm": 0.45418834686279297,
"learning_rate": 2.0638719076946213e-06,
"loss": 0.12399666011333466,
"step": 1740
},
{
"epoch": 3.8625277161862526,
"grad_norm": 0.5102381706237793,
"learning_rate": 2.060006105693913e-06,
"loss": 0.11724897474050522,
"step": 1742
},
{
"epoch": 3.8669623059866964,
"grad_norm": 0.5589990615844727,
"learning_rate": 2.056260580460251e-06,
"loss": 0.15366147458553314,
"step": 1744
},
{
"epoch": 3.8713968957871394,
"grad_norm": 0.4272408187389374,
"learning_rate": 2.052635382385134e-06,
"loss": 0.16997916996479034,
"step": 1746
},
{
"epoch": 3.8758314855875833,
"grad_norm": 0.8717123866081238,
"learning_rate": 2.0491305602411997e-06,
"loss": 0.11671534180641174,
"step": 1748
},
{
"epoch": 3.8802660753880267,
"grad_norm": 0.5559657216072083,
"learning_rate": 2.0457461611815782e-06,
"loss": 0.15400242805480957,
"step": 1750
},
{
"epoch": 3.88470066518847,
"grad_norm": 0.6432749032974243,
"learning_rate": 2.0424822307392493e-06,
"loss": 0.18111613392829895,
"step": 1752
},
{
"epoch": 3.8891352549889135,
"grad_norm": 0.5203759074211121,
"learning_rate": 2.039338812826436e-06,
"loss": 0.17084263265132904,
"step": 1754
},
{
"epoch": 3.893569844789357,
"grad_norm": 1.0190702676773071,
"learning_rate": 2.036315949734011e-06,
"loss": 0.1340053379535675,
"step": 1756
},
{
"epoch": 3.8980044345898004,
"grad_norm": 2.1606268882751465,
"learning_rate": 2.0334136821309286e-06,
"loss": 0.23111629486083984,
"step": 1758
},
{
"epoch": 3.902439024390244,
"grad_norm": 0.013512303121387959,
"learning_rate": 2.0306320490636767e-06,
"loss": 0.04244675859808922,
"step": 1760
},
{
"epoch": 3.9068736141906872,
"grad_norm": 0.33751603960990906,
"learning_rate": 2.027971087955753e-06,
"loss": 0.050674207508563995,
"step": 1762
},
{
"epoch": 3.9113082039911307,
"grad_norm": 0.043730415403842926,
"learning_rate": 2.0254308346071574e-06,
"loss": 0.13882163166999817,
"step": 1764
},
{
"epoch": 3.9157427937915745,
"grad_norm": 0.37790897488594055,
"learning_rate": 2.023011323193917e-06,
"loss": 0.16915282607078552,
"step": 1766
},
{
"epoch": 3.9201773835920175,
"grad_norm": 0.5498037338256836,
"learning_rate": 2.020712586267621e-06,
"loss": 0.24210064113140106,
"step": 1768
},
{
"epoch": 3.9246119733924614,
"grad_norm": 0.5351380109786987,
"learning_rate": 2.018534654754984e-06,
"loss": 0.2681524157524109,
"step": 1770
},
{
"epoch": 3.929046563192905,
"grad_norm": 0.7002694606781006,
"learning_rate": 2.016477557957432e-06,
"loss": 0.0865524411201477,
"step": 1772
},
{
"epoch": 3.933481152993348,
"grad_norm": 1.273296594619751,
"learning_rate": 2.0145413235507057e-06,
"loss": 0.15231235325336456,
"step": 1774
},
{
"epoch": 3.9379157427937916,
"grad_norm": 0.560060977935791,
"learning_rate": 2.0127259775844882e-06,
"loss": 0.2978004813194275,
"step": 1776
},
{
"epoch": 3.942350332594235,
"grad_norm": 0.08104455471038818,
"learning_rate": 2.0110315444820557e-06,
"loss": 0.015620124526321888,
"step": 1778
},
{
"epoch": 3.9467849223946785,
"grad_norm": 1.2214912176132202,
"learning_rate": 2.0094580470399507e-06,
"loss": 0.08288650959730148,
"step": 1780
},
{
"epoch": 3.951219512195122,
"grad_norm": 0.08862635493278503,
"learning_rate": 2.0080055064276703e-06,
"loss": 0.11820105463266373,
"step": 1782
},
{
"epoch": 3.9556541019955653,
"grad_norm": 0.6492655277252197,
"learning_rate": 2.0066739421873856e-06,
"loss": 0.23602721095085144,
"step": 1784
},
{
"epoch": 3.9600886917960088,
"grad_norm": 0.4690077602863312,
"learning_rate": 2.0054633722336776e-06,
"loss": 0.17881526052951813,
"step": 1786
},
{
"epoch": 3.964523281596452,
"grad_norm": 0.4643179774284363,
"learning_rate": 2.0043738128532943e-06,
"loss": 0.1461382508277893,
"step": 1788
},
{
"epoch": 3.9689578713968956,
"grad_norm": 0.33378270268440247,
"learning_rate": 2.003405278704937e-06,
"loss": 0.12822888791561127,
"step": 1790
},
{
"epoch": 3.9733924611973395,
"grad_norm": 0.7190065979957581,
"learning_rate": 2.002557782819055e-06,
"loss": 0.1802365928888321,
"step": 1792
},
{
"epoch": 3.9778270509977824,
"grad_norm": 0.6157906651496887,
"learning_rate": 2.001831336597679e-06,
"loss": 0.09615038335323334,
"step": 1794
},
{
"epoch": 3.9822616407982263,
"grad_norm": 0.4767264127731323,
"learning_rate": 2.0012259498142596e-06,
"loss": 0.13788002729415894,
"step": 1796
},
{
"epoch": 3.9866962305986697,
"grad_norm": 0.499039888381958,
"learning_rate": 2.00074163061354e-06,
"loss": 0.18137040734291077,
"step": 1798
},
{
"epoch": 3.991130820399113,
"grad_norm": 0.6075534224510193,
"learning_rate": 2.000378385511451e-06,
"loss": 0.10324703902006149,
"step": 1800
},
{
"epoch": 3.9955654101995566,
"grad_norm": 0.5070518851280212,
"learning_rate": 2.000136219395011e-06,
"loss": 0.16305242478847504,
"step": 1802
},
{
"epoch": 4.0,
"grad_norm": 0.4194043278694153,
"learning_rate": 2.0000151355222728e-06,
"loss": 0.06611192226409912,
"step": 1804
},
{
"epoch": 4.0,
"step": 1804,
"total_flos": 3.4175049861232067e+18,
"train_loss": 0.6816160985415268,
"train_runtime": 8301.6433,
"train_samples_per_second": 6.519,
"train_steps_per_second": 0.217
}
],
"logging_steps": 2,
"max_steps": 1804,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.4175049861232067e+18,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}