redsgnaoh's picture
Upload folder using huggingface_hub
cd7b244 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7575068243858053,
"eval_steps": 500,
"global_step": 1665,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00045495905368516835,
"grad_norm": 2.3685307115973546,
"learning_rate": 5e-06,
"loss": 0.0587,
"step": 1
},
{
"epoch": 0.0009099181073703367,
"grad_norm": 3.207290006513166,
"learning_rate": 4.999999897855645e-06,
"loss": 0.0976,
"step": 2
},
{
"epoch": 0.001364877161055505,
"grad_norm": 3.061584755625611,
"learning_rate": 4.9999995914225884e-06,
"loss": 0.1138,
"step": 3
},
{
"epoch": 0.0018198362147406734,
"grad_norm": 2.4708172493174265,
"learning_rate": 4.999999080700855e-06,
"loss": 0.102,
"step": 4
},
{
"epoch": 0.0022747952684258415,
"grad_norm": 2.7122863978048204,
"learning_rate": 4.999998365690486e-06,
"loss": 0.0899,
"step": 5
},
{
"epoch": 0.00272975432211101,
"grad_norm": 2.1348308028500367,
"learning_rate": 4.999997446391542e-06,
"loss": 0.0589,
"step": 6
},
{
"epoch": 0.0031847133757961785,
"grad_norm": 1.9525029408374595,
"learning_rate": 4.999996322804095e-06,
"loss": 0.0692,
"step": 7
},
{
"epoch": 0.003639672429481347,
"grad_norm": 2.4972521600201087,
"learning_rate": 4.999994994928239e-06,
"loss": 0.094,
"step": 8
},
{
"epoch": 0.004094631483166515,
"grad_norm": 1.3057783939017902,
"learning_rate": 4.999993462764082e-06,
"loss": 0.0401,
"step": 9
},
{
"epoch": 0.004549590536851683,
"grad_norm": 1.8178622655461494,
"learning_rate": 4.999991726311749e-06,
"loss": 0.0508,
"step": 10
},
{
"epoch": 0.005004549590536852,
"grad_norm": 1.8904298363447831,
"learning_rate": 4.999989785571382e-06,
"loss": 0.0466,
"step": 11
},
{
"epoch": 0.00545950864422202,
"grad_norm": 2.397431505721498,
"learning_rate": 4.999987640543139e-06,
"loss": 0.0684,
"step": 12
},
{
"epoch": 0.005914467697907188,
"grad_norm": 2.121710266227225,
"learning_rate": 4.999985291227196e-06,
"loss": 0.0729,
"step": 13
},
{
"epoch": 0.006369426751592357,
"grad_norm": 2.9696000985831614,
"learning_rate": 4.999982737623746e-06,
"loss": 0.0922,
"step": 14
},
{
"epoch": 0.006824385805277525,
"grad_norm": 2.270433126704546,
"learning_rate": 4.999979979732995e-06,
"loss": 0.0946,
"step": 15
},
{
"epoch": 0.007279344858962694,
"grad_norm": 1.9380248124362378,
"learning_rate": 4.999977017555171e-06,
"loss": 0.0578,
"step": 16
},
{
"epoch": 0.0077343039126478615,
"grad_norm": 2.6281882171357958,
"learning_rate": 4.999973851090514e-06,
"loss": 0.1147,
"step": 17
},
{
"epoch": 0.00818926296633303,
"grad_norm": 2.40029765076707,
"learning_rate": 4.999970480339284e-06,
"loss": 0.0906,
"step": 18
},
{
"epoch": 0.008644222020018199,
"grad_norm": 2.889640814144301,
"learning_rate": 4.9999669053017564e-06,
"loss": 0.0792,
"step": 19
},
{
"epoch": 0.009099181073703366,
"grad_norm": 2.3110994220860883,
"learning_rate": 4.9999631259782235e-06,
"loss": 0.0751,
"step": 20
},
{
"epoch": 0.009554140127388535,
"grad_norm": 2.6890244705482806,
"learning_rate": 4.999959142368993e-06,
"loss": 0.0966,
"step": 21
},
{
"epoch": 0.010009099181073703,
"grad_norm": 2.2488041264680563,
"learning_rate": 4.999954954474391e-06,
"loss": 0.0714,
"step": 22
},
{
"epoch": 0.010464058234758872,
"grad_norm": 2.0642223983397883,
"learning_rate": 4.9999505622947594e-06,
"loss": 0.0881,
"step": 23
},
{
"epoch": 0.01091901728844404,
"grad_norm": 2.384727655713489,
"learning_rate": 4.999945965830458e-06,
"loss": 0.0992,
"step": 24
},
{
"epoch": 0.011373976342129208,
"grad_norm": 2.2739375250381504,
"learning_rate": 4.999941165081863e-06,
"loss": 0.0831,
"step": 25
},
{
"epoch": 0.011828935395814377,
"grad_norm": 1.6418905911049972,
"learning_rate": 4.999936160049364e-06,
"loss": 0.0662,
"step": 26
},
{
"epoch": 0.012283894449499545,
"grad_norm": 2.029045596294324,
"learning_rate": 4.999930950733373e-06,
"loss": 0.097,
"step": 27
},
{
"epoch": 0.012738853503184714,
"grad_norm": 2.2833378337725287,
"learning_rate": 4.999925537134312e-06,
"loss": 0.0823,
"step": 28
},
{
"epoch": 0.013193812556869881,
"grad_norm": 2.611896749496796,
"learning_rate": 4.9999199192526286e-06,
"loss": 0.1115,
"step": 29
},
{
"epoch": 0.01364877161055505,
"grad_norm": 2.4812612616344865,
"learning_rate": 4.9999140970887775e-06,
"loss": 0.0854,
"step": 30
},
{
"epoch": 0.014103730664240218,
"grad_norm": 2.0837983680092904,
"learning_rate": 4.999908070643236e-06,
"loss": 0.0837,
"step": 31
},
{
"epoch": 0.014558689717925387,
"grad_norm": 2.0812008840647827,
"learning_rate": 4.999901839916495e-06,
"loss": 0.064,
"step": 32
},
{
"epoch": 0.015013648771610554,
"grad_norm": 1.5275195881020318,
"learning_rate": 4.999895404909067e-06,
"loss": 0.0582,
"step": 33
},
{
"epoch": 0.015468607825295723,
"grad_norm": 2.703502541064391,
"learning_rate": 4.999888765621476e-06,
"loss": 0.1102,
"step": 34
},
{
"epoch": 0.01592356687898089,
"grad_norm": 1.7231856796809104,
"learning_rate": 4.999881922054264e-06,
"loss": 0.0571,
"step": 35
},
{
"epoch": 0.01637852593266606,
"grad_norm": 1.6472076658400754,
"learning_rate": 4.999874874207991e-06,
"loss": 0.0536,
"step": 36
},
{
"epoch": 0.01683348498635123,
"grad_norm": 2.902300005488672,
"learning_rate": 4.999867622083232e-06,
"loss": 0.1302,
"step": 37
},
{
"epoch": 0.017288444040036398,
"grad_norm": 1.9543380822482044,
"learning_rate": 4.99986016568058e-06,
"loss": 0.0983,
"step": 38
},
{
"epoch": 0.017743403093721567,
"grad_norm": 1.814859572890468,
"learning_rate": 4.999852505000646e-06,
"loss": 0.0717,
"step": 39
},
{
"epoch": 0.018198362147406732,
"grad_norm": 1.882630749677819,
"learning_rate": 4.999844640044053e-06,
"loss": 0.07,
"step": 40
},
{
"epoch": 0.0186533212010919,
"grad_norm": 2.4063115131397823,
"learning_rate": 4.999836570811445e-06,
"loss": 0.0873,
"step": 41
},
{
"epoch": 0.01910828025477707,
"grad_norm": 2.9701013712692035,
"learning_rate": 4.999828297303483e-06,
"loss": 0.0957,
"step": 42
},
{
"epoch": 0.019563239308462238,
"grad_norm": 2.207833234895104,
"learning_rate": 4.9998198195208405e-06,
"loss": 0.0879,
"step": 43
},
{
"epoch": 0.020018198362147407,
"grad_norm": 2.168760551509319,
"learning_rate": 4.999811137464212e-06,
"loss": 0.0967,
"step": 44
},
{
"epoch": 0.020473157415832575,
"grad_norm": 2.12859962179133,
"learning_rate": 4.999802251134307e-06,
"loss": 0.1028,
"step": 45
},
{
"epoch": 0.020928116469517744,
"grad_norm": 1.8067595132130894,
"learning_rate": 4.99979316053185e-06,
"loss": 0.0778,
"step": 46
},
{
"epoch": 0.021383075523202913,
"grad_norm": 3.8815722657740594,
"learning_rate": 4.999783865657585e-06,
"loss": 0.1812,
"step": 47
},
{
"epoch": 0.02183803457688808,
"grad_norm": 4.142186542548352,
"learning_rate": 4.999774366512272e-06,
"loss": 0.1981,
"step": 48
},
{
"epoch": 0.022292993630573247,
"grad_norm": 2.4946427215064015,
"learning_rate": 4.9997646630966865e-06,
"loss": 0.0866,
"step": 49
},
{
"epoch": 0.022747952684258416,
"grad_norm": 2.219814267860857,
"learning_rate": 4.999754755411621e-06,
"loss": 0.0767,
"step": 50
},
{
"epoch": 0.023202911737943584,
"grad_norm": 1.7512451842619647,
"learning_rate": 4.9997446434578865e-06,
"loss": 0.0709,
"step": 51
},
{
"epoch": 0.023657870791628753,
"grad_norm": 1.9267762038567948,
"learning_rate": 4.999734327236307e-06,
"loss": 0.0791,
"step": 52
},
{
"epoch": 0.024112829845313922,
"grad_norm": 1.3192434416131813,
"learning_rate": 4.999723806747728e-06,
"loss": 0.0611,
"step": 53
},
{
"epoch": 0.02456778889899909,
"grad_norm": 2.0553891309583787,
"learning_rate": 4.99971308199301e-06,
"loss": 0.0708,
"step": 54
},
{
"epoch": 0.02502274795268426,
"grad_norm": 1.6809260342794263,
"learning_rate": 4.999702152973025e-06,
"loss": 0.0662,
"step": 55
},
{
"epoch": 0.025477707006369428,
"grad_norm": 2.0087287549898716,
"learning_rate": 4.9996910196886694e-06,
"loss": 0.0795,
"step": 56
},
{
"epoch": 0.025932666060054597,
"grad_norm": 1.3268510730840513,
"learning_rate": 4.999679682140852e-06,
"loss": 0.0422,
"step": 57
},
{
"epoch": 0.026387625113739762,
"grad_norm": 2.646053521216802,
"learning_rate": 4.999668140330499e-06,
"loss": 0.1284,
"step": 58
},
{
"epoch": 0.02684258416742493,
"grad_norm": 1.5857988579934552,
"learning_rate": 4.999656394258555e-06,
"loss": 0.0647,
"step": 59
},
{
"epoch": 0.0272975432211101,
"grad_norm": 1.756551616255058,
"learning_rate": 4.999644443925978e-06,
"loss": 0.078,
"step": 60
},
{
"epoch": 0.027752502274795268,
"grad_norm": 2.2102751228780546,
"learning_rate": 4.999632289333746e-06,
"loss": 0.0785,
"step": 61
},
{
"epoch": 0.028207461328480437,
"grad_norm": 2.338156657994438,
"learning_rate": 4.999619930482852e-06,
"loss": 0.0835,
"step": 62
},
{
"epoch": 0.028662420382165606,
"grad_norm": 2.0921557148636616,
"learning_rate": 4.999607367374304e-06,
"loss": 0.0974,
"step": 63
},
{
"epoch": 0.029117379435850774,
"grad_norm": 1.7535396635399074,
"learning_rate": 4.999594600009131e-06,
"loss": 0.0605,
"step": 64
},
{
"epoch": 0.029572338489535943,
"grad_norm": 2.2055708873696585,
"learning_rate": 4.999581628388375e-06,
"loss": 0.0946,
"step": 65
},
{
"epoch": 0.03002729754322111,
"grad_norm": 2.5001955714674216,
"learning_rate": 4.999568452513097e-06,
"loss": 0.1549,
"step": 66
},
{
"epoch": 0.030482256596906277,
"grad_norm": 2.417716838936908,
"learning_rate": 4.9995550723843726e-06,
"loss": 0.0953,
"step": 67
},
{
"epoch": 0.030937215650591446,
"grad_norm": 1.9976883408624455,
"learning_rate": 4.999541488003295e-06,
"loss": 0.0772,
"step": 68
},
{
"epoch": 0.03139217470427662,
"grad_norm": 1.9326277047503455,
"learning_rate": 4.999527699370975e-06,
"loss": 0.0764,
"step": 69
},
{
"epoch": 0.03184713375796178,
"grad_norm": 2.0337761312716527,
"learning_rate": 4.99951370648854e-06,
"loss": 0.0659,
"step": 70
},
{
"epoch": 0.03230209281164695,
"grad_norm": 1.895878774895592,
"learning_rate": 4.999499509357132e-06,
"loss": 0.0815,
"step": 71
},
{
"epoch": 0.03275705186533212,
"grad_norm": 2.0909717848011313,
"learning_rate": 4.999485107977912e-06,
"loss": 0.084,
"step": 72
},
{
"epoch": 0.033212010919017286,
"grad_norm": 1.5271836426577585,
"learning_rate": 4.999470502352057e-06,
"loss": 0.0645,
"step": 73
},
{
"epoch": 0.03366696997270246,
"grad_norm": 2.4817155636981223,
"learning_rate": 4.999455692480759e-06,
"loss": 0.1008,
"step": 74
},
{
"epoch": 0.034121929026387623,
"grad_norm": 1.6027477251164817,
"learning_rate": 4.999440678365229e-06,
"loss": 0.0722,
"step": 75
},
{
"epoch": 0.034576888080072796,
"grad_norm": 2.164861284274037,
"learning_rate": 4.999425460006695e-06,
"loss": 0.0876,
"step": 76
},
{
"epoch": 0.03503184713375796,
"grad_norm": 1.8147143711706584,
"learning_rate": 4.9994100374063995e-06,
"loss": 0.0739,
"step": 77
},
{
"epoch": 0.03548680618744313,
"grad_norm": 2.379478288499757,
"learning_rate": 4.9993944105656035e-06,
"loss": 0.1158,
"step": 78
},
{
"epoch": 0.0359417652411283,
"grad_norm": 1.7238147576191318,
"learning_rate": 4.999378579485582e-06,
"loss": 0.0749,
"step": 79
},
{
"epoch": 0.036396724294813464,
"grad_norm": 2.1444185576728323,
"learning_rate": 4.999362544167632e-06,
"loss": 0.0937,
"step": 80
},
{
"epoch": 0.036851683348498636,
"grad_norm": 1.18142283635082,
"learning_rate": 4.99934630461306e-06,
"loss": 0.0569,
"step": 81
},
{
"epoch": 0.0373066424021838,
"grad_norm": 2.3599788407160456,
"learning_rate": 4.999329860823197e-06,
"loss": 0.0848,
"step": 82
},
{
"epoch": 0.03776160145586897,
"grad_norm": 1.851574920799011,
"learning_rate": 4.999313212799383e-06,
"loss": 0.0882,
"step": 83
},
{
"epoch": 0.03821656050955414,
"grad_norm": 2.144291660745484,
"learning_rate": 4.99929636054298e-06,
"loss": 0.0881,
"step": 84
},
{
"epoch": 0.03867151956323931,
"grad_norm": 2.083071837291781,
"learning_rate": 4.999279304055366e-06,
"loss": 0.1109,
"step": 85
},
{
"epoch": 0.039126478616924476,
"grad_norm": 2.245491182317419,
"learning_rate": 4.999262043337933e-06,
"loss": 0.0933,
"step": 86
},
{
"epoch": 0.03958143767060965,
"grad_norm": 2.076902724310137,
"learning_rate": 4.999244578392094e-06,
"loss": 0.1004,
"step": 87
},
{
"epoch": 0.040036396724294813,
"grad_norm": 2.213157445111281,
"learning_rate": 4.9992269092192736e-06,
"loss": 0.1048,
"step": 88
},
{
"epoch": 0.04049135577797998,
"grad_norm": 1.8088256581500983,
"learning_rate": 4.9992090358209166e-06,
"loss": 0.0803,
"step": 89
},
{
"epoch": 0.04094631483166515,
"grad_norm": 1.6952266837081935,
"learning_rate": 4.9991909581984835e-06,
"loss": 0.0707,
"step": 90
},
{
"epoch": 0.041401273885350316,
"grad_norm": 1.2806634047624867,
"learning_rate": 4.999172676353451e-06,
"loss": 0.0405,
"step": 91
},
{
"epoch": 0.04185623293903549,
"grad_norm": 1.537222164184117,
"learning_rate": 4.999154190287314e-06,
"loss": 0.0678,
"step": 92
},
{
"epoch": 0.042311191992720654,
"grad_norm": 2.152654560935853,
"learning_rate": 4.999135500001583e-06,
"loss": 0.1323,
"step": 93
},
{
"epoch": 0.042766151046405826,
"grad_norm": 1.7293087783358614,
"learning_rate": 4.9991166054977844e-06,
"loss": 0.0851,
"step": 94
},
{
"epoch": 0.04322111010009099,
"grad_norm": 2.689089264886033,
"learning_rate": 4.999097506777463e-06,
"loss": 0.1018,
"step": 95
},
{
"epoch": 0.04367606915377616,
"grad_norm": 1.8242860351920025,
"learning_rate": 4.999078203842179e-06,
"loss": 0.1063,
"step": 96
},
{
"epoch": 0.04413102820746133,
"grad_norm": 1.5249963877818449,
"learning_rate": 4.999058696693511e-06,
"loss": 0.0593,
"step": 97
},
{
"epoch": 0.044585987261146494,
"grad_norm": 1.668772591755926,
"learning_rate": 4.99903898533305e-06,
"loss": 0.0709,
"step": 98
},
{
"epoch": 0.045040946314831666,
"grad_norm": 1.8521288885149407,
"learning_rate": 4.99901906976241e-06,
"loss": 0.0842,
"step": 99
},
{
"epoch": 0.04549590536851683,
"grad_norm": 2.106435857041323,
"learning_rate": 4.998998949983217e-06,
"loss": 0.0921,
"step": 100
},
{
"epoch": 0.045950864422202004,
"grad_norm": 2.104450695294598,
"learning_rate": 4.998978625997115e-06,
"loss": 0.1082,
"step": 101
},
{
"epoch": 0.04640582347588717,
"grad_norm": 2.1381043167125466,
"learning_rate": 4.998958097805765e-06,
"loss": 0.0966,
"step": 102
},
{
"epoch": 0.04686078252957234,
"grad_norm": 1.6962878781771613,
"learning_rate": 4.9989373654108445e-06,
"loss": 0.0721,
"step": 103
},
{
"epoch": 0.047315741583257506,
"grad_norm": 26.768545049591438,
"learning_rate": 4.9989164288140465e-06,
"loss": 0.362,
"step": 104
},
{
"epoch": 0.04777070063694268,
"grad_norm": 2.63813062408578,
"learning_rate": 4.998895288017085e-06,
"loss": 0.1373,
"step": 105
},
{
"epoch": 0.048225659690627844,
"grad_norm": 1.828826426920959,
"learning_rate": 4.998873943021684e-06,
"loss": 0.0743,
"step": 106
},
{
"epoch": 0.04868061874431301,
"grad_norm": 1.524672393516503,
"learning_rate": 4.998852393829589e-06,
"loss": 0.0693,
"step": 107
},
{
"epoch": 0.04913557779799818,
"grad_norm": 3.0873114713096683,
"learning_rate": 4.9988306404425625e-06,
"loss": 0.1492,
"step": 108
},
{
"epoch": 0.049590536851683346,
"grad_norm": 1.7541988764209069,
"learning_rate": 4.99880868286238e-06,
"loss": 0.0941,
"step": 109
},
{
"epoch": 0.05004549590536852,
"grad_norm": 2.3475973125438103,
"learning_rate": 4.998786521090836e-06,
"loss": 0.0925,
"step": 110
},
{
"epoch": 0.050500454959053684,
"grad_norm": 2.1297159392440452,
"learning_rate": 4.9987641551297426e-06,
"loss": 0.1209,
"step": 111
},
{
"epoch": 0.050955414012738856,
"grad_norm": 1.8188477873711246,
"learning_rate": 4.998741584980926e-06,
"loss": 0.1191,
"step": 112
},
{
"epoch": 0.05141037306642402,
"grad_norm": 2.0744703068317474,
"learning_rate": 4.9987188106462314e-06,
"loss": 0.0958,
"step": 113
},
{
"epoch": 0.051865332120109194,
"grad_norm": 1.67585557445257,
"learning_rate": 4.99869583212752e-06,
"loss": 0.0759,
"step": 114
},
{
"epoch": 0.05232029117379436,
"grad_norm": 2.9423649270306456,
"learning_rate": 4.9986726494266694e-06,
"loss": 0.1628,
"step": 115
},
{
"epoch": 0.052775250227479524,
"grad_norm": 1.9805897541793653,
"learning_rate": 4.998649262545574e-06,
"loss": 0.0865,
"step": 116
},
{
"epoch": 0.053230209281164696,
"grad_norm": 1.862673950464683,
"learning_rate": 4.998625671486144e-06,
"loss": 0.0841,
"step": 117
},
{
"epoch": 0.05368516833484986,
"grad_norm": 1.6852737490573195,
"learning_rate": 4.998601876250308e-06,
"loss": 0.0801,
"step": 118
},
{
"epoch": 0.054140127388535034,
"grad_norm": 1.8645780399689873,
"learning_rate": 4.998577876840011e-06,
"loss": 0.0822,
"step": 119
},
{
"epoch": 0.0545950864422202,
"grad_norm": 1.7705796593126653,
"learning_rate": 4.9985536732572124e-06,
"loss": 0.0836,
"step": 120
},
{
"epoch": 0.05505004549590537,
"grad_norm": 1.4380115814084553,
"learning_rate": 4.998529265503891e-06,
"loss": 0.0714,
"step": 121
},
{
"epoch": 0.055505004549590536,
"grad_norm": 1.841019746353449,
"learning_rate": 4.9985046535820416e-06,
"loss": 0.0925,
"step": 122
},
{
"epoch": 0.05595996360327571,
"grad_norm": 2.13633472088372,
"learning_rate": 4.998479837493675e-06,
"loss": 0.1098,
"step": 123
},
{
"epoch": 0.056414922656960874,
"grad_norm": 1.6795956051728682,
"learning_rate": 4.9984548172408195e-06,
"loss": 0.0623,
"step": 124
},
{
"epoch": 0.05686988171064604,
"grad_norm": 7.146738489798405,
"learning_rate": 4.998429592825519e-06,
"loss": 0.1803,
"step": 125
},
{
"epoch": 0.05732484076433121,
"grad_norm": 2.17497011974541,
"learning_rate": 4.998404164249835e-06,
"loss": 0.1209,
"step": 126
},
{
"epoch": 0.05777979981801638,
"grad_norm": 1.9663385354035616,
"learning_rate": 4.998378531515845e-06,
"loss": 0.0704,
"step": 127
},
{
"epoch": 0.05823475887170155,
"grad_norm": 2.398444068788508,
"learning_rate": 4.998352694625645e-06,
"loss": 0.0819,
"step": 128
},
{
"epoch": 0.058689717925386714,
"grad_norm": 1.5854929257305652,
"learning_rate": 4.998326653581343e-06,
"loss": 0.0775,
"step": 129
},
{
"epoch": 0.059144676979071886,
"grad_norm": 1.8831317521751245,
"learning_rate": 4.998300408385072e-06,
"loss": 0.0895,
"step": 130
},
{
"epoch": 0.05959963603275705,
"grad_norm": 2.624836374744882,
"learning_rate": 4.998273959038972e-06,
"loss": 0.1398,
"step": 131
},
{
"epoch": 0.06005459508644222,
"grad_norm": 1.8281764860819427,
"learning_rate": 4.998247305545207e-06,
"loss": 0.0979,
"step": 132
},
{
"epoch": 0.06050955414012739,
"grad_norm": 1.4175605750366638,
"learning_rate": 4.998220447905953e-06,
"loss": 0.0674,
"step": 133
},
{
"epoch": 0.060964513193812554,
"grad_norm": 2.0007328792439307,
"learning_rate": 4.998193386123408e-06,
"loss": 0.1082,
"step": 134
},
{
"epoch": 0.061419472247497726,
"grad_norm": 2.2534593276871355,
"learning_rate": 4.99816612019978e-06,
"loss": 0.1165,
"step": 135
},
{
"epoch": 0.06187443130118289,
"grad_norm": 7.223128092677242,
"learning_rate": 4.998138650137298e-06,
"loss": 0.1547,
"step": 136
},
{
"epoch": 0.062329390354868064,
"grad_norm": 2.0541187438324178,
"learning_rate": 4.998110975938208e-06,
"loss": 0.1153,
"step": 137
},
{
"epoch": 0.06278434940855324,
"grad_norm": 2.900003934434033,
"learning_rate": 4.998083097604769e-06,
"loss": 0.1227,
"step": 138
},
{
"epoch": 0.0632393084622384,
"grad_norm": 2.9930382656276655,
"learning_rate": 4.998055015139261e-06,
"loss": 0.0671,
"step": 139
},
{
"epoch": 0.06369426751592357,
"grad_norm": 1.8183166737473904,
"learning_rate": 4.998026728543979e-06,
"loss": 0.0879,
"step": 140
},
{
"epoch": 0.06414922656960874,
"grad_norm": 1.750231162848612,
"learning_rate": 4.997998237821233e-06,
"loss": 0.0973,
"step": 141
},
{
"epoch": 0.0646041856232939,
"grad_norm": 1.531092755332603,
"learning_rate": 4.997969542973352e-06,
"loss": 0.0755,
"step": 142
},
{
"epoch": 0.06505914467697907,
"grad_norm": 2.106588666489457,
"learning_rate": 4.997940644002681e-06,
"loss": 0.1014,
"step": 143
},
{
"epoch": 0.06551410373066424,
"grad_norm": 2.4260145417995513,
"learning_rate": 4.997911540911581e-06,
"loss": 0.0992,
"step": 144
},
{
"epoch": 0.06596906278434941,
"grad_norm": 1.9957158387709846,
"learning_rate": 4.99788223370243e-06,
"loss": 0.1074,
"step": 145
},
{
"epoch": 0.06642402183803457,
"grad_norm": 2.7359115449729385,
"learning_rate": 4.9978527223776245e-06,
"loss": 0.1298,
"step": 146
},
{
"epoch": 0.06687898089171974,
"grad_norm": 1.4774963397056595,
"learning_rate": 4.9978230069395735e-06,
"loss": 0.0725,
"step": 147
},
{
"epoch": 0.06733393994540492,
"grad_norm": 2.4431671333335188,
"learning_rate": 4.9977930873907065e-06,
"loss": 0.0983,
"step": 148
},
{
"epoch": 0.06778889899909009,
"grad_norm": 1.9906443670591782,
"learning_rate": 4.997762963733468e-06,
"loss": 0.1039,
"step": 149
},
{
"epoch": 0.06824385805277525,
"grad_norm": 2.0201798980001517,
"learning_rate": 4.997732635970321e-06,
"loss": 0.085,
"step": 150
},
{
"epoch": 0.06869881710646042,
"grad_norm": 1.7461931203369137,
"learning_rate": 4.9977021041037425e-06,
"loss": 0.0884,
"step": 151
},
{
"epoch": 0.06915377616014559,
"grad_norm": 2.339191302020108,
"learning_rate": 4.9976713681362265e-06,
"loss": 0.1159,
"step": 152
},
{
"epoch": 0.06960873521383075,
"grad_norm": 2.314166753359135,
"learning_rate": 4.997640428070286e-06,
"loss": 0.1338,
"step": 153
},
{
"epoch": 0.07006369426751592,
"grad_norm": 1.5963391451568967,
"learning_rate": 4.99760928390845e-06,
"loss": 0.0575,
"step": 154
},
{
"epoch": 0.0705186533212011,
"grad_norm": 1.7788915412646347,
"learning_rate": 4.997577935653262e-06,
"loss": 0.08,
"step": 155
},
{
"epoch": 0.07097361237488627,
"grad_norm": 1.5840889143049688,
"learning_rate": 4.9975463833072835e-06,
"loss": 0.0709,
"step": 156
},
{
"epoch": 0.07142857142857142,
"grad_norm": 2.1242834812157962,
"learning_rate": 4.997514626873093e-06,
"loss": 0.1078,
"step": 157
},
{
"epoch": 0.0718835304822566,
"grad_norm": 1.7256733994251798,
"learning_rate": 4.997482666353287e-06,
"loss": 0.0678,
"step": 158
},
{
"epoch": 0.07233848953594177,
"grad_norm": 2.2088750555704073,
"learning_rate": 4.997450501750476e-06,
"loss": 0.0981,
"step": 159
},
{
"epoch": 0.07279344858962693,
"grad_norm": 1.817598507902073,
"learning_rate": 4.997418133067288e-06,
"loss": 0.0829,
"step": 160
},
{
"epoch": 0.0732484076433121,
"grad_norm": 1.9174894618752205,
"learning_rate": 4.997385560306368e-06,
"loss": 0.0922,
"step": 161
},
{
"epoch": 0.07370336669699727,
"grad_norm": 1.7975593397664607,
"learning_rate": 4.997352783470379e-06,
"loss": 0.093,
"step": 162
},
{
"epoch": 0.07415832575068244,
"grad_norm": 2.1789877377155147,
"learning_rate": 4.997319802561997e-06,
"loss": 0.1044,
"step": 163
},
{
"epoch": 0.0746132848043676,
"grad_norm": 1.5046722090412417,
"learning_rate": 4.9972866175839196e-06,
"loss": 0.0806,
"step": 164
},
{
"epoch": 0.07506824385805277,
"grad_norm": 1.828261506678391,
"learning_rate": 4.9972532285388575e-06,
"loss": 0.1018,
"step": 165
},
{
"epoch": 0.07552320291173795,
"grad_norm": 1.853289616987827,
"learning_rate": 4.997219635429538e-06,
"loss": 0.1177,
"step": 166
},
{
"epoch": 0.07597816196542312,
"grad_norm": 1.9172069323651033,
"learning_rate": 4.997185838258709e-06,
"loss": 0.0817,
"step": 167
},
{
"epoch": 0.07643312101910828,
"grad_norm": 1.6956924002006215,
"learning_rate": 4.997151837029129e-06,
"loss": 0.0679,
"step": 168
},
{
"epoch": 0.07688808007279345,
"grad_norm": 1.8575330553269362,
"learning_rate": 4.997117631743579e-06,
"loss": 0.0855,
"step": 169
},
{
"epoch": 0.07734303912647862,
"grad_norm": 1.7266908578071283,
"learning_rate": 4.997083222404852e-06,
"loss": 0.0625,
"step": 170
},
{
"epoch": 0.07779799818016378,
"grad_norm": 1.6397125044179104,
"learning_rate": 4.997048609015762e-06,
"loss": 0.0751,
"step": 171
},
{
"epoch": 0.07825295723384895,
"grad_norm": 1.5340896344557344,
"learning_rate": 4.997013791579136e-06,
"loss": 0.0786,
"step": 172
},
{
"epoch": 0.07870791628753412,
"grad_norm": 1.9189331650587453,
"learning_rate": 4.996978770097819e-06,
"loss": 0.0953,
"step": 173
},
{
"epoch": 0.0791628753412193,
"grad_norm": 1.7773721601434869,
"learning_rate": 4.996943544574673e-06,
"loss": 0.083,
"step": 174
},
{
"epoch": 0.07961783439490445,
"grad_norm": 1.7663708027835396,
"learning_rate": 4.996908115012576e-06,
"loss": 0.0711,
"step": 175
},
{
"epoch": 0.08007279344858963,
"grad_norm": 2.0988130747441462,
"learning_rate": 4.996872481414425e-06,
"loss": 0.1068,
"step": 176
},
{
"epoch": 0.0805277525022748,
"grad_norm": 3.491649419917669,
"learning_rate": 4.9968366437831305e-06,
"loss": 0.1596,
"step": 177
},
{
"epoch": 0.08098271155595996,
"grad_norm": 0.9772529604089312,
"learning_rate": 4.99680060212162e-06,
"loss": 0.0469,
"step": 178
},
{
"epoch": 0.08143767060964513,
"grad_norm": 1.411497576217555,
"learning_rate": 4.996764356432841e-06,
"loss": 0.0799,
"step": 179
},
{
"epoch": 0.0818926296633303,
"grad_norm": 1.9634897057091474,
"learning_rate": 4.996727906719754e-06,
"loss": 0.0818,
"step": 180
},
{
"epoch": 0.08234758871701547,
"grad_norm": 1.8622777856402457,
"learning_rate": 4.9966912529853365e-06,
"loss": 0.0654,
"step": 181
},
{
"epoch": 0.08280254777070063,
"grad_norm": 1.6338074095796988,
"learning_rate": 4.996654395232585e-06,
"loss": 0.0744,
"step": 182
},
{
"epoch": 0.0832575068243858,
"grad_norm": 1.534919993971643,
"learning_rate": 4.996617333464512e-06,
"loss": 0.0639,
"step": 183
},
{
"epoch": 0.08371246587807098,
"grad_norm": 1.7391379315757225,
"learning_rate": 4.996580067684145e-06,
"loss": 0.0715,
"step": 184
},
{
"epoch": 0.08416742493175614,
"grad_norm": 1.7215093643580193,
"learning_rate": 4.996542597894528e-06,
"loss": 0.1192,
"step": 185
},
{
"epoch": 0.08462238398544131,
"grad_norm": 2.041088124472192,
"learning_rate": 4.996504924098726e-06,
"loss": 0.1078,
"step": 186
},
{
"epoch": 0.08507734303912648,
"grad_norm": 1.7083926900772908,
"learning_rate": 4.9964670462998145e-06,
"loss": 0.0922,
"step": 187
},
{
"epoch": 0.08553230209281165,
"grad_norm": 1.9950587953196364,
"learning_rate": 4.99642896450089e-06,
"loss": 0.125,
"step": 188
},
{
"epoch": 0.08598726114649681,
"grad_norm": 2.2702904646099022,
"learning_rate": 4.9963906787050656e-06,
"loss": 0.1318,
"step": 189
},
{
"epoch": 0.08644222020018198,
"grad_norm": 1.5062676480402928,
"learning_rate": 4.996352188915467e-06,
"loss": 0.0621,
"step": 190
},
{
"epoch": 0.08689717925386715,
"grad_norm": 2.6764229211241153,
"learning_rate": 4.996313495135242e-06,
"loss": 0.1112,
"step": 191
},
{
"epoch": 0.08735213830755233,
"grad_norm": 2.276483991348045,
"learning_rate": 4.9962745973675505e-06,
"loss": 0.1219,
"step": 192
},
{
"epoch": 0.08780709736123748,
"grad_norm": 1.4375762261827663,
"learning_rate": 4.996235495615572e-06,
"loss": 0.0641,
"step": 193
},
{
"epoch": 0.08826205641492266,
"grad_norm": 2.3164336329931094,
"learning_rate": 4.996196189882503e-06,
"loss": 0.1176,
"step": 194
},
{
"epoch": 0.08871701546860783,
"grad_norm": 2.225732764096407,
"learning_rate": 4.996156680171552e-06,
"loss": 0.1096,
"step": 195
},
{
"epoch": 0.08917197452229299,
"grad_norm": 1.8464739663611849,
"learning_rate": 4.996116966485951e-06,
"loss": 0.0817,
"step": 196
},
{
"epoch": 0.08962693357597816,
"grad_norm": 1.9290667932284378,
"learning_rate": 4.996077048828944e-06,
"loss": 0.1106,
"step": 197
},
{
"epoch": 0.09008189262966333,
"grad_norm": 1.6322378586848272,
"learning_rate": 4.996036927203793e-06,
"loss": 0.0972,
"step": 198
},
{
"epoch": 0.0905368516833485,
"grad_norm": 2.2100804969645416,
"learning_rate": 4.995996601613775e-06,
"loss": 0.0944,
"step": 199
},
{
"epoch": 0.09099181073703366,
"grad_norm": 1.5641835045850314,
"learning_rate": 4.9959560720621875e-06,
"loss": 0.0896,
"step": 200
},
{
"epoch": 0.09144676979071883,
"grad_norm": 2.2116837789953117,
"learning_rate": 4.995915338552341e-06,
"loss": 0.1331,
"step": 201
},
{
"epoch": 0.09190172884440401,
"grad_norm": 1.8792253280188753,
"learning_rate": 4.995874401087565e-06,
"loss": 0.0967,
"step": 202
},
{
"epoch": 0.09235668789808917,
"grad_norm": 2.167978668790899,
"learning_rate": 4.9958332596712035e-06,
"loss": 0.1141,
"step": 203
},
{
"epoch": 0.09281164695177434,
"grad_norm": 1.8621318139110883,
"learning_rate": 4.99579191430662e-06,
"loss": 0.0972,
"step": 204
},
{
"epoch": 0.09326660600545951,
"grad_norm": 1.8429430162012657,
"learning_rate": 4.995750364997192e-06,
"loss": 0.0967,
"step": 205
},
{
"epoch": 0.09372156505914468,
"grad_norm": 1.5424629326591568,
"learning_rate": 4.995708611746314e-06,
"loss": 0.0814,
"step": 206
},
{
"epoch": 0.09417652411282984,
"grad_norm": 2.0700985381007904,
"learning_rate": 4.995666654557399e-06,
"loss": 0.1038,
"step": 207
},
{
"epoch": 0.09463148316651501,
"grad_norm": 1.8765344045928045,
"learning_rate": 4.995624493433876e-06,
"loss": 0.1075,
"step": 208
},
{
"epoch": 0.09508644222020018,
"grad_norm": 1.8732891178471252,
"learning_rate": 4.995582128379189e-06,
"loss": 0.1001,
"step": 209
},
{
"epoch": 0.09554140127388536,
"grad_norm": 2.1418545940903373,
"learning_rate": 4.9955395593968e-06,
"loss": 0.1463,
"step": 210
},
{
"epoch": 0.09599636032757052,
"grad_norm": 1.905821465202796,
"learning_rate": 4.99549678649019e-06,
"loss": 0.0848,
"step": 211
},
{
"epoch": 0.09645131938125569,
"grad_norm": 1.7581366634538098,
"learning_rate": 4.99545380966285e-06,
"loss": 0.0976,
"step": 212
},
{
"epoch": 0.09690627843494086,
"grad_norm": 2.133882292644339,
"learning_rate": 4.995410628918294e-06,
"loss": 0.1036,
"step": 213
},
{
"epoch": 0.09736123748862602,
"grad_norm": 1.6491455235555508,
"learning_rate": 4.995367244260052e-06,
"loss": 0.1,
"step": 214
},
{
"epoch": 0.09781619654231119,
"grad_norm": 1.372315749578445,
"learning_rate": 4.995323655691667e-06,
"loss": 0.0543,
"step": 215
},
{
"epoch": 0.09827115559599636,
"grad_norm": 2.2929084487384297,
"learning_rate": 4.995279863216702e-06,
"loss": 0.1005,
"step": 216
},
{
"epoch": 0.09872611464968153,
"grad_norm": 1.8371182479654964,
"learning_rate": 4.995235866838735e-06,
"loss": 0.096,
"step": 217
},
{
"epoch": 0.09918107370336669,
"grad_norm": 1.4189314035725125,
"learning_rate": 4.995191666561361e-06,
"loss": 0.0707,
"step": 218
},
{
"epoch": 0.09963603275705187,
"grad_norm": 1.4036483642687965,
"learning_rate": 4.995147262388192e-06,
"loss": 0.0689,
"step": 219
},
{
"epoch": 0.10009099181073704,
"grad_norm": 1.7382878807357938,
"learning_rate": 4.995102654322858e-06,
"loss": 0.0829,
"step": 220
},
{
"epoch": 0.1005459508644222,
"grad_norm": 1.3102015447280675,
"learning_rate": 4.995057842369002e-06,
"loss": 0.0548,
"step": 221
},
{
"epoch": 0.10100090991810737,
"grad_norm": 1.8490525072637034,
"learning_rate": 4.995012826530287e-06,
"loss": 0.1044,
"step": 222
},
{
"epoch": 0.10145586897179254,
"grad_norm": 2.802543488000276,
"learning_rate": 4.99496760681039e-06,
"loss": 0.1393,
"step": 223
},
{
"epoch": 0.10191082802547771,
"grad_norm": 2.4234245545914295,
"learning_rate": 4.994922183213009e-06,
"loss": 0.1325,
"step": 224
},
{
"epoch": 0.10236578707916287,
"grad_norm": 1.1495372549504432,
"learning_rate": 4.9948765557418535e-06,
"loss": 0.0585,
"step": 225
},
{
"epoch": 0.10282074613284804,
"grad_norm": 2.1666263724534267,
"learning_rate": 4.994830724400653e-06,
"loss": 0.1063,
"step": 226
},
{
"epoch": 0.10327570518653321,
"grad_norm": 1.7066677970234532,
"learning_rate": 4.994784689193151e-06,
"loss": 0.1002,
"step": 227
},
{
"epoch": 0.10373066424021839,
"grad_norm": 1.5304723941528642,
"learning_rate": 4.994738450123111e-06,
"loss": 0.0825,
"step": 228
},
{
"epoch": 0.10418562329390355,
"grad_norm": 2.1125485884299486,
"learning_rate": 4.994692007194312e-06,
"loss": 0.1089,
"step": 229
},
{
"epoch": 0.10464058234758872,
"grad_norm": 1.4297773182355138,
"learning_rate": 4.994645360410547e-06,
"loss": 0.0855,
"step": 230
},
{
"epoch": 0.10509554140127389,
"grad_norm": 1.741498602747005,
"learning_rate": 4.99459850977563e-06,
"loss": 0.0884,
"step": 231
},
{
"epoch": 0.10555050045495905,
"grad_norm": 1.6875366585424447,
"learning_rate": 4.994551455293388e-06,
"loss": 0.068,
"step": 232
},
{
"epoch": 0.10600545950864422,
"grad_norm": 2.03347527932056,
"learning_rate": 4.9945041969676654e-06,
"loss": 0.0997,
"step": 233
},
{
"epoch": 0.10646041856232939,
"grad_norm": 1.5553350034126536,
"learning_rate": 4.994456734802325e-06,
"loss": 0.0709,
"step": 234
},
{
"epoch": 0.10691537761601456,
"grad_norm": 1.354348073951093,
"learning_rate": 4.994409068801247e-06,
"loss": 0.0858,
"step": 235
},
{
"epoch": 0.10737033666969972,
"grad_norm": 1.6048007960766557,
"learning_rate": 4.994361198968323e-06,
"loss": 0.0891,
"step": 236
},
{
"epoch": 0.1078252957233849,
"grad_norm": 2.3380973830643663,
"learning_rate": 4.994313125307466e-06,
"loss": 0.116,
"step": 237
},
{
"epoch": 0.10828025477707007,
"grad_norm": 1.68606521406513,
"learning_rate": 4.994264847822605e-06,
"loss": 0.09,
"step": 238
},
{
"epoch": 0.10873521383075523,
"grad_norm": 2.0274881934833715,
"learning_rate": 4.994216366517684e-06,
"loss": 0.0856,
"step": 239
},
{
"epoch": 0.1091901728844404,
"grad_norm": 1.9224041067300894,
"learning_rate": 4.994167681396667e-06,
"loss": 0.1032,
"step": 240
},
{
"epoch": 0.10964513193812557,
"grad_norm": 2.213562554498921,
"learning_rate": 4.994118792463529e-06,
"loss": 0.1125,
"step": 241
},
{
"epoch": 0.11010009099181074,
"grad_norm": 2.396477374166045,
"learning_rate": 4.994069699722267e-06,
"loss": 0.16,
"step": 242
},
{
"epoch": 0.1105550500454959,
"grad_norm": 1.6621616457271884,
"learning_rate": 4.994020403176893e-06,
"loss": 0.1088,
"step": 243
},
{
"epoch": 0.11101000909918107,
"grad_norm": 2.0137991000965862,
"learning_rate": 4.9939709028314345e-06,
"loss": 0.1203,
"step": 244
},
{
"epoch": 0.11146496815286625,
"grad_norm": 1.731498246221376,
"learning_rate": 4.993921198689935e-06,
"loss": 0.0779,
"step": 245
},
{
"epoch": 0.11191992720655142,
"grad_norm": 1.53319841517271,
"learning_rate": 4.993871290756459e-06,
"loss": 0.0859,
"step": 246
},
{
"epoch": 0.11237488626023658,
"grad_norm": 1.5738861001818754,
"learning_rate": 4.9938211790350835e-06,
"loss": 0.0822,
"step": 247
},
{
"epoch": 0.11282984531392175,
"grad_norm": 1.795556137822037,
"learning_rate": 4.993770863529902e-06,
"loss": 0.1082,
"step": 248
},
{
"epoch": 0.11328480436760692,
"grad_norm": 1.753136266606954,
"learning_rate": 4.993720344245029e-06,
"loss": 0.0826,
"step": 249
},
{
"epoch": 0.11373976342129208,
"grad_norm": 1.724266476242851,
"learning_rate": 4.99366962118459e-06,
"loss": 0.0851,
"step": 250
},
{
"epoch": 0.11419472247497725,
"grad_norm": 1.8081901179247517,
"learning_rate": 4.99361869435273e-06,
"loss": 0.0965,
"step": 251
},
{
"epoch": 0.11464968152866242,
"grad_norm": 2.064401083784083,
"learning_rate": 4.993567563753613e-06,
"loss": 0.0881,
"step": 252
},
{
"epoch": 0.1151046405823476,
"grad_norm": 1.6354098857617054,
"learning_rate": 4.993516229391414e-06,
"loss": 0.0933,
"step": 253
},
{
"epoch": 0.11555959963603275,
"grad_norm": 1.2711881947711132,
"learning_rate": 4.993464691270331e-06,
"loss": 0.0595,
"step": 254
},
{
"epoch": 0.11601455868971793,
"grad_norm": 1.5847340722430843,
"learning_rate": 4.993412949394572e-06,
"loss": 0.0812,
"step": 255
},
{
"epoch": 0.1164695177434031,
"grad_norm": 1.5774467606957123,
"learning_rate": 4.993361003768369e-06,
"loss": 0.081,
"step": 256
},
{
"epoch": 0.11692447679708826,
"grad_norm": 1.3573852133613107,
"learning_rate": 4.993308854395963e-06,
"loss": 0.0812,
"step": 257
},
{
"epoch": 0.11737943585077343,
"grad_norm": 1.5273272920136396,
"learning_rate": 4.993256501281618e-06,
"loss": 0.0634,
"step": 258
},
{
"epoch": 0.1178343949044586,
"grad_norm": 1.8382646613112785,
"learning_rate": 4.993203944429611e-06,
"loss": 0.1145,
"step": 259
},
{
"epoch": 0.11828935395814377,
"grad_norm": 1.5747608705636602,
"learning_rate": 4.993151183844236e-06,
"loss": 0.0801,
"step": 260
},
{
"epoch": 0.11874431301182893,
"grad_norm": 1.7065433305132354,
"learning_rate": 4.9930982195298065e-06,
"loss": 0.0742,
"step": 261
},
{
"epoch": 0.1191992720655141,
"grad_norm": 1.709109441111134,
"learning_rate": 4.9930450514906484e-06,
"loss": 0.1028,
"step": 262
},
{
"epoch": 0.11965423111919928,
"grad_norm": 1.6959707782927067,
"learning_rate": 4.9929916797311075e-06,
"loss": 0.0791,
"step": 263
},
{
"epoch": 0.12010919017288443,
"grad_norm": 2.374639715905283,
"learning_rate": 4.992938104255545e-06,
"loss": 0.1477,
"step": 264
},
{
"epoch": 0.1205641492265696,
"grad_norm": 1.6263809057131815,
"learning_rate": 4.992884325068339e-06,
"loss": 0.0916,
"step": 265
},
{
"epoch": 0.12101910828025478,
"grad_norm": 1.6207164559915699,
"learning_rate": 4.992830342173882e-06,
"loss": 0.1068,
"step": 266
},
{
"epoch": 0.12147406733393995,
"grad_norm": 2.0552449766971823,
"learning_rate": 4.992776155576589e-06,
"loss": 0.1145,
"step": 267
},
{
"epoch": 0.12192902638762511,
"grad_norm": 1.6692049909432523,
"learning_rate": 4.992721765280884e-06,
"loss": 0.1172,
"step": 268
},
{
"epoch": 0.12238398544131028,
"grad_norm": 2.456621954888186,
"learning_rate": 4.992667171291215e-06,
"loss": 0.1267,
"step": 269
},
{
"epoch": 0.12283894449499545,
"grad_norm": 1.5125250812884448,
"learning_rate": 4.992612373612042e-06,
"loss": 0.0661,
"step": 270
},
{
"epoch": 0.12329390354868063,
"grad_norm": 2.0952324870431553,
"learning_rate": 4.99255737224784e-06,
"loss": 0.0917,
"step": 271
},
{
"epoch": 0.12374886260236578,
"grad_norm": 1.4094336450761362,
"learning_rate": 4.9925021672031075e-06,
"loss": 0.0905,
"step": 272
},
{
"epoch": 0.12420382165605096,
"grad_norm": 2.239902062561175,
"learning_rate": 4.992446758482353e-06,
"loss": 0.0995,
"step": 273
},
{
"epoch": 0.12465878070973613,
"grad_norm": 2.696125395972354,
"learning_rate": 4.992391146090106e-06,
"loss": 0.1613,
"step": 274
},
{
"epoch": 0.1251137397634213,
"grad_norm": 1.4853155964847005,
"learning_rate": 4.99233533003091e-06,
"loss": 0.0826,
"step": 275
},
{
"epoch": 0.12556869881710647,
"grad_norm": 1.5393545957542452,
"learning_rate": 4.992279310309326e-06,
"loss": 0.1128,
"step": 276
},
{
"epoch": 0.12602365787079162,
"grad_norm": 2.4236941073693283,
"learning_rate": 4.9922230869299316e-06,
"loss": 0.1607,
"step": 277
},
{
"epoch": 0.1264786169244768,
"grad_norm": 1.6611888199243576,
"learning_rate": 4.992166659897321e-06,
"loss": 0.1005,
"step": 278
},
{
"epoch": 0.12693357597816196,
"grad_norm": 1.3896864345667146,
"learning_rate": 4.992110029216106e-06,
"loss": 0.079,
"step": 279
},
{
"epoch": 0.12738853503184713,
"grad_norm": 1.3647278081745937,
"learning_rate": 4.992053194890914e-06,
"loss": 0.0767,
"step": 280
},
{
"epoch": 0.1278434940855323,
"grad_norm": 2.0323876810575525,
"learning_rate": 4.991996156926388e-06,
"loss": 0.101,
"step": 281
},
{
"epoch": 0.12829845313921748,
"grad_norm": 1.948481701516796,
"learning_rate": 4.9919389153271904e-06,
"loss": 0.106,
"step": 282
},
{
"epoch": 0.12875341219290265,
"grad_norm": 1.3512588403363923,
"learning_rate": 4.991881470097998e-06,
"loss": 0.0897,
"step": 283
},
{
"epoch": 0.1292083712465878,
"grad_norm": 1.4862053800013564,
"learning_rate": 4.991823821243505e-06,
"loss": 0.0898,
"step": 284
},
{
"epoch": 0.12966333030027297,
"grad_norm": 2.287612016528911,
"learning_rate": 4.991765968768422e-06,
"loss": 0.1048,
"step": 285
},
{
"epoch": 0.13011828935395814,
"grad_norm": 1.8190624177647585,
"learning_rate": 4.991707912677477e-06,
"loss": 0.076,
"step": 286
},
{
"epoch": 0.1305732484076433,
"grad_norm": 1.4178411985180965,
"learning_rate": 4.991649652975414e-06,
"loss": 0.062,
"step": 287
},
{
"epoch": 0.13102820746132848,
"grad_norm": 1.7010811854624341,
"learning_rate": 4.991591189666994e-06,
"loss": 0.0928,
"step": 288
},
{
"epoch": 0.13148316651501366,
"grad_norm": 1.7824920481002249,
"learning_rate": 4.991532522756993e-06,
"loss": 0.09,
"step": 289
},
{
"epoch": 0.13193812556869883,
"grad_norm": 1.12093519239752,
"learning_rate": 4.991473652250207e-06,
"loss": 0.0564,
"step": 290
},
{
"epoch": 0.13239308462238397,
"grad_norm": 1.4956629959050902,
"learning_rate": 4.991414578151445e-06,
"loss": 0.0777,
"step": 291
},
{
"epoch": 0.13284804367606914,
"grad_norm": 3.467748085139679,
"learning_rate": 4.991355300465535e-06,
"loss": 0.193,
"step": 292
},
{
"epoch": 0.13330300272975432,
"grad_norm": 1.746518786410603,
"learning_rate": 4.99129581919732e-06,
"loss": 0.0862,
"step": 293
},
{
"epoch": 0.1337579617834395,
"grad_norm": 1.3513400373127227,
"learning_rate": 4.9912361343516616e-06,
"loss": 0.0588,
"step": 294
},
{
"epoch": 0.13421292083712466,
"grad_norm": 1.7841617467512154,
"learning_rate": 4.991176245933437e-06,
"loss": 0.0982,
"step": 295
},
{
"epoch": 0.13466787989080983,
"grad_norm": 1.6650575824861316,
"learning_rate": 4.9911161539475385e-06,
"loss": 0.0868,
"step": 296
},
{
"epoch": 0.135122838944495,
"grad_norm": 2.0850606622795667,
"learning_rate": 4.991055858398879e-06,
"loss": 0.1087,
"step": 297
},
{
"epoch": 0.13557779799818018,
"grad_norm": 2.27094495258401,
"learning_rate": 4.990995359292384e-06,
"loss": 0.1177,
"step": 298
},
{
"epoch": 0.13603275705186532,
"grad_norm": 1.8175215978998918,
"learning_rate": 4.990934656632997e-06,
"loss": 0.1029,
"step": 299
},
{
"epoch": 0.1364877161055505,
"grad_norm": 1.9580713421337124,
"learning_rate": 4.990873750425679e-06,
"loss": 0.0842,
"step": 300
},
{
"epoch": 0.13694267515923567,
"grad_norm": 1.5378181370134305,
"learning_rate": 4.990812640675406e-06,
"loss": 0.0813,
"step": 301
},
{
"epoch": 0.13739763421292084,
"grad_norm": 1.4646500614646956,
"learning_rate": 4.990751327387174e-06,
"loss": 0.0642,
"step": 302
},
{
"epoch": 0.137852593266606,
"grad_norm": 1.7132953215338962,
"learning_rate": 4.99068981056599e-06,
"loss": 0.0921,
"step": 303
},
{
"epoch": 0.13830755232029118,
"grad_norm": 2.020828034549401,
"learning_rate": 4.990628090216885e-06,
"loss": 0.1164,
"step": 304
},
{
"epoch": 0.13876251137397635,
"grad_norm": 1.4167009033800524,
"learning_rate": 4.990566166344898e-06,
"loss": 0.0695,
"step": 305
},
{
"epoch": 0.1392174704276615,
"grad_norm": 1.743676237886539,
"learning_rate": 4.990504038955092e-06,
"loss": 0.1083,
"step": 306
},
{
"epoch": 0.13967242948134667,
"grad_norm": 1.8343720931834766,
"learning_rate": 4.990441708052542e-06,
"loss": 0.0985,
"step": 307
},
{
"epoch": 0.14012738853503184,
"grad_norm": 1.4113998497835858,
"learning_rate": 4.9903791736423435e-06,
"loss": 0.081,
"step": 308
},
{
"epoch": 0.14058234758871702,
"grad_norm": 1.8830612535708886,
"learning_rate": 4.9903164357296044e-06,
"loss": 0.0954,
"step": 309
},
{
"epoch": 0.1410373066424022,
"grad_norm": 1.4208829323408247,
"learning_rate": 4.990253494319453e-06,
"loss": 0.0919,
"step": 310
},
{
"epoch": 0.14149226569608736,
"grad_norm": 1.3671067756437636,
"learning_rate": 4.990190349417032e-06,
"loss": 0.0928,
"step": 311
},
{
"epoch": 0.14194722474977253,
"grad_norm": 1.965673083316737,
"learning_rate": 4.990127001027501e-06,
"loss": 0.0849,
"step": 312
},
{
"epoch": 0.14240218380345768,
"grad_norm": 1.3933093475773835,
"learning_rate": 4.990063449156037e-06,
"loss": 0.0735,
"step": 313
},
{
"epoch": 0.14285714285714285,
"grad_norm": 1.8960360183192995,
"learning_rate": 4.989999693807832e-06,
"loss": 0.1141,
"step": 314
},
{
"epoch": 0.14331210191082802,
"grad_norm": 1.8316795975938271,
"learning_rate": 4.989935734988098e-06,
"loss": 0.1084,
"step": 315
},
{
"epoch": 0.1437670609645132,
"grad_norm": 1.6451238367574679,
"learning_rate": 4.98987157270206e-06,
"loss": 0.0739,
"step": 316
},
{
"epoch": 0.14422202001819837,
"grad_norm": 2.0644883617404854,
"learning_rate": 4.989807206954961e-06,
"loss": 0.1125,
"step": 317
},
{
"epoch": 0.14467697907188354,
"grad_norm": 1.322196438354388,
"learning_rate": 4.9897426377520605e-06,
"loss": 0.0792,
"step": 318
},
{
"epoch": 0.1451319381255687,
"grad_norm": 2.568915637493138,
"learning_rate": 4.989677865098636e-06,
"loss": 0.1236,
"step": 319
},
{
"epoch": 0.14558689717925385,
"grad_norm": 1.1659492648591403,
"learning_rate": 4.989612888999978e-06,
"loss": 0.0624,
"step": 320
},
{
"epoch": 0.14604185623293903,
"grad_norm": 1.431829324891758,
"learning_rate": 4.9895477094614e-06,
"loss": 0.0855,
"step": 321
},
{
"epoch": 0.1464968152866242,
"grad_norm": 1.1704367288212936,
"learning_rate": 4.989482326488225e-06,
"loss": 0.0741,
"step": 322
},
{
"epoch": 0.14695177434030937,
"grad_norm": 1.6170438514885752,
"learning_rate": 4.989416740085796e-06,
"loss": 0.1057,
"step": 323
},
{
"epoch": 0.14740673339399454,
"grad_norm": 1.639627544263893,
"learning_rate": 4.9893509502594735e-06,
"loss": 0.0784,
"step": 324
},
{
"epoch": 0.14786169244767972,
"grad_norm": 1.6437318926278874,
"learning_rate": 4.9892849570146335e-06,
"loss": 0.1105,
"step": 325
},
{
"epoch": 0.1483166515013649,
"grad_norm": 1.6588510281862943,
"learning_rate": 4.989218760356668e-06,
"loss": 0.106,
"step": 326
},
{
"epoch": 0.14877161055505003,
"grad_norm": 1.692767253326721,
"learning_rate": 4.989152360290987e-06,
"loss": 0.1068,
"step": 327
},
{
"epoch": 0.1492265696087352,
"grad_norm": 2.117777475502305,
"learning_rate": 4.989085756823015e-06,
"loss": 0.1274,
"step": 328
},
{
"epoch": 0.14968152866242038,
"grad_norm": 1.6877038030416243,
"learning_rate": 4.989018949958197e-06,
"loss": 0.1001,
"step": 329
},
{
"epoch": 0.15013648771610555,
"grad_norm": 2.018139319167573,
"learning_rate": 4.98895193970199e-06,
"loss": 0.0726,
"step": 330
},
{
"epoch": 0.15059144676979072,
"grad_norm": 1.7601822979826238,
"learning_rate": 4.9888847260598705e-06,
"loss": 0.0884,
"step": 331
},
{
"epoch": 0.1510464058234759,
"grad_norm": 2.153451550499006,
"learning_rate": 4.98881730903733e-06,
"loss": 0.1263,
"step": 332
},
{
"epoch": 0.15150136487716107,
"grad_norm": 1.7037846763057773,
"learning_rate": 4.98874968863988e-06,
"loss": 0.1017,
"step": 333
},
{
"epoch": 0.15195632393084624,
"grad_norm": 1.6373036503866722,
"learning_rate": 4.988681864873044e-06,
"loss": 0.0936,
"step": 334
},
{
"epoch": 0.15241128298453138,
"grad_norm": 1.5043938510579566,
"learning_rate": 4.988613837742364e-06,
"loss": 0.0841,
"step": 335
},
{
"epoch": 0.15286624203821655,
"grad_norm": 1.9480098961832564,
"learning_rate": 4.9885456072534015e-06,
"loss": 0.093,
"step": 336
},
{
"epoch": 0.15332120109190173,
"grad_norm": 2.0743334215437845,
"learning_rate": 4.988477173411728e-06,
"loss": 0.1001,
"step": 337
},
{
"epoch": 0.1537761601455869,
"grad_norm": 1.3686100112269117,
"learning_rate": 4.988408536222939e-06,
"loss": 0.0706,
"step": 338
},
{
"epoch": 0.15423111919927207,
"grad_norm": 1.7072624744285279,
"learning_rate": 4.9883396956926416e-06,
"loss": 0.0883,
"step": 339
},
{
"epoch": 0.15468607825295724,
"grad_norm": 1.2178991309049074,
"learning_rate": 4.988270651826462e-06,
"loss": 0.066,
"step": 340
},
{
"epoch": 0.15514103730664242,
"grad_norm": 1.5734145514066031,
"learning_rate": 4.988201404630041e-06,
"loss": 0.0818,
"step": 341
},
{
"epoch": 0.15559599636032756,
"grad_norm": 1.4266019263972631,
"learning_rate": 4.988131954109038e-06,
"loss": 0.0835,
"step": 342
},
{
"epoch": 0.15605095541401273,
"grad_norm": 2.2620036917930633,
"learning_rate": 4.988062300269128e-06,
"loss": 0.1374,
"step": 343
},
{
"epoch": 0.1565059144676979,
"grad_norm": 1.4975643248719304,
"learning_rate": 4.987992443116003e-06,
"loss": 0.0817,
"step": 344
},
{
"epoch": 0.15696087352138308,
"grad_norm": 1.723916950757741,
"learning_rate": 4.987922382655372e-06,
"loss": 0.086,
"step": 345
},
{
"epoch": 0.15741583257506825,
"grad_norm": 2.50033376989197,
"learning_rate": 4.987852118892958e-06,
"loss": 0.1498,
"step": 346
},
{
"epoch": 0.15787079162875342,
"grad_norm": 2.0776125106096934,
"learning_rate": 4.987781651834503e-06,
"loss": 0.1258,
"step": 347
},
{
"epoch": 0.1583257506824386,
"grad_norm": 2.186488732885297,
"learning_rate": 4.987710981485768e-06,
"loss": 0.1203,
"step": 348
},
{
"epoch": 0.15878070973612374,
"grad_norm": 2.0497982262406698,
"learning_rate": 4.987640107852525e-06,
"loss": 0.1365,
"step": 349
},
{
"epoch": 0.1592356687898089,
"grad_norm": 1.394060418907116,
"learning_rate": 4.987569030940567e-06,
"loss": 0.0811,
"step": 350
},
{
"epoch": 0.15969062784349408,
"grad_norm": 1.5257209721345255,
"learning_rate": 4.987497750755702e-06,
"loss": 0.0665,
"step": 351
},
{
"epoch": 0.16014558689717925,
"grad_norm": 2.328076306378438,
"learning_rate": 4.987426267303753e-06,
"loss": 0.1186,
"step": 352
},
{
"epoch": 0.16060054595086443,
"grad_norm": 1.8266119344469305,
"learning_rate": 4.987354580590563e-06,
"loss": 0.1011,
"step": 353
},
{
"epoch": 0.1610555050045496,
"grad_norm": 1.7369452160483552,
"learning_rate": 4.987282690621991e-06,
"loss": 0.117,
"step": 354
},
{
"epoch": 0.16151046405823477,
"grad_norm": 1.8346392689418392,
"learning_rate": 4.987210597403907e-06,
"loss": 0.1,
"step": 355
},
{
"epoch": 0.16196542311191992,
"grad_norm": 1.9402353280122917,
"learning_rate": 4.987138300942208e-06,
"loss": 0.0949,
"step": 356
},
{
"epoch": 0.1624203821656051,
"grad_norm": 1.4819316275042067,
"learning_rate": 4.987065801242798e-06,
"loss": 0.0855,
"step": 357
},
{
"epoch": 0.16287534121929026,
"grad_norm": 1.8440191145455884,
"learning_rate": 4.986993098311601e-06,
"loss": 0.1057,
"step": 358
},
{
"epoch": 0.16333030027297543,
"grad_norm": 1.712390016283102,
"learning_rate": 4.986920192154561e-06,
"loss": 0.0917,
"step": 359
},
{
"epoch": 0.1637852593266606,
"grad_norm": 1.2697535382377623,
"learning_rate": 4.986847082777632e-06,
"loss": 0.0729,
"step": 360
},
{
"epoch": 0.16424021838034578,
"grad_norm": 1.5330396115730802,
"learning_rate": 4.986773770186791e-06,
"loss": 0.0966,
"step": 361
},
{
"epoch": 0.16469517743403095,
"grad_norm": 2.359233717201702,
"learning_rate": 4.986700254388027e-06,
"loss": 0.1308,
"step": 362
},
{
"epoch": 0.1651501364877161,
"grad_norm": 1.330733109747955,
"learning_rate": 4.986626535387349e-06,
"loss": 0.0728,
"step": 363
},
{
"epoch": 0.16560509554140126,
"grad_norm": 1.7398719883146694,
"learning_rate": 4.9865526131907795e-06,
"loss": 0.0893,
"step": 364
},
{
"epoch": 0.16606005459508644,
"grad_norm": 2.018839749017437,
"learning_rate": 4.9864784878043595e-06,
"loss": 0.1268,
"step": 365
},
{
"epoch": 0.1665150136487716,
"grad_norm": 2.439244123753763,
"learning_rate": 4.986404159234146e-06,
"loss": 0.1047,
"step": 366
},
{
"epoch": 0.16696997270245678,
"grad_norm": 1.4077243142655576,
"learning_rate": 4.986329627486213e-06,
"loss": 0.07,
"step": 367
},
{
"epoch": 0.16742493175614195,
"grad_norm": 2.0634194365835583,
"learning_rate": 4.986254892566652e-06,
"loss": 0.1199,
"step": 368
},
{
"epoch": 0.16787989080982713,
"grad_norm": 1.507898380305614,
"learning_rate": 4.9861799544815684e-06,
"loss": 0.0798,
"step": 369
},
{
"epoch": 0.16833484986351227,
"grad_norm": 1.5689447325912511,
"learning_rate": 4.986104813237086e-06,
"loss": 0.0872,
"step": 370
},
{
"epoch": 0.16878980891719744,
"grad_norm": 1.5434828853102547,
"learning_rate": 4.986029468839346e-06,
"loss": 0.0756,
"step": 371
},
{
"epoch": 0.16924476797088261,
"grad_norm": 1.9546839136865664,
"learning_rate": 4.985953921294505e-06,
"loss": 0.129,
"step": 372
},
{
"epoch": 0.1696997270245678,
"grad_norm": 1.4457360634551049,
"learning_rate": 4.985878170608736e-06,
"loss": 0.0651,
"step": 373
},
{
"epoch": 0.17015468607825296,
"grad_norm": 1.7053082159754585,
"learning_rate": 4.985802216788228e-06,
"loss": 0.0786,
"step": 374
},
{
"epoch": 0.17060964513193813,
"grad_norm": 2.0831330601859643,
"learning_rate": 4.98572605983919e-06,
"loss": 0.1087,
"step": 375
},
{
"epoch": 0.1710646041856233,
"grad_norm": 1.3106266925763963,
"learning_rate": 4.985649699767842e-06,
"loss": 0.0666,
"step": 376
},
{
"epoch": 0.17151956323930848,
"grad_norm": 1.5931730936354696,
"learning_rate": 4.985573136580427e-06,
"loss": 0.1015,
"step": 377
},
{
"epoch": 0.17197452229299362,
"grad_norm": 1.3398175715153904,
"learning_rate": 4.9854963702832e-06,
"loss": 0.0706,
"step": 378
},
{
"epoch": 0.1724294813466788,
"grad_norm": 1.4932070031671647,
"learning_rate": 4.985419400882433e-06,
"loss": 0.1009,
"step": 379
},
{
"epoch": 0.17288444040036396,
"grad_norm": 2.05809614886543,
"learning_rate": 4.985342228384418e-06,
"loss": 0.1373,
"step": 380
},
{
"epoch": 0.17333939945404914,
"grad_norm": 25.314485102746445,
"learning_rate": 4.985264852795459e-06,
"loss": 0.529,
"step": 381
},
{
"epoch": 0.1737943585077343,
"grad_norm": 1.3496622625056474,
"learning_rate": 4.98518727412188e-06,
"loss": 0.0792,
"step": 382
},
{
"epoch": 0.17424931756141948,
"grad_norm": 2.042157493841037,
"learning_rate": 4.98510949237002e-06,
"loss": 0.1127,
"step": 383
},
{
"epoch": 0.17470427661510465,
"grad_norm": 2.093747109047391,
"learning_rate": 4.985031507546234e-06,
"loss": 0.0931,
"step": 384
},
{
"epoch": 0.1751592356687898,
"grad_norm": 2.620290737475778,
"learning_rate": 4.984953319656896e-06,
"loss": 0.1258,
"step": 385
},
{
"epoch": 0.17561419472247497,
"grad_norm": 1.7812499192074571,
"learning_rate": 4.984874928708395e-06,
"loss": 0.0934,
"step": 386
},
{
"epoch": 0.17606915377616014,
"grad_norm": 1.9861134139953058,
"learning_rate": 4.984796334707136e-06,
"loss": 0.1105,
"step": 387
},
{
"epoch": 0.17652411282984531,
"grad_norm": 9.71210910528449,
"learning_rate": 4.984717537659542e-06,
"loss": 0.119,
"step": 388
},
{
"epoch": 0.1769790718835305,
"grad_norm": 1.2902315877318344,
"learning_rate": 4.984638537572052e-06,
"loss": 0.0591,
"step": 389
},
{
"epoch": 0.17743403093721566,
"grad_norm": 1.693249076147672,
"learning_rate": 4.984559334451121e-06,
"loss": 0.0906,
"step": 390
},
{
"epoch": 0.17788898999090083,
"grad_norm": 1.7045791781932,
"learning_rate": 4.984479928303221e-06,
"loss": 0.066,
"step": 391
},
{
"epoch": 0.17834394904458598,
"grad_norm": 1.588345004423415,
"learning_rate": 4.984400319134841e-06,
"loss": 0.079,
"step": 392
},
{
"epoch": 0.17879890809827115,
"grad_norm": 2.8167066456613368,
"learning_rate": 4.984320506952487e-06,
"loss": 0.1743,
"step": 393
},
{
"epoch": 0.17925386715195632,
"grad_norm": 1.8409665855781128,
"learning_rate": 4.9842404917626796e-06,
"loss": 0.1009,
"step": 394
},
{
"epoch": 0.1797088262056415,
"grad_norm": 1.5444918002986228,
"learning_rate": 4.984160273571959e-06,
"loss": 0.0952,
"step": 395
},
{
"epoch": 0.18016378525932666,
"grad_norm": 1.9718645058282944,
"learning_rate": 4.9840798523868785e-06,
"loss": 0.1217,
"step": 396
},
{
"epoch": 0.18061874431301184,
"grad_norm": 1.669853882784426,
"learning_rate": 4.983999228214011e-06,
"loss": 0.083,
"step": 397
},
{
"epoch": 0.181073703366697,
"grad_norm": 1.5445667787054873,
"learning_rate": 4.983918401059943e-06,
"loss": 0.0838,
"step": 398
},
{
"epoch": 0.18152866242038215,
"grad_norm": 1.8477622601816133,
"learning_rate": 4.983837370931282e-06,
"loss": 0.1199,
"step": 399
},
{
"epoch": 0.18198362147406733,
"grad_norm": 2.295804335093856,
"learning_rate": 4.983756137834647e-06,
"loss": 0.1561,
"step": 400
},
{
"epoch": 0.1824385805277525,
"grad_norm": 2.1902816453958933,
"learning_rate": 4.9836747017766765e-06,
"loss": 0.1014,
"step": 401
},
{
"epoch": 0.18289353958143767,
"grad_norm": 1.7414949549224419,
"learning_rate": 4.983593062764027e-06,
"loss": 0.1046,
"step": 402
},
{
"epoch": 0.18334849863512284,
"grad_norm": 3.529761555914209,
"learning_rate": 4.983511220803367e-06,
"loss": 0.1573,
"step": 403
},
{
"epoch": 0.18380345768880801,
"grad_norm": 1.5931351386368249,
"learning_rate": 4.983429175901386e-06,
"loss": 0.0896,
"step": 404
},
{
"epoch": 0.1842584167424932,
"grad_norm": 1.4617184144821196,
"learning_rate": 4.983346928064788e-06,
"loss": 0.0698,
"step": 405
},
{
"epoch": 0.18471337579617833,
"grad_norm": 1.564679441746091,
"learning_rate": 4.9832644773002935e-06,
"loss": 0.0955,
"step": 406
},
{
"epoch": 0.1851683348498635,
"grad_norm": 1.4077890282448986,
"learning_rate": 4.98318182361464e-06,
"loss": 0.0887,
"step": 407
},
{
"epoch": 0.18562329390354868,
"grad_norm": 1.6028267121804223,
"learning_rate": 4.9830989670145825e-06,
"loss": 0.0989,
"step": 408
},
{
"epoch": 0.18607825295723385,
"grad_norm": 1.8479648547197383,
"learning_rate": 4.9830159075068905e-06,
"loss": 0.1009,
"step": 409
},
{
"epoch": 0.18653321201091902,
"grad_norm": 1.8145495712184487,
"learning_rate": 4.9829326450983514e-06,
"loss": 0.1125,
"step": 410
},
{
"epoch": 0.1869881710646042,
"grad_norm": 1.839873930402737,
"learning_rate": 4.98284917979577e-06,
"loss": 0.0975,
"step": 411
},
{
"epoch": 0.18744313011828936,
"grad_norm": 2.3433237407057863,
"learning_rate": 4.9827655116059656e-06,
"loss": 0.1061,
"step": 412
},
{
"epoch": 0.18789808917197454,
"grad_norm": 1.479552769836274,
"learning_rate": 4.9826816405357755e-06,
"loss": 0.105,
"step": 413
},
{
"epoch": 0.18835304822565968,
"grad_norm": 1.0380040250679141,
"learning_rate": 4.982597566592054e-06,
"loss": 0.0522,
"step": 414
},
{
"epoch": 0.18880800727934485,
"grad_norm": 2.2146611071914744,
"learning_rate": 4.982513289781671e-06,
"loss": 0.1403,
"step": 415
},
{
"epoch": 0.18926296633303002,
"grad_norm": 1.4265466923705232,
"learning_rate": 4.982428810111512e-06,
"loss": 0.0846,
"step": 416
},
{
"epoch": 0.1897179253867152,
"grad_norm": 1.4254072959974569,
"learning_rate": 4.9823441275884814e-06,
"loss": 0.0787,
"step": 417
},
{
"epoch": 0.19017288444040037,
"grad_norm": 2.353200458571576,
"learning_rate": 4.982259242219499e-06,
"loss": 0.1114,
"step": 418
},
{
"epoch": 0.19062784349408554,
"grad_norm": 1.3512279730893322,
"learning_rate": 4.9821741540115006e-06,
"loss": 0.0678,
"step": 419
},
{
"epoch": 0.1910828025477707,
"grad_norm": 1.728060266498106,
"learning_rate": 4.982088862971441e-06,
"loss": 0.1129,
"step": 420
},
{
"epoch": 0.19153776160145586,
"grad_norm": 1.8022543001727114,
"learning_rate": 4.982003369106287e-06,
"loss": 0.1036,
"step": 421
},
{
"epoch": 0.19199272065514103,
"grad_norm": 1.2312712834502222,
"learning_rate": 4.981917672423028e-06,
"loss": 0.065,
"step": 422
},
{
"epoch": 0.1924476797088262,
"grad_norm": 1.6183848549336255,
"learning_rate": 4.981831772928664e-06,
"loss": 0.0934,
"step": 423
},
{
"epoch": 0.19290263876251137,
"grad_norm": 2.001713262915152,
"learning_rate": 4.981745670630216e-06,
"loss": 0.1356,
"step": 424
},
{
"epoch": 0.19335759781619655,
"grad_norm": 2.0057745044552995,
"learning_rate": 4.981659365534718e-06,
"loss": 0.1285,
"step": 425
},
{
"epoch": 0.19381255686988172,
"grad_norm": 2.299079022869691,
"learning_rate": 4.981572857649225e-06,
"loss": 0.1195,
"step": 426
},
{
"epoch": 0.1942675159235669,
"grad_norm": 1.6869951958248894,
"learning_rate": 4.981486146980804e-06,
"loss": 0.0877,
"step": 427
},
{
"epoch": 0.19472247497725204,
"grad_norm": 1.9301190501764922,
"learning_rate": 4.9813992335365415e-06,
"loss": 0.0977,
"step": 428
},
{
"epoch": 0.1951774340309372,
"grad_norm": 1.6227704434432904,
"learning_rate": 4.98131211732354e-06,
"loss": 0.1035,
"step": 429
},
{
"epoch": 0.19563239308462238,
"grad_norm": 1.632769015838627,
"learning_rate": 4.981224798348917e-06,
"loss": 0.0833,
"step": 430
},
{
"epoch": 0.19608735213830755,
"grad_norm": 2.3862639707091082,
"learning_rate": 4.981137276619809e-06,
"loss": 0.1419,
"step": 431
},
{
"epoch": 0.19654231119199272,
"grad_norm": 1.2625986411158334,
"learning_rate": 4.9810495521433675e-06,
"loss": 0.078,
"step": 432
},
{
"epoch": 0.1969972702456779,
"grad_norm": 2.5081068393508157,
"learning_rate": 4.9809616249267616e-06,
"loss": 0.1478,
"step": 433
},
{
"epoch": 0.19745222929936307,
"grad_norm": 1.9644808854065114,
"learning_rate": 4.980873494977174e-06,
"loss": 0.121,
"step": 434
},
{
"epoch": 0.1979071883530482,
"grad_norm": 1.647433915922947,
"learning_rate": 4.98078516230181e-06,
"loss": 0.0865,
"step": 435
},
{
"epoch": 0.19836214740673339,
"grad_norm": 1.5774273491436515,
"learning_rate": 4.980696626907884e-06,
"loss": 0.0887,
"step": 436
},
{
"epoch": 0.19881710646041856,
"grad_norm": 1.5604062690588907,
"learning_rate": 4.980607888802633e-06,
"loss": 0.1,
"step": 437
},
{
"epoch": 0.19927206551410373,
"grad_norm": 1.548442809835796,
"learning_rate": 4.980518947993307e-06,
"loss": 0.1005,
"step": 438
},
{
"epoch": 0.1997270245677889,
"grad_norm": 1.6276180373825353,
"learning_rate": 4.980429804487176e-06,
"loss": 0.1006,
"step": 439
},
{
"epoch": 0.20018198362147407,
"grad_norm": 1.5718547041391637,
"learning_rate": 4.980340458291521e-06,
"loss": 0.0858,
"step": 440
},
{
"epoch": 0.20063694267515925,
"grad_norm": 1.3679183632524226,
"learning_rate": 4.980250909413646e-06,
"loss": 0.0901,
"step": 441
},
{
"epoch": 0.2010919017288444,
"grad_norm": 1.7491296961984788,
"learning_rate": 4.980161157860867e-06,
"loss": 0.0888,
"step": 442
},
{
"epoch": 0.20154686078252956,
"grad_norm": 2.0306839493761446,
"learning_rate": 4.980071203640519e-06,
"loss": 0.0893,
"step": 443
},
{
"epoch": 0.20200181983621474,
"grad_norm": 1.353153596211688,
"learning_rate": 4.979981046759952e-06,
"loss": 0.0753,
"step": 444
},
{
"epoch": 0.2024567788898999,
"grad_norm": 1.969605104045741,
"learning_rate": 4.979890687226533e-06,
"loss": 0.1033,
"step": 445
},
{
"epoch": 0.20291173794358508,
"grad_norm": 2.085518332646124,
"learning_rate": 4.979800125047647e-06,
"loss": 0.0979,
"step": 446
},
{
"epoch": 0.20336669699727025,
"grad_norm": 1.6181669031153556,
"learning_rate": 4.979709360230692e-06,
"loss": 0.0969,
"step": 447
},
{
"epoch": 0.20382165605095542,
"grad_norm": 1.6760914355637484,
"learning_rate": 4.979618392783087e-06,
"loss": 0.0883,
"step": 448
},
{
"epoch": 0.20427661510464057,
"grad_norm": 1.2907730003800948,
"learning_rate": 4.979527222712266e-06,
"loss": 0.0775,
"step": 449
},
{
"epoch": 0.20473157415832574,
"grad_norm": 1.241096973502198,
"learning_rate": 4.9794358500256765e-06,
"loss": 0.0599,
"step": 450
},
{
"epoch": 0.2051865332120109,
"grad_norm": 1.579037640818148,
"learning_rate": 4.979344274730786e-06,
"loss": 0.0831,
"step": 451
},
{
"epoch": 0.20564149226569609,
"grad_norm": 2.225915719971972,
"learning_rate": 4.979252496835079e-06,
"loss": 0.1116,
"step": 452
},
{
"epoch": 0.20609645131938126,
"grad_norm": 2.3031173397129923,
"learning_rate": 4.979160516346054e-06,
"loss": 0.1536,
"step": 453
},
{
"epoch": 0.20655141037306643,
"grad_norm": 27.297310781833385,
"learning_rate": 4.979068333271227e-06,
"loss": 0.9223,
"step": 454
},
{
"epoch": 0.2070063694267516,
"grad_norm": 2.4041431299507607,
"learning_rate": 4.978975947618131e-06,
"loss": 0.1184,
"step": 455
},
{
"epoch": 0.20746132848043677,
"grad_norm": 1.6683861662324915,
"learning_rate": 4.978883359394316e-06,
"loss": 0.1301,
"step": 456
},
{
"epoch": 0.20791628753412192,
"grad_norm": 1.9056814965685545,
"learning_rate": 4.978790568607347e-06,
"loss": 0.1001,
"step": 457
},
{
"epoch": 0.2083712465878071,
"grad_norm": 1.9713836323302738,
"learning_rate": 4.9786975752648076e-06,
"loss": 0.1174,
"step": 458
},
{
"epoch": 0.20882620564149226,
"grad_norm": 1.598376196967646,
"learning_rate": 4.978604379374295e-06,
"loss": 0.0986,
"step": 459
},
{
"epoch": 0.20928116469517744,
"grad_norm": 1.5517923833736031,
"learning_rate": 4.978510980943427e-06,
"loss": 0.0807,
"step": 460
},
{
"epoch": 0.2097361237488626,
"grad_norm": 2.004418653450344,
"learning_rate": 4.978417379979834e-06,
"loss": 0.1065,
"step": 461
},
{
"epoch": 0.21019108280254778,
"grad_norm": 1.7753220163198007,
"learning_rate": 4.978323576491165e-06,
"loss": 0.0987,
"step": 462
},
{
"epoch": 0.21064604185623295,
"grad_norm": 1.7384737383317277,
"learning_rate": 4.978229570485085e-06,
"loss": 0.1048,
"step": 463
},
{
"epoch": 0.2111010009099181,
"grad_norm": 1.5352099211420311,
"learning_rate": 4.978135361969276e-06,
"loss": 0.0983,
"step": 464
},
{
"epoch": 0.21155595996360327,
"grad_norm": 1.6028799125387194,
"learning_rate": 4.9780409509514375e-06,
"loss": 0.091,
"step": 465
},
{
"epoch": 0.21201091901728844,
"grad_norm": 1.9664054893168261,
"learning_rate": 4.977946337439282e-06,
"loss": 0.1495,
"step": 466
},
{
"epoch": 0.2124658780709736,
"grad_norm": 1.7122667851036462,
"learning_rate": 4.9778515214405436e-06,
"loss": 0.1139,
"step": 467
},
{
"epoch": 0.21292083712465878,
"grad_norm": 1.7566455248377864,
"learning_rate": 4.977756502962967e-06,
"loss": 0.1097,
"step": 468
},
{
"epoch": 0.21337579617834396,
"grad_norm": 1.1350501611425003,
"learning_rate": 4.97766128201432e-06,
"loss": 0.0629,
"step": 469
},
{
"epoch": 0.21383075523202913,
"grad_norm": 1.2023067292666059,
"learning_rate": 4.977565858602381e-06,
"loss": 0.0782,
"step": 470
},
{
"epoch": 0.21428571428571427,
"grad_norm": 1.628252441426902,
"learning_rate": 4.977470232734949e-06,
"loss": 0.0987,
"step": 471
},
{
"epoch": 0.21474067333939945,
"grad_norm": 1.724322735405813,
"learning_rate": 4.977374404419838e-06,
"loss": 0.0903,
"step": 472
},
{
"epoch": 0.21519563239308462,
"grad_norm": 1.470263169494043,
"learning_rate": 4.977278373664877e-06,
"loss": 0.0882,
"step": 473
},
{
"epoch": 0.2156505914467698,
"grad_norm": 2.599396527432543,
"learning_rate": 4.977182140477916e-06,
"loss": 0.1209,
"step": 474
},
{
"epoch": 0.21610555050045496,
"grad_norm": 1.6800447119151198,
"learning_rate": 4.977085704866817e-06,
"loss": 0.0776,
"step": 475
},
{
"epoch": 0.21656050955414013,
"grad_norm": 1.5595540666125045,
"learning_rate": 4.97698906683946e-06,
"loss": 0.103,
"step": 476
},
{
"epoch": 0.2170154686078253,
"grad_norm": 2.248635180290087,
"learning_rate": 4.9768922264037435e-06,
"loss": 0.1388,
"step": 477
},
{
"epoch": 0.21747042766151045,
"grad_norm": 1.1547627152960565,
"learning_rate": 4.976795183567579e-06,
"loss": 0.0624,
"step": 478
},
{
"epoch": 0.21792538671519562,
"grad_norm": 1.56353757750327,
"learning_rate": 4.976697938338898e-06,
"loss": 0.0856,
"step": 479
},
{
"epoch": 0.2183803457688808,
"grad_norm": 1.2335181237621284,
"learning_rate": 4.976600490725645e-06,
"loss": 0.0644,
"step": 480
},
{
"epoch": 0.21883530482256597,
"grad_norm": 1.900991648340467,
"learning_rate": 4.976502840735785e-06,
"loss": 0.153,
"step": 481
},
{
"epoch": 0.21929026387625114,
"grad_norm": 1.3078243371858722,
"learning_rate": 4.976404988377297e-06,
"loss": 0.0621,
"step": 482
},
{
"epoch": 0.2197452229299363,
"grad_norm": 2.0047686247285923,
"learning_rate": 4.976306933658176e-06,
"loss": 0.1136,
"step": 483
},
{
"epoch": 0.22020018198362148,
"grad_norm": 1.8552855878852923,
"learning_rate": 4.976208676586435e-06,
"loss": 0.1284,
"step": 484
},
{
"epoch": 0.22065514103730663,
"grad_norm": 1.8525936784229493,
"learning_rate": 4.976110217170104e-06,
"loss": 0.0917,
"step": 485
},
{
"epoch": 0.2211101000909918,
"grad_norm": 1.4658188242525991,
"learning_rate": 4.976011555417228e-06,
"loss": 0.0749,
"step": 486
},
{
"epoch": 0.22156505914467697,
"grad_norm": 1.1511032936840262,
"learning_rate": 4.975912691335869e-06,
"loss": 0.0761,
"step": 487
},
{
"epoch": 0.22202001819836215,
"grad_norm": 1.458580259230844,
"learning_rate": 4.975813624934106e-06,
"loss": 0.0768,
"step": 488
},
{
"epoch": 0.22247497725204732,
"grad_norm": 1.5627508232221192,
"learning_rate": 4.975714356220035e-06,
"loss": 0.0823,
"step": 489
},
{
"epoch": 0.2229299363057325,
"grad_norm": 1.075721834306004,
"learning_rate": 4.975614885201766e-06,
"loss": 0.0504,
"step": 490
},
{
"epoch": 0.22338489535941766,
"grad_norm": 1.6198884733457342,
"learning_rate": 4.975515211887429e-06,
"loss": 0.1024,
"step": 491
},
{
"epoch": 0.22383985441310283,
"grad_norm": 1.6346417323820548,
"learning_rate": 4.9754153362851684e-06,
"loss": 0.0851,
"step": 492
},
{
"epoch": 0.22429481346678798,
"grad_norm": 2.448143027911265,
"learning_rate": 4.975315258403145e-06,
"loss": 0.1479,
"step": 493
},
{
"epoch": 0.22474977252047315,
"grad_norm": 1.6016068432961146,
"learning_rate": 4.975214978249537e-06,
"loss": 0.0886,
"step": 494
},
{
"epoch": 0.22520473157415832,
"grad_norm": 1.4721161321318619,
"learning_rate": 4.975114495832539e-06,
"loss": 0.0976,
"step": 495
},
{
"epoch": 0.2256596906278435,
"grad_norm": 1.7625335294527533,
"learning_rate": 4.975013811160362e-06,
"loss": 0.0898,
"step": 496
},
{
"epoch": 0.22611464968152867,
"grad_norm": 1.9298670425360585,
"learning_rate": 4.974912924241233e-06,
"loss": 0.1027,
"step": 497
},
{
"epoch": 0.22656960873521384,
"grad_norm": 1.4996755802132458,
"learning_rate": 4.974811835083397e-06,
"loss": 0.0978,
"step": 498
},
{
"epoch": 0.227024567788899,
"grad_norm": 2.1147277125940955,
"learning_rate": 4.974710543695114e-06,
"loss": 0.1063,
"step": 499
},
{
"epoch": 0.22747952684258416,
"grad_norm": 2.529920688558412,
"learning_rate": 4.974609050084661e-06,
"loss": 0.1476,
"step": 500
},
{
"epoch": 0.22793448589626933,
"grad_norm": 2.14209787933433,
"learning_rate": 4.974507354260332e-06,
"loss": 0.1261,
"step": 501
},
{
"epoch": 0.2283894449499545,
"grad_norm": 1.9058176611193165,
"learning_rate": 4.974405456230436e-06,
"loss": 0.1203,
"step": 502
},
{
"epoch": 0.22884440400363967,
"grad_norm": 1.8980074058725056,
"learning_rate": 4.974303356003301e-06,
"loss": 0.0996,
"step": 503
},
{
"epoch": 0.22929936305732485,
"grad_norm": 1.4579903539692274,
"learning_rate": 4.974201053587268e-06,
"loss": 0.0943,
"step": 504
},
{
"epoch": 0.22975432211101002,
"grad_norm": 1.3940386820106656,
"learning_rate": 4.9740985489907005e-06,
"loss": 0.0663,
"step": 505
},
{
"epoch": 0.2302092811646952,
"grad_norm": 2.441971054754706,
"learning_rate": 4.973995842221971e-06,
"loss": 0.1245,
"step": 506
},
{
"epoch": 0.23066424021838033,
"grad_norm": 1.919620601900113,
"learning_rate": 4.973892933289476e-06,
"loss": 0.1159,
"step": 507
},
{
"epoch": 0.2311191992720655,
"grad_norm": 1.672712776153676,
"learning_rate": 4.97378982220162e-06,
"loss": 0.0981,
"step": 508
},
{
"epoch": 0.23157415832575068,
"grad_norm": 1.2125382683302124,
"learning_rate": 4.973686508966832e-06,
"loss": 0.0601,
"step": 509
},
{
"epoch": 0.23202911737943585,
"grad_norm": 1.222443145221144,
"learning_rate": 4.973582993593554e-06,
"loss": 0.0715,
"step": 510
},
{
"epoch": 0.23248407643312102,
"grad_norm": 1.5223951861259333,
"learning_rate": 4.973479276090244e-06,
"loss": 0.0795,
"step": 511
},
{
"epoch": 0.2329390354868062,
"grad_norm": 1.2392582362318094,
"learning_rate": 4.973375356465378e-06,
"loss": 0.061,
"step": 512
},
{
"epoch": 0.23339399454049137,
"grad_norm": 1.7285156139774616,
"learning_rate": 4.973271234727447e-06,
"loss": 0.1201,
"step": 513
},
{
"epoch": 0.2338489535941765,
"grad_norm": 1.4723786585295477,
"learning_rate": 4.97316691088496e-06,
"loss": 0.0885,
"step": 514
},
{
"epoch": 0.23430391264786168,
"grad_norm": 2.25192801645438,
"learning_rate": 4.973062384946442e-06,
"loss": 0.135,
"step": 515
},
{
"epoch": 0.23475887170154686,
"grad_norm": 1.1373098395352674,
"learning_rate": 4.9729576569204345e-06,
"loss": 0.0728,
"step": 516
},
{
"epoch": 0.23521383075523203,
"grad_norm": 1.5300830315604266,
"learning_rate": 4.972852726815495e-06,
"loss": 0.0941,
"step": 517
},
{
"epoch": 0.2356687898089172,
"grad_norm": 1.8026113068627658,
"learning_rate": 4.972747594640197e-06,
"loss": 0.1247,
"step": 518
},
{
"epoch": 0.23612374886260237,
"grad_norm": 1.794104737159684,
"learning_rate": 4.9726422604031335e-06,
"loss": 0.095,
"step": 519
},
{
"epoch": 0.23657870791628755,
"grad_norm": 1.1504559186965777,
"learning_rate": 4.97253672411291e-06,
"loss": 0.0674,
"step": 520
},
{
"epoch": 0.2370336669699727,
"grad_norm": 1.4316672986650767,
"learning_rate": 4.972430985778152e-06,
"loss": 0.0702,
"step": 521
},
{
"epoch": 0.23748862602365786,
"grad_norm": 1.5328603666600327,
"learning_rate": 4.972325045407499e-06,
"loss": 0.0675,
"step": 522
},
{
"epoch": 0.23794358507734303,
"grad_norm": 3.2405357176859857,
"learning_rate": 4.972218903009608e-06,
"loss": 0.1212,
"step": 523
},
{
"epoch": 0.2383985441310282,
"grad_norm": 1.5109558607242208,
"learning_rate": 4.972112558593153e-06,
"loss": 0.0938,
"step": 524
},
{
"epoch": 0.23885350318471338,
"grad_norm": 1.264935168060258,
"learning_rate": 4.972006012166823e-06,
"loss": 0.0742,
"step": 525
},
{
"epoch": 0.23930846223839855,
"grad_norm": 1.3461924059029533,
"learning_rate": 4.971899263739326e-06,
"loss": 0.0844,
"step": 526
},
{
"epoch": 0.23976342129208372,
"grad_norm": 1.7441591810954875,
"learning_rate": 4.971792313319384e-06,
"loss": 0.1139,
"step": 527
},
{
"epoch": 0.24021838034576887,
"grad_norm": 1.7027600325330141,
"learning_rate": 4.971685160915737e-06,
"loss": 0.0867,
"step": 528
},
{
"epoch": 0.24067333939945404,
"grad_norm": 1.6301828004618641,
"learning_rate": 4.971577806537139e-06,
"loss": 0.0943,
"step": 529
},
{
"epoch": 0.2411282984531392,
"grad_norm": 1.6173281507194255,
"learning_rate": 4.971470250192366e-06,
"loss": 0.1052,
"step": 530
},
{
"epoch": 0.24158325750682438,
"grad_norm": 17.712189021618492,
"learning_rate": 4.9713624918902045e-06,
"loss": 0.3191,
"step": 531
},
{
"epoch": 0.24203821656050956,
"grad_norm": 2.336934606774547,
"learning_rate": 4.971254531639461e-06,
"loss": 0.1347,
"step": 532
},
{
"epoch": 0.24249317561419473,
"grad_norm": 1.8922827015678323,
"learning_rate": 4.971146369448957e-06,
"loss": 0.1144,
"step": 533
},
{
"epoch": 0.2429481346678799,
"grad_norm": 1.7408688040721931,
"learning_rate": 4.971038005327532e-06,
"loss": 0.1143,
"step": 534
},
{
"epoch": 0.24340309372156507,
"grad_norm": 1.9327103804196282,
"learning_rate": 4.970929439284039e-06,
"loss": 0.1377,
"step": 535
},
{
"epoch": 0.24385805277525022,
"grad_norm": 2.0181579320929224,
"learning_rate": 4.970820671327351e-06,
"loss": 0.1259,
"step": 536
},
{
"epoch": 0.2443130118289354,
"grad_norm": 1.1056426992050885,
"learning_rate": 4.9707117014663565e-06,
"loss": 0.0633,
"step": 537
},
{
"epoch": 0.24476797088262056,
"grad_norm": 1.853338129642874,
"learning_rate": 4.97060252970996e-06,
"loss": 0.1215,
"step": 538
},
{
"epoch": 0.24522292993630573,
"grad_norm": 1.6843406450831364,
"learning_rate": 4.970493156067081e-06,
"loss": 0.1016,
"step": 539
},
{
"epoch": 0.2456778889899909,
"grad_norm": 1.1701908663612965,
"learning_rate": 4.970383580546658e-06,
"loss": 0.0731,
"step": 540
},
{
"epoch": 0.24613284804367608,
"grad_norm": 1.7890527407391215,
"learning_rate": 4.970273803157645e-06,
"loss": 0.1097,
"step": 541
},
{
"epoch": 0.24658780709736125,
"grad_norm": 1.4169073671700831,
"learning_rate": 4.970163823909013e-06,
"loss": 0.0845,
"step": 542
},
{
"epoch": 0.2470427661510464,
"grad_norm": 1.5828589024944335,
"learning_rate": 4.970053642809748e-06,
"loss": 0.0921,
"step": 543
},
{
"epoch": 0.24749772520473157,
"grad_norm": 1.6370747251722932,
"learning_rate": 4.969943259868853e-06,
"loss": 0.1088,
"step": 544
},
{
"epoch": 0.24795268425841674,
"grad_norm": 2.023470308157194,
"learning_rate": 4.969832675095351e-06,
"loss": 0.1052,
"step": 545
},
{
"epoch": 0.2484076433121019,
"grad_norm": 1.7462230999429424,
"learning_rate": 4.969721888498275e-06,
"loss": 0.1141,
"step": 546
},
{
"epoch": 0.24886260236578708,
"grad_norm": 1.428774250085193,
"learning_rate": 4.96961090008668e-06,
"loss": 0.0824,
"step": 547
},
{
"epoch": 0.24931756141947226,
"grad_norm": 1.6447081301063733,
"learning_rate": 4.969499709869635e-06,
"loss": 0.1324,
"step": 548
},
{
"epoch": 0.24977252047315743,
"grad_norm": 2.0250820847646054,
"learning_rate": 4.969388317856225e-06,
"loss": 0.1122,
"step": 549
},
{
"epoch": 0.2502274795268426,
"grad_norm": 2.060820071851061,
"learning_rate": 4.969276724055554e-06,
"loss": 0.128,
"step": 550
},
{
"epoch": 0.25068243858052774,
"grad_norm": 1.8421595012757042,
"learning_rate": 4.969164928476741e-06,
"loss": 0.0929,
"step": 551
},
{
"epoch": 0.25113739763421294,
"grad_norm": 1.8378761522798848,
"learning_rate": 4.969052931128919e-06,
"loss": 0.1038,
"step": 552
},
{
"epoch": 0.2515923566878981,
"grad_norm": 1.4559119574869848,
"learning_rate": 4.968940732021243e-06,
"loss": 0.0884,
"step": 553
},
{
"epoch": 0.25204731574158323,
"grad_norm": 1.9971887851212364,
"learning_rate": 4.9688283311628795e-06,
"loss": 0.1353,
"step": 554
},
{
"epoch": 0.25250227479526843,
"grad_norm": 1.7386639848323409,
"learning_rate": 4.968715728563014e-06,
"loss": 0.1025,
"step": 555
},
{
"epoch": 0.2529572338489536,
"grad_norm": 1.260155855896464,
"learning_rate": 4.968602924230847e-06,
"loss": 0.0684,
"step": 556
},
{
"epoch": 0.2534121929026388,
"grad_norm": 2.3395689748358843,
"learning_rate": 4.968489918175598e-06,
"loss": 0.1151,
"step": 557
},
{
"epoch": 0.2538671519563239,
"grad_norm": 2.0737729432038137,
"learning_rate": 4.9683767104065014e-06,
"loss": 0.107,
"step": 558
},
{
"epoch": 0.2543221110100091,
"grad_norm": 1.4554456387078378,
"learning_rate": 4.968263300932806e-06,
"loss": 0.0674,
"step": 559
},
{
"epoch": 0.25477707006369427,
"grad_norm": 1.236095562563839,
"learning_rate": 4.968149689763781e-06,
"loss": 0.0771,
"step": 560
},
{
"epoch": 0.2552320291173794,
"grad_norm": 1.6261579693523964,
"learning_rate": 4.968035876908708e-06,
"loss": 0.1033,
"step": 561
},
{
"epoch": 0.2556869881710646,
"grad_norm": 1.8267174614929946,
"learning_rate": 4.967921862376889e-06,
"loss": 0.1153,
"step": 562
},
{
"epoch": 0.25614194722474976,
"grad_norm": 1.9897704292294367,
"learning_rate": 4.9678076461776415e-06,
"loss": 0.1168,
"step": 563
},
{
"epoch": 0.25659690627843496,
"grad_norm": 1.9727936679798233,
"learning_rate": 4.9676932283202965e-06,
"loss": 0.1389,
"step": 564
},
{
"epoch": 0.2570518653321201,
"grad_norm": 1.8484690700205213,
"learning_rate": 4.967578608814205e-06,
"loss": 0.1024,
"step": 565
},
{
"epoch": 0.2575068243858053,
"grad_norm": 1.4833575893287436,
"learning_rate": 4.9674637876687345e-06,
"loss": 0.0959,
"step": 566
},
{
"epoch": 0.25796178343949044,
"grad_norm": 1.0731244531443167,
"learning_rate": 4.967348764893265e-06,
"loss": 0.0652,
"step": 567
},
{
"epoch": 0.2584167424931756,
"grad_norm": 1.882586364820984,
"learning_rate": 4.967233540497197e-06,
"loss": 0.0887,
"step": 568
},
{
"epoch": 0.2588717015468608,
"grad_norm": 1.5585900206462215,
"learning_rate": 4.967118114489946e-06,
"loss": 0.0705,
"step": 569
},
{
"epoch": 0.25932666060054593,
"grad_norm": 1.4304247727655925,
"learning_rate": 4.967002486880944e-06,
"loss": 0.0689,
"step": 570
},
{
"epoch": 0.25978161965423113,
"grad_norm": 1.996611084455256,
"learning_rate": 4.966886657679641e-06,
"loss": 0.1134,
"step": 571
},
{
"epoch": 0.2602365787079163,
"grad_norm": 2.573142554440562,
"learning_rate": 4.966770626895499e-06,
"loss": 0.137,
"step": 572
},
{
"epoch": 0.2606915377616015,
"grad_norm": 1.7759211248358038,
"learning_rate": 4.966654394538002e-06,
"loss": 0.097,
"step": 573
},
{
"epoch": 0.2611464968152866,
"grad_norm": 1.3021079669208342,
"learning_rate": 4.966537960616646e-06,
"loss": 0.0774,
"step": 574
},
{
"epoch": 0.26160145586897177,
"grad_norm": 2.328733131052364,
"learning_rate": 4.9664213251409486e-06,
"loss": 0.1105,
"step": 575
},
{
"epoch": 0.26205641492265697,
"grad_norm": 2.281267812919593,
"learning_rate": 4.9663044881204375e-06,
"loss": 0.1556,
"step": 576
},
{
"epoch": 0.2625113739763421,
"grad_norm": 1.7215892787568372,
"learning_rate": 4.9661874495646615e-06,
"loss": 0.0917,
"step": 577
},
{
"epoch": 0.2629663330300273,
"grad_norm": 1.3072003221216781,
"learning_rate": 4.9660702094831845e-06,
"loss": 0.0818,
"step": 578
},
{
"epoch": 0.26342129208371245,
"grad_norm": 2.141135787879026,
"learning_rate": 4.965952767885587e-06,
"loss": 0.1187,
"step": 579
},
{
"epoch": 0.26387625113739765,
"grad_norm": 2.3440295569320857,
"learning_rate": 4.965835124781465e-06,
"loss": 0.1336,
"step": 580
},
{
"epoch": 0.2643312101910828,
"grad_norm": 1.2377586425554465,
"learning_rate": 4.965717280180432e-06,
"loss": 0.0771,
"step": 581
},
{
"epoch": 0.26478616924476794,
"grad_norm": 1.5553208083958672,
"learning_rate": 4.965599234092118e-06,
"loss": 0.0906,
"step": 582
},
{
"epoch": 0.26524112829845314,
"grad_norm": 1.676762616981095,
"learning_rate": 4.96548098652617e-06,
"loss": 0.1091,
"step": 583
},
{
"epoch": 0.2656960873521383,
"grad_norm": 1.8329426527347645,
"learning_rate": 4.965362537492249e-06,
"loss": 0.1171,
"step": 584
},
{
"epoch": 0.2661510464058235,
"grad_norm": 1.2752855217123082,
"learning_rate": 4.9652438870000356e-06,
"loss": 0.0726,
"step": 585
},
{
"epoch": 0.26660600545950863,
"grad_norm": 1.188941544645384,
"learning_rate": 4.965125035059224e-06,
"loss": 0.0801,
"step": 586
},
{
"epoch": 0.26706096451319383,
"grad_norm": 1.4654127807937742,
"learning_rate": 4.965005981679527e-06,
"loss": 0.0839,
"step": 587
},
{
"epoch": 0.267515923566879,
"grad_norm": 2.0288718475884107,
"learning_rate": 4.964886726870673e-06,
"loss": 0.1239,
"step": 588
},
{
"epoch": 0.2679708826205642,
"grad_norm": 1.972686660841513,
"learning_rate": 4.964767270642407e-06,
"loss": 0.1004,
"step": 589
},
{
"epoch": 0.2684258416742493,
"grad_norm": 1.6499743360699521,
"learning_rate": 4.964647613004491e-06,
"loss": 0.0976,
"step": 590
},
{
"epoch": 0.26888080072793447,
"grad_norm": 1.5661213245685233,
"learning_rate": 4.964527753966702e-06,
"loss": 0.0818,
"step": 591
},
{
"epoch": 0.26933575978161967,
"grad_norm": 1.387453226127614,
"learning_rate": 4.964407693538834e-06,
"loss": 0.0813,
"step": 592
},
{
"epoch": 0.2697907188353048,
"grad_norm": 1.8652006740776592,
"learning_rate": 4.9642874317307e-06,
"loss": 0.1092,
"step": 593
},
{
"epoch": 0.27024567788899,
"grad_norm": 1.6739291749648295,
"learning_rate": 4.964166968552124e-06,
"loss": 0.1262,
"step": 594
},
{
"epoch": 0.27070063694267515,
"grad_norm": 1.4965319066427345,
"learning_rate": 4.9640463040129525e-06,
"loss": 0.0749,
"step": 595
},
{
"epoch": 0.27115559599636035,
"grad_norm": 1.483777185503557,
"learning_rate": 4.963925438123044e-06,
"loss": 0.075,
"step": 596
},
{
"epoch": 0.2716105550500455,
"grad_norm": 1.646106287941782,
"learning_rate": 4.963804370892276e-06,
"loss": 0.0948,
"step": 597
},
{
"epoch": 0.27206551410373064,
"grad_norm": 1.8923424637891237,
"learning_rate": 4.9636831023305405e-06,
"loss": 0.1296,
"step": 598
},
{
"epoch": 0.27252047315741584,
"grad_norm": 1.453967822900046,
"learning_rate": 4.963561632447748e-06,
"loss": 0.0777,
"step": 599
},
{
"epoch": 0.272975432211101,
"grad_norm": 1.2633146266239919,
"learning_rate": 4.9634399612538255e-06,
"loss": 0.0704,
"step": 600
},
{
"epoch": 0.2734303912647862,
"grad_norm": 24.856853600017228,
"learning_rate": 4.963318088758714e-06,
"loss": 0.4372,
"step": 601
},
{
"epoch": 0.27388535031847133,
"grad_norm": 1.6301604814034822,
"learning_rate": 4.963196014972371e-06,
"loss": 0.0879,
"step": 602
},
{
"epoch": 0.27434030937215653,
"grad_norm": 1.556460730817159,
"learning_rate": 4.963073739904775e-06,
"loss": 0.0893,
"step": 603
},
{
"epoch": 0.2747952684258417,
"grad_norm": 1.657318032059153,
"learning_rate": 4.962951263565915e-06,
"loss": 0.0933,
"step": 604
},
{
"epoch": 0.2752502274795268,
"grad_norm": 2.273490391362205,
"learning_rate": 4.962828585965801e-06,
"loss": 0.1038,
"step": 605
},
{
"epoch": 0.275705186533212,
"grad_norm": 1.5114052665682505,
"learning_rate": 4.962705707114457e-06,
"loss": 0.097,
"step": 606
},
{
"epoch": 0.27616014558689717,
"grad_norm": 1.7683179621585026,
"learning_rate": 4.962582627021923e-06,
"loss": 0.1127,
"step": 607
},
{
"epoch": 0.27661510464058237,
"grad_norm": 1.8859941959717001,
"learning_rate": 4.962459345698258e-06,
"loss": 0.1152,
"step": 608
},
{
"epoch": 0.2770700636942675,
"grad_norm": 1.9839838015935523,
"learning_rate": 4.962335863153537e-06,
"loss": 0.1198,
"step": 609
},
{
"epoch": 0.2775250227479527,
"grad_norm": 1.3671283570292578,
"learning_rate": 4.962212179397847e-06,
"loss": 0.0876,
"step": 610
},
{
"epoch": 0.27797998180163785,
"grad_norm": 1.4623540558631782,
"learning_rate": 4.962088294441299e-06,
"loss": 0.0754,
"step": 611
},
{
"epoch": 0.278434940855323,
"grad_norm": 2.3501285954750806,
"learning_rate": 4.9619642082940135e-06,
"loss": 0.1,
"step": 612
},
{
"epoch": 0.2788898999090082,
"grad_norm": 1.6593172768016098,
"learning_rate": 4.9618399209661305e-06,
"loss": 0.0918,
"step": 613
},
{
"epoch": 0.27934485896269334,
"grad_norm": 1.4913746956676242,
"learning_rate": 4.961715432467807e-06,
"loss": 0.0788,
"step": 614
},
{
"epoch": 0.27979981801637854,
"grad_norm": 1.3335438953393988,
"learning_rate": 4.961590742809216e-06,
"loss": 0.0743,
"step": 615
},
{
"epoch": 0.2802547770700637,
"grad_norm": 1.4631866469804606,
"learning_rate": 4.961465852000545e-06,
"loss": 0.0869,
"step": 616
},
{
"epoch": 0.2807097361237489,
"grad_norm": 1.8021656107937525,
"learning_rate": 4.961340760052001e-06,
"loss": 0.0906,
"step": 617
},
{
"epoch": 0.28116469517743403,
"grad_norm": 1.74213914067233,
"learning_rate": 4.961215466973806e-06,
"loss": 0.0926,
"step": 618
},
{
"epoch": 0.2816196542311192,
"grad_norm": 2.764803909834576,
"learning_rate": 4.961089972776197e-06,
"loss": 0.1823,
"step": 619
},
{
"epoch": 0.2820746132848044,
"grad_norm": 1.3665676735119967,
"learning_rate": 4.9609642774694285e-06,
"loss": 0.0734,
"step": 620
},
{
"epoch": 0.2825295723384895,
"grad_norm": 1.9426323562959267,
"learning_rate": 4.960838381063774e-06,
"loss": 0.0972,
"step": 621
},
{
"epoch": 0.2829845313921747,
"grad_norm": 2.3374254341147322,
"learning_rate": 4.960712283569521e-06,
"loss": 0.1411,
"step": 622
},
{
"epoch": 0.28343949044585987,
"grad_norm": 2.2747894788958543,
"learning_rate": 4.960585984996971e-06,
"loss": 0.1033,
"step": 623
},
{
"epoch": 0.28389444949954507,
"grad_norm": 1.7445142059152803,
"learning_rate": 4.960459485356447e-06,
"loss": 0.1222,
"step": 624
},
{
"epoch": 0.2843494085532302,
"grad_norm": 1.5220008831965313,
"learning_rate": 4.960332784658285e-06,
"loss": 0.1027,
"step": 625
},
{
"epoch": 0.28480436760691535,
"grad_norm": 2.1347326062219034,
"learning_rate": 4.960205882912839e-06,
"loss": 0.1237,
"step": 626
},
{
"epoch": 0.28525932666060055,
"grad_norm": 2.5984695620436002,
"learning_rate": 4.9600787801304785e-06,
"loss": 0.1871,
"step": 627
},
{
"epoch": 0.2857142857142857,
"grad_norm": 2.1207792848317375,
"learning_rate": 4.959951476321589e-06,
"loss": 0.1205,
"step": 628
},
{
"epoch": 0.2861692447679709,
"grad_norm": 1.1897630810057305,
"learning_rate": 4.959823971496575e-06,
"loss": 0.0773,
"step": 629
},
{
"epoch": 0.28662420382165604,
"grad_norm": 3.4920069239312976,
"learning_rate": 4.959696265665853e-06,
"loss": 0.1897,
"step": 630
},
{
"epoch": 0.28707916287534124,
"grad_norm": 1.425742783647833,
"learning_rate": 4.959568358839862e-06,
"loss": 0.0635,
"step": 631
},
{
"epoch": 0.2875341219290264,
"grad_norm": 1.330689822741385,
"learning_rate": 4.95944025102905e-06,
"loss": 0.0722,
"step": 632
},
{
"epoch": 0.28798908098271153,
"grad_norm": 1.99039564333339,
"learning_rate": 4.959311942243888e-06,
"loss": 0.1158,
"step": 633
},
{
"epoch": 0.28844404003639673,
"grad_norm": 1.593751969696495,
"learning_rate": 4.95918343249486e-06,
"loss": 0.0861,
"step": 634
},
{
"epoch": 0.2888989990900819,
"grad_norm": 1.8945402616067804,
"learning_rate": 4.959054721792469e-06,
"loss": 0.1171,
"step": 635
},
{
"epoch": 0.2893539581437671,
"grad_norm": 1.4569740573581391,
"learning_rate": 4.958925810147231e-06,
"loss": 0.0777,
"step": 636
},
{
"epoch": 0.2898089171974522,
"grad_norm": 1.7102068304451903,
"learning_rate": 4.958796697569679e-06,
"loss": 0.0872,
"step": 637
},
{
"epoch": 0.2902638762511374,
"grad_norm": 1.5378977203553044,
"learning_rate": 4.958667384070365e-06,
"loss": 0.0796,
"step": 638
},
{
"epoch": 0.29071883530482256,
"grad_norm": 1.9723232607058794,
"learning_rate": 4.958537869659855e-06,
"loss": 0.1204,
"step": 639
},
{
"epoch": 0.2911737943585077,
"grad_norm": 1.4856408560761394,
"learning_rate": 4.958408154348734e-06,
"loss": 0.0763,
"step": 640
},
{
"epoch": 0.2916287534121929,
"grad_norm": 1.7342797592944788,
"learning_rate": 4.9582782381476e-06,
"loss": 0.1104,
"step": 641
},
{
"epoch": 0.29208371246587805,
"grad_norm": 2.179383476129295,
"learning_rate": 4.958148121067071e-06,
"loss": 0.1694,
"step": 642
},
{
"epoch": 0.29253867151956325,
"grad_norm": 1.8609060135735762,
"learning_rate": 4.9580178031177775e-06,
"loss": 0.1303,
"step": 643
},
{
"epoch": 0.2929936305732484,
"grad_norm": 1.4742279064065518,
"learning_rate": 4.9578872843103694e-06,
"loss": 0.1001,
"step": 644
},
{
"epoch": 0.2934485896269336,
"grad_norm": 1.7670333338462736,
"learning_rate": 4.957756564655513e-06,
"loss": 0.1022,
"step": 645
},
{
"epoch": 0.29390354868061874,
"grad_norm": 1.6630538784639108,
"learning_rate": 4.957625644163888e-06,
"loss": 0.1055,
"step": 646
},
{
"epoch": 0.2943585077343039,
"grad_norm": 1.9118546637397547,
"learning_rate": 4.957494522846194e-06,
"loss": 0.1029,
"step": 647
},
{
"epoch": 0.2948134667879891,
"grad_norm": 1.7468783195584092,
"learning_rate": 4.957363200713146e-06,
"loss": 0.13,
"step": 648
},
{
"epoch": 0.29526842584167423,
"grad_norm": 1.4923304655802225,
"learning_rate": 4.957231677775475e-06,
"loss": 0.0846,
"step": 649
},
{
"epoch": 0.29572338489535943,
"grad_norm": 2.0864859163635407,
"learning_rate": 4.957099954043928e-06,
"loss": 0.1363,
"step": 650
},
{
"epoch": 0.2961783439490446,
"grad_norm": 1.467640729386297,
"learning_rate": 4.956968029529269e-06,
"loss": 0.113,
"step": 651
},
{
"epoch": 0.2966333030027298,
"grad_norm": 1.5940129351295147,
"learning_rate": 4.956835904242277e-06,
"loss": 0.1121,
"step": 652
},
{
"epoch": 0.2970882620564149,
"grad_norm": 1.305300483782713,
"learning_rate": 4.9567035781937516e-06,
"loss": 0.0569,
"step": 653
},
{
"epoch": 0.29754322111010006,
"grad_norm": 1.8626374769697236,
"learning_rate": 4.9565710513945024e-06,
"loss": 0.095,
"step": 654
},
{
"epoch": 0.29799818016378526,
"grad_norm": 1.9350135167075724,
"learning_rate": 4.956438323855362e-06,
"loss": 0.11,
"step": 655
},
{
"epoch": 0.2984531392174704,
"grad_norm": 1.7292500874953625,
"learning_rate": 4.956305395587174e-06,
"loss": 0.1259,
"step": 656
},
{
"epoch": 0.2989080982711556,
"grad_norm": 1.7021672274359103,
"learning_rate": 4.956172266600802e-06,
"loss": 0.0857,
"step": 657
},
{
"epoch": 0.29936305732484075,
"grad_norm": 1.2481942065304896,
"learning_rate": 4.956038936907125e-06,
"loss": 0.0776,
"step": 658
},
{
"epoch": 0.29981801637852595,
"grad_norm": 1.4091727470459356,
"learning_rate": 4.955905406517036e-06,
"loss": 0.0706,
"step": 659
},
{
"epoch": 0.3002729754322111,
"grad_norm": 1.8640524340898077,
"learning_rate": 4.95577167544145e-06,
"loss": 0.1176,
"step": 660
},
{
"epoch": 0.30072793448589624,
"grad_norm": 2.0619543797721698,
"learning_rate": 4.955637743691291e-06,
"loss": 0.1148,
"step": 661
},
{
"epoch": 0.30118289353958144,
"grad_norm": 1.9364848961200234,
"learning_rate": 4.955503611277506e-06,
"loss": 0.0964,
"step": 662
},
{
"epoch": 0.3016378525932666,
"grad_norm": 1.5509916734065172,
"learning_rate": 4.955369278211055e-06,
"loss": 0.0824,
"step": 663
},
{
"epoch": 0.3020928116469518,
"grad_norm": 1.8848317603882998,
"learning_rate": 4.955234744502914e-06,
"loss": 0.1,
"step": 664
},
{
"epoch": 0.30254777070063693,
"grad_norm": 1.7147002197137917,
"learning_rate": 4.955100010164079e-06,
"loss": 0.1042,
"step": 665
},
{
"epoch": 0.30300272975432213,
"grad_norm": 1.8287392204283686,
"learning_rate": 4.954965075205557e-06,
"loss": 0.0894,
"step": 666
},
{
"epoch": 0.3034576888080073,
"grad_norm": 3.2978505813072765,
"learning_rate": 4.9548299396383755e-06,
"loss": 0.1555,
"step": 667
},
{
"epoch": 0.3039126478616925,
"grad_norm": 1.733214316892207,
"learning_rate": 4.954694603473578e-06,
"loss": 0.0848,
"step": 668
},
{
"epoch": 0.3043676069153776,
"grad_norm": 2.1290440022616917,
"learning_rate": 4.954559066722222e-06,
"loss": 0.1329,
"step": 669
},
{
"epoch": 0.30482256596906276,
"grad_norm": 1.7482728884321743,
"learning_rate": 4.954423329395385e-06,
"loss": 0.1135,
"step": 670
},
{
"epoch": 0.30527752502274796,
"grad_norm": 1.8272762006745102,
"learning_rate": 4.954287391504156e-06,
"loss": 0.1233,
"step": 671
},
{
"epoch": 0.3057324840764331,
"grad_norm": 2.276356474817249,
"learning_rate": 4.9541512530596455e-06,
"loss": 0.1426,
"step": 672
},
{
"epoch": 0.3061874431301183,
"grad_norm": 1.5212465132609405,
"learning_rate": 4.954014914072978e-06,
"loss": 0.0908,
"step": 673
},
{
"epoch": 0.30664240218380345,
"grad_norm": 1.7081770141846233,
"learning_rate": 4.9538783745552934e-06,
"loss": 0.1069,
"step": 674
},
{
"epoch": 0.30709736123748865,
"grad_norm": 2.2065783569813755,
"learning_rate": 4.95374163451775e-06,
"loss": 0.1303,
"step": 675
},
{
"epoch": 0.3075523202911738,
"grad_norm": 1.9717809133208803,
"learning_rate": 4.953604693971521e-06,
"loss": 0.0969,
"step": 676
},
{
"epoch": 0.30800727934485894,
"grad_norm": 1.5094990032560427,
"learning_rate": 4.953467552927798e-06,
"loss": 0.059,
"step": 677
},
{
"epoch": 0.30846223839854414,
"grad_norm": 2.5084055121202726,
"learning_rate": 4.9533302113977845e-06,
"loss": 0.141,
"step": 678
},
{
"epoch": 0.3089171974522293,
"grad_norm": 2.1105100650062814,
"learning_rate": 4.9531926693927055e-06,
"loss": 0.1162,
"step": 679
},
{
"epoch": 0.3093721565059145,
"grad_norm": 1.9374617838160508,
"learning_rate": 4.953054926923801e-06,
"loss": 0.1119,
"step": 680
},
{
"epoch": 0.30982711555959963,
"grad_norm": 2.266159358282095,
"learning_rate": 4.952916984002325e-06,
"loss": 0.1188,
"step": 681
},
{
"epoch": 0.31028207461328483,
"grad_norm": 2.1490900129362243,
"learning_rate": 4.95277884063955e-06,
"loss": 0.1337,
"step": 682
},
{
"epoch": 0.31073703366697,
"grad_norm": 1.5330806658735066,
"learning_rate": 4.952640496846766e-06,
"loss": 0.109,
"step": 683
},
{
"epoch": 0.3111919927206551,
"grad_norm": 1.41231573264733,
"learning_rate": 4.952501952635276e-06,
"loss": 0.0837,
"step": 684
},
{
"epoch": 0.3116469517743403,
"grad_norm": 1.993511064296186,
"learning_rate": 4.952363208016402e-06,
"loss": 0.1272,
"step": 685
},
{
"epoch": 0.31210191082802546,
"grad_norm": 1.6098606771380728,
"learning_rate": 4.952224263001482e-06,
"loss": 0.0816,
"step": 686
},
{
"epoch": 0.31255686988171066,
"grad_norm": 1.2309412681015492,
"learning_rate": 4.952085117601868e-06,
"loss": 0.0692,
"step": 687
},
{
"epoch": 0.3130118289353958,
"grad_norm": 1.7997377974129165,
"learning_rate": 4.951945771828933e-06,
"loss": 0.1322,
"step": 688
},
{
"epoch": 0.313466787989081,
"grad_norm": 1.3223154067967124,
"learning_rate": 4.951806225694061e-06,
"loss": 0.0979,
"step": 689
},
{
"epoch": 0.31392174704276615,
"grad_norm": 1.9747397800251965,
"learning_rate": 4.951666479208658e-06,
"loss": 0.1184,
"step": 690
},
{
"epoch": 0.3143767060964513,
"grad_norm": 1.4466542632801185,
"learning_rate": 4.951526532384141e-06,
"loss": 0.085,
"step": 691
},
{
"epoch": 0.3148316651501365,
"grad_norm": 1.8649877852775587,
"learning_rate": 4.951386385231946e-06,
"loss": 0.1011,
"step": 692
},
{
"epoch": 0.31528662420382164,
"grad_norm": 1.2680670071467166,
"learning_rate": 4.951246037763528e-06,
"loss": 0.0748,
"step": 693
},
{
"epoch": 0.31574158325750684,
"grad_norm": 1.5151831279551418,
"learning_rate": 4.9511054899903524e-06,
"loss": 0.0874,
"step": 694
},
{
"epoch": 0.316196542311192,
"grad_norm": 1.6436638497099227,
"learning_rate": 4.950964741923905e-06,
"loss": 0.0982,
"step": 695
},
{
"epoch": 0.3166515013648772,
"grad_norm": 1.5379093700813176,
"learning_rate": 4.950823793575688e-06,
"loss": 0.0857,
"step": 696
},
{
"epoch": 0.31710646041856233,
"grad_norm": 2.4063943761092452,
"learning_rate": 4.950682644957218e-06,
"loss": 0.1253,
"step": 697
},
{
"epoch": 0.3175614194722475,
"grad_norm": 2.5063143673804844,
"learning_rate": 4.9505412960800295e-06,
"loss": 0.1511,
"step": 698
},
{
"epoch": 0.3180163785259327,
"grad_norm": 1.722833309256951,
"learning_rate": 4.950399746955673e-06,
"loss": 0.0999,
"step": 699
},
{
"epoch": 0.3184713375796178,
"grad_norm": 1.8190148406823232,
"learning_rate": 4.950257997595716e-06,
"loss": 0.0895,
"step": 700
},
{
"epoch": 0.318926296633303,
"grad_norm": 1.9186747250049239,
"learning_rate": 4.950116048011739e-06,
"loss": 0.0964,
"step": 701
},
{
"epoch": 0.31938125568698816,
"grad_norm": 1.372930302125184,
"learning_rate": 4.949973898215344e-06,
"loss": 0.0589,
"step": 702
},
{
"epoch": 0.31983621474067336,
"grad_norm": 1.9707430002902289,
"learning_rate": 4.949831548218146e-06,
"loss": 0.1054,
"step": 703
},
{
"epoch": 0.3202911737943585,
"grad_norm": 2.0845604349239832,
"learning_rate": 4.949688998031777e-06,
"loss": 0.1105,
"step": 704
},
{
"epoch": 0.32074613284804365,
"grad_norm": 1.4969274131429369,
"learning_rate": 4.949546247667886e-06,
"loss": 0.0814,
"step": 705
},
{
"epoch": 0.32120109190172885,
"grad_norm": 1.9940826155791407,
"learning_rate": 4.949403297138137e-06,
"loss": 0.1064,
"step": 706
},
{
"epoch": 0.321656050955414,
"grad_norm": 1.7246519891154302,
"learning_rate": 4.949260146454212e-06,
"loss": 0.1093,
"step": 707
},
{
"epoch": 0.3221110100090992,
"grad_norm": 1.6890948945842699,
"learning_rate": 4.94911679562781e-06,
"loss": 0.0888,
"step": 708
},
{
"epoch": 0.32256596906278434,
"grad_norm": 2.0455963687465837,
"learning_rate": 4.948973244670643e-06,
"loss": 0.1019,
"step": 709
},
{
"epoch": 0.32302092811646954,
"grad_norm": 1.7678121189421865,
"learning_rate": 4.948829493594441e-06,
"loss": 0.0961,
"step": 710
},
{
"epoch": 0.3234758871701547,
"grad_norm": 1.3731566726245188,
"learning_rate": 4.9486855424109524e-06,
"loss": 0.072,
"step": 711
},
{
"epoch": 0.32393084622383983,
"grad_norm": 1.4962983653581472,
"learning_rate": 4.948541391131939e-06,
"loss": 0.0905,
"step": 712
},
{
"epoch": 0.32438580527752503,
"grad_norm": 1.4198695601427125,
"learning_rate": 4.948397039769181e-06,
"loss": 0.0616,
"step": 713
},
{
"epoch": 0.3248407643312102,
"grad_norm": 1.131377673368795,
"learning_rate": 4.948252488334474e-06,
"loss": 0.0526,
"step": 714
},
{
"epoch": 0.3252957233848954,
"grad_norm": 1.1969683311404917,
"learning_rate": 4.948107736839629e-06,
"loss": 0.0763,
"step": 715
},
{
"epoch": 0.3257506824385805,
"grad_norm": 1.6793927846583725,
"learning_rate": 4.947962785296476e-06,
"loss": 0.1153,
"step": 716
},
{
"epoch": 0.3262056414922657,
"grad_norm": 2.070694963019659,
"learning_rate": 4.9478176337168594e-06,
"loss": 0.1153,
"step": 717
},
{
"epoch": 0.32666060054595086,
"grad_norm": 2.7729923804058516,
"learning_rate": 4.9476722821126386e-06,
"loss": 0.171,
"step": 718
},
{
"epoch": 0.327115559599636,
"grad_norm": 1.4442284620787837,
"learning_rate": 4.9475267304956945e-06,
"loss": 0.0997,
"step": 719
},
{
"epoch": 0.3275705186533212,
"grad_norm": 2.0979816044129413,
"learning_rate": 4.947380978877917e-06,
"loss": 0.1138,
"step": 720
},
{
"epoch": 0.32802547770700635,
"grad_norm": 1.9982881232916472,
"learning_rate": 4.947235027271219e-06,
"loss": 0.1402,
"step": 721
},
{
"epoch": 0.32848043676069155,
"grad_norm": 1.3317844805683108,
"learning_rate": 4.9470888756875265e-06,
"loss": 0.0707,
"step": 722
},
{
"epoch": 0.3289353958143767,
"grad_norm": 1.4665146144499257,
"learning_rate": 4.946942524138782e-06,
"loss": 0.075,
"step": 723
},
{
"epoch": 0.3293903548680619,
"grad_norm": 1.6321427811402383,
"learning_rate": 4.946795972636944e-06,
"loss": 0.0971,
"step": 724
},
{
"epoch": 0.32984531392174704,
"grad_norm": 1.9541110640157349,
"learning_rate": 4.94664922119399e-06,
"loss": 0.1347,
"step": 725
},
{
"epoch": 0.3303002729754322,
"grad_norm": 1.664760132709453,
"learning_rate": 4.94650226982191e-06,
"loss": 0.0959,
"step": 726
},
{
"epoch": 0.3307552320291174,
"grad_norm": 2.509161708357272,
"learning_rate": 4.9463551185327115e-06,
"loss": 0.1885,
"step": 727
},
{
"epoch": 0.33121019108280253,
"grad_norm": 1.7296886670922147,
"learning_rate": 4.946207767338422e-06,
"loss": 0.0867,
"step": 728
},
{
"epoch": 0.33166515013648773,
"grad_norm": 1.5254904811287948,
"learning_rate": 4.9460602162510805e-06,
"loss": 0.09,
"step": 729
},
{
"epoch": 0.3321201091901729,
"grad_norm": 1.3404896968358107,
"learning_rate": 4.945912465282744e-06,
"loss": 0.0782,
"step": 730
},
{
"epoch": 0.3325750682438581,
"grad_norm": 1.79952897501454,
"learning_rate": 4.945764514445487e-06,
"loss": 0.1444,
"step": 731
},
{
"epoch": 0.3330300272975432,
"grad_norm": 2.48899319031489,
"learning_rate": 4.9456163637513986e-06,
"loss": 0.1136,
"step": 732
},
{
"epoch": 0.33348498635122836,
"grad_norm": 1.8285171425829347,
"learning_rate": 4.945468013212585e-06,
"loss": 0.1052,
"step": 733
},
{
"epoch": 0.33393994540491356,
"grad_norm": 1.7843881981445446,
"learning_rate": 4.945319462841169e-06,
"loss": 0.1116,
"step": 734
},
{
"epoch": 0.3343949044585987,
"grad_norm": 2.181301353034186,
"learning_rate": 4.94517071264929e-06,
"loss": 0.1251,
"step": 735
},
{
"epoch": 0.3348498635122839,
"grad_norm": 1.2980326592722402,
"learning_rate": 4.945021762649102e-06,
"loss": 0.0648,
"step": 736
},
{
"epoch": 0.33530482256596905,
"grad_norm": 1.3874782347309536,
"learning_rate": 4.9448726128527776e-06,
"loss": 0.0978,
"step": 737
},
{
"epoch": 0.33575978161965425,
"grad_norm": 1.8955499231356112,
"learning_rate": 4.944723263272504e-06,
"loss": 0.0998,
"step": 738
},
{
"epoch": 0.3362147406733394,
"grad_norm": 1.6102418502733031,
"learning_rate": 4.944573713920485e-06,
"loss": 0.1055,
"step": 739
},
{
"epoch": 0.33666969972702454,
"grad_norm": 3.355056116777925,
"learning_rate": 4.944423964808943e-06,
"loss": 0.1831,
"step": 740
},
{
"epoch": 0.33712465878070974,
"grad_norm": 1.507329867530008,
"learning_rate": 4.944274015950113e-06,
"loss": 0.0889,
"step": 741
},
{
"epoch": 0.3375796178343949,
"grad_norm": 1.610548678904166,
"learning_rate": 4.944123867356249e-06,
"loss": 0.0752,
"step": 742
},
{
"epoch": 0.3380345768880801,
"grad_norm": 1.918715600058829,
"learning_rate": 4.943973519039619e-06,
"loss": 0.1335,
"step": 743
},
{
"epoch": 0.33848953594176523,
"grad_norm": 1.3921163271356483,
"learning_rate": 4.943822971012511e-06,
"loss": 0.0727,
"step": 744
},
{
"epoch": 0.33894449499545043,
"grad_norm": 1.2023922578586952,
"learning_rate": 4.943672223287226e-06,
"loss": 0.0628,
"step": 745
},
{
"epoch": 0.3393994540491356,
"grad_norm": 2.2794421985003317,
"learning_rate": 4.9435212758760815e-06,
"loss": 0.1404,
"step": 746
},
{
"epoch": 0.3398544131028208,
"grad_norm": 1.3986125533304865,
"learning_rate": 4.943370128791413e-06,
"loss": 0.0787,
"step": 747
},
{
"epoch": 0.3403093721565059,
"grad_norm": 1.5259961799310353,
"learning_rate": 4.943218782045574e-06,
"loss": 0.1079,
"step": 748
},
{
"epoch": 0.34076433121019106,
"grad_norm": 1.8181192019120165,
"learning_rate": 4.943067235650927e-06,
"loss": 0.1195,
"step": 749
},
{
"epoch": 0.34121929026387626,
"grad_norm": 1.831268771798402,
"learning_rate": 4.942915489619859e-06,
"loss": 0.1065,
"step": 750
},
{
"epoch": 0.3416742493175614,
"grad_norm": 1.7306841826817951,
"learning_rate": 4.9427635439647704e-06,
"loss": 0.1232,
"step": 751
},
{
"epoch": 0.3421292083712466,
"grad_norm": 1.7076927486745839,
"learning_rate": 4.942611398698075e-06,
"loss": 0.0912,
"step": 752
},
{
"epoch": 0.34258416742493175,
"grad_norm": 1.7425991433970283,
"learning_rate": 4.942459053832208e-06,
"loss": 0.0997,
"step": 753
},
{
"epoch": 0.34303912647861695,
"grad_norm": 1.809200639541382,
"learning_rate": 4.942306509379617e-06,
"loss": 0.1085,
"step": 754
},
{
"epoch": 0.3434940855323021,
"grad_norm": 1.293751880354007,
"learning_rate": 4.942153765352767e-06,
"loss": 0.0966,
"step": 755
},
{
"epoch": 0.34394904458598724,
"grad_norm": 1.2918089478267207,
"learning_rate": 4.94200082176414e-06,
"loss": 0.078,
"step": 756
},
{
"epoch": 0.34440400363967244,
"grad_norm": 1.5059276244213293,
"learning_rate": 4.941847678626234e-06,
"loss": 0.0805,
"step": 757
},
{
"epoch": 0.3448589626933576,
"grad_norm": 1.4851814064844335,
"learning_rate": 4.941694335951563e-06,
"loss": 0.0983,
"step": 758
},
{
"epoch": 0.3453139217470428,
"grad_norm": 1.8989617812022122,
"learning_rate": 4.9415407937526575e-06,
"loss": 0.1107,
"step": 759
},
{
"epoch": 0.34576888080072793,
"grad_norm": 1.8347292963195811,
"learning_rate": 4.9413870520420635e-06,
"loss": 0.1237,
"step": 760
},
{
"epoch": 0.34622383985441313,
"grad_norm": 1.5924498433598573,
"learning_rate": 4.941233110832346e-06,
"loss": 0.0735,
"step": 761
},
{
"epoch": 0.3466787989080983,
"grad_norm": 2.3326854621993984,
"learning_rate": 4.941078970136082e-06,
"loss": 0.1295,
"step": 762
},
{
"epoch": 0.3471337579617834,
"grad_norm": 1.7112828341096407,
"learning_rate": 4.940924629965869e-06,
"loss": 0.1162,
"step": 763
},
{
"epoch": 0.3475887170154686,
"grad_norm": 1.5436956280322631,
"learning_rate": 4.940770090334319e-06,
"loss": 0.0861,
"step": 764
},
{
"epoch": 0.34804367606915376,
"grad_norm": 1.6236751771508604,
"learning_rate": 4.940615351254059e-06,
"loss": 0.0968,
"step": 765
},
{
"epoch": 0.34849863512283896,
"grad_norm": 1.0400997330052792,
"learning_rate": 4.940460412737734e-06,
"loss": 0.0711,
"step": 766
},
{
"epoch": 0.3489535941765241,
"grad_norm": 1.623731539624473,
"learning_rate": 4.940305274798005e-06,
"loss": 0.0929,
"step": 767
},
{
"epoch": 0.3494085532302093,
"grad_norm": 1.3764287278870393,
"learning_rate": 4.940149937447549e-06,
"loss": 0.1002,
"step": 768
},
{
"epoch": 0.34986351228389445,
"grad_norm": 1.1571526873015439,
"learning_rate": 4.939994400699061e-06,
"loss": 0.0659,
"step": 769
},
{
"epoch": 0.3503184713375796,
"grad_norm": 1.3670356182264325,
"learning_rate": 4.939838664565248e-06,
"loss": 0.0991,
"step": 770
},
{
"epoch": 0.3507734303912648,
"grad_norm": 1.2532975621868427,
"learning_rate": 4.939682729058839e-06,
"loss": 0.0713,
"step": 771
},
{
"epoch": 0.35122838944494994,
"grad_norm": 1.3003896066972325,
"learning_rate": 4.939526594192574e-06,
"loss": 0.0784,
"step": 772
},
{
"epoch": 0.35168334849863514,
"grad_norm": 1.4253255736587618,
"learning_rate": 4.939370259979213e-06,
"loss": 0.0826,
"step": 773
},
{
"epoch": 0.3521383075523203,
"grad_norm": 2.0399381310170766,
"learning_rate": 4.9392137264315295e-06,
"loss": 0.1293,
"step": 774
},
{
"epoch": 0.3525932666060055,
"grad_norm": 1.938165172266556,
"learning_rate": 4.939056993562316e-06,
"loss": 0.1407,
"step": 775
},
{
"epoch": 0.35304822565969063,
"grad_norm": 1.5665447950299711,
"learning_rate": 4.9389000613843805e-06,
"loss": 0.0942,
"step": 776
},
{
"epoch": 0.3535031847133758,
"grad_norm": 1.6514430942693614,
"learning_rate": 4.938742929910546e-06,
"loss": 0.0927,
"step": 777
},
{
"epoch": 0.353958143767061,
"grad_norm": 1.0136329941515525,
"learning_rate": 4.938585599153652e-06,
"loss": 0.0676,
"step": 778
},
{
"epoch": 0.3544131028207461,
"grad_norm": 1.6808166258098367,
"learning_rate": 4.938428069126555e-06,
"loss": 0.1029,
"step": 779
},
{
"epoch": 0.3548680618744313,
"grad_norm": 1.6649052760273926,
"learning_rate": 4.9382703398421285e-06,
"loss": 0.0952,
"step": 780
},
{
"epoch": 0.35532302092811646,
"grad_norm": 1.734423574608651,
"learning_rate": 4.938112411313261e-06,
"loss": 0.1098,
"step": 781
},
{
"epoch": 0.35577797998180166,
"grad_norm": 1.5154424391674823,
"learning_rate": 4.937954283552858e-06,
"loss": 0.0808,
"step": 782
},
{
"epoch": 0.3562329390354868,
"grad_norm": 1.6988796126790968,
"learning_rate": 4.93779595657384e-06,
"loss": 0.1066,
"step": 783
},
{
"epoch": 0.35668789808917195,
"grad_norm": 2.050921985283142,
"learning_rate": 4.937637430389145e-06,
"loss": 0.1184,
"step": 784
},
{
"epoch": 0.35714285714285715,
"grad_norm": 1.5678672253769157,
"learning_rate": 4.937478705011729e-06,
"loss": 0.0709,
"step": 785
},
{
"epoch": 0.3575978161965423,
"grad_norm": 1.5215473079480804,
"learning_rate": 4.937319780454559e-06,
"loss": 0.1086,
"step": 786
},
{
"epoch": 0.3580527752502275,
"grad_norm": 1.4009067409412712,
"learning_rate": 4.937160656730625e-06,
"loss": 0.1004,
"step": 787
},
{
"epoch": 0.35850773430391264,
"grad_norm": 1.538795370618956,
"learning_rate": 4.9370013338529274e-06,
"loss": 0.0897,
"step": 788
},
{
"epoch": 0.35896269335759784,
"grad_norm": 1.3446100123630027,
"learning_rate": 4.936841811834486e-06,
"loss": 0.0907,
"step": 789
},
{
"epoch": 0.359417652411283,
"grad_norm": 1.9381081676057568,
"learning_rate": 4.936682090688337e-06,
"loss": 0.1534,
"step": 790
},
{
"epoch": 0.35987261146496813,
"grad_norm": 1.787589837431021,
"learning_rate": 4.936522170427531e-06,
"loss": 0.0919,
"step": 791
},
{
"epoch": 0.36032757051865333,
"grad_norm": 1.7189621906826116,
"learning_rate": 4.936362051065136e-06,
"loss": 0.0799,
"step": 792
},
{
"epoch": 0.3607825295723385,
"grad_norm": 1.615638183805568,
"learning_rate": 4.936201732614238e-06,
"loss": 0.0898,
"step": 793
},
{
"epoch": 0.3612374886260237,
"grad_norm": 1.899483445293266,
"learning_rate": 4.9360412150879355e-06,
"loss": 0.1086,
"step": 794
},
{
"epoch": 0.3616924476797088,
"grad_norm": 1.8831302635176637,
"learning_rate": 4.935880498499346e-06,
"loss": 0.0951,
"step": 795
},
{
"epoch": 0.362147406733394,
"grad_norm": 2.0172166216160594,
"learning_rate": 4.935719582861604e-06,
"loss": 0.0983,
"step": 796
},
{
"epoch": 0.36260236578707916,
"grad_norm": 1.7713001106130557,
"learning_rate": 4.935558468187855e-06,
"loss": 0.1177,
"step": 797
},
{
"epoch": 0.3630573248407643,
"grad_norm": 2.049007453668216,
"learning_rate": 4.935397154491268e-06,
"loss": 0.1349,
"step": 798
},
{
"epoch": 0.3635122838944495,
"grad_norm": 2.02340700279538,
"learning_rate": 4.935235641785023e-06,
"loss": 0.1419,
"step": 799
},
{
"epoch": 0.36396724294813465,
"grad_norm": 1.5504094804690502,
"learning_rate": 4.935073930082319e-06,
"loss": 0.1141,
"step": 800
},
{
"epoch": 0.36442220200181985,
"grad_norm": 1.3892292745868653,
"learning_rate": 4.93491201939637e-06,
"loss": 0.0859,
"step": 801
},
{
"epoch": 0.364877161055505,
"grad_norm": 1.636711407623354,
"learning_rate": 4.934749909740408e-06,
"loss": 0.1168,
"step": 802
},
{
"epoch": 0.3653321201091902,
"grad_norm": 1.5867549476191922,
"learning_rate": 4.934587601127677e-06,
"loss": 0.0941,
"step": 803
},
{
"epoch": 0.36578707916287534,
"grad_norm": 1.5019646850922737,
"learning_rate": 4.934425093571442e-06,
"loss": 0.0931,
"step": 804
},
{
"epoch": 0.3662420382165605,
"grad_norm": 1.5412581659446851,
"learning_rate": 4.934262387084984e-06,
"loss": 0.0931,
"step": 805
},
{
"epoch": 0.3666969972702457,
"grad_norm": 1.3579602631174856,
"learning_rate": 4.934099481681595e-06,
"loss": 0.0745,
"step": 806
},
{
"epoch": 0.36715195632393083,
"grad_norm": 1.800459979497766,
"learning_rate": 4.933936377374589e-06,
"loss": 0.1072,
"step": 807
},
{
"epoch": 0.36760691537761603,
"grad_norm": 1.1946995764469395,
"learning_rate": 4.933773074177293e-06,
"loss": 0.0848,
"step": 808
},
{
"epoch": 0.3680618744313012,
"grad_norm": 1.6651644751131276,
"learning_rate": 4.933609572103053e-06,
"loss": 0.0965,
"step": 809
},
{
"epoch": 0.3685168334849864,
"grad_norm": 1.913995880200427,
"learning_rate": 4.933445871165229e-06,
"loss": 0.1315,
"step": 810
},
{
"epoch": 0.3689717925386715,
"grad_norm": 1.5517430124798408,
"learning_rate": 4.933281971377197e-06,
"loss": 0.0856,
"step": 811
},
{
"epoch": 0.36942675159235666,
"grad_norm": 1.474632001508129,
"learning_rate": 4.933117872752352e-06,
"loss": 0.0989,
"step": 812
},
{
"epoch": 0.36988171064604186,
"grad_norm": 1.8862093944877263,
"learning_rate": 4.932953575304102e-06,
"loss": 0.1087,
"step": 813
},
{
"epoch": 0.370336669699727,
"grad_norm": 1.6830668966166524,
"learning_rate": 4.932789079045873e-06,
"loss": 0.1213,
"step": 814
},
{
"epoch": 0.3707916287534122,
"grad_norm": 1.7198476556190763,
"learning_rate": 4.932624383991106e-06,
"loss": 0.1215,
"step": 815
},
{
"epoch": 0.37124658780709735,
"grad_norm": 2.109229814604393,
"learning_rate": 4.9324594901532605e-06,
"loss": 0.1337,
"step": 816
},
{
"epoch": 0.37170154686078255,
"grad_norm": 1.4154701665481155,
"learning_rate": 4.93229439754581e-06,
"loss": 0.0944,
"step": 817
},
{
"epoch": 0.3721565059144677,
"grad_norm": 1.973608289061544,
"learning_rate": 4.932129106182246e-06,
"loss": 0.0901,
"step": 818
},
{
"epoch": 0.37261146496815284,
"grad_norm": 1.651833939526615,
"learning_rate": 4.931963616076075e-06,
"loss": 0.0876,
"step": 819
},
{
"epoch": 0.37306642402183804,
"grad_norm": 1.3876140677966586,
"learning_rate": 4.93179792724082e-06,
"loss": 0.0791,
"step": 820
},
{
"epoch": 0.3735213830755232,
"grad_norm": 1.4201117298181156,
"learning_rate": 4.9316320396900195e-06,
"loss": 0.0857,
"step": 821
},
{
"epoch": 0.3739763421292084,
"grad_norm": 2.158894018361071,
"learning_rate": 4.9314659534372305e-06,
"loss": 0.1499,
"step": 822
},
{
"epoch": 0.37443130118289353,
"grad_norm": 1.2722019893377066,
"learning_rate": 4.931299668496024e-06,
"loss": 0.0626,
"step": 823
},
{
"epoch": 0.37488626023657873,
"grad_norm": 1.5889108253283166,
"learning_rate": 4.931133184879988e-06,
"loss": 0.1003,
"step": 824
},
{
"epoch": 0.37534121929026387,
"grad_norm": 1.133918642525753,
"learning_rate": 4.930966502602727e-06,
"loss": 0.0714,
"step": 825
},
{
"epoch": 0.37579617834394907,
"grad_norm": 2.1296168633446615,
"learning_rate": 4.930799621677862e-06,
"loss": 0.1276,
"step": 826
},
{
"epoch": 0.3762511373976342,
"grad_norm": 2.018575113751553,
"learning_rate": 4.93063254211903e-06,
"loss": 0.134,
"step": 827
},
{
"epoch": 0.37670609645131936,
"grad_norm": 1.2247931548507431,
"learning_rate": 4.930465263939882e-06,
"loss": 0.0617,
"step": 828
},
{
"epoch": 0.37716105550500456,
"grad_norm": 2.032637719937323,
"learning_rate": 4.9302977871540894e-06,
"loss": 0.1191,
"step": 829
},
{
"epoch": 0.3776160145586897,
"grad_norm": 1.8922514826155596,
"learning_rate": 4.930130111775336e-06,
"loss": 0.1136,
"step": 830
},
{
"epoch": 0.3780709736123749,
"grad_norm": 1.2345527477299194,
"learning_rate": 4.9299622378173245e-06,
"loss": 0.0613,
"step": 831
},
{
"epoch": 0.37852593266606005,
"grad_norm": 2.2369584057058693,
"learning_rate": 4.929794165293773e-06,
"loss": 0.1384,
"step": 832
},
{
"epoch": 0.37898089171974525,
"grad_norm": 1.2980952577352378,
"learning_rate": 4.9296258942184145e-06,
"loss": 0.0889,
"step": 833
},
{
"epoch": 0.3794358507734304,
"grad_norm": 2.116237658876168,
"learning_rate": 4.929457424605e-06,
"loss": 0.1156,
"step": 834
},
{
"epoch": 0.37989080982711554,
"grad_norm": 1.820103679143319,
"learning_rate": 4.929288756467296e-06,
"loss": 0.1224,
"step": 835
},
{
"epoch": 0.38034576888080074,
"grad_norm": 1.6658306682266317,
"learning_rate": 4.929119889819086e-06,
"loss": 0.0871,
"step": 836
},
{
"epoch": 0.3808007279344859,
"grad_norm": 2.7831412779318128,
"learning_rate": 4.928950824674169e-06,
"loss": 0.1447,
"step": 837
},
{
"epoch": 0.3812556869881711,
"grad_norm": 1.460745158832598,
"learning_rate": 4.928781561046359e-06,
"loss": 0.0902,
"step": 838
},
{
"epoch": 0.3817106460418562,
"grad_norm": 1.544649379546627,
"learning_rate": 4.928612098949488e-06,
"loss": 0.0995,
"step": 839
},
{
"epoch": 0.3821656050955414,
"grad_norm": 1.583411250445995,
"learning_rate": 4.9284424383974026e-06,
"loss": 0.1007,
"step": 840
},
{
"epoch": 0.38262056414922657,
"grad_norm": 1.2960669635575661,
"learning_rate": 4.928272579403969e-06,
"loss": 0.0679,
"step": 841
},
{
"epoch": 0.3830755232029117,
"grad_norm": 1.4865280371498417,
"learning_rate": 4.928102521983067e-06,
"loss": 0.1208,
"step": 842
},
{
"epoch": 0.3835304822565969,
"grad_norm": 2.1345090660254145,
"learning_rate": 4.9279322661485906e-06,
"loss": 0.1489,
"step": 843
},
{
"epoch": 0.38398544131028206,
"grad_norm": 1.705469805887344,
"learning_rate": 4.927761811914455e-06,
"loss": 0.1084,
"step": 844
},
{
"epoch": 0.38444040036396726,
"grad_norm": 1.358954041720105,
"learning_rate": 4.927591159294587e-06,
"loss": 0.0827,
"step": 845
},
{
"epoch": 0.3848953594176524,
"grad_norm": 1.8335314647218843,
"learning_rate": 4.927420308302933e-06,
"loss": 0.102,
"step": 846
},
{
"epoch": 0.3853503184713376,
"grad_norm": 1.710141204765745,
"learning_rate": 4.927249258953454e-06,
"loss": 0.1091,
"step": 847
},
{
"epoch": 0.38580527752502275,
"grad_norm": 1.7784989569871608,
"learning_rate": 4.927078011260126e-06,
"loss": 0.1094,
"step": 848
},
{
"epoch": 0.3862602365787079,
"grad_norm": 1.9072996593932403,
"learning_rate": 4.926906565236943e-06,
"loss": 0.1255,
"step": 849
},
{
"epoch": 0.3867151956323931,
"grad_norm": 1.7435526255624214,
"learning_rate": 4.926734920897916e-06,
"loss": 0.1076,
"step": 850
},
{
"epoch": 0.38717015468607824,
"grad_norm": 1.3254342460194672,
"learning_rate": 4.926563078257071e-06,
"loss": 0.099,
"step": 851
},
{
"epoch": 0.38762511373976344,
"grad_norm": 1.0985508710385608,
"learning_rate": 4.926391037328448e-06,
"loss": 0.0848,
"step": 852
},
{
"epoch": 0.3880800727934486,
"grad_norm": 1.6344858491886853,
"learning_rate": 4.926218798126108e-06,
"loss": 0.1102,
"step": 853
},
{
"epoch": 0.3885350318471338,
"grad_norm": 1.694464350768917,
"learning_rate": 4.926046360664124e-06,
"loss": 0.0868,
"step": 854
},
{
"epoch": 0.3889899909008189,
"grad_norm": 1.865189060623283,
"learning_rate": 4.925873724956588e-06,
"loss": 0.1152,
"step": 855
},
{
"epoch": 0.38944494995450407,
"grad_norm": 1.794490671041637,
"learning_rate": 4.9257008910176065e-06,
"loss": 0.1443,
"step": 856
},
{
"epoch": 0.38989990900818927,
"grad_norm": 1.6294296423553156,
"learning_rate": 4.925527858861302e-06,
"loss": 0.092,
"step": 857
},
{
"epoch": 0.3903548680618744,
"grad_norm": 1.7424555145921712,
"learning_rate": 4.925354628501814e-06,
"loss": 0.1002,
"step": 858
},
{
"epoch": 0.3908098271155596,
"grad_norm": 2.309513172607415,
"learning_rate": 4.925181199953299e-06,
"loss": 0.1288,
"step": 859
},
{
"epoch": 0.39126478616924476,
"grad_norm": 1.3668641274774587,
"learning_rate": 4.9250075732299285e-06,
"loss": 0.0903,
"step": 860
},
{
"epoch": 0.39171974522292996,
"grad_norm": 1.7785057619158235,
"learning_rate": 4.92483374834589e-06,
"loss": 0.1181,
"step": 861
},
{
"epoch": 0.3921747042766151,
"grad_norm": 1.5234971151354315,
"learning_rate": 4.9246597253153884e-06,
"loss": 0.0935,
"step": 862
},
{
"epoch": 0.39262966333030025,
"grad_norm": 1.1791645313929775,
"learning_rate": 4.924485504152644e-06,
"loss": 0.0822,
"step": 863
},
{
"epoch": 0.39308462238398545,
"grad_norm": 1.5983057485508323,
"learning_rate": 4.924311084871892e-06,
"loss": 0.0966,
"step": 864
},
{
"epoch": 0.3935395814376706,
"grad_norm": 1.6634965227764558,
"learning_rate": 4.924136467487387e-06,
"loss": 0.0759,
"step": 865
},
{
"epoch": 0.3939945404913558,
"grad_norm": 1.5231170961334706,
"learning_rate": 4.923961652013397e-06,
"loss": 0.0881,
"step": 866
},
{
"epoch": 0.39444949954504094,
"grad_norm": 1.4495990250164725,
"learning_rate": 4.923786638464207e-06,
"loss": 0.0941,
"step": 867
},
{
"epoch": 0.39490445859872614,
"grad_norm": 1.3390712595063252,
"learning_rate": 4.9236114268541196e-06,
"loss": 0.0846,
"step": 868
},
{
"epoch": 0.3953594176524113,
"grad_norm": 1.627122973701433,
"learning_rate": 4.923436017197451e-06,
"loss": 0.0819,
"step": 869
},
{
"epoch": 0.3958143767060964,
"grad_norm": 1.3377642278691055,
"learning_rate": 4.923260409508535e-06,
"loss": 0.088,
"step": 870
},
{
"epoch": 0.3962693357597816,
"grad_norm": 1.9694748985572026,
"learning_rate": 4.9230846038017214e-06,
"loss": 0.151,
"step": 871
},
{
"epoch": 0.39672429481346677,
"grad_norm": 1.4923965061921258,
"learning_rate": 4.922908600091378e-06,
"loss": 0.0795,
"step": 872
},
{
"epoch": 0.39717925386715197,
"grad_norm": 1.8057120373297069,
"learning_rate": 4.9227323983918835e-06,
"loss": 0.1439,
"step": 873
},
{
"epoch": 0.3976342129208371,
"grad_norm": 1.226146313826682,
"learning_rate": 4.922555998717639e-06,
"loss": 0.0845,
"step": 874
},
{
"epoch": 0.3980891719745223,
"grad_norm": 1.4188073442884932,
"learning_rate": 4.922379401083058e-06,
"loss": 0.0723,
"step": 875
},
{
"epoch": 0.39854413102820746,
"grad_norm": 1.6044422866063657,
"learning_rate": 4.922202605502573e-06,
"loss": 0.0981,
"step": 876
},
{
"epoch": 0.3989990900818926,
"grad_norm": 1.645096377490142,
"learning_rate": 4.922025611990629e-06,
"loss": 0.0882,
"step": 877
},
{
"epoch": 0.3994540491355778,
"grad_norm": 1.4988618969542298,
"learning_rate": 4.92184842056169e-06,
"loss": 0.0914,
"step": 878
},
{
"epoch": 0.39990900818926295,
"grad_norm": 1.4716766649704647,
"learning_rate": 4.921671031230235e-06,
"loss": 0.0843,
"step": 879
},
{
"epoch": 0.40036396724294815,
"grad_norm": 1.8151437273817552,
"learning_rate": 4.921493444010759e-06,
"loss": 0.1115,
"step": 880
},
{
"epoch": 0.4008189262966333,
"grad_norm": 1.3841092562389385,
"learning_rate": 4.921315658917774e-06,
"loss": 0.0821,
"step": 881
},
{
"epoch": 0.4012738853503185,
"grad_norm": 1.5281014710080694,
"learning_rate": 4.921137675965809e-06,
"loss": 0.0894,
"step": 882
},
{
"epoch": 0.40172884440400364,
"grad_norm": 1.1860457913745353,
"learning_rate": 4.920959495169406e-06,
"loss": 0.0819,
"step": 883
},
{
"epoch": 0.4021838034576888,
"grad_norm": 1.9670434695091386,
"learning_rate": 4.920781116543126e-06,
"loss": 0.1198,
"step": 884
},
{
"epoch": 0.402638762511374,
"grad_norm": 1.4837005110977715,
"learning_rate": 4.920602540101546e-06,
"loss": 0.0871,
"step": 885
},
{
"epoch": 0.4030937215650591,
"grad_norm": 1.8269163623820734,
"learning_rate": 4.920423765859257e-06,
"loss": 0.0956,
"step": 886
},
{
"epoch": 0.4035486806187443,
"grad_norm": 1.6998774179110374,
"learning_rate": 4.920244793830869e-06,
"loss": 0.0973,
"step": 887
},
{
"epoch": 0.40400363967242947,
"grad_norm": 1.6596471546846747,
"learning_rate": 4.920065624031006e-06,
"loss": 0.1085,
"step": 888
},
{
"epoch": 0.40445859872611467,
"grad_norm": 1.4077908132773769,
"learning_rate": 4.919886256474309e-06,
"loss": 0.0904,
"step": 889
},
{
"epoch": 0.4049135577797998,
"grad_norm": 1.7022215596121757,
"learning_rate": 4.919706691175435e-06,
"loss": 0.091,
"step": 890
},
{
"epoch": 0.40536851683348496,
"grad_norm": 2.1232813584307455,
"learning_rate": 4.919526928149058e-06,
"loss": 0.1366,
"step": 891
},
{
"epoch": 0.40582347588717016,
"grad_norm": 1.6341211456957871,
"learning_rate": 4.919346967409867e-06,
"loss": 0.1108,
"step": 892
},
{
"epoch": 0.4062784349408553,
"grad_norm": 1.5324489468460818,
"learning_rate": 4.919166808972567e-06,
"loss": 0.1228,
"step": 893
},
{
"epoch": 0.4067333939945405,
"grad_norm": 2.099437608372934,
"learning_rate": 4.918986452851881e-06,
"loss": 0.1245,
"step": 894
},
{
"epoch": 0.40718835304822565,
"grad_norm": 1.3588941988828955,
"learning_rate": 4.918805899062545e-06,
"loss": 0.0621,
"step": 895
},
{
"epoch": 0.40764331210191085,
"grad_norm": 0.8277266375645331,
"learning_rate": 4.9186251476193146e-06,
"loss": 0.0499,
"step": 896
},
{
"epoch": 0.408098271155596,
"grad_norm": 1.7852175335240448,
"learning_rate": 4.918444198536959e-06,
"loss": 0.1206,
"step": 897
},
{
"epoch": 0.40855323020928114,
"grad_norm": 1.5382745011065326,
"learning_rate": 4.918263051830267e-06,
"loss": 0.1081,
"step": 898
},
{
"epoch": 0.40900818926296634,
"grad_norm": 1.621296590196374,
"learning_rate": 4.918081707514037e-06,
"loss": 0.0881,
"step": 899
},
{
"epoch": 0.4094631483166515,
"grad_norm": 2.178092466242458,
"learning_rate": 4.917900165603091e-06,
"loss": 0.1364,
"step": 900
},
{
"epoch": 0.4099181073703367,
"grad_norm": 1.5880350908655525,
"learning_rate": 4.9177184261122624e-06,
"loss": 0.1073,
"step": 901
},
{
"epoch": 0.4103730664240218,
"grad_norm": 1.8483741427612825,
"learning_rate": 4.917536489056402e-06,
"loss": 0.0972,
"step": 902
},
{
"epoch": 0.410828025477707,
"grad_norm": 1.5893537500919641,
"learning_rate": 4.9173543544503775e-06,
"loss": 0.0851,
"step": 903
},
{
"epoch": 0.41128298453139217,
"grad_norm": 1.144493331243443,
"learning_rate": 4.917172022309072e-06,
"loss": 0.0637,
"step": 904
},
{
"epoch": 0.41173794358507737,
"grad_norm": 1.139422632834299,
"learning_rate": 4.916989492647385e-06,
"loss": 0.065,
"step": 905
},
{
"epoch": 0.4121929026387625,
"grad_norm": 1.2858602055549935,
"learning_rate": 4.916806765480231e-06,
"loss": 0.079,
"step": 906
},
{
"epoch": 0.41264786169244766,
"grad_norm": 1.9716514818564959,
"learning_rate": 4.9166238408225416e-06,
"loss": 0.161,
"step": 907
},
{
"epoch": 0.41310282074613286,
"grad_norm": 1.6206512831659239,
"learning_rate": 4.916440718689267e-06,
"loss": 0.0958,
"step": 908
},
{
"epoch": 0.413557779799818,
"grad_norm": 1.2472167749456646,
"learning_rate": 4.916257399095369e-06,
"loss": 0.0705,
"step": 909
},
{
"epoch": 0.4140127388535032,
"grad_norm": 1.1891048303298737,
"learning_rate": 4.916073882055827e-06,
"loss": 0.0671,
"step": 910
},
{
"epoch": 0.41446769790718835,
"grad_norm": 1.9533245506572903,
"learning_rate": 4.91589016758564e-06,
"loss": 0.1203,
"step": 911
},
{
"epoch": 0.41492265696087355,
"grad_norm": 1.7223916244259532,
"learning_rate": 4.915706255699817e-06,
"loss": 0.1171,
"step": 912
},
{
"epoch": 0.4153776160145587,
"grad_norm": 2.042050502050582,
"learning_rate": 4.915522146413389e-06,
"loss": 0.152,
"step": 913
},
{
"epoch": 0.41583257506824384,
"grad_norm": 1.5213892799482642,
"learning_rate": 4.9153378397413985e-06,
"loss": 0.1011,
"step": 914
},
{
"epoch": 0.41628753412192904,
"grad_norm": 1.8893914267841023,
"learning_rate": 4.915153335698908e-06,
"loss": 0.1133,
"step": 915
},
{
"epoch": 0.4167424931756142,
"grad_norm": 1.7882796521112458,
"learning_rate": 4.914968634300994e-06,
"loss": 0.1081,
"step": 916
},
{
"epoch": 0.4171974522292994,
"grad_norm": 1.186974851727905,
"learning_rate": 4.914783735562748e-06,
"loss": 0.0791,
"step": 917
},
{
"epoch": 0.4176524112829845,
"grad_norm": 1.3276822787818023,
"learning_rate": 4.914598639499281e-06,
"loss": 0.0929,
"step": 918
},
{
"epoch": 0.4181073703366697,
"grad_norm": 1.3143453344689244,
"learning_rate": 4.914413346125717e-06,
"loss": 0.0907,
"step": 919
},
{
"epoch": 0.41856232939035487,
"grad_norm": 1.2706441279848544,
"learning_rate": 4.914227855457199e-06,
"loss": 0.0797,
"step": 920
},
{
"epoch": 0.41901728844404,
"grad_norm": 1.8437493208675002,
"learning_rate": 4.914042167508881e-06,
"loss": 0.0851,
"step": 921
},
{
"epoch": 0.4194722474977252,
"grad_norm": 1.4975873837594447,
"learning_rate": 4.9138562822959416e-06,
"loss": 0.0735,
"step": 922
},
{
"epoch": 0.41992720655141036,
"grad_norm": 1.8590378932388973,
"learning_rate": 4.913670199833566e-06,
"loss": 0.0955,
"step": 923
},
{
"epoch": 0.42038216560509556,
"grad_norm": 1.6110342357827778,
"learning_rate": 4.913483920136961e-06,
"loss": 0.0904,
"step": 924
},
{
"epoch": 0.4208371246587807,
"grad_norm": 1.761284240310015,
"learning_rate": 4.91329744322135e-06,
"loss": 0.0967,
"step": 925
},
{
"epoch": 0.4212920837124659,
"grad_norm": 1.3709410104557458,
"learning_rate": 4.913110769101971e-06,
"loss": 0.0872,
"step": 926
},
{
"epoch": 0.42174704276615105,
"grad_norm": 1.6539854986144262,
"learning_rate": 4.912923897794077e-06,
"loss": 0.0982,
"step": 927
},
{
"epoch": 0.4222020018198362,
"grad_norm": 1.6465498130671066,
"learning_rate": 4.912736829312938e-06,
"loss": 0.1093,
"step": 928
},
{
"epoch": 0.4226569608735214,
"grad_norm": 1.8873864205133448,
"learning_rate": 4.912549563673842e-06,
"loss": 0.1239,
"step": 929
},
{
"epoch": 0.42311191992720654,
"grad_norm": 1.5496708014603886,
"learning_rate": 4.912362100892091e-06,
"loss": 0.1273,
"step": 930
},
{
"epoch": 0.42356687898089174,
"grad_norm": 1.1519662533075623,
"learning_rate": 4.912174440983002e-06,
"loss": 0.0729,
"step": 931
},
{
"epoch": 0.4240218380345769,
"grad_norm": 1.6674274772885138,
"learning_rate": 4.911986583961912e-06,
"loss": 0.1107,
"step": 932
},
{
"epoch": 0.4244767970882621,
"grad_norm": 1.8943327104641587,
"learning_rate": 4.91179852984417e-06,
"loss": 0.0989,
"step": 933
},
{
"epoch": 0.4249317561419472,
"grad_norm": 1.3387420389544245,
"learning_rate": 4.911610278645144e-06,
"loss": 0.0873,
"step": 934
},
{
"epoch": 0.42538671519563237,
"grad_norm": 1.3086866571732974,
"learning_rate": 4.911421830380217e-06,
"loss": 0.0767,
"step": 935
},
{
"epoch": 0.42584167424931757,
"grad_norm": 2.04544186641041,
"learning_rate": 4.911233185064788e-06,
"loss": 0.1285,
"step": 936
},
{
"epoch": 0.4262966333030027,
"grad_norm": 1.6906012723967403,
"learning_rate": 4.911044342714272e-06,
"loss": 0.0997,
"step": 937
},
{
"epoch": 0.4267515923566879,
"grad_norm": 1.439162135385858,
"learning_rate": 4.9108553033440995e-06,
"loss": 0.0744,
"step": 938
},
{
"epoch": 0.42720655141037306,
"grad_norm": 1.2593154408057343,
"learning_rate": 4.91066606696972e-06,
"loss": 0.074,
"step": 939
},
{
"epoch": 0.42766151046405826,
"grad_norm": 1.7514521824191083,
"learning_rate": 4.910476633606597e-06,
"loss": 0.0971,
"step": 940
},
{
"epoch": 0.4281164695177434,
"grad_norm": 1.5625231909908295,
"learning_rate": 4.9102870032702075e-06,
"loss": 0.0689,
"step": 941
},
{
"epoch": 0.42857142857142855,
"grad_norm": 1.5194579023544843,
"learning_rate": 4.910097175976049e-06,
"loss": 0.0824,
"step": 942
},
{
"epoch": 0.42902638762511375,
"grad_norm": 1.4223453649486908,
"learning_rate": 4.909907151739634e-06,
"loss": 0.0747,
"step": 943
},
{
"epoch": 0.4294813466787989,
"grad_norm": 2.2121264200483393,
"learning_rate": 4.909716930576489e-06,
"loss": 0.1463,
"step": 944
},
{
"epoch": 0.4299363057324841,
"grad_norm": 1.5012792406542972,
"learning_rate": 4.909526512502158e-06,
"loss": 0.1241,
"step": 945
},
{
"epoch": 0.43039126478616924,
"grad_norm": 1.6714102508168673,
"learning_rate": 4.9093358975322025e-06,
"loss": 0.1045,
"step": 946
},
{
"epoch": 0.43084622383985444,
"grad_norm": 1.5613346147429912,
"learning_rate": 4.909145085682198e-06,
"loss": 0.1105,
"step": 947
},
{
"epoch": 0.4313011828935396,
"grad_norm": 1.4864622392832871,
"learning_rate": 4.908954076967737e-06,
"loss": 0.0831,
"step": 948
},
{
"epoch": 0.4317561419472247,
"grad_norm": 1.5530391149425158,
"learning_rate": 4.908762871404427e-06,
"loss": 0.1345,
"step": 949
},
{
"epoch": 0.4322111010009099,
"grad_norm": 1.5444429676980205,
"learning_rate": 4.908571469007893e-06,
"loss": 0.0886,
"step": 950
},
{
"epoch": 0.43266606005459507,
"grad_norm": 1.8034818342216412,
"learning_rate": 4.908379869793776e-06,
"loss": 0.1046,
"step": 951
},
{
"epoch": 0.43312101910828027,
"grad_norm": 1.3153452614362922,
"learning_rate": 4.908188073777732e-06,
"loss": 0.0715,
"step": 952
},
{
"epoch": 0.4335759781619654,
"grad_norm": 2.0825682650521857,
"learning_rate": 4.9079960809754334e-06,
"loss": 0.135,
"step": 953
},
{
"epoch": 0.4340309372156506,
"grad_norm": 1.3431541090651076,
"learning_rate": 4.90780389140257e-06,
"loss": 0.0812,
"step": 954
},
{
"epoch": 0.43448589626933576,
"grad_norm": 2.018134282960315,
"learning_rate": 4.907611505074846e-06,
"loss": 0.1001,
"step": 955
},
{
"epoch": 0.4349408553230209,
"grad_norm": 1.8270847906398506,
"learning_rate": 4.907418922007983e-06,
"loss": 0.1054,
"step": 956
},
{
"epoch": 0.4353958143767061,
"grad_norm": 1.5502670619333374,
"learning_rate": 4.907226142217717e-06,
"loss": 0.0832,
"step": 957
},
{
"epoch": 0.43585077343039125,
"grad_norm": 1.5099564094926066,
"learning_rate": 4.9070331657198015e-06,
"loss": 0.093,
"step": 958
},
{
"epoch": 0.43630573248407645,
"grad_norm": 1.6580816557213998,
"learning_rate": 4.906839992530006e-06,
"loss": 0.1133,
"step": 959
},
{
"epoch": 0.4367606915377616,
"grad_norm": 1.9468112171012433,
"learning_rate": 4.906646622664115e-06,
"loss": 0.1122,
"step": 960
},
{
"epoch": 0.4372156505914468,
"grad_norm": 1.3246750710377195,
"learning_rate": 4.906453056137931e-06,
"loss": 0.0572,
"step": 961
},
{
"epoch": 0.43767060964513194,
"grad_norm": 2.1577598041780846,
"learning_rate": 4.90625929296727e-06,
"loss": 0.1419,
"step": 962
},
{
"epoch": 0.4381255686988171,
"grad_norm": 1.3649728107391488,
"learning_rate": 4.9060653331679665e-06,
"loss": 0.1026,
"step": 963
},
{
"epoch": 0.4385805277525023,
"grad_norm": 1.7954750394301047,
"learning_rate": 4.90587117675587e-06,
"loss": 0.124,
"step": 964
},
{
"epoch": 0.4390354868061874,
"grad_norm": 1.6192897762023186,
"learning_rate": 4.905676823746846e-06,
"loss": 0.102,
"step": 965
},
{
"epoch": 0.4394904458598726,
"grad_norm": 1.183156466195084,
"learning_rate": 4.9054822741567745e-06,
"loss": 0.0741,
"step": 966
},
{
"epoch": 0.43994540491355777,
"grad_norm": 1.791057313794206,
"learning_rate": 4.905287528001555e-06,
"loss": 0.0986,
"step": 967
},
{
"epoch": 0.44040036396724297,
"grad_norm": 1.5587372758795195,
"learning_rate": 4.905092585297102e-06,
"loss": 0.0959,
"step": 968
},
{
"epoch": 0.4408553230209281,
"grad_norm": 1.9086814389692623,
"learning_rate": 4.904897446059344e-06,
"loss": 0.1124,
"step": 969
},
{
"epoch": 0.44131028207461326,
"grad_norm": 1.5518685718016205,
"learning_rate": 4.9047021103042255e-06,
"loss": 0.0802,
"step": 970
},
{
"epoch": 0.44176524112829846,
"grad_norm": 1.5626634869227398,
"learning_rate": 4.904506578047712e-06,
"loss": 0.0966,
"step": 971
},
{
"epoch": 0.4422202001819836,
"grad_norm": 1.6777151282946248,
"learning_rate": 4.9043108493057785e-06,
"loss": 0.0946,
"step": 972
},
{
"epoch": 0.4426751592356688,
"grad_norm": 1.3918546303467518,
"learning_rate": 4.904114924094421e-06,
"loss": 0.0776,
"step": 973
},
{
"epoch": 0.44313011828935395,
"grad_norm": 1.7054781101293177,
"learning_rate": 4.903918802429648e-06,
"loss": 0.1076,
"step": 974
},
{
"epoch": 0.44358507734303915,
"grad_norm": 0.9435161970580179,
"learning_rate": 4.9037224843274875e-06,
"loss": 0.055,
"step": 975
},
{
"epoch": 0.4440400363967243,
"grad_norm": 1.8279732096534727,
"learning_rate": 4.903525969803979e-06,
"loss": 0.144,
"step": 976
},
{
"epoch": 0.44449499545040944,
"grad_norm": 1.5827975534285916,
"learning_rate": 4.903329258875184e-06,
"loss": 0.0876,
"step": 977
},
{
"epoch": 0.44494995450409464,
"grad_norm": 1.5817514212508765,
"learning_rate": 4.903132351557175e-06,
"loss": 0.1003,
"step": 978
},
{
"epoch": 0.4454049135577798,
"grad_norm": 1.55794858043461,
"learning_rate": 4.902935247866043e-06,
"loss": 0.0901,
"step": 979
},
{
"epoch": 0.445859872611465,
"grad_norm": 1.7648097170403771,
"learning_rate": 4.9027379478178935e-06,
"loss": 0.1117,
"step": 980
},
{
"epoch": 0.4463148316651501,
"grad_norm": 1.4493752053158233,
"learning_rate": 4.90254045142885e-06,
"loss": 0.0824,
"step": 981
},
{
"epoch": 0.4467697907188353,
"grad_norm": 1.4618354488172722,
"learning_rate": 4.90234275871505e-06,
"loss": 0.08,
"step": 982
},
{
"epoch": 0.44722474977252047,
"grad_norm": 2.314057245131694,
"learning_rate": 4.9021448696926486e-06,
"loss": 0.1437,
"step": 983
},
{
"epoch": 0.44767970882620567,
"grad_norm": 1.2365214796695643,
"learning_rate": 4.901946784377816e-06,
"loss": 0.0955,
"step": 984
},
{
"epoch": 0.4481346678798908,
"grad_norm": 1.2633152164234291,
"learning_rate": 4.90174850278674e-06,
"loss": 0.0803,
"step": 985
},
{
"epoch": 0.44858962693357596,
"grad_norm": 1.5083171008818446,
"learning_rate": 4.901550024935623e-06,
"loss": 0.0942,
"step": 986
},
{
"epoch": 0.44904458598726116,
"grad_norm": 1.1583463791947812,
"learning_rate": 4.901351350840683e-06,
"loss": 0.0786,
"step": 987
},
{
"epoch": 0.4494995450409463,
"grad_norm": 1.343367085202188,
"learning_rate": 4.901152480518155e-06,
"loss": 0.0724,
"step": 988
},
{
"epoch": 0.4499545040946315,
"grad_norm": 1.1159650914918346,
"learning_rate": 4.900953413984289e-06,
"loss": 0.0681,
"step": 989
},
{
"epoch": 0.45040946314831665,
"grad_norm": 2.0950998044271025,
"learning_rate": 4.900754151255353e-06,
"loss": 0.1541,
"step": 990
},
{
"epoch": 0.45086442220200185,
"grad_norm": 1.4260341278646986,
"learning_rate": 4.9005546923476305e-06,
"loss": 0.0707,
"step": 991
},
{
"epoch": 0.451319381255687,
"grad_norm": 1.6502415030386688,
"learning_rate": 4.9003550372774185e-06,
"loss": 0.1111,
"step": 992
},
{
"epoch": 0.45177434030937214,
"grad_norm": 1.280806174818392,
"learning_rate": 4.900155186061033e-06,
"loss": 0.0789,
"step": 993
},
{
"epoch": 0.45222929936305734,
"grad_norm": 1.9745186799391785,
"learning_rate": 4.8999551387148045e-06,
"loss": 0.1125,
"step": 994
},
{
"epoch": 0.4526842584167425,
"grad_norm": 1.2542781615680096,
"learning_rate": 4.89975489525508e-06,
"loss": 0.0814,
"step": 995
},
{
"epoch": 0.4531392174704277,
"grad_norm": 1.5218729573521388,
"learning_rate": 4.899554455698223e-06,
"loss": 0.0849,
"step": 996
},
{
"epoch": 0.4535941765241128,
"grad_norm": 1.4911465655176248,
"learning_rate": 4.899353820060612e-06,
"loss": 0.0887,
"step": 997
},
{
"epoch": 0.454049135577798,
"grad_norm": 1.8552177664529743,
"learning_rate": 4.899152988358643e-06,
"loss": 0.1153,
"step": 998
},
{
"epoch": 0.45450409463148317,
"grad_norm": 1.3462289694693903,
"learning_rate": 4.898951960608725e-06,
"loss": 0.0768,
"step": 999
},
{
"epoch": 0.4549590536851683,
"grad_norm": 1.5105165626051191,
"learning_rate": 4.8987507368272865e-06,
"loss": 0.0916,
"step": 1000
},
{
"epoch": 0.4554140127388535,
"grad_norm": 1.7874012401425645,
"learning_rate": 4.898549317030772e-06,
"loss": 0.1228,
"step": 1001
},
{
"epoch": 0.45586897179253866,
"grad_norm": 1.8678564128703685,
"learning_rate": 4.898347701235637e-06,
"loss": 0.1226,
"step": 1002
},
{
"epoch": 0.45632393084622386,
"grad_norm": 1.9367180322034927,
"learning_rate": 4.89814588945836e-06,
"loss": 0.1239,
"step": 1003
},
{
"epoch": 0.456778889899909,
"grad_norm": 1.8462049373063074,
"learning_rate": 4.89794388171543e-06,
"loss": 0.1106,
"step": 1004
},
{
"epoch": 0.4572338489535942,
"grad_norm": 1.7977459529642075,
"learning_rate": 4.897741678023356e-06,
"loss": 0.1137,
"step": 1005
},
{
"epoch": 0.45768880800727935,
"grad_norm": 1.4317415496884898,
"learning_rate": 4.897539278398659e-06,
"loss": 0.0835,
"step": 1006
},
{
"epoch": 0.4581437670609645,
"grad_norm": 1.947224769167489,
"learning_rate": 4.8973366828578804e-06,
"loss": 0.1087,
"step": 1007
},
{
"epoch": 0.4585987261146497,
"grad_norm": 1.6840082807319827,
"learning_rate": 4.897133891417574e-06,
"loss": 0.1004,
"step": 1008
},
{
"epoch": 0.45905368516833484,
"grad_norm": 1.6722996299672828,
"learning_rate": 4.896930904094311e-06,
"loss": 0.0869,
"step": 1009
},
{
"epoch": 0.45950864422202004,
"grad_norm": 2.2431321251776986,
"learning_rate": 4.896727720904679e-06,
"loss": 0.121,
"step": 1010
},
{
"epoch": 0.4599636032757052,
"grad_norm": 1.2761704386307018,
"learning_rate": 4.896524341865282e-06,
"loss": 0.0736,
"step": 1011
},
{
"epoch": 0.4604185623293904,
"grad_norm": 1.6413390038739506,
"learning_rate": 4.896320766992737e-06,
"loss": 0.1286,
"step": 1012
},
{
"epoch": 0.4608735213830755,
"grad_norm": 1.5251335582402008,
"learning_rate": 4.896116996303682e-06,
"loss": 0.0989,
"step": 1013
},
{
"epoch": 0.46132848043676067,
"grad_norm": 1.8038369878473837,
"learning_rate": 4.895913029814766e-06,
"loss": 0.097,
"step": 1014
},
{
"epoch": 0.46178343949044587,
"grad_norm": 2.012861641550116,
"learning_rate": 4.895708867542658e-06,
"loss": 0.1111,
"step": 1015
},
{
"epoch": 0.462238398544131,
"grad_norm": 1.7366035889417508,
"learning_rate": 4.895504509504039e-06,
"loss": 0.1029,
"step": 1016
},
{
"epoch": 0.4626933575978162,
"grad_norm": 1.3763665767496873,
"learning_rate": 4.89529995571561e-06,
"loss": 0.0938,
"step": 1017
},
{
"epoch": 0.46314831665150136,
"grad_norm": 1.6906151679744952,
"learning_rate": 4.895095206194086e-06,
"loss": 0.1085,
"step": 1018
},
{
"epoch": 0.46360327570518656,
"grad_norm": 1.5053749521419235,
"learning_rate": 4.894890260956198e-06,
"loss": 0.0884,
"step": 1019
},
{
"epoch": 0.4640582347588717,
"grad_norm": 1.5334372638839222,
"learning_rate": 4.8946851200186925e-06,
"loss": 0.1015,
"step": 1020
},
{
"epoch": 0.46451319381255685,
"grad_norm": 1.576638091265577,
"learning_rate": 4.894479783398334e-06,
"loss": 0.0903,
"step": 1021
},
{
"epoch": 0.46496815286624205,
"grad_norm": 1.7368682352331435,
"learning_rate": 4.8942742511119004e-06,
"loss": 0.1029,
"step": 1022
},
{
"epoch": 0.4654231119199272,
"grad_norm": 3.9669130222003455,
"learning_rate": 4.894068523176187e-06,
"loss": 0.2383,
"step": 1023
},
{
"epoch": 0.4658780709736124,
"grad_norm": 1.5974114766744798,
"learning_rate": 4.8938625996080056e-06,
"loss": 0.1116,
"step": 1024
},
{
"epoch": 0.46633303002729753,
"grad_norm": 1.1252846797063132,
"learning_rate": 4.893656480424184e-06,
"loss": 0.0673,
"step": 1025
},
{
"epoch": 0.46678798908098273,
"grad_norm": 1.5329254322284862,
"learning_rate": 4.893450165641564e-06,
"loss": 0.1066,
"step": 1026
},
{
"epoch": 0.4672429481346679,
"grad_norm": 1.3116647286111784,
"learning_rate": 4.893243655277005e-06,
"loss": 0.086,
"step": 1027
},
{
"epoch": 0.467697907188353,
"grad_norm": 1.5621452726926597,
"learning_rate": 4.893036949347383e-06,
"loss": 0.0937,
"step": 1028
},
{
"epoch": 0.4681528662420382,
"grad_norm": 1.44299341979305,
"learning_rate": 4.892830047869588e-06,
"loss": 0.0922,
"step": 1029
},
{
"epoch": 0.46860782529572337,
"grad_norm": 1.2004173985623205,
"learning_rate": 4.892622950860527e-06,
"loss": 0.0545,
"step": 1030
},
{
"epoch": 0.46906278434940857,
"grad_norm": 1.2933675353670258,
"learning_rate": 4.892415658337123e-06,
"loss": 0.0938,
"step": 1031
},
{
"epoch": 0.4695177434030937,
"grad_norm": 1.3899639516557423,
"learning_rate": 4.892208170316317e-06,
"loss": 0.0807,
"step": 1032
},
{
"epoch": 0.4699727024567789,
"grad_norm": 1.2103198454795117,
"learning_rate": 4.892000486815062e-06,
"loss": 0.0724,
"step": 1033
},
{
"epoch": 0.47042766151046406,
"grad_norm": 1.4625912187815495,
"learning_rate": 4.891792607850328e-06,
"loss": 0.0944,
"step": 1034
},
{
"epoch": 0.4708826205641492,
"grad_norm": 2.3778377956475074,
"learning_rate": 4.891584533439104e-06,
"loss": 0.1301,
"step": 1035
},
{
"epoch": 0.4713375796178344,
"grad_norm": 1.6240877825800288,
"learning_rate": 4.891376263598393e-06,
"loss": 0.1056,
"step": 1036
},
{
"epoch": 0.47179253867151955,
"grad_norm": 1.377205820937822,
"learning_rate": 4.891167798345213e-06,
"loss": 0.0879,
"step": 1037
},
{
"epoch": 0.47224749772520475,
"grad_norm": 1.918358313853146,
"learning_rate": 4.890959137696598e-06,
"loss": 0.1218,
"step": 1038
},
{
"epoch": 0.4727024567788899,
"grad_norm": 1.9802948601827106,
"learning_rate": 4.890750281669601e-06,
"loss": 0.0966,
"step": 1039
},
{
"epoch": 0.4731574158325751,
"grad_norm": 1.209426799273833,
"learning_rate": 4.890541230281287e-06,
"loss": 0.0687,
"step": 1040
},
{
"epoch": 0.47361237488626023,
"grad_norm": 1.714672711362897,
"learning_rate": 4.8903319835487385e-06,
"loss": 0.1119,
"step": 1041
},
{
"epoch": 0.4740673339399454,
"grad_norm": 1.8426958086935912,
"learning_rate": 4.890122541489056e-06,
"loss": 0.1071,
"step": 1042
},
{
"epoch": 0.4745222929936306,
"grad_norm": 1.5412332450392434,
"learning_rate": 4.889912904119353e-06,
"loss": 0.1194,
"step": 1043
},
{
"epoch": 0.4749772520473157,
"grad_norm": 1.5900743055736573,
"learning_rate": 4.88970307145676e-06,
"loss": 0.0905,
"step": 1044
},
{
"epoch": 0.4754322111010009,
"grad_norm": 1.299438309320783,
"learning_rate": 4.889493043518423e-06,
"loss": 0.0782,
"step": 1045
},
{
"epoch": 0.47588717015468607,
"grad_norm": 1.2775434133946648,
"learning_rate": 4.889282820321506e-06,
"loss": 0.067,
"step": 1046
},
{
"epoch": 0.47634212920837127,
"grad_norm": 2.0181187729173313,
"learning_rate": 4.889072401883187e-06,
"loss": 0.1039,
"step": 1047
},
{
"epoch": 0.4767970882620564,
"grad_norm": 1.3673144633984753,
"learning_rate": 4.88886178822066e-06,
"loss": 0.0871,
"step": 1048
},
{
"epoch": 0.47725204731574156,
"grad_norm": 1.5512598399498212,
"learning_rate": 4.888650979351136e-06,
"loss": 0.0936,
"step": 1049
},
{
"epoch": 0.47770700636942676,
"grad_norm": 1.8862924775266208,
"learning_rate": 4.888439975291841e-06,
"loss": 0.149,
"step": 1050
},
{
"epoch": 0.4781619654231119,
"grad_norm": 1.527860807788029,
"learning_rate": 4.888228776060017e-06,
"loss": 0.0981,
"step": 1051
},
{
"epoch": 0.4786169244767971,
"grad_norm": 1.635801739367282,
"learning_rate": 4.888017381672923e-06,
"loss": 0.1004,
"step": 1052
},
{
"epoch": 0.47907188353048225,
"grad_norm": 1.496869794404093,
"learning_rate": 4.887805792147832e-06,
"loss": 0.0921,
"step": 1053
},
{
"epoch": 0.47952684258416745,
"grad_norm": 1.729233289880027,
"learning_rate": 4.887594007502036e-06,
"loss": 0.089,
"step": 1054
},
{
"epoch": 0.4799818016378526,
"grad_norm": 1.9599768924005974,
"learning_rate": 4.887382027752838e-06,
"loss": 0.1029,
"step": 1055
},
{
"epoch": 0.48043676069153773,
"grad_norm": 1.6584360062505734,
"learning_rate": 4.8871698529175636e-06,
"loss": 0.1173,
"step": 1056
},
{
"epoch": 0.48089171974522293,
"grad_norm": 1.631421092772313,
"learning_rate": 4.886957483013549e-06,
"loss": 0.1231,
"step": 1057
},
{
"epoch": 0.4813466787989081,
"grad_norm": 2.3766899063373996,
"learning_rate": 4.886744918058149e-06,
"loss": 0.13,
"step": 1058
},
{
"epoch": 0.4818016378525933,
"grad_norm": 1.7346716794855597,
"learning_rate": 4.886532158068732e-06,
"loss": 0.0938,
"step": 1059
},
{
"epoch": 0.4822565969062784,
"grad_norm": 1.5214305907929453,
"learning_rate": 4.886319203062683e-06,
"loss": 0.0761,
"step": 1060
},
{
"epoch": 0.4827115559599636,
"grad_norm": 1.6073102647133055,
"learning_rate": 4.886106053057408e-06,
"loss": 0.0818,
"step": 1061
},
{
"epoch": 0.48316651501364877,
"grad_norm": 1.803380712114119,
"learning_rate": 4.88589270807032e-06,
"loss": 0.1231,
"step": 1062
},
{
"epoch": 0.48362147406733397,
"grad_norm": 1.5275199982317587,
"learning_rate": 4.885679168118855e-06,
"loss": 0.1105,
"step": 1063
},
{
"epoch": 0.4840764331210191,
"grad_norm": 1.8472965185652206,
"learning_rate": 4.8854654332204635e-06,
"loss": 0.1324,
"step": 1064
},
{
"epoch": 0.48453139217470426,
"grad_norm": 1.41701925154465,
"learning_rate": 4.885251503392607e-06,
"loss": 0.0767,
"step": 1065
},
{
"epoch": 0.48498635122838946,
"grad_norm": 2.00437974621472,
"learning_rate": 4.885037378652771e-06,
"loss": 0.1336,
"step": 1066
},
{
"epoch": 0.4854413102820746,
"grad_norm": 1.4895968911800157,
"learning_rate": 4.884823059018451e-06,
"loss": 0.0726,
"step": 1067
},
{
"epoch": 0.4858962693357598,
"grad_norm": 1.5673178312119351,
"learning_rate": 4.88460854450716e-06,
"loss": 0.0843,
"step": 1068
},
{
"epoch": 0.48635122838944495,
"grad_norm": 1.1450505304026162,
"learning_rate": 4.884393835136427e-06,
"loss": 0.073,
"step": 1069
},
{
"epoch": 0.48680618744313015,
"grad_norm": 1.5223195045028948,
"learning_rate": 4.884178930923799e-06,
"loss": 0.0823,
"step": 1070
},
{
"epoch": 0.4872611464968153,
"grad_norm": 1.912651615279676,
"learning_rate": 4.883963831886834e-06,
"loss": 0.0989,
"step": 1071
},
{
"epoch": 0.48771610555050043,
"grad_norm": 1.6904540179044927,
"learning_rate": 4.8837485380431115e-06,
"loss": 0.0981,
"step": 1072
},
{
"epoch": 0.48817106460418563,
"grad_norm": 1.4559744514600277,
"learning_rate": 4.883533049410223e-06,
"loss": 0.0874,
"step": 1073
},
{
"epoch": 0.4886260236578708,
"grad_norm": 1.9041018278788933,
"learning_rate": 4.8833173660057785e-06,
"loss": 0.1065,
"step": 1074
},
{
"epoch": 0.489080982711556,
"grad_norm": 1.582657768337463,
"learning_rate": 4.8831014878474004e-06,
"loss": 0.0993,
"step": 1075
},
{
"epoch": 0.4895359417652411,
"grad_norm": 1.487895945323618,
"learning_rate": 4.882885414952732e-06,
"loss": 0.0887,
"step": 1076
},
{
"epoch": 0.4899909008189263,
"grad_norm": 1.1105199391014717,
"learning_rate": 4.882669147339428e-06,
"loss": 0.0521,
"step": 1077
},
{
"epoch": 0.49044585987261147,
"grad_norm": 1.3448385373486804,
"learning_rate": 4.882452685025161e-06,
"loss": 0.0606,
"step": 1078
},
{
"epoch": 0.4909008189262966,
"grad_norm": 1.9169790386878416,
"learning_rate": 4.88223602802762e-06,
"loss": 0.1103,
"step": 1079
},
{
"epoch": 0.4913557779799818,
"grad_norm": 1.4350936971881065,
"learning_rate": 4.882019176364509e-06,
"loss": 0.1052,
"step": 1080
},
{
"epoch": 0.49181073703366696,
"grad_norm": 1.9005260167330429,
"learning_rate": 4.881802130053548e-06,
"loss": 0.1217,
"step": 1081
},
{
"epoch": 0.49226569608735216,
"grad_norm": 1.4814940279383466,
"learning_rate": 4.881584889112473e-06,
"loss": 0.079,
"step": 1082
},
{
"epoch": 0.4927206551410373,
"grad_norm": 1.7134074599855604,
"learning_rate": 4.881367453559036e-06,
"loss": 0.1025,
"step": 1083
},
{
"epoch": 0.4931756141947225,
"grad_norm": 1.2847311247280295,
"learning_rate": 4.881149823411005e-06,
"loss": 0.0587,
"step": 1084
},
{
"epoch": 0.49363057324840764,
"grad_norm": 1.196984822353409,
"learning_rate": 4.880931998686162e-06,
"loss": 0.0779,
"step": 1085
},
{
"epoch": 0.4940855323020928,
"grad_norm": 2.247552936990941,
"learning_rate": 4.880713979402311e-06,
"loss": 0.1534,
"step": 1086
},
{
"epoch": 0.494540491355778,
"grad_norm": 2.5523444538687645,
"learning_rate": 4.880495765577263e-06,
"loss": 0.146,
"step": 1087
},
{
"epoch": 0.49499545040946313,
"grad_norm": 1.7690099480339412,
"learning_rate": 4.880277357228852e-06,
"loss": 0.084,
"step": 1088
},
{
"epoch": 0.49545040946314833,
"grad_norm": 1.2117156565437108,
"learning_rate": 4.880058754374923e-06,
"loss": 0.0833,
"step": 1089
},
{
"epoch": 0.4959053685168335,
"grad_norm": 1.5484757487864966,
"learning_rate": 4.879839957033343e-06,
"loss": 0.0938,
"step": 1090
},
{
"epoch": 0.4963603275705187,
"grad_norm": 1.5534223234923523,
"learning_rate": 4.879620965221987e-06,
"loss": 0.09,
"step": 1091
},
{
"epoch": 0.4968152866242038,
"grad_norm": 1.3405465803260945,
"learning_rate": 4.879401778958755e-06,
"loss": 0.0784,
"step": 1092
},
{
"epoch": 0.49727024567788897,
"grad_norm": 1.3343510524547628,
"learning_rate": 4.8791823982615525e-06,
"loss": 0.064,
"step": 1093
},
{
"epoch": 0.49772520473157417,
"grad_norm": 1.2315640234775116,
"learning_rate": 4.878962823148308e-06,
"loss": 0.067,
"step": 1094
},
{
"epoch": 0.4981801637852593,
"grad_norm": 1.654273388728327,
"learning_rate": 4.878743053636968e-06,
"loss": 0.0964,
"step": 1095
},
{
"epoch": 0.4986351228389445,
"grad_norm": 1.3344367681027707,
"learning_rate": 4.878523089745485e-06,
"loss": 0.0865,
"step": 1096
},
{
"epoch": 0.49909008189262966,
"grad_norm": 1.0737534169537484,
"learning_rate": 4.878302931491837e-06,
"loss": 0.0722,
"step": 1097
},
{
"epoch": 0.49954504094631486,
"grad_norm": 1.2217058614506033,
"learning_rate": 4.8780825788940145e-06,
"loss": 0.0531,
"step": 1098
},
{
"epoch": 0.5,
"grad_norm": 1.765512273684173,
"learning_rate": 4.877862031970023e-06,
"loss": 0.1016,
"step": 1099
},
{
"epoch": 0.5004549590536852,
"grad_norm": 2.1360497116346444,
"learning_rate": 4.8776412907378845e-06,
"loss": 0.1095,
"step": 1100
},
{
"epoch": 0.5009099181073703,
"grad_norm": 1.5928570797543171,
"learning_rate": 4.877420355215637e-06,
"loss": 0.0909,
"step": 1101
},
{
"epoch": 0.5013648771610555,
"grad_norm": 1.9221830556747463,
"learning_rate": 4.877199225421334e-06,
"loss": 0.123,
"step": 1102
},
{
"epoch": 0.5018198362147407,
"grad_norm": 1.967973587212139,
"learning_rate": 4.8769779013730454e-06,
"loss": 0.1535,
"step": 1103
},
{
"epoch": 0.5022747952684259,
"grad_norm": 2.02512821365078,
"learning_rate": 4.876756383088858e-06,
"loss": 0.1173,
"step": 1104
},
{
"epoch": 0.502729754322111,
"grad_norm": 1.3904167109659709,
"learning_rate": 4.876534670586872e-06,
"loss": 0.0839,
"step": 1105
},
{
"epoch": 0.5031847133757962,
"grad_norm": 1.4435165077122623,
"learning_rate": 4.8763127638852045e-06,
"loss": 0.0924,
"step": 1106
},
{
"epoch": 0.5036396724294814,
"grad_norm": 1.7029448773247835,
"learning_rate": 4.87609066300199e-06,
"loss": 0.1076,
"step": 1107
},
{
"epoch": 0.5040946314831665,
"grad_norm": 1.750067106251082,
"learning_rate": 4.875868367955376e-06,
"loss": 0.1077,
"step": 1108
},
{
"epoch": 0.5045495905368517,
"grad_norm": 1.9748651822243342,
"learning_rate": 4.87564587876353e-06,
"loss": 0.1294,
"step": 1109
},
{
"epoch": 0.5050045495905369,
"grad_norm": 1.7656971074259822,
"learning_rate": 4.87542319544463e-06,
"loss": 0.0974,
"step": 1110
},
{
"epoch": 0.5054595086442221,
"grad_norm": 1.4817675230155858,
"learning_rate": 4.875200318016873e-06,
"loss": 0.0959,
"step": 1111
},
{
"epoch": 0.5059144676979072,
"grad_norm": 1.603234528593141,
"learning_rate": 4.8749772464984736e-06,
"loss": 0.115,
"step": 1112
},
{
"epoch": 0.5063694267515924,
"grad_norm": 1.7632465098077008,
"learning_rate": 4.874753980907658e-06,
"loss": 0.1224,
"step": 1113
},
{
"epoch": 0.5068243858052776,
"grad_norm": 1.409315497870279,
"learning_rate": 4.8745305212626714e-06,
"loss": 0.0886,
"step": 1114
},
{
"epoch": 0.5072793448589626,
"grad_norm": 1.3116197456740595,
"learning_rate": 4.874306867581775e-06,
"loss": 0.0853,
"step": 1115
},
{
"epoch": 0.5077343039126478,
"grad_norm": 1.1746077003548339,
"learning_rate": 4.874083019883242e-06,
"loss": 0.0543,
"step": 1116
},
{
"epoch": 0.508189262966333,
"grad_norm": 1.941012957682845,
"learning_rate": 4.873858978185367e-06,
"loss": 0.1137,
"step": 1117
},
{
"epoch": 0.5086442220200182,
"grad_norm": 2.32531280724128,
"learning_rate": 4.8736347425064565e-06,
"loss": 0.1627,
"step": 1118
},
{
"epoch": 0.5090991810737033,
"grad_norm": 1.638539845007192,
"learning_rate": 4.873410312864833e-06,
"loss": 0.0988,
"step": 1119
},
{
"epoch": 0.5095541401273885,
"grad_norm": 1.5695637896435937,
"learning_rate": 4.8731856892788384e-06,
"loss": 0.0918,
"step": 1120
},
{
"epoch": 0.5100090991810737,
"grad_norm": 2.011157500272583,
"learning_rate": 4.872960871766826e-06,
"loss": 0.1316,
"step": 1121
},
{
"epoch": 0.5104640582347588,
"grad_norm": 1.3312452781498474,
"learning_rate": 4.8727358603471675e-06,
"loss": 0.1007,
"step": 1122
},
{
"epoch": 0.510919017288444,
"grad_norm": 1.9359844901160286,
"learning_rate": 4.872510655038249e-06,
"loss": 0.1026,
"step": 1123
},
{
"epoch": 0.5113739763421292,
"grad_norm": 1.2898375591874278,
"learning_rate": 4.872285255858476e-06,
"loss": 0.0929,
"step": 1124
},
{
"epoch": 0.5118289353958144,
"grad_norm": 1.920657444015991,
"learning_rate": 4.872059662826263e-06,
"loss": 0.1129,
"step": 1125
},
{
"epoch": 0.5122838944494995,
"grad_norm": 1.4550346247477233,
"learning_rate": 4.8718338759600465e-06,
"loss": 0.0824,
"step": 1126
},
{
"epoch": 0.5127388535031847,
"grad_norm": 1.5791466307448474,
"learning_rate": 4.871607895278278e-06,
"loss": 0.1206,
"step": 1127
},
{
"epoch": 0.5131938125568699,
"grad_norm": 1.204733566103446,
"learning_rate": 4.871381720799421e-06,
"loss": 0.0665,
"step": 1128
},
{
"epoch": 0.513648771610555,
"grad_norm": 1.6684092224882034,
"learning_rate": 4.8711553525419595e-06,
"loss": 0.1075,
"step": 1129
},
{
"epoch": 0.5141037306642402,
"grad_norm": 1.4239501992031698,
"learning_rate": 4.87092879052439e-06,
"loss": 0.0957,
"step": 1130
},
{
"epoch": 0.5145586897179254,
"grad_norm": 1.0934030596754927,
"learning_rate": 4.8707020347652275e-06,
"loss": 0.0686,
"step": 1131
},
{
"epoch": 0.5150136487716106,
"grad_norm": 1.5870890463044125,
"learning_rate": 4.870475085283001e-06,
"loss": 0.1027,
"step": 1132
},
{
"epoch": 0.5154686078252957,
"grad_norm": 1.6559311395509346,
"learning_rate": 4.870247942096254e-06,
"loss": 0.1008,
"step": 1133
},
{
"epoch": 0.5159235668789809,
"grad_norm": 1.155174213270752,
"learning_rate": 4.870020605223551e-06,
"loss": 0.0592,
"step": 1134
},
{
"epoch": 0.5163785259326661,
"grad_norm": 1.6869955821352955,
"learning_rate": 4.869793074683466e-06,
"loss": 0.0913,
"step": 1135
},
{
"epoch": 0.5168334849863512,
"grad_norm": 2.19769614213437,
"learning_rate": 4.8695653504945925e-06,
"loss": 0.1237,
"step": 1136
},
{
"epoch": 0.5172884440400364,
"grad_norm": 2.393558826937421,
"learning_rate": 4.8693374326755405e-06,
"loss": 0.1401,
"step": 1137
},
{
"epoch": 0.5177434030937216,
"grad_norm": 1.3656006242910685,
"learning_rate": 4.869109321244932e-06,
"loss": 0.09,
"step": 1138
},
{
"epoch": 0.5181983621474068,
"grad_norm": 1.4542523027566732,
"learning_rate": 4.86888101622141e-06,
"loss": 0.0918,
"step": 1139
},
{
"epoch": 0.5186533212010919,
"grad_norm": 1.443069001120561,
"learning_rate": 4.868652517623629e-06,
"loss": 0.066,
"step": 1140
},
{
"epoch": 0.5191082802547771,
"grad_norm": 1.3192549477432447,
"learning_rate": 4.86842382547026e-06,
"loss": 0.07,
"step": 1141
},
{
"epoch": 0.5195632393084623,
"grad_norm": 1.4610522043176968,
"learning_rate": 4.868194939779992e-06,
"loss": 0.0603,
"step": 1142
},
{
"epoch": 0.5200181983621474,
"grad_norm": 1.3807495660521953,
"learning_rate": 4.867965860571529e-06,
"loss": 0.086,
"step": 1143
},
{
"epoch": 0.5204731574158326,
"grad_norm": 1.7439827425180354,
"learning_rate": 4.867736587863589e-06,
"loss": 0.1175,
"step": 1144
},
{
"epoch": 0.5209281164695178,
"grad_norm": 3.8341122094242586,
"learning_rate": 4.867507121674907e-06,
"loss": 0.1369,
"step": 1145
},
{
"epoch": 0.521383075523203,
"grad_norm": 1.6708528784620404,
"learning_rate": 4.867277462024235e-06,
"loss": 0.0788,
"step": 1146
},
{
"epoch": 0.521838034576888,
"grad_norm": 1.8971649447454588,
"learning_rate": 4.8670476089303395e-06,
"loss": 0.138,
"step": 1147
},
{
"epoch": 0.5222929936305732,
"grad_norm": 1.8468924709684824,
"learning_rate": 4.866817562412003e-06,
"loss": 0.1438,
"step": 1148
},
{
"epoch": 0.5227479526842584,
"grad_norm": 1.6403934984754582,
"learning_rate": 4.866587322488024e-06,
"loss": 0.1223,
"step": 1149
},
{
"epoch": 0.5232029117379435,
"grad_norm": 2.6178432136946843,
"learning_rate": 4.866356889177216e-06,
"loss": 0.1626,
"step": 1150
},
{
"epoch": 0.5236578707916287,
"grad_norm": 1.7176781702000803,
"learning_rate": 4.866126262498409e-06,
"loss": 0.1169,
"step": 1151
},
{
"epoch": 0.5241128298453139,
"grad_norm": 2.4788262927152256,
"learning_rate": 4.865895442470449e-06,
"loss": 0.1366,
"step": 1152
},
{
"epoch": 0.5245677888989991,
"grad_norm": 1.4130512402331137,
"learning_rate": 4.865664429112199e-06,
"loss": 0.075,
"step": 1153
},
{
"epoch": 0.5250227479526842,
"grad_norm": 2.161183666624184,
"learning_rate": 4.8654332224425345e-06,
"loss": 0.1219,
"step": 1154
},
{
"epoch": 0.5254777070063694,
"grad_norm": 1.7134676925151036,
"learning_rate": 4.865201822480349e-06,
"loss": 0.1068,
"step": 1155
},
{
"epoch": 0.5259326660600546,
"grad_norm": 1.2631225946147446,
"learning_rate": 4.864970229244552e-06,
"loss": 0.0732,
"step": 1156
},
{
"epoch": 0.5263876251137397,
"grad_norm": 1.151791721954015,
"learning_rate": 4.864738442754068e-06,
"loss": 0.0612,
"step": 1157
},
{
"epoch": 0.5268425841674249,
"grad_norm": 1.298125985364791,
"learning_rate": 4.864506463027837e-06,
"loss": 0.0841,
"step": 1158
},
{
"epoch": 0.5272975432211101,
"grad_norm": 1.828500217819582,
"learning_rate": 4.864274290084816e-06,
"loss": 0.1279,
"step": 1159
},
{
"epoch": 0.5277525022747953,
"grad_norm": 1.872568934497448,
"learning_rate": 4.864041923943978e-06,
"loss": 0.1041,
"step": 1160
},
{
"epoch": 0.5282074613284804,
"grad_norm": 1.59985877807279,
"learning_rate": 4.863809364624309e-06,
"loss": 0.0996,
"step": 1161
},
{
"epoch": 0.5286624203821656,
"grad_norm": 1.4920832769727852,
"learning_rate": 4.863576612144814e-06,
"loss": 0.1002,
"step": 1162
},
{
"epoch": 0.5291173794358508,
"grad_norm": 1.9606964487777765,
"learning_rate": 4.863343666524512e-06,
"loss": 0.1113,
"step": 1163
},
{
"epoch": 0.5295723384895359,
"grad_norm": 2.2204981872927774,
"learning_rate": 4.863110527782437e-06,
"loss": 0.1106,
"step": 1164
},
{
"epoch": 0.5300272975432211,
"grad_norm": 1.7885324238047555,
"learning_rate": 4.8628771959376435e-06,
"loss": 0.1085,
"step": 1165
},
{
"epoch": 0.5304822565969063,
"grad_norm": 1.7918603713541985,
"learning_rate": 4.862643671009195e-06,
"loss": 0.1007,
"step": 1166
},
{
"epoch": 0.5309372156505915,
"grad_norm": 1.0998167564155898,
"learning_rate": 4.862409953016175e-06,
"loss": 0.0968,
"step": 1167
},
{
"epoch": 0.5313921747042766,
"grad_norm": 1.853940722458201,
"learning_rate": 4.862176041977683e-06,
"loss": 0.1216,
"step": 1168
},
{
"epoch": 0.5318471337579618,
"grad_norm": 1.4646094216764547,
"learning_rate": 4.861941937912832e-06,
"loss": 0.1116,
"step": 1169
},
{
"epoch": 0.532302092811647,
"grad_norm": 1.2365450205781439,
"learning_rate": 4.861707640840752e-06,
"loss": 0.0819,
"step": 1170
},
{
"epoch": 0.5327570518653321,
"grad_norm": 1.6463867940760566,
"learning_rate": 4.861473150780589e-06,
"loss": 0.1094,
"step": 1171
},
{
"epoch": 0.5332120109190173,
"grad_norm": 1.686704498138834,
"learning_rate": 4.8612384677515054e-06,
"loss": 0.1071,
"step": 1172
},
{
"epoch": 0.5336669699727025,
"grad_norm": 1.2716060091758528,
"learning_rate": 4.861003591772677e-06,
"loss": 0.0788,
"step": 1173
},
{
"epoch": 0.5341219290263877,
"grad_norm": 1.596228030510201,
"learning_rate": 4.860768522863297e-06,
"loss": 0.0716,
"step": 1174
},
{
"epoch": 0.5345768880800728,
"grad_norm": 1.6508703177098787,
"learning_rate": 4.860533261042574e-06,
"loss": 0.0977,
"step": 1175
},
{
"epoch": 0.535031847133758,
"grad_norm": 1.3185419902691182,
"learning_rate": 4.8602978063297336e-06,
"loss": 0.1103,
"step": 1176
},
{
"epoch": 0.5354868061874432,
"grad_norm": 1.6903360885675578,
"learning_rate": 4.8600621587440155e-06,
"loss": 0.0933,
"step": 1177
},
{
"epoch": 0.5359417652411284,
"grad_norm": 1.5059509187961821,
"learning_rate": 4.859826318304676e-06,
"loss": 0.1093,
"step": 1178
},
{
"epoch": 0.5363967242948134,
"grad_norm": 1.156363062560368,
"learning_rate": 4.859590285030986e-06,
"loss": 0.091,
"step": 1179
},
{
"epoch": 0.5368516833484986,
"grad_norm": 1.4254896552320762,
"learning_rate": 4.859354058942234e-06,
"loss": 0.099,
"step": 1180
},
{
"epoch": 0.5373066424021838,
"grad_norm": 1.6756998416867424,
"learning_rate": 4.859117640057723e-06,
"loss": 0.1058,
"step": 1181
},
{
"epoch": 0.5377616014558689,
"grad_norm": 1.906068462189616,
"learning_rate": 4.858881028396773e-06,
"loss": 0.1344,
"step": 1182
},
{
"epoch": 0.5382165605095541,
"grad_norm": 1.6813817476503583,
"learning_rate": 4.8586442239787165e-06,
"loss": 0.0938,
"step": 1183
},
{
"epoch": 0.5386715195632393,
"grad_norm": 1.4947308906180774,
"learning_rate": 4.858407226822906e-06,
"loss": 0.1089,
"step": 1184
},
{
"epoch": 0.5391264786169245,
"grad_norm": 1.5326514903244322,
"learning_rate": 4.858170036948707e-06,
"loss": 0.0903,
"step": 1185
},
{
"epoch": 0.5395814376706096,
"grad_norm": 1.3397075921608799,
"learning_rate": 4.857932654375503e-06,
"loss": 0.079,
"step": 1186
},
{
"epoch": 0.5400363967242948,
"grad_norm": 2.3382844220202963,
"learning_rate": 4.857695079122691e-06,
"loss": 0.1606,
"step": 1187
},
{
"epoch": 0.54049135577798,
"grad_norm": 1.2780125171194971,
"learning_rate": 4.857457311209683e-06,
"loss": 0.0819,
"step": 1188
},
{
"epoch": 0.5409463148316651,
"grad_norm": 1.3621256537302653,
"learning_rate": 4.857219350655911e-06,
"loss": 0.0837,
"step": 1189
},
{
"epoch": 0.5414012738853503,
"grad_norm": 1.4753266540938175,
"learning_rate": 4.856981197480818e-06,
"loss": 0.092,
"step": 1190
},
{
"epoch": 0.5418562329390355,
"grad_norm": 1.1486583975675493,
"learning_rate": 4.856742851703866e-06,
"loss": 0.0695,
"step": 1191
},
{
"epoch": 0.5423111919927207,
"grad_norm": 1.6118421470322997,
"learning_rate": 4.856504313344531e-06,
"loss": 0.1306,
"step": 1192
},
{
"epoch": 0.5427661510464058,
"grad_norm": 1.654223645513978,
"learning_rate": 4.8562655824223055e-06,
"loss": 0.0868,
"step": 1193
},
{
"epoch": 0.543221110100091,
"grad_norm": 1.166432446622458,
"learning_rate": 4.856026658956697e-06,
"loss": 0.0592,
"step": 1194
},
{
"epoch": 0.5436760691537762,
"grad_norm": 1.2408147318232963,
"learning_rate": 4.8557875429672295e-06,
"loss": 0.0893,
"step": 1195
},
{
"epoch": 0.5441310282074613,
"grad_norm": 1.4658290533217708,
"learning_rate": 4.855548234473444e-06,
"loss": 0.1193,
"step": 1196
},
{
"epoch": 0.5445859872611465,
"grad_norm": 1.5813217399288642,
"learning_rate": 4.8553087334948935e-06,
"loss": 0.1027,
"step": 1197
},
{
"epoch": 0.5450409463148317,
"grad_norm": 1.346354212639339,
"learning_rate": 4.855069040051149e-06,
"loss": 0.0842,
"step": 1198
},
{
"epoch": 0.5454959053685169,
"grad_norm": 1.7976208225125645,
"learning_rate": 4.854829154161799e-06,
"loss": 0.1231,
"step": 1199
},
{
"epoch": 0.545950864422202,
"grad_norm": 1.468188785415714,
"learning_rate": 4.854589075846445e-06,
"loss": 0.0941,
"step": 1200
},
{
"epoch": 0.5464058234758872,
"grad_norm": 1.2900368220049758,
"learning_rate": 4.854348805124704e-06,
"loss": 0.0866,
"step": 1201
},
{
"epoch": 0.5468607825295724,
"grad_norm": 1.465762931238317,
"learning_rate": 4.85410834201621e-06,
"loss": 0.0917,
"step": 1202
},
{
"epoch": 0.5473157415832575,
"grad_norm": 2.030229358227215,
"learning_rate": 4.8538676865406155e-06,
"loss": 0.1367,
"step": 1203
},
{
"epoch": 0.5477707006369427,
"grad_norm": 1.3216014713960686,
"learning_rate": 4.853626838717582e-06,
"loss": 0.0744,
"step": 1204
},
{
"epoch": 0.5482256596906279,
"grad_norm": 1.34429128033589,
"learning_rate": 4.853385798566793e-06,
"loss": 0.072,
"step": 1205
},
{
"epoch": 0.5486806187443131,
"grad_norm": 1.3681907039168972,
"learning_rate": 4.8531445661079444e-06,
"loss": 0.0772,
"step": 1206
},
{
"epoch": 0.5491355777979982,
"grad_norm": 1.7634866119794534,
"learning_rate": 4.852903141360749e-06,
"loss": 0.1093,
"step": 1207
},
{
"epoch": 0.5495905368516834,
"grad_norm": 1.3755217621758322,
"learning_rate": 4.852661524344933e-06,
"loss": 0.0706,
"step": 1208
},
{
"epoch": 0.5500454959053686,
"grad_norm": 1.8792585200640362,
"learning_rate": 4.852419715080244e-06,
"loss": 0.1248,
"step": 1209
},
{
"epoch": 0.5505004549590536,
"grad_norm": 1.3604609211138492,
"learning_rate": 4.852177713586437e-06,
"loss": 0.0849,
"step": 1210
},
{
"epoch": 0.5509554140127388,
"grad_norm": 1.3077627182539715,
"learning_rate": 4.85193551988329e-06,
"loss": 0.0876,
"step": 1211
},
{
"epoch": 0.551410373066424,
"grad_norm": 1.5010970994642232,
"learning_rate": 4.851693133990594e-06,
"loss": 0.0887,
"step": 1212
},
{
"epoch": 0.5518653321201092,
"grad_norm": 1.2366263332853158,
"learning_rate": 4.851450555928155e-06,
"loss": 0.0677,
"step": 1213
},
{
"epoch": 0.5523202911737943,
"grad_norm": 1.9682815492889902,
"learning_rate": 4.851207785715797e-06,
"loss": 0.1605,
"step": 1214
},
{
"epoch": 0.5527752502274795,
"grad_norm": 1.4810464832161876,
"learning_rate": 4.850964823373355e-06,
"loss": 0.1194,
"step": 1215
},
{
"epoch": 0.5532302092811647,
"grad_norm": 1.269367325606048,
"learning_rate": 4.850721668920685e-06,
"loss": 0.0869,
"step": 1216
},
{
"epoch": 0.5536851683348498,
"grad_norm": 1.811102361348233,
"learning_rate": 4.850478322377657e-06,
"loss": 0.113,
"step": 1217
},
{
"epoch": 0.554140127388535,
"grad_norm": 1.8234155506944059,
"learning_rate": 4.8502347837641536e-06,
"loss": 0.1337,
"step": 1218
},
{
"epoch": 0.5545950864422202,
"grad_norm": 1.5374689189034605,
"learning_rate": 4.8499910531000776e-06,
"loss": 0.0923,
"step": 1219
},
{
"epoch": 0.5550500454959054,
"grad_norm": 2.1434119748623583,
"learning_rate": 4.849747130405346e-06,
"loss": 0.1165,
"step": 1220
},
{
"epoch": 0.5555050045495905,
"grad_norm": 1.5741068071079671,
"learning_rate": 4.849503015699889e-06,
"loss": 0.0833,
"step": 1221
},
{
"epoch": 0.5559599636032757,
"grad_norm": 1.4450089536449229,
"learning_rate": 4.849258709003657e-06,
"loss": 0.0874,
"step": 1222
},
{
"epoch": 0.5564149226569609,
"grad_norm": 2.0523390040501206,
"learning_rate": 4.849014210336612e-06,
"loss": 0.1311,
"step": 1223
},
{
"epoch": 0.556869881710646,
"grad_norm": 1.6272370459349303,
"learning_rate": 4.848769519718734e-06,
"loss": 0.1283,
"step": 1224
},
{
"epoch": 0.5573248407643312,
"grad_norm": 1.7795199436155464,
"learning_rate": 4.848524637170018e-06,
"loss": 0.1053,
"step": 1225
},
{
"epoch": 0.5577797998180164,
"grad_norm": 2.039787438198539,
"learning_rate": 4.848279562710474e-06,
"loss": 0.119,
"step": 1226
},
{
"epoch": 0.5582347588717016,
"grad_norm": 1.048713205847522,
"learning_rate": 4.848034296360129e-06,
"loss": 0.0613,
"step": 1227
},
{
"epoch": 0.5586897179253867,
"grad_norm": 1.2246704661323997,
"learning_rate": 4.847788838139025e-06,
"loss": 0.0907,
"step": 1228
},
{
"epoch": 0.5591446769790719,
"grad_norm": 1.4248227073394217,
"learning_rate": 4.847543188067219e-06,
"loss": 0.0831,
"step": 1229
},
{
"epoch": 0.5595996360327571,
"grad_norm": 1.6554531335771108,
"learning_rate": 4.847297346164786e-06,
"loss": 0.098,
"step": 1230
},
{
"epoch": 0.5600545950864422,
"grad_norm": 1.6618601198336995,
"learning_rate": 4.8470513124518134e-06,
"loss": 0.1067,
"step": 1231
},
{
"epoch": 0.5605095541401274,
"grad_norm": 1.910127735430222,
"learning_rate": 4.8468050869484075e-06,
"loss": 0.1153,
"step": 1232
},
{
"epoch": 0.5609645131938126,
"grad_norm": 1.662154262618556,
"learning_rate": 4.846558669674688e-06,
"loss": 0.0858,
"step": 1233
},
{
"epoch": 0.5614194722474978,
"grad_norm": 1.666011221920497,
"learning_rate": 4.8463120606507904e-06,
"loss": 0.087,
"step": 1234
},
{
"epoch": 0.5618744313011829,
"grad_norm": 1.8392638033651618,
"learning_rate": 4.846065259896867e-06,
"loss": 0.1007,
"step": 1235
},
{
"epoch": 0.5623293903548681,
"grad_norm": 1.823608778063299,
"learning_rate": 4.845818267433086e-06,
"loss": 0.1234,
"step": 1236
},
{
"epoch": 0.5627843494085533,
"grad_norm": 1.6001337547517656,
"learning_rate": 4.845571083279629e-06,
"loss": 0.0992,
"step": 1237
},
{
"epoch": 0.5632393084622384,
"grad_norm": 1.244896894294659,
"learning_rate": 4.845323707456696e-06,
"loss": 0.0911,
"step": 1238
},
{
"epoch": 0.5636942675159236,
"grad_norm": 1.6134676145738456,
"learning_rate": 4.845076139984502e-06,
"loss": 0.0988,
"step": 1239
},
{
"epoch": 0.5641492265696088,
"grad_norm": 1.817921705994322,
"learning_rate": 4.844828380883274e-06,
"loss": 0.1137,
"step": 1240
},
{
"epoch": 0.564604185623294,
"grad_norm": 1.223760267965902,
"learning_rate": 4.844580430173261e-06,
"loss": 0.0912,
"step": 1241
},
{
"epoch": 0.565059144676979,
"grad_norm": 1.0223923432784907,
"learning_rate": 4.8443322878747236e-06,
"loss": 0.0549,
"step": 1242
},
{
"epoch": 0.5655141037306642,
"grad_norm": 1.4179515952754742,
"learning_rate": 4.844083954007938e-06,
"loss": 0.0909,
"step": 1243
},
{
"epoch": 0.5659690627843494,
"grad_norm": 1.964821324684815,
"learning_rate": 4.843835428593198e-06,
"loss": 0.1331,
"step": 1244
},
{
"epoch": 0.5664240218380345,
"grad_norm": 1.8460290937807686,
"learning_rate": 4.84358671165081e-06,
"loss": 0.1355,
"step": 1245
},
{
"epoch": 0.5668789808917197,
"grad_norm": 1.9533421795112815,
"learning_rate": 4.843337803201102e-06,
"loss": 0.1493,
"step": 1246
},
{
"epoch": 0.5673339399454049,
"grad_norm": 1.7429301575956597,
"learning_rate": 4.8430887032644094e-06,
"loss": 0.1208,
"step": 1247
},
{
"epoch": 0.5677888989990901,
"grad_norm": 1.6048397609024965,
"learning_rate": 4.842839411861089e-06,
"loss": 0.1016,
"step": 1248
},
{
"epoch": 0.5682438580527752,
"grad_norm": 1.5611018277418034,
"learning_rate": 4.842589929011513e-06,
"loss": 0.0996,
"step": 1249
},
{
"epoch": 0.5686988171064604,
"grad_norm": 1.549763833499855,
"learning_rate": 4.8423402547360665e-06,
"loss": 0.1047,
"step": 1250
},
{
"epoch": 0.5691537761601456,
"grad_norm": 1.5794849405940026,
"learning_rate": 4.842090389055153e-06,
"loss": 0.0885,
"step": 1251
},
{
"epoch": 0.5696087352138307,
"grad_norm": 1.340948229500544,
"learning_rate": 4.841840331989189e-06,
"loss": 0.082,
"step": 1252
},
{
"epoch": 0.5700636942675159,
"grad_norm": 1.187480617941468,
"learning_rate": 4.841590083558608e-06,
"loss": 0.0757,
"step": 1253
},
{
"epoch": 0.5705186533212011,
"grad_norm": 1.6889387454247615,
"learning_rate": 4.841339643783861e-06,
"loss": 0.1007,
"step": 1254
},
{
"epoch": 0.5709736123748863,
"grad_norm": 1.8032486510427874,
"learning_rate": 4.841089012685412e-06,
"loss": 0.1387,
"step": 1255
},
{
"epoch": 0.5714285714285714,
"grad_norm": 1.518781686351209,
"learning_rate": 4.840838190283741e-06,
"loss": 0.1073,
"step": 1256
},
{
"epoch": 0.5718835304822566,
"grad_norm": 1.2622352263295604,
"learning_rate": 4.8405871765993435e-06,
"loss": 0.0611,
"step": 1257
},
{
"epoch": 0.5723384895359418,
"grad_norm": 1.3733958676153404,
"learning_rate": 4.840335971652732e-06,
"loss": 0.0806,
"step": 1258
},
{
"epoch": 0.5727934485896269,
"grad_norm": 1.414930922234482,
"learning_rate": 4.840084575464434e-06,
"loss": 0.0967,
"step": 1259
},
{
"epoch": 0.5732484076433121,
"grad_norm": 1.3132222404269749,
"learning_rate": 4.839832988054992e-06,
"loss": 0.0844,
"step": 1260
},
{
"epoch": 0.5737033666969973,
"grad_norm": 1.4304276264926878,
"learning_rate": 4.839581209444966e-06,
"loss": 0.08,
"step": 1261
},
{
"epoch": 0.5741583257506825,
"grad_norm": 1.6261976055252851,
"learning_rate": 4.839329239654927e-06,
"loss": 0.1086,
"step": 1262
},
{
"epoch": 0.5746132848043676,
"grad_norm": 1.4905660158866907,
"learning_rate": 4.839077078705468e-06,
"loss": 0.0758,
"step": 1263
},
{
"epoch": 0.5750682438580528,
"grad_norm": 1.6218355961437578,
"learning_rate": 4.838824726617194e-06,
"loss": 0.1066,
"step": 1264
},
{
"epoch": 0.575523202911738,
"grad_norm": 1.7405100413536567,
"learning_rate": 4.838572183410725e-06,
"loss": 0.1103,
"step": 1265
},
{
"epoch": 0.5759781619654231,
"grad_norm": 1.5825357430240847,
"learning_rate": 4.838319449106697e-06,
"loss": 0.1026,
"step": 1266
},
{
"epoch": 0.5764331210191083,
"grad_norm": 1.4234319951879078,
"learning_rate": 4.838066523725764e-06,
"loss": 0.0761,
"step": 1267
},
{
"epoch": 0.5768880800727935,
"grad_norm": 1.4883172887933762,
"learning_rate": 4.837813407288594e-06,
"loss": 0.0989,
"step": 1268
},
{
"epoch": 0.5773430391264787,
"grad_norm": 1.437934945090456,
"learning_rate": 4.837560099815869e-06,
"loss": 0.0874,
"step": 1269
},
{
"epoch": 0.5777979981801638,
"grad_norm": 1.6175863411283686,
"learning_rate": 4.837306601328289e-06,
"loss": 0.1074,
"step": 1270
},
{
"epoch": 0.578252957233849,
"grad_norm": 1.3546376195879695,
"learning_rate": 4.837052911846569e-06,
"loss": 0.099,
"step": 1271
},
{
"epoch": 0.5787079162875342,
"grad_norm": 1.615443707505004,
"learning_rate": 4.836799031391439e-06,
"loss": 0.1093,
"step": 1272
},
{
"epoch": 0.5791628753412192,
"grad_norm": 0.7225881399048506,
"learning_rate": 4.836544959983645e-06,
"loss": 0.0439,
"step": 1273
},
{
"epoch": 0.5796178343949044,
"grad_norm": 2.1011993101699926,
"learning_rate": 4.8362906976439485e-06,
"loss": 0.1277,
"step": 1274
},
{
"epoch": 0.5800727934485896,
"grad_norm": 2.000601957434587,
"learning_rate": 4.836036244393127e-06,
"loss": 0.1495,
"step": 1275
},
{
"epoch": 0.5805277525022748,
"grad_norm": 1.6950265520988297,
"learning_rate": 4.835781600251973e-06,
"loss": 0.0976,
"step": 1276
},
{
"epoch": 0.5809827115559599,
"grad_norm": 1.3727073330890776,
"learning_rate": 4.835526765241295e-06,
"loss": 0.0828,
"step": 1277
},
{
"epoch": 0.5814376706096451,
"grad_norm": 1.5570369931283408,
"learning_rate": 4.835271739381917e-06,
"loss": 0.1109,
"step": 1278
},
{
"epoch": 0.5818926296633303,
"grad_norm": 1.0713801990040446,
"learning_rate": 4.835016522694678e-06,
"loss": 0.0757,
"step": 1279
},
{
"epoch": 0.5823475887170154,
"grad_norm": 1.942364052088125,
"learning_rate": 4.834761115200434e-06,
"loss": 0.1642,
"step": 1280
},
{
"epoch": 0.5828025477707006,
"grad_norm": 1.7377055370855508,
"learning_rate": 4.834505516920055e-06,
"loss": 0.1187,
"step": 1281
},
{
"epoch": 0.5832575068243858,
"grad_norm": 1.6956294426437164,
"learning_rate": 4.834249727874428e-06,
"loss": 0.1051,
"step": 1282
},
{
"epoch": 0.583712465878071,
"grad_norm": 1.4102019730152917,
"learning_rate": 4.833993748084455e-06,
"loss": 0.0704,
"step": 1283
},
{
"epoch": 0.5841674249317561,
"grad_norm": 1.2666669426637933,
"learning_rate": 4.833737577571052e-06,
"loss": 0.072,
"step": 1284
},
{
"epoch": 0.5846223839854413,
"grad_norm": 1.451859405282776,
"learning_rate": 4.833481216355153e-06,
"loss": 0.0833,
"step": 1285
},
{
"epoch": 0.5850773430391265,
"grad_norm": 2.2038986828884846,
"learning_rate": 4.833224664457709e-06,
"loss": 0.1247,
"step": 1286
},
{
"epoch": 0.5855323020928116,
"grad_norm": 2.170783563626466,
"learning_rate": 4.83296792189968e-06,
"loss": 0.0991,
"step": 1287
},
{
"epoch": 0.5859872611464968,
"grad_norm": 1.8083451546198175,
"learning_rate": 4.83271098870205e-06,
"loss": 0.1067,
"step": 1288
},
{
"epoch": 0.586442220200182,
"grad_norm": 1.764270130263968,
"learning_rate": 4.832453864885811e-06,
"loss": 0.1181,
"step": 1289
},
{
"epoch": 0.5868971792538672,
"grad_norm": 1.3642172399097685,
"learning_rate": 4.832196550471976e-06,
"loss": 0.0844,
"step": 1290
},
{
"epoch": 0.5873521383075523,
"grad_norm": 1.4693026944828678,
"learning_rate": 4.831939045481571e-06,
"loss": 0.1103,
"step": 1291
},
{
"epoch": 0.5878070973612375,
"grad_norm": 1.370206188315079,
"learning_rate": 4.8316813499356375e-06,
"loss": 0.0914,
"step": 1292
},
{
"epoch": 0.5882620564149227,
"grad_norm": 1.3729593032500749,
"learning_rate": 4.831423463855235e-06,
"loss": 0.0719,
"step": 1293
},
{
"epoch": 0.5887170154686078,
"grad_norm": 1.4507728916778564,
"learning_rate": 4.8311653872614345e-06,
"loss": 0.086,
"step": 1294
},
{
"epoch": 0.589171974522293,
"grad_norm": 1.3070476542527247,
"learning_rate": 4.830907120175327e-06,
"loss": 0.077,
"step": 1295
},
{
"epoch": 0.5896269335759782,
"grad_norm": 2.4221015667648045,
"learning_rate": 4.830648662618015e-06,
"loss": 0.1596,
"step": 1296
},
{
"epoch": 0.5900818926296634,
"grad_norm": 1.103239260506278,
"learning_rate": 4.83039001461062e-06,
"loss": 0.0581,
"step": 1297
},
{
"epoch": 0.5905368516833485,
"grad_norm": 1.8298909001729466,
"learning_rate": 4.830131176174276e-06,
"loss": 0.1082,
"step": 1298
},
{
"epoch": 0.5909918107370337,
"grad_norm": 1.9201560834557836,
"learning_rate": 4.829872147330136e-06,
"loss": 0.1147,
"step": 1299
},
{
"epoch": 0.5914467697907189,
"grad_norm": 1.332697111328447,
"learning_rate": 4.829612928099366e-06,
"loss": 0.0906,
"step": 1300
},
{
"epoch": 0.591901728844404,
"grad_norm": 1.2286901595425765,
"learning_rate": 4.829353518503147e-06,
"loss": 0.0741,
"step": 1301
},
{
"epoch": 0.5923566878980892,
"grad_norm": 1.21692580464079,
"learning_rate": 4.829093918562678e-06,
"loss": 0.0657,
"step": 1302
},
{
"epoch": 0.5928116469517744,
"grad_norm": 1.776387560928479,
"learning_rate": 4.828834128299173e-06,
"loss": 0.122,
"step": 1303
},
{
"epoch": 0.5932666060054596,
"grad_norm": 2.2576443805946003,
"learning_rate": 4.828574147733859e-06,
"loss": 0.1395,
"step": 1304
},
{
"epoch": 0.5937215650591446,
"grad_norm": 1.6394742041639938,
"learning_rate": 4.828313976887982e-06,
"loss": 0.0886,
"step": 1305
},
{
"epoch": 0.5941765241128298,
"grad_norm": 1.729743531966717,
"learning_rate": 4.8280536157828e-06,
"loss": 0.1191,
"step": 1306
},
{
"epoch": 0.594631483166515,
"grad_norm": 1.4769755060752687,
"learning_rate": 4.827793064439592e-06,
"loss": 0.0965,
"step": 1307
},
{
"epoch": 0.5950864422202001,
"grad_norm": 1.4080505436977253,
"learning_rate": 4.8275323228796455e-06,
"loss": 0.0874,
"step": 1308
},
{
"epoch": 0.5955414012738853,
"grad_norm": 0.9123649868426729,
"learning_rate": 4.8272713911242695e-06,
"loss": 0.0402,
"step": 1309
},
{
"epoch": 0.5959963603275705,
"grad_norm": 1.1294729714943839,
"learning_rate": 4.827010269194785e-06,
"loss": 0.0631,
"step": 1310
},
{
"epoch": 0.5964513193812557,
"grad_norm": 1.9689287013341512,
"learning_rate": 4.8267489571125295e-06,
"loss": 0.1181,
"step": 1311
},
{
"epoch": 0.5969062784349408,
"grad_norm": 2.330161760291491,
"learning_rate": 4.826487454898857e-06,
"loss": 0.1448,
"step": 1312
},
{
"epoch": 0.597361237488626,
"grad_norm": 1.2992174727337271,
"learning_rate": 4.826225762575136e-06,
"loss": 0.0857,
"step": 1313
},
{
"epoch": 0.5978161965423112,
"grad_norm": 1.4247199067825551,
"learning_rate": 4.825963880162752e-06,
"loss": 0.0863,
"step": 1314
},
{
"epoch": 0.5982711555959963,
"grad_norm": 2.01495341050897,
"learning_rate": 4.825701807683102e-06,
"loss": 0.1072,
"step": 1315
},
{
"epoch": 0.5987261146496815,
"grad_norm": 1.7412264774469277,
"learning_rate": 4.825439545157603e-06,
"loss": 0.1092,
"step": 1316
},
{
"epoch": 0.5991810737033667,
"grad_norm": 1.4724909601046332,
"learning_rate": 4.825177092607687e-06,
"loss": 0.0999,
"step": 1317
},
{
"epoch": 0.5996360327570519,
"grad_norm": 1.3473250398166379,
"learning_rate": 4.8249144500547995e-06,
"loss": 0.0847,
"step": 1318
},
{
"epoch": 0.600090991810737,
"grad_norm": 1.3069589653313691,
"learning_rate": 4.824651617520402e-06,
"loss": 0.0669,
"step": 1319
},
{
"epoch": 0.6005459508644222,
"grad_norm": 1.5442197540840334,
"learning_rate": 4.824388595025972e-06,
"loss": 0.1178,
"step": 1320
},
{
"epoch": 0.6010009099181074,
"grad_norm": 1.5331976112900332,
"learning_rate": 4.824125382593003e-06,
"loss": 0.0874,
"step": 1321
},
{
"epoch": 0.6014558689717925,
"grad_norm": 1.4665462333148995,
"learning_rate": 4.823861980243003e-06,
"loss": 0.1106,
"step": 1322
},
{
"epoch": 0.6019108280254777,
"grad_norm": 1.909519129682131,
"learning_rate": 4.823598387997497e-06,
"loss": 0.1163,
"step": 1323
},
{
"epoch": 0.6023657870791629,
"grad_norm": 1.5641688210807196,
"learning_rate": 4.823334605878024e-06,
"loss": 0.0797,
"step": 1324
},
{
"epoch": 0.6028207461328481,
"grad_norm": 1.572854435679942,
"learning_rate": 4.82307063390614e-06,
"loss": 0.09,
"step": 1325
},
{
"epoch": 0.6032757051865332,
"grad_norm": 1.6242534333910885,
"learning_rate": 4.822806472103413e-06,
"loss": 0.1031,
"step": 1326
},
{
"epoch": 0.6037306642402184,
"grad_norm": 1.3730669374310474,
"learning_rate": 4.822542120491431e-06,
"loss": 0.0842,
"step": 1327
},
{
"epoch": 0.6041856232939036,
"grad_norm": 1.12030081002078,
"learning_rate": 4.822277579091796e-06,
"loss": 0.0933,
"step": 1328
},
{
"epoch": 0.6046405823475887,
"grad_norm": 1.2764536589561721,
"learning_rate": 4.822012847926125e-06,
"loss": 0.0795,
"step": 1329
},
{
"epoch": 0.6050955414012739,
"grad_norm": 1.4682540895282241,
"learning_rate": 4.821747927016049e-06,
"loss": 0.0834,
"step": 1330
},
{
"epoch": 0.6055505004549591,
"grad_norm": 1.5003874511683086,
"learning_rate": 4.821482816383219e-06,
"loss": 0.1096,
"step": 1331
},
{
"epoch": 0.6060054595086443,
"grad_norm": 1.2445527510541503,
"learning_rate": 4.821217516049296e-06,
"loss": 0.0789,
"step": 1332
},
{
"epoch": 0.6064604185623294,
"grad_norm": 1.3266125786690217,
"learning_rate": 4.82095202603596e-06,
"loss": 0.0796,
"step": 1333
},
{
"epoch": 0.6069153776160146,
"grad_norm": 1.5070167125246237,
"learning_rate": 4.820686346364906e-06,
"loss": 0.0924,
"step": 1334
},
{
"epoch": 0.6073703366696998,
"grad_norm": 1.9776742406411276,
"learning_rate": 4.820420477057843e-06,
"loss": 0.1066,
"step": 1335
},
{
"epoch": 0.607825295723385,
"grad_norm": 1.7020369242588063,
"learning_rate": 4.820154418136498e-06,
"loss": 0.1212,
"step": 1336
},
{
"epoch": 0.60828025477707,
"grad_norm": 1.8050978290349085,
"learning_rate": 4.819888169622612e-06,
"loss": 0.1102,
"step": 1337
},
{
"epoch": 0.6087352138307552,
"grad_norm": 1.4892394361348396,
"learning_rate": 4.819621731537942e-06,
"loss": 0.1139,
"step": 1338
},
{
"epoch": 0.6091901728844404,
"grad_norm": 1.4499858080485506,
"learning_rate": 4.819355103904259e-06,
"loss": 0.0833,
"step": 1339
},
{
"epoch": 0.6096451319381255,
"grad_norm": 1.5725512633612637,
"learning_rate": 4.81908828674335e-06,
"loss": 0.0915,
"step": 1340
},
{
"epoch": 0.6101000909918107,
"grad_norm": 1.122002936682905,
"learning_rate": 4.81882128007702e-06,
"loss": 0.0706,
"step": 1341
},
{
"epoch": 0.6105550500454959,
"grad_norm": 1.6231339345844462,
"learning_rate": 4.818554083927086e-06,
"loss": 0.0989,
"step": 1342
},
{
"epoch": 0.6110100090991811,
"grad_norm": 1.5566168283978299,
"learning_rate": 4.818286698315383e-06,
"loss": 0.0802,
"step": 1343
},
{
"epoch": 0.6114649681528662,
"grad_norm": 1.5209649714120241,
"learning_rate": 4.818019123263761e-06,
"loss": 0.1202,
"step": 1344
},
{
"epoch": 0.6119199272065514,
"grad_norm": 1.5198574931775437,
"learning_rate": 4.817751358794084e-06,
"loss": 0.0824,
"step": 1345
},
{
"epoch": 0.6123748862602366,
"grad_norm": 1.3969905074954028,
"learning_rate": 4.8174834049282325e-06,
"loss": 0.1004,
"step": 1346
},
{
"epoch": 0.6128298453139217,
"grad_norm": 2.1750619266428455,
"learning_rate": 4.817215261688104e-06,
"loss": 0.1479,
"step": 1347
},
{
"epoch": 0.6132848043676069,
"grad_norm": 1.4757724334002973,
"learning_rate": 4.816946929095607e-06,
"loss": 0.0816,
"step": 1348
},
{
"epoch": 0.6137397634212921,
"grad_norm": 1.5237192624117821,
"learning_rate": 4.816678407172671e-06,
"loss": 0.1043,
"step": 1349
},
{
"epoch": 0.6141947224749773,
"grad_norm": 1.369442898723999,
"learning_rate": 4.816409695941238e-06,
"loss": 0.092,
"step": 1350
},
{
"epoch": 0.6146496815286624,
"grad_norm": 1.3552993829733393,
"learning_rate": 4.816140795423265e-06,
"loss": 0.0896,
"step": 1351
},
{
"epoch": 0.6151046405823476,
"grad_norm": 1.914785073036727,
"learning_rate": 4.8158717056407255e-06,
"loss": 0.1405,
"step": 1352
},
{
"epoch": 0.6155595996360328,
"grad_norm": 2.510056256789934,
"learning_rate": 4.815602426615609e-06,
"loss": 0.1347,
"step": 1353
},
{
"epoch": 0.6160145586897179,
"grad_norm": 1.6994784582879867,
"learning_rate": 4.815332958369919e-06,
"loss": 0.1043,
"step": 1354
},
{
"epoch": 0.6164695177434031,
"grad_norm": 1.609212664276651,
"learning_rate": 4.815063300925677e-06,
"loss": 0.0801,
"step": 1355
},
{
"epoch": 0.6169244767970883,
"grad_norm": 1.3059644313522971,
"learning_rate": 4.814793454304915e-06,
"loss": 0.0962,
"step": 1356
},
{
"epoch": 0.6173794358507735,
"grad_norm": 1.316795599125537,
"learning_rate": 4.814523418529686e-06,
"loss": 0.0945,
"step": 1357
},
{
"epoch": 0.6178343949044586,
"grad_norm": 1.458401958119273,
"learning_rate": 4.814253193622056e-06,
"loss": 0.0931,
"step": 1358
},
{
"epoch": 0.6182893539581438,
"grad_norm": 1.5782519499021963,
"learning_rate": 4.813982779604106e-06,
"loss": 0.086,
"step": 1359
},
{
"epoch": 0.618744313011829,
"grad_norm": 1.4337607882677579,
"learning_rate": 4.813712176497933e-06,
"loss": 0.1051,
"step": 1360
},
{
"epoch": 0.6191992720655141,
"grad_norm": 1.7873980657918327,
"learning_rate": 4.813441384325649e-06,
"loss": 0.1049,
"step": 1361
},
{
"epoch": 0.6196542311191993,
"grad_norm": 1.3606232019090971,
"learning_rate": 4.813170403109383e-06,
"loss": 0.0708,
"step": 1362
},
{
"epoch": 0.6201091901728845,
"grad_norm": 1.3563405384219576,
"learning_rate": 4.8128992328712774e-06,
"loss": 0.086,
"step": 1363
},
{
"epoch": 0.6205641492265697,
"grad_norm": 1.3192980800606737,
"learning_rate": 4.812627873633492e-06,
"loss": 0.0781,
"step": 1364
},
{
"epoch": 0.6210191082802548,
"grad_norm": 1.7840648545688607,
"learning_rate": 4.8123563254182e-06,
"loss": 0.1361,
"step": 1365
},
{
"epoch": 0.62147406733394,
"grad_norm": 1.8322981514345795,
"learning_rate": 4.8120845882475924e-06,
"loss": 0.1282,
"step": 1366
},
{
"epoch": 0.6219290263876252,
"grad_norm": 2.0823134031423267,
"learning_rate": 4.8118126621438734e-06,
"loss": 0.1303,
"step": 1367
},
{
"epoch": 0.6223839854413102,
"grad_norm": 1.8738406581860008,
"learning_rate": 4.811540547129263e-06,
"loss": 0.1603,
"step": 1368
},
{
"epoch": 0.6228389444949954,
"grad_norm": 1.7465048715810059,
"learning_rate": 4.811268243225999e-06,
"loss": 0.1157,
"step": 1369
},
{
"epoch": 0.6232939035486806,
"grad_norm": 1.3620940982420815,
"learning_rate": 4.810995750456331e-06,
"loss": 0.0794,
"step": 1370
},
{
"epoch": 0.6237488626023658,
"grad_norm": 1.7874358637623151,
"learning_rate": 4.810723068842526e-06,
"loss": 0.1272,
"step": 1371
},
{
"epoch": 0.6242038216560509,
"grad_norm": 1.7579304475520012,
"learning_rate": 4.810450198406867e-06,
"loss": 0.1185,
"step": 1372
},
{
"epoch": 0.6246587807097361,
"grad_norm": 2.467789845960662,
"learning_rate": 4.810177139171653e-06,
"loss": 0.1557,
"step": 1373
},
{
"epoch": 0.6251137397634213,
"grad_norm": 1.1425822722647716,
"learning_rate": 4.809903891159195e-06,
"loss": 0.0657,
"step": 1374
},
{
"epoch": 0.6255686988171064,
"grad_norm": 2.016266262602286,
"learning_rate": 4.809630454391822e-06,
"loss": 0.107,
"step": 1375
},
{
"epoch": 0.6260236578707916,
"grad_norm": 1.7559713706649986,
"learning_rate": 4.80935682889188e-06,
"loss": 0.1506,
"step": 1376
},
{
"epoch": 0.6264786169244768,
"grad_norm": 1.4915046053791412,
"learning_rate": 4.809083014681726e-06,
"loss": 0.1212,
"step": 1377
},
{
"epoch": 0.626933575978162,
"grad_norm": 1.632149901378183,
"learning_rate": 4.808809011783735e-06,
"loss": 0.1266,
"step": 1378
},
{
"epoch": 0.6273885350318471,
"grad_norm": 1.3124240257866033,
"learning_rate": 4.808534820220299e-06,
"loss": 0.0837,
"step": 1379
},
{
"epoch": 0.6278434940855323,
"grad_norm": 1.7180772149333445,
"learning_rate": 4.8082604400138226e-06,
"loss": 0.1287,
"step": 1380
},
{
"epoch": 0.6282984531392175,
"grad_norm": 1.071227301223936,
"learning_rate": 4.807985871186726e-06,
"loss": 0.0776,
"step": 1381
},
{
"epoch": 0.6287534121929026,
"grad_norm": 1.7108717630459847,
"learning_rate": 4.8077111137614484e-06,
"loss": 0.0991,
"step": 1382
},
{
"epoch": 0.6292083712465878,
"grad_norm": 1.6365913346705507,
"learning_rate": 4.8074361677604394e-06,
"loss": 0.1004,
"step": 1383
},
{
"epoch": 0.629663330300273,
"grad_norm": 1.6392222223495618,
"learning_rate": 4.807161033206168e-06,
"loss": 0.1002,
"step": 1384
},
{
"epoch": 0.6301182893539582,
"grad_norm": 1.687969288374962,
"learning_rate": 4.806885710121114e-06,
"loss": 0.1099,
"step": 1385
},
{
"epoch": 0.6305732484076433,
"grad_norm": 1.4063826448960124,
"learning_rate": 4.806610198527779e-06,
"loss": 0.0896,
"step": 1386
},
{
"epoch": 0.6310282074613285,
"grad_norm": 1.540144583948253,
"learning_rate": 4.8063344984486755e-06,
"loss": 0.0879,
"step": 1387
},
{
"epoch": 0.6314831665150137,
"grad_norm": 1.5064915998503037,
"learning_rate": 4.806058609906331e-06,
"loss": 0.0962,
"step": 1388
},
{
"epoch": 0.6319381255686988,
"grad_norm": 2.1627291975031104,
"learning_rate": 4.805782532923292e-06,
"loss": 0.128,
"step": 1389
},
{
"epoch": 0.632393084622384,
"grad_norm": 1.647216495001309,
"learning_rate": 4.805506267522116e-06,
"loss": 0.1248,
"step": 1390
},
{
"epoch": 0.6328480436760692,
"grad_norm": 1.9302875416620158,
"learning_rate": 4.80522981372538e-06,
"loss": 0.1297,
"step": 1391
},
{
"epoch": 0.6333030027297544,
"grad_norm": 1.1401865771531547,
"learning_rate": 4.804953171555674e-06,
"loss": 0.077,
"step": 1392
},
{
"epoch": 0.6337579617834395,
"grad_norm": 2.3827232130583513,
"learning_rate": 4.8046763410356046e-06,
"loss": 0.1231,
"step": 1393
},
{
"epoch": 0.6342129208371247,
"grad_norm": 2.132009387110179,
"learning_rate": 4.804399322187791e-06,
"loss": 0.1363,
"step": 1394
},
{
"epoch": 0.6346678798908099,
"grad_norm": 1.914550517915578,
"learning_rate": 4.8041221150348725e-06,
"loss": 0.1408,
"step": 1395
},
{
"epoch": 0.635122838944495,
"grad_norm": 1.5194825054621766,
"learning_rate": 4.8038447195995e-06,
"loss": 0.1107,
"step": 1396
},
{
"epoch": 0.6355777979981801,
"grad_norm": 1.6908421741011026,
"learning_rate": 4.80356713590434e-06,
"loss": 0.1057,
"step": 1397
},
{
"epoch": 0.6360327570518653,
"grad_norm": 1.957264325451557,
"learning_rate": 4.803289363972078e-06,
"loss": 0.1279,
"step": 1398
},
{
"epoch": 0.6364877161055505,
"grad_norm": 1.429753125674933,
"learning_rate": 4.8030114038254094e-06,
"loss": 0.0906,
"step": 1399
},
{
"epoch": 0.6369426751592356,
"grad_norm": 1.574683320179916,
"learning_rate": 4.80273325548705e-06,
"loss": 0.0951,
"step": 1400
},
{
"epoch": 0.6373976342129208,
"grad_norm": 1.422366848550457,
"learning_rate": 4.802454918979728e-06,
"loss": 0.0906,
"step": 1401
},
{
"epoch": 0.637852593266606,
"grad_norm": 1.9963358207494448,
"learning_rate": 4.802176394326187e-06,
"loss": 0.1483,
"step": 1402
},
{
"epoch": 0.6383075523202911,
"grad_norm": 1.781860008561357,
"learning_rate": 4.801897681549188e-06,
"loss": 0.0878,
"step": 1403
},
{
"epoch": 0.6387625113739763,
"grad_norm": 1.635142292837631,
"learning_rate": 4.801618780671506e-06,
"loss": 0.1054,
"step": 1404
},
{
"epoch": 0.6392174704276615,
"grad_norm": 1.3235648640664877,
"learning_rate": 4.801339691715932e-06,
"loss": 0.0939,
"step": 1405
},
{
"epoch": 0.6396724294813467,
"grad_norm": 1.2245139670763607,
"learning_rate": 4.8010604147052695e-06,
"loss": 0.0625,
"step": 1406
},
{
"epoch": 0.6401273885350318,
"grad_norm": 1.4675657307946148,
"learning_rate": 4.800780949662343e-06,
"loss": 0.0994,
"step": 1407
},
{
"epoch": 0.640582347588717,
"grad_norm": 1.493372713452032,
"learning_rate": 4.800501296609986e-06,
"loss": 0.0841,
"step": 1408
},
{
"epoch": 0.6410373066424022,
"grad_norm": 1.7340947187812135,
"learning_rate": 4.800221455571053e-06,
"loss": 0.1088,
"step": 1409
},
{
"epoch": 0.6414922656960873,
"grad_norm": 1.2980113793311265,
"learning_rate": 4.7999414265684105e-06,
"loss": 0.0852,
"step": 1410
},
{
"epoch": 0.6419472247497725,
"grad_norm": 1.4464636793664913,
"learning_rate": 4.79966120962494e-06,
"loss": 0.0976,
"step": 1411
},
{
"epoch": 0.6424021838034577,
"grad_norm": 1.4659649640116845,
"learning_rate": 4.799380804763542e-06,
"loss": 0.0901,
"step": 1412
},
{
"epoch": 0.6428571428571429,
"grad_norm": 1.703460078887615,
"learning_rate": 4.799100212007128e-06,
"loss": 0.1074,
"step": 1413
},
{
"epoch": 0.643312101910828,
"grad_norm": 1.3106092828093312,
"learning_rate": 4.7988194313786275e-06,
"loss": 0.0736,
"step": 1414
},
{
"epoch": 0.6437670609645132,
"grad_norm": 0.9724381635858095,
"learning_rate": 4.798538462900984e-06,
"loss": 0.0657,
"step": 1415
},
{
"epoch": 0.6442220200181984,
"grad_norm": 1.3180852195340405,
"learning_rate": 4.798257306597157e-06,
"loss": 0.0791,
"step": 1416
},
{
"epoch": 0.6446769790718835,
"grad_norm": 1.3806990093425773,
"learning_rate": 4.797975962490122e-06,
"loss": 0.102,
"step": 1417
},
{
"epoch": 0.6451319381255687,
"grad_norm": 1.0796594549250105,
"learning_rate": 4.797694430602869e-06,
"loss": 0.0521,
"step": 1418
},
{
"epoch": 0.6455868971792539,
"grad_norm": 1.8299905872463706,
"learning_rate": 4.797412710958405e-06,
"loss": 0.1117,
"step": 1419
},
{
"epoch": 0.6460418562329391,
"grad_norm": 1.7103989898617438,
"learning_rate": 4.797130803579747e-06,
"loss": 0.1034,
"step": 1420
},
{
"epoch": 0.6464968152866242,
"grad_norm": 1.9920043416958193,
"learning_rate": 4.796848708489935e-06,
"loss": 0.1314,
"step": 1421
},
{
"epoch": 0.6469517743403094,
"grad_norm": 1.55952000492946,
"learning_rate": 4.796566425712018e-06,
"loss": 0.1094,
"step": 1422
},
{
"epoch": 0.6474067333939946,
"grad_norm": 1.569073968162044,
"learning_rate": 4.796283955269065e-06,
"loss": 0.1288,
"step": 1423
},
{
"epoch": 0.6478616924476797,
"grad_norm": 1.9345498009875362,
"learning_rate": 4.796001297184156e-06,
"loss": 0.1276,
"step": 1424
},
{
"epoch": 0.6483166515013649,
"grad_norm": 1.996849276778458,
"learning_rate": 4.79571845148039e-06,
"loss": 0.1443,
"step": 1425
},
{
"epoch": 0.6487716105550501,
"grad_norm": 1.1655015182194328,
"learning_rate": 4.795435418180879e-06,
"loss": 0.0895,
"step": 1426
},
{
"epoch": 0.6492265696087353,
"grad_norm": 1.6476688817001566,
"learning_rate": 4.795152197308753e-06,
"loss": 0.0993,
"step": 1427
},
{
"epoch": 0.6496815286624203,
"grad_norm": 1.0099999351331836,
"learning_rate": 4.794868788887154e-06,
"loss": 0.0671,
"step": 1428
},
{
"epoch": 0.6501364877161055,
"grad_norm": 1.8391539690012708,
"learning_rate": 4.79458519293924e-06,
"loss": 0.1345,
"step": 1429
},
{
"epoch": 0.6505914467697907,
"grad_norm": 1.4752057458255263,
"learning_rate": 4.794301409488187e-06,
"loss": 0.0873,
"step": 1430
},
{
"epoch": 0.6510464058234758,
"grad_norm": 1.2943024580621056,
"learning_rate": 4.7940174385571835e-06,
"loss": 0.0802,
"step": 1431
},
{
"epoch": 0.651501364877161,
"grad_norm": 1.3918512180039062,
"learning_rate": 4.793733280169435e-06,
"loss": 0.0993,
"step": 1432
},
{
"epoch": 0.6519563239308462,
"grad_norm": 2.2174420994103574,
"learning_rate": 4.7934489343481614e-06,
"loss": 0.1425,
"step": 1433
},
{
"epoch": 0.6524112829845314,
"grad_norm": 1.774834870886046,
"learning_rate": 4.7931644011165975e-06,
"loss": 0.0982,
"step": 1434
},
{
"epoch": 0.6528662420382165,
"grad_norm": 1.2208864014501382,
"learning_rate": 4.792879680497995e-06,
"loss": 0.0807,
"step": 1435
},
{
"epoch": 0.6533212010919017,
"grad_norm": 1.8182347519697841,
"learning_rate": 4.79259477251562e-06,
"loss": 0.1194,
"step": 1436
},
{
"epoch": 0.6537761601455869,
"grad_norm": 1.8801650010523618,
"learning_rate": 4.792309677192753e-06,
"loss": 0.1326,
"step": 1437
},
{
"epoch": 0.654231119199272,
"grad_norm": 1.776650087976607,
"learning_rate": 4.79202439455269e-06,
"loss": 0.0995,
"step": 1438
},
{
"epoch": 0.6546860782529572,
"grad_norm": 1.2419464528847455,
"learning_rate": 4.791738924618745e-06,
"loss": 0.0819,
"step": 1439
},
{
"epoch": 0.6551410373066424,
"grad_norm": 1.3878814997047564,
"learning_rate": 4.791453267414245e-06,
"loss": 0.077,
"step": 1440
},
{
"epoch": 0.6555959963603276,
"grad_norm": 1.3963850212985605,
"learning_rate": 4.7911674229625316e-06,
"loss": 0.0797,
"step": 1441
},
{
"epoch": 0.6560509554140127,
"grad_norm": 1.9634000929904991,
"learning_rate": 4.790881391286963e-06,
"loss": 0.1173,
"step": 1442
},
{
"epoch": 0.6565059144676979,
"grad_norm": 1.5553330936936114,
"learning_rate": 4.790595172410914e-06,
"loss": 0.099,
"step": 1443
},
{
"epoch": 0.6569608735213831,
"grad_norm": 1.9255393679593797,
"learning_rate": 4.79030876635777e-06,
"loss": 0.1353,
"step": 1444
},
{
"epoch": 0.6574158325750682,
"grad_norm": 1.461167870438619,
"learning_rate": 4.790022173150938e-06,
"loss": 0.1049,
"step": 1445
},
{
"epoch": 0.6578707916287534,
"grad_norm": 1.0062740037097007,
"learning_rate": 4.789735392813835e-06,
"loss": 0.0594,
"step": 1446
},
{
"epoch": 0.6583257506824386,
"grad_norm": 1.4058443933458273,
"learning_rate": 4.789448425369896e-06,
"loss": 0.0872,
"step": 1447
},
{
"epoch": 0.6587807097361238,
"grad_norm": 1.5311615159042697,
"learning_rate": 4.789161270842571e-06,
"loss": 0.0939,
"step": 1448
},
{
"epoch": 0.6592356687898089,
"grad_norm": 1.6595649465936542,
"learning_rate": 4.7888739292553235e-06,
"loss": 0.1248,
"step": 1449
},
{
"epoch": 0.6596906278434941,
"grad_norm": 1.7051412400140817,
"learning_rate": 4.788586400631636e-06,
"loss": 0.1197,
"step": 1450
},
{
"epoch": 0.6601455868971793,
"grad_norm": 1.2115114973668668,
"learning_rate": 4.788298684995003e-06,
"loss": 0.0905,
"step": 1451
},
{
"epoch": 0.6606005459508644,
"grad_norm": 1.4239694731611245,
"learning_rate": 4.7880107823689355e-06,
"loss": 0.0801,
"step": 1452
},
{
"epoch": 0.6610555050045496,
"grad_norm": 1.5925606772355265,
"learning_rate": 4.787722692776958e-06,
"loss": 0.1183,
"step": 1453
},
{
"epoch": 0.6615104640582348,
"grad_norm": 1.7931970729363222,
"learning_rate": 4.787434416242615e-06,
"loss": 0.1189,
"step": 1454
},
{
"epoch": 0.66196542311192,
"grad_norm": 2.3171059544303874,
"learning_rate": 4.787145952789461e-06,
"loss": 0.1436,
"step": 1455
},
{
"epoch": 0.6624203821656051,
"grad_norm": 1.4441484331538328,
"learning_rate": 4.786857302441069e-06,
"loss": 0.0781,
"step": 1456
},
{
"epoch": 0.6628753412192903,
"grad_norm": 1.690439275216053,
"learning_rate": 4.786568465221025e-06,
"loss": 0.1111,
"step": 1457
},
{
"epoch": 0.6633303002729755,
"grad_norm": 1.6812302333143159,
"learning_rate": 4.7862794411529315e-06,
"loss": 0.1175,
"step": 1458
},
{
"epoch": 0.6637852593266605,
"grad_norm": 1.9541579133281037,
"learning_rate": 4.7859902302604075e-06,
"loss": 0.1329,
"step": 1459
},
{
"epoch": 0.6642402183803457,
"grad_norm": 1.8591409223722424,
"learning_rate": 4.785700832567085e-06,
"loss": 0.1211,
"step": 1460
},
{
"epoch": 0.664695177434031,
"grad_norm": 1.325162611861324,
"learning_rate": 4.785411248096613e-06,
"loss": 0.0743,
"step": 1461
},
{
"epoch": 0.6651501364877161,
"grad_norm": 1.3065112220161235,
"learning_rate": 4.785121476872654e-06,
"loss": 0.1034,
"step": 1462
},
{
"epoch": 0.6656050955414012,
"grad_norm": 1.5925894626386907,
"learning_rate": 4.784831518918888e-06,
"loss": 0.1196,
"step": 1463
},
{
"epoch": 0.6660600545950864,
"grad_norm": 1.1820283205821733,
"learning_rate": 4.784541374259008e-06,
"loss": 0.0769,
"step": 1464
},
{
"epoch": 0.6665150136487716,
"grad_norm": 1.571736758093102,
"learning_rate": 4.7842510429167244e-06,
"loss": 0.1,
"step": 1465
},
{
"epoch": 0.6669699727024567,
"grad_norm": 1.5876822973446192,
"learning_rate": 4.783960524915761e-06,
"loss": 0.1214,
"step": 1466
},
{
"epoch": 0.6674249317561419,
"grad_norm": 1.5160576603586384,
"learning_rate": 4.783669820279858e-06,
"loss": 0.0979,
"step": 1467
},
{
"epoch": 0.6678798908098271,
"grad_norm": 1.2434477128547956,
"learning_rate": 4.783378929032769e-06,
"loss": 0.0824,
"step": 1468
},
{
"epoch": 0.6683348498635123,
"grad_norm": 1.46291955617626,
"learning_rate": 4.783087851198267e-06,
"loss": 0.0942,
"step": 1469
},
{
"epoch": 0.6687898089171974,
"grad_norm": 1.7951492565076614,
"learning_rate": 4.7827965868001356e-06,
"loss": 0.1192,
"step": 1470
},
{
"epoch": 0.6692447679708826,
"grad_norm": 1.4406289448080234,
"learning_rate": 4.782505135862176e-06,
"loss": 0.1009,
"step": 1471
},
{
"epoch": 0.6696997270245678,
"grad_norm": 1.4538780681359404,
"learning_rate": 4.782213498408205e-06,
"loss": 0.1012,
"step": 1472
},
{
"epoch": 0.6701546860782529,
"grad_norm": 1.4490300401257787,
"learning_rate": 4.781921674462053e-06,
"loss": 0.0782,
"step": 1473
},
{
"epoch": 0.6706096451319381,
"grad_norm": 1.8860995116874109,
"learning_rate": 4.781629664047566e-06,
"loss": 0.1148,
"step": 1474
},
{
"epoch": 0.6710646041856233,
"grad_norm": 1.3918036510588907,
"learning_rate": 4.781337467188607e-06,
"loss": 0.1025,
"step": 1475
},
{
"epoch": 0.6715195632393085,
"grad_norm": 2.3859380054935344,
"learning_rate": 4.781045083909053e-06,
"loss": 0.1219,
"step": 1476
},
{
"epoch": 0.6719745222929936,
"grad_norm": 1.9401784591368603,
"learning_rate": 4.780752514232796e-06,
"loss": 0.1022,
"step": 1477
},
{
"epoch": 0.6724294813466788,
"grad_norm": 1.374892200929808,
"learning_rate": 4.780459758183743e-06,
"loss": 0.0896,
"step": 1478
},
{
"epoch": 0.672884440400364,
"grad_norm": 1.4250914966637114,
"learning_rate": 4.780166815785817e-06,
"loss": 0.0907,
"step": 1479
},
{
"epoch": 0.6733393994540491,
"grad_norm": 1.3888650548243648,
"learning_rate": 4.7798736870629554e-06,
"loss": 0.1102,
"step": 1480
},
{
"epoch": 0.6737943585077343,
"grad_norm": 1.5225956652456023,
"learning_rate": 4.779580372039113e-06,
"loss": 0.0809,
"step": 1481
},
{
"epoch": 0.6742493175614195,
"grad_norm": 2.133500594182355,
"learning_rate": 4.779286870738256e-06,
"loss": 0.1069,
"step": 1482
},
{
"epoch": 0.6747042766151047,
"grad_norm": 1.6417529269403512,
"learning_rate": 4.778993183184371e-06,
"loss": 0.0879,
"step": 1483
},
{
"epoch": 0.6751592356687898,
"grad_norm": 2.188184230975794,
"learning_rate": 4.778699309401453e-06,
"loss": 0.1196,
"step": 1484
},
{
"epoch": 0.675614194722475,
"grad_norm": 1.366654497975806,
"learning_rate": 4.7784052494135195e-06,
"loss": 0.0952,
"step": 1485
},
{
"epoch": 0.6760691537761602,
"grad_norm": 2.2251300669835734,
"learning_rate": 4.778111003244596e-06,
"loss": 0.0962,
"step": 1486
},
{
"epoch": 0.6765241128298453,
"grad_norm": 1.2239477453163228,
"learning_rate": 4.777816570918731e-06,
"loss": 0.0771,
"step": 1487
},
{
"epoch": 0.6769790718835305,
"grad_norm": 1.4442063624509236,
"learning_rate": 4.777521952459982e-06,
"loss": 0.0881,
"step": 1488
},
{
"epoch": 0.6774340309372157,
"grad_norm": 1.792892312265488,
"learning_rate": 4.777227147892424e-06,
"loss": 0.108,
"step": 1489
},
{
"epoch": 0.6778889899909009,
"grad_norm": 1.5848897809985478,
"learning_rate": 4.776932157240147e-06,
"loss": 0.0973,
"step": 1490
},
{
"epoch": 0.678343949044586,
"grad_norm": 1.5924788947742,
"learning_rate": 4.776636980527257e-06,
"loss": 0.0997,
"step": 1491
},
{
"epoch": 0.6787989080982711,
"grad_norm": 1.7689554235448024,
"learning_rate": 4.776341617777874e-06,
"loss": 0.0907,
"step": 1492
},
{
"epoch": 0.6792538671519563,
"grad_norm": 1.561936690334899,
"learning_rate": 4.776046069016135e-06,
"loss": 0.1045,
"step": 1493
},
{
"epoch": 0.6797088262056415,
"grad_norm": 1.978023029084926,
"learning_rate": 4.775750334266188e-06,
"loss": 0.1316,
"step": 1494
},
{
"epoch": 0.6801637852593266,
"grad_norm": 1.2221171400180673,
"learning_rate": 4.775454413552202e-06,
"loss": 0.0708,
"step": 1495
},
{
"epoch": 0.6806187443130118,
"grad_norm": 2.2916692264154848,
"learning_rate": 4.775158306898358e-06,
"loss": 0.1045,
"step": 1496
},
{
"epoch": 0.681073703366697,
"grad_norm": 1.5270730953843772,
"learning_rate": 4.774862014328849e-06,
"loss": 0.087,
"step": 1497
},
{
"epoch": 0.6815286624203821,
"grad_norm": 1.5001501033936573,
"learning_rate": 4.774565535867892e-06,
"loss": 0.083,
"step": 1498
},
{
"epoch": 0.6819836214740673,
"grad_norm": 2.228962091730558,
"learning_rate": 4.77426887153971e-06,
"loss": 0.132,
"step": 1499
},
{
"epoch": 0.6824385805277525,
"grad_norm": 1.7756631467911705,
"learning_rate": 4.773972021368546e-06,
"loss": 0.1156,
"step": 1500
},
{
"epoch": 0.6828935395814377,
"grad_norm": 1.9028113721779674,
"learning_rate": 4.773674985378658e-06,
"loss": 0.1692,
"step": 1501
},
{
"epoch": 0.6833484986351228,
"grad_norm": 1.591856567558633,
"learning_rate": 4.773377763594319e-06,
"loss": 0.0829,
"step": 1502
},
{
"epoch": 0.683803457688808,
"grad_norm": 1.7330424169213765,
"learning_rate": 4.773080356039814e-06,
"loss": 0.1079,
"step": 1503
},
{
"epoch": 0.6842584167424932,
"grad_norm": 1.3093378510726064,
"learning_rate": 4.772782762739448e-06,
"loss": 0.0919,
"step": 1504
},
{
"epoch": 0.6847133757961783,
"grad_norm": 1.5644465201102973,
"learning_rate": 4.772484983717539e-06,
"loss": 0.096,
"step": 1505
},
{
"epoch": 0.6851683348498635,
"grad_norm": 1.7535246249527565,
"learning_rate": 4.77218701899842e-06,
"loss": 0.1025,
"step": 1506
},
{
"epoch": 0.6856232939035487,
"grad_norm": 1.682557507776212,
"learning_rate": 4.771888868606438e-06,
"loss": 0.1245,
"step": 1507
},
{
"epoch": 0.6860782529572339,
"grad_norm": 1.1063626083550568,
"learning_rate": 4.771590532565957e-06,
"loss": 0.0628,
"step": 1508
},
{
"epoch": 0.686533212010919,
"grad_norm": 1.447485907138006,
"learning_rate": 4.771292010901357e-06,
"loss": 0.0756,
"step": 1509
},
{
"epoch": 0.6869881710646042,
"grad_norm": 1.9968564435349099,
"learning_rate": 4.77099330363703e-06,
"loss": 0.1121,
"step": 1510
},
{
"epoch": 0.6874431301182894,
"grad_norm": 1.331414088559165,
"learning_rate": 4.770694410797387e-06,
"loss": 0.0918,
"step": 1511
},
{
"epoch": 0.6878980891719745,
"grad_norm": 1.7374051988448433,
"learning_rate": 4.770395332406851e-06,
"loss": 0.1046,
"step": 1512
},
{
"epoch": 0.6883530482256597,
"grad_norm": 1.5590482284052172,
"learning_rate": 4.770096068489861e-06,
"loss": 0.1045,
"step": 1513
},
{
"epoch": 0.6888080072793449,
"grad_norm": 1.2266167614387768,
"learning_rate": 4.769796619070872e-06,
"loss": 0.0877,
"step": 1514
},
{
"epoch": 0.6892629663330301,
"grad_norm": 1.1438287132644533,
"learning_rate": 4.769496984174353e-06,
"loss": 0.0759,
"step": 1515
},
{
"epoch": 0.6897179253867152,
"grad_norm": 1.5191110521315079,
"learning_rate": 4.769197163824791e-06,
"loss": 0.0839,
"step": 1516
},
{
"epoch": 0.6901728844404004,
"grad_norm": 1.5352637302100918,
"learning_rate": 4.768897158046683e-06,
"loss": 0.0927,
"step": 1517
},
{
"epoch": 0.6906278434940856,
"grad_norm": 1.224151460496261,
"learning_rate": 4.768596966864546e-06,
"loss": 0.0758,
"step": 1518
},
{
"epoch": 0.6910828025477707,
"grad_norm": 2.097275342036678,
"learning_rate": 4.76829659030291e-06,
"loss": 0.1606,
"step": 1519
},
{
"epoch": 0.6915377616014559,
"grad_norm": 1.773445388033648,
"learning_rate": 4.767996028386319e-06,
"loss": 0.1071,
"step": 1520
},
{
"epoch": 0.6919927206551411,
"grad_norm": 1.798404416562804,
"learning_rate": 4.767695281139336e-06,
"loss": 0.0882,
"step": 1521
},
{
"epoch": 0.6924476797088263,
"grad_norm": 1.6643609283655776,
"learning_rate": 4.767394348586535e-06,
"loss": 0.0986,
"step": 1522
},
{
"epoch": 0.6929026387625113,
"grad_norm": 1.8351458616302123,
"learning_rate": 4.767093230752507e-06,
"loss": 0.1398,
"step": 1523
},
{
"epoch": 0.6933575978161965,
"grad_norm": 1.695947028633324,
"learning_rate": 4.766791927661859e-06,
"loss": 0.1217,
"step": 1524
},
{
"epoch": 0.6938125568698817,
"grad_norm": 1.706097971198418,
"learning_rate": 4.766490439339211e-06,
"loss": 0.0852,
"step": 1525
},
{
"epoch": 0.6942675159235668,
"grad_norm": 1.6641835764066073,
"learning_rate": 4.7661887658092e-06,
"loss": 0.1078,
"step": 1526
},
{
"epoch": 0.694722474977252,
"grad_norm": 1.4721263946542316,
"learning_rate": 4.765886907096477e-06,
"loss": 0.1046,
"step": 1527
},
{
"epoch": 0.6951774340309372,
"grad_norm": 1.7677748922664356,
"learning_rate": 4.7655848632257084e-06,
"loss": 0.1257,
"step": 1528
},
{
"epoch": 0.6956323930846224,
"grad_norm": 1.5849838243983163,
"learning_rate": 4.7652826342215764e-06,
"loss": 0.113,
"step": 1529
},
{
"epoch": 0.6960873521383075,
"grad_norm": 1.8167247958495556,
"learning_rate": 4.764980220108777e-06,
"loss": 0.1308,
"step": 1530
},
{
"epoch": 0.6965423111919927,
"grad_norm": 2.259597776447737,
"learning_rate": 4.764677620912022e-06,
"loss": 0.1488,
"step": 1531
},
{
"epoch": 0.6969972702456779,
"grad_norm": 1.3871244274886438,
"learning_rate": 4.764374836656041e-06,
"loss": 0.1014,
"step": 1532
},
{
"epoch": 0.697452229299363,
"grad_norm": 1.261518456907349,
"learning_rate": 4.764071867365571e-06,
"loss": 0.0998,
"step": 1533
},
{
"epoch": 0.6979071883530482,
"grad_norm": 1.7720377742538196,
"learning_rate": 4.763768713065375e-06,
"loss": 0.1003,
"step": 1534
},
{
"epoch": 0.6983621474067334,
"grad_norm": 1.9316342411609453,
"learning_rate": 4.763465373780223e-06,
"loss": 0.1218,
"step": 1535
},
{
"epoch": 0.6988171064604186,
"grad_norm": 1.7090441393124594,
"learning_rate": 4.763161849534902e-06,
"loss": 0.1016,
"step": 1536
},
{
"epoch": 0.6992720655141037,
"grad_norm": 1.0772372058571478,
"learning_rate": 4.762858140354214e-06,
"loss": 0.0795,
"step": 1537
},
{
"epoch": 0.6997270245677889,
"grad_norm": 1.5989783419371975,
"learning_rate": 4.7625542462629785e-06,
"loss": 0.1051,
"step": 1538
},
{
"epoch": 0.7001819836214741,
"grad_norm": 0.9329076652331691,
"learning_rate": 4.762250167286027e-06,
"loss": 0.0492,
"step": 1539
},
{
"epoch": 0.7006369426751592,
"grad_norm": 1.7557978189042716,
"learning_rate": 4.761945903448209e-06,
"loss": 0.1336,
"step": 1540
},
{
"epoch": 0.7010919017288444,
"grad_norm": 1.1252616618728841,
"learning_rate": 4.761641454774386e-06,
"loss": 0.085,
"step": 1541
},
{
"epoch": 0.7015468607825296,
"grad_norm": 1.9520354546929128,
"learning_rate": 4.761336821289436e-06,
"loss": 0.158,
"step": 1542
},
{
"epoch": 0.7020018198362148,
"grad_norm": 1.088110444545801,
"learning_rate": 4.761032003018254e-06,
"loss": 0.0667,
"step": 1543
},
{
"epoch": 0.7024567788898999,
"grad_norm": 1.353551986968956,
"learning_rate": 4.760726999985748e-06,
"loss": 0.0748,
"step": 1544
},
{
"epoch": 0.7029117379435851,
"grad_norm": 1.2483430565784006,
"learning_rate": 4.7604218122168406e-06,
"loss": 0.0821,
"step": 1545
},
{
"epoch": 0.7033666969972703,
"grad_norm": 2.014581699156683,
"learning_rate": 4.760116439736471e-06,
"loss": 0.1376,
"step": 1546
},
{
"epoch": 0.7038216560509554,
"grad_norm": 2.2990546871467386,
"learning_rate": 4.759810882569591e-06,
"loss": 0.1528,
"step": 1547
},
{
"epoch": 0.7042766151046406,
"grad_norm": 1.062682027844058,
"learning_rate": 4.759505140741172e-06,
"loss": 0.0646,
"step": 1548
},
{
"epoch": 0.7047315741583258,
"grad_norm": 2.1924162550625863,
"learning_rate": 4.759199214276196e-06,
"loss": 0.1277,
"step": 1549
},
{
"epoch": 0.705186533212011,
"grad_norm": 1.4339312162219853,
"learning_rate": 4.758893103199665e-06,
"loss": 0.1056,
"step": 1550
},
{
"epoch": 0.7056414922656961,
"grad_norm": 1.6814902406994063,
"learning_rate": 4.758586807536588e-06,
"loss": 0.0968,
"step": 1551
},
{
"epoch": 0.7060964513193813,
"grad_norm": 1.055808036587697,
"learning_rate": 4.758280327311998e-06,
"loss": 0.0624,
"step": 1552
},
{
"epoch": 0.7065514103730665,
"grad_norm": 2.092612313664783,
"learning_rate": 4.757973662550938e-06,
"loss": 0.1076,
"step": 1553
},
{
"epoch": 0.7070063694267515,
"grad_norm": 1.2099784449421243,
"learning_rate": 4.757666813278466e-06,
"loss": 0.0791,
"step": 1554
},
{
"epoch": 0.7074613284804367,
"grad_norm": 1.7701219392423706,
"learning_rate": 4.757359779519659e-06,
"loss": 0.1158,
"step": 1555
},
{
"epoch": 0.707916287534122,
"grad_norm": 1.9442818433331057,
"learning_rate": 4.757052561299604e-06,
"loss": 0.1498,
"step": 1556
},
{
"epoch": 0.7083712465878071,
"grad_norm": 2.1934930579734417,
"learning_rate": 4.756745158643407e-06,
"loss": 0.1446,
"step": 1557
},
{
"epoch": 0.7088262056414922,
"grad_norm": 1.852211386061071,
"learning_rate": 4.7564375715761865e-06,
"loss": 0.1163,
"step": 1558
},
{
"epoch": 0.7092811646951774,
"grad_norm": 0.8096640629799587,
"learning_rate": 4.756129800123078e-06,
"loss": 0.0398,
"step": 1559
},
{
"epoch": 0.7097361237488626,
"grad_norm": 1.414444864803518,
"learning_rate": 4.755821844309232e-06,
"loss": 0.1126,
"step": 1560
},
{
"epoch": 0.7101910828025477,
"grad_norm": 1.598441885528022,
"learning_rate": 4.75551370415981e-06,
"loss": 0.1008,
"step": 1561
},
{
"epoch": 0.7106460418562329,
"grad_norm": 1.7052656116179543,
"learning_rate": 4.755205379699996e-06,
"loss": 0.105,
"step": 1562
},
{
"epoch": 0.7111010009099181,
"grad_norm": 1.570140158085679,
"learning_rate": 4.75489687095498e-06,
"loss": 0.103,
"step": 1563
},
{
"epoch": 0.7115559599636033,
"grad_norm": 1.6128979312038125,
"learning_rate": 4.754588177949977e-06,
"loss": 0.0947,
"step": 1564
},
{
"epoch": 0.7120109190172884,
"grad_norm": 1.5157416875909306,
"learning_rate": 4.7542793007102086e-06,
"loss": 0.0826,
"step": 1565
},
{
"epoch": 0.7124658780709736,
"grad_norm": 1.7615482286425264,
"learning_rate": 4.7539702392609165e-06,
"loss": 0.1349,
"step": 1566
},
{
"epoch": 0.7129208371246588,
"grad_norm": 1.1762971295347604,
"learning_rate": 4.753660993627356e-06,
"loss": 0.0649,
"step": 1567
},
{
"epoch": 0.7133757961783439,
"grad_norm": 2.155472421625263,
"learning_rate": 4.753351563834795e-06,
"loss": 0.1308,
"step": 1568
},
{
"epoch": 0.7138307552320291,
"grad_norm": 1.7676905218706818,
"learning_rate": 4.753041949908521e-06,
"loss": 0.1034,
"step": 1569
},
{
"epoch": 0.7142857142857143,
"grad_norm": 1.8250626593850294,
"learning_rate": 4.752732151873834e-06,
"loss": 0.1,
"step": 1570
},
{
"epoch": 0.7147406733393995,
"grad_norm": 1.4984330035047126,
"learning_rate": 4.752422169756048e-06,
"loss": 0.1107,
"step": 1571
},
{
"epoch": 0.7151956323930846,
"grad_norm": 1.2161952645703746,
"learning_rate": 4.752112003580495e-06,
"loss": 0.0772,
"step": 1572
},
{
"epoch": 0.7156505914467698,
"grad_norm": 1.8268634010084226,
"learning_rate": 4.751801653372518e-06,
"loss": 0.0853,
"step": 1573
},
{
"epoch": 0.716105550500455,
"grad_norm": 1.6855455239576989,
"learning_rate": 4.751491119157481e-06,
"loss": 0.1055,
"step": 1574
},
{
"epoch": 0.7165605095541401,
"grad_norm": 1.1214993041730539,
"learning_rate": 4.751180400960756e-06,
"loss": 0.0653,
"step": 1575
},
{
"epoch": 0.7170154686078253,
"grad_norm": 1.8475828844832658,
"learning_rate": 4.7508694988077355e-06,
"loss": 0.1416,
"step": 1576
},
{
"epoch": 0.7174704276615105,
"grad_norm": 1.4469787222461497,
"learning_rate": 4.750558412723824e-06,
"loss": 0.0766,
"step": 1577
},
{
"epoch": 0.7179253867151957,
"grad_norm": 1.6682547194818422,
"learning_rate": 4.750247142734442e-06,
"loss": 0.073,
"step": 1578
},
{
"epoch": 0.7183803457688808,
"grad_norm": 1.8235039708297685,
"learning_rate": 4.749935688865026e-06,
"loss": 0.1299,
"step": 1579
},
{
"epoch": 0.718835304822566,
"grad_norm": 1.2674959382982702,
"learning_rate": 4.749624051141026e-06,
"loss": 0.0639,
"step": 1580
},
{
"epoch": 0.7192902638762512,
"grad_norm": 1.1814301599394401,
"learning_rate": 4.7493122295879076e-06,
"loss": 0.074,
"step": 1581
},
{
"epoch": 0.7197452229299363,
"grad_norm": 1.8607689058316668,
"learning_rate": 4.7490002242311525e-06,
"loss": 0.1202,
"step": 1582
},
{
"epoch": 0.7202001819836215,
"grad_norm": 1.40248476110639,
"learning_rate": 4.748688035096255e-06,
"loss": 0.0831,
"step": 1583
},
{
"epoch": 0.7206551410373067,
"grad_norm": 1.376835864910441,
"learning_rate": 4.748375662208726e-06,
"loss": 0.0627,
"step": 1584
},
{
"epoch": 0.7211101000909919,
"grad_norm": 1.7445419287373105,
"learning_rate": 4.748063105594092e-06,
"loss": 0.1182,
"step": 1585
},
{
"epoch": 0.721565059144677,
"grad_norm": 1.6298546358892563,
"learning_rate": 4.747750365277892e-06,
"loss": 0.1203,
"step": 1586
},
{
"epoch": 0.7220200181983621,
"grad_norm": 1.78857652271692,
"learning_rate": 4.747437441285684e-06,
"loss": 0.0845,
"step": 1587
},
{
"epoch": 0.7224749772520473,
"grad_norm": 1.5543624854659128,
"learning_rate": 4.747124333643038e-06,
"loss": 0.1067,
"step": 1588
},
{
"epoch": 0.7229299363057324,
"grad_norm": 1.6938973264546118,
"learning_rate": 4.746811042375538e-06,
"loss": 0.1092,
"step": 1589
},
{
"epoch": 0.7233848953594176,
"grad_norm": 1.4339359801015907,
"learning_rate": 4.746497567508787e-06,
"loss": 0.1009,
"step": 1590
},
{
"epoch": 0.7238398544131028,
"grad_norm": 1.370915821139941,
"learning_rate": 4.7461839090684e-06,
"loss": 0.0967,
"step": 1591
},
{
"epoch": 0.724294813466788,
"grad_norm": 1.65404522408881,
"learning_rate": 4.745870067080007e-06,
"loss": 0.0936,
"step": 1592
},
{
"epoch": 0.7247497725204731,
"grad_norm": 2.5744395171768026,
"learning_rate": 4.7455560415692545e-06,
"loss": 0.1734,
"step": 1593
},
{
"epoch": 0.7252047315741583,
"grad_norm": 1.6130757907987123,
"learning_rate": 4.745241832561803e-06,
"loss": 0.0782,
"step": 1594
},
{
"epoch": 0.7256596906278435,
"grad_norm": 1.3264278567683987,
"learning_rate": 4.744927440083329e-06,
"loss": 0.0883,
"step": 1595
},
{
"epoch": 0.7261146496815286,
"grad_norm": 1.4845169251283168,
"learning_rate": 4.744612864159522e-06,
"loss": 0.0866,
"step": 1596
},
{
"epoch": 0.7265696087352138,
"grad_norm": 1.867201501230081,
"learning_rate": 4.7442981048160895e-06,
"loss": 0.1239,
"step": 1597
},
{
"epoch": 0.727024567788899,
"grad_norm": 1.5395932028522379,
"learning_rate": 4.74398316207875e-06,
"loss": 0.0937,
"step": 1598
},
{
"epoch": 0.7274795268425842,
"grad_norm": 2.47394198911153,
"learning_rate": 4.74366803597324e-06,
"loss": 0.1896,
"step": 1599
},
{
"epoch": 0.7279344858962693,
"grad_norm": 1.6788148875306355,
"learning_rate": 4.743352726525311e-06,
"loss": 0.1001,
"step": 1600
},
{
"epoch": 0.7283894449499545,
"grad_norm": 1.1785705121541328,
"learning_rate": 4.743037233760728e-06,
"loss": 0.0723,
"step": 1601
},
{
"epoch": 0.7288444040036397,
"grad_norm": 1.5889628523330563,
"learning_rate": 4.742721557705271e-06,
"loss": 0.0978,
"step": 1602
},
{
"epoch": 0.7292993630573248,
"grad_norm": 1.3734642738638374,
"learning_rate": 4.7424056983847374e-06,
"loss": 0.0961,
"step": 1603
},
{
"epoch": 0.72975432211101,
"grad_norm": 1.6433399521175855,
"learning_rate": 4.7420896558249366e-06,
"loss": 0.1037,
"step": 1604
},
{
"epoch": 0.7302092811646952,
"grad_norm": 1.0189389361932368,
"learning_rate": 4.741773430051694e-06,
"loss": 0.0571,
"step": 1605
},
{
"epoch": 0.7306642402183804,
"grad_norm": 1.8326786415176635,
"learning_rate": 4.74145702109085e-06,
"loss": 0.1069,
"step": 1606
},
{
"epoch": 0.7311191992720655,
"grad_norm": 1.9145632983548877,
"learning_rate": 4.741140428968261e-06,
"loss": 0.1155,
"step": 1607
},
{
"epoch": 0.7315741583257507,
"grad_norm": 0.8975672007604479,
"learning_rate": 4.740823653709797e-06,
"loss": 0.0594,
"step": 1608
},
{
"epoch": 0.7320291173794359,
"grad_norm": 1.1104882324072687,
"learning_rate": 4.740506695341343e-06,
"loss": 0.0774,
"step": 1609
},
{
"epoch": 0.732484076433121,
"grad_norm": 1.8804023117943707,
"learning_rate": 4.740189553888801e-06,
"loss": 0.1265,
"step": 1610
},
{
"epoch": 0.7329390354868062,
"grad_norm": 1.3783166591523974,
"learning_rate": 4.739872229378085e-06,
"loss": 0.0849,
"step": 1611
},
{
"epoch": 0.7333939945404914,
"grad_norm": 1.5383875985636057,
"learning_rate": 4.739554721835125e-06,
"loss": 0.0764,
"step": 1612
},
{
"epoch": 0.7338489535941766,
"grad_norm": 1.7836575489679842,
"learning_rate": 4.739237031285867e-06,
"loss": 0.1208,
"step": 1613
},
{
"epoch": 0.7343039126478617,
"grad_norm": 2.0374287466508343,
"learning_rate": 4.738919157756272e-06,
"loss": 0.1283,
"step": 1614
},
{
"epoch": 0.7347588717015469,
"grad_norm": 1.4713023421634537,
"learning_rate": 4.738601101272313e-06,
"loss": 0.1143,
"step": 1615
},
{
"epoch": 0.7352138307552321,
"grad_norm": 1.3004252033026868,
"learning_rate": 4.738282861859983e-06,
"loss": 0.0785,
"step": 1616
},
{
"epoch": 0.7356687898089171,
"grad_norm": 1.7078107635335555,
"learning_rate": 4.737964439545284e-06,
"loss": 0.0989,
"step": 1617
},
{
"epoch": 0.7361237488626023,
"grad_norm": 1.482235192071265,
"learning_rate": 4.737645834354238e-06,
"loss": 0.0889,
"step": 1618
},
{
"epoch": 0.7365787079162875,
"grad_norm": 1.3632184750760454,
"learning_rate": 4.737327046312879e-06,
"loss": 0.0728,
"step": 1619
},
{
"epoch": 0.7370336669699727,
"grad_norm": 1.6185932631828381,
"learning_rate": 4.737008075447259e-06,
"loss": 0.0967,
"step": 1620
},
{
"epoch": 0.7374886260236578,
"grad_norm": 1.7060869720795129,
"learning_rate": 4.73668892178344e-06,
"loss": 0.1054,
"step": 1621
},
{
"epoch": 0.737943585077343,
"grad_norm": 1.672488053873089,
"learning_rate": 4.736369585347503e-06,
"loss": 0.1172,
"step": 1622
},
{
"epoch": 0.7383985441310282,
"grad_norm": 2.009207481858011,
"learning_rate": 4.736050066165544e-06,
"loss": 0.1104,
"step": 1623
},
{
"epoch": 0.7388535031847133,
"grad_norm": 1.7386909135986017,
"learning_rate": 4.735730364263671e-06,
"loss": 0.1142,
"step": 1624
},
{
"epoch": 0.7393084622383985,
"grad_norm": 1.6299431755796778,
"learning_rate": 4.735410479668009e-06,
"loss": 0.109,
"step": 1625
},
{
"epoch": 0.7397634212920837,
"grad_norm": 1.5971057123643035,
"learning_rate": 4.735090412404697e-06,
"loss": 0.1037,
"step": 1626
},
{
"epoch": 0.7402183803457689,
"grad_norm": 1.4066558803560258,
"learning_rate": 4.734770162499891e-06,
"loss": 0.0718,
"step": 1627
},
{
"epoch": 0.740673339399454,
"grad_norm": 1.3437849408188942,
"learning_rate": 4.734449729979759e-06,
"loss": 0.0775,
"step": 1628
},
{
"epoch": 0.7411282984531392,
"grad_norm": 1.8126383722195984,
"learning_rate": 4.734129114870486e-06,
"loss": 0.1097,
"step": 1629
},
{
"epoch": 0.7415832575068244,
"grad_norm": 1.7276681892706887,
"learning_rate": 4.733808317198271e-06,
"loss": 0.075,
"step": 1630
},
{
"epoch": 0.7420382165605095,
"grad_norm": 1.4303092464154914,
"learning_rate": 4.733487336989327e-06,
"loss": 0.0839,
"step": 1631
},
{
"epoch": 0.7424931756141947,
"grad_norm": 1.8755052783018096,
"learning_rate": 4.733166174269886e-06,
"loss": 0.1156,
"step": 1632
},
{
"epoch": 0.7429481346678799,
"grad_norm": 1.4937298948438007,
"learning_rate": 4.732844829066189e-06,
"loss": 0.1005,
"step": 1633
},
{
"epoch": 0.7434030937215651,
"grad_norm": 1.641256737556786,
"learning_rate": 4.732523301404497e-06,
"loss": 0.1038,
"step": 1634
},
{
"epoch": 0.7438580527752502,
"grad_norm": 1.8968655868657809,
"learning_rate": 4.732201591311082e-06,
"loss": 0.1318,
"step": 1635
},
{
"epoch": 0.7443130118289354,
"grad_norm": 1.5647661977098755,
"learning_rate": 4.731879698812233e-06,
"loss": 0.1295,
"step": 1636
},
{
"epoch": 0.7447679708826206,
"grad_norm": 1.3130665672457837,
"learning_rate": 4.731557623934255e-06,
"loss": 0.0797,
"step": 1637
},
{
"epoch": 0.7452229299363057,
"grad_norm": 2.2524036787204236,
"learning_rate": 4.7312353667034645e-06,
"loss": 0.1549,
"step": 1638
},
{
"epoch": 0.7456778889899909,
"grad_norm": 2.171706574250327,
"learning_rate": 4.730912927146197e-06,
"loss": 0.1221,
"step": 1639
},
{
"epoch": 0.7461328480436761,
"grad_norm": 1.3055559061415911,
"learning_rate": 4.7305903052888e-06,
"loss": 0.0797,
"step": 1640
},
{
"epoch": 0.7465878070973613,
"grad_norm": 1.9092438244747783,
"learning_rate": 4.730267501157636e-06,
"loss": 0.1211,
"step": 1641
},
{
"epoch": 0.7470427661510464,
"grad_norm": 1.3873103303782754,
"learning_rate": 4.729944514779084e-06,
"loss": 0.0863,
"step": 1642
},
{
"epoch": 0.7474977252047316,
"grad_norm": 1.3769315994876887,
"learning_rate": 4.729621346179536e-06,
"loss": 0.095,
"step": 1643
},
{
"epoch": 0.7479526842584168,
"grad_norm": 1.3309888167219324,
"learning_rate": 4.7292979953854e-06,
"loss": 0.091,
"step": 1644
},
{
"epoch": 0.7484076433121019,
"grad_norm": 1.3388937850633889,
"learning_rate": 4.7289744624231004e-06,
"loss": 0.0715,
"step": 1645
},
{
"epoch": 0.7488626023657871,
"grad_norm": 2.9889212809141026,
"learning_rate": 4.728650747319073e-06,
"loss": 0.1403,
"step": 1646
},
{
"epoch": 0.7493175614194723,
"grad_norm": 1.7436207494414042,
"learning_rate": 4.728326850099771e-06,
"loss": 0.11,
"step": 1647
},
{
"epoch": 0.7497725204731575,
"grad_norm": 1.2990437768947476,
"learning_rate": 4.728002770791663e-06,
"loss": 0.0982,
"step": 1648
},
{
"epoch": 0.7502274795268425,
"grad_norm": 1.344045724677696,
"learning_rate": 4.727678509421229e-06,
"loss": 0.0922,
"step": 1649
},
{
"epoch": 0.7506824385805277,
"grad_norm": 1.1045854705826224,
"learning_rate": 4.727354066014968e-06,
"loss": 0.0704,
"step": 1650
},
{
"epoch": 0.7511373976342129,
"grad_norm": 1.5988720844668791,
"learning_rate": 4.727029440599391e-06,
"loss": 0.1066,
"step": 1651
},
{
"epoch": 0.7515923566878981,
"grad_norm": 1.3512878420396681,
"learning_rate": 4.726704633201025e-06,
"loss": 0.074,
"step": 1652
},
{
"epoch": 0.7520473157415832,
"grad_norm": 1.669678273086279,
"learning_rate": 4.726379643846412e-06,
"loss": 0.1167,
"step": 1653
},
{
"epoch": 0.7525022747952684,
"grad_norm": 1.8860050110009976,
"learning_rate": 4.726054472562109e-06,
"loss": 0.1203,
"step": 1654
},
{
"epoch": 0.7529572338489536,
"grad_norm": 1.1328199081442367,
"learning_rate": 4.725729119374687e-06,
"loss": 0.0715,
"step": 1655
},
{
"epoch": 0.7534121929026387,
"grad_norm": 1.369550149899098,
"learning_rate": 4.725403584310734e-06,
"loss": 0.0788,
"step": 1656
},
{
"epoch": 0.7538671519563239,
"grad_norm": 1.7251897843263797,
"learning_rate": 4.725077867396849e-06,
"loss": 0.0951,
"step": 1657
},
{
"epoch": 0.7543221110100091,
"grad_norm": 1.4350282883675796,
"learning_rate": 4.724751968659648e-06,
"loss": 0.1096,
"step": 1658
},
{
"epoch": 0.7547770700636943,
"grad_norm": 1.9342343144020262,
"learning_rate": 4.724425888125764e-06,
"loss": 0.125,
"step": 1659
},
{
"epoch": 0.7552320291173794,
"grad_norm": 1.6341803441145442,
"learning_rate": 4.724099625821842e-06,
"loss": 0.0945,
"step": 1660
},
{
"epoch": 0.7556869881710646,
"grad_norm": 1.545830512814091,
"learning_rate": 4.723773181774543e-06,
"loss": 0.0961,
"step": 1661
},
{
"epoch": 0.7561419472247498,
"grad_norm": 1.7914456776458303,
"learning_rate": 4.723446556010542e-06,
"loss": 0.1092,
"step": 1662
},
{
"epoch": 0.7565969062784349,
"grad_norm": 1.2264032188306588,
"learning_rate": 4.7231197485565275e-06,
"loss": 0.096,
"step": 1663
},
{
"epoch": 0.7570518653321201,
"grad_norm": 1.838239870158386,
"learning_rate": 4.722792759439209e-06,
"loss": 0.129,
"step": 1664
},
{
"epoch": 0.7575068243858053,
"grad_norm": 1.8429853108458891,
"learning_rate": 4.722465588685302e-06,
"loss": 0.147,
"step": 1665
}
],
"logging_steps": 1,
"max_steps": 10990,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 555,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 10958994579456.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}