redsgnaoh's picture
Upload folder using huggingface_hub
cd7b244 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0100090991810737,
"eval_steps": 500,
"global_step": 2220,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00045495905368516835,
"grad_norm": 2.3685307115973546,
"learning_rate": 5e-06,
"loss": 0.0587,
"step": 1
},
{
"epoch": 0.0009099181073703367,
"grad_norm": 3.207290006513166,
"learning_rate": 4.999999897855645e-06,
"loss": 0.0976,
"step": 2
},
{
"epoch": 0.001364877161055505,
"grad_norm": 3.061584755625611,
"learning_rate": 4.9999995914225884e-06,
"loss": 0.1138,
"step": 3
},
{
"epoch": 0.0018198362147406734,
"grad_norm": 2.4708172493174265,
"learning_rate": 4.999999080700855e-06,
"loss": 0.102,
"step": 4
},
{
"epoch": 0.0022747952684258415,
"grad_norm": 2.7122863978048204,
"learning_rate": 4.999998365690486e-06,
"loss": 0.0899,
"step": 5
},
{
"epoch": 0.00272975432211101,
"grad_norm": 2.1348308028500367,
"learning_rate": 4.999997446391542e-06,
"loss": 0.0589,
"step": 6
},
{
"epoch": 0.0031847133757961785,
"grad_norm": 1.9525029408374595,
"learning_rate": 4.999996322804095e-06,
"loss": 0.0692,
"step": 7
},
{
"epoch": 0.003639672429481347,
"grad_norm": 2.4972521600201087,
"learning_rate": 4.999994994928239e-06,
"loss": 0.094,
"step": 8
},
{
"epoch": 0.004094631483166515,
"grad_norm": 1.3057783939017902,
"learning_rate": 4.999993462764082e-06,
"loss": 0.0401,
"step": 9
},
{
"epoch": 0.004549590536851683,
"grad_norm": 1.8178622655461494,
"learning_rate": 4.999991726311749e-06,
"loss": 0.0508,
"step": 10
},
{
"epoch": 0.005004549590536852,
"grad_norm": 1.8904298363447831,
"learning_rate": 4.999989785571382e-06,
"loss": 0.0466,
"step": 11
},
{
"epoch": 0.00545950864422202,
"grad_norm": 2.397431505721498,
"learning_rate": 4.999987640543139e-06,
"loss": 0.0684,
"step": 12
},
{
"epoch": 0.005914467697907188,
"grad_norm": 2.121710266227225,
"learning_rate": 4.999985291227196e-06,
"loss": 0.0729,
"step": 13
},
{
"epoch": 0.006369426751592357,
"grad_norm": 2.9696000985831614,
"learning_rate": 4.999982737623746e-06,
"loss": 0.0922,
"step": 14
},
{
"epoch": 0.006824385805277525,
"grad_norm": 2.270433126704546,
"learning_rate": 4.999979979732995e-06,
"loss": 0.0946,
"step": 15
},
{
"epoch": 0.007279344858962694,
"grad_norm": 1.9380248124362378,
"learning_rate": 4.999977017555171e-06,
"loss": 0.0578,
"step": 16
},
{
"epoch": 0.0077343039126478615,
"grad_norm": 2.6281882171357958,
"learning_rate": 4.999973851090514e-06,
"loss": 0.1147,
"step": 17
},
{
"epoch": 0.00818926296633303,
"grad_norm": 2.40029765076707,
"learning_rate": 4.999970480339284e-06,
"loss": 0.0906,
"step": 18
},
{
"epoch": 0.008644222020018199,
"grad_norm": 2.889640814144301,
"learning_rate": 4.9999669053017564e-06,
"loss": 0.0792,
"step": 19
},
{
"epoch": 0.009099181073703366,
"grad_norm": 2.3110994220860883,
"learning_rate": 4.9999631259782235e-06,
"loss": 0.0751,
"step": 20
},
{
"epoch": 0.009554140127388535,
"grad_norm": 2.6890244705482806,
"learning_rate": 4.999959142368993e-06,
"loss": 0.0966,
"step": 21
},
{
"epoch": 0.010009099181073703,
"grad_norm": 2.2488041264680563,
"learning_rate": 4.999954954474391e-06,
"loss": 0.0714,
"step": 22
},
{
"epoch": 0.010464058234758872,
"grad_norm": 2.0642223983397883,
"learning_rate": 4.9999505622947594e-06,
"loss": 0.0881,
"step": 23
},
{
"epoch": 0.01091901728844404,
"grad_norm": 2.384727655713489,
"learning_rate": 4.999945965830458e-06,
"loss": 0.0992,
"step": 24
},
{
"epoch": 0.011373976342129208,
"grad_norm": 2.2739375250381504,
"learning_rate": 4.999941165081863e-06,
"loss": 0.0831,
"step": 25
},
{
"epoch": 0.011828935395814377,
"grad_norm": 1.6418905911049972,
"learning_rate": 4.999936160049364e-06,
"loss": 0.0662,
"step": 26
},
{
"epoch": 0.012283894449499545,
"grad_norm": 2.029045596294324,
"learning_rate": 4.999930950733373e-06,
"loss": 0.097,
"step": 27
},
{
"epoch": 0.012738853503184714,
"grad_norm": 2.2833378337725287,
"learning_rate": 4.999925537134312e-06,
"loss": 0.0823,
"step": 28
},
{
"epoch": 0.013193812556869881,
"grad_norm": 2.611896749496796,
"learning_rate": 4.9999199192526286e-06,
"loss": 0.1115,
"step": 29
},
{
"epoch": 0.01364877161055505,
"grad_norm": 2.4812612616344865,
"learning_rate": 4.9999140970887775e-06,
"loss": 0.0854,
"step": 30
},
{
"epoch": 0.014103730664240218,
"grad_norm": 2.0837983680092904,
"learning_rate": 4.999908070643236e-06,
"loss": 0.0837,
"step": 31
},
{
"epoch": 0.014558689717925387,
"grad_norm": 2.0812008840647827,
"learning_rate": 4.999901839916495e-06,
"loss": 0.064,
"step": 32
},
{
"epoch": 0.015013648771610554,
"grad_norm": 1.5275195881020318,
"learning_rate": 4.999895404909067e-06,
"loss": 0.0582,
"step": 33
},
{
"epoch": 0.015468607825295723,
"grad_norm": 2.703502541064391,
"learning_rate": 4.999888765621476e-06,
"loss": 0.1102,
"step": 34
},
{
"epoch": 0.01592356687898089,
"grad_norm": 1.7231856796809104,
"learning_rate": 4.999881922054264e-06,
"loss": 0.0571,
"step": 35
},
{
"epoch": 0.01637852593266606,
"grad_norm": 1.6472076658400754,
"learning_rate": 4.999874874207991e-06,
"loss": 0.0536,
"step": 36
},
{
"epoch": 0.01683348498635123,
"grad_norm": 2.902300005488672,
"learning_rate": 4.999867622083232e-06,
"loss": 0.1302,
"step": 37
},
{
"epoch": 0.017288444040036398,
"grad_norm": 1.9543380822482044,
"learning_rate": 4.99986016568058e-06,
"loss": 0.0983,
"step": 38
},
{
"epoch": 0.017743403093721567,
"grad_norm": 1.814859572890468,
"learning_rate": 4.999852505000646e-06,
"loss": 0.0717,
"step": 39
},
{
"epoch": 0.018198362147406732,
"grad_norm": 1.882630749677819,
"learning_rate": 4.999844640044053e-06,
"loss": 0.07,
"step": 40
},
{
"epoch": 0.0186533212010919,
"grad_norm": 2.4063115131397823,
"learning_rate": 4.999836570811445e-06,
"loss": 0.0873,
"step": 41
},
{
"epoch": 0.01910828025477707,
"grad_norm": 2.9701013712692035,
"learning_rate": 4.999828297303483e-06,
"loss": 0.0957,
"step": 42
},
{
"epoch": 0.019563239308462238,
"grad_norm": 2.207833234895104,
"learning_rate": 4.9998198195208405e-06,
"loss": 0.0879,
"step": 43
},
{
"epoch": 0.020018198362147407,
"grad_norm": 2.168760551509319,
"learning_rate": 4.999811137464212e-06,
"loss": 0.0967,
"step": 44
},
{
"epoch": 0.020473157415832575,
"grad_norm": 2.12859962179133,
"learning_rate": 4.999802251134307e-06,
"loss": 0.1028,
"step": 45
},
{
"epoch": 0.020928116469517744,
"grad_norm": 1.8067595132130894,
"learning_rate": 4.99979316053185e-06,
"loss": 0.0778,
"step": 46
},
{
"epoch": 0.021383075523202913,
"grad_norm": 3.8815722657740594,
"learning_rate": 4.999783865657585e-06,
"loss": 0.1812,
"step": 47
},
{
"epoch": 0.02183803457688808,
"grad_norm": 4.142186542548352,
"learning_rate": 4.999774366512272e-06,
"loss": 0.1981,
"step": 48
},
{
"epoch": 0.022292993630573247,
"grad_norm": 2.4946427215064015,
"learning_rate": 4.9997646630966865e-06,
"loss": 0.0866,
"step": 49
},
{
"epoch": 0.022747952684258416,
"grad_norm": 2.219814267860857,
"learning_rate": 4.999754755411621e-06,
"loss": 0.0767,
"step": 50
},
{
"epoch": 0.023202911737943584,
"grad_norm": 1.7512451842619647,
"learning_rate": 4.9997446434578865e-06,
"loss": 0.0709,
"step": 51
},
{
"epoch": 0.023657870791628753,
"grad_norm": 1.9267762038567948,
"learning_rate": 4.999734327236307e-06,
"loss": 0.0791,
"step": 52
},
{
"epoch": 0.024112829845313922,
"grad_norm": 1.3192434416131813,
"learning_rate": 4.999723806747728e-06,
"loss": 0.0611,
"step": 53
},
{
"epoch": 0.02456778889899909,
"grad_norm": 2.0553891309583787,
"learning_rate": 4.99971308199301e-06,
"loss": 0.0708,
"step": 54
},
{
"epoch": 0.02502274795268426,
"grad_norm": 1.6809260342794263,
"learning_rate": 4.999702152973025e-06,
"loss": 0.0662,
"step": 55
},
{
"epoch": 0.025477707006369428,
"grad_norm": 2.0087287549898716,
"learning_rate": 4.9996910196886694e-06,
"loss": 0.0795,
"step": 56
},
{
"epoch": 0.025932666060054597,
"grad_norm": 1.3268510730840513,
"learning_rate": 4.999679682140852e-06,
"loss": 0.0422,
"step": 57
},
{
"epoch": 0.026387625113739762,
"grad_norm": 2.646053521216802,
"learning_rate": 4.999668140330499e-06,
"loss": 0.1284,
"step": 58
},
{
"epoch": 0.02684258416742493,
"grad_norm": 1.5857988579934552,
"learning_rate": 4.999656394258555e-06,
"loss": 0.0647,
"step": 59
},
{
"epoch": 0.0272975432211101,
"grad_norm": 1.756551616255058,
"learning_rate": 4.999644443925978e-06,
"loss": 0.078,
"step": 60
},
{
"epoch": 0.027752502274795268,
"grad_norm": 2.2102751228780546,
"learning_rate": 4.999632289333746e-06,
"loss": 0.0785,
"step": 61
},
{
"epoch": 0.028207461328480437,
"grad_norm": 2.338156657994438,
"learning_rate": 4.999619930482852e-06,
"loss": 0.0835,
"step": 62
},
{
"epoch": 0.028662420382165606,
"grad_norm": 2.0921557148636616,
"learning_rate": 4.999607367374304e-06,
"loss": 0.0974,
"step": 63
},
{
"epoch": 0.029117379435850774,
"grad_norm": 1.7535396635399074,
"learning_rate": 4.999594600009131e-06,
"loss": 0.0605,
"step": 64
},
{
"epoch": 0.029572338489535943,
"grad_norm": 2.2055708873696585,
"learning_rate": 4.999581628388375e-06,
"loss": 0.0946,
"step": 65
},
{
"epoch": 0.03002729754322111,
"grad_norm": 2.5001955714674216,
"learning_rate": 4.999568452513097e-06,
"loss": 0.1549,
"step": 66
},
{
"epoch": 0.030482256596906277,
"grad_norm": 2.417716838936908,
"learning_rate": 4.9995550723843726e-06,
"loss": 0.0953,
"step": 67
},
{
"epoch": 0.030937215650591446,
"grad_norm": 1.9976883408624455,
"learning_rate": 4.999541488003295e-06,
"loss": 0.0772,
"step": 68
},
{
"epoch": 0.03139217470427662,
"grad_norm": 1.9326277047503455,
"learning_rate": 4.999527699370975e-06,
"loss": 0.0764,
"step": 69
},
{
"epoch": 0.03184713375796178,
"grad_norm": 2.0337761312716527,
"learning_rate": 4.99951370648854e-06,
"loss": 0.0659,
"step": 70
},
{
"epoch": 0.03230209281164695,
"grad_norm": 1.895878774895592,
"learning_rate": 4.999499509357132e-06,
"loss": 0.0815,
"step": 71
},
{
"epoch": 0.03275705186533212,
"grad_norm": 2.0909717848011313,
"learning_rate": 4.999485107977912e-06,
"loss": 0.084,
"step": 72
},
{
"epoch": 0.033212010919017286,
"grad_norm": 1.5271836426577585,
"learning_rate": 4.999470502352057e-06,
"loss": 0.0645,
"step": 73
},
{
"epoch": 0.03366696997270246,
"grad_norm": 2.4817155636981223,
"learning_rate": 4.999455692480759e-06,
"loss": 0.1008,
"step": 74
},
{
"epoch": 0.034121929026387623,
"grad_norm": 1.6027477251164817,
"learning_rate": 4.999440678365229e-06,
"loss": 0.0722,
"step": 75
},
{
"epoch": 0.034576888080072796,
"grad_norm": 2.164861284274037,
"learning_rate": 4.999425460006695e-06,
"loss": 0.0876,
"step": 76
},
{
"epoch": 0.03503184713375796,
"grad_norm": 1.8147143711706584,
"learning_rate": 4.9994100374063995e-06,
"loss": 0.0739,
"step": 77
},
{
"epoch": 0.03548680618744313,
"grad_norm": 2.379478288499757,
"learning_rate": 4.9993944105656035e-06,
"loss": 0.1158,
"step": 78
},
{
"epoch": 0.0359417652411283,
"grad_norm": 1.7238147576191318,
"learning_rate": 4.999378579485582e-06,
"loss": 0.0749,
"step": 79
},
{
"epoch": 0.036396724294813464,
"grad_norm": 2.1444185576728323,
"learning_rate": 4.999362544167632e-06,
"loss": 0.0937,
"step": 80
},
{
"epoch": 0.036851683348498636,
"grad_norm": 1.18142283635082,
"learning_rate": 4.99934630461306e-06,
"loss": 0.0569,
"step": 81
},
{
"epoch": 0.0373066424021838,
"grad_norm": 2.3599788407160456,
"learning_rate": 4.999329860823197e-06,
"loss": 0.0848,
"step": 82
},
{
"epoch": 0.03776160145586897,
"grad_norm": 1.851574920799011,
"learning_rate": 4.999313212799383e-06,
"loss": 0.0882,
"step": 83
},
{
"epoch": 0.03821656050955414,
"grad_norm": 2.144291660745484,
"learning_rate": 4.99929636054298e-06,
"loss": 0.0881,
"step": 84
},
{
"epoch": 0.03867151956323931,
"grad_norm": 2.083071837291781,
"learning_rate": 4.999279304055366e-06,
"loss": 0.1109,
"step": 85
},
{
"epoch": 0.039126478616924476,
"grad_norm": 2.245491182317419,
"learning_rate": 4.999262043337933e-06,
"loss": 0.0933,
"step": 86
},
{
"epoch": 0.03958143767060965,
"grad_norm": 2.076902724310137,
"learning_rate": 4.999244578392094e-06,
"loss": 0.1004,
"step": 87
},
{
"epoch": 0.040036396724294813,
"grad_norm": 2.213157445111281,
"learning_rate": 4.9992269092192736e-06,
"loss": 0.1048,
"step": 88
},
{
"epoch": 0.04049135577797998,
"grad_norm": 1.8088256581500983,
"learning_rate": 4.9992090358209166e-06,
"loss": 0.0803,
"step": 89
},
{
"epoch": 0.04094631483166515,
"grad_norm": 1.6952266837081935,
"learning_rate": 4.9991909581984835e-06,
"loss": 0.0707,
"step": 90
},
{
"epoch": 0.041401273885350316,
"grad_norm": 1.2806634047624867,
"learning_rate": 4.999172676353451e-06,
"loss": 0.0405,
"step": 91
},
{
"epoch": 0.04185623293903549,
"grad_norm": 1.537222164184117,
"learning_rate": 4.999154190287314e-06,
"loss": 0.0678,
"step": 92
},
{
"epoch": 0.042311191992720654,
"grad_norm": 2.152654560935853,
"learning_rate": 4.999135500001583e-06,
"loss": 0.1323,
"step": 93
},
{
"epoch": 0.042766151046405826,
"grad_norm": 1.7293087783358614,
"learning_rate": 4.9991166054977844e-06,
"loss": 0.0851,
"step": 94
},
{
"epoch": 0.04322111010009099,
"grad_norm": 2.689089264886033,
"learning_rate": 4.999097506777463e-06,
"loss": 0.1018,
"step": 95
},
{
"epoch": 0.04367606915377616,
"grad_norm": 1.8242860351920025,
"learning_rate": 4.999078203842179e-06,
"loss": 0.1063,
"step": 96
},
{
"epoch": 0.04413102820746133,
"grad_norm": 1.5249963877818449,
"learning_rate": 4.999058696693511e-06,
"loss": 0.0593,
"step": 97
},
{
"epoch": 0.044585987261146494,
"grad_norm": 1.668772591755926,
"learning_rate": 4.99903898533305e-06,
"loss": 0.0709,
"step": 98
},
{
"epoch": 0.045040946314831666,
"grad_norm": 1.8521288885149407,
"learning_rate": 4.99901906976241e-06,
"loss": 0.0842,
"step": 99
},
{
"epoch": 0.04549590536851683,
"grad_norm": 2.106435857041323,
"learning_rate": 4.998998949983217e-06,
"loss": 0.0921,
"step": 100
},
{
"epoch": 0.045950864422202004,
"grad_norm": 2.104450695294598,
"learning_rate": 4.998978625997115e-06,
"loss": 0.1082,
"step": 101
},
{
"epoch": 0.04640582347588717,
"grad_norm": 2.1381043167125466,
"learning_rate": 4.998958097805765e-06,
"loss": 0.0966,
"step": 102
},
{
"epoch": 0.04686078252957234,
"grad_norm": 1.6962878781771613,
"learning_rate": 4.9989373654108445e-06,
"loss": 0.0721,
"step": 103
},
{
"epoch": 0.047315741583257506,
"grad_norm": 26.768545049591438,
"learning_rate": 4.9989164288140465e-06,
"loss": 0.362,
"step": 104
},
{
"epoch": 0.04777070063694268,
"grad_norm": 2.63813062408578,
"learning_rate": 4.998895288017085e-06,
"loss": 0.1373,
"step": 105
},
{
"epoch": 0.048225659690627844,
"grad_norm": 1.828826426920959,
"learning_rate": 4.998873943021684e-06,
"loss": 0.0743,
"step": 106
},
{
"epoch": 0.04868061874431301,
"grad_norm": 1.524672393516503,
"learning_rate": 4.998852393829589e-06,
"loss": 0.0693,
"step": 107
},
{
"epoch": 0.04913557779799818,
"grad_norm": 3.0873114713096683,
"learning_rate": 4.9988306404425625e-06,
"loss": 0.1492,
"step": 108
},
{
"epoch": 0.049590536851683346,
"grad_norm": 1.7541988764209069,
"learning_rate": 4.99880868286238e-06,
"loss": 0.0941,
"step": 109
},
{
"epoch": 0.05004549590536852,
"grad_norm": 2.3475973125438103,
"learning_rate": 4.998786521090836e-06,
"loss": 0.0925,
"step": 110
},
{
"epoch": 0.050500454959053684,
"grad_norm": 2.1297159392440452,
"learning_rate": 4.9987641551297426e-06,
"loss": 0.1209,
"step": 111
},
{
"epoch": 0.050955414012738856,
"grad_norm": 1.8188477873711246,
"learning_rate": 4.998741584980926e-06,
"loss": 0.1191,
"step": 112
},
{
"epoch": 0.05141037306642402,
"grad_norm": 2.0744703068317474,
"learning_rate": 4.9987188106462314e-06,
"loss": 0.0958,
"step": 113
},
{
"epoch": 0.051865332120109194,
"grad_norm": 1.67585557445257,
"learning_rate": 4.99869583212752e-06,
"loss": 0.0759,
"step": 114
},
{
"epoch": 0.05232029117379436,
"grad_norm": 2.9423649270306456,
"learning_rate": 4.9986726494266694e-06,
"loss": 0.1628,
"step": 115
},
{
"epoch": 0.052775250227479524,
"grad_norm": 1.9805897541793653,
"learning_rate": 4.998649262545574e-06,
"loss": 0.0865,
"step": 116
},
{
"epoch": 0.053230209281164696,
"grad_norm": 1.862673950464683,
"learning_rate": 4.998625671486144e-06,
"loss": 0.0841,
"step": 117
},
{
"epoch": 0.05368516833484986,
"grad_norm": 1.6852737490573195,
"learning_rate": 4.998601876250308e-06,
"loss": 0.0801,
"step": 118
},
{
"epoch": 0.054140127388535034,
"grad_norm": 1.8645780399689873,
"learning_rate": 4.998577876840011e-06,
"loss": 0.0822,
"step": 119
},
{
"epoch": 0.0545950864422202,
"grad_norm": 1.7705796593126653,
"learning_rate": 4.9985536732572124e-06,
"loss": 0.0836,
"step": 120
},
{
"epoch": 0.05505004549590537,
"grad_norm": 1.4380115814084553,
"learning_rate": 4.998529265503891e-06,
"loss": 0.0714,
"step": 121
},
{
"epoch": 0.055505004549590536,
"grad_norm": 1.841019746353449,
"learning_rate": 4.9985046535820416e-06,
"loss": 0.0925,
"step": 122
},
{
"epoch": 0.05595996360327571,
"grad_norm": 2.13633472088372,
"learning_rate": 4.998479837493675e-06,
"loss": 0.1098,
"step": 123
},
{
"epoch": 0.056414922656960874,
"grad_norm": 1.6795956051728682,
"learning_rate": 4.9984548172408195e-06,
"loss": 0.0623,
"step": 124
},
{
"epoch": 0.05686988171064604,
"grad_norm": 7.146738489798405,
"learning_rate": 4.998429592825519e-06,
"loss": 0.1803,
"step": 125
},
{
"epoch": 0.05732484076433121,
"grad_norm": 2.17497011974541,
"learning_rate": 4.998404164249835e-06,
"loss": 0.1209,
"step": 126
},
{
"epoch": 0.05777979981801638,
"grad_norm": 1.9663385354035616,
"learning_rate": 4.998378531515845e-06,
"loss": 0.0704,
"step": 127
},
{
"epoch": 0.05823475887170155,
"grad_norm": 2.398444068788508,
"learning_rate": 4.998352694625645e-06,
"loss": 0.0819,
"step": 128
},
{
"epoch": 0.058689717925386714,
"grad_norm": 1.5854929257305652,
"learning_rate": 4.998326653581343e-06,
"loss": 0.0775,
"step": 129
},
{
"epoch": 0.059144676979071886,
"grad_norm": 1.8831317521751245,
"learning_rate": 4.998300408385072e-06,
"loss": 0.0895,
"step": 130
},
{
"epoch": 0.05959963603275705,
"grad_norm": 2.624836374744882,
"learning_rate": 4.998273959038972e-06,
"loss": 0.1398,
"step": 131
},
{
"epoch": 0.06005459508644222,
"grad_norm": 1.8281764860819427,
"learning_rate": 4.998247305545207e-06,
"loss": 0.0979,
"step": 132
},
{
"epoch": 0.06050955414012739,
"grad_norm": 1.4175605750366638,
"learning_rate": 4.998220447905953e-06,
"loss": 0.0674,
"step": 133
},
{
"epoch": 0.060964513193812554,
"grad_norm": 2.0007328792439307,
"learning_rate": 4.998193386123408e-06,
"loss": 0.1082,
"step": 134
},
{
"epoch": 0.061419472247497726,
"grad_norm": 2.2534593276871355,
"learning_rate": 4.99816612019978e-06,
"loss": 0.1165,
"step": 135
},
{
"epoch": 0.06187443130118289,
"grad_norm": 7.223128092677242,
"learning_rate": 4.998138650137298e-06,
"loss": 0.1547,
"step": 136
},
{
"epoch": 0.062329390354868064,
"grad_norm": 2.0541187438324178,
"learning_rate": 4.998110975938208e-06,
"loss": 0.1153,
"step": 137
},
{
"epoch": 0.06278434940855324,
"grad_norm": 2.900003934434033,
"learning_rate": 4.998083097604769e-06,
"loss": 0.1227,
"step": 138
},
{
"epoch": 0.0632393084622384,
"grad_norm": 2.9930382656276655,
"learning_rate": 4.998055015139261e-06,
"loss": 0.0671,
"step": 139
},
{
"epoch": 0.06369426751592357,
"grad_norm": 1.8183166737473904,
"learning_rate": 4.998026728543979e-06,
"loss": 0.0879,
"step": 140
},
{
"epoch": 0.06414922656960874,
"grad_norm": 1.750231162848612,
"learning_rate": 4.997998237821233e-06,
"loss": 0.0973,
"step": 141
},
{
"epoch": 0.0646041856232939,
"grad_norm": 1.531092755332603,
"learning_rate": 4.997969542973352e-06,
"loss": 0.0755,
"step": 142
},
{
"epoch": 0.06505914467697907,
"grad_norm": 2.106588666489457,
"learning_rate": 4.997940644002681e-06,
"loss": 0.1014,
"step": 143
},
{
"epoch": 0.06551410373066424,
"grad_norm": 2.4260145417995513,
"learning_rate": 4.997911540911581e-06,
"loss": 0.0992,
"step": 144
},
{
"epoch": 0.06596906278434941,
"grad_norm": 1.9957158387709846,
"learning_rate": 4.99788223370243e-06,
"loss": 0.1074,
"step": 145
},
{
"epoch": 0.06642402183803457,
"grad_norm": 2.7359115449729385,
"learning_rate": 4.9978527223776245e-06,
"loss": 0.1298,
"step": 146
},
{
"epoch": 0.06687898089171974,
"grad_norm": 1.4774963397056595,
"learning_rate": 4.9978230069395735e-06,
"loss": 0.0725,
"step": 147
},
{
"epoch": 0.06733393994540492,
"grad_norm": 2.4431671333335188,
"learning_rate": 4.9977930873907065e-06,
"loss": 0.0983,
"step": 148
},
{
"epoch": 0.06778889899909009,
"grad_norm": 1.9906443670591782,
"learning_rate": 4.997762963733468e-06,
"loss": 0.1039,
"step": 149
},
{
"epoch": 0.06824385805277525,
"grad_norm": 2.0201798980001517,
"learning_rate": 4.997732635970321e-06,
"loss": 0.085,
"step": 150
},
{
"epoch": 0.06869881710646042,
"grad_norm": 1.7461931203369137,
"learning_rate": 4.9977021041037425e-06,
"loss": 0.0884,
"step": 151
},
{
"epoch": 0.06915377616014559,
"grad_norm": 2.339191302020108,
"learning_rate": 4.9976713681362265e-06,
"loss": 0.1159,
"step": 152
},
{
"epoch": 0.06960873521383075,
"grad_norm": 2.314166753359135,
"learning_rate": 4.997640428070286e-06,
"loss": 0.1338,
"step": 153
},
{
"epoch": 0.07006369426751592,
"grad_norm": 1.5963391451568967,
"learning_rate": 4.99760928390845e-06,
"loss": 0.0575,
"step": 154
},
{
"epoch": 0.0705186533212011,
"grad_norm": 1.7788915412646347,
"learning_rate": 4.997577935653262e-06,
"loss": 0.08,
"step": 155
},
{
"epoch": 0.07097361237488627,
"grad_norm": 1.5840889143049688,
"learning_rate": 4.9975463833072835e-06,
"loss": 0.0709,
"step": 156
},
{
"epoch": 0.07142857142857142,
"grad_norm": 2.1242834812157962,
"learning_rate": 4.997514626873093e-06,
"loss": 0.1078,
"step": 157
},
{
"epoch": 0.0718835304822566,
"grad_norm": 1.7256733994251798,
"learning_rate": 4.997482666353287e-06,
"loss": 0.0678,
"step": 158
},
{
"epoch": 0.07233848953594177,
"grad_norm": 2.2088750555704073,
"learning_rate": 4.997450501750476e-06,
"loss": 0.0981,
"step": 159
},
{
"epoch": 0.07279344858962693,
"grad_norm": 1.817598507902073,
"learning_rate": 4.997418133067288e-06,
"loss": 0.0829,
"step": 160
},
{
"epoch": 0.0732484076433121,
"grad_norm": 1.9174894618752205,
"learning_rate": 4.997385560306368e-06,
"loss": 0.0922,
"step": 161
},
{
"epoch": 0.07370336669699727,
"grad_norm": 1.7975593397664607,
"learning_rate": 4.997352783470379e-06,
"loss": 0.093,
"step": 162
},
{
"epoch": 0.07415832575068244,
"grad_norm": 2.1789877377155147,
"learning_rate": 4.997319802561997e-06,
"loss": 0.1044,
"step": 163
},
{
"epoch": 0.0746132848043676,
"grad_norm": 1.5046722090412417,
"learning_rate": 4.9972866175839196e-06,
"loss": 0.0806,
"step": 164
},
{
"epoch": 0.07506824385805277,
"grad_norm": 1.828261506678391,
"learning_rate": 4.9972532285388575e-06,
"loss": 0.1018,
"step": 165
},
{
"epoch": 0.07552320291173795,
"grad_norm": 1.853289616987827,
"learning_rate": 4.997219635429538e-06,
"loss": 0.1177,
"step": 166
},
{
"epoch": 0.07597816196542312,
"grad_norm": 1.9172069323651033,
"learning_rate": 4.997185838258709e-06,
"loss": 0.0817,
"step": 167
},
{
"epoch": 0.07643312101910828,
"grad_norm": 1.6956924002006215,
"learning_rate": 4.997151837029129e-06,
"loss": 0.0679,
"step": 168
},
{
"epoch": 0.07688808007279345,
"grad_norm": 1.8575330553269362,
"learning_rate": 4.997117631743579e-06,
"loss": 0.0855,
"step": 169
},
{
"epoch": 0.07734303912647862,
"grad_norm": 1.7266908578071283,
"learning_rate": 4.997083222404852e-06,
"loss": 0.0625,
"step": 170
},
{
"epoch": 0.07779799818016378,
"grad_norm": 1.6397125044179104,
"learning_rate": 4.997048609015762e-06,
"loss": 0.0751,
"step": 171
},
{
"epoch": 0.07825295723384895,
"grad_norm": 1.5340896344557344,
"learning_rate": 4.997013791579136e-06,
"loss": 0.0786,
"step": 172
},
{
"epoch": 0.07870791628753412,
"grad_norm": 1.9189331650587453,
"learning_rate": 4.996978770097819e-06,
"loss": 0.0953,
"step": 173
},
{
"epoch": 0.0791628753412193,
"grad_norm": 1.7773721601434869,
"learning_rate": 4.996943544574673e-06,
"loss": 0.083,
"step": 174
},
{
"epoch": 0.07961783439490445,
"grad_norm": 1.7663708027835396,
"learning_rate": 4.996908115012576e-06,
"loss": 0.0711,
"step": 175
},
{
"epoch": 0.08007279344858963,
"grad_norm": 2.0988130747441462,
"learning_rate": 4.996872481414425e-06,
"loss": 0.1068,
"step": 176
},
{
"epoch": 0.0805277525022748,
"grad_norm": 3.491649419917669,
"learning_rate": 4.9968366437831305e-06,
"loss": 0.1596,
"step": 177
},
{
"epoch": 0.08098271155595996,
"grad_norm": 0.9772529604089312,
"learning_rate": 4.99680060212162e-06,
"loss": 0.0469,
"step": 178
},
{
"epoch": 0.08143767060964513,
"grad_norm": 1.411497576217555,
"learning_rate": 4.996764356432841e-06,
"loss": 0.0799,
"step": 179
},
{
"epoch": 0.0818926296633303,
"grad_norm": 1.9634897057091474,
"learning_rate": 4.996727906719754e-06,
"loss": 0.0818,
"step": 180
},
{
"epoch": 0.08234758871701547,
"grad_norm": 1.8622777856402457,
"learning_rate": 4.9966912529853365e-06,
"loss": 0.0654,
"step": 181
},
{
"epoch": 0.08280254777070063,
"grad_norm": 1.6338074095796988,
"learning_rate": 4.996654395232585e-06,
"loss": 0.0744,
"step": 182
},
{
"epoch": 0.0832575068243858,
"grad_norm": 1.534919993971643,
"learning_rate": 4.996617333464512e-06,
"loss": 0.0639,
"step": 183
},
{
"epoch": 0.08371246587807098,
"grad_norm": 1.7391379315757225,
"learning_rate": 4.996580067684145e-06,
"loss": 0.0715,
"step": 184
},
{
"epoch": 0.08416742493175614,
"grad_norm": 1.7215093643580193,
"learning_rate": 4.996542597894528e-06,
"loss": 0.1192,
"step": 185
},
{
"epoch": 0.08462238398544131,
"grad_norm": 2.041088124472192,
"learning_rate": 4.996504924098726e-06,
"loss": 0.1078,
"step": 186
},
{
"epoch": 0.08507734303912648,
"grad_norm": 1.7083926900772908,
"learning_rate": 4.9964670462998145e-06,
"loss": 0.0922,
"step": 187
},
{
"epoch": 0.08553230209281165,
"grad_norm": 1.9950587953196364,
"learning_rate": 4.99642896450089e-06,
"loss": 0.125,
"step": 188
},
{
"epoch": 0.08598726114649681,
"grad_norm": 2.2702904646099022,
"learning_rate": 4.9963906787050656e-06,
"loss": 0.1318,
"step": 189
},
{
"epoch": 0.08644222020018198,
"grad_norm": 1.5062676480402928,
"learning_rate": 4.996352188915467e-06,
"loss": 0.0621,
"step": 190
},
{
"epoch": 0.08689717925386715,
"grad_norm": 2.6764229211241153,
"learning_rate": 4.996313495135242e-06,
"loss": 0.1112,
"step": 191
},
{
"epoch": 0.08735213830755233,
"grad_norm": 2.276483991348045,
"learning_rate": 4.9962745973675505e-06,
"loss": 0.1219,
"step": 192
},
{
"epoch": 0.08780709736123748,
"grad_norm": 1.4375762261827663,
"learning_rate": 4.996235495615572e-06,
"loss": 0.0641,
"step": 193
},
{
"epoch": 0.08826205641492266,
"grad_norm": 2.3164336329931094,
"learning_rate": 4.996196189882503e-06,
"loss": 0.1176,
"step": 194
},
{
"epoch": 0.08871701546860783,
"grad_norm": 2.225732764096407,
"learning_rate": 4.996156680171552e-06,
"loss": 0.1096,
"step": 195
},
{
"epoch": 0.08917197452229299,
"grad_norm": 1.8464739663611849,
"learning_rate": 4.996116966485951e-06,
"loss": 0.0817,
"step": 196
},
{
"epoch": 0.08962693357597816,
"grad_norm": 1.9290667932284378,
"learning_rate": 4.996077048828944e-06,
"loss": 0.1106,
"step": 197
},
{
"epoch": 0.09008189262966333,
"grad_norm": 1.6322378586848272,
"learning_rate": 4.996036927203793e-06,
"loss": 0.0972,
"step": 198
},
{
"epoch": 0.0905368516833485,
"grad_norm": 2.2100804969645416,
"learning_rate": 4.995996601613775e-06,
"loss": 0.0944,
"step": 199
},
{
"epoch": 0.09099181073703366,
"grad_norm": 1.5641835045850314,
"learning_rate": 4.9959560720621875e-06,
"loss": 0.0896,
"step": 200
},
{
"epoch": 0.09144676979071883,
"grad_norm": 2.2116837789953117,
"learning_rate": 4.995915338552341e-06,
"loss": 0.1331,
"step": 201
},
{
"epoch": 0.09190172884440401,
"grad_norm": 1.8792253280188753,
"learning_rate": 4.995874401087565e-06,
"loss": 0.0967,
"step": 202
},
{
"epoch": 0.09235668789808917,
"grad_norm": 2.167978668790899,
"learning_rate": 4.9958332596712035e-06,
"loss": 0.1141,
"step": 203
},
{
"epoch": 0.09281164695177434,
"grad_norm": 1.8621318139110883,
"learning_rate": 4.99579191430662e-06,
"loss": 0.0972,
"step": 204
},
{
"epoch": 0.09326660600545951,
"grad_norm": 1.8429430162012657,
"learning_rate": 4.995750364997192e-06,
"loss": 0.0967,
"step": 205
},
{
"epoch": 0.09372156505914468,
"grad_norm": 1.5424629326591568,
"learning_rate": 4.995708611746314e-06,
"loss": 0.0814,
"step": 206
},
{
"epoch": 0.09417652411282984,
"grad_norm": 2.0700985381007904,
"learning_rate": 4.995666654557399e-06,
"loss": 0.1038,
"step": 207
},
{
"epoch": 0.09463148316651501,
"grad_norm": 1.8765344045928045,
"learning_rate": 4.995624493433876e-06,
"loss": 0.1075,
"step": 208
},
{
"epoch": 0.09508644222020018,
"grad_norm": 1.8732891178471252,
"learning_rate": 4.995582128379189e-06,
"loss": 0.1001,
"step": 209
},
{
"epoch": 0.09554140127388536,
"grad_norm": 2.1418545940903373,
"learning_rate": 4.9955395593968e-06,
"loss": 0.1463,
"step": 210
},
{
"epoch": 0.09599636032757052,
"grad_norm": 1.905821465202796,
"learning_rate": 4.99549678649019e-06,
"loss": 0.0848,
"step": 211
},
{
"epoch": 0.09645131938125569,
"grad_norm": 1.7581366634538098,
"learning_rate": 4.99545380966285e-06,
"loss": 0.0976,
"step": 212
},
{
"epoch": 0.09690627843494086,
"grad_norm": 2.133882292644339,
"learning_rate": 4.995410628918294e-06,
"loss": 0.1036,
"step": 213
},
{
"epoch": 0.09736123748862602,
"grad_norm": 1.6491455235555508,
"learning_rate": 4.995367244260052e-06,
"loss": 0.1,
"step": 214
},
{
"epoch": 0.09781619654231119,
"grad_norm": 1.372315749578445,
"learning_rate": 4.995323655691667e-06,
"loss": 0.0543,
"step": 215
},
{
"epoch": 0.09827115559599636,
"grad_norm": 2.2929084487384297,
"learning_rate": 4.995279863216702e-06,
"loss": 0.1005,
"step": 216
},
{
"epoch": 0.09872611464968153,
"grad_norm": 1.8371182479654964,
"learning_rate": 4.995235866838735e-06,
"loss": 0.096,
"step": 217
},
{
"epoch": 0.09918107370336669,
"grad_norm": 1.4189314035725125,
"learning_rate": 4.995191666561361e-06,
"loss": 0.0707,
"step": 218
},
{
"epoch": 0.09963603275705187,
"grad_norm": 1.4036483642687965,
"learning_rate": 4.995147262388192e-06,
"loss": 0.0689,
"step": 219
},
{
"epoch": 0.10009099181073704,
"grad_norm": 1.7382878807357938,
"learning_rate": 4.995102654322858e-06,
"loss": 0.0829,
"step": 220
},
{
"epoch": 0.1005459508644222,
"grad_norm": 1.3102015447280675,
"learning_rate": 4.995057842369002e-06,
"loss": 0.0548,
"step": 221
},
{
"epoch": 0.10100090991810737,
"grad_norm": 1.8490525072637034,
"learning_rate": 4.995012826530287e-06,
"loss": 0.1044,
"step": 222
},
{
"epoch": 0.10145586897179254,
"grad_norm": 2.802543488000276,
"learning_rate": 4.99496760681039e-06,
"loss": 0.1393,
"step": 223
},
{
"epoch": 0.10191082802547771,
"grad_norm": 2.4234245545914295,
"learning_rate": 4.994922183213009e-06,
"loss": 0.1325,
"step": 224
},
{
"epoch": 0.10236578707916287,
"grad_norm": 1.1495372549504432,
"learning_rate": 4.9948765557418535e-06,
"loss": 0.0585,
"step": 225
},
{
"epoch": 0.10282074613284804,
"grad_norm": 2.1666263724534267,
"learning_rate": 4.994830724400653e-06,
"loss": 0.1063,
"step": 226
},
{
"epoch": 0.10327570518653321,
"grad_norm": 1.7066677970234532,
"learning_rate": 4.994784689193151e-06,
"loss": 0.1002,
"step": 227
},
{
"epoch": 0.10373066424021839,
"grad_norm": 1.5304723941528642,
"learning_rate": 4.994738450123111e-06,
"loss": 0.0825,
"step": 228
},
{
"epoch": 0.10418562329390355,
"grad_norm": 2.1125485884299486,
"learning_rate": 4.994692007194312e-06,
"loss": 0.1089,
"step": 229
},
{
"epoch": 0.10464058234758872,
"grad_norm": 1.4297773182355138,
"learning_rate": 4.994645360410547e-06,
"loss": 0.0855,
"step": 230
},
{
"epoch": 0.10509554140127389,
"grad_norm": 1.741498602747005,
"learning_rate": 4.99459850977563e-06,
"loss": 0.0884,
"step": 231
},
{
"epoch": 0.10555050045495905,
"grad_norm": 1.6875366585424447,
"learning_rate": 4.994551455293388e-06,
"loss": 0.068,
"step": 232
},
{
"epoch": 0.10600545950864422,
"grad_norm": 2.03347527932056,
"learning_rate": 4.9945041969676654e-06,
"loss": 0.0997,
"step": 233
},
{
"epoch": 0.10646041856232939,
"grad_norm": 1.5553350034126536,
"learning_rate": 4.994456734802325e-06,
"loss": 0.0709,
"step": 234
},
{
"epoch": 0.10691537761601456,
"grad_norm": 1.354348073951093,
"learning_rate": 4.994409068801247e-06,
"loss": 0.0858,
"step": 235
},
{
"epoch": 0.10737033666969972,
"grad_norm": 1.6048007960766557,
"learning_rate": 4.994361198968323e-06,
"loss": 0.0891,
"step": 236
},
{
"epoch": 0.1078252957233849,
"grad_norm": 2.3380973830643663,
"learning_rate": 4.994313125307466e-06,
"loss": 0.116,
"step": 237
},
{
"epoch": 0.10828025477707007,
"grad_norm": 1.68606521406513,
"learning_rate": 4.994264847822605e-06,
"loss": 0.09,
"step": 238
},
{
"epoch": 0.10873521383075523,
"grad_norm": 2.0274881934833715,
"learning_rate": 4.994216366517684e-06,
"loss": 0.0856,
"step": 239
},
{
"epoch": 0.1091901728844404,
"grad_norm": 1.9224041067300894,
"learning_rate": 4.994167681396667e-06,
"loss": 0.1032,
"step": 240
},
{
"epoch": 0.10964513193812557,
"grad_norm": 2.213562554498921,
"learning_rate": 4.994118792463529e-06,
"loss": 0.1125,
"step": 241
},
{
"epoch": 0.11010009099181074,
"grad_norm": 2.396477374166045,
"learning_rate": 4.994069699722267e-06,
"loss": 0.16,
"step": 242
},
{
"epoch": 0.1105550500454959,
"grad_norm": 1.6621616457271884,
"learning_rate": 4.994020403176893e-06,
"loss": 0.1088,
"step": 243
},
{
"epoch": 0.11101000909918107,
"grad_norm": 2.0137991000965862,
"learning_rate": 4.9939709028314345e-06,
"loss": 0.1203,
"step": 244
},
{
"epoch": 0.11146496815286625,
"grad_norm": 1.731498246221376,
"learning_rate": 4.993921198689935e-06,
"loss": 0.0779,
"step": 245
},
{
"epoch": 0.11191992720655142,
"grad_norm": 1.53319841517271,
"learning_rate": 4.993871290756459e-06,
"loss": 0.0859,
"step": 246
},
{
"epoch": 0.11237488626023658,
"grad_norm": 1.5738861001818754,
"learning_rate": 4.9938211790350835e-06,
"loss": 0.0822,
"step": 247
},
{
"epoch": 0.11282984531392175,
"grad_norm": 1.795556137822037,
"learning_rate": 4.993770863529902e-06,
"loss": 0.1082,
"step": 248
},
{
"epoch": 0.11328480436760692,
"grad_norm": 1.753136266606954,
"learning_rate": 4.993720344245029e-06,
"loss": 0.0826,
"step": 249
},
{
"epoch": 0.11373976342129208,
"grad_norm": 1.724266476242851,
"learning_rate": 4.99366962118459e-06,
"loss": 0.0851,
"step": 250
},
{
"epoch": 0.11419472247497725,
"grad_norm": 1.8081901179247517,
"learning_rate": 4.99361869435273e-06,
"loss": 0.0965,
"step": 251
},
{
"epoch": 0.11464968152866242,
"grad_norm": 2.064401083784083,
"learning_rate": 4.993567563753613e-06,
"loss": 0.0881,
"step": 252
},
{
"epoch": 0.1151046405823476,
"grad_norm": 1.6354098857617054,
"learning_rate": 4.993516229391414e-06,
"loss": 0.0933,
"step": 253
},
{
"epoch": 0.11555959963603275,
"grad_norm": 1.2711881947711132,
"learning_rate": 4.993464691270331e-06,
"loss": 0.0595,
"step": 254
},
{
"epoch": 0.11601455868971793,
"grad_norm": 1.5847340722430843,
"learning_rate": 4.993412949394572e-06,
"loss": 0.0812,
"step": 255
},
{
"epoch": 0.1164695177434031,
"grad_norm": 1.5774467606957123,
"learning_rate": 4.993361003768369e-06,
"loss": 0.081,
"step": 256
},
{
"epoch": 0.11692447679708826,
"grad_norm": 1.3573852133613107,
"learning_rate": 4.993308854395963e-06,
"loss": 0.0812,
"step": 257
},
{
"epoch": 0.11737943585077343,
"grad_norm": 1.5273272920136396,
"learning_rate": 4.993256501281618e-06,
"loss": 0.0634,
"step": 258
},
{
"epoch": 0.1178343949044586,
"grad_norm": 1.8382646613112785,
"learning_rate": 4.993203944429611e-06,
"loss": 0.1145,
"step": 259
},
{
"epoch": 0.11828935395814377,
"grad_norm": 1.5747608705636602,
"learning_rate": 4.993151183844236e-06,
"loss": 0.0801,
"step": 260
},
{
"epoch": 0.11874431301182893,
"grad_norm": 1.7065433305132354,
"learning_rate": 4.9930982195298065e-06,
"loss": 0.0742,
"step": 261
},
{
"epoch": 0.1191992720655141,
"grad_norm": 1.709109441111134,
"learning_rate": 4.9930450514906484e-06,
"loss": 0.1028,
"step": 262
},
{
"epoch": 0.11965423111919928,
"grad_norm": 1.6959707782927067,
"learning_rate": 4.9929916797311075e-06,
"loss": 0.0791,
"step": 263
},
{
"epoch": 0.12010919017288443,
"grad_norm": 2.374639715905283,
"learning_rate": 4.992938104255545e-06,
"loss": 0.1477,
"step": 264
},
{
"epoch": 0.1205641492265696,
"grad_norm": 1.6263809057131815,
"learning_rate": 4.992884325068339e-06,
"loss": 0.0916,
"step": 265
},
{
"epoch": 0.12101910828025478,
"grad_norm": 1.6207164559915699,
"learning_rate": 4.992830342173882e-06,
"loss": 0.1068,
"step": 266
},
{
"epoch": 0.12147406733393995,
"grad_norm": 2.0552449766971823,
"learning_rate": 4.992776155576589e-06,
"loss": 0.1145,
"step": 267
},
{
"epoch": 0.12192902638762511,
"grad_norm": 1.6692049909432523,
"learning_rate": 4.992721765280884e-06,
"loss": 0.1172,
"step": 268
},
{
"epoch": 0.12238398544131028,
"grad_norm": 2.456621954888186,
"learning_rate": 4.992667171291215e-06,
"loss": 0.1267,
"step": 269
},
{
"epoch": 0.12283894449499545,
"grad_norm": 1.5125250812884448,
"learning_rate": 4.992612373612042e-06,
"loss": 0.0661,
"step": 270
},
{
"epoch": 0.12329390354868063,
"grad_norm": 2.0952324870431553,
"learning_rate": 4.99255737224784e-06,
"loss": 0.0917,
"step": 271
},
{
"epoch": 0.12374886260236578,
"grad_norm": 1.4094336450761362,
"learning_rate": 4.9925021672031075e-06,
"loss": 0.0905,
"step": 272
},
{
"epoch": 0.12420382165605096,
"grad_norm": 2.239902062561175,
"learning_rate": 4.992446758482353e-06,
"loss": 0.0995,
"step": 273
},
{
"epoch": 0.12465878070973613,
"grad_norm": 2.696125395972354,
"learning_rate": 4.992391146090106e-06,
"loss": 0.1613,
"step": 274
},
{
"epoch": 0.1251137397634213,
"grad_norm": 1.4853155964847005,
"learning_rate": 4.99233533003091e-06,
"loss": 0.0826,
"step": 275
},
{
"epoch": 0.12556869881710647,
"grad_norm": 1.5393545957542452,
"learning_rate": 4.992279310309326e-06,
"loss": 0.1128,
"step": 276
},
{
"epoch": 0.12602365787079162,
"grad_norm": 2.4236941073693283,
"learning_rate": 4.9922230869299316e-06,
"loss": 0.1607,
"step": 277
},
{
"epoch": 0.1264786169244768,
"grad_norm": 1.6611888199243576,
"learning_rate": 4.992166659897321e-06,
"loss": 0.1005,
"step": 278
},
{
"epoch": 0.12693357597816196,
"grad_norm": 1.3896864345667146,
"learning_rate": 4.992110029216106e-06,
"loss": 0.079,
"step": 279
},
{
"epoch": 0.12738853503184713,
"grad_norm": 1.3647278081745937,
"learning_rate": 4.992053194890914e-06,
"loss": 0.0767,
"step": 280
},
{
"epoch": 0.1278434940855323,
"grad_norm": 2.0323876810575525,
"learning_rate": 4.991996156926388e-06,
"loss": 0.101,
"step": 281
},
{
"epoch": 0.12829845313921748,
"grad_norm": 1.948481701516796,
"learning_rate": 4.9919389153271904e-06,
"loss": 0.106,
"step": 282
},
{
"epoch": 0.12875341219290265,
"grad_norm": 1.3512588403363923,
"learning_rate": 4.991881470097998e-06,
"loss": 0.0897,
"step": 283
},
{
"epoch": 0.1292083712465878,
"grad_norm": 1.4862053800013564,
"learning_rate": 4.991823821243505e-06,
"loss": 0.0898,
"step": 284
},
{
"epoch": 0.12966333030027297,
"grad_norm": 2.287612016528911,
"learning_rate": 4.991765968768422e-06,
"loss": 0.1048,
"step": 285
},
{
"epoch": 0.13011828935395814,
"grad_norm": 1.8190624177647585,
"learning_rate": 4.991707912677477e-06,
"loss": 0.076,
"step": 286
},
{
"epoch": 0.1305732484076433,
"grad_norm": 1.4178411985180965,
"learning_rate": 4.991649652975414e-06,
"loss": 0.062,
"step": 287
},
{
"epoch": 0.13102820746132848,
"grad_norm": 1.7010811854624341,
"learning_rate": 4.991591189666994e-06,
"loss": 0.0928,
"step": 288
},
{
"epoch": 0.13148316651501366,
"grad_norm": 1.7824920481002249,
"learning_rate": 4.991532522756993e-06,
"loss": 0.09,
"step": 289
},
{
"epoch": 0.13193812556869883,
"grad_norm": 1.12093519239752,
"learning_rate": 4.991473652250207e-06,
"loss": 0.0564,
"step": 290
},
{
"epoch": 0.13239308462238397,
"grad_norm": 1.4956629959050902,
"learning_rate": 4.991414578151445e-06,
"loss": 0.0777,
"step": 291
},
{
"epoch": 0.13284804367606914,
"grad_norm": 3.467748085139679,
"learning_rate": 4.991355300465535e-06,
"loss": 0.193,
"step": 292
},
{
"epoch": 0.13330300272975432,
"grad_norm": 1.746518786410603,
"learning_rate": 4.99129581919732e-06,
"loss": 0.0862,
"step": 293
},
{
"epoch": 0.1337579617834395,
"grad_norm": 1.3513400373127227,
"learning_rate": 4.9912361343516616e-06,
"loss": 0.0588,
"step": 294
},
{
"epoch": 0.13421292083712466,
"grad_norm": 1.7841617467512154,
"learning_rate": 4.991176245933437e-06,
"loss": 0.0982,
"step": 295
},
{
"epoch": 0.13466787989080983,
"grad_norm": 1.6650575824861316,
"learning_rate": 4.9911161539475385e-06,
"loss": 0.0868,
"step": 296
},
{
"epoch": 0.135122838944495,
"grad_norm": 2.0850606622795667,
"learning_rate": 4.991055858398879e-06,
"loss": 0.1087,
"step": 297
},
{
"epoch": 0.13557779799818018,
"grad_norm": 2.27094495258401,
"learning_rate": 4.990995359292384e-06,
"loss": 0.1177,
"step": 298
},
{
"epoch": 0.13603275705186532,
"grad_norm": 1.8175215978998918,
"learning_rate": 4.990934656632997e-06,
"loss": 0.1029,
"step": 299
},
{
"epoch": 0.1364877161055505,
"grad_norm": 1.9580713421337124,
"learning_rate": 4.990873750425679e-06,
"loss": 0.0842,
"step": 300
},
{
"epoch": 0.13694267515923567,
"grad_norm": 1.5378181370134305,
"learning_rate": 4.990812640675406e-06,
"loss": 0.0813,
"step": 301
},
{
"epoch": 0.13739763421292084,
"grad_norm": 1.4646500614646956,
"learning_rate": 4.990751327387174e-06,
"loss": 0.0642,
"step": 302
},
{
"epoch": 0.137852593266606,
"grad_norm": 1.7132953215338962,
"learning_rate": 4.99068981056599e-06,
"loss": 0.0921,
"step": 303
},
{
"epoch": 0.13830755232029118,
"grad_norm": 2.020828034549401,
"learning_rate": 4.990628090216885e-06,
"loss": 0.1164,
"step": 304
},
{
"epoch": 0.13876251137397635,
"grad_norm": 1.4167009033800524,
"learning_rate": 4.990566166344898e-06,
"loss": 0.0695,
"step": 305
},
{
"epoch": 0.1392174704276615,
"grad_norm": 1.743676237886539,
"learning_rate": 4.990504038955092e-06,
"loss": 0.1083,
"step": 306
},
{
"epoch": 0.13967242948134667,
"grad_norm": 1.8343720931834766,
"learning_rate": 4.990441708052542e-06,
"loss": 0.0985,
"step": 307
},
{
"epoch": 0.14012738853503184,
"grad_norm": 1.4113998497835858,
"learning_rate": 4.9903791736423435e-06,
"loss": 0.081,
"step": 308
},
{
"epoch": 0.14058234758871702,
"grad_norm": 1.8830612535708886,
"learning_rate": 4.9903164357296044e-06,
"loss": 0.0954,
"step": 309
},
{
"epoch": 0.1410373066424022,
"grad_norm": 1.4208829323408247,
"learning_rate": 4.990253494319453e-06,
"loss": 0.0919,
"step": 310
},
{
"epoch": 0.14149226569608736,
"grad_norm": 1.3671067756437636,
"learning_rate": 4.990190349417032e-06,
"loss": 0.0928,
"step": 311
},
{
"epoch": 0.14194722474977253,
"grad_norm": 1.965673083316737,
"learning_rate": 4.990127001027501e-06,
"loss": 0.0849,
"step": 312
},
{
"epoch": 0.14240218380345768,
"grad_norm": 1.3933093475773835,
"learning_rate": 4.990063449156037e-06,
"loss": 0.0735,
"step": 313
},
{
"epoch": 0.14285714285714285,
"grad_norm": 1.8960360183192995,
"learning_rate": 4.989999693807832e-06,
"loss": 0.1141,
"step": 314
},
{
"epoch": 0.14331210191082802,
"grad_norm": 1.8316795975938271,
"learning_rate": 4.989935734988098e-06,
"loss": 0.1084,
"step": 315
},
{
"epoch": 0.1437670609645132,
"grad_norm": 1.6451238367574679,
"learning_rate": 4.98987157270206e-06,
"loss": 0.0739,
"step": 316
},
{
"epoch": 0.14422202001819837,
"grad_norm": 2.0644883617404854,
"learning_rate": 4.989807206954961e-06,
"loss": 0.1125,
"step": 317
},
{
"epoch": 0.14467697907188354,
"grad_norm": 1.322196438354388,
"learning_rate": 4.9897426377520605e-06,
"loss": 0.0792,
"step": 318
},
{
"epoch": 0.1451319381255687,
"grad_norm": 2.568915637493138,
"learning_rate": 4.989677865098636e-06,
"loss": 0.1236,
"step": 319
},
{
"epoch": 0.14558689717925385,
"grad_norm": 1.1659492648591403,
"learning_rate": 4.989612888999978e-06,
"loss": 0.0624,
"step": 320
},
{
"epoch": 0.14604185623293903,
"grad_norm": 1.431829324891758,
"learning_rate": 4.9895477094614e-06,
"loss": 0.0855,
"step": 321
},
{
"epoch": 0.1464968152866242,
"grad_norm": 1.1704367288212936,
"learning_rate": 4.989482326488225e-06,
"loss": 0.0741,
"step": 322
},
{
"epoch": 0.14695177434030937,
"grad_norm": 1.6170438514885752,
"learning_rate": 4.989416740085796e-06,
"loss": 0.1057,
"step": 323
},
{
"epoch": 0.14740673339399454,
"grad_norm": 1.639627544263893,
"learning_rate": 4.9893509502594735e-06,
"loss": 0.0784,
"step": 324
},
{
"epoch": 0.14786169244767972,
"grad_norm": 1.6437318926278874,
"learning_rate": 4.9892849570146335e-06,
"loss": 0.1105,
"step": 325
},
{
"epoch": 0.1483166515013649,
"grad_norm": 1.6588510281862943,
"learning_rate": 4.989218760356668e-06,
"loss": 0.106,
"step": 326
},
{
"epoch": 0.14877161055505003,
"grad_norm": 1.692767253326721,
"learning_rate": 4.989152360290987e-06,
"loss": 0.1068,
"step": 327
},
{
"epoch": 0.1492265696087352,
"grad_norm": 2.117777475502305,
"learning_rate": 4.989085756823015e-06,
"loss": 0.1274,
"step": 328
},
{
"epoch": 0.14968152866242038,
"grad_norm": 1.6877038030416243,
"learning_rate": 4.989018949958197e-06,
"loss": 0.1001,
"step": 329
},
{
"epoch": 0.15013648771610555,
"grad_norm": 2.018139319167573,
"learning_rate": 4.98895193970199e-06,
"loss": 0.0726,
"step": 330
},
{
"epoch": 0.15059144676979072,
"grad_norm": 1.7601822979826238,
"learning_rate": 4.9888847260598705e-06,
"loss": 0.0884,
"step": 331
},
{
"epoch": 0.1510464058234759,
"grad_norm": 2.153451550499006,
"learning_rate": 4.98881730903733e-06,
"loss": 0.1263,
"step": 332
},
{
"epoch": 0.15150136487716107,
"grad_norm": 1.7037846763057773,
"learning_rate": 4.98874968863988e-06,
"loss": 0.1017,
"step": 333
},
{
"epoch": 0.15195632393084624,
"grad_norm": 1.6373036503866722,
"learning_rate": 4.988681864873044e-06,
"loss": 0.0936,
"step": 334
},
{
"epoch": 0.15241128298453138,
"grad_norm": 1.5043938510579566,
"learning_rate": 4.988613837742364e-06,
"loss": 0.0841,
"step": 335
},
{
"epoch": 0.15286624203821655,
"grad_norm": 1.9480098961832564,
"learning_rate": 4.9885456072534015e-06,
"loss": 0.093,
"step": 336
},
{
"epoch": 0.15332120109190173,
"grad_norm": 2.0743334215437845,
"learning_rate": 4.988477173411728e-06,
"loss": 0.1001,
"step": 337
},
{
"epoch": 0.1537761601455869,
"grad_norm": 1.3686100112269117,
"learning_rate": 4.988408536222939e-06,
"loss": 0.0706,
"step": 338
},
{
"epoch": 0.15423111919927207,
"grad_norm": 1.7072624744285279,
"learning_rate": 4.9883396956926416e-06,
"loss": 0.0883,
"step": 339
},
{
"epoch": 0.15468607825295724,
"grad_norm": 1.2178991309049074,
"learning_rate": 4.988270651826462e-06,
"loss": 0.066,
"step": 340
},
{
"epoch": 0.15514103730664242,
"grad_norm": 1.5734145514066031,
"learning_rate": 4.988201404630041e-06,
"loss": 0.0818,
"step": 341
},
{
"epoch": 0.15559599636032756,
"grad_norm": 1.4266019263972631,
"learning_rate": 4.988131954109038e-06,
"loss": 0.0835,
"step": 342
},
{
"epoch": 0.15605095541401273,
"grad_norm": 2.2620036917930633,
"learning_rate": 4.988062300269128e-06,
"loss": 0.1374,
"step": 343
},
{
"epoch": 0.1565059144676979,
"grad_norm": 1.4975643248719304,
"learning_rate": 4.987992443116003e-06,
"loss": 0.0817,
"step": 344
},
{
"epoch": 0.15696087352138308,
"grad_norm": 1.723916950757741,
"learning_rate": 4.987922382655372e-06,
"loss": 0.086,
"step": 345
},
{
"epoch": 0.15741583257506825,
"grad_norm": 2.50033376989197,
"learning_rate": 4.987852118892958e-06,
"loss": 0.1498,
"step": 346
},
{
"epoch": 0.15787079162875342,
"grad_norm": 2.0776125106096934,
"learning_rate": 4.987781651834503e-06,
"loss": 0.1258,
"step": 347
},
{
"epoch": 0.1583257506824386,
"grad_norm": 2.186488732885297,
"learning_rate": 4.987710981485768e-06,
"loss": 0.1203,
"step": 348
},
{
"epoch": 0.15878070973612374,
"grad_norm": 2.0497982262406698,
"learning_rate": 4.987640107852525e-06,
"loss": 0.1365,
"step": 349
},
{
"epoch": 0.1592356687898089,
"grad_norm": 1.394060418907116,
"learning_rate": 4.987569030940567e-06,
"loss": 0.0811,
"step": 350
},
{
"epoch": 0.15969062784349408,
"grad_norm": 1.5257209721345255,
"learning_rate": 4.987497750755702e-06,
"loss": 0.0665,
"step": 351
},
{
"epoch": 0.16014558689717925,
"grad_norm": 2.328076306378438,
"learning_rate": 4.987426267303753e-06,
"loss": 0.1186,
"step": 352
},
{
"epoch": 0.16060054595086443,
"grad_norm": 1.8266119344469305,
"learning_rate": 4.987354580590563e-06,
"loss": 0.1011,
"step": 353
},
{
"epoch": 0.1610555050045496,
"grad_norm": 1.7369452160483552,
"learning_rate": 4.987282690621991e-06,
"loss": 0.117,
"step": 354
},
{
"epoch": 0.16151046405823477,
"grad_norm": 1.8346392689418392,
"learning_rate": 4.987210597403907e-06,
"loss": 0.1,
"step": 355
},
{
"epoch": 0.16196542311191992,
"grad_norm": 1.9402353280122917,
"learning_rate": 4.987138300942208e-06,
"loss": 0.0949,
"step": 356
},
{
"epoch": 0.1624203821656051,
"grad_norm": 1.4819316275042067,
"learning_rate": 4.987065801242798e-06,
"loss": 0.0855,
"step": 357
},
{
"epoch": 0.16287534121929026,
"grad_norm": 1.8440191145455884,
"learning_rate": 4.986993098311601e-06,
"loss": 0.1057,
"step": 358
},
{
"epoch": 0.16333030027297543,
"grad_norm": 1.712390016283102,
"learning_rate": 4.986920192154561e-06,
"loss": 0.0917,
"step": 359
},
{
"epoch": 0.1637852593266606,
"grad_norm": 1.2697535382377623,
"learning_rate": 4.986847082777632e-06,
"loss": 0.0729,
"step": 360
},
{
"epoch": 0.16424021838034578,
"grad_norm": 1.5330396115730802,
"learning_rate": 4.986773770186791e-06,
"loss": 0.0966,
"step": 361
},
{
"epoch": 0.16469517743403095,
"grad_norm": 2.359233717201702,
"learning_rate": 4.986700254388027e-06,
"loss": 0.1308,
"step": 362
},
{
"epoch": 0.1651501364877161,
"grad_norm": 1.330733109747955,
"learning_rate": 4.986626535387349e-06,
"loss": 0.0728,
"step": 363
},
{
"epoch": 0.16560509554140126,
"grad_norm": 1.7398719883146694,
"learning_rate": 4.9865526131907795e-06,
"loss": 0.0893,
"step": 364
},
{
"epoch": 0.16606005459508644,
"grad_norm": 2.018839749017437,
"learning_rate": 4.9864784878043595e-06,
"loss": 0.1268,
"step": 365
},
{
"epoch": 0.1665150136487716,
"grad_norm": 2.439244123753763,
"learning_rate": 4.986404159234146e-06,
"loss": 0.1047,
"step": 366
},
{
"epoch": 0.16696997270245678,
"grad_norm": 1.4077243142655576,
"learning_rate": 4.986329627486213e-06,
"loss": 0.07,
"step": 367
},
{
"epoch": 0.16742493175614195,
"grad_norm": 2.0634194365835583,
"learning_rate": 4.986254892566652e-06,
"loss": 0.1199,
"step": 368
},
{
"epoch": 0.16787989080982713,
"grad_norm": 1.507898380305614,
"learning_rate": 4.9861799544815684e-06,
"loss": 0.0798,
"step": 369
},
{
"epoch": 0.16833484986351227,
"grad_norm": 1.5689447325912511,
"learning_rate": 4.986104813237086e-06,
"loss": 0.0872,
"step": 370
},
{
"epoch": 0.16878980891719744,
"grad_norm": 1.5434828853102547,
"learning_rate": 4.986029468839346e-06,
"loss": 0.0756,
"step": 371
},
{
"epoch": 0.16924476797088261,
"grad_norm": 1.9546839136865664,
"learning_rate": 4.985953921294505e-06,
"loss": 0.129,
"step": 372
},
{
"epoch": 0.1696997270245678,
"grad_norm": 1.4457360634551049,
"learning_rate": 4.985878170608736e-06,
"loss": 0.0651,
"step": 373
},
{
"epoch": 0.17015468607825296,
"grad_norm": 1.7053082159754585,
"learning_rate": 4.985802216788228e-06,
"loss": 0.0786,
"step": 374
},
{
"epoch": 0.17060964513193813,
"grad_norm": 2.0831330601859643,
"learning_rate": 4.98572605983919e-06,
"loss": 0.1087,
"step": 375
},
{
"epoch": 0.1710646041856233,
"grad_norm": 1.3106266925763963,
"learning_rate": 4.985649699767842e-06,
"loss": 0.0666,
"step": 376
},
{
"epoch": 0.17151956323930848,
"grad_norm": 1.5931730936354696,
"learning_rate": 4.985573136580427e-06,
"loss": 0.1015,
"step": 377
},
{
"epoch": 0.17197452229299362,
"grad_norm": 1.3398175715153904,
"learning_rate": 4.9854963702832e-06,
"loss": 0.0706,
"step": 378
},
{
"epoch": 0.1724294813466788,
"grad_norm": 1.4932070031671647,
"learning_rate": 4.985419400882433e-06,
"loss": 0.1009,
"step": 379
},
{
"epoch": 0.17288444040036396,
"grad_norm": 2.05809614886543,
"learning_rate": 4.985342228384418e-06,
"loss": 0.1373,
"step": 380
},
{
"epoch": 0.17333939945404914,
"grad_norm": 25.314485102746445,
"learning_rate": 4.985264852795459e-06,
"loss": 0.529,
"step": 381
},
{
"epoch": 0.1737943585077343,
"grad_norm": 1.3496622625056474,
"learning_rate": 4.98518727412188e-06,
"loss": 0.0792,
"step": 382
},
{
"epoch": 0.17424931756141948,
"grad_norm": 2.042157493841037,
"learning_rate": 4.98510949237002e-06,
"loss": 0.1127,
"step": 383
},
{
"epoch": 0.17470427661510465,
"grad_norm": 2.093747109047391,
"learning_rate": 4.985031507546234e-06,
"loss": 0.0931,
"step": 384
},
{
"epoch": 0.1751592356687898,
"grad_norm": 2.620290737475778,
"learning_rate": 4.984953319656896e-06,
"loss": 0.1258,
"step": 385
},
{
"epoch": 0.17561419472247497,
"grad_norm": 1.7812499192074571,
"learning_rate": 4.984874928708395e-06,
"loss": 0.0934,
"step": 386
},
{
"epoch": 0.17606915377616014,
"grad_norm": 1.9861134139953058,
"learning_rate": 4.984796334707136e-06,
"loss": 0.1105,
"step": 387
},
{
"epoch": 0.17652411282984531,
"grad_norm": 9.71210910528449,
"learning_rate": 4.984717537659542e-06,
"loss": 0.119,
"step": 388
},
{
"epoch": 0.1769790718835305,
"grad_norm": 1.2902315877318344,
"learning_rate": 4.984638537572052e-06,
"loss": 0.0591,
"step": 389
},
{
"epoch": 0.17743403093721566,
"grad_norm": 1.693249076147672,
"learning_rate": 4.984559334451121e-06,
"loss": 0.0906,
"step": 390
},
{
"epoch": 0.17788898999090083,
"grad_norm": 1.7045791781932,
"learning_rate": 4.984479928303221e-06,
"loss": 0.066,
"step": 391
},
{
"epoch": 0.17834394904458598,
"grad_norm": 1.588345004423415,
"learning_rate": 4.984400319134841e-06,
"loss": 0.079,
"step": 392
},
{
"epoch": 0.17879890809827115,
"grad_norm": 2.8167066456613368,
"learning_rate": 4.984320506952487e-06,
"loss": 0.1743,
"step": 393
},
{
"epoch": 0.17925386715195632,
"grad_norm": 1.8409665855781128,
"learning_rate": 4.9842404917626796e-06,
"loss": 0.1009,
"step": 394
},
{
"epoch": 0.1797088262056415,
"grad_norm": 1.5444918002986228,
"learning_rate": 4.984160273571959e-06,
"loss": 0.0952,
"step": 395
},
{
"epoch": 0.18016378525932666,
"grad_norm": 1.9718645058282944,
"learning_rate": 4.9840798523868785e-06,
"loss": 0.1217,
"step": 396
},
{
"epoch": 0.18061874431301184,
"grad_norm": 1.669853882784426,
"learning_rate": 4.983999228214011e-06,
"loss": 0.083,
"step": 397
},
{
"epoch": 0.181073703366697,
"grad_norm": 1.5445667787054873,
"learning_rate": 4.983918401059943e-06,
"loss": 0.0838,
"step": 398
},
{
"epoch": 0.18152866242038215,
"grad_norm": 1.8477622601816133,
"learning_rate": 4.983837370931282e-06,
"loss": 0.1199,
"step": 399
},
{
"epoch": 0.18198362147406733,
"grad_norm": 2.295804335093856,
"learning_rate": 4.983756137834647e-06,
"loss": 0.1561,
"step": 400
},
{
"epoch": 0.1824385805277525,
"grad_norm": 2.1902816453958933,
"learning_rate": 4.9836747017766765e-06,
"loss": 0.1014,
"step": 401
},
{
"epoch": 0.18289353958143767,
"grad_norm": 1.7414949549224419,
"learning_rate": 4.983593062764027e-06,
"loss": 0.1046,
"step": 402
},
{
"epoch": 0.18334849863512284,
"grad_norm": 3.529761555914209,
"learning_rate": 4.983511220803367e-06,
"loss": 0.1573,
"step": 403
},
{
"epoch": 0.18380345768880801,
"grad_norm": 1.5931351386368249,
"learning_rate": 4.983429175901386e-06,
"loss": 0.0896,
"step": 404
},
{
"epoch": 0.1842584167424932,
"grad_norm": 1.4617184144821196,
"learning_rate": 4.983346928064788e-06,
"loss": 0.0698,
"step": 405
},
{
"epoch": 0.18471337579617833,
"grad_norm": 1.564679441746091,
"learning_rate": 4.9832644773002935e-06,
"loss": 0.0955,
"step": 406
},
{
"epoch": 0.1851683348498635,
"grad_norm": 1.4077890282448986,
"learning_rate": 4.98318182361464e-06,
"loss": 0.0887,
"step": 407
},
{
"epoch": 0.18562329390354868,
"grad_norm": 1.6028267121804223,
"learning_rate": 4.9830989670145825e-06,
"loss": 0.0989,
"step": 408
},
{
"epoch": 0.18607825295723385,
"grad_norm": 1.8479648547197383,
"learning_rate": 4.9830159075068905e-06,
"loss": 0.1009,
"step": 409
},
{
"epoch": 0.18653321201091902,
"grad_norm": 1.8145495712184487,
"learning_rate": 4.9829326450983514e-06,
"loss": 0.1125,
"step": 410
},
{
"epoch": 0.1869881710646042,
"grad_norm": 1.839873930402737,
"learning_rate": 4.98284917979577e-06,
"loss": 0.0975,
"step": 411
},
{
"epoch": 0.18744313011828936,
"grad_norm": 2.3433237407057863,
"learning_rate": 4.9827655116059656e-06,
"loss": 0.1061,
"step": 412
},
{
"epoch": 0.18789808917197454,
"grad_norm": 1.479552769836274,
"learning_rate": 4.9826816405357755e-06,
"loss": 0.105,
"step": 413
},
{
"epoch": 0.18835304822565968,
"grad_norm": 1.0380040250679141,
"learning_rate": 4.982597566592054e-06,
"loss": 0.0522,
"step": 414
},
{
"epoch": 0.18880800727934485,
"grad_norm": 2.2146611071914744,
"learning_rate": 4.982513289781671e-06,
"loss": 0.1403,
"step": 415
},
{
"epoch": 0.18926296633303002,
"grad_norm": 1.4265466923705232,
"learning_rate": 4.982428810111512e-06,
"loss": 0.0846,
"step": 416
},
{
"epoch": 0.1897179253867152,
"grad_norm": 1.4254072959974569,
"learning_rate": 4.9823441275884814e-06,
"loss": 0.0787,
"step": 417
},
{
"epoch": 0.19017288444040037,
"grad_norm": 2.353200458571576,
"learning_rate": 4.982259242219499e-06,
"loss": 0.1114,
"step": 418
},
{
"epoch": 0.19062784349408554,
"grad_norm": 1.3512279730893322,
"learning_rate": 4.9821741540115006e-06,
"loss": 0.0678,
"step": 419
},
{
"epoch": 0.1910828025477707,
"grad_norm": 1.728060266498106,
"learning_rate": 4.982088862971441e-06,
"loss": 0.1129,
"step": 420
},
{
"epoch": 0.19153776160145586,
"grad_norm": 1.8022543001727114,
"learning_rate": 4.982003369106287e-06,
"loss": 0.1036,
"step": 421
},
{
"epoch": 0.19199272065514103,
"grad_norm": 1.2312712834502222,
"learning_rate": 4.981917672423028e-06,
"loss": 0.065,
"step": 422
},
{
"epoch": 0.1924476797088262,
"grad_norm": 1.6183848549336255,
"learning_rate": 4.981831772928664e-06,
"loss": 0.0934,
"step": 423
},
{
"epoch": 0.19290263876251137,
"grad_norm": 2.001713262915152,
"learning_rate": 4.981745670630216e-06,
"loss": 0.1356,
"step": 424
},
{
"epoch": 0.19335759781619655,
"grad_norm": 2.0057745044552995,
"learning_rate": 4.981659365534718e-06,
"loss": 0.1285,
"step": 425
},
{
"epoch": 0.19381255686988172,
"grad_norm": 2.299079022869691,
"learning_rate": 4.981572857649225e-06,
"loss": 0.1195,
"step": 426
},
{
"epoch": 0.1942675159235669,
"grad_norm": 1.6869951958248894,
"learning_rate": 4.981486146980804e-06,
"loss": 0.0877,
"step": 427
},
{
"epoch": 0.19472247497725204,
"grad_norm": 1.9301190501764922,
"learning_rate": 4.9813992335365415e-06,
"loss": 0.0977,
"step": 428
},
{
"epoch": 0.1951774340309372,
"grad_norm": 1.6227704434432904,
"learning_rate": 4.98131211732354e-06,
"loss": 0.1035,
"step": 429
},
{
"epoch": 0.19563239308462238,
"grad_norm": 1.632769015838627,
"learning_rate": 4.981224798348917e-06,
"loss": 0.0833,
"step": 430
},
{
"epoch": 0.19608735213830755,
"grad_norm": 2.3862639707091082,
"learning_rate": 4.981137276619809e-06,
"loss": 0.1419,
"step": 431
},
{
"epoch": 0.19654231119199272,
"grad_norm": 1.2625986411158334,
"learning_rate": 4.9810495521433675e-06,
"loss": 0.078,
"step": 432
},
{
"epoch": 0.1969972702456779,
"grad_norm": 2.5081068393508157,
"learning_rate": 4.9809616249267616e-06,
"loss": 0.1478,
"step": 433
},
{
"epoch": 0.19745222929936307,
"grad_norm": 1.9644808854065114,
"learning_rate": 4.980873494977174e-06,
"loss": 0.121,
"step": 434
},
{
"epoch": 0.1979071883530482,
"grad_norm": 1.647433915922947,
"learning_rate": 4.98078516230181e-06,
"loss": 0.0865,
"step": 435
},
{
"epoch": 0.19836214740673339,
"grad_norm": 1.5774273491436515,
"learning_rate": 4.980696626907884e-06,
"loss": 0.0887,
"step": 436
},
{
"epoch": 0.19881710646041856,
"grad_norm": 1.5604062690588907,
"learning_rate": 4.980607888802633e-06,
"loss": 0.1,
"step": 437
},
{
"epoch": 0.19927206551410373,
"grad_norm": 1.548442809835796,
"learning_rate": 4.980518947993307e-06,
"loss": 0.1005,
"step": 438
},
{
"epoch": 0.1997270245677889,
"grad_norm": 1.6276180373825353,
"learning_rate": 4.980429804487176e-06,
"loss": 0.1006,
"step": 439
},
{
"epoch": 0.20018198362147407,
"grad_norm": 1.5718547041391637,
"learning_rate": 4.980340458291521e-06,
"loss": 0.0858,
"step": 440
},
{
"epoch": 0.20063694267515925,
"grad_norm": 1.3679183632524226,
"learning_rate": 4.980250909413646e-06,
"loss": 0.0901,
"step": 441
},
{
"epoch": 0.2010919017288444,
"grad_norm": 1.7491296961984788,
"learning_rate": 4.980161157860867e-06,
"loss": 0.0888,
"step": 442
},
{
"epoch": 0.20154686078252956,
"grad_norm": 2.0306839493761446,
"learning_rate": 4.980071203640519e-06,
"loss": 0.0893,
"step": 443
},
{
"epoch": 0.20200181983621474,
"grad_norm": 1.353153596211688,
"learning_rate": 4.979981046759952e-06,
"loss": 0.0753,
"step": 444
},
{
"epoch": 0.2024567788898999,
"grad_norm": 1.969605104045741,
"learning_rate": 4.979890687226533e-06,
"loss": 0.1033,
"step": 445
},
{
"epoch": 0.20291173794358508,
"grad_norm": 2.085518332646124,
"learning_rate": 4.979800125047647e-06,
"loss": 0.0979,
"step": 446
},
{
"epoch": 0.20336669699727025,
"grad_norm": 1.6181669031153556,
"learning_rate": 4.979709360230692e-06,
"loss": 0.0969,
"step": 447
},
{
"epoch": 0.20382165605095542,
"grad_norm": 1.6760914355637484,
"learning_rate": 4.979618392783087e-06,
"loss": 0.0883,
"step": 448
},
{
"epoch": 0.20427661510464057,
"grad_norm": 1.2907730003800948,
"learning_rate": 4.979527222712266e-06,
"loss": 0.0775,
"step": 449
},
{
"epoch": 0.20473157415832574,
"grad_norm": 1.241096973502198,
"learning_rate": 4.9794358500256765e-06,
"loss": 0.0599,
"step": 450
},
{
"epoch": 0.2051865332120109,
"grad_norm": 1.579037640818148,
"learning_rate": 4.979344274730786e-06,
"loss": 0.0831,
"step": 451
},
{
"epoch": 0.20564149226569609,
"grad_norm": 2.225915719971972,
"learning_rate": 4.979252496835079e-06,
"loss": 0.1116,
"step": 452
},
{
"epoch": 0.20609645131938126,
"grad_norm": 2.3031173397129923,
"learning_rate": 4.979160516346054e-06,
"loss": 0.1536,
"step": 453
},
{
"epoch": 0.20655141037306643,
"grad_norm": 27.297310781833385,
"learning_rate": 4.979068333271227e-06,
"loss": 0.9223,
"step": 454
},
{
"epoch": 0.2070063694267516,
"grad_norm": 2.4041431299507607,
"learning_rate": 4.978975947618131e-06,
"loss": 0.1184,
"step": 455
},
{
"epoch": 0.20746132848043677,
"grad_norm": 1.6683861662324915,
"learning_rate": 4.978883359394316e-06,
"loss": 0.1301,
"step": 456
},
{
"epoch": 0.20791628753412192,
"grad_norm": 1.9056814965685545,
"learning_rate": 4.978790568607347e-06,
"loss": 0.1001,
"step": 457
},
{
"epoch": 0.2083712465878071,
"grad_norm": 1.9713836323302738,
"learning_rate": 4.9786975752648076e-06,
"loss": 0.1174,
"step": 458
},
{
"epoch": 0.20882620564149226,
"grad_norm": 1.598376196967646,
"learning_rate": 4.978604379374295e-06,
"loss": 0.0986,
"step": 459
},
{
"epoch": 0.20928116469517744,
"grad_norm": 1.5517923833736031,
"learning_rate": 4.978510980943427e-06,
"loss": 0.0807,
"step": 460
},
{
"epoch": 0.2097361237488626,
"grad_norm": 2.004418653450344,
"learning_rate": 4.978417379979834e-06,
"loss": 0.1065,
"step": 461
},
{
"epoch": 0.21019108280254778,
"grad_norm": 1.7753220163198007,
"learning_rate": 4.978323576491165e-06,
"loss": 0.0987,
"step": 462
},
{
"epoch": 0.21064604185623295,
"grad_norm": 1.7384737383317277,
"learning_rate": 4.978229570485085e-06,
"loss": 0.1048,
"step": 463
},
{
"epoch": 0.2111010009099181,
"grad_norm": 1.5352099211420311,
"learning_rate": 4.978135361969276e-06,
"loss": 0.0983,
"step": 464
},
{
"epoch": 0.21155595996360327,
"grad_norm": 1.6028799125387194,
"learning_rate": 4.9780409509514375e-06,
"loss": 0.091,
"step": 465
},
{
"epoch": 0.21201091901728844,
"grad_norm": 1.9664054893168261,
"learning_rate": 4.977946337439282e-06,
"loss": 0.1495,
"step": 466
},
{
"epoch": 0.2124658780709736,
"grad_norm": 1.7122667851036462,
"learning_rate": 4.9778515214405436e-06,
"loss": 0.1139,
"step": 467
},
{
"epoch": 0.21292083712465878,
"grad_norm": 1.7566455248377864,
"learning_rate": 4.977756502962967e-06,
"loss": 0.1097,
"step": 468
},
{
"epoch": 0.21337579617834396,
"grad_norm": 1.1350501611425003,
"learning_rate": 4.97766128201432e-06,
"loss": 0.0629,
"step": 469
},
{
"epoch": 0.21383075523202913,
"grad_norm": 1.2023067292666059,
"learning_rate": 4.977565858602381e-06,
"loss": 0.0782,
"step": 470
},
{
"epoch": 0.21428571428571427,
"grad_norm": 1.628252441426902,
"learning_rate": 4.977470232734949e-06,
"loss": 0.0987,
"step": 471
},
{
"epoch": 0.21474067333939945,
"grad_norm": 1.724322735405813,
"learning_rate": 4.977374404419838e-06,
"loss": 0.0903,
"step": 472
},
{
"epoch": 0.21519563239308462,
"grad_norm": 1.470263169494043,
"learning_rate": 4.977278373664877e-06,
"loss": 0.0882,
"step": 473
},
{
"epoch": 0.2156505914467698,
"grad_norm": 2.599396527432543,
"learning_rate": 4.977182140477916e-06,
"loss": 0.1209,
"step": 474
},
{
"epoch": 0.21610555050045496,
"grad_norm": 1.6800447119151198,
"learning_rate": 4.977085704866817e-06,
"loss": 0.0776,
"step": 475
},
{
"epoch": 0.21656050955414013,
"grad_norm": 1.5595540666125045,
"learning_rate": 4.97698906683946e-06,
"loss": 0.103,
"step": 476
},
{
"epoch": 0.2170154686078253,
"grad_norm": 2.248635180290087,
"learning_rate": 4.9768922264037435e-06,
"loss": 0.1388,
"step": 477
},
{
"epoch": 0.21747042766151045,
"grad_norm": 1.1547627152960565,
"learning_rate": 4.976795183567579e-06,
"loss": 0.0624,
"step": 478
},
{
"epoch": 0.21792538671519562,
"grad_norm": 1.56353757750327,
"learning_rate": 4.976697938338898e-06,
"loss": 0.0856,
"step": 479
},
{
"epoch": 0.2183803457688808,
"grad_norm": 1.2335181237621284,
"learning_rate": 4.976600490725645e-06,
"loss": 0.0644,
"step": 480
},
{
"epoch": 0.21883530482256597,
"grad_norm": 1.900991648340467,
"learning_rate": 4.976502840735785e-06,
"loss": 0.153,
"step": 481
},
{
"epoch": 0.21929026387625114,
"grad_norm": 1.3078243371858722,
"learning_rate": 4.976404988377297e-06,
"loss": 0.0621,
"step": 482
},
{
"epoch": 0.2197452229299363,
"grad_norm": 2.0047686247285923,
"learning_rate": 4.976306933658176e-06,
"loss": 0.1136,
"step": 483
},
{
"epoch": 0.22020018198362148,
"grad_norm": 1.8552855878852923,
"learning_rate": 4.976208676586435e-06,
"loss": 0.1284,
"step": 484
},
{
"epoch": 0.22065514103730663,
"grad_norm": 1.8525936784229493,
"learning_rate": 4.976110217170104e-06,
"loss": 0.0917,
"step": 485
},
{
"epoch": 0.2211101000909918,
"grad_norm": 1.4658188242525991,
"learning_rate": 4.976011555417228e-06,
"loss": 0.0749,
"step": 486
},
{
"epoch": 0.22156505914467697,
"grad_norm": 1.1511032936840262,
"learning_rate": 4.975912691335869e-06,
"loss": 0.0761,
"step": 487
},
{
"epoch": 0.22202001819836215,
"grad_norm": 1.458580259230844,
"learning_rate": 4.975813624934106e-06,
"loss": 0.0768,
"step": 488
},
{
"epoch": 0.22247497725204732,
"grad_norm": 1.5627508232221192,
"learning_rate": 4.975714356220035e-06,
"loss": 0.0823,
"step": 489
},
{
"epoch": 0.2229299363057325,
"grad_norm": 1.075721834306004,
"learning_rate": 4.975614885201766e-06,
"loss": 0.0504,
"step": 490
},
{
"epoch": 0.22338489535941766,
"grad_norm": 1.6198884733457342,
"learning_rate": 4.975515211887429e-06,
"loss": 0.1024,
"step": 491
},
{
"epoch": 0.22383985441310283,
"grad_norm": 1.6346417323820548,
"learning_rate": 4.9754153362851684e-06,
"loss": 0.0851,
"step": 492
},
{
"epoch": 0.22429481346678798,
"grad_norm": 2.448143027911265,
"learning_rate": 4.975315258403145e-06,
"loss": 0.1479,
"step": 493
},
{
"epoch": 0.22474977252047315,
"grad_norm": 1.6016068432961146,
"learning_rate": 4.975214978249537e-06,
"loss": 0.0886,
"step": 494
},
{
"epoch": 0.22520473157415832,
"grad_norm": 1.4721161321318619,
"learning_rate": 4.975114495832539e-06,
"loss": 0.0976,
"step": 495
},
{
"epoch": 0.2256596906278435,
"grad_norm": 1.7625335294527533,
"learning_rate": 4.975013811160362e-06,
"loss": 0.0898,
"step": 496
},
{
"epoch": 0.22611464968152867,
"grad_norm": 1.9298670425360585,
"learning_rate": 4.974912924241233e-06,
"loss": 0.1027,
"step": 497
},
{
"epoch": 0.22656960873521384,
"grad_norm": 1.4996755802132458,
"learning_rate": 4.974811835083397e-06,
"loss": 0.0978,
"step": 498
},
{
"epoch": 0.227024567788899,
"grad_norm": 2.1147277125940955,
"learning_rate": 4.974710543695114e-06,
"loss": 0.1063,
"step": 499
},
{
"epoch": 0.22747952684258416,
"grad_norm": 2.529920688558412,
"learning_rate": 4.974609050084661e-06,
"loss": 0.1476,
"step": 500
},
{
"epoch": 0.22793448589626933,
"grad_norm": 2.14209787933433,
"learning_rate": 4.974507354260332e-06,
"loss": 0.1261,
"step": 501
},
{
"epoch": 0.2283894449499545,
"grad_norm": 1.9058176611193165,
"learning_rate": 4.974405456230436e-06,
"loss": 0.1203,
"step": 502
},
{
"epoch": 0.22884440400363967,
"grad_norm": 1.8980074058725056,
"learning_rate": 4.974303356003301e-06,
"loss": 0.0996,
"step": 503
},
{
"epoch": 0.22929936305732485,
"grad_norm": 1.4579903539692274,
"learning_rate": 4.974201053587268e-06,
"loss": 0.0943,
"step": 504
},
{
"epoch": 0.22975432211101002,
"grad_norm": 1.3940386820106656,
"learning_rate": 4.9740985489907005e-06,
"loss": 0.0663,
"step": 505
},
{
"epoch": 0.2302092811646952,
"grad_norm": 2.441971054754706,
"learning_rate": 4.973995842221971e-06,
"loss": 0.1245,
"step": 506
},
{
"epoch": 0.23066424021838033,
"grad_norm": 1.919620601900113,
"learning_rate": 4.973892933289476e-06,
"loss": 0.1159,
"step": 507
},
{
"epoch": 0.2311191992720655,
"grad_norm": 1.672712776153676,
"learning_rate": 4.97378982220162e-06,
"loss": 0.0981,
"step": 508
},
{
"epoch": 0.23157415832575068,
"grad_norm": 1.2125382683302124,
"learning_rate": 4.973686508966832e-06,
"loss": 0.0601,
"step": 509
},
{
"epoch": 0.23202911737943585,
"grad_norm": 1.222443145221144,
"learning_rate": 4.973582993593554e-06,
"loss": 0.0715,
"step": 510
},
{
"epoch": 0.23248407643312102,
"grad_norm": 1.5223951861259333,
"learning_rate": 4.973479276090244e-06,
"loss": 0.0795,
"step": 511
},
{
"epoch": 0.2329390354868062,
"grad_norm": 1.2392582362318094,
"learning_rate": 4.973375356465378e-06,
"loss": 0.061,
"step": 512
},
{
"epoch": 0.23339399454049137,
"grad_norm": 1.7285156139774616,
"learning_rate": 4.973271234727447e-06,
"loss": 0.1201,
"step": 513
},
{
"epoch": 0.2338489535941765,
"grad_norm": 1.4723786585295477,
"learning_rate": 4.97316691088496e-06,
"loss": 0.0885,
"step": 514
},
{
"epoch": 0.23430391264786168,
"grad_norm": 2.25192801645438,
"learning_rate": 4.973062384946442e-06,
"loss": 0.135,
"step": 515
},
{
"epoch": 0.23475887170154686,
"grad_norm": 1.1373098395352674,
"learning_rate": 4.9729576569204345e-06,
"loss": 0.0728,
"step": 516
},
{
"epoch": 0.23521383075523203,
"grad_norm": 1.5300830315604266,
"learning_rate": 4.972852726815495e-06,
"loss": 0.0941,
"step": 517
},
{
"epoch": 0.2356687898089172,
"grad_norm": 1.8026113068627658,
"learning_rate": 4.972747594640197e-06,
"loss": 0.1247,
"step": 518
},
{
"epoch": 0.23612374886260237,
"grad_norm": 1.794104737159684,
"learning_rate": 4.9726422604031335e-06,
"loss": 0.095,
"step": 519
},
{
"epoch": 0.23657870791628755,
"grad_norm": 1.1504559186965777,
"learning_rate": 4.97253672411291e-06,
"loss": 0.0674,
"step": 520
},
{
"epoch": 0.2370336669699727,
"grad_norm": 1.4316672986650767,
"learning_rate": 4.972430985778152e-06,
"loss": 0.0702,
"step": 521
},
{
"epoch": 0.23748862602365786,
"grad_norm": 1.5328603666600327,
"learning_rate": 4.972325045407499e-06,
"loss": 0.0675,
"step": 522
},
{
"epoch": 0.23794358507734303,
"grad_norm": 3.2405357176859857,
"learning_rate": 4.972218903009608e-06,
"loss": 0.1212,
"step": 523
},
{
"epoch": 0.2383985441310282,
"grad_norm": 1.5109558607242208,
"learning_rate": 4.972112558593153e-06,
"loss": 0.0938,
"step": 524
},
{
"epoch": 0.23885350318471338,
"grad_norm": 1.264935168060258,
"learning_rate": 4.972006012166823e-06,
"loss": 0.0742,
"step": 525
},
{
"epoch": 0.23930846223839855,
"grad_norm": 1.3461924059029533,
"learning_rate": 4.971899263739326e-06,
"loss": 0.0844,
"step": 526
},
{
"epoch": 0.23976342129208372,
"grad_norm": 1.7441591810954875,
"learning_rate": 4.971792313319384e-06,
"loss": 0.1139,
"step": 527
},
{
"epoch": 0.24021838034576887,
"grad_norm": 1.7027600325330141,
"learning_rate": 4.971685160915737e-06,
"loss": 0.0867,
"step": 528
},
{
"epoch": 0.24067333939945404,
"grad_norm": 1.6301828004618641,
"learning_rate": 4.971577806537139e-06,
"loss": 0.0943,
"step": 529
},
{
"epoch": 0.2411282984531392,
"grad_norm": 1.6173281507194255,
"learning_rate": 4.971470250192366e-06,
"loss": 0.1052,
"step": 530
},
{
"epoch": 0.24158325750682438,
"grad_norm": 17.712189021618492,
"learning_rate": 4.9713624918902045e-06,
"loss": 0.3191,
"step": 531
},
{
"epoch": 0.24203821656050956,
"grad_norm": 2.336934606774547,
"learning_rate": 4.971254531639461e-06,
"loss": 0.1347,
"step": 532
},
{
"epoch": 0.24249317561419473,
"grad_norm": 1.8922827015678323,
"learning_rate": 4.971146369448957e-06,
"loss": 0.1144,
"step": 533
},
{
"epoch": 0.2429481346678799,
"grad_norm": 1.7408688040721931,
"learning_rate": 4.971038005327532e-06,
"loss": 0.1143,
"step": 534
},
{
"epoch": 0.24340309372156507,
"grad_norm": 1.9327103804196282,
"learning_rate": 4.970929439284039e-06,
"loss": 0.1377,
"step": 535
},
{
"epoch": 0.24385805277525022,
"grad_norm": 2.0181579320929224,
"learning_rate": 4.970820671327351e-06,
"loss": 0.1259,
"step": 536
},
{
"epoch": 0.2443130118289354,
"grad_norm": 1.1056426992050885,
"learning_rate": 4.9707117014663565e-06,
"loss": 0.0633,
"step": 537
},
{
"epoch": 0.24476797088262056,
"grad_norm": 1.853338129642874,
"learning_rate": 4.97060252970996e-06,
"loss": 0.1215,
"step": 538
},
{
"epoch": 0.24522292993630573,
"grad_norm": 1.6843406450831364,
"learning_rate": 4.970493156067081e-06,
"loss": 0.1016,
"step": 539
},
{
"epoch": 0.2456778889899909,
"grad_norm": 1.1701908663612965,
"learning_rate": 4.970383580546658e-06,
"loss": 0.0731,
"step": 540
},
{
"epoch": 0.24613284804367608,
"grad_norm": 1.7890527407391215,
"learning_rate": 4.970273803157645e-06,
"loss": 0.1097,
"step": 541
},
{
"epoch": 0.24658780709736125,
"grad_norm": 1.4169073671700831,
"learning_rate": 4.970163823909013e-06,
"loss": 0.0845,
"step": 542
},
{
"epoch": 0.2470427661510464,
"grad_norm": 1.5828589024944335,
"learning_rate": 4.970053642809748e-06,
"loss": 0.0921,
"step": 543
},
{
"epoch": 0.24749772520473157,
"grad_norm": 1.6370747251722932,
"learning_rate": 4.969943259868853e-06,
"loss": 0.1088,
"step": 544
},
{
"epoch": 0.24795268425841674,
"grad_norm": 2.023470308157194,
"learning_rate": 4.969832675095351e-06,
"loss": 0.1052,
"step": 545
},
{
"epoch": 0.2484076433121019,
"grad_norm": 1.7462230999429424,
"learning_rate": 4.969721888498275e-06,
"loss": 0.1141,
"step": 546
},
{
"epoch": 0.24886260236578708,
"grad_norm": 1.428774250085193,
"learning_rate": 4.96961090008668e-06,
"loss": 0.0824,
"step": 547
},
{
"epoch": 0.24931756141947226,
"grad_norm": 1.6447081301063733,
"learning_rate": 4.969499709869635e-06,
"loss": 0.1324,
"step": 548
},
{
"epoch": 0.24977252047315743,
"grad_norm": 2.0250820847646054,
"learning_rate": 4.969388317856225e-06,
"loss": 0.1122,
"step": 549
},
{
"epoch": 0.2502274795268426,
"grad_norm": 2.060820071851061,
"learning_rate": 4.969276724055554e-06,
"loss": 0.128,
"step": 550
},
{
"epoch": 0.25068243858052774,
"grad_norm": 1.8421595012757042,
"learning_rate": 4.969164928476741e-06,
"loss": 0.0929,
"step": 551
},
{
"epoch": 0.25113739763421294,
"grad_norm": 1.8378761522798848,
"learning_rate": 4.969052931128919e-06,
"loss": 0.1038,
"step": 552
},
{
"epoch": 0.2515923566878981,
"grad_norm": 1.4559119574869848,
"learning_rate": 4.968940732021243e-06,
"loss": 0.0884,
"step": 553
},
{
"epoch": 0.25204731574158323,
"grad_norm": 1.9971887851212364,
"learning_rate": 4.9688283311628795e-06,
"loss": 0.1353,
"step": 554
},
{
"epoch": 0.25250227479526843,
"grad_norm": 1.7386639848323409,
"learning_rate": 4.968715728563014e-06,
"loss": 0.1025,
"step": 555
},
{
"epoch": 0.2529572338489536,
"grad_norm": 1.260155855896464,
"learning_rate": 4.968602924230847e-06,
"loss": 0.0684,
"step": 556
},
{
"epoch": 0.2534121929026388,
"grad_norm": 2.3395689748358843,
"learning_rate": 4.968489918175598e-06,
"loss": 0.1151,
"step": 557
},
{
"epoch": 0.2538671519563239,
"grad_norm": 2.0737729432038137,
"learning_rate": 4.9683767104065014e-06,
"loss": 0.107,
"step": 558
},
{
"epoch": 0.2543221110100091,
"grad_norm": 1.4554456387078378,
"learning_rate": 4.968263300932806e-06,
"loss": 0.0674,
"step": 559
},
{
"epoch": 0.25477707006369427,
"grad_norm": 1.236095562563839,
"learning_rate": 4.968149689763781e-06,
"loss": 0.0771,
"step": 560
},
{
"epoch": 0.2552320291173794,
"grad_norm": 1.6261579693523964,
"learning_rate": 4.968035876908708e-06,
"loss": 0.1033,
"step": 561
},
{
"epoch": 0.2556869881710646,
"grad_norm": 1.8267174614929946,
"learning_rate": 4.967921862376889e-06,
"loss": 0.1153,
"step": 562
},
{
"epoch": 0.25614194722474976,
"grad_norm": 1.9897704292294367,
"learning_rate": 4.9678076461776415e-06,
"loss": 0.1168,
"step": 563
},
{
"epoch": 0.25659690627843496,
"grad_norm": 1.9727936679798233,
"learning_rate": 4.9676932283202965e-06,
"loss": 0.1389,
"step": 564
},
{
"epoch": 0.2570518653321201,
"grad_norm": 1.8484690700205213,
"learning_rate": 4.967578608814205e-06,
"loss": 0.1024,
"step": 565
},
{
"epoch": 0.2575068243858053,
"grad_norm": 1.4833575893287436,
"learning_rate": 4.9674637876687345e-06,
"loss": 0.0959,
"step": 566
},
{
"epoch": 0.25796178343949044,
"grad_norm": 1.0731244531443167,
"learning_rate": 4.967348764893265e-06,
"loss": 0.0652,
"step": 567
},
{
"epoch": 0.2584167424931756,
"grad_norm": 1.882586364820984,
"learning_rate": 4.967233540497197e-06,
"loss": 0.0887,
"step": 568
},
{
"epoch": 0.2588717015468608,
"grad_norm": 1.5585900206462215,
"learning_rate": 4.967118114489946e-06,
"loss": 0.0705,
"step": 569
},
{
"epoch": 0.25932666060054593,
"grad_norm": 1.4304247727655925,
"learning_rate": 4.967002486880944e-06,
"loss": 0.0689,
"step": 570
},
{
"epoch": 0.25978161965423113,
"grad_norm": 1.996611084455256,
"learning_rate": 4.966886657679641e-06,
"loss": 0.1134,
"step": 571
},
{
"epoch": 0.2602365787079163,
"grad_norm": 2.573142554440562,
"learning_rate": 4.966770626895499e-06,
"loss": 0.137,
"step": 572
},
{
"epoch": 0.2606915377616015,
"grad_norm": 1.7759211248358038,
"learning_rate": 4.966654394538002e-06,
"loss": 0.097,
"step": 573
},
{
"epoch": 0.2611464968152866,
"grad_norm": 1.3021079669208342,
"learning_rate": 4.966537960616646e-06,
"loss": 0.0774,
"step": 574
},
{
"epoch": 0.26160145586897177,
"grad_norm": 2.328733131052364,
"learning_rate": 4.9664213251409486e-06,
"loss": 0.1105,
"step": 575
},
{
"epoch": 0.26205641492265697,
"grad_norm": 2.281267812919593,
"learning_rate": 4.9663044881204375e-06,
"loss": 0.1556,
"step": 576
},
{
"epoch": 0.2625113739763421,
"grad_norm": 1.7215892787568372,
"learning_rate": 4.9661874495646615e-06,
"loss": 0.0917,
"step": 577
},
{
"epoch": 0.2629663330300273,
"grad_norm": 1.3072003221216781,
"learning_rate": 4.9660702094831845e-06,
"loss": 0.0818,
"step": 578
},
{
"epoch": 0.26342129208371245,
"grad_norm": 2.141135787879026,
"learning_rate": 4.965952767885587e-06,
"loss": 0.1187,
"step": 579
},
{
"epoch": 0.26387625113739765,
"grad_norm": 2.3440295569320857,
"learning_rate": 4.965835124781465e-06,
"loss": 0.1336,
"step": 580
},
{
"epoch": 0.2643312101910828,
"grad_norm": 1.2377586425554465,
"learning_rate": 4.965717280180432e-06,
"loss": 0.0771,
"step": 581
},
{
"epoch": 0.26478616924476794,
"grad_norm": 1.5553208083958672,
"learning_rate": 4.965599234092118e-06,
"loss": 0.0906,
"step": 582
},
{
"epoch": 0.26524112829845314,
"grad_norm": 1.676762616981095,
"learning_rate": 4.96548098652617e-06,
"loss": 0.1091,
"step": 583
},
{
"epoch": 0.2656960873521383,
"grad_norm": 1.8329426527347645,
"learning_rate": 4.965362537492249e-06,
"loss": 0.1171,
"step": 584
},
{
"epoch": 0.2661510464058235,
"grad_norm": 1.2752855217123082,
"learning_rate": 4.9652438870000356e-06,
"loss": 0.0726,
"step": 585
},
{
"epoch": 0.26660600545950863,
"grad_norm": 1.188941544645384,
"learning_rate": 4.965125035059224e-06,
"loss": 0.0801,
"step": 586
},
{
"epoch": 0.26706096451319383,
"grad_norm": 1.4654127807937742,
"learning_rate": 4.965005981679527e-06,
"loss": 0.0839,
"step": 587
},
{
"epoch": 0.267515923566879,
"grad_norm": 2.0288718475884107,
"learning_rate": 4.964886726870673e-06,
"loss": 0.1239,
"step": 588
},
{
"epoch": 0.2679708826205642,
"grad_norm": 1.972686660841513,
"learning_rate": 4.964767270642407e-06,
"loss": 0.1004,
"step": 589
},
{
"epoch": 0.2684258416742493,
"grad_norm": 1.6499743360699521,
"learning_rate": 4.964647613004491e-06,
"loss": 0.0976,
"step": 590
},
{
"epoch": 0.26888080072793447,
"grad_norm": 1.5661213245685233,
"learning_rate": 4.964527753966702e-06,
"loss": 0.0818,
"step": 591
},
{
"epoch": 0.26933575978161967,
"grad_norm": 1.387453226127614,
"learning_rate": 4.964407693538834e-06,
"loss": 0.0813,
"step": 592
},
{
"epoch": 0.2697907188353048,
"grad_norm": 1.8652006740776592,
"learning_rate": 4.9642874317307e-06,
"loss": 0.1092,
"step": 593
},
{
"epoch": 0.27024567788899,
"grad_norm": 1.6739291749648295,
"learning_rate": 4.964166968552124e-06,
"loss": 0.1262,
"step": 594
},
{
"epoch": 0.27070063694267515,
"grad_norm": 1.4965319066427345,
"learning_rate": 4.9640463040129525e-06,
"loss": 0.0749,
"step": 595
},
{
"epoch": 0.27115559599636035,
"grad_norm": 1.483777185503557,
"learning_rate": 4.963925438123044e-06,
"loss": 0.075,
"step": 596
},
{
"epoch": 0.2716105550500455,
"grad_norm": 1.646106287941782,
"learning_rate": 4.963804370892276e-06,
"loss": 0.0948,
"step": 597
},
{
"epoch": 0.27206551410373064,
"grad_norm": 1.8923424637891237,
"learning_rate": 4.9636831023305405e-06,
"loss": 0.1296,
"step": 598
},
{
"epoch": 0.27252047315741584,
"grad_norm": 1.453967822900046,
"learning_rate": 4.963561632447748e-06,
"loss": 0.0777,
"step": 599
},
{
"epoch": 0.272975432211101,
"grad_norm": 1.2633146266239919,
"learning_rate": 4.9634399612538255e-06,
"loss": 0.0704,
"step": 600
},
{
"epoch": 0.2734303912647862,
"grad_norm": 24.856853600017228,
"learning_rate": 4.963318088758714e-06,
"loss": 0.4372,
"step": 601
},
{
"epoch": 0.27388535031847133,
"grad_norm": 1.6301604814034822,
"learning_rate": 4.963196014972371e-06,
"loss": 0.0879,
"step": 602
},
{
"epoch": 0.27434030937215653,
"grad_norm": 1.556460730817159,
"learning_rate": 4.963073739904775e-06,
"loss": 0.0893,
"step": 603
},
{
"epoch": 0.2747952684258417,
"grad_norm": 1.657318032059153,
"learning_rate": 4.962951263565915e-06,
"loss": 0.0933,
"step": 604
},
{
"epoch": 0.2752502274795268,
"grad_norm": 2.273490391362205,
"learning_rate": 4.962828585965801e-06,
"loss": 0.1038,
"step": 605
},
{
"epoch": 0.275705186533212,
"grad_norm": 1.5114052665682505,
"learning_rate": 4.962705707114457e-06,
"loss": 0.097,
"step": 606
},
{
"epoch": 0.27616014558689717,
"grad_norm": 1.7683179621585026,
"learning_rate": 4.962582627021923e-06,
"loss": 0.1127,
"step": 607
},
{
"epoch": 0.27661510464058237,
"grad_norm": 1.8859941959717001,
"learning_rate": 4.962459345698258e-06,
"loss": 0.1152,
"step": 608
},
{
"epoch": 0.2770700636942675,
"grad_norm": 1.9839838015935523,
"learning_rate": 4.962335863153537e-06,
"loss": 0.1198,
"step": 609
},
{
"epoch": 0.2775250227479527,
"grad_norm": 1.3671283570292578,
"learning_rate": 4.962212179397847e-06,
"loss": 0.0876,
"step": 610
},
{
"epoch": 0.27797998180163785,
"grad_norm": 1.4623540558631782,
"learning_rate": 4.962088294441299e-06,
"loss": 0.0754,
"step": 611
},
{
"epoch": 0.278434940855323,
"grad_norm": 2.3501285954750806,
"learning_rate": 4.9619642082940135e-06,
"loss": 0.1,
"step": 612
},
{
"epoch": 0.2788898999090082,
"grad_norm": 1.6593172768016098,
"learning_rate": 4.9618399209661305e-06,
"loss": 0.0918,
"step": 613
},
{
"epoch": 0.27934485896269334,
"grad_norm": 1.4913746956676242,
"learning_rate": 4.961715432467807e-06,
"loss": 0.0788,
"step": 614
},
{
"epoch": 0.27979981801637854,
"grad_norm": 1.3335438953393988,
"learning_rate": 4.961590742809216e-06,
"loss": 0.0743,
"step": 615
},
{
"epoch": 0.2802547770700637,
"grad_norm": 1.4631866469804606,
"learning_rate": 4.961465852000545e-06,
"loss": 0.0869,
"step": 616
},
{
"epoch": 0.2807097361237489,
"grad_norm": 1.8021656107937525,
"learning_rate": 4.961340760052001e-06,
"loss": 0.0906,
"step": 617
},
{
"epoch": 0.28116469517743403,
"grad_norm": 1.74213914067233,
"learning_rate": 4.961215466973806e-06,
"loss": 0.0926,
"step": 618
},
{
"epoch": 0.2816196542311192,
"grad_norm": 2.764803909834576,
"learning_rate": 4.961089972776197e-06,
"loss": 0.1823,
"step": 619
},
{
"epoch": 0.2820746132848044,
"grad_norm": 1.3665676735119967,
"learning_rate": 4.9609642774694285e-06,
"loss": 0.0734,
"step": 620
},
{
"epoch": 0.2825295723384895,
"grad_norm": 1.9426323562959267,
"learning_rate": 4.960838381063774e-06,
"loss": 0.0972,
"step": 621
},
{
"epoch": 0.2829845313921747,
"grad_norm": 2.3374254341147322,
"learning_rate": 4.960712283569521e-06,
"loss": 0.1411,
"step": 622
},
{
"epoch": 0.28343949044585987,
"grad_norm": 2.2747894788958543,
"learning_rate": 4.960585984996971e-06,
"loss": 0.1033,
"step": 623
},
{
"epoch": 0.28389444949954507,
"grad_norm": 1.7445142059152803,
"learning_rate": 4.960459485356447e-06,
"loss": 0.1222,
"step": 624
},
{
"epoch": 0.2843494085532302,
"grad_norm": 1.5220008831965313,
"learning_rate": 4.960332784658285e-06,
"loss": 0.1027,
"step": 625
},
{
"epoch": 0.28480436760691535,
"grad_norm": 2.1347326062219034,
"learning_rate": 4.960205882912839e-06,
"loss": 0.1237,
"step": 626
},
{
"epoch": 0.28525932666060055,
"grad_norm": 2.5984695620436002,
"learning_rate": 4.9600787801304785e-06,
"loss": 0.1871,
"step": 627
},
{
"epoch": 0.2857142857142857,
"grad_norm": 2.1207792848317375,
"learning_rate": 4.959951476321589e-06,
"loss": 0.1205,
"step": 628
},
{
"epoch": 0.2861692447679709,
"grad_norm": 1.1897630810057305,
"learning_rate": 4.959823971496575e-06,
"loss": 0.0773,
"step": 629
},
{
"epoch": 0.28662420382165604,
"grad_norm": 3.4920069239312976,
"learning_rate": 4.959696265665853e-06,
"loss": 0.1897,
"step": 630
},
{
"epoch": 0.28707916287534124,
"grad_norm": 1.425742783647833,
"learning_rate": 4.959568358839862e-06,
"loss": 0.0635,
"step": 631
},
{
"epoch": 0.2875341219290264,
"grad_norm": 1.330689822741385,
"learning_rate": 4.95944025102905e-06,
"loss": 0.0722,
"step": 632
},
{
"epoch": 0.28798908098271153,
"grad_norm": 1.99039564333339,
"learning_rate": 4.959311942243888e-06,
"loss": 0.1158,
"step": 633
},
{
"epoch": 0.28844404003639673,
"grad_norm": 1.593751969696495,
"learning_rate": 4.95918343249486e-06,
"loss": 0.0861,
"step": 634
},
{
"epoch": 0.2888989990900819,
"grad_norm": 1.8945402616067804,
"learning_rate": 4.959054721792469e-06,
"loss": 0.1171,
"step": 635
},
{
"epoch": 0.2893539581437671,
"grad_norm": 1.4569740573581391,
"learning_rate": 4.958925810147231e-06,
"loss": 0.0777,
"step": 636
},
{
"epoch": 0.2898089171974522,
"grad_norm": 1.7102068304451903,
"learning_rate": 4.958796697569679e-06,
"loss": 0.0872,
"step": 637
},
{
"epoch": 0.2902638762511374,
"grad_norm": 1.5378977203553044,
"learning_rate": 4.958667384070365e-06,
"loss": 0.0796,
"step": 638
},
{
"epoch": 0.29071883530482256,
"grad_norm": 1.9723232607058794,
"learning_rate": 4.958537869659855e-06,
"loss": 0.1204,
"step": 639
},
{
"epoch": 0.2911737943585077,
"grad_norm": 1.4856408560761394,
"learning_rate": 4.958408154348734e-06,
"loss": 0.0763,
"step": 640
},
{
"epoch": 0.2916287534121929,
"grad_norm": 1.7342797592944788,
"learning_rate": 4.9582782381476e-06,
"loss": 0.1104,
"step": 641
},
{
"epoch": 0.29208371246587805,
"grad_norm": 2.179383476129295,
"learning_rate": 4.958148121067071e-06,
"loss": 0.1694,
"step": 642
},
{
"epoch": 0.29253867151956325,
"grad_norm": 1.8609060135735762,
"learning_rate": 4.9580178031177775e-06,
"loss": 0.1303,
"step": 643
},
{
"epoch": 0.2929936305732484,
"grad_norm": 1.4742279064065518,
"learning_rate": 4.9578872843103694e-06,
"loss": 0.1001,
"step": 644
},
{
"epoch": 0.2934485896269336,
"grad_norm": 1.7670333338462736,
"learning_rate": 4.957756564655513e-06,
"loss": 0.1022,
"step": 645
},
{
"epoch": 0.29390354868061874,
"grad_norm": 1.6630538784639108,
"learning_rate": 4.957625644163888e-06,
"loss": 0.1055,
"step": 646
},
{
"epoch": 0.2943585077343039,
"grad_norm": 1.9118546637397547,
"learning_rate": 4.957494522846194e-06,
"loss": 0.1029,
"step": 647
},
{
"epoch": 0.2948134667879891,
"grad_norm": 1.7468783195584092,
"learning_rate": 4.957363200713146e-06,
"loss": 0.13,
"step": 648
},
{
"epoch": 0.29526842584167423,
"grad_norm": 1.4923304655802225,
"learning_rate": 4.957231677775475e-06,
"loss": 0.0846,
"step": 649
},
{
"epoch": 0.29572338489535943,
"grad_norm": 2.0864859163635407,
"learning_rate": 4.957099954043928e-06,
"loss": 0.1363,
"step": 650
},
{
"epoch": 0.2961783439490446,
"grad_norm": 1.467640729386297,
"learning_rate": 4.956968029529269e-06,
"loss": 0.113,
"step": 651
},
{
"epoch": 0.2966333030027298,
"grad_norm": 1.5940129351295147,
"learning_rate": 4.956835904242277e-06,
"loss": 0.1121,
"step": 652
},
{
"epoch": 0.2970882620564149,
"grad_norm": 1.305300483782713,
"learning_rate": 4.9567035781937516e-06,
"loss": 0.0569,
"step": 653
},
{
"epoch": 0.29754322111010006,
"grad_norm": 1.8626374769697236,
"learning_rate": 4.9565710513945024e-06,
"loss": 0.095,
"step": 654
},
{
"epoch": 0.29799818016378526,
"grad_norm": 1.9350135167075724,
"learning_rate": 4.956438323855362e-06,
"loss": 0.11,
"step": 655
},
{
"epoch": 0.2984531392174704,
"grad_norm": 1.7292500874953625,
"learning_rate": 4.956305395587174e-06,
"loss": 0.1259,
"step": 656
},
{
"epoch": 0.2989080982711556,
"grad_norm": 1.7021672274359103,
"learning_rate": 4.956172266600802e-06,
"loss": 0.0857,
"step": 657
},
{
"epoch": 0.29936305732484075,
"grad_norm": 1.2481942065304896,
"learning_rate": 4.956038936907125e-06,
"loss": 0.0776,
"step": 658
},
{
"epoch": 0.29981801637852595,
"grad_norm": 1.4091727470459356,
"learning_rate": 4.955905406517036e-06,
"loss": 0.0706,
"step": 659
},
{
"epoch": 0.3002729754322111,
"grad_norm": 1.8640524340898077,
"learning_rate": 4.95577167544145e-06,
"loss": 0.1176,
"step": 660
},
{
"epoch": 0.30072793448589624,
"grad_norm": 2.0619543797721698,
"learning_rate": 4.955637743691291e-06,
"loss": 0.1148,
"step": 661
},
{
"epoch": 0.30118289353958144,
"grad_norm": 1.9364848961200234,
"learning_rate": 4.955503611277506e-06,
"loss": 0.0964,
"step": 662
},
{
"epoch": 0.3016378525932666,
"grad_norm": 1.5509916734065172,
"learning_rate": 4.955369278211055e-06,
"loss": 0.0824,
"step": 663
},
{
"epoch": 0.3020928116469518,
"grad_norm": 1.8848317603882998,
"learning_rate": 4.955234744502914e-06,
"loss": 0.1,
"step": 664
},
{
"epoch": 0.30254777070063693,
"grad_norm": 1.7147002197137917,
"learning_rate": 4.955100010164079e-06,
"loss": 0.1042,
"step": 665
},
{
"epoch": 0.30300272975432213,
"grad_norm": 1.8287392204283686,
"learning_rate": 4.954965075205557e-06,
"loss": 0.0894,
"step": 666
},
{
"epoch": 0.3034576888080073,
"grad_norm": 3.2978505813072765,
"learning_rate": 4.9548299396383755e-06,
"loss": 0.1555,
"step": 667
},
{
"epoch": 0.3039126478616925,
"grad_norm": 1.733214316892207,
"learning_rate": 4.954694603473578e-06,
"loss": 0.0848,
"step": 668
},
{
"epoch": 0.3043676069153776,
"grad_norm": 2.1290440022616917,
"learning_rate": 4.954559066722222e-06,
"loss": 0.1329,
"step": 669
},
{
"epoch": 0.30482256596906276,
"grad_norm": 1.7482728884321743,
"learning_rate": 4.954423329395385e-06,
"loss": 0.1135,
"step": 670
},
{
"epoch": 0.30527752502274796,
"grad_norm": 1.8272762006745102,
"learning_rate": 4.954287391504156e-06,
"loss": 0.1233,
"step": 671
},
{
"epoch": 0.3057324840764331,
"grad_norm": 2.276356474817249,
"learning_rate": 4.9541512530596455e-06,
"loss": 0.1426,
"step": 672
},
{
"epoch": 0.3061874431301183,
"grad_norm": 1.5212465132609405,
"learning_rate": 4.954014914072978e-06,
"loss": 0.0908,
"step": 673
},
{
"epoch": 0.30664240218380345,
"grad_norm": 1.7081770141846233,
"learning_rate": 4.9538783745552934e-06,
"loss": 0.1069,
"step": 674
},
{
"epoch": 0.30709736123748865,
"grad_norm": 2.2065783569813755,
"learning_rate": 4.95374163451775e-06,
"loss": 0.1303,
"step": 675
},
{
"epoch": 0.3075523202911738,
"grad_norm": 1.9717809133208803,
"learning_rate": 4.953604693971521e-06,
"loss": 0.0969,
"step": 676
},
{
"epoch": 0.30800727934485894,
"grad_norm": 1.5094990032560427,
"learning_rate": 4.953467552927798e-06,
"loss": 0.059,
"step": 677
},
{
"epoch": 0.30846223839854414,
"grad_norm": 2.5084055121202726,
"learning_rate": 4.9533302113977845e-06,
"loss": 0.141,
"step": 678
},
{
"epoch": 0.3089171974522293,
"grad_norm": 2.1105100650062814,
"learning_rate": 4.9531926693927055e-06,
"loss": 0.1162,
"step": 679
},
{
"epoch": 0.3093721565059145,
"grad_norm": 1.9374617838160508,
"learning_rate": 4.953054926923801e-06,
"loss": 0.1119,
"step": 680
},
{
"epoch": 0.30982711555959963,
"grad_norm": 2.266159358282095,
"learning_rate": 4.952916984002325e-06,
"loss": 0.1188,
"step": 681
},
{
"epoch": 0.31028207461328483,
"grad_norm": 2.1490900129362243,
"learning_rate": 4.95277884063955e-06,
"loss": 0.1337,
"step": 682
},
{
"epoch": 0.31073703366697,
"grad_norm": 1.5330806658735066,
"learning_rate": 4.952640496846766e-06,
"loss": 0.109,
"step": 683
},
{
"epoch": 0.3111919927206551,
"grad_norm": 1.41231573264733,
"learning_rate": 4.952501952635276e-06,
"loss": 0.0837,
"step": 684
},
{
"epoch": 0.3116469517743403,
"grad_norm": 1.993511064296186,
"learning_rate": 4.952363208016402e-06,
"loss": 0.1272,
"step": 685
},
{
"epoch": 0.31210191082802546,
"grad_norm": 1.6098606771380728,
"learning_rate": 4.952224263001482e-06,
"loss": 0.0816,
"step": 686
},
{
"epoch": 0.31255686988171066,
"grad_norm": 1.2309412681015492,
"learning_rate": 4.952085117601868e-06,
"loss": 0.0692,
"step": 687
},
{
"epoch": 0.3130118289353958,
"grad_norm": 1.7997377974129165,
"learning_rate": 4.951945771828933e-06,
"loss": 0.1322,
"step": 688
},
{
"epoch": 0.313466787989081,
"grad_norm": 1.3223154067967124,
"learning_rate": 4.951806225694061e-06,
"loss": 0.0979,
"step": 689
},
{
"epoch": 0.31392174704276615,
"grad_norm": 1.9747397800251965,
"learning_rate": 4.951666479208658e-06,
"loss": 0.1184,
"step": 690
},
{
"epoch": 0.3143767060964513,
"grad_norm": 1.4466542632801185,
"learning_rate": 4.951526532384141e-06,
"loss": 0.085,
"step": 691
},
{
"epoch": 0.3148316651501365,
"grad_norm": 1.8649877852775587,
"learning_rate": 4.951386385231946e-06,
"loss": 0.1011,
"step": 692
},
{
"epoch": 0.31528662420382164,
"grad_norm": 1.2680670071467166,
"learning_rate": 4.951246037763528e-06,
"loss": 0.0748,
"step": 693
},
{
"epoch": 0.31574158325750684,
"grad_norm": 1.5151831279551418,
"learning_rate": 4.9511054899903524e-06,
"loss": 0.0874,
"step": 694
},
{
"epoch": 0.316196542311192,
"grad_norm": 1.6436638497099227,
"learning_rate": 4.950964741923905e-06,
"loss": 0.0982,
"step": 695
},
{
"epoch": 0.3166515013648772,
"grad_norm": 1.5379093700813176,
"learning_rate": 4.950823793575688e-06,
"loss": 0.0857,
"step": 696
},
{
"epoch": 0.31710646041856233,
"grad_norm": 2.4063943761092452,
"learning_rate": 4.950682644957218e-06,
"loss": 0.1253,
"step": 697
},
{
"epoch": 0.3175614194722475,
"grad_norm": 2.5063143673804844,
"learning_rate": 4.9505412960800295e-06,
"loss": 0.1511,
"step": 698
},
{
"epoch": 0.3180163785259327,
"grad_norm": 1.722833309256951,
"learning_rate": 4.950399746955673e-06,
"loss": 0.0999,
"step": 699
},
{
"epoch": 0.3184713375796178,
"grad_norm": 1.8190148406823232,
"learning_rate": 4.950257997595716e-06,
"loss": 0.0895,
"step": 700
},
{
"epoch": 0.318926296633303,
"grad_norm": 1.9186747250049239,
"learning_rate": 4.950116048011739e-06,
"loss": 0.0964,
"step": 701
},
{
"epoch": 0.31938125568698816,
"grad_norm": 1.372930302125184,
"learning_rate": 4.949973898215344e-06,
"loss": 0.0589,
"step": 702
},
{
"epoch": 0.31983621474067336,
"grad_norm": 1.9707430002902289,
"learning_rate": 4.949831548218146e-06,
"loss": 0.1054,
"step": 703
},
{
"epoch": 0.3202911737943585,
"grad_norm": 2.0845604349239832,
"learning_rate": 4.949688998031777e-06,
"loss": 0.1105,
"step": 704
},
{
"epoch": 0.32074613284804365,
"grad_norm": 1.4969274131429369,
"learning_rate": 4.949546247667886e-06,
"loss": 0.0814,
"step": 705
},
{
"epoch": 0.32120109190172885,
"grad_norm": 1.9940826155791407,
"learning_rate": 4.949403297138137e-06,
"loss": 0.1064,
"step": 706
},
{
"epoch": 0.321656050955414,
"grad_norm": 1.7246519891154302,
"learning_rate": 4.949260146454212e-06,
"loss": 0.1093,
"step": 707
},
{
"epoch": 0.3221110100090992,
"grad_norm": 1.6890948945842699,
"learning_rate": 4.94911679562781e-06,
"loss": 0.0888,
"step": 708
},
{
"epoch": 0.32256596906278434,
"grad_norm": 2.0455963687465837,
"learning_rate": 4.948973244670643e-06,
"loss": 0.1019,
"step": 709
},
{
"epoch": 0.32302092811646954,
"grad_norm": 1.7678121189421865,
"learning_rate": 4.948829493594441e-06,
"loss": 0.0961,
"step": 710
},
{
"epoch": 0.3234758871701547,
"grad_norm": 1.3731566726245188,
"learning_rate": 4.9486855424109524e-06,
"loss": 0.072,
"step": 711
},
{
"epoch": 0.32393084622383983,
"grad_norm": 1.4962983653581472,
"learning_rate": 4.948541391131939e-06,
"loss": 0.0905,
"step": 712
},
{
"epoch": 0.32438580527752503,
"grad_norm": 1.4198695601427125,
"learning_rate": 4.948397039769181e-06,
"loss": 0.0616,
"step": 713
},
{
"epoch": 0.3248407643312102,
"grad_norm": 1.131377673368795,
"learning_rate": 4.948252488334474e-06,
"loss": 0.0526,
"step": 714
},
{
"epoch": 0.3252957233848954,
"grad_norm": 1.1969683311404917,
"learning_rate": 4.948107736839629e-06,
"loss": 0.0763,
"step": 715
},
{
"epoch": 0.3257506824385805,
"grad_norm": 1.6793927846583725,
"learning_rate": 4.947962785296476e-06,
"loss": 0.1153,
"step": 716
},
{
"epoch": 0.3262056414922657,
"grad_norm": 2.070694963019659,
"learning_rate": 4.9478176337168594e-06,
"loss": 0.1153,
"step": 717
},
{
"epoch": 0.32666060054595086,
"grad_norm": 2.7729923804058516,
"learning_rate": 4.9476722821126386e-06,
"loss": 0.171,
"step": 718
},
{
"epoch": 0.327115559599636,
"grad_norm": 1.4442284620787837,
"learning_rate": 4.9475267304956945e-06,
"loss": 0.0997,
"step": 719
},
{
"epoch": 0.3275705186533212,
"grad_norm": 2.0979816044129413,
"learning_rate": 4.947380978877917e-06,
"loss": 0.1138,
"step": 720
},
{
"epoch": 0.32802547770700635,
"grad_norm": 1.9982881232916472,
"learning_rate": 4.947235027271219e-06,
"loss": 0.1402,
"step": 721
},
{
"epoch": 0.32848043676069155,
"grad_norm": 1.3317844805683108,
"learning_rate": 4.9470888756875265e-06,
"loss": 0.0707,
"step": 722
},
{
"epoch": 0.3289353958143767,
"grad_norm": 1.4665146144499257,
"learning_rate": 4.946942524138782e-06,
"loss": 0.075,
"step": 723
},
{
"epoch": 0.3293903548680619,
"grad_norm": 1.6321427811402383,
"learning_rate": 4.946795972636944e-06,
"loss": 0.0971,
"step": 724
},
{
"epoch": 0.32984531392174704,
"grad_norm": 1.9541110640157349,
"learning_rate": 4.94664922119399e-06,
"loss": 0.1347,
"step": 725
},
{
"epoch": 0.3303002729754322,
"grad_norm": 1.664760132709453,
"learning_rate": 4.94650226982191e-06,
"loss": 0.0959,
"step": 726
},
{
"epoch": 0.3307552320291174,
"grad_norm": 2.509161708357272,
"learning_rate": 4.9463551185327115e-06,
"loss": 0.1885,
"step": 727
},
{
"epoch": 0.33121019108280253,
"grad_norm": 1.7296886670922147,
"learning_rate": 4.946207767338422e-06,
"loss": 0.0867,
"step": 728
},
{
"epoch": 0.33166515013648773,
"grad_norm": 1.5254904811287948,
"learning_rate": 4.9460602162510805e-06,
"loss": 0.09,
"step": 729
},
{
"epoch": 0.3321201091901729,
"grad_norm": 1.3404896968358107,
"learning_rate": 4.945912465282744e-06,
"loss": 0.0782,
"step": 730
},
{
"epoch": 0.3325750682438581,
"grad_norm": 1.79952897501454,
"learning_rate": 4.945764514445487e-06,
"loss": 0.1444,
"step": 731
},
{
"epoch": 0.3330300272975432,
"grad_norm": 2.48899319031489,
"learning_rate": 4.9456163637513986e-06,
"loss": 0.1136,
"step": 732
},
{
"epoch": 0.33348498635122836,
"grad_norm": 1.8285171425829347,
"learning_rate": 4.945468013212585e-06,
"loss": 0.1052,
"step": 733
},
{
"epoch": 0.33393994540491356,
"grad_norm": 1.7843881981445446,
"learning_rate": 4.945319462841169e-06,
"loss": 0.1116,
"step": 734
},
{
"epoch": 0.3343949044585987,
"grad_norm": 2.181301353034186,
"learning_rate": 4.94517071264929e-06,
"loss": 0.1251,
"step": 735
},
{
"epoch": 0.3348498635122839,
"grad_norm": 1.2980326592722402,
"learning_rate": 4.945021762649102e-06,
"loss": 0.0648,
"step": 736
},
{
"epoch": 0.33530482256596905,
"grad_norm": 1.3874782347309536,
"learning_rate": 4.9448726128527776e-06,
"loss": 0.0978,
"step": 737
},
{
"epoch": 0.33575978161965425,
"grad_norm": 1.8955499231356112,
"learning_rate": 4.944723263272504e-06,
"loss": 0.0998,
"step": 738
},
{
"epoch": 0.3362147406733394,
"grad_norm": 1.6102418502733031,
"learning_rate": 4.944573713920485e-06,
"loss": 0.1055,
"step": 739
},
{
"epoch": 0.33666969972702454,
"grad_norm": 3.355056116777925,
"learning_rate": 4.944423964808943e-06,
"loss": 0.1831,
"step": 740
},
{
"epoch": 0.33712465878070974,
"grad_norm": 1.507329867530008,
"learning_rate": 4.944274015950113e-06,
"loss": 0.0889,
"step": 741
},
{
"epoch": 0.3375796178343949,
"grad_norm": 1.610548678904166,
"learning_rate": 4.944123867356249e-06,
"loss": 0.0752,
"step": 742
},
{
"epoch": 0.3380345768880801,
"grad_norm": 1.918715600058829,
"learning_rate": 4.943973519039619e-06,
"loss": 0.1335,
"step": 743
},
{
"epoch": 0.33848953594176523,
"grad_norm": 1.3921163271356483,
"learning_rate": 4.943822971012511e-06,
"loss": 0.0727,
"step": 744
},
{
"epoch": 0.33894449499545043,
"grad_norm": 1.2023922578586952,
"learning_rate": 4.943672223287226e-06,
"loss": 0.0628,
"step": 745
},
{
"epoch": 0.3393994540491356,
"grad_norm": 2.2794421985003317,
"learning_rate": 4.9435212758760815e-06,
"loss": 0.1404,
"step": 746
},
{
"epoch": 0.3398544131028208,
"grad_norm": 1.3986125533304865,
"learning_rate": 4.943370128791413e-06,
"loss": 0.0787,
"step": 747
},
{
"epoch": 0.3403093721565059,
"grad_norm": 1.5259961799310353,
"learning_rate": 4.943218782045574e-06,
"loss": 0.1079,
"step": 748
},
{
"epoch": 0.34076433121019106,
"grad_norm": 1.8181192019120165,
"learning_rate": 4.943067235650927e-06,
"loss": 0.1195,
"step": 749
},
{
"epoch": 0.34121929026387626,
"grad_norm": 1.831268771798402,
"learning_rate": 4.942915489619859e-06,
"loss": 0.1065,
"step": 750
},
{
"epoch": 0.3416742493175614,
"grad_norm": 1.7306841826817951,
"learning_rate": 4.9427635439647704e-06,
"loss": 0.1232,
"step": 751
},
{
"epoch": 0.3421292083712466,
"grad_norm": 1.7076927486745839,
"learning_rate": 4.942611398698075e-06,
"loss": 0.0912,
"step": 752
},
{
"epoch": 0.34258416742493175,
"grad_norm": 1.7425991433970283,
"learning_rate": 4.942459053832208e-06,
"loss": 0.0997,
"step": 753
},
{
"epoch": 0.34303912647861695,
"grad_norm": 1.809200639541382,
"learning_rate": 4.942306509379617e-06,
"loss": 0.1085,
"step": 754
},
{
"epoch": 0.3434940855323021,
"grad_norm": 1.293751880354007,
"learning_rate": 4.942153765352767e-06,
"loss": 0.0966,
"step": 755
},
{
"epoch": 0.34394904458598724,
"grad_norm": 1.2918089478267207,
"learning_rate": 4.94200082176414e-06,
"loss": 0.078,
"step": 756
},
{
"epoch": 0.34440400363967244,
"grad_norm": 1.5059276244213293,
"learning_rate": 4.941847678626234e-06,
"loss": 0.0805,
"step": 757
},
{
"epoch": 0.3448589626933576,
"grad_norm": 1.4851814064844335,
"learning_rate": 4.941694335951563e-06,
"loss": 0.0983,
"step": 758
},
{
"epoch": 0.3453139217470428,
"grad_norm": 1.8989617812022122,
"learning_rate": 4.9415407937526575e-06,
"loss": 0.1107,
"step": 759
},
{
"epoch": 0.34576888080072793,
"grad_norm": 1.8347292963195811,
"learning_rate": 4.9413870520420635e-06,
"loss": 0.1237,
"step": 760
},
{
"epoch": 0.34622383985441313,
"grad_norm": 1.5924498433598573,
"learning_rate": 4.941233110832346e-06,
"loss": 0.0735,
"step": 761
},
{
"epoch": 0.3466787989080983,
"grad_norm": 2.3326854621993984,
"learning_rate": 4.941078970136082e-06,
"loss": 0.1295,
"step": 762
},
{
"epoch": 0.3471337579617834,
"grad_norm": 1.7112828341096407,
"learning_rate": 4.940924629965869e-06,
"loss": 0.1162,
"step": 763
},
{
"epoch": 0.3475887170154686,
"grad_norm": 1.5436956280322631,
"learning_rate": 4.940770090334319e-06,
"loss": 0.0861,
"step": 764
},
{
"epoch": 0.34804367606915376,
"grad_norm": 1.6236751771508604,
"learning_rate": 4.940615351254059e-06,
"loss": 0.0968,
"step": 765
},
{
"epoch": 0.34849863512283896,
"grad_norm": 1.0400997330052792,
"learning_rate": 4.940460412737734e-06,
"loss": 0.0711,
"step": 766
},
{
"epoch": 0.3489535941765241,
"grad_norm": 1.623731539624473,
"learning_rate": 4.940305274798005e-06,
"loss": 0.0929,
"step": 767
},
{
"epoch": 0.3494085532302093,
"grad_norm": 1.3764287278870393,
"learning_rate": 4.940149937447549e-06,
"loss": 0.1002,
"step": 768
},
{
"epoch": 0.34986351228389445,
"grad_norm": 1.1571526873015439,
"learning_rate": 4.939994400699061e-06,
"loss": 0.0659,
"step": 769
},
{
"epoch": 0.3503184713375796,
"grad_norm": 1.3670356182264325,
"learning_rate": 4.939838664565248e-06,
"loss": 0.0991,
"step": 770
},
{
"epoch": 0.3507734303912648,
"grad_norm": 1.2532975621868427,
"learning_rate": 4.939682729058839e-06,
"loss": 0.0713,
"step": 771
},
{
"epoch": 0.35122838944494994,
"grad_norm": 1.3003896066972325,
"learning_rate": 4.939526594192574e-06,
"loss": 0.0784,
"step": 772
},
{
"epoch": 0.35168334849863514,
"grad_norm": 1.4253255736587618,
"learning_rate": 4.939370259979213e-06,
"loss": 0.0826,
"step": 773
},
{
"epoch": 0.3521383075523203,
"grad_norm": 2.0399381310170766,
"learning_rate": 4.9392137264315295e-06,
"loss": 0.1293,
"step": 774
},
{
"epoch": 0.3525932666060055,
"grad_norm": 1.938165172266556,
"learning_rate": 4.939056993562316e-06,
"loss": 0.1407,
"step": 775
},
{
"epoch": 0.35304822565969063,
"grad_norm": 1.5665447950299711,
"learning_rate": 4.9389000613843805e-06,
"loss": 0.0942,
"step": 776
},
{
"epoch": 0.3535031847133758,
"grad_norm": 1.6514430942693614,
"learning_rate": 4.938742929910546e-06,
"loss": 0.0927,
"step": 777
},
{
"epoch": 0.353958143767061,
"grad_norm": 1.0136329941515525,
"learning_rate": 4.938585599153652e-06,
"loss": 0.0676,
"step": 778
},
{
"epoch": 0.3544131028207461,
"grad_norm": 1.6808166258098367,
"learning_rate": 4.938428069126555e-06,
"loss": 0.1029,
"step": 779
},
{
"epoch": 0.3548680618744313,
"grad_norm": 1.6649052760273926,
"learning_rate": 4.9382703398421285e-06,
"loss": 0.0952,
"step": 780
},
{
"epoch": 0.35532302092811646,
"grad_norm": 1.734423574608651,
"learning_rate": 4.938112411313261e-06,
"loss": 0.1098,
"step": 781
},
{
"epoch": 0.35577797998180166,
"grad_norm": 1.5154424391674823,
"learning_rate": 4.937954283552858e-06,
"loss": 0.0808,
"step": 782
},
{
"epoch": 0.3562329390354868,
"grad_norm": 1.6988796126790968,
"learning_rate": 4.93779595657384e-06,
"loss": 0.1066,
"step": 783
},
{
"epoch": 0.35668789808917195,
"grad_norm": 2.050921985283142,
"learning_rate": 4.937637430389145e-06,
"loss": 0.1184,
"step": 784
},
{
"epoch": 0.35714285714285715,
"grad_norm": 1.5678672253769157,
"learning_rate": 4.937478705011729e-06,
"loss": 0.0709,
"step": 785
},
{
"epoch": 0.3575978161965423,
"grad_norm": 1.5215473079480804,
"learning_rate": 4.937319780454559e-06,
"loss": 0.1086,
"step": 786
},
{
"epoch": 0.3580527752502275,
"grad_norm": 1.4009067409412712,
"learning_rate": 4.937160656730625e-06,
"loss": 0.1004,
"step": 787
},
{
"epoch": 0.35850773430391264,
"grad_norm": 1.538795370618956,
"learning_rate": 4.9370013338529274e-06,
"loss": 0.0897,
"step": 788
},
{
"epoch": 0.35896269335759784,
"grad_norm": 1.3446100123630027,
"learning_rate": 4.936841811834486e-06,
"loss": 0.0907,
"step": 789
},
{
"epoch": 0.359417652411283,
"grad_norm": 1.9381081676057568,
"learning_rate": 4.936682090688337e-06,
"loss": 0.1534,
"step": 790
},
{
"epoch": 0.35987261146496813,
"grad_norm": 1.787589837431021,
"learning_rate": 4.936522170427531e-06,
"loss": 0.0919,
"step": 791
},
{
"epoch": 0.36032757051865333,
"grad_norm": 1.7189621906826116,
"learning_rate": 4.936362051065136e-06,
"loss": 0.0799,
"step": 792
},
{
"epoch": 0.3607825295723385,
"grad_norm": 1.615638183805568,
"learning_rate": 4.936201732614238e-06,
"loss": 0.0898,
"step": 793
},
{
"epoch": 0.3612374886260237,
"grad_norm": 1.899483445293266,
"learning_rate": 4.9360412150879355e-06,
"loss": 0.1086,
"step": 794
},
{
"epoch": 0.3616924476797088,
"grad_norm": 1.8831302635176637,
"learning_rate": 4.935880498499346e-06,
"loss": 0.0951,
"step": 795
},
{
"epoch": 0.362147406733394,
"grad_norm": 2.0172166216160594,
"learning_rate": 4.935719582861604e-06,
"loss": 0.0983,
"step": 796
},
{
"epoch": 0.36260236578707916,
"grad_norm": 1.7713001106130557,
"learning_rate": 4.935558468187855e-06,
"loss": 0.1177,
"step": 797
},
{
"epoch": 0.3630573248407643,
"grad_norm": 2.049007453668216,
"learning_rate": 4.935397154491268e-06,
"loss": 0.1349,
"step": 798
},
{
"epoch": 0.3635122838944495,
"grad_norm": 2.02340700279538,
"learning_rate": 4.935235641785023e-06,
"loss": 0.1419,
"step": 799
},
{
"epoch": 0.36396724294813465,
"grad_norm": 1.5504094804690502,
"learning_rate": 4.935073930082319e-06,
"loss": 0.1141,
"step": 800
},
{
"epoch": 0.36442220200181985,
"grad_norm": 1.3892292745868653,
"learning_rate": 4.93491201939637e-06,
"loss": 0.0859,
"step": 801
},
{
"epoch": 0.364877161055505,
"grad_norm": 1.636711407623354,
"learning_rate": 4.934749909740408e-06,
"loss": 0.1168,
"step": 802
},
{
"epoch": 0.3653321201091902,
"grad_norm": 1.5867549476191922,
"learning_rate": 4.934587601127677e-06,
"loss": 0.0941,
"step": 803
},
{
"epoch": 0.36578707916287534,
"grad_norm": 1.5019646850922737,
"learning_rate": 4.934425093571442e-06,
"loss": 0.0931,
"step": 804
},
{
"epoch": 0.3662420382165605,
"grad_norm": 1.5412581659446851,
"learning_rate": 4.934262387084984e-06,
"loss": 0.0931,
"step": 805
},
{
"epoch": 0.3666969972702457,
"grad_norm": 1.3579602631174856,
"learning_rate": 4.934099481681595e-06,
"loss": 0.0745,
"step": 806
},
{
"epoch": 0.36715195632393083,
"grad_norm": 1.800459979497766,
"learning_rate": 4.933936377374589e-06,
"loss": 0.1072,
"step": 807
},
{
"epoch": 0.36760691537761603,
"grad_norm": 1.1946995764469395,
"learning_rate": 4.933773074177293e-06,
"loss": 0.0848,
"step": 808
},
{
"epoch": 0.3680618744313012,
"grad_norm": 1.6651644751131276,
"learning_rate": 4.933609572103053e-06,
"loss": 0.0965,
"step": 809
},
{
"epoch": 0.3685168334849864,
"grad_norm": 1.913995880200427,
"learning_rate": 4.933445871165229e-06,
"loss": 0.1315,
"step": 810
},
{
"epoch": 0.3689717925386715,
"grad_norm": 1.5517430124798408,
"learning_rate": 4.933281971377197e-06,
"loss": 0.0856,
"step": 811
},
{
"epoch": 0.36942675159235666,
"grad_norm": 1.474632001508129,
"learning_rate": 4.933117872752352e-06,
"loss": 0.0989,
"step": 812
},
{
"epoch": 0.36988171064604186,
"grad_norm": 1.8862093944877263,
"learning_rate": 4.932953575304102e-06,
"loss": 0.1087,
"step": 813
},
{
"epoch": 0.370336669699727,
"grad_norm": 1.6830668966166524,
"learning_rate": 4.932789079045873e-06,
"loss": 0.1213,
"step": 814
},
{
"epoch": 0.3707916287534122,
"grad_norm": 1.7198476556190763,
"learning_rate": 4.932624383991106e-06,
"loss": 0.1215,
"step": 815
},
{
"epoch": 0.37124658780709735,
"grad_norm": 2.109229814604393,
"learning_rate": 4.9324594901532605e-06,
"loss": 0.1337,
"step": 816
},
{
"epoch": 0.37170154686078255,
"grad_norm": 1.4154701665481155,
"learning_rate": 4.93229439754581e-06,
"loss": 0.0944,
"step": 817
},
{
"epoch": 0.3721565059144677,
"grad_norm": 1.973608289061544,
"learning_rate": 4.932129106182246e-06,
"loss": 0.0901,
"step": 818
},
{
"epoch": 0.37261146496815284,
"grad_norm": 1.651833939526615,
"learning_rate": 4.931963616076075e-06,
"loss": 0.0876,
"step": 819
},
{
"epoch": 0.37306642402183804,
"grad_norm": 1.3876140677966586,
"learning_rate": 4.93179792724082e-06,
"loss": 0.0791,
"step": 820
},
{
"epoch": 0.3735213830755232,
"grad_norm": 1.4201117298181156,
"learning_rate": 4.9316320396900195e-06,
"loss": 0.0857,
"step": 821
},
{
"epoch": 0.3739763421292084,
"grad_norm": 2.158894018361071,
"learning_rate": 4.9314659534372305e-06,
"loss": 0.1499,
"step": 822
},
{
"epoch": 0.37443130118289353,
"grad_norm": 1.2722019893377066,
"learning_rate": 4.931299668496024e-06,
"loss": 0.0626,
"step": 823
},
{
"epoch": 0.37488626023657873,
"grad_norm": 1.5889108253283166,
"learning_rate": 4.931133184879988e-06,
"loss": 0.1003,
"step": 824
},
{
"epoch": 0.37534121929026387,
"grad_norm": 1.133918642525753,
"learning_rate": 4.930966502602727e-06,
"loss": 0.0714,
"step": 825
},
{
"epoch": 0.37579617834394907,
"grad_norm": 2.1296168633446615,
"learning_rate": 4.930799621677862e-06,
"loss": 0.1276,
"step": 826
},
{
"epoch": 0.3762511373976342,
"grad_norm": 2.018575113751553,
"learning_rate": 4.93063254211903e-06,
"loss": 0.134,
"step": 827
},
{
"epoch": 0.37670609645131936,
"grad_norm": 1.2247931548507431,
"learning_rate": 4.930465263939882e-06,
"loss": 0.0617,
"step": 828
},
{
"epoch": 0.37716105550500456,
"grad_norm": 2.032637719937323,
"learning_rate": 4.9302977871540894e-06,
"loss": 0.1191,
"step": 829
},
{
"epoch": 0.3776160145586897,
"grad_norm": 1.8922514826155596,
"learning_rate": 4.930130111775336e-06,
"loss": 0.1136,
"step": 830
},
{
"epoch": 0.3780709736123749,
"grad_norm": 1.2345527477299194,
"learning_rate": 4.9299622378173245e-06,
"loss": 0.0613,
"step": 831
},
{
"epoch": 0.37852593266606005,
"grad_norm": 2.2369584057058693,
"learning_rate": 4.929794165293773e-06,
"loss": 0.1384,
"step": 832
},
{
"epoch": 0.37898089171974525,
"grad_norm": 1.2980952577352378,
"learning_rate": 4.9296258942184145e-06,
"loss": 0.0889,
"step": 833
},
{
"epoch": 0.3794358507734304,
"grad_norm": 2.116237658876168,
"learning_rate": 4.929457424605e-06,
"loss": 0.1156,
"step": 834
},
{
"epoch": 0.37989080982711554,
"grad_norm": 1.820103679143319,
"learning_rate": 4.929288756467296e-06,
"loss": 0.1224,
"step": 835
},
{
"epoch": 0.38034576888080074,
"grad_norm": 1.6658306682266317,
"learning_rate": 4.929119889819086e-06,
"loss": 0.0871,
"step": 836
},
{
"epoch": 0.3808007279344859,
"grad_norm": 2.7831412779318128,
"learning_rate": 4.928950824674169e-06,
"loss": 0.1447,
"step": 837
},
{
"epoch": 0.3812556869881711,
"grad_norm": 1.460745158832598,
"learning_rate": 4.928781561046359e-06,
"loss": 0.0902,
"step": 838
},
{
"epoch": 0.3817106460418562,
"grad_norm": 1.544649379546627,
"learning_rate": 4.928612098949488e-06,
"loss": 0.0995,
"step": 839
},
{
"epoch": 0.3821656050955414,
"grad_norm": 1.583411250445995,
"learning_rate": 4.9284424383974026e-06,
"loss": 0.1007,
"step": 840
},
{
"epoch": 0.38262056414922657,
"grad_norm": 1.2960669635575661,
"learning_rate": 4.928272579403969e-06,
"loss": 0.0679,
"step": 841
},
{
"epoch": 0.3830755232029117,
"grad_norm": 1.4865280371498417,
"learning_rate": 4.928102521983067e-06,
"loss": 0.1208,
"step": 842
},
{
"epoch": 0.3835304822565969,
"grad_norm": 2.1345090660254145,
"learning_rate": 4.9279322661485906e-06,
"loss": 0.1489,
"step": 843
},
{
"epoch": 0.38398544131028206,
"grad_norm": 1.705469805887344,
"learning_rate": 4.927761811914455e-06,
"loss": 0.1084,
"step": 844
},
{
"epoch": 0.38444040036396726,
"grad_norm": 1.358954041720105,
"learning_rate": 4.927591159294587e-06,
"loss": 0.0827,
"step": 845
},
{
"epoch": 0.3848953594176524,
"grad_norm": 1.8335314647218843,
"learning_rate": 4.927420308302933e-06,
"loss": 0.102,
"step": 846
},
{
"epoch": 0.3853503184713376,
"grad_norm": 1.710141204765745,
"learning_rate": 4.927249258953454e-06,
"loss": 0.1091,
"step": 847
},
{
"epoch": 0.38580527752502275,
"grad_norm": 1.7784989569871608,
"learning_rate": 4.927078011260126e-06,
"loss": 0.1094,
"step": 848
},
{
"epoch": 0.3862602365787079,
"grad_norm": 1.9072996593932403,
"learning_rate": 4.926906565236943e-06,
"loss": 0.1255,
"step": 849
},
{
"epoch": 0.3867151956323931,
"grad_norm": 1.7435526255624214,
"learning_rate": 4.926734920897916e-06,
"loss": 0.1076,
"step": 850
},
{
"epoch": 0.38717015468607824,
"grad_norm": 1.3254342460194672,
"learning_rate": 4.926563078257071e-06,
"loss": 0.099,
"step": 851
},
{
"epoch": 0.38762511373976344,
"grad_norm": 1.0985508710385608,
"learning_rate": 4.926391037328448e-06,
"loss": 0.0848,
"step": 852
},
{
"epoch": 0.3880800727934486,
"grad_norm": 1.6344858491886853,
"learning_rate": 4.926218798126108e-06,
"loss": 0.1102,
"step": 853
},
{
"epoch": 0.3885350318471338,
"grad_norm": 1.694464350768917,
"learning_rate": 4.926046360664124e-06,
"loss": 0.0868,
"step": 854
},
{
"epoch": 0.3889899909008189,
"grad_norm": 1.865189060623283,
"learning_rate": 4.925873724956588e-06,
"loss": 0.1152,
"step": 855
},
{
"epoch": 0.38944494995450407,
"grad_norm": 1.794490671041637,
"learning_rate": 4.9257008910176065e-06,
"loss": 0.1443,
"step": 856
},
{
"epoch": 0.38989990900818927,
"grad_norm": 1.6294296423553156,
"learning_rate": 4.925527858861302e-06,
"loss": 0.092,
"step": 857
},
{
"epoch": 0.3903548680618744,
"grad_norm": 1.7424555145921712,
"learning_rate": 4.925354628501814e-06,
"loss": 0.1002,
"step": 858
},
{
"epoch": 0.3908098271155596,
"grad_norm": 2.309513172607415,
"learning_rate": 4.925181199953299e-06,
"loss": 0.1288,
"step": 859
},
{
"epoch": 0.39126478616924476,
"grad_norm": 1.3668641274774587,
"learning_rate": 4.9250075732299285e-06,
"loss": 0.0903,
"step": 860
},
{
"epoch": 0.39171974522292996,
"grad_norm": 1.7785057619158235,
"learning_rate": 4.92483374834589e-06,
"loss": 0.1181,
"step": 861
},
{
"epoch": 0.3921747042766151,
"grad_norm": 1.5234971151354315,
"learning_rate": 4.9246597253153884e-06,
"loss": 0.0935,
"step": 862
},
{
"epoch": 0.39262966333030025,
"grad_norm": 1.1791645313929775,
"learning_rate": 4.924485504152644e-06,
"loss": 0.0822,
"step": 863
},
{
"epoch": 0.39308462238398545,
"grad_norm": 1.5983057485508323,
"learning_rate": 4.924311084871892e-06,
"loss": 0.0966,
"step": 864
},
{
"epoch": 0.3935395814376706,
"grad_norm": 1.6634965227764558,
"learning_rate": 4.924136467487387e-06,
"loss": 0.0759,
"step": 865
},
{
"epoch": 0.3939945404913558,
"grad_norm": 1.5231170961334706,
"learning_rate": 4.923961652013397e-06,
"loss": 0.0881,
"step": 866
},
{
"epoch": 0.39444949954504094,
"grad_norm": 1.4495990250164725,
"learning_rate": 4.923786638464207e-06,
"loss": 0.0941,
"step": 867
},
{
"epoch": 0.39490445859872614,
"grad_norm": 1.3390712595063252,
"learning_rate": 4.9236114268541196e-06,
"loss": 0.0846,
"step": 868
},
{
"epoch": 0.3953594176524113,
"grad_norm": 1.627122973701433,
"learning_rate": 4.923436017197451e-06,
"loss": 0.0819,
"step": 869
},
{
"epoch": 0.3958143767060964,
"grad_norm": 1.3377642278691055,
"learning_rate": 4.923260409508535e-06,
"loss": 0.088,
"step": 870
},
{
"epoch": 0.3962693357597816,
"grad_norm": 1.9694748985572026,
"learning_rate": 4.9230846038017214e-06,
"loss": 0.151,
"step": 871
},
{
"epoch": 0.39672429481346677,
"grad_norm": 1.4923965061921258,
"learning_rate": 4.922908600091378e-06,
"loss": 0.0795,
"step": 872
},
{
"epoch": 0.39717925386715197,
"grad_norm": 1.8057120373297069,
"learning_rate": 4.9227323983918835e-06,
"loss": 0.1439,
"step": 873
},
{
"epoch": 0.3976342129208371,
"grad_norm": 1.226146313826682,
"learning_rate": 4.922555998717639e-06,
"loss": 0.0845,
"step": 874
},
{
"epoch": 0.3980891719745223,
"grad_norm": 1.4188073442884932,
"learning_rate": 4.922379401083058e-06,
"loss": 0.0723,
"step": 875
},
{
"epoch": 0.39854413102820746,
"grad_norm": 1.6044422866063657,
"learning_rate": 4.922202605502573e-06,
"loss": 0.0981,
"step": 876
},
{
"epoch": 0.3989990900818926,
"grad_norm": 1.645096377490142,
"learning_rate": 4.922025611990629e-06,
"loss": 0.0882,
"step": 877
},
{
"epoch": 0.3994540491355778,
"grad_norm": 1.4988618969542298,
"learning_rate": 4.92184842056169e-06,
"loss": 0.0914,
"step": 878
},
{
"epoch": 0.39990900818926295,
"grad_norm": 1.4716766649704647,
"learning_rate": 4.921671031230235e-06,
"loss": 0.0843,
"step": 879
},
{
"epoch": 0.40036396724294815,
"grad_norm": 1.8151437273817552,
"learning_rate": 4.921493444010759e-06,
"loss": 0.1115,
"step": 880
},
{
"epoch": 0.4008189262966333,
"grad_norm": 1.3841092562389385,
"learning_rate": 4.921315658917774e-06,
"loss": 0.0821,
"step": 881
},
{
"epoch": 0.4012738853503185,
"grad_norm": 1.5281014710080694,
"learning_rate": 4.921137675965809e-06,
"loss": 0.0894,
"step": 882
},
{
"epoch": 0.40172884440400364,
"grad_norm": 1.1860457913745353,
"learning_rate": 4.920959495169406e-06,
"loss": 0.0819,
"step": 883
},
{
"epoch": 0.4021838034576888,
"grad_norm": 1.9670434695091386,
"learning_rate": 4.920781116543126e-06,
"loss": 0.1198,
"step": 884
},
{
"epoch": 0.402638762511374,
"grad_norm": 1.4837005110977715,
"learning_rate": 4.920602540101546e-06,
"loss": 0.0871,
"step": 885
},
{
"epoch": 0.4030937215650591,
"grad_norm": 1.8269163623820734,
"learning_rate": 4.920423765859257e-06,
"loss": 0.0956,
"step": 886
},
{
"epoch": 0.4035486806187443,
"grad_norm": 1.6998774179110374,
"learning_rate": 4.920244793830869e-06,
"loss": 0.0973,
"step": 887
},
{
"epoch": 0.40400363967242947,
"grad_norm": 1.6596471546846747,
"learning_rate": 4.920065624031006e-06,
"loss": 0.1085,
"step": 888
},
{
"epoch": 0.40445859872611467,
"grad_norm": 1.4077908132773769,
"learning_rate": 4.919886256474309e-06,
"loss": 0.0904,
"step": 889
},
{
"epoch": 0.4049135577797998,
"grad_norm": 1.7022215596121757,
"learning_rate": 4.919706691175435e-06,
"loss": 0.091,
"step": 890
},
{
"epoch": 0.40536851683348496,
"grad_norm": 2.1232813584307455,
"learning_rate": 4.919526928149058e-06,
"loss": 0.1366,
"step": 891
},
{
"epoch": 0.40582347588717016,
"grad_norm": 1.6341211456957871,
"learning_rate": 4.919346967409867e-06,
"loss": 0.1108,
"step": 892
},
{
"epoch": 0.4062784349408553,
"grad_norm": 1.5324489468460818,
"learning_rate": 4.919166808972567e-06,
"loss": 0.1228,
"step": 893
},
{
"epoch": 0.4067333939945405,
"grad_norm": 2.099437608372934,
"learning_rate": 4.918986452851881e-06,
"loss": 0.1245,
"step": 894
},
{
"epoch": 0.40718835304822565,
"grad_norm": 1.3588941988828955,
"learning_rate": 4.918805899062545e-06,
"loss": 0.0621,
"step": 895
},
{
"epoch": 0.40764331210191085,
"grad_norm": 0.8277266375645331,
"learning_rate": 4.9186251476193146e-06,
"loss": 0.0499,
"step": 896
},
{
"epoch": 0.408098271155596,
"grad_norm": 1.7852175335240448,
"learning_rate": 4.918444198536959e-06,
"loss": 0.1206,
"step": 897
},
{
"epoch": 0.40855323020928114,
"grad_norm": 1.5382745011065326,
"learning_rate": 4.918263051830267e-06,
"loss": 0.1081,
"step": 898
},
{
"epoch": 0.40900818926296634,
"grad_norm": 1.621296590196374,
"learning_rate": 4.918081707514037e-06,
"loss": 0.0881,
"step": 899
},
{
"epoch": 0.4094631483166515,
"grad_norm": 2.178092466242458,
"learning_rate": 4.917900165603091e-06,
"loss": 0.1364,
"step": 900
},
{
"epoch": 0.4099181073703367,
"grad_norm": 1.5880350908655525,
"learning_rate": 4.9177184261122624e-06,
"loss": 0.1073,
"step": 901
},
{
"epoch": 0.4103730664240218,
"grad_norm": 1.8483741427612825,
"learning_rate": 4.917536489056402e-06,
"loss": 0.0972,
"step": 902
},
{
"epoch": 0.410828025477707,
"grad_norm": 1.5893537500919641,
"learning_rate": 4.9173543544503775e-06,
"loss": 0.0851,
"step": 903
},
{
"epoch": 0.41128298453139217,
"grad_norm": 1.144493331243443,
"learning_rate": 4.917172022309072e-06,
"loss": 0.0637,
"step": 904
},
{
"epoch": 0.41173794358507737,
"grad_norm": 1.139422632834299,
"learning_rate": 4.916989492647385e-06,
"loss": 0.065,
"step": 905
},
{
"epoch": 0.4121929026387625,
"grad_norm": 1.2858602055549935,
"learning_rate": 4.916806765480231e-06,
"loss": 0.079,
"step": 906
},
{
"epoch": 0.41264786169244766,
"grad_norm": 1.9716514818564959,
"learning_rate": 4.9166238408225416e-06,
"loss": 0.161,
"step": 907
},
{
"epoch": 0.41310282074613286,
"grad_norm": 1.6206512831659239,
"learning_rate": 4.916440718689267e-06,
"loss": 0.0958,
"step": 908
},
{
"epoch": 0.413557779799818,
"grad_norm": 1.2472167749456646,
"learning_rate": 4.916257399095369e-06,
"loss": 0.0705,
"step": 909
},
{
"epoch": 0.4140127388535032,
"grad_norm": 1.1891048303298737,
"learning_rate": 4.916073882055827e-06,
"loss": 0.0671,
"step": 910
},
{
"epoch": 0.41446769790718835,
"grad_norm": 1.9533245506572903,
"learning_rate": 4.91589016758564e-06,
"loss": 0.1203,
"step": 911
},
{
"epoch": 0.41492265696087355,
"grad_norm": 1.7223916244259532,
"learning_rate": 4.915706255699817e-06,
"loss": 0.1171,
"step": 912
},
{
"epoch": 0.4153776160145587,
"grad_norm": 2.042050502050582,
"learning_rate": 4.915522146413389e-06,
"loss": 0.152,
"step": 913
},
{
"epoch": 0.41583257506824384,
"grad_norm": 1.5213892799482642,
"learning_rate": 4.9153378397413985e-06,
"loss": 0.1011,
"step": 914
},
{
"epoch": 0.41628753412192904,
"grad_norm": 1.8893914267841023,
"learning_rate": 4.915153335698908e-06,
"loss": 0.1133,
"step": 915
},
{
"epoch": 0.4167424931756142,
"grad_norm": 1.7882796521112458,
"learning_rate": 4.914968634300994e-06,
"loss": 0.1081,
"step": 916
},
{
"epoch": 0.4171974522292994,
"grad_norm": 1.186974851727905,
"learning_rate": 4.914783735562748e-06,
"loss": 0.0791,
"step": 917
},
{
"epoch": 0.4176524112829845,
"grad_norm": 1.3276822787818023,
"learning_rate": 4.914598639499281e-06,
"loss": 0.0929,
"step": 918
},
{
"epoch": 0.4181073703366697,
"grad_norm": 1.3143453344689244,
"learning_rate": 4.914413346125717e-06,
"loss": 0.0907,
"step": 919
},
{
"epoch": 0.41856232939035487,
"grad_norm": 1.2706441279848544,
"learning_rate": 4.914227855457199e-06,
"loss": 0.0797,
"step": 920
},
{
"epoch": 0.41901728844404,
"grad_norm": 1.8437493208675002,
"learning_rate": 4.914042167508881e-06,
"loss": 0.0851,
"step": 921
},
{
"epoch": 0.4194722474977252,
"grad_norm": 1.4975873837594447,
"learning_rate": 4.9138562822959416e-06,
"loss": 0.0735,
"step": 922
},
{
"epoch": 0.41992720655141036,
"grad_norm": 1.8590378932388973,
"learning_rate": 4.913670199833566e-06,
"loss": 0.0955,
"step": 923
},
{
"epoch": 0.42038216560509556,
"grad_norm": 1.6110342357827778,
"learning_rate": 4.913483920136961e-06,
"loss": 0.0904,
"step": 924
},
{
"epoch": 0.4208371246587807,
"grad_norm": 1.761284240310015,
"learning_rate": 4.91329744322135e-06,
"loss": 0.0967,
"step": 925
},
{
"epoch": 0.4212920837124659,
"grad_norm": 1.3709410104557458,
"learning_rate": 4.913110769101971e-06,
"loss": 0.0872,
"step": 926
},
{
"epoch": 0.42174704276615105,
"grad_norm": 1.6539854986144262,
"learning_rate": 4.912923897794077e-06,
"loss": 0.0982,
"step": 927
},
{
"epoch": 0.4222020018198362,
"grad_norm": 1.6465498130671066,
"learning_rate": 4.912736829312938e-06,
"loss": 0.1093,
"step": 928
},
{
"epoch": 0.4226569608735214,
"grad_norm": 1.8873864205133448,
"learning_rate": 4.912549563673842e-06,
"loss": 0.1239,
"step": 929
},
{
"epoch": 0.42311191992720654,
"grad_norm": 1.5496708014603886,
"learning_rate": 4.912362100892091e-06,
"loss": 0.1273,
"step": 930
},
{
"epoch": 0.42356687898089174,
"grad_norm": 1.1519662533075623,
"learning_rate": 4.912174440983002e-06,
"loss": 0.0729,
"step": 931
},
{
"epoch": 0.4240218380345769,
"grad_norm": 1.6674274772885138,
"learning_rate": 4.911986583961912e-06,
"loss": 0.1107,
"step": 932
},
{
"epoch": 0.4244767970882621,
"grad_norm": 1.8943327104641587,
"learning_rate": 4.91179852984417e-06,
"loss": 0.0989,
"step": 933
},
{
"epoch": 0.4249317561419472,
"grad_norm": 1.3387420389544245,
"learning_rate": 4.911610278645144e-06,
"loss": 0.0873,
"step": 934
},
{
"epoch": 0.42538671519563237,
"grad_norm": 1.3086866571732974,
"learning_rate": 4.911421830380217e-06,
"loss": 0.0767,
"step": 935
},
{
"epoch": 0.42584167424931757,
"grad_norm": 2.04544186641041,
"learning_rate": 4.911233185064788e-06,
"loss": 0.1285,
"step": 936
},
{
"epoch": 0.4262966333030027,
"grad_norm": 1.6906012723967403,
"learning_rate": 4.911044342714272e-06,
"loss": 0.0997,
"step": 937
},
{
"epoch": 0.4267515923566879,
"grad_norm": 1.439162135385858,
"learning_rate": 4.9108553033440995e-06,
"loss": 0.0744,
"step": 938
},
{
"epoch": 0.42720655141037306,
"grad_norm": 1.2593154408057343,
"learning_rate": 4.91066606696972e-06,
"loss": 0.074,
"step": 939
},
{
"epoch": 0.42766151046405826,
"grad_norm": 1.7514521824191083,
"learning_rate": 4.910476633606597e-06,
"loss": 0.0971,
"step": 940
},
{
"epoch": 0.4281164695177434,
"grad_norm": 1.5625231909908295,
"learning_rate": 4.9102870032702075e-06,
"loss": 0.0689,
"step": 941
},
{
"epoch": 0.42857142857142855,
"grad_norm": 1.5194579023544843,
"learning_rate": 4.910097175976049e-06,
"loss": 0.0824,
"step": 942
},
{
"epoch": 0.42902638762511375,
"grad_norm": 1.4223453649486908,
"learning_rate": 4.909907151739634e-06,
"loss": 0.0747,
"step": 943
},
{
"epoch": 0.4294813466787989,
"grad_norm": 2.2121264200483393,
"learning_rate": 4.909716930576489e-06,
"loss": 0.1463,
"step": 944
},
{
"epoch": 0.4299363057324841,
"grad_norm": 1.5012792406542972,
"learning_rate": 4.909526512502158e-06,
"loss": 0.1241,
"step": 945
},
{
"epoch": 0.43039126478616924,
"grad_norm": 1.6714102508168673,
"learning_rate": 4.9093358975322025e-06,
"loss": 0.1045,
"step": 946
},
{
"epoch": 0.43084622383985444,
"grad_norm": 1.5613346147429912,
"learning_rate": 4.909145085682198e-06,
"loss": 0.1105,
"step": 947
},
{
"epoch": 0.4313011828935396,
"grad_norm": 1.4864622392832871,
"learning_rate": 4.908954076967737e-06,
"loss": 0.0831,
"step": 948
},
{
"epoch": 0.4317561419472247,
"grad_norm": 1.5530391149425158,
"learning_rate": 4.908762871404427e-06,
"loss": 0.1345,
"step": 949
},
{
"epoch": 0.4322111010009099,
"grad_norm": 1.5444429676980205,
"learning_rate": 4.908571469007893e-06,
"loss": 0.0886,
"step": 950
},
{
"epoch": 0.43266606005459507,
"grad_norm": 1.8034818342216412,
"learning_rate": 4.908379869793776e-06,
"loss": 0.1046,
"step": 951
},
{
"epoch": 0.43312101910828027,
"grad_norm": 1.3153452614362922,
"learning_rate": 4.908188073777732e-06,
"loss": 0.0715,
"step": 952
},
{
"epoch": 0.4335759781619654,
"grad_norm": 2.0825682650521857,
"learning_rate": 4.9079960809754334e-06,
"loss": 0.135,
"step": 953
},
{
"epoch": 0.4340309372156506,
"grad_norm": 1.3431541090651076,
"learning_rate": 4.90780389140257e-06,
"loss": 0.0812,
"step": 954
},
{
"epoch": 0.43448589626933576,
"grad_norm": 2.018134282960315,
"learning_rate": 4.907611505074846e-06,
"loss": 0.1001,
"step": 955
},
{
"epoch": 0.4349408553230209,
"grad_norm": 1.8270847906398506,
"learning_rate": 4.907418922007983e-06,
"loss": 0.1054,
"step": 956
},
{
"epoch": 0.4353958143767061,
"grad_norm": 1.5502670619333374,
"learning_rate": 4.907226142217717e-06,
"loss": 0.0832,
"step": 957
},
{
"epoch": 0.43585077343039125,
"grad_norm": 1.5099564094926066,
"learning_rate": 4.9070331657198015e-06,
"loss": 0.093,
"step": 958
},
{
"epoch": 0.43630573248407645,
"grad_norm": 1.6580816557213998,
"learning_rate": 4.906839992530006e-06,
"loss": 0.1133,
"step": 959
},
{
"epoch": 0.4367606915377616,
"grad_norm": 1.9468112171012433,
"learning_rate": 4.906646622664115e-06,
"loss": 0.1122,
"step": 960
},
{
"epoch": 0.4372156505914468,
"grad_norm": 1.3246750710377195,
"learning_rate": 4.906453056137931e-06,
"loss": 0.0572,
"step": 961
},
{
"epoch": 0.43767060964513194,
"grad_norm": 2.1577598041780846,
"learning_rate": 4.90625929296727e-06,
"loss": 0.1419,
"step": 962
},
{
"epoch": 0.4381255686988171,
"grad_norm": 1.3649728107391488,
"learning_rate": 4.9060653331679665e-06,
"loss": 0.1026,
"step": 963
},
{
"epoch": 0.4385805277525023,
"grad_norm": 1.7954750394301047,
"learning_rate": 4.90587117675587e-06,
"loss": 0.124,
"step": 964
},
{
"epoch": 0.4390354868061874,
"grad_norm": 1.6192897762023186,
"learning_rate": 4.905676823746846e-06,
"loss": 0.102,
"step": 965
},
{
"epoch": 0.4394904458598726,
"grad_norm": 1.183156466195084,
"learning_rate": 4.9054822741567745e-06,
"loss": 0.0741,
"step": 966
},
{
"epoch": 0.43994540491355777,
"grad_norm": 1.791057313794206,
"learning_rate": 4.905287528001555e-06,
"loss": 0.0986,
"step": 967
},
{
"epoch": 0.44040036396724297,
"grad_norm": 1.5587372758795195,
"learning_rate": 4.905092585297102e-06,
"loss": 0.0959,
"step": 968
},
{
"epoch": 0.4408553230209281,
"grad_norm": 1.9086814389692623,
"learning_rate": 4.904897446059344e-06,
"loss": 0.1124,
"step": 969
},
{
"epoch": 0.44131028207461326,
"grad_norm": 1.5518685718016205,
"learning_rate": 4.9047021103042255e-06,
"loss": 0.0802,
"step": 970
},
{
"epoch": 0.44176524112829846,
"grad_norm": 1.5626634869227398,
"learning_rate": 4.904506578047712e-06,
"loss": 0.0966,
"step": 971
},
{
"epoch": 0.4422202001819836,
"grad_norm": 1.6777151282946248,
"learning_rate": 4.9043108493057785e-06,
"loss": 0.0946,
"step": 972
},
{
"epoch": 0.4426751592356688,
"grad_norm": 1.3918546303467518,
"learning_rate": 4.904114924094421e-06,
"loss": 0.0776,
"step": 973
},
{
"epoch": 0.44313011828935395,
"grad_norm": 1.7054781101293177,
"learning_rate": 4.903918802429648e-06,
"loss": 0.1076,
"step": 974
},
{
"epoch": 0.44358507734303915,
"grad_norm": 0.9435161970580179,
"learning_rate": 4.9037224843274875e-06,
"loss": 0.055,
"step": 975
},
{
"epoch": 0.4440400363967243,
"grad_norm": 1.8279732096534727,
"learning_rate": 4.903525969803979e-06,
"loss": 0.144,
"step": 976
},
{
"epoch": 0.44449499545040944,
"grad_norm": 1.5827975534285916,
"learning_rate": 4.903329258875184e-06,
"loss": 0.0876,
"step": 977
},
{
"epoch": 0.44494995450409464,
"grad_norm": 1.5817514212508765,
"learning_rate": 4.903132351557175e-06,
"loss": 0.1003,
"step": 978
},
{
"epoch": 0.4454049135577798,
"grad_norm": 1.55794858043461,
"learning_rate": 4.902935247866043e-06,
"loss": 0.0901,
"step": 979
},
{
"epoch": 0.445859872611465,
"grad_norm": 1.7648097170403771,
"learning_rate": 4.9027379478178935e-06,
"loss": 0.1117,
"step": 980
},
{
"epoch": 0.4463148316651501,
"grad_norm": 1.4493752053158233,
"learning_rate": 4.90254045142885e-06,
"loss": 0.0824,
"step": 981
},
{
"epoch": 0.4467697907188353,
"grad_norm": 1.4618354488172722,
"learning_rate": 4.90234275871505e-06,
"loss": 0.08,
"step": 982
},
{
"epoch": 0.44722474977252047,
"grad_norm": 2.314057245131694,
"learning_rate": 4.9021448696926486e-06,
"loss": 0.1437,
"step": 983
},
{
"epoch": 0.44767970882620567,
"grad_norm": 1.2365214796695643,
"learning_rate": 4.901946784377816e-06,
"loss": 0.0955,
"step": 984
},
{
"epoch": 0.4481346678798908,
"grad_norm": 1.2633152164234291,
"learning_rate": 4.90174850278674e-06,
"loss": 0.0803,
"step": 985
},
{
"epoch": 0.44858962693357596,
"grad_norm": 1.5083171008818446,
"learning_rate": 4.901550024935623e-06,
"loss": 0.0942,
"step": 986
},
{
"epoch": 0.44904458598726116,
"grad_norm": 1.1583463791947812,
"learning_rate": 4.901351350840683e-06,
"loss": 0.0786,
"step": 987
},
{
"epoch": 0.4494995450409463,
"grad_norm": 1.343367085202188,
"learning_rate": 4.901152480518155e-06,
"loss": 0.0724,
"step": 988
},
{
"epoch": 0.4499545040946315,
"grad_norm": 1.1159650914918346,
"learning_rate": 4.900953413984289e-06,
"loss": 0.0681,
"step": 989
},
{
"epoch": 0.45040946314831665,
"grad_norm": 2.0950998044271025,
"learning_rate": 4.900754151255353e-06,
"loss": 0.1541,
"step": 990
},
{
"epoch": 0.45086442220200185,
"grad_norm": 1.4260341278646986,
"learning_rate": 4.9005546923476305e-06,
"loss": 0.0707,
"step": 991
},
{
"epoch": 0.451319381255687,
"grad_norm": 1.6502415030386688,
"learning_rate": 4.9003550372774185e-06,
"loss": 0.1111,
"step": 992
},
{
"epoch": 0.45177434030937214,
"grad_norm": 1.280806174818392,
"learning_rate": 4.900155186061033e-06,
"loss": 0.0789,
"step": 993
},
{
"epoch": 0.45222929936305734,
"grad_norm": 1.9745186799391785,
"learning_rate": 4.8999551387148045e-06,
"loss": 0.1125,
"step": 994
},
{
"epoch": 0.4526842584167425,
"grad_norm": 1.2542781615680096,
"learning_rate": 4.89975489525508e-06,
"loss": 0.0814,
"step": 995
},
{
"epoch": 0.4531392174704277,
"grad_norm": 1.5218729573521388,
"learning_rate": 4.899554455698223e-06,
"loss": 0.0849,
"step": 996
},
{
"epoch": 0.4535941765241128,
"grad_norm": 1.4911465655176248,
"learning_rate": 4.899353820060612e-06,
"loss": 0.0887,
"step": 997
},
{
"epoch": 0.454049135577798,
"grad_norm": 1.8552177664529743,
"learning_rate": 4.899152988358643e-06,
"loss": 0.1153,
"step": 998
},
{
"epoch": 0.45450409463148317,
"grad_norm": 1.3462289694693903,
"learning_rate": 4.898951960608725e-06,
"loss": 0.0768,
"step": 999
},
{
"epoch": 0.4549590536851683,
"grad_norm": 1.5105165626051191,
"learning_rate": 4.8987507368272865e-06,
"loss": 0.0916,
"step": 1000
},
{
"epoch": 0.4554140127388535,
"grad_norm": 1.7874012401425645,
"learning_rate": 4.898549317030772e-06,
"loss": 0.1228,
"step": 1001
},
{
"epoch": 0.45586897179253866,
"grad_norm": 1.8678564128703685,
"learning_rate": 4.898347701235637e-06,
"loss": 0.1226,
"step": 1002
},
{
"epoch": 0.45632393084622386,
"grad_norm": 1.9367180322034927,
"learning_rate": 4.89814588945836e-06,
"loss": 0.1239,
"step": 1003
},
{
"epoch": 0.456778889899909,
"grad_norm": 1.8462049373063074,
"learning_rate": 4.89794388171543e-06,
"loss": 0.1106,
"step": 1004
},
{
"epoch": 0.4572338489535942,
"grad_norm": 1.7977459529642075,
"learning_rate": 4.897741678023356e-06,
"loss": 0.1137,
"step": 1005
},
{
"epoch": 0.45768880800727935,
"grad_norm": 1.4317415496884898,
"learning_rate": 4.897539278398659e-06,
"loss": 0.0835,
"step": 1006
},
{
"epoch": 0.4581437670609645,
"grad_norm": 1.947224769167489,
"learning_rate": 4.8973366828578804e-06,
"loss": 0.1087,
"step": 1007
},
{
"epoch": 0.4585987261146497,
"grad_norm": 1.6840082807319827,
"learning_rate": 4.897133891417574e-06,
"loss": 0.1004,
"step": 1008
},
{
"epoch": 0.45905368516833484,
"grad_norm": 1.6722996299672828,
"learning_rate": 4.896930904094311e-06,
"loss": 0.0869,
"step": 1009
},
{
"epoch": 0.45950864422202004,
"grad_norm": 2.2431321251776986,
"learning_rate": 4.896727720904679e-06,
"loss": 0.121,
"step": 1010
},
{
"epoch": 0.4599636032757052,
"grad_norm": 1.2761704386307018,
"learning_rate": 4.896524341865282e-06,
"loss": 0.0736,
"step": 1011
},
{
"epoch": 0.4604185623293904,
"grad_norm": 1.6413390038739506,
"learning_rate": 4.896320766992737e-06,
"loss": 0.1286,
"step": 1012
},
{
"epoch": 0.4608735213830755,
"grad_norm": 1.5251335582402008,
"learning_rate": 4.896116996303682e-06,
"loss": 0.0989,
"step": 1013
},
{
"epoch": 0.46132848043676067,
"grad_norm": 1.8038369878473837,
"learning_rate": 4.895913029814766e-06,
"loss": 0.097,
"step": 1014
},
{
"epoch": 0.46178343949044587,
"grad_norm": 2.012861641550116,
"learning_rate": 4.895708867542658e-06,
"loss": 0.1111,
"step": 1015
},
{
"epoch": 0.462238398544131,
"grad_norm": 1.7366035889417508,
"learning_rate": 4.895504509504039e-06,
"loss": 0.1029,
"step": 1016
},
{
"epoch": 0.4626933575978162,
"grad_norm": 1.3763665767496873,
"learning_rate": 4.89529995571561e-06,
"loss": 0.0938,
"step": 1017
},
{
"epoch": 0.46314831665150136,
"grad_norm": 1.6906151679744952,
"learning_rate": 4.895095206194086e-06,
"loss": 0.1085,
"step": 1018
},
{
"epoch": 0.46360327570518656,
"grad_norm": 1.5053749521419235,
"learning_rate": 4.894890260956198e-06,
"loss": 0.0884,
"step": 1019
},
{
"epoch": 0.4640582347588717,
"grad_norm": 1.5334372638839222,
"learning_rate": 4.8946851200186925e-06,
"loss": 0.1015,
"step": 1020
},
{
"epoch": 0.46451319381255685,
"grad_norm": 1.576638091265577,
"learning_rate": 4.894479783398334e-06,
"loss": 0.0903,
"step": 1021
},
{
"epoch": 0.46496815286624205,
"grad_norm": 1.7368682352331435,
"learning_rate": 4.8942742511119004e-06,
"loss": 0.1029,
"step": 1022
},
{
"epoch": 0.4654231119199272,
"grad_norm": 3.9669130222003455,
"learning_rate": 4.894068523176187e-06,
"loss": 0.2383,
"step": 1023
},
{
"epoch": 0.4658780709736124,
"grad_norm": 1.5974114766744798,
"learning_rate": 4.8938625996080056e-06,
"loss": 0.1116,
"step": 1024
},
{
"epoch": 0.46633303002729753,
"grad_norm": 1.1252846797063132,
"learning_rate": 4.893656480424184e-06,
"loss": 0.0673,
"step": 1025
},
{
"epoch": 0.46678798908098273,
"grad_norm": 1.5329254322284862,
"learning_rate": 4.893450165641564e-06,
"loss": 0.1066,
"step": 1026
},
{
"epoch": 0.4672429481346679,
"grad_norm": 1.3116647286111784,
"learning_rate": 4.893243655277005e-06,
"loss": 0.086,
"step": 1027
},
{
"epoch": 0.467697907188353,
"grad_norm": 1.5621452726926597,
"learning_rate": 4.893036949347383e-06,
"loss": 0.0937,
"step": 1028
},
{
"epoch": 0.4681528662420382,
"grad_norm": 1.44299341979305,
"learning_rate": 4.892830047869588e-06,
"loss": 0.0922,
"step": 1029
},
{
"epoch": 0.46860782529572337,
"grad_norm": 1.2004173985623205,
"learning_rate": 4.892622950860527e-06,
"loss": 0.0545,
"step": 1030
},
{
"epoch": 0.46906278434940857,
"grad_norm": 1.2933675353670258,
"learning_rate": 4.892415658337123e-06,
"loss": 0.0938,
"step": 1031
},
{
"epoch": 0.4695177434030937,
"grad_norm": 1.3899639516557423,
"learning_rate": 4.892208170316317e-06,
"loss": 0.0807,
"step": 1032
},
{
"epoch": 0.4699727024567789,
"grad_norm": 1.2103198454795117,
"learning_rate": 4.892000486815062e-06,
"loss": 0.0724,
"step": 1033
},
{
"epoch": 0.47042766151046406,
"grad_norm": 1.4625912187815495,
"learning_rate": 4.891792607850328e-06,
"loss": 0.0944,
"step": 1034
},
{
"epoch": 0.4708826205641492,
"grad_norm": 2.3778377956475074,
"learning_rate": 4.891584533439104e-06,
"loss": 0.1301,
"step": 1035
},
{
"epoch": 0.4713375796178344,
"grad_norm": 1.6240877825800288,
"learning_rate": 4.891376263598393e-06,
"loss": 0.1056,
"step": 1036
},
{
"epoch": 0.47179253867151955,
"grad_norm": 1.377205820937822,
"learning_rate": 4.891167798345213e-06,
"loss": 0.0879,
"step": 1037
},
{
"epoch": 0.47224749772520475,
"grad_norm": 1.918358313853146,
"learning_rate": 4.890959137696598e-06,
"loss": 0.1218,
"step": 1038
},
{
"epoch": 0.4727024567788899,
"grad_norm": 1.9802948601827106,
"learning_rate": 4.890750281669601e-06,
"loss": 0.0966,
"step": 1039
},
{
"epoch": 0.4731574158325751,
"grad_norm": 1.209426799273833,
"learning_rate": 4.890541230281287e-06,
"loss": 0.0687,
"step": 1040
},
{
"epoch": 0.47361237488626023,
"grad_norm": 1.714672711362897,
"learning_rate": 4.8903319835487385e-06,
"loss": 0.1119,
"step": 1041
},
{
"epoch": 0.4740673339399454,
"grad_norm": 1.8426958086935912,
"learning_rate": 4.890122541489056e-06,
"loss": 0.1071,
"step": 1042
},
{
"epoch": 0.4745222929936306,
"grad_norm": 1.5412332450392434,
"learning_rate": 4.889912904119353e-06,
"loss": 0.1194,
"step": 1043
},
{
"epoch": 0.4749772520473157,
"grad_norm": 1.5900743055736573,
"learning_rate": 4.88970307145676e-06,
"loss": 0.0905,
"step": 1044
},
{
"epoch": 0.4754322111010009,
"grad_norm": 1.299438309320783,
"learning_rate": 4.889493043518423e-06,
"loss": 0.0782,
"step": 1045
},
{
"epoch": 0.47588717015468607,
"grad_norm": 1.2775434133946648,
"learning_rate": 4.889282820321506e-06,
"loss": 0.067,
"step": 1046
},
{
"epoch": 0.47634212920837127,
"grad_norm": 2.0181187729173313,
"learning_rate": 4.889072401883187e-06,
"loss": 0.1039,
"step": 1047
},
{
"epoch": 0.4767970882620564,
"grad_norm": 1.3673144633984753,
"learning_rate": 4.88886178822066e-06,
"loss": 0.0871,
"step": 1048
},
{
"epoch": 0.47725204731574156,
"grad_norm": 1.5512598399498212,
"learning_rate": 4.888650979351136e-06,
"loss": 0.0936,
"step": 1049
},
{
"epoch": 0.47770700636942676,
"grad_norm": 1.8862924775266208,
"learning_rate": 4.888439975291841e-06,
"loss": 0.149,
"step": 1050
},
{
"epoch": 0.4781619654231119,
"grad_norm": 1.527860807788029,
"learning_rate": 4.888228776060017e-06,
"loss": 0.0981,
"step": 1051
},
{
"epoch": 0.4786169244767971,
"grad_norm": 1.635801739367282,
"learning_rate": 4.888017381672923e-06,
"loss": 0.1004,
"step": 1052
},
{
"epoch": 0.47907188353048225,
"grad_norm": 1.496869794404093,
"learning_rate": 4.887805792147832e-06,
"loss": 0.0921,
"step": 1053
},
{
"epoch": 0.47952684258416745,
"grad_norm": 1.729233289880027,
"learning_rate": 4.887594007502036e-06,
"loss": 0.089,
"step": 1054
},
{
"epoch": 0.4799818016378526,
"grad_norm": 1.9599768924005974,
"learning_rate": 4.887382027752838e-06,
"loss": 0.1029,
"step": 1055
},
{
"epoch": 0.48043676069153773,
"grad_norm": 1.6584360062505734,
"learning_rate": 4.8871698529175636e-06,
"loss": 0.1173,
"step": 1056
},
{
"epoch": 0.48089171974522293,
"grad_norm": 1.631421092772313,
"learning_rate": 4.886957483013549e-06,
"loss": 0.1231,
"step": 1057
},
{
"epoch": 0.4813466787989081,
"grad_norm": 2.3766899063373996,
"learning_rate": 4.886744918058149e-06,
"loss": 0.13,
"step": 1058
},
{
"epoch": 0.4818016378525933,
"grad_norm": 1.7346716794855597,
"learning_rate": 4.886532158068732e-06,
"loss": 0.0938,
"step": 1059
},
{
"epoch": 0.4822565969062784,
"grad_norm": 1.5214305907929453,
"learning_rate": 4.886319203062683e-06,
"loss": 0.0761,
"step": 1060
},
{
"epoch": 0.4827115559599636,
"grad_norm": 1.6073102647133055,
"learning_rate": 4.886106053057408e-06,
"loss": 0.0818,
"step": 1061
},
{
"epoch": 0.48316651501364877,
"grad_norm": 1.803380712114119,
"learning_rate": 4.88589270807032e-06,
"loss": 0.1231,
"step": 1062
},
{
"epoch": 0.48362147406733397,
"grad_norm": 1.5275199982317587,
"learning_rate": 4.885679168118855e-06,
"loss": 0.1105,
"step": 1063
},
{
"epoch": 0.4840764331210191,
"grad_norm": 1.8472965185652206,
"learning_rate": 4.8854654332204635e-06,
"loss": 0.1324,
"step": 1064
},
{
"epoch": 0.48453139217470426,
"grad_norm": 1.41701925154465,
"learning_rate": 4.885251503392607e-06,
"loss": 0.0767,
"step": 1065
},
{
"epoch": 0.48498635122838946,
"grad_norm": 2.00437974621472,
"learning_rate": 4.885037378652771e-06,
"loss": 0.1336,
"step": 1066
},
{
"epoch": 0.4854413102820746,
"grad_norm": 1.4895968911800157,
"learning_rate": 4.884823059018451e-06,
"loss": 0.0726,
"step": 1067
},
{
"epoch": 0.4858962693357598,
"grad_norm": 1.5673178312119351,
"learning_rate": 4.88460854450716e-06,
"loss": 0.0843,
"step": 1068
},
{
"epoch": 0.48635122838944495,
"grad_norm": 1.1450505304026162,
"learning_rate": 4.884393835136427e-06,
"loss": 0.073,
"step": 1069
},
{
"epoch": 0.48680618744313015,
"grad_norm": 1.5223195045028948,
"learning_rate": 4.884178930923799e-06,
"loss": 0.0823,
"step": 1070
},
{
"epoch": 0.4872611464968153,
"grad_norm": 1.912651615279676,
"learning_rate": 4.883963831886834e-06,
"loss": 0.0989,
"step": 1071
},
{
"epoch": 0.48771610555050043,
"grad_norm": 1.6904540179044927,
"learning_rate": 4.8837485380431115e-06,
"loss": 0.0981,
"step": 1072
},
{
"epoch": 0.48817106460418563,
"grad_norm": 1.4559744514600277,
"learning_rate": 4.883533049410223e-06,
"loss": 0.0874,
"step": 1073
},
{
"epoch": 0.4886260236578708,
"grad_norm": 1.9041018278788933,
"learning_rate": 4.8833173660057785e-06,
"loss": 0.1065,
"step": 1074
},
{
"epoch": 0.489080982711556,
"grad_norm": 1.582657768337463,
"learning_rate": 4.8831014878474004e-06,
"loss": 0.0993,
"step": 1075
},
{
"epoch": 0.4895359417652411,
"grad_norm": 1.487895945323618,
"learning_rate": 4.882885414952732e-06,
"loss": 0.0887,
"step": 1076
},
{
"epoch": 0.4899909008189263,
"grad_norm": 1.1105199391014717,
"learning_rate": 4.882669147339428e-06,
"loss": 0.0521,
"step": 1077
},
{
"epoch": 0.49044585987261147,
"grad_norm": 1.3448385373486804,
"learning_rate": 4.882452685025161e-06,
"loss": 0.0606,
"step": 1078
},
{
"epoch": 0.4909008189262966,
"grad_norm": 1.9169790386878416,
"learning_rate": 4.88223602802762e-06,
"loss": 0.1103,
"step": 1079
},
{
"epoch": 0.4913557779799818,
"grad_norm": 1.4350936971881065,
"learning_rate": 4.882019176364509e-06,
"loss": 0.1052,
"step": 1080
},
{
"epoch": 0.49181073703366696,
"grad_norm": 1.9005260167330429,
"learning_rate": 4.881802130053548e-06,
"loss": 0.1217,
"step": 1081
},
{
"epoch": 0.49226569608735216,
"grad_norm": 1.4814940279383466,
"learning_rate": 4.881584889112473e-06,
"loss": 0.079,
"step": 1082
},
{
"epoch": 0.4927206551410373,
"grad_norm": 1.7134074599855604,
"learning_rate": 4.881367453559036e-06,
"loss": 0.1025,
"step": 1083
},
{
"epoch": 0.4931756141947225,
"grad_norm": 1.2847311247280295,
"learning_rate": 4.881149823411005e-06,
"loss": 0.0587,
"step": 1084
},
{
"epoch": 0.49363057324840764,
"grad_norm": 1.196984822353409,
"learning_rate": 4.880931998686162e-06,
"loss": 0.0779,
"step": 1085
},
{
"epoch": 0.4940855323020928,
"grad_norm": 2.247552936990941,
"learning_rate": 4.880713979402311e-06,
"loss": 0.1534,
"step": 1086
},
{
"epoch": 0.494540491355778,
"grad_norm": 2.5523444538687645,
"learning_rate": 4.880495765577263e-06,
"loss": 0.146,
"step": 1087
},
{
"epoch": 0.49499545040946313,
"grad_norm": 1.7690099480339412,
"learning_rate": 4.880277357228852e-06,
"loss": 0.084,
"step": 1088
},
{
"epoch": 0.49545040946314833,
"grad_norm": 1.2117156565437108,
"learning_rate": 4.880058754374923e-06,
"loss": 0.0833,
"step": 1089
},
{
"epoch": 0.4959053685168335,
"grad_norm": 1.5484757487864966,
"learning_rate": 4.879839957033343e-06,
"loss": 0.0938,
"step": 1090
},
{
"epoch": 0.4963603275705187,
"grad_norm": 1.5534223234923523,
"learning_rate": 4.879620965221987e-06,
"loss": 0.09,
"step": 1091
},
{
"epoch": 0.4968152866242038,
"grad_norm": 1.3405465803260945,
"learning_rate": 4.879401778958755e-06,
"loss": 0.0784,
"step": 1092
},
{
"epoch": 0.49727024567788897,
"grad_norm": 1.3343510524547628,
"learning_rate": 4.8791823982615525e-06,
"loss": 0.064,
"step": 1093
},
{
"epoch": 0.49772520473157417,
"grad_norm": 1.2315640234775116,
"learning_rate": 4.878962823148308e-06,
"loss": 0.067,
"step": 1094
},
{
"epoch": 0.4981801637852593,
"grad_norm": 1.654273388728327,
"learning_rate": 4.878743053636968e-06,
"loss": 0.0964,
"step": 1095
},
{
"epoch": 0.4986351228389445,
"grad_norm": 1.3344367681027707,
"learning_rate": 4.878523089745485e-06,
"loss": 0.0865,
"step": 1096
},
{
"epoch": 0.49909008189262966,
"grad_norm": 1.0737534169537484,
"learning_rate": 4.878302931491837e-06,
"loss": 0.0722,
"step": 1097
},
{
"epoch": 0.49954504094631486,
"grad_norm": 1.2217058614506033,
"learning_rate": 4.8780825788940145e-06,
"loss": 0.0531,
"step": 1098
},
{
"epoch": 0.5,
"grad_norm": 1.765512273684173,
"learning_rate": 4.877862031970023e-06,
"loss": 0.1016,
"step": 1099
},
{
"epoch": 0.5004549590536852,
"grad_norm": 2.1360497116346444,
"learning_rate": 4.8776412907378845e-06,
"loss": 0.1095,
"step": 1100
},
{
"epoch": 0.5009099181073703,
"grad_norm": 1.5928570797543171,
"learning_rate": 4.877420355215637e-06,
"loss": 0.0909,
"step": 1101
},
{
"epoch": 0.5013648771610555,
"grad_norm": 1.9221830556747463,
"learning_rate": 4.877199225421334e-06,
"loss": 0.123,
"step": 1102
},
{
"epoch": 0.5018198362147407,
"grad_norm": 1.967973587212139,
"learning_rate": 4.8769779013730454e-06,
"loss": 0.1535,
"step": 1103
},
{
"epoch": 0.5022747952684259,
"grad_norm": 2.02512821365078,
"learning_rate": 4.876756383088858e-06,
"loss": 0.1173,
"step": 1104
},
{
"epoch": 0.502729754322111,
"grad_norm": 1.3904167109659709,
"learning_rate": 4.876534670586872e-06,
"loss": 0.0839,
"step": 1105
},
{
"epoch": 0.5031847133757962,
"grad_norm": 1.4435165077122623,
"learning_rate": 4.8763127638852045e-06,
"loss": 0.0924,
"step": 1106
},
{
"epoch": 0.5036396724294814,
"grad_norm": 1.7029448773247835,
"learning_rate": 4.87609066300199e-06,
"loss": 0.1076,
"step": 1107
},
{
"epoch": 0.5040946314831665,
"grad_norm": 1.750067106251082,
"learning_rate": 4.875868367955376e-06,
"loss": 0.1077,
"step": 1108
},
{
"epoch": 0.5045495905368517,
"grad_norm": 1.9748651822243342,
"learning_rate": 4.87564587876353e-06,
"loss": 0.1294,
"step": 1109
},
{
"epoch": 0.5050045495905369,
"grad_norm": 1.7656971074259822,
"learning_rate": 4.87542319544463e-06,
"loss": 0.0974,
"step": 1110
},
{
"epoch": 0.5054595086442221,
"grad_norm": 1.4817675230155858,
"learning_rate": 4.875200318016873e-06,
"loss": 0.0959,
"step": 1111
},
{
"epoch": 0.5059144676979072,
"grad_norm": 1.603234528593141,
"learning_rate": 4.8749772464984736e-06,
"loss": 0.115,
"step": 1112
},
{
"epoch": 0.5063694267515924,
"grad_norm": 1.7632465098077008,
"learning_rate": 4.874753980907658e-06,
"loss": 0.1224,
"step": 1113
},
{
"epoch": 0.5068243858052776,
"grad_norm": 1.409315497870279,
"learning_rate": 4.8745305212626714e-06,
"loss": 0.0886,
"step": 1114
},
{
"epoch": 0.5072793448589626,
"grad_norm": 1.3116197456740595,
"learning_rate": 4.874306867581775e-06,
"loss": 0.0853,
"step": 1115
},
{
"epoch": 0.5077343039126478,
"grad_norm": 1.1746077003548339,
"learning_rate": 4.874083019883242e-06,
"loss": 0.0543,
"step": 1116
},
{
"epoch": 0.508189262966333,
"grad_norm": 1.941012957682845,
"learning_rate": 4.873858978185367e-06,
"loss": 0.1137,
"step": 1117
},
{
"epoch": 0.5086442220200182,
"grad_norm": 2.32531280724128,
"learning_rate": 4.8736347425064565e-06,
"loss": 0.1627,
"step": 1118
},
{
"epoch": 0.5090991810737033,
"grad_norm": 1.638539845007192,
"learning_rate": 4.873410312864833e-06,
"loss": 0.0988,
"step": 1119
},
{
"epoch": 0.5095541401273885,
"grad_norm": 1.5695637896435937,
"learning_rate": 4.8731856892788384e-06,
"loss": 0.0918,
"step": 1120
},
{
"epoch": 0.5100090991810737,
"grad_norm": 2.011157500272583,
"learning_rate": 4.872960871766826e-06,
"loss": 0.1316,
"step": 1121
},
{
"epoch": 0.5104640582347588,
"grad_norm": 1.3312452781498474,
"learning_rate": 4.8727358603471675e-06,
"loss": 0.1007,
"step": 1122
},
{
"epoch": 0.510919017288444,
"grad_norm": 1.9359844901160286,
"learning_rate": 4.872510655038249e-06,
"loss": 0.1026,
"step": 1123
},
{
"epoch": 0.5113739763421292,
"grad_norm": 1.2898375591874278,
"learning_rate": 4.872285255858476e-06,
"loss": 0.0929,
"step": 1124
},
{
"epoch": 0.5118289353958144,
"grad_norm": 1.920657444015991,
"learning_rate": 4.872059662826263e-06,
"loss": 0.1129,
"step": 1125
},
{
"epoch": 0.5122838944494995,
"grad_norm": 1.4550346247477233,
"learning_rate": 4.8718338759600465e-06,
"loss": 0.0824,
"step": 1126
},
{
"epoch": 0.5127388535031847,
"grad_norm": 1.5791466307448474,
"learning_rate": 4.871607895278278e-06,
"loss": 0.1206,
"step": 1127
},
{
"epoch": 0.5131938125568699,
"grad_norm": 1.204733566103446,
"learning_rate": 4.871381720799421e-06,
"loss": 0.0665,
"step": 1128
},
{
"epoch": 0.513648771610555,
"grad_norm": 1.6684092224882034,
"learning_rate": 4.8711553525419595e-06,
"loss": 0.1075,
"step": 1129
},
{
"epoch": 0.5141037306642402,
"grad_norm": 1.4239501992031698,
"learning_rate": 4.87092879052439e-06,
"loss": 0.0957,
"step": 1130
},
{
"epoch": 0.5145586897179254,
"grad_norm": 1.0934030596754927,
"learning_rate": 4.8707020347652275e-06,
"loss": 0.0686,
"step": 1131
},
{
"epoch": 0.5150136487716106,
"grad_norm": 1.5870890463044125,
"learning_rate": 4.870475085283001e-06,
"loss": 0.1027,
"step": 1132
},
{
"epoch": 0.5154686078252957,
"grad_norm": 1.6559311395509346,
"learning_rate": 4.870247942096254e-06,
"loss": 0.1008,
"step": 1133
},
{
"epoch": 0.5159235668789809,
"grad_norm": 1.155174213270752,
"learning_rate": 4.870020605223551e-06,
"loss": 0.0592,
"step": 1134
},
{
"epoch": 0.5163785259326661,
"grad_norm": 1.6869955821352955,
"learning_rate": 4.869793074683466e-06,
"loss": 0.0913,
"step": 1135
},
{
"epoch": 0.5168334849863512,
"grad_norm": 2.19769614213437,
"learning_rate": 4.8695653504945925e-06,
"loss": 0.1237,
"step": 1136
},
{
"epoch": 0.5172884440400364,
"grad_norm": 2.393558826937421,
"learning_rate": 4.8693374326755405e-06,
"loss": 0.1401,
"step": 1137
},
{
"epoch": 0.5177434030937216,
"grad_norm": 1.3656006242910685,
"learning_rate": 4.869109321244932e-06,
"loss": 0.09,
"step": 1138
},
{
"epoch": 0.5181983621474068,
"grad_norm": 1.4542523027566732,
"learning_rate": 4.86888101622141e-06,
"loss": 0.0918,
"step": 1139
},
{
"epoch": 0.5186533212010919,
"grad_norm": 1.443069001120561,
"learning_rate": 4.868652517623629e-06,
"loss": 0.066,
"step": 1140
},
{
"epoch": 0.5191082802547771,
"grad_norm": 1.3192549477432447,
"learning_rate": 4.86842382547026e-06,
"loss": 0.07,
"step": 1141
},
{
"epoch": 0.5195632393084623,
"grad_norm": 1.4610522043176968,
"learning_rate": 4.868194939779992e-06,
"loss": 0.0603,
"step": 1142
},
{
"epoch": 0.5200181983621474,
"grad_norm": 1.3807495660521953,
"learning_rate": 4.867965860571529e-06,
"loss": 0.086,
"step": 1143
},
{
"epoch": 0.5204731574158326,
"grad_norm": 1.7439827425180354,
"learning_rate": 4.867736587863589e-06,
"loss": 0.1175,
"step": 1144
},
{
"epoch": 0.5209281164695178,
"grad_norm": 3.8341122094242586,
"learning_rate": 4.867507121674907e-06,
"loss": 0.1369,
"step": 1145
},
{
"epoch": 0.521383075523203,
"grad_norm": 1.6708528784620404,
"learning_rate": 4.867277462024235e-06,
"loss": 0.0788,
"step": 1146
},
{
"epoch": 0.521838034576888,
"grad_norm": 1.8971649447454588,
"learning_rate": 4.8670476089303395e-06,
"loss": 0.138,
"step": 1147
},
{
"epoch": 0.5222929936305732,
"grad_norm": 1.8468924709684824,
"learning_rate": 4.866817562412003e-06,
"loss": 0.1438,
"step": 1148
},
{
"epoch": 0.5227479526842584,
"grad_norm": 1.6403934984754582,
"learning_rate": 4.866587322488024e-06,
"loss": 0.1223,
"step": 1149
},
{
"epoch": 0.5232029117379435,
"grad_norm": 2.6178432136946843,
"learning_rate": 4.866356889177216e-06,
"loss": 0.1626,
"step": 1150
},
{
"epoch": 0.5236578707916287,
"grad_norm": 1.7176781702000803,
"learning_rate": 4.866126262498409e-06,
"loss": 0.1169,
"step": 1151
},
{
"epoch": 0.5241128298453139,
"grad_norm": 2.4788262927152256,
"learning_rate": 4.865895442470449e-06,
"loss": 0.1366,
"step": 1152
},
{
"epoch": 0.5245677888989991,
"grad_norm": 1.4130512402331137,
"learning_rate": 4.865664429112199e-06,
"loss": 0.075,
"step": 1153
},
{
"epoch": 0.5250227479526842,
"grad_norm": 2.161183666624184,
"learning_rate": 4.8654332224425345e-06,
"loss": 0.1219,
"step": 1154
},
{
"epoch": 0.5254777070063694,
"grad_norm": 1.7134676925151036,
"learning_rate": 4.865201822480349e-06,
"loss": 0.1068,
"step": 1155
},
{
"epoch": 0.5259326660600546,
"grad_norm": 1.2631225946147446,
"learning_rate": 4.864970229244552e-06,
"loss": 0.0732,
"step": 1156
},
{
"epoch": 0.5263876251137397,
"grad_norm": 1.151791721954015,
"learning_rate": 4.864738442754068e-06,
"loss": 0.0612,
"step": 1157
},
{
"epoch": 0.5268425841674249,
"grad_norm": 1.298125985364791,
"learning_rate": 4.864506463027837e-06,
"loss": 0.0841,
"step": 1158
},
{
"epoch": 0.5272975432211101,
"grad_norm": 1.828500217819582,
"learning_rate": 4.864274290084816e-06,
"loss": 0.1279,
"step": 1159
},
{
"epoch": 0.5277525022747953,
"grad_norm": 1.872568934497448,
"learning_rate": 4.864041923943978e-06,
"loss": 0.1041,
"step": 1160
},
{
"epoch": 0.5282074613284804,
"grad_norm": 1.59985877807279,
"learning_rate": 4.863809364624309e-06,
"loss": 0.0996,
"step": 1161
},
{
"epoch": 0.5286624203821656,
"grad_norm": 1.4920832769727852,
"learning_rate": 4.863576612144814e-06,
"loss": 0.1002,
"step": 1162
},
{
"epoch": 0.5291173794358508,
"grad_norm": 1.9606964487777765,
"learning_rate": 4.863343666524512e-06,
"loss": 0.1113,
"step": 1163
},
{
"epoch": 0.5295723384895359,
"grad_norm": 2.2204981872927774,
"learning_rate": 4.863110527782437e-06,
"loss": 0.1106,
"step": 1164
},
{
"epoch": 0.5300272975432211,
"grad_norm": 1.7885324238047555,
"learning_rate": 4.8628771959376435e-06,
"loss": 0.1085,
"step": 1165
},
{
"epoch": 0.5304822565969063,
"grad_norm": 1.7918603713541985,
"learning_rate": 4.862643671009195e-06,
"loss": 0.1007,
"step": 1166
},
{
"epoch": 0.5309372156505915,
"grad_norm": 1.0998167564155898,
"learning_rate": 4.862409953016175e-06,
"loss": 0.0968,
"step": 1167
},
{
"epoch": 0.5313921747042766,
"grad_norm": 1.853940722458201,
"learning_rate": 4.862176041977683e-06,
"loss": 0.1216,
"step": 1168
},
{
"epoch": 0.5318471337579618,
"grad_norm": 1.4646094216764547,
"learning_rate": 4.861941937912832e-06,
"loss": 0.1116,
"step": 1169
},
{
"epoch": 0.532302092811647,
"grad_norm": 1.2365450205781439,
"learning_rate": 4.861707640840752e-06,
"loss": 0.0819,
"step": 1170
},
{
"epoch": 0.5327570518653321,
"grad_norm": 1.6463867940760566,
"learning_rate": 4.861473150780589e-06,
"loss": 0.1094,
"step": 1171
},
{
"epoch": 0.5332120109190173,
"grad_norm": 1.686704498138834,
"learning_rate": 4.8612384677515054e-06,
"loss": 0.1071,
"step": 1172
},
{
"epoch": 0.5336669699727025,
"grad_norm": 1.2716060091758528,
"learning_rate": 4.861003591772677e-06,
"loss": 0.0788,
"step": 1173
},
{
"epoch": 0.5341219290263877,
"grad_norm": 1.596228030510201,
"learning_rate": 4.860768522863297e-06,
"loss": 0.0716,
"step": 1174
},
{
"epoch": 0.5345768880800728,
"grad_norm": 1.6508703177098787,
"learning_rate": 4.860533261042574e-06,
"loss": 0.0977,
"step": 1175
},
{
"epoch": 0.535031847133758,
"grad_norm": 1.3185419902691182,
"learning_rate": 4.8602978063297336e-06,
"loss": 0.1103,
"step": 1176
},
{
"epoch": 0.5354868061874432,
"grad_norm": 1.6903360885675578,
"learning_rate": 4.8600621587440155e-06,
"loss": 0.0933,
"step": 1177
},
{
"epoch": 0.5359417652411284,
"grad_norm": 1.5059509187961821,
"learning_rate": 4.859826318304676e-06,
"loss": 0.1093,
"step": 1178
},
{
"epoch": 0.5363967242948134,
"grad_norm": 1.156363062560368,
"learning_rate": 4.859590285030986e-06,
"loss": 0.091,
"step": 1179
},
{
"epoch": 0.5368516833484986,
"grad_norm": 1.4254896552320762,
"learning_rate": 4.859354058942234e-06,
"loss": 0.099,
"step": 1180
},
{
"epoch": 0.5373066424021838,
"grad_norm": 1.6756998416867424,
"learning_rate": 4.859117640057723e-06,
"loss": 0.1058,
"step": 1181
},
{
"epoch": 0.5377616014558689,
"grad_norm": 1.906068462189616,
"learning_rate": 4.858881028396773e-06,
"loss": 0.1344,
"step": 1182
},
{
"epoch": 0.5382165605095541,
"grad_norm": 1.6813817476503583,
"learning_rate": 4.8586442239787165e-06,
"loss": 0.0938,
"step": 1183
},
{
"epoch": 0.5386715195632393,
"grad_norm": 1.4947308906180774,
"learning_rate": 4.858407226822906e-06,
"loss": 0.1089,
"step": 1184
},
{
"epoch": 0.5391264786169245,
"grad_norm": 1.5326514903244322,
"learning_rate": 4.858170036948707e-06,
"loss": 0.0903,
"step": 1185
},
{
"epoch": 0.5395814376706096,
"grad_norm": 1.3397075921608799,
"learning_rate": 4.857932654375503e-06,
"loss": 0.079,
"step": 1186
},
{
"epoch": 0.5400363967242948,
"grad_norm": 2.3382844220202963,
"learning_rate": 4.857695079122691e-06,
"loss": 0.1606,
"step": 1187
},
{
"epoch": 0.54049135577798,
"grad_norm": 1.2780125171194971,
"learning_rate": 4.857457311209683e-06,
"loss": 0.0819,
"step": 1188
},
{
"epoch": 0.5409463148316651,
"grad_norm": 1.3621256537302653,
"learning_rate": 4.857219350655911e-06,
"loss": 0.0837,
"step": 1189
},
{
"epoch": 0.5414012738853503,
"grad_norm": 1.4753266540938175,
"learning_rate": 4.856981197480818e-06,
"loss": 0.092,
"step": 1190
},
{
"epoch": 0.5418562329390355,
"grad_norm": 1.1486583975675493,
"learning_rate": 4.856742851703866e-06,
"loss": 0.0695,
"step": 1191
},
{
"epoch": 0.5423111919927207,
"grad_norm": 1.6118421470322997,
"learning_rate": 4.856504313344531e-06,
"loss": 0.1306,
"step": 1192
},
{
"epoch": 0.5427661510464058,
"grad_norm": 1.654223645513978,
"learning_rate": 4.8562655824223055e-06,
"loss": 0.0868,
"step": 1193
},
{
"epoch": 0.543221110100091,
"grad_norm": 1.166432446622458,
"learning_rate": 4.856026658956697e-06,
"loss": 0.0592,
"step": 1194
},
{
"epoch": 0.5436760691537762,
"grad_norm": 1.2408147318232963,
"learning_rate": 4.8557875429672295e-06,
"loss": 0.0893,
"step": 1195
},
{
"epoch": 0.5441310282074613,
"grad_norm": 1.4658290533217708,
"learning_rate": 4.855548234473444e-06,
"loss": 0.1193,
"step": 1196
},
{
"epoch": 0.5445859872611465,
"grad_norm": 1.5813217399288642,
"learning_rate": 4.8553087334948935e-06,
"loss": 0.1027,
"step": 1197
},
{
"epoch": 0.5450409463148317,
"grad_norm": 1.346354212639339,
"learning_rate": 4.855069040051149e-06,
"loss": 0.0842,
"step": 1198
},
{
"epoch": 0.5454959053685169,
"grad_norm": 1.7976208225125645,
"learning_rate": 4.854829154161799e-06,
"loss": 0.1231,
"step": 1199
},
{
"epoch": 0.545950864422202,
"grad_norm": 1.468188785415714,
"learning_rate": 4.854589075846445e-06,
"loss": 0.0941,
"step": 1200
},
{
"epoch": 0.5464058234758872,
"grad_norm": 1.2900368220049758,
"learning_rate": 4.854348805124704e-06,
"loss": 0.0866,
"step": 1201
},
{
"epoch": 0.5468607825295724,
"grad_norm": 1.465762931238317,
"learning_rate": 4.85410834201621e-06,
"loss": 0.0917,
"step": 1202
},
{
"epoch": 0.5473157415832575,
"grad_norm": 2.030229358227215,
"learning_rate": 4.8538676865406155e-06,
"loss": 0.1367,
"step": 1203
},
{
"epoch": 0.5477707006369427,
"grad_norm": 1.3216014713960686,
"learning_rate": 4.853626838717582e-06,
"loss": 0.0744,
"step": 1204
},
{
"epoch": 0.5482256596906279,
"grad_norm": 1.34429128033589,
"learning_rate": 4.853385798566793e-06,
"loss": 0.072,
"step": 1205
},
{
"epoch": 0.5486806187443131,
"grad_norm": 1.3681907039168972,
"learning_rate": 4.8531445661079444e-06,
"loss": 0.0772,
"step": 1206
},
{
"epoch": 0.5491355777979982,
"grad_norm": 1.7634866119794534,
"learning_rate": 4.852903141360749e-06,
"loss": 0.1093,
"step": 1207
},
{
"epoch": 0.5495905368516834,
"grad_norm": 1.3755217621758322,
"learning_rate": 4.852661524344933e-06,
"loss": 0.0706,
"step": 1208
},
{
"epoch": 0.5500454959053686,
"grad_norm": 1.8792585200640362,
"learning_rate": 4.852419715080244e-06,
"loss": 0.1248,
"step": 1209
},
{
"epoch": 0.5505004549590536,
"grad_norm": 1.3604609211138492,
"learning_rate": 4.852177713586437e-06,
"loss": 0.0849,
"step": 1210
},
{
"epoch": 0.5509554140127388,
"grad_norm": 1.3077627182539715,
"learning_rate": 4.85193551988329e-06,
"loss": 0.0876,
"step": 1211
},
{
"epoch": 0.551410373066424,
"grad_norm": 1.5010970994642232,
"learning_rate": 4.851693133990594e-06,
"loss": 0.0887,
"step": 1212
},
{
"epoch": 0.5518653321201092,
"grad_norm": 1.2366263332853158,
"learning_rate": 4.851450555928155e-06,
"loss": 0.0677,
"step": 1213
},
{
"epoch": 0.5523202911737943,
"grad_norm": 1.9682815492889902,
"learning_rate": 4.851207785715797e-06,
"loss": 0.1605,
"step": 1214
},
{
"epoch": 0.5527752502274795,
"grad_norm": 1.4810464832161876,
"learning_rate": 4.850964823373355e-06,
"loss": 0.1194,
"step": 1215
},
{
"epoch": 0.5532302092811647,
"grad_norm": 1.269367325606048,
"learning_rate": 4.850721668920685e-06,
"loss": 0.0869,
"step": 1216
},
{
"epoch": 0.5536851683348498,
"grad_norm": 1.811102361348233,
"learning_rate": 4.850478322377657e-06,
"loss": 0.113,
"step": 1217
},
{
"epoch": 0.554140127388535,
"grad_norm": 1.8234155506944059,
"learning_rate": 4.8502347837641536e-06,
"loss": 0.1337,
"step": 1218
},
{
"epoch": 0.5545950864422202,
"grad_norm": 1.5374689189034605,
"learning_rate": 4.8499910531000776e-06,
"loss": 0.0923,
"step": 1219
},
{
"epoch": 0.5550500454959054,
"grad_norm": 2.1434119748623583,
"learning_rate": 4.849747130405346e-06,
"loss": 0.1165,
"step": 1220
},
{
"epoch": 0.5555050045495905,
"grad_norm": 1.5741068071079671,
"learning_rate": 4.849503015699889e-06,
"loss": 0.0833,
"step": 1221
},
{
"epoch": 0.5559599636032757,
"grad_norm": 1.4450089536449229,
"learning_rate": 4.849258709003657e-06,
"loss": 0.0874,
"step": 1222
},
{
"epoch": 0.5564149226569609,
"grad_norm": 2.0523390040501206,
"learning_rate": 4.849014210336612e-06,
"loss": 0.1311,
"step": 1223
},
{
"epoch": 0.556869881710646,
"grad_norm": 1.6272370459349303,
"learning_rate": 4.848769519718734e-06,
"loss": 0.1283,
"step": 1224
},
{
"epoch": 0.5573248407643312,
"grad_norm": 1.7795199436155464,
"learning_rate": 4.848524637170018e-06,
"loss": 0.1053,
"step": 1225
},
{
"epoch": 0.5577797998180164,
"grad_norm": 2.039787438198539,
"learning_rate": 4.848279562710474e-06,
"loss": 0.119,
"step": 1226
},
{
"epoch": 0.5582347588717016,
"grad_norm": 1.048713205847522,
"learning_rate": 4.848034296360129e-06,
"loss": 0.0613,
"step": 1227
},
{
"epoch": 0.5586897179253867,
"grad_norm": 1.2246704661323997,
"learning_rate": 4.847788838139025e-06,
"loss": 0.0907,
"step": 1228
},
{
"epoch": 0.5591446769790719,
"grad_norm": 1.4248227073394217,
"learning_rate": 4.847543188067219e-06,
"loss": 0.0831,
"step": 1229
},
{
"epoch": 0.5595996360327571,
"grad_norm": 1.6554531335771108,
"learning_rate": 4.847297346164786e-06,
"loss": 0.098,
"step": 1230
},
{
"epoch": 0.5600545950864422,
"grad_norm": 1.6618601198336995,
"learning_rate": 4.8470513124518134e-06,
"loss": 0.1067,
"step": 1231
},
{
"epoch": 0.5605095541401274,
"grad_norm": 1.910127735430222,
"learning_rate": 4.8468050869484075e-06,
"loss": 0.1153,
"step": 1232
},
{
"epoch": 0.5609645131938126,
"grad_norm": 1.662154262618556,
"learning_rate": 4.846558669674688e-06,
"loss": 0.0858,
"step": 1233
},
{
"epoch": 0.5614194722474978,
"grad_norm": 1.666011221920497,
"learning_rate": 4.8463120606507904e-06,
"loss": 0.087,
"step": 1234
},
{
"epoch": 0.5618744313011829,
"grad_norm": 1.8392638033651618,
"learning_rate": 4.846065259896867e-06,
"loss": 0.1007,
"step": 1235
},
{
"epoch": 0.5623293903548681,
"grad_norm": 1.823608778063299,
"learning_rate": 4.845818267433086e-06,
"loss": 0.1234,
"step": 1236
},
{
"epoch": 0.5627843494085533,
"grad_norm": 1.6001337547517656,
"learning_rate": 4.845571083279629e-06,
"loss": 0.0992,
"step": 1237
},
{
"epoch": 0.5632393084622384,
"grad_norm": 1.244896894294659,
"learning_rate": 4.845323707456696e-06,
"loss": 0.0911,
"step": 1238
},
{
"epoch": 0.5636942675159236,
"grad_norm": 1.6134676145738456,
"learning_rate": 4.845076139984502e-06,
"loss": 0.0988,
"step": 1239
},
{
"epoch": 0.5641492265696088,
"grad_norm": 1.817921705994322,
"learning_rate": 4.844828380883274e-06,
"loss": 0.1137,
"step": 1240
},
{
"epoch": 0.564604185623294,
"grad_norm": 1.223760267965902,
"learning_rate": 4.844580430173261e-06,
"loss": 0.0912,
"step": 1241
},
{
"epoch": 0.565059144676979,
"grad_norm": 1.0223923432784907,
"learning_rate": 4.8443322878747236e-06,
"loss": 0.0549,
"step": 1242
},
{
"epoch": 0.5655141037306642,
"grad_norm": 1.4179515952754742,
"learning_rate": 4.844083954007938e-06,
"loss": 0.0909,
"step": 1243
},
{
"epoch": 0.5659690627843494,
"grad_norm": 1.964821324684815,
"learning_rate": 4.843835428593198e-06,
"loss": 0.1331,
"step": 1244
},
{
"epoch": 0.5664240218380345,
"grad_norm": 1.8460290937807686,
"learning_rate": 4.84358671165081e-06,
"loss": 0.1355,
"step": 1245
},
{
"epoch": 0.5668789808917197,
"grad_norm": 1.9533421795112815,
"learning_rate": 4.843337803201102e-06,
"loss": 0.1493,
"step": 1246
},
{
"epoch": 0.5673339399454049,
"grad_norm": 1.7429301575956597,
"learning_rate": 4.8430887032644094e-06,
"loss": 0.1208,
"step": 1247
},
{
"epoch": 0.5677888989990901,
"grad_norm": 1.6048397609024965,
"learning_rate": 4.842839411861089e-06,
"loss": 0.1016,
"step": 1248
},
{
"epoch": 0.5682438580527752,
"grad_norm": 1.5611018277418034,
"learning_rate": 4.842589929011513e-06,
"loss": 0.0996,
"step": 1249
},
{
"epoch": 0.5686988171064604,
"grad_norm": 1.549763833499855,
"learning_rate": 4.8423402547360665e-06,
"loss": 0.1047,
"step": 1250
},
{
"epoch": 0.5691537761601456,
"grad_norm": 1.5794849405940026,
"learning_rate": 4.842090389055153e-06,
"loss": 0.0885,
"step": 1251
},
{
"epoch": 0.5696087352138307,
"grad_norm": 1.340948229500544,
"learning_rate": 4.841840331989189e-06,
"loss": 0.082,
"step": 1252
},
{
"epoch": 0.5700636942675159,
"grad_norm": 1.187480617941468,
"learning_rate": 4.841590083558608e-06,
"loss": 0.0757,
"step": 1253
},
{
"epoch": 0.5705186533212011,
"grad_norm": 1.6889387454247615,
"learning_rate": 4.841339643783861e-06,
"loss": 0.1007,
"step": 1254
},
{
"epoch": 0.5709736123748863,
"grad_norm": 1.8032486510427874,
"learning_rate": 4.841089012685412e-06,
"loss": 0.1387,
"step": 1255
},
{
"epoch": 0.5714285714285714,
"grad_norm": 1.518781686351209,
"learning_rate": 4.840838190283741e-06,
"loss": 0.1073,
"step": 1256
},
{
"epoch": 0.5718835304822566,
"grad_norm": 1.2622352263295604,
"learning_rate": 4.8405871765993435e-06,
"loss": 0.0611,
"step": 1257
},
{
"epoch": 0.5723384895359418,
"grad_norm": 1.3733958676153404,
"learning_rate": 4.840335971652732e-06,
"loss": 0.0806,
"step": 1258
},
{
"epoch": 0.5727934485896269,
"grad_norm": 1.414930922234482,
"learning_rate": 4.840084575464434e-06,
"loss": 0.0967,
"step": 1259
},
{
"epoch": 0.5732484076433121,
"grad_norm": 1.3132222404269749,
"learning_rate": 4.839832988054992e-06,
"loss": 0.0844,
"step": 1260
},
{
"epoch": 0.5737033666969973,
"grad_norm": 1.4304276264926878,
"learning_rate": 4.839581209444966e-06,
"loss": 0.08,
"step": 1261
},
{
"epoch": 0.5741583257506825,
"grad_norm": 1.6261976055252851,
"learning_rate": 4.839329239654927e-06,
"loss": 0.1086,
"step": 1262
},
{
"epoch": 0.5746132848043676,
"grad_norm": 1.4905660158866907,
"learning_rate": 4.839077078705468e-06,
"loss": 0.0758,
"step": 1263
},
{
"epoch": 0.5750682438580528,
"grad_norm": 1.6218355961437578,
"learning_rate": 4.838824726617194e-06,
"loss": 0.1066,
"step": 1264
},
{
"epoch": 0.575523202911738,
"grad_norm": 1.7405100413536567,
"learning_rate": 4.838572183410725e-06,
"loss": 0.1103,
"step": 1265
},
{
"epoch": 0.5759781619654231,
"grad_norm": 1.5825357430240847,
"learning_rate": 4.838319449106697e-06,
"loss": 0.1026,
"step": 1266
},
{
"epoch": 0.5764331210191083,
"grad_norm": 1.4234319951879078,
"learning_rate": 4.838066523725764e-06,
"loss": 0.0761,
"step": 1267
},
{
"epoch": 0.5768880800727935,
"grad_norm": 1.4883172887933762,
"learning_rate": 4.837813407288594e-06,
"loss": 0.0989,
"step": 1268
},
{
"epoch": 0.5773430391264787,
"grad_norm": 1.437934945090456,
"learning_rate": 4.837560099815869e-06,
"loss": 0.0874,
"step": 1269
},
{
"epoch": 0.5777979981801638,
"grad_norm": 1.6175863411283686,
"learning_rate": 4.837306601328289e-06,
"loss": 0.1074,
"step": 1270
},
{
"epoch": 0.578252957233849,
"grad_norm": 1.3546376195879695,
"learning_rate": 4.837052911846569e-06,
"loss": 0.099,
"step": 1271
},
{
"epoch": 0.5787079162875342,
"grad_norm": 1.615443707505004,
"learning_rate": 4.836799031391439e-06,
"loss": 0.1093,
"step": 1272
},
{
"epoch": 0.5791628753412192,
"grad_norm": 0.7225881399048506,
"learning_rate": 4.836544959983645e-06,
"loss": 0.0439,
"step": 1273
},
{
"epoch": 0.5796178343949044,
"grad_norm": 2.1011993101699926,
"learning_rate": 4.8362906976439485e-06,
"loss": 0.1277,
"step": 1274
},
{
"epoch": 0.5800727934485896,
"grad_norm": 2.000601957434587,
"learning_rate": 4.836036244393127e-06,
"loss": 0.1495,
"step": 1275
},
{
"epoch": 0.5805277525022748,
"grad_norm": 1.6950265520988297,
"learning_rate": 4.835781600251973e-06,
"loss": 0.0976,
"step": 1276
},
{
"epoch": 0.5809827115559599,
"grad_norm": 1.3727073330890776,
"learning_rate": 4.835526765241295e-06,
"loss": 0.0828,
"step": 1277
},
{
"epoch": 0.5814376706096451,
"grad_norm": 1.5570369931283408,
"learning_rate": 4.835271739381917e-06,
"loss": 0.1109,
"step": 1278
},
{
"epoch": 0.5818926296633303,
"grad_norm": 1.0713801990040446,
"learning_rate": 4.835016522694678e-06,
"loss": 0.0757,
"step": 1279
},
{
"epoch": 0.5823475887170154,
"grad_norm": 1.942364052088125,
"learning_rate": 4.834761115200434e-06,
"loss": 0.1642,
"step": 1280
},
{
"epoch": 0.5828025477707006,
"grad_norm": 1.7377055370855508,
"learning_rate": 4.834505516920055e-06,
"loss": 0.1187,
"step": 1281
},
{
"epoch": 0.5832575068243858,
"grad_norm": 1.6956294426437164,
"learning_rate": 4.834249727874428e-06,
"loss": 0.1051,
"step": 1282
},
{
"epoch": 0.583712465878071,
"grad_norm": 1.4102019730152917,
"learning_rate": 4.833993748084455e-06,
"loss": 0.0704,
"step": 1283
},
{
"epoch": 0.5841674249317561,
"grad_norm": 1.2666669426637933,
"learning_rate": 4.833737577571052e-06,
"loss": 0.072,
"step": 1284
},
{
"epoch": 0.5846223839854413,
"grad_norm": 1.451859405282776,
"learning_rate": 4.833481216355153e-06,
"loss": 0.0833,
"step": 1285
},
{
"epoch": 0.5850773430391265,
"grad_norm": 2.2038986828884846,
"learning_rate": 4.833224664457709e-06,
"loss": 0.1247,
"step": 1286
},
{
"epoch": 0.5855323020928116,
"grad_norm": 2.170783563626466,
"learning_rate": 4.83296792189968e-06,
"loss": 0.0991,
"step": 1287
},
{
"epoch": 0.5859872611464968,
"grad_norm": 1.8083451546198175,
"learning_rate": 4.83271098870205e-06,
"loss": 0.1067,
"step": 1288
},
{
"epoch": 0.586442220200182,
"grad_norm": 1.764270130263968,
"learning_rate": 4.832453864885811e-06,
"loss": 0.1181,
"step": 1289
},
{
"epoch": 0.5868971792538672,
"grad_norm": 1.3642172399097685,
"learning_rate": 4.832196550471976e-06,
"loss": 0.0844,
"step": 1290
},
{
"epoch": 0.5873521383075523,
"grad_norm": 1.4693026944828678,
"learning_rate": 4.831939045481571e-06,
"loss": 0.1103,
"step": 1291
},
{
"epoch": 0.5878070973612375,
"grad_norm": 1.370206188315079,
"learning_rate": 4.8316813499356375e-06,
"loss": 0.0914,
"step": 1292
},
{
"epoch": 0.5882620564149227,
"grad_norm": 1.3729593032500749,
"learning_rate": 4.831423463855235e-06,
"loss": 0.0719,
"step": 1293
},
{
"epoch": 0.5887170154686078,
"grad_norm": 1.4507728916778564,
"learning_rate": 4.8311653872614345e-06,
"loss": 0.086,
"step": 1294
},
{
"epoch": 0.589171974522293,
"grad_norm": 1.3070476542527247,
"learning_rate": 4.830907120175327e-06,
"loss": 0.077,
"step": 1295
},
{
"epoch": 0.5896269335759782,
"grad_norm": 2.4221015667648045,
"learning_rate": 4.830648662618015e-06,
"loss": 0.1596,
"step": 1296
},
{
"epoch": 0.5900818926296634,
"grad_norm": 1.103239260506278,
"learning_rate": 4.83039001461062e-06,
"loss": 0.0581,
"step": 1297
},
{
"epoch": 0.5905368516833485,
"grad_norm": 1.8298909001729466,
"learning_rate": 4.830131176174276e-06,
"loss": 0.1082,
"step": 1298
},
{
"epoch": 0.5909918107370337,
"grad_norm": 1.9201560834557836,
"learning_rate": 4.829872147330136e-06,
"loss": 0.1147,
"step": 1299
},
{
"epoch": 0.5914467697907189,
"grad_norm": 1.332697111328447,
"learning_rate": 4.829612928099366e-06,
"loss": 0.0906,
"step": 1300
},
{
"epoch": 0.591901728844404,
"grad_norm": 1.2286901595425765,
"learning_rate": 4.829353518503147e-06,
"loss": 0.0741,
"step": 1301
},
{
"epoch": 0.5923566878980892,
"grad_norm": 1.21692580464079,
"learning_rate": 4.829093918562678e-06,
"loss": 0.0657,
"step": 1302
},
{
"epoch": 0.5928116469517744,
"grad_norm": 1.776387560928479,
"learning_rate": 4.828834128299173e-06,
"loss": 0.122,
"step": 1303
},
{
"epoch": 0.5932666060054596,
"grad_norm": 2.2576443805946003,
"learning_rate": 4.828574147733859e-06,
"loss": 0.1395,
"step": 1304
},
{
"epoch": 0.5937215650591446,
"grad_norm": 1.6394742041639938,
"learning_rate": 4.828313976887982e-06,
"loss": 0.0886,
"step": 1305
},
{
"epoch": 0.5941765241128298,
"grad_norm": 1.729743531966717,
"learning_rate": 4.8280536157828e-06,
"loss": 0.1191,
"step": 1306
},
{
"epoch": 0.594631483166515,
"grad_norm": 1.4769755060752687,
"learning_rate": 4.827793064439592e-06,
"loss": 0.0965,
"step": 1307
},
{
"epoch": 0.5950864422202001,
"grad_norm": 1.4080505436977253,
"learning_rate": 4.8275323228796455e-06,
"loss": 0.0874,
"step": 1308
},
{
"epoch": 0.5955414012738853,
"grad_norm": 0.9123649868426729,
"learning_rate": 4.8272713911242695e-06,
"loss": 0.0402,
"step": 1309
},
{
"epoch": 0.5959963603275705,
"grad_norm": 1.1294729714943839,
"learning_rate": 4.827010269194785e-06,
"loss": 0.0631,
"step": 1310
},
{
"epoch": 0.5964513193812557,
"grad_norm": 1.9689287013341512,
"learning_rate": 4.8267489571125295e-06,
"loss": 0.1181,
"step": 1311
},
{
"epoch": 0.5969062784349408,
"grad_norm": 2.330161760291491,
"learning_rate": 4.826487454898857e-06,
"loss": 0.1448,
"step": 1312
},
{
"epoch": 0.597361237488626,
"grad_norm": 1.2992174727337271,
"learning_rate": 4.826225762575136e-06,
"loss": 0.0857,
"step": 1313
},
{
"epoch": 0.5978161965423112,
"grad_norm": 1.4247199067825551,
"learning_rate": 4.825963880162752e-06,
"loss": 0.0863,
"step": 1314
},
{
"epoch": 0.5982711555959963,
"grad_norm": 2.01495341050897,
"learning_rate": 4.825701807683102e-06,
"loss": 0.1072,
"step": 1315
},
{
"epoch": 0.5987261146496815,
"grad_norm": 1.7412264774469277,
"learning_rate": 4.825439545157603e-06,
"loss": 0.1092,
"step": 1316
},
{
"epoch": 0.5991810737033667,
"grad_norm": 1.4724909601046332,
"learning_rate": 4.825177092607687e-06,
"loss": 0.0999,
"step": 1317
},
{
"epoch": 0.5996360327570519,
"grad_norm": 1.3473250398166379,
"learning_rate": 4.8249144500547995e-06,
"loss": 0.0847,
"step": 1318
},
{
"epoch": 0.600090991810737,
"grad_norm": 1.3069589653313691,
"learning_rate": 4.824651617520402e-06,
"loss": 0.0669,
"step": 1319
},
{
"epoch": 0.6005459508644222,
"grad_norm": 1.5442197540840334,
"learning_rate": 4.824388595025972e-06,
"loss": 0.1178,
"step": 1320
},
{
"epoch": 0.6010009099181074,
"grad_norm": 1.5331976112900332,
"learning_rate": 4.824125382593003e-06,
"loss": 0.0874,
"step": 1321
},
{
"epoch": 0.6014558689717925,
"grad_norm": 1.4665462333148995,
"learning_rate": 4.823861980243003e-06,
"loss": 0.1106,
"step": 1322
},
{
"epoch": 0.6019108280254777,
"grad_norm": 1.909519129682131,
"learning_rate": 4.823598387997497e-06,
"loss": 0.1163,
"step": 1323
},
{
"epoch": 0.6023657870791629,
"grad_norm": 1.5641688210807196,
"learning_rate": 4.823334605878024e-06,
"loss": 0.0797,
"step": 1324
},
{
"epoch": 0.6028207461328481,
"grad_norm": 1.572854435679942,
"learning_rate": 4.82307063390614e-06,
"loss": 0.09,
"step": 1325
},
{
"epoch": 0.6032757051865332,
"grad_norm": 1.6242534333910885,
"learning_rate": 4.822806472103413e-06,
"loss": 0.1031,
"step": 1326
},
{
"epoch": 0.6037306642402184,
"grad_norm": 1.3730669374310474,
"learning_rate": 4.822542120491431e-06,
"loss": 0.0842,
"step": 1327
},
{
"epoch": 0.6041856232939036,
"grad_norm": 1.12030081002078,
"learning_rate": 4.822277579091796e-06,
"loss": 0.0933,
"step": 1328
},
{
"epoch": 0.6046405823475887,
"grad_norm": 1.2764536589561721,
"learning_rate": 4.822012847926125e-06,
"loss": 0.0795,
"step": 1329
},
{
"epoch": 0.6050955414012739,
"grad_norm": 1.4682540895282241,
"learning_rate": 4.821747927016049e-06,
"loss": 0.0834,
"step": 1330
},
{
"epoch": 0.6055505004549591,
"grad_norm": 1.5003874511683086,
"learning_rate": 4.821482816383219e-06,
"loss": 0.1096,
"step": 1331
},
{
"epoch": 0.6060054595086443,
"grad_norm": 1.2445527510541503,
"learning_rate": 4.821217516049296e-06,
"loss": 0.0789,
"step": 1332
},
{
"epoch": 0.6064604185623294,
"grad_norm": 1.3266125786690217,
"learning_rate": 4.82095202603596e-06,
"loss": 0.0796,
"step": 1333
},
{
"epoch": 0.6069153776160146,
"grad_norm": 1.5070167125246237,
"learning_rate": 4.820686346364906e-06,
"loss": 0.0924,
"step": 1334
},
{
"epoch": 0.6073703366696998,
"grad_norm": 1.9776742406411276,
"learning_rate": 4.820420477057843e-06,
"loss": 0.1066,
"step": 1335
},
{
"epoch": 0.607825295723385,
"grad_norm": 1.7020369242588063,
"learning_rate": 4.820154418136498e-06,
"loss": 0.1212,
"step": 1336
},
{
"epoch": 0.60828025477707,
"grad_norm": 1.8050978290349085,
"learning_rate": 4.819888169622612e-06,
"loss": 0.1102,
"step": 1337
},
{
"epoch": 0.6087352138307552,
"grad_norm": 1.4892394361348396,
"learning_rate": 4.819621731537942e-06,
"loss": 0.1139,
"step": 1338
},
{
"epoch": 0.6091901728844404,
"grad_norm": 1.4499858080485506,
"learning_rate": 4.819355103904259e-06,
"loss": 0.0833,
"step": 1339
},
{
"epoch": 0.6096451319381255,
"grad_norm": 1.5725512633612637,
"learning_rate": 4.81908828674335e-06,
"loss": 0.0915,
"step": 1340
},
{
"epoch": 0.6101000909918107,
"grad_norm": 1.122002936682905,
"learning_rate": 4.81882128007702e-06,
"loss": 0.0706,
"step": 1341
},
{
"epoch": 0.6105550500454959,
"grad_norm": 1.6231339345844462,
"learning_rate": 4.818554083927086e-06,
"loss": 0.0989,
"step": 1342
},
{
"epoch": 0.6110100090991811,
"grad_norm": 1.5566168283978299,
"learning_rate": 4.818286698315383e-06,
"loss": 0.0802,
"step": 1343
},
{
"epoch": 0.6114649681528662,
"grad_norm": 1.5209649714120241,
"learning_rate": 4.818019123263761e-06,
"loss": 0.1202,
"step": 1344
},
{
"epoch": 0.6119199272065514,
"grad_norm": 1.5198574931775437,
"learning_rate": 4.817751358794084e-06,
"loss": 0.0824,
"step": 1345
},
{
"epoch": 0.6123748862602366,
"grad_norm": 1.3969905074954028,
"learning_rate": 4.8174834049282325e-06,
"loss": 0.1004,
"step": 1346
},
{
"epoch": 0.6128298453139217,
"grad_norm": 2.1750619266428455,
"learning_rate": 4.817215261688104e-06,
"loss": 0.1479,
"step": 1347
},
{
"epoch": 0.6132848043676069,
"grad_norm": 1.4757724334002973,
"learning_rate": 4.816946929095607e-06,
"loss": 0.0816,
"step": 1348
},
{
"epoch": 0.6137397634212921,
"grad_norm": 1.5237192624117821,
"learning_rate": 4.816678407172671e-06,
"loss": 0.1043,
"step": 1349
},
{
"epoch": 0.6141947224749773,
"grad_norm": 1.369442898723999,
"learning_rate": 4.816409695941238e-06,
"loss": 0.092,
"step": 1350
},
{
"epoch": 0.6146496815286624,
"grad_norm": 1.3552993829733393,
"learning_rate": 4.816140795423265e-06,
"loss": 0.0896,
"step": 1351
},
{
"epoch": 0.6151046405823476,
"grad_norm": 1.914785073036727,
"learning_rate": 4.8158717056407255e-06,
"loss": 0.1405,
"step": 1352
},
{
"epoch": 0.6155595996360328,
"grad_norm": 2.510056256789934,
"learning_rate": 4.815602426615609e-06,
"loss": 0.1347,
"step": 1353
},
{
"epoch": 0.6160145586897179,
"grad_norm": 1.6994784582879867,
"learning_rate": 4.815332958369919e-06,
"loss": 0.1043,
"step": 1354
},
{
"epoch": 0.6164695177434031,
"grad_norm": 1.609212664276651,
"learning_rate": 4.815063300925677e-06,
"loss": 0.0801,
"step": 1355
},
{
"epoch": 0.6169244767970883,
"grad_norm": 1.3059644313522971,
"learning_rate": 4.814793454304915e-06,
"loss": 0.0962,
"step": 1356
},
{
"epoch": 0.6173794358507735,
"grad_norm": 1.316795599125537,
"learning_rate": 4.814523418529686e-06,
"loss": 0.0945,
"step": 1357
},
{
"epoch": 0.6178343949044586,
"grad_norm": 1.458401958119273,
"learning_rate": 4.814253193622056e-06,
"loss": 0.0931,
"step": 1358
},
{
"epoch": 0.6182893539581438,
"grad_norm": 1.5782519499021963,
"learning_rate": 4.813982779604106e-06,
"loss": 0.086,
"step": 1359
},
{
"epoch": 0.618744313011829,
"grad_norm": 1.4337607882677579,
"learning_rate": 4.813712176497933e-06,
"loss": 0.1051,
"step": 1360
},
{
"epoch": 0.6191992720655141,
"grad_norm": 1.7873980657918327,
"learning_rate": 4.813441384325649e-06,
"loss": 0.1049,
"step": 1361
},
{
"epoch": 0.6196542311191993,
"grad_norm": 1.3606232019090971,
"learning_rate": 4.813170403109383e-06,
"loss": 0.0708,
"step": 1362
},
{
"epoch": 0.6201091901728845,
"grad_norm": 1.3563405384219576,
"learning_rate": 4.8128992328712774e-06,
"loss": 0.086,
"step": 1363
},
{
"epoch": 0.6205641492265697,
"grad_norm": 1.3192980800606737,
"learning_rate": 4.812627873633492e-06,
"loss": 0.0781,
"step": 1364
},
{
"epoch": 0.6210191082802548,
"grad_norm": 1.7840648545688607,
"learning_rate": 4.8123563254182e-06,
"loss": 0.1361,
"step": 1365
},
{
"epoch": 0.62147406733394,
"grad_norm": 1.8322981514345795,
"learning_rate": 4.8120845882475924e-06,
"loss": 0.1282,
"step": 1366
},
{
"epoch": 0.6219290263876252,
"grad_norm": 2.0823134031423267,
"learning_rate": 4.8118126621438734e-06,
"loss": 0.1303,
"step": 1367
},
{
"epoch": 0.6223839854413102,
"grad_norm": 1.8738406581860008,
"learning_rate": 4.811540547129263e-06,
"loss": 0.1603,
"step": 1368
},
{
"epoch": 0.6228389444949954,
"grad_norm": 1.7465048715810059,
"learning_rate": 4.811268243225999e-06,
"loss": 0.1157,
"step": 1369
},
{
"epoch": 0.6232939035486806,
"grad_norm": 1.3620940982420815,
"learning_rate": 4.810995750456331e-06,
"loss": 0.0794,
"step": 1370
},
{
"epoch": 0.6237488626023658,
"grad_norm": 1.7874358637623151,
"learning_rate": 4.810723068842526e-06,
"loss": 0.1272,
"step": 1371
},
{
"epoch": 0.6242038216560509,
"grad_norm": 1.7579304475520012,
"learning_rate": 4.810450198406867e-06,
"loss": 0.1185,
"step": 1372
},
{
"epoch": 0.6246587807097361,
"grad_norm": 2.467789845960662,
"learning_rate": 4.810177139171653e-06,
"loss": 0.1557,
"step": 1373
},
{
"epoch": 0.6251137397634213,
"grad_norm": 1.1425822722647716,
"learning_rate": 4.809903891159195e-06,
"loss": 0.0657,
"step": 1374
},
{
"epoch": 0.6255686988171064,
"grad_norm": 2.016266262602286,
"learning_rate": 4.809630454391822e-06,
"loss": 0.107,
"step": 1375
},
{
"epoch": 0.6260236578707916,
"grad_norm": 1.7559713706649986,
"learning_rate": 4.80935682889188e-06,
"loss": 0.1506,
"step": 1376
},
{
"epoch": 0.6264786169244768,
"grad_norm": 1.4915046053791412,
"learning_rate": 4.809083014681726e-06,
"loss": 0.1212,
"step": 1377
},
{
"epoch": 0.626933575978162,
"grad_norm": 1.632149901378183,
"learning_rate": 4.808809011783735e-06,
"loss": 0.1266,
"step": 1378
},
{
"epoch": 0.6273885350318471,
"grad_norm": 1.3124240257866033,
"learning_rate": 4.808534820220299e-06,
"loss": 0.0837,
"step": 1379
},
{
"epoch": 0.6278434940855323,
"grad_norm": 1.7180772149333445,
"learning_rate": 4.8082604400138226e-06,
"loss": 0.1287,
"step": 1380
},
{
"epoch": 0.6282984531392175,
"grad_norm": 1.071227301223936,
"learning_rate": 4.807985871186726e-06,
"loss": 0.0776,
"step": 1381
},
{
"epoch": 0.6287534121929026,
"grad_norm": 1.7108717630459847,
"learning_rate": 4.8077111137614484e-06,
"loss": 0.0991,
"step": 1382
},
{
"epoch": 0.6292083712465878,
"grad_norm": 1.6365913346705507,
"learning_rate": 4.8074361677604394e-06,
"loss": 0.1004,
"step": 1383
},
{
"epoch": 0.629663330300273,
"grad_norm": 1.6392222223495618,
"learning_rate": 4.807161033206168e-06,
"loss": 0.1002,
"step": 1384
},
{
"epoch": 0.6301182893539582,
"grad_norm": 1.687969288374962,
"learning_rate": 4.806885710121114e-06,
"loss": 0.1099,
"step": 1385
},
{
"epoch": 0.6305732484076433,
"grad_norm": 1.4063826448960124,
"learning_rate": 4.806610198527779e-06,
"loss": 0.0896,
"step": 1386
},
{
"epoch": 0.6310282074613285,
"grad_norm": 1.540144583948253,
"learning_rate": 4.8063344984486755e-06,
"loss": 0.0879,
"step": 1387
},
{
"epoch": 0.6314831665150137,
"grad_norm": 1.5064915998503037,
"learning_rate": 4.806058609906331e-06,
"loss": 0.0962,
"step": 1388
},
{
"epoch": 0.6319381255686988,
"grad_norm": 2.1627291975031104,
"learning_rate": 4.805782532923292e-06,
"loss": 0.128,
"step": 1389
},
{
"epoch": 0.632393084622384,
"grad_norm": 1.647216495001309,
"learning_rate": 4.805506267522116e-06,
"loss": 0.1248,
"step": 1390
},
{
"epoch": 0.6328480436760692,
"grad_norm": 1.9302875416620158,
"learning_rate": 4.80522981372538e-06,
"loss": 0.1297,
"step": 1391
},
{
"epoch": 0.6333030027297544,
"grad_norm": 1.1401865771531547,
"learning_rate": 4.804953171555674e-06,
"loss": 0.077,
"step": 1392
},
{
"epoch": 0.6337579617834395,
"grad_norm": 2.3827232130583513,
"learning_rate": 4.8046763410356046e-06,
"loss": 0.1231,
"step": 1393
},
{
"epoch": 0.6342129208371247,
"grad_norm": 2.132009387110179,
"learning_rate": 4.804399322187791e-06,
"loss": 0.1363,
"step": 1394
},
{
"epoch": 0.6346678798908099,
"grad_norm": 1.914550517915578,
"learning_rate": 4.8041221150348725e-06,
"loss": 0.1408,
"step": 1395
},
{
"epoch": 0.635122838944495,
"grad_norm": 1.5194825054621766,
"learning_rate": 4.8038447195995e-06,
"loss": 0.1107,
"step": 1396
},
{
"epoch": 0.6355777979981801,
"grad_norm": 1.6908421741011026,
"learning_rate": 4.80356713590434e-06,
"loss": 0.1057,
"step": 1397
},
{
"epoch": 0.6360327570518653,
"grad_norm": 1.957264325451557,
"learning_rate": 4.803289363972078e-06,
"loss": 0.1279,
"step": 1398
},
{
"epoch": 0.6364877161055505,
"grad_norm": 1.429753125674933,
"learning_rate": 4.8030114038254094e-06,
"loss": 0.0906,
"step": 1399
},
{
"epoch": 0.6369426751592356,
"grad_norm": 1.574683320179916,
"learning_rate": 4.80273325548705e-06,
"loss": 0.0951,
"step": 1400
},
{
"epoch": 0.6373976342129208,
"grad_norm": 1.422366848550457,
"learning_rate": 4.802454918979728e-06,
"loss": 0.0906,
"step": 1401
},
{
"epoch": 0.637852593266606,
"grad_norm": 1.9963358207494448,
"learning_rate": 4.802176394326187e-06,
"loss": 0.1483,
"step": 1402
},
{
"epoch": 0.6383075523202911,
"grad_norm": 1.781860008561357,
"learning_rate": 4.801897681549188e-06,
"loss": 0.0878,
"step": 1403
},
{
"epoch": 0.6387625113739763,
"grad_norm": 1.635142292837631,
"learning_rate": 4.801618780671506e-06,
"loss": 0.1054,
"step": 1404
},
{
"epoch": 0.6392174704276615,
"grad_norm": 1.3235648640664877,
"learning_rate": 4.801339691715932e-06,
"loss": 0.0939,
"step": 1405
},
{
"epoch": 0.6396724294813467,
"grad_norm": 1.2245139670763607,
"learning_rate": 4.8010604147052695e-06,
"loss": 0.0625,
"step": 1406
},
{
"epoch": 0.6401273885350318,
"grad_norm": 1.4675657307946148,
"learning_rate": 4.800780949662343e-06,
"loss": 0.0994,
"step": 1407
},
{
"epoch": 0.640582347588717,
"grad_norm": 1.493372713452032,
"learning_rate": 4.800501296609986e-06,
"loss": 0.0841,
"step": 1408
},
{
"epoch": 0.6410373066424022,
"grad_norm": 1.7340947187812135,
"learning_rate": 4.800221455571053e-06,
"loss": 0.1088,
"step": 1409
},
{
"epoch": 0.6414922656960873,
"grad_norm": 1.2980113793311265,
"learning_rate": 4.7999414265684105e-06,
"loss": 0.0852,
"step": 1410
},
{
"epoch": 0.6419472247497725,
"grad_norm": 1.4464636793664913,
"learning_rate": 4.79966120962494e-06,
"loss": 0.0976,
"step": 1411
},
{
"epoch": 0.6424021838034577,
"grad_norm": 1.4659649640116845,
"learning_rate": 4.799380804763542e-06,
"loss": 0.0901,
"step": 1412
},
{
"epoch": 0.6428571428571429,
"grad_norm": 1.703460078887615,
"learning_rate": 4.799100212007128e-06,
"loss": 0.1074,
"step": 1413
},
{
"epoch": 0.643312101910828,
"grad_norm": 1.3106092828093312,
"learning_rate": 4.7988194313786275e-06,
"loss": 0.0736,
"step": 1414
},
{
"epoch": 0.6437670609645132,
"grad_norm": 0.9724381635858095,
"learning_rate": 4.798538462900984e-06,
"loss": 0.0657,
"step": 1415
},
{
"epoch": 0.6442220200181984,
"grad_norm": 1.3180852195340405,
"learning_rate": 4.798257306597157e-06,
"loss": 0.0791,
"step": 1416
},
{
"epoch": 0.6446769790718835,
"grad_norm": 1.3806990093425773,
"learning_rate": 4.797975962490122e-06,
"loss": 0.102,
"step": 1417
},
{
"epoch": 0.6451319381255687,
"grad_norm": 1.0796594549250105,
"learning_rate": 4.797694430602869e-06,
"loss": 0.0521,
"step": 1418
},
{
"epoch": 0.6455868971792539,
"grad_norm": 1.8299905872463706,
"learning_rate": 4.797412710958405e-06,
"loss": 0.1117,
"step": 1419
},
{
"epoch": 0.6460418562329391,
"grad_norm": 1.7103989898617438,
"learning_rate": 4.797130803579747e-06,
"loss": 0.1034,
"step": 1420
},
{
"epoch": 0.6464968152866242,
"grad_norm": 1.9920043416958193,
"learning_rate": 4.796848708489935e-06,
"loss": 0.1314,
"step": 1421
},
{
"epoch": 0.6469517743403094,
"grad_norm": 1.55952000492946,
"learning_rate": 4.796566425712018e-06,
"loss": 0.1094,
"step": 1422
},
{
"epoch": 0.6474067333939946,
"grad_norm": 1.569073968162044,
"learning_rate": 4.796283955269065e-06,
"loss": 0.1288,
"step": 1423
},
{
"epoch": 0.6478616924476797,
"grad_norm": 1.9345498009875362,
"learning_rate": 4.796001297184156e-06,
"loss": 0.1276,
"step": 1424
},
{
"epoch": 0.6483166515013649,
"grad_norm": 1.996849276778458,
"learning_rate": 4.79571845148039e-06,
"loss": 0.1443,
"step": 1425
},
{
"epoch": 0.6487716105550501,
"grad_norm": 1.1655015182194328,
"learning_rate": 4.795435418180879e-06,
"loss": 0.0895,
"step": 1426
},
{
"epoch": 0.6492265696087353,
"grad_norm": 1.6476688817001566,
"learning_rate": 4.795152197308753e-06,
"loss": 0.0993,
"step": 1427
},
{
"epoch": 0.6496815286624203,
"grad_norm": 1.0099999351331836,
"learning_rate": 4.794868788887154e-06,
"loss": 0.0671,
"step": 1428
},
{
"epoch": 0.6501364877161055,
"grad_norm": 1.8391539690012708,
"learning_rate": 4.79458519293924e-06,
"loss": 0.1345,
"step": 1429
},
{
"epoch": 0.6505914467697907,
"grad_norm": 1.4752057458255263,
"learning_rate": 4.794301409488187e-06,
"loss": 0.0873,
"step": 1430
},
{
"epoch": 0.6510464058234758,
"grad_norm": 1.2943024580621056,
"learning_rate": 4.7940174385571835e-06,
"loss": 0.0802,
"step": 1431
},
{
"epoch": 0.651501364877161,
"grad_norm": 1.3918512180039062,
"learning_rate": 4.793733280169435e-06,
"loss": 0.0993,
"step": 1432
},
{
"epoch": 0.6519563239308462,
"grad_norm": 2.2174420994103574,
"learning_rate": 4.7934489343481614e-06,
"loss": 0.1425,
"step": 1433
},
{
"epoch": 0.6524112829845314,
"grad_norm": 1.774834870886046,
"learning_rate": 4.7931644011165975e-06,
"loss": 0.0982,
"step": 1434
},
{
"epoch": 0.6528662420382165,
"grad_norm": 1.2208864014501382,
"learning_rate": 4.792879680497995e-06,
"loss": 0.0807,
"step": 1435
},
{
"epoch": 0.6533212010919017,
"grad_norm": 1.8182347519697841,
"learning_rate": 4.79259477251562e-06,
"loss": 0.1194,
"step": 1436
},
{
"epoch": 0.6537761601455869,
"grad_norm": 1.8801650010523618,
"learning_rate": 4.792309677192753e-06,
"loss": 0.1326,
"step": 1437
},
{
"epoch": 0.654231119199272,
"grad_norm": 1.776650087976607,
"learning_rate": 4.79202439455269e-06,
"loss": 0.0995,
"step": 1438
},
{
"epoch": 0.6546860782529572,
"grad_norm": 1.2419464528847455,
"learning_rate": 4.791738924618745e-06,
"loss": 0.0819,
"step": 1439
},
{
"epoch": 0.6551410373066424,
"grad_norm": 1.3878814997047564,
"learning_rate": 4.791453267414245e-06,
"loss": 0.077,
"step": 1440
},
{
"epoch": 0.6555959963603276,
"grad_norm": 1.3963850212985605,
"learning_rate": 4.7911674229625316e-06,
"loss": 0.0797,
"step": 1441
},
{
"epoch": 0.6560509554140127,
"grad_norm": 1.9634000929904991,
"learning_rate": 4.790881391286963e-06,
"loss": 0.1173,
"step": 1442
},
{
"epoch": 0.6565059144676979,
"grad_norm": 1.5553330936936114,
"learning_rate": 4.790595172410914e-06,
"loss": 0.099,
"step": 1443
},
{
"epoch": 0.6569608735213831,
"grad_norm": 1.9255393679593797,
"learning_rate": 4.79030876635777e-06,
"loss": 0.1353,
"step": 1444
},
{
"epoch": 0.6574158325750682,
"grad_norm": 1.461167870438619,
"learning_rate": 4.790022173150938e-06,
"loss": 0.1049,
"step": 1445
},
{
"epoch": 0.6578707916287534,
"grad_norm": 1.0062740037097007,
"learning_rate": 4.789735392813835e-06,
"loss": 0.0594,
"step": 1446
},
{
"epoch": 0.6583257506824386,
"grad_norm": 1.4058443933458273,
"learning_rate": 4.789448425369896e-06,
"loss": 0.0872,
"step": 1447
},
{
"epoch": 0.6587807097361238,
"grad_norm": 1.5311615159042697,
"learning_rate": 4.789161270842571e-06,
"loss": 0.0939,
"step": 1448
},
{
"epoch": 0.6592356687898089,
"grad_norm": 1.6595649465936542,
"learning_rate": 4.7888739292553235e-06,
"loss": 0.1248,
"step": 1449
},
{
"epoch": 0.6596906278434941,
"grad_norm": 1.7051412400140817,
"learning_rate": 4.788586400631636e-06,
"loss": 0.1197,
"step": 1450
},
{
"epoch": 0.6601455868971793,
"grad_norm": 1.2115114973668668,
"learning_rate": 4.788298684995003e-06,
"loss": 0.0905,
"step": 1451
},
{
"epoch": 0.6606005459508644,
"grad_norm": 1.4239694731611245,
"learning_rate": 4.7880107823689355e-06,
"loss": 0.0801,
"step": 1452
},
{
"epoch": 0.6610555050045496,
"grad_norm": 1.5925606772355265,
"learning_rate": 4.787722692776958e-06,
"loss": 0.1183,
"step": 1453
},
{
"epoch": 0.6615104640582348,
"grad_norm": 1.7931970729363222,
"learning_rate": 4.787434416242615e-06,
"loss": 0.1189,
"step": 1454
},
{
"epoch": 0.66196542311192,
"grad_norm": 2.3171059544303874,
"learning_rate": 4.787145952789461e-06,
"loss": 0.1436,
"step": 1455
},
{
"epoch": 0.6624203821656051,
"grad_norm": 1.4441484331538328,
"learning_rate": 4.786857302441069e-06,
"loss": 0.0781,
"step": 1456
},
{
"epoch": 0.6628753412192903,
"grad_norm": 1.690439275216053,
"learning_rate": 4.786568465221025e-06,
"loss": 0.1111,
"step": 1457
},
{
"epoch": 0.6633303002729755,
"grad_norm": 1.6812302333143159,
"learning_rate": 4.7862794411529315e-06,
"loss": 0.1175,
"step": 1458
},
{
"epoch": 0.6637852593266605,
"grad_norm": 1.9541579133281037,
"learning_rate": 4.7859902302604075e-06,
"loss": 0.1329,
"step": 1459
},
{
"epoch": 0.6642402183803457,
"grad_norm": 1.8591409223722424,
"learning_rate": 4.785700832567085e-06,
"loss": 0.1211,
"step": 1460
},
{
"epoch": 0.664695177434031,
"grad_norm": 1.325162611861324,
"learning_rate": 4.785411248096613e-06,
"loss": 0.0743,
"step": 1461
},
{
"epoch": 0.6651501364877161,
"grad_norm": 1.3065112220161235,
"learning_rate": 4.785121476872654e-06,
"loss": 0.1034,
"step": 1462
},
{
"epoch": 0.6656050955414012,
"grad_norm": 1.5925894626386907,
"learning_rate": 4.784831518918888e-06,
"loss": 0.1196,
"step": 1463
},
{
"epoch": 0.6660600545950864,
"grad_norm": 1.1820283205821733,
"learning_rate": 4.784541374259008e-06,
"loss": 0.0769,
"step": 1464
},
{
"epoch": 0.6665150136487716,
"grad_norm": 1.571736758093102,
"learning_rate": 4.7842510429167244e-06,
"loss": 0.1,
"step": 1465
},
{
"epoch": 0.6669699727024567,
"grad_norm": 1.5876822973446192,
"learning_rate": 4.783960524915761e-06,
"loss": 0.1214,
"step": 1466
},
{
"epoch": 0.6674249317561419,
"grad_norm": 1.5160576603586384,
"learning_rate": 4.783669820279858e-06,
"loss": 0.0979,
"step": 1467
},
{
"epoch": 0.6678798908098271,
"grad_norm": 1.2434477128547956,
"learning_rate": 4.783378929032769e-06,
"loss": 0.0824,
"step": 1468
},
{
"epoch": 0.6683348498635123,
"grad_norm": 1.46291955617626,
"learning_rate": 4.783087851198267e-06,
"loss": 0.0942,
"step": 1469
},
{
"epoch": 0.6687898089171974,
"grad_norm": 1.7951492565076614,
"learning_rate": 4.7827965868001356e-06,
"loss": 0.1192,
"step": 1470
},
{
"epoch": 0.6692447679708826,
"grad_norm": 1.4406289448080234,
"learning_rate": 4.782505135862176e-06,
"loss": 0.1009,
"step": 1471
},
{
"epoch": 0.6696997270245678,
"grad_norm": 1.4538780681359404,
"learning_rate": 4.782213498408205e-06,
"loss": 0.1012,
"step": 1472
},
{
"epoch": 0.6701546860782529,
"grad_norm": 1.4490300401257787,
"learning_rate": 4.781921674462053e-06,
"loss": 0.0782,
"step": 1473
},
{
"epoch": 0.6706096451319381,
"grad_norm": 1.8860995116874109,
"learning_rate": 4.781629664047566e-06,
"loss": 0.1148,
"step": 1474
},
{
"epoch": 0.6710646041856233,
"grad_norm": 1.3918036510588907,
"learning_rate": 4.781337467188607e-06,
"loss": 0.1025,
"step": 1475
},
{
"epoch": 0.6715195632393085,
"grad_norm": 2.3859380054935344,
"learning_rate": 4.781045083909053e-06,
"loss": 0.1219,
"step": 1476
},
{
"epoch": 0.6719745222929936,
"grad_norm": 1.9401784591368603,
"learning_rate": 4.780752514232796e-06,
"loss": 0.1022,
"step": 1477
},
{
"epoch": 0.6724294813466788,
"grad_norm": 1.374892200929808,
"learning_rate": 4.780459758183743e-06,
"loss": 0.0896,
"step": 1478
},
{
"epoch": 0.672884440400364,
"grad_norm": 1.4250914966637114,
"learning_rate": 4.780166815785817e-06,
"loss": 0.0907,
"step": 1479
},
{
"epoch": 0.6733393994540491,
"grad_norm": 1.3888650548243648,
"learning_rate": 4.7798736870629554e-06,
"loss": 0.1102,
"step": 1480
},
{
"epoch": 0.6737943585077343,
"grad_norm": 1.5225956652456023,
"learning_rate": 4.779580372039113e-06,
"loss": 0.0809,
"step": 1481
},
{
"epoch": 0.6742493175614195,
"grad_norm": 2.133500594182355,
"learning_rate": 4.779286870738256e-06,
"loss": 0.1069,
"step": 1482
},
{
"epoch": 0.6747042766151047,
"grad_norm": 1.6417529269403512,
"learning_rate": 4.778993183184371e-06,
"loss": 0.0879,
"step": 1483
},
{
"epoch": 0.6751592356687898,
"grad_norm": 2.188184230975794,
"learning_rate": 4.778699309401453e-06,
"loss": 0.1196,
"step": 1484
},
{
"epoch": 0.675614194722475,
"grad_norm": 1.366654497975806,
"learning_rate": 4.7784052494135195e-06,
"loss": 0.0952,
"step": 1485
},
{
"epoch": 0.6760691537761602,
"grad_norm": 2.2251300669835734,
"learning_rate": 4.778111003244596e-06,
"loss": 0.0962,
"step": 1486
},
{
"epoch": 0.6765241128298453,
"grad_norm": 1.2239477453163228,
"learning_rate": 4.777816570918731e-06,
"loss": 0.0771,
"step": 1487
},
{
"epoch": 0.6769790718835305,
"grad_norm": 1.4442063624509236,
"learning_rate": 4.777521952459982e-06,
"loss": 0.0881,
"step": 1488
},
{
"epoch": 0.6774340309372157,
"grad_norm": 1.792892312265488,
"learning_rate": 4.777227147892424e-06,
"loss": 0.108,
"step": 1489
},
{
"epoch": 0.6778889899909009,
"grad_norm": 1.5848897809985478,
"learning_rate": 4.776932157240147e-06,
"loss": 0.0973,
"step": 1490
},
{
"epoch": 0.678343949044586,
"grad_norm": 1.5924788947742,
"learning_rate": 4.776636980527257e-06,
"loss": 0.0997,
"step": 1491
},
{
"epoch": 0.6787989080982711,
"grad_norm": 1.7689554235448024,
"learning_rate": 4.776341617777874e-06,
"loss": 0.0907,
"step": 1492
},
{
"epoch": 0.6792538671519563,
"grad_norm": 1.561936690334899,
"learning_rate": 4.776046069016135e-06,
"loss": 0.1045,
"step": 1493
},
{
"epoch": 0.6797088262056415,
"grad_norm": 1.978023029084926,
"learning_rate": 4.775750334266188e-06,
"loss": 0.1316,
"step": 1494
},
{
"epoch": 0.6801637852593266,
"grad_norm": 1.2221171400180673,
"learning_rate": 4.775454413552202e-06,
"loss": 0.0708,
"step": 1495
},
{
"epoch": 0.6806187443130118,
"grad_norm": 2.2916692264154848,
"learning_rate": 4.775158306898358e-06,
"loss": 0.1045,
"step": 1496
},
{
"epoch": 0.681073703366697,
"grad_norm": 1.5270730953843772,
"learning_rate": 4.774862014328849e-06,
"loss": 0.087,
"step": 1497
},
{
"epoch": 0.6815286624203821,
"grad_norm": 1.5001501033936573,
"learning_rate": 4.774565535867892e-06,
"loss": 0.083,
"step": 1498
},
{
"epoch": 0.6819836214740673,
"grad_norm": 2.228962091730558,
"learning_rate": 4.77426887153971e-06,
"loss": 0.132,
"step": 1499
},
{
"epoch": 0.6824385805277525,
"grad_norm": 1.7756631467911705,
"learning_rate": 4.773972021368546e-06,
"loss": 0.1156,
"step": 1500
},
{
"epoch": 0.6828935395814377,
"grad_norm": 1.9028113721779674,
"learning_rate": 4.773674985378658e-06,
"loss": 0.1692,
"step": 1501
},
{
"epoch": 0.6833484986351228,
"grad_norm": 1.591856567558633,
"learning_rate": 4.773377763594319e-06,
"loss": 0.0829,
"step": 1502
},
{
"epoch": 0.683803457688808,
"grad_norm": 1.7330424169213765,
"learning_rate": 4.773080356039814e-06,
"loss": 0.1079,
"step": 1503
},
{
"epoch": 0.6842584167424932,
"grad_norm": 1.3093378510726064,
"learning_rate": 4.772782762739448e-06,
"loss": 0.0919,
"step": 1504
},
{
"epoch": 0.6847133757961783,
"grad_norm": 1.5644465201102973,
"learning_rate": 4.772484983717539e-06,
"loss": 0.096,
"step": 1505
},
{
"epoch": 0.6851683348498635,
"grad_norm": 1.7535246249527565,
"learning_rate": 4.77218701899842e-06,
"loss": 0.1025,
"step": 1506
},
{
"epoch": 0.6856232939035487,
"grad_norm": 1.682557507776212,
"learning_rate": 4.771888868606438e-06,
"loss": 0.1245,
"step": 1507
},
{
"epoch": 0.6860782529572339,
"grad_norm": 1.1063626083550568,
"learning_rate": 4.771590532565957e-06,
"loss": 0.0628,
"step": 1508
},
{
"epoch": 0.686533212010919,
"grad_norm": 1.447485907138006,
"learning_rate": 4.771292010901357e-06,
"loss": 0.0756,
"step": 1509
},
{
"epoch": 0.6869881710646042,
"grad_norm": 1.9968564435349099,
"learning_rate": 4.77099330363703e-06,
"loss": 0.1121,
"step": 1510
},
{
"epoch": 0.6874431301182894,
"grad_norm": 1.331414088559165,
"learning_rate": 4.770694410797387e-06,
"loss": 0.0918,
"step": 1511
},
{
"epoch": 0.6878980891719745,
"grad_norm": 1.7374051988448433,
"learning_rate": 4.770395332406851e-06,
"loss": 0.1046,
"step": 1512
},
{
"epoch": 0.6883530482256597,
"grad_norm": 1.5590482284052172,
"learning_rate": 4.770096068489861e-06,
"loss": 0.1045,
"step": 1513
},
{
"epoch": 0.6888080072793449,
"grad_norm": 1.2266167614387768,
"learning_rate": 4.769796619070872e-06,
"loss": 0.0877,
"step": 1514
},
{
"epoch": 0.6892629663330301,
"grad_norm": 1.1438287132644533,
"learning_rate": 4.769496984174353e-06,
"loss": 0.0759,
"step": 1515
},
{
"epoch": 0.6897179253867152,
"grad_norm": 1.5191110521315079,
"learning_rate": 4.769197163824791e-06,
"loss": 0.0839,
"step": 1516
},
{
"epoch": 0.6901728844404004,
"grad_norm": 1.5352637302100918,
"learning_rate": 4.768897158046683e-06,
"loss": 0.0927,
"step": 1517
},
{
"epoch": 0.6906278434940856,
"grad_norm": 1.224151460496261,
"learning_rate": 4.768596966864546e-06,
"loss": 0.0758,
"step": 1518
},
{
"epoch": 0.6910828025477707,
"grad_norm": 2.097275342036678,
"learning_rate": 4.76829659030291e-06,
"loss": 0.1606,
"step": 1519
},
{
"epoch": 0.6915377616014559,
"grad_norm": 1.773445388033648,
"learning_rate": 4.767996028386319e-06,
"loss": 0.1071,
"step": 1520
},
{
"epoch": 0.6919927206551411,
"grad_norm": 1.798404416562804,
"learning_rate": 4.767695281139336e-06,
"loss": 0.0882,
"step": 1521
},
{
"epoch": 0.6924476797088263,
"grad_norm": 1.6643609283655776,
"learning_rate": 4.767394348586535e-06,
"loss": 0.0986,
"step": 1522
},
{
"epoch": 0.6929026387625113,
"grad_norm": 1.8351458616302123,
"learning_rate": 4.767093230752507e-06,
"loss": 0.1398,
"step": 1523
},
{
"epoch": 0.6933575978161965,
"grad_norm": 1.695947028633324,
"learning_rate": 4.766791927661859e-06,
"loss": 0.1217,
"step": 1524
},
{
"epoch": 0.6938125568698817,
"grad_norm": 1.706097971198418,
"learning_rate": 4.766490439339211e-06,
"loss": 0.0852,
"step": 1525
},
{
"epoch": 0.6942675159235668,
"grad_norm": 1.6641835764066073,
"learning_rate": 4.7661887658092e-06,
"loss": 0.1078,
"step": 1526
},
{
"epoch": 0.694722474977252,
"grad_norm": 1.4721263946542316,
"learning_rate": 4.765886907096477e-06,
"loss": 0.1046,
"step": 1527
},
{
"epoch": 0.6951774340309372,
"grad_norm": 1.7677748922664356,
"learning_rate": 4.7655848632257084e-06,
"loss": 0.1257,
"step": 1528
},
{
"epoch": 0.6956323930846224,
"grad_norm": 1.5849838243983163,
"learning_rate": 4.7652826342215764e-06,
"loss": 0.113,
"step": 1529
},
{
"epoch": 0.6960873521383075,
"grad_norm": 1.8167247958495556,
"learning_rate": 4.764980220108777e-06,
"loss": 0.1308,
"step": 1530
},
{
"epoch": 0.6965423111919927,
"grad_norm": 2.259597776447737,
"learning_rate": 4.764677620912022e-06,
"loss": 0.1488,
"step": 1531
},
{
"epoch": 0.6969972702456779,
"grad_norm": 1.3871244274886438,
"learning_rate": 4.764374836656041e-06,
"loss": 0.1014,
"step": 1532
},
{
"epoch": 0.697452229299363,
"grad_norm": 1.261518456907349,
"learning_rate": 4.764071867365571e-06,
"loss": 0.0998,
"step": 1533
},
{
"epoch": 0.6979071883530482,
"grad_norm": 1.7720377742538196,
"learning_rate": 4.763768713065375e-06,
"loss": 0.1003,
"step": 1534
},
{
"epoch": 0.6983621474067334,
"grad_norm": 1.9316342411609453,
"learning_rate": 4.763465373780223e-06,
"loss": 0.1218,
"step": 1535
},
{
"epoch": 0.6988171064604186,
"grad_norm": 1.7090441393124594,
"learning_rate": 4.763161849534902e-06,
"loss": 0.1016,
"step": 1536
},
{
"epoch": 0.6992720655141037,
"grad_norm": 1.0772372058571478,
"learning_rate": 4.762858140354214e-06,
"loss": 0.0795,
"step": 1537
},
{
"epoch": 0.6997270245677889,
"grad_norm": 1.5989783419371975,
"learning_rate": 4.7625542462629785e-06,
"loss": 0.1051,
"step": 1538
},
{
"epoch": 0.7001819836214741,
"grad_norm": 0.9329076652331691,
"learning_rate": 4.762250167286027e-06,
"loss": 0.0492,
"step": 1539
},
{
"epoch": 0.7006369426751592,
"grad_norm": 1.7557978189042716,
"learning_rate": 4.761945903448209e-06,
"loss": 0.1336,
"step": 1540
},
{
"epoch": 0.7010919017288444,
"grad_norm": 1.1252616618728841,
"learning_rate": 4.761641454774386e-06,
"loss": 0.085,
"step": 1541
},
{
"epoch": 0.7015468607825296,
"grad_norm": 1.9520354546929128,
"learning_rate": 4.761336821289436e-06,
"loss": 0.158,
"step": 1542
},
{
"epoch": 0.7020018198362148,
"grad_norm": 1.088110444545801,
"learning_rate": 4.761032003018254e-06,
"loss": 0.0667,
"step": 1543
},
{
"epoch": 0.7024567788898999,
"grad_norm": 1.353551986968956,
"learning_rate": 4.760726999985748e-06,
"loss": 0.0748,
"step": 1544
},
{
"epoch": 0.7029117379435851,
"grad_norm": 1.2483430565784006,
"learning_rate": 4.7604218122168406e-06,
"loss": 0.0821,
"step": 1545
},
{
"epoch": 0.7033666969972703,
"grad_norm": 2.014581699156683,
"learning_rate": 4.760116439736471e-06,
"loss": 0.1376,
"step": 1546
},
{
"epoch": 0.7038216560509554,
"grad_norm": 2.2990546871467386,
"learning_rate": 4.759810882569591e-06,
"loss": 0.1528,
"step": 1547
},
{
"epoch": 0.7042766151046406,
"grad_norm": 1.062682027844058,
"learning_rate": 4.759505140741172e-06,
"loss": 0.0646,
"step": 1548
},
{
"epoch": 0.7047315741583258,
"grad_norm": 2.1924162550625863,
"learning_rate": 4.759199214276196e-06,
"loss": 0.1277,
"step": 1549
},
{
"epoch": 0.705186533212011,
"grad_norm": 1.4339312162219853,
"learning_rate": 4.758893103199665e-06,
"loss": 0.1056,
"step": 1550
},
{
"epoch": 0.7056414922656961,
"grad_norm": 1.6814902406994063,
"learning_rate": 4.758586807536588e-06,
"loss": 0.0968,
"step": 1551
},
{
"epoch": 0.7060964513193813,
"grad_norm": 1.055808036587697,
"learning_rate": 4.758280327311998e-06,
"loss": 0.0624,
"step": 1552
},
{
"epoch": 0.7065514103730665,
"grad_norm": 2.092612313664783,
"learning_rate": 4.757973662550938e-06,
"loss": 0.1076,
"step": 1553
},
{
"epoch": 0.7070063694267515,
"grad_norm": 1.2099784449421243,
"learning_rate": 4.757666813278466e-06,
"loss": 0.0791,
"step": 1554
},
{
"epoch": 0.7074613284804367,
"grad_norm": 1.7701219392423706,
"learning_rate": 4.757359779519659e-06,
"loss": 0.1158,
"step": 1555
},
{
"epoch": 0.707916287534122,
"grad_norm": 1.9442818433331057,
"learning_rate": 4.757052561299604e-06,
"loss": 0.1498,
"step": 1556
},
{
"epoch": 0.7083712465878071,
"grad_norm": 2.1934930579734417,
"learning_rate": 4.756745158643407e-06,
"loss": 0.1446,
"step": 1557
},
{
"epoch": 0.7088262056414922,
"grad_norm": 1.852211386061071,
"learning_rate": 4.7564375715761865e-06,
"loss": 0.1163,
"step": 1558
},
{
"epoch": 0.7092811646951774,
"grad_norm": 0.8096640629799587,
"learning_rate": 4.756129800123078e-06,
"loss": 0.0398,
"step": 1559
},
{
"epoch": 0.7097361237488626,
"grad_norm": 1.414444864803518,
"learning_rate": 4.755821844309232e-06,
"loss": 0.1126,
"step": 1560
},
{
"epoch": 0.7101910828025477,
"grad_norm": 1.598441885528022,
"learning_rate": 4.75551370415981e-06,
"loss": 0.1008,
"step": 1561
},
{
"epoch": 0.7106460418562329,
"grad_norm": 1.7052656116179543,
"learning_rate": 4.755205379699996e-06,
"loss": 0.105,
"step": 1562
},
{
"epoch": 0.7111010009099181,
"grad_norm": 1.570140158085679,
"learning_rate": 4.75489687095498e-06,
"loss": 0.103,
"step": 1563
},
{
"epoch": 0.7115559599636033,
"grad_norm": 1.6128979312038125,
"learning_rate": 4.754588177949977e-06,
"loss": 0.0947,
"step": 1564
},
{
"epoch": 0.7120109190172884,
"grad_norm": 1.5157416875909306,
"learning_rate": 4.7542793007102086e-06,
"loss": 0.0826,
"step": 1565
},
{
"epoch": 0.7124658780709736,
"grad_norm": 1.7615482286425264,
"learning_rate": 4.7539702392609165e-06,
"loss": 0.1349,
"step": 1566
},
{
"epoch": 0.7129208371246588,
"grad_norm": 1.1762971295347604,
"learning_rate": 4.753660993627356e-06,
"loss": 0.0649,
"step": 1567
},
{
"epoch": 0.7133757961783439,
"grad_norm": 2.155472421625263,
"learning_rate": 4.753351563834795e-06,
"loss": 0.1308,
"step": 1568
},
{
"epoch": 0.7138307552320291,
"grad_norm": 1.7676905218706818,
"learning_rate": 4.753041949908521e-06,
"loss": 0.1034,
"step": 1569
},
{
"epoch": 0.7142857142857143,
"grad_norm": 1.8250626593850294,
"learning_rate": 4.752732151873834e-06,
"loss": 0.1,
"step": 1570
},
{
"epoch": 0.7147406733393995,
"grad_norm": 1.4984330035047126,
"learning_rate": 4.752422169756048e-06,
"loss": 0.1107,
"step": 1571
},
{
"epoch": 0.7151956323930846,
"grad_norm": 1.2161952645703746,
"learning_rate": 4.752112003580495e-06,
"loss": 0.0772,
"step": 1572
},
{
"epoch": 0.7156505914467698,
"grad_norm": 1.8268634010084226,
"learning_rate": 4.751801653372518e-06,
"loss": 0.0853,
"step": 1573
},
{
"epoch": 0.716105550500455,
"grad_norm": 1.6855455239576989,
"learning_rate": 4.751491119157481e-06,
"loss": 0.1055,
"step": 1574
},
{
"epoch": 0.7165605095541401,
"grad_norm": 1.1214993041730539,
"learning_rate": 4.751180400960756e-06,
"loss": 0.0653,
"step": 1575
},
{
"epoch": 0.7170154686078253,
"grad_norm": 1.8475828844832658,
"learning_rate": 4.7508694988077355e-06,
"loss": 0.1416,
"step": 1576
},
{
"epoch": 0.7174704276615105,
"grad_norm": 1.4469787222461497,
"learning_rate": 4.750558412723824e-06,
"loss": 0.0766,
"step": 1577
},
{
"epoch": 0.7179253867151957,
"grad_norm": 1.6682547194818422,
"learning_rate": 4.750247142734442e-06,
"loss": 0.073,
"step": 1578
},
{
"epoch": 0.7183803457688808,
"grad_norm": 1.8235039708297685,
"learning_rate": 4.749935688865026e-06,
"loss": 0.1299,
"step": 1579
},
{
"epoch": 0.718835304822566,
"grad_norm": 1.2674959382982702,
"learning_rate": 4.749624051141026e-06,
"loss": 0.0639,
"step": 1580
},
{
"epoch": 0.7192902638762512,
"grad_norm": 1.1814301599394401,
"learning_rate": 4.7493122295879076e-06,
"loss": 0.074,
"step": 1581
},
{
"epoch": 0.7197452229299363,
"grad_norm": 1.8607689058316668,
"learning_rate": 4.7490002242311525e-06,
"loss": 0.1202,
"step": 1582
},
{
"epoch": 0.7202001819836215,
"grad_norm": 1.40248476110639,
"learning_rate": 4.748688035096255e-06,
"loss": 0.0831,
"step": 1583
},
{
"epoch": 0.7206551410373067,
"grad_norm": 1.376835864910441,
"learning_rate": 4.748375662208726e-06,
"loss": 0.0627,
"step": 1584
},
{
"epoch": 0.7211101000909919,
"grad_norm": 1.7445419287373105,
"learning_rate": 4.748063105594092e-06,
"loss": 0.1182,
"step": 1585
},
{
"epoch": 0.721565059144677,
"grad_norm": 1.6298546358892563,
"learning_rate": 4.747750365277892e-06,
"loss": 0.1203,
"step": 1586
},
{
"epoch": 0.7220200181983621,
"grad_norm": 1.78857652271692,
"learning_rate": 4.747437441285684e-06,
"loss": 0.0845,
"step": 1587
},
{
"epoch": 0.7224749772520473,
"grad_norm": 1.5543624854659128,
"learning_rate": 4.747124333643038e-06,
"loss": 0.1067,
"step": 1588
},
{
"epoch": 0.7229299363057324,
"grad_norm": 1.6938973264546118,
"learning_rate": 4.746811042375538e-06,
"loss": 0.1092,
"step": 1589
},
{
"epoch": 0.7233848953594176,
"grad_norm": 1.4339359801015907,
"learning_rate": 4.746497567508787e-06,
"loss": 0.1009,
"step": 1590
},
{
"epoch": 0.7238398544131028,
"grad_norm": 1.370915821139941,
"learning_rate": 4.7461839090684e-06,
"loss": 0.0967,
"step": 1591
},
{
"epoch": 0.724294813466788,
"grad_norm": 1.65404522408881,
"learning_rate": 4.745870067080007e-06,
"loss": 0.0936,
"step": 1592
},
{
"epoch": 0.7247497725204731,
"grad_norm": 2.5744395171768026,
"learning_rate": 4.7455560415692545e-06,
"loss": 0.1734,
"step": 1593
},
{
"epoch": 0.7252047315741583,
"grad_norm": 1.6130757907987123,
"learning_rate": 4.745241832561803e-06,
"loss": 0.0782,
"step": 1594
},
{
"epoch": 0.7256596906278435,
"grad_norm": 1.3264278567683987,
"learning_rate": 4.744927440083329e-06,
"loss": 0.0883,
"step": 1595
},
{
"epoch": 0.7261146496815286,
"grad_norm": 1.4845169251283168,
"learning_rate": 4.744612864159522e-06,
"loss": 0.0866,
"step": 1596
},
{
"epoch": 0.7265696087352138,
"grad_norm": 1.867201501230081,
"learning_rate": 4.7442981048160895e-06,
"loss": 0.1239,
"step": 1597
},
{
"epoch": 0.727024567788899,
"grad_norm": 1.5395932028522379,
"learning_rate": 4.74398316207875e-06,
"loss": 0.0937,
"step": 1598
},
{
"epoch": 0.7274795268425842,
"grad_norm": 2.47394198911153,
"learning_rate": 4.74366803597324e-06,
"loss": 0.1896,
"step": 1599
},
{
"epoch": 0.7279344858962693,
"grad_norm": 1.6788148875306355,
"learning_rate": 4.743352726525311e-06,
"loss": 0.1001,
"step": 1600
},
{
"epoch": 0.7283894449499545,
"grad_norm": 1.1785705121541328,
"learning_rate": 4.743037233760728e-06,
"loss": 0.0723,
"step": 1601
},
{
"epoch": 0.7288444040036397,
"grad_norm": 1.5889628523330563,
"learning_rate": 4.742721557705271e-06,
"loss": 0.0978,
"step": 1602
},
{
"epoch": 0.7292993630573248,
"grad_norm": 1.3734642738638374,
"learning_rate": 4.7424056983847374e-06,
"loss": 0.0961,
"step": 1603
},
{
"epoch": 0.72975432211101,
"grad_norm": 1.6433399521175855,
"learning_rate": 4.7420896558249366e-06,
"loss": 0.1037,
"step": 1604
},
{
"epoch": 0.7302092811646952,
"grad_norm": 1.0189389361932368,
"learning_rate": 4.741773430051694e-06,
"loss": 0.0571,
"step": 1605
},
{
"epoch": 0.7306642402183804,
"grad_norm": 1.8326786415176635,
"learning_rate": 4.74145702109085e-06,
"loss": 0.1069,
"step": 1606
},
{
"epoch": 0.7311191992720655,
"grad_norm": 1.9145632983548877,
"learning_rate": 4.741140428968261e-06,
"loss": 0.1155,
"step": 1607
},
{
"epoch": 0.7315741583257507,
"grad_norm": 0.8975672007604479,
"learning_rate": 4.740823653709797e-06,
"loss": 0.0594,
"step": 1608
},
{
"epoch": 0.7320291173794359,
"grad_norm": 1.1104882324072687,
"learning_rate": 4.740506695341343e-06,
"loss": 0.0774,
"step": 1609
},
{
"epoch": 0.732484076433121,
"grad_norm": 1.8804023117943707,
"learning_rate": 4.740189553888801e-06,
"loss": 0.1265,
"step": 1610
},
{
"epoch": 0.7329390354868062,
"grad_norm": 1.3783166591523974,
"learning_rate": 4.739872229378085e-06,
"loss": 0.0849,
"step": 1611
},
{
"epoch": 0.7333939945404914,
"grad_norm": 1.5383875985636057,
"learning_rate": 4.739554721835125e-06,
"loss": 0.0764,
"step": 1612
},
{
"epoch": 0.7338489535941766,
"grad_norm": 1.7836575489679842,
"learning_rate": 4.739237031285867e-06,
"loss": 0.1208,
"step": 1613
},
{
"epoch": 0.7343039126478617,
"grad_norm": 2.0374287466508343,
"learning_rate": 4.738919157756272e-06,
"loss": 0.1283,
"step": 1614
},
{
"epoch": 0.7347588717015469,
"grad_norm": 1.4713023421634537,
"learning_rate": 4.738601101272313e-06,
"loss": 0.1143,
"step": 1615
},
{
"epoch": 0.7352138307552321,
"grad_norm": 1.3004252033026868,
"learning_rate": 4.738282861859983e-06,
"loss": 0.0785,
"step": 1616
},
{
"epoch": 0.7356687898089171,
"grad_norm": 1.7078107635335555,
"learning_rate": 4.737964439545284e-06,
"loss": 0.0989,
"step": 1617
},
{
"epoch": 0.7361237488626023,
"grad_norm": 1.482235192071265,
"learning_rate": 4.737645834354238e-06,
"loss": 0.0889,
"step": 1618
},
{
"epoch": 0.7365787079162875,
"grad_norm": 1.3632184750760454,
"learning_rate": 4.737327046312879e-06,
"loss": 0.0728,
"step": 1619
},
{
"epoch": 0.7370336669699727,
"grad_norm": 1.6185932631828381,
"learning_rate": 4.737008075447259e-06,
"loss": 0.0967,
"step": 1620
},
{
"epoch": 0.7374886260236578,
"grad_norm": 1.7060869720795129,
"learning_rate": 4.73668892178344e-06,
"loss": 0.1054,
"step": 1621
},
{
"epoch": 0.737943585077343,
"grad_norm": 1.672488053873089,
"learning_rate": 4.736369585347503e-06,
"loss": 0.1172,
"step": 1622
},
{
"epoch": 0.7383985441310282,
"grad_norm": 2.009207481858011,
"learning_rate": 4.736050066165544e-06,
"loss": 0.1104,
"step": 1623
},
{
"epoch": 0.7388535031847133,
"grad_norm": 1.7386909135986017,
"learning_rate": 4.735730364263671e-06,
"loss": 0.1142,
"step": 1624
},
{
"epoch": 0.7393084622383985,
"grad_norm": 1.6299431755796778,
"learning_rate": 4.735410479668009e-06,
"loss": 0.109,
"step": 1625
},
{
"epoch": 0.7397634212920837,
"grad_norm": 1.5971057123643035,
"learning_rate": 4.735090412404697e-06,
"loss": 0.1037,
"step": 1626
},
{
"epoch": 0.7402183803457689,
"grad_norm": 1.4066558803560258,
"learning_rate": 4.734770162499891e-06,
"loss": 0.0718,
"step": 1627
},
{
"epoch": 0.740673339399454,
"grad_norm": 1.3437849408188942,
"learning_rate": 4.734449729979759e-06,
"loss": 0.0775,
"step": 1628
},
{
"epoch": 0.7411282984531392,
"grad_norm": 1.8126383722195984,
"learning_rate": 4.734129114870486e-06,
"loss": 0.1097,
"step": 1629
},
{
"epoch": 0.7415832575068244,
"grad_norm": 1.7276681892706887,
"learning_rate": 4.733808317198271e-06,
"loss": 0.075,
"step": 1630
},
{
"epoch": 0.7420382165605095,
"grad_norm": 1.4303092464154914,
"learning_rate": 4.733487336989327e-06,
"loss": 0.0839,
"step": 1631
},
{
"epoch": 0.7424931756141947,
"grad_norm": 1.8755052783018096,
"learning_rate": 4.733166174269886e-06,
"loss": 0.1156,
"step": 1632
},
{
"epoch": 0.7429481346678799,
"grad_norm": 1.4937298948438007,
"learning_rate": 4.732844829066189e-06,
"loss": 0.1005,
"step": 1633
},
{
"epoch": 0.7434030937215651,
"grad_norm": 1.641256737556786,
"learning_rate": 4.732523301404497e-06,
"loss": 0.1038,
"step": 1634
},
{
"epoch": 0.7438580527752502,
"grad_norm": 1.8968655868657809,
"learning_rate": 4.732201591311082e-06,
"loss": 0.1318,
"step": 1635
},
{
"epoch": 0.7443130118289354,
"grad_norm": 1.5647661977098755,
"learning_rate": 4.731879698812233e-06,
"loss": 0.1295,
"step": 1636
},
{
"epoch": 0.7447679708826206,
"grad_norm": 1.3130665672457837,
"learning_rate": 4.731557623934255e-06,
"loss": 0.0797,
"step": 1637
},
{
"epoch": 0.7452229299363057,
"grad_norm": 2.2524036787204236,
"learning_rate": 4.7312353667034645e-06,
"loss": 0.1549,
"step": 1638
},
{
"epoch": 0.7456778889899909,
"grad_norm": 2.171706574250327,
"learning_rate": 4.730912927146197e-06,
"loss": 0.1221,
"step": 1639
},
{
"epoch": 0.7461328480436761,
"grad_norm": 1.3055559061415911,
"learning_rate": 4.7305903052888e-06,
"loss": 0.0797,
"step": 1640
},
{
"epoch": 0.7465878070973613,
"grad_norm": 1.9092438244747783,
"learning_rate": 4.730267501157636e-06,
"loss": 0.1211,
"step": 1641
},
{
"epoch": 0.7470427661510464,
"grad_norm": 1.3873103303782754,
"learning_rate": 4.729944514779084e-06,
"loss": 0.0863,
"step": 1642
},
{
"epoch": 0.7474977252047316,
"grad_norm": 1.3769315994876887,
"learning_rate": 4.729621346179536e-06,
"loss": 0.095,
"step": 1643
},
{
"epoch": 0.7479526842584168,
"grad_norm": 1.3309888167219324,
"learning_rate": 4.7292979953854e-06,
"loss": 0.091,
"step": 1644
},
{
"epoch": 0.7484076433121019,
"grad_norm": 1.3388937850633889,
"learning_rate": 4.7289744624231004e-06,
"loss": 0.0715,
"step": 1645
},
{
"epoch": 0.7488626023657871,
"grad_norm": 2.9889212809141026,
"learning_rate": 4.728650747319073e-06,
"loss": 0.1403,
"step": 1646
},
{
"epoch": 0.7493175614194723,
"grad_norm": 1.7436207494414042,
"learning_rate": 4.728326850099771e-06,
"loss": 0.11,
"step": 1647
},
{
"epoch": 0.7497725204731575,
"grad_norm": 1.2990437768947476,
"learning_rate": 4.728002770791663e-06,
"loss": 0.0982,
"step": 1648
},
{
"epoch": 0.7502274795268425,
"grad_norm": 1.344045724677696,
"learning_rate": 4.727678509421229e-06,
"loss": 0.0922,
"step": 1649
},
{
"epoch": 0.7506824385805277,
"grad_norm": 1.1045854705826224,
"learning_rate": 4.727354066014968e-06,
"loss": 0.0704,
"step": 1650
},
{
"epoch": 0.7511373976342129,
"grad_norm": 1.5988720844668791,
"learning_rate": 4.727029440599391e-06,
"loss": 0.1066,
"step": 1651
},
{
"epoch": 0.7515923566878981,
"grad_norm": 1.3512878420396681,
"learning_rate": 4.726704633201025e-06,
"loss": 0.074,
"step": 1652
},
{
"epoch": 0.7520473157415832,
"grad_norm": 1.669678273086279,
"learning_rate": 4.726379643846412e-06,
"loss": 0.1167,
"step": 1653
},
{
"epoch": 0.7525022747952684,
"grad_norm": 1.8860050110009976,
"learning_rate": 4.726054472562109e-06,
"loss": 0.1203,
"step": 1654
},
{
"epoch": 0.7529572338489536,
"grad_norm": 1.1328199081442367,
"learning_rate": 4.725729119374687e-06,
"loss": 0.0715,
"step": 1655
},
{
"epoch": 0.7534121929026387,
"grad_norm": 1.369550149899098,
"learning_rate": 4.725403584310734e-06,
"loss": 0.0788,
"step": 1656
},
{
"epoch": 0.7538671519563239,
"grad_norm": 1.7251897843263797,
"learning_rate": 4.725077867396849e-06,
"loss": 0.0951,
"step": 1657
},
{
"epoch": 0.7543221110100091,
"grad_norm": 1.4350282883675796,
"learning_rate": 4.724751968659648e-06,
"loss": 0.1096,
"step": 1658
},
{
"epoch": 0.7547770700636943,
"grad_norm": 1.9342343144020262,
"learning_rate": 4.724425888125764e-06,
"loss": 0.125,
"step": 1659
},
{
"epoch": 0.7552320291173794,
"grad_norm": 1.6341803441145442,
"learning_rate": 4.724099625821842e-06,
"loss": 0.0945,
"step": 1660
},
{
"epoch": 0.7556869881710646,
"grad_norm": 1.545830512814091,
"learning_rate": 4.723773181774543e-06,
"loss": 0.0961,
"step": 1661
},
{
"epoch": 0.7561419472247498,
"grad_norm": 1.7914456776458303,
"learning_rate": 4.723446556010542e-06,
"loss": 0.1092,
"step": 1662
},
{
"epoch": 0.7565969062784349,
"grad_norm": 1.2264032188306588,
"learning_rate": 4.7231197485565275e-06,
"loss": 0.096,
"step": 1663
},
{
"epoch": 0.7570518653321201,
"grad_norm": 1.838239870158386,
"learning_rate": 4.722792759439209e-06,
"loss": 0.129,
"step": 1664
},
{
"epoch": 0.7575068243858053,
"grad_norm": 1.8429853108458891,
"learning_rate": 4.722465588685302e-06,
"loss": 0.147,
"step": 1665
},
{
"epoch": 0.7579617834394905,
"grad_norm": 1.2105825230064677,
"learning_rate": 4.722138236321545e-06,
"loss": 0.0666,
"step": 1666
},
{
"epoch": 0.7584167424931756,
"grad_norm": 1.5830454148486297,
"learning_rate": 4.721810702374687e-06,
"loss": 0.0912,
"step": 1667
},
{
"epoch": 0.7588717015468608,
"grad_norm": 1.298617622670505,
"learning_rate": 4.721482986871491e-06,
"loss": 0.0787,
"step": 1668
},
{
"epoch": 0.759326660600546,
"grad_norm": 2.2458643789106105,
"learning_rate": 4.721155089838738e-06,
"loss": 0.1031,
"step": 1669
},
{
"epoch": 0.7597816196542311,
"grad_norm": 1.414978172323641,
"learning_rate": 4.720827011303222e-06,
"loss": 0.0909,
"step": 1670
},
{
"epoch": 0.7602365787079163,
"grad_norm": 1.8584646816819383,
"learning_rate": 4.720498751291751e-06,
"loss": 0.1186,
"step": 1671
},
{
"epoch": 0.7606915377616015,
"grad_norm": 1.924659600493317,
"learning_rate": 4.72017030983115e-06,
"loss": 0.142,
"step": 1672
},
{
"epoch": 0.7611464968152867,
"grad_norm": 1.7938185155065802,
"learning_rate": 4.7198416869482575e-06,
"loss": 0.1118,
"step": 1673
},
{
"epoch": 0.7616014558689718,
"grad_norm": 2.011921232392505,
"learning_rate": 4.719512882669926e-06,
"loss": 0.1518,
"step": 1674
},
{
"epoch": 0.762056414922657,
"grad_norm": 1.2982227487003852,
"learning_rate": 4.719183897023027e-06,
"loss": 0.0693,
"step": 1675
},
{
"epoch": 0.7625113739763422,
"grad_norm": 1.9792808150985886,
"learning_rate": 4.718854730034441e-06,
"loss": 0.1061,
"step": 1676
},
{
"epoch": 0.7629663330300273,
"grad_norm": 1.4920604204792802,
"learning_rate": 4.718525381731066e-06,
"loss": 0.0911,
"step": 1677
},
{
"epoch": 0.7634212920837125,
"grad_norm": 1.8909028849207012,
"learning_rate": 4.718195852139816e-06,
"loss": 0.1248,
"step": 1678
},
{
"epoch": 0.7638762511373977,
"grad_norm": 2.241114950796753,
"learning_rate": 4.717866141287618e-06,
"loss": 0.1693,
"step": 1679
},
{
"epoch": 0.7643312101910829,
"grad_norm": 1.6580808599028765,
"learning_rate": 4.717536249201416e-06,
"loss": 0.0957,
"step": 1680
},
{
"epoch": 0.7647861692447679,
"grad_norm": 1.2734902743152507,
"learning_rate": 4.7172061759081646e-06,
"loss": 0.0793,
"step": 1681
},
{
"epoch": 0.7652411282984531,
"grad_norm": 1.5750820786177648,
"learning_rate": 4.716875921434838e-06,
"loss": 0.103,
"step": 1682
},
{
"epoch": 0.7656960873521383,
"grad_norm": 1.7945590491479675,
"learning_rate": 4.716545485808421e-06,
"loss": 0.1025,
"step": 1683
},
{
"epoch": 0.7661510464058234,
"grad_norm": 1.6260623314970664,
"learning_rate": 4.716214869055918e-06,
"loss": 0.1065,
"step": 1684
},
{
"epoch": 0.7666060054595086,
"grad_norm": 1.6283865070296875,
"learning_rate": 4.715884071204344e-06,
"loss": 0.1118,
"step": 1685
},
{
"epoch": 0.7670609645131938,
"grad_norm": 1.3932748974606075,
"learning_rate": 4.715553092280731e-06,
"loss": 0.1022,
"step": 1686
},
{
"epoch": 0.767515923566879,
"grad_norm": 1.847468209296495,
"learning_rate": 4.7152219323121246e-06,
"loss": 0.1118,
"step": 1687
},
{
"epoch": 0.7679708826205641,
"grad_norm": 2.2984795110772978,
"learning_rate": 4.714890591325586e-06,
"loss": 0.1499,
"step": 1688
},
{
"epoch": 0.7684258416742493,
"grad_norm": 1.3869963208937237,
"learning_rate": 4.714559069348189e-06,
"loss": 0.082,
"step": 1689
},
{
"epoch": 0.7688808007279345,
"grad_norm": 1.8609913938687341,
"learning_rate": 4.714227366407027e-06,
"loss": 0.1227,
"step": 1690
},
{
"epoch": 0.7693357597816196,
"grad_norm": 1.3603756636552151,
"learning_rate": 4.7138954825292035e-06,
"loss": 0.0837,
"step": 1691
},
{
"epoch": 0.7697907188353048,
"grad_norm": 1.4363757354044508,
"learning_rate": 4.71356341774184e-06,
"loss": 0.1016,
"step": 1692
},
{
"epoch": 0.77024567788899,
"grad_norm": 1.7266309692154667,
"learning_rate": 4.713231172072069e-06,
"loss": 0.1035,
"step": 1693
},
{
"epoch": 0.7707006369426752,
"grad_norm": 1.7559578771562407,
"learning_rate": 4.712898745547043e-06,
"loss": 0.1108,
"step": 1694
},
{
"epoch": 0.7711555959963603,
"grad_norm": 1.7003164181986268,
"learning_rate": 4.712566138193923e-06,
"loss": 0.1331,
"step": 1695
},
{
"epoch": 0.7716105550500455,
"grad_norm": 1.3433623006567543,
"learning_rate": 4.712233350039892e-06,
"loss": 0.0875,
"step": 1696
},
{
"epoch": 0.7720655141037307,
"grad_norm": 1.4038710385167128,
"learning_rate": 4.711900381112141e-06,
"loss": 0.0781,
"step": 1697
},
{
"epoch": 0.7725204731574158,
"grad_norm": 1.169674714312164,
"learning_rate": 4.71156723143788e-06,
"loss": 0.0811,
"step": 1698
},
{
"epoch": 0.772975432211101,
"grad_norm": 1.4291280792428351,
"learning_rate": 4.711233901044332e-06,
"loss": 0.0969,
"step": 1699
},
{
"epoch": 0.7734303912647862,
"grad_norm": 1.4493582239493352,
"learning_rate": 4.710900389958735e-06,
"loss": 0.1001,
"step": 1700
},
{
"epoch": 0.7738853503184714,
"grad_norm": 1.942694182484742,
"learning_rate": 4.710566698208343e-06,
"loss": 0.1557,
"step": 1701
},
{
"epoch": 0.7743403093721565,
"grad_norm": 1.4803492252427144,
"learning_rate": 4.710232825820424e-06,
"loss": 0.0986,
"step": 1702
},
{
"epoch": 0.7747952684258417,
"grad_norm": 0.9755297818504668,
"learning_rate": 4.709898772822258e-06,
"loss": 0.0561,
"step": 1703
},
{
"epoch": 0.7752502274795269,
"grad_norm": 1.1791809604326482,
"learning_rate": 4.709564539241145e-06,
"loss": 0.0757,
"step": 1704
},
{
"epoch": 0.775705186533212,
"grad_norm": 1.2009047371661077,
"learning_rate": 4.709230125104396e-06,
"loss": 0.0884,
"step": 1705
},
{
"epoch": 0.7761601455868972,
"grad_norm": 1.1974081374389889,
"learning_rate": 4.708895530439339e-06,
"loss": 0.065,
"step": 1706
},
{
"epoch": 0.7766151046405824,
"grad_norm": 1.2627750540057827,
"learning_rate": 4.708560755273313e-06,
"loss": 0.0583,
"step": 1707
},
{
"epoch": 0.7770700636942676,
"grad_norm": 2.358488367779091,
"learning_rate": 4.7082257996336765e-06,
"loss": 0.1537,
"step": 1708
},
{
"epoch": 0.7775250227479527,
"grad_norm": 1.294851907930423,
"learning_rate": 4.707890663547801e-06,
"loss": 0.0933,
"step": 1709
},
{
"epoch": 0.7779799818016379,
"grad_norm": 1.4515186083780571,
"learning_rate": 4.7075553470430695e-06,
"loss": 0.1129,
"step": 1710
},
{
"epoch": 0.778434940855323,
"grad_norm": 2.243497892180013,
"learning_rate": 4.707219850146885e-06,
"loss": 0.1131,
"step": 1711
},
{
"epoch": 0.7788898999090081,
"grad_norm": 1.4994790836976213,
"learning_rate": 4.706884172886662e-06,
"loss": 0.1208,
"step": 1712
},
{
"epoch": 0.7793448589626933,
"grad_norm": 1.3372579159675713,
"learning_rate": 4.706548315289831e-06,
"loss": 0.0832,
"step": 1713
},
{
"epoch": 0.7797998180163785,
"grad_norm": 1.4278251897253653,
"learning_rate": 4.706212277383836e-06,
"loss": 0.0825,
"step": 1714
},
{
"epoch": 0.7802547770700637,
"grad_norm": 1.6733465665912612,
"learning_rate": 4.705876059196136e-06,
"loss": 0.1136,
"step": 1715
},
{
"epoch": 0.7807097361237488,
"grad_norm": 1.623215614746565,
"learning_rate": 4.705539660754208e-06,
"loss": 0.1073,
"step": 1716
},
{
"epoch": 0.781164695177434,
"grad_norm": 1.3278600542007308,
"learning_rate": 4.705203082085538e-06,
"loss": 0.0784,
"step": 1717
},
{
"epoch": 0.7816196542311192,
"grad_norm": 1.9615616543104608,
"learning_rate": 4.70486632321763e-06,
"loss": 0.1246,
"step": 1718
},
{
"epoch": 0.7820746132848043,
"grad_norm": 1.6162561833650906,
"learning_rate": 4.7045293841780034e-06,
"loss": 0.1011,
"step": 1719
},
{
"epoch": 0.7825295723384895,
"grad_norm": 1.1315087094780059,
"learning_rate": 4.704192264994193e-06,
"loss": 0.0728,
"step": 1720
},
{
"epoch": 0.7829845313921747,
"grad_norm": 1.1842216997924317,
"learning_rate": 4.703854965693743e-06,
"loss": 0.0623,
"step": 1721
},
{
"epoch": 0.7834394904458599,
"grad_norm": 1.7587767128013225,
"learning_rate": 4.703517486304218e-06,
"loss": 0.1177,
"step": 1722
},
{
"epoch": 0.783894449499545,
"grad_norm": 1.339870047242403,
"learning_rate": 4.703179826853195e-06,
"loss": 0.0885,
"step": 1723
},
{
"epoch": 0.7843494085532302,
"grad_norm": 1.614840853064282,
"learning_rate": 4.702841987368265e-06,
"loss": 0.0832,
"step": 1724
},
{
"epoch": 0.7848043676069154,
"grad_norm": 1.333156859451678,
"learning_rate": 4.702503967877038e-06,
"loss": 0.0642,
"step": 1725
},
{
"epoch": 0.7852593266606005,
"grad_norm": 2.263050031311358,
"learning_rate": 4.702165768407132e-06,
"loss": 0.1535,
"step": 1726
},
{
"epoch": 0.7857142857142857,
"grad_norm": 1.6639628191313198,
"learning_rate": 4.701827388986185e-06,
"loss": 0.1104,
"step": 1727
},
{
"epoch": 0.7861692447679709,
"grad_norm": 1.5141275873838496,
"learning_rate": 4.701488829641845e-06,
"loss": 0.0812,
"step": 1728
},
{
"epoch": 0.7866242038216561,
"grad_norm": 1.3656110267192454,
"learning_rate": 4.701150090401782e-06,
"loss": 0.0882,
"step": 1729
},
{
"epoch": 0.7870791628753412,
"grad_norm": 1.2883384323068632,
"learning_rate": 4.700811171293673e-06,
"loss": 0.0874,
"step": 1730
},
{
"epoch": 0.7875341219290264,
"grad_norm": 1.3671667581027154,
"learning_rate": 4.700472072345214e-06,
"loss": 0.105,
"step": 1731
},
{
"epoch": 0.7879890809827116,
"grad_norm": 1.5338726176497217,
"learning_rate": 4.700132793584113e-06,
"loss": 0.0953,
"step": 1732
},
{
"epoch": 0.7884440400363967,
"grad_norm": 1.442558179107586,
"learning_rate": 4.699793335038098e-06,
"loss": 0.0888,
"step": 1733
},
{
"epoch": 0.7888989990900819,
"grad_norm": 1.5541388069508446,
"learning_rate": 4.699453696734905e-06,
"loss": 0.0976,
"step": 1734
},
{
"epoch": 0.7893539581437671,
"grad_norm": 1.9432135282290401,
"learning_rate": 4.699113878702288e-06,
"loss": 0.141,
"step": 1735
},
{
"epoch": 0.7898089171974523,
"grad_norm": 1.0899335114210718,
"learning_rate": 4.698773880968017e-06,
"loss": 0.0587,
"step": 1736
},
{
"epoch": 0.7902638762511374,
"grad_norm": 0.8364235505875384,
"learning_rate": 4.698433703559874e-06,
"loss": 0.0407,
"step": 1737
},
{
"epoch": 0.7907188353048226,
"grad_norm": 0.9990422189505855,
"learning_rate": 4.698093346505656e-06,
"loss": 0.0577,
"step": 1738
},
{
"epoch": 0.7911737943585078,
"grad_norm": 1.6422962944234947,
"learning_rate": 4.697752809833177e-06,
"loss": 0.1103,
"step": 1739
},
{
"epoch": 0.7916287534121929,
"grad_norm": 1.9416159087916842,
"learning_rate": 4.697412093570263e-06,
"loss": 0.1188,
"step": 1740
},
{
"epoch": 0.792083712465878,
"grad_norm": 1.2746246892088609,
"learning_rate": 4.697071197744756e-06,
"loss": 0.0764,
"step": 1741
},
{
"epoch": 0.7925386715195633,
"grad_norm": 1.377041852744143,
"learning_rate": 4.6967301223845115e-06,
"loss": 0.0716,
"step": 1742
},
{
"epoch": 0.7929936305732485,
"grad_norm": 1.3143519637208256,
"learning_rate": 4.696388867517403e-06,
"loss": 0.0962,
"step": 1743
},
{
"epoch": 0.7934485896269335,
"grad_norm": 1.3897826616270026,
"learning_rate": 4.696047433171316e-06,
"loss": 0.0653,
"step": 1744
},
{
"epoch": 0.7939035486806187,
"grad_norm": 1.570502952913152,
"learning_rate": 4.695705819374149e-06,
"loss": 0.0789,
"step": 1745
},
{
"epoch": 0.7943585077343039,
"grad_norm": 1.2543848649594032,
"learning_rate": 4.695364026153818e-06,
"loss": 0.0804,
"step": 1746
},
{
"epoch": 0.794813466787989,
"grad_norm": 1.6107183046475493,
"learning_rate": 4.695022053538253e-06,
"loss": 0.0953,
"step": 1747
},
{
"epoch": 0.7952684258416742,
"grad_norm": 1.9045899767745609,
"learning_rate": 4.694679901555398e-06,
"loss": 0.1102,
"step": 1748
},
{
"epoch": 0.7957233848953594,
"grad_norm": 5.2618436961914385,
"learning_rate": 4.694337570233213e-06,
"loss": 0.1912,
"step": 1749
},
{
"epoch": 0.7961783439490446,
"grad_norm": 1.474591164189007,
"learning_rate": 4.693995059599672e-06,
"loss": 0.0896,
"step": 1750
},
{
"epoch": 0.7966333030027297,
"grad_norm": 1.8428893564579294,
"learning_rate": 4.693652369682762e-06,
"loss": 0.1262,
"step": 1751
},
{
"epoch": 0.7970882620564149,
"grad_norm": 1.364047783874895,
"learning_rate": 4.693309500510487e-06,
"loss": 0.0655,
"step": 1752
},
{
"epoch": 0.7975432211101001,
"grad_norm": 1.410719847190726,
"learning_rate": 4.692966452110864e-06,
"loss": 0.0692,
"step": 1753
},
{
"epoch": 0.7979981801637852,
"grad_norm": 1.529856723780921,
"learning_rate": 4.6926232245119265e-06,
"loss": 0.1243,
"step": 1754
},
{
"epoch": 0.7984531392174704,
"grad_norm": 2.1726237253640917,
"learning_rate": 4.69227981774172e-06,
"loss": 0.1196,
"step": 1755
},
{
"epoch": 0.7989080982711556,
"grad_norm": 1.754980663107314,
"learning_rate": 4.691936231828308e-06,
"loss": 0.0992,
"step": 1756
},
{
"epoch": 0.7993630573248408,
"grad_norm": 1.4886622831847642,
"learning_rate": 4.691592466799766e-06,
"loss": 0.1047,
"step": 1757
},
{
"epoch": 0.7998180163785259,
"grad_norm": 1.513862751936672,
"learning_rate": 4.691248522684184e-06,
"loss": 0.098,
"step": 1758
},
{
"epoch": 0.8002729754322111,
"grad_norm": 1.2579558512837248,
"learning_rate": 4.690904399509668e-06,
"loss": 0.0682,
"step": 1759
},
{
"epoch": 0.8007279344858963,
"grad_norm": 2.2566273953343217,
"learning_rate": 4.69056009730434e-06,
"loss": 0.1303,
"step": 1760
},
{
"epoch": 0.8011828935395814,
"grad_norm": 2.00717633449877,
"learning_rate": 4.690215616096332e-06,
"loss": 0.1321,
"step": 1761
},
{
"epoch": 0.8016378525932666,
"grad_norm": 1.7602932248302383,
"learning_rate": 4.689870955913796e-06,
"loss": 0.1131,
"step": 1762
},
{
"epoch": 0.8020928116469518,
"grad_norm": 1.0775248513760711,
"learning_rate": 4.689526116784894e-06,
"loss": 0.0681,
"step": 1763
},
{
"epoch": 0.802547770700637,
"grad_norm": 1.7261882758690685,
"learning_rate": 4.689181098737805e-06,
"loss": 0.0936,
"step": 1764
},
{
"epoch": 0.8030027297543221,
"grad_norm": 0.9831258196463247,
"learning_rate": 4.6888359018007235e-06,
"loss": 0.0582,
"step": 1765
},
{
"epoch": 0.8034576888080073,
"grad_norm": 1.32544308940629,
"learning_rate": 4.6884905260018565e-06,
"loss": 0.0818,
"step": 1766
},
{
"epoch": 0.8039126478616925,
"grad_norm": 1.5291799564685358,
"learning_rate": 4.688144971369427e-06,
"loss": 0.0942,
"step": 1767
},
{
"epoch": 0.8043676069153776,
"grad_norm": 1.6829263485608386,
"learning_rate": 4.687799237931673e-06,
"loss": 0.0901,
"step": 1768
},
{
"epoch": 0.8048225659690628,
"grad_norm": 1.7708955797300434,
"learning_rate": 4.687453325716844e-06,
"loss": 0.1385,
"step": 1769
},
{
"epoch": 0.805277525022748,
"grad_norm": 1.7812931298131665,
"learning_rate": 4.687107234753208e-06,
"loss": 0.0999,
"step": 1770
},
{
"epoch": 0.8057324840764332,
"grad_norm": 1.7046411065133646,
"learning_rate": 4.686760965069046e-06,
"loss": 0.0926,
"step": 1771
},
{
"epoch": 0.8061874431301183,
"grad_norm": 1.3399041945985406,
"learning_rate": 4.686414516692653e-06,
"loss": 0.106,
"step": 1772
},
{
"epoch": 0.8066424021838035,
"grad_norm": 1.3995619121814422,
"learning_rate": 4.68606788965234e-06,
"loss": 0.0946,
"step": 1773
},
{
"epoch": 0.8070973612374887,
"grad_norm": 2.0667835204996154,
"learning_rate": 4.68572108397643e-06,
"loss": 0.1627,
"step": 1774
},
{
"epoch": 0.8075523202911737,
"grad_norm": 1.578406572399004,
"learning_rate": 4.6853740996932645e-06,
"loss": 0.1036,
"step": 1775
},
{
"epoch": 0.8080072793448589,
"grad_norm": 1.4979933750877805,
"learning_rate": 4.685026936831196e-06,
"loss": 0.0957,
"step": 1776
},
{
"epoch": 0.8084622383985441,
"grad_norm": 1.3907524265462887,
"learning_rate": 4.684679595418595e-06,
"loss": 0.0755,
"step": 1777
},
{
"epoch": 0.8089171974522293,
"grad_norm": 1.60762031761907,
"learning_rate": 4.684332075483843e-06,
"loss": 0.078,
"step": 1778
},
{
"epoch": 0.8093721565059144,
"grad_norm": 1.7415989004852326,
"learning_rate": 4.6839843770553374e-06,
"loss": 0.1354,
"step": 1779
},
{
"epoch": 0.8098271155595996,
"grad_norm": 1.2070058349523003,
"learning_rate": 4.683636500161491e-06,
"loss": 0.0801,
"step": 1780
},
{
"epoch": 0.8102820746132848,
"grad_norm": 2.223701811300553,
"learning_rate": 4.683288444830732e-06,
"loss": 0.155,
"step": 1781
},
{
"epoch": 0.8107370336669699,
"grad_norm": 1.386561374212108,
"learning_rate": 4.6829402110915015e-06,
"loss": 0.0788,
"step": 1782
},
{
"epoch": 0.8111919927206551,
"grad_norm": 1.5941557438159017,
"learning_rate": 4.682591798972253e-06,
"loss": 0.0965,
"step": 1783
},
{
"epoch": 0.8116469517743403,
"grad_norm": 1.2984908291809154,
"learning_rate": 4.682243208501461e-06,
"loss": 0.0785,
"step": 1784
},
{
"epoch": 0.8121019108280255,
"grad_norm": 1.3916802407448716,
"learning_rate": 4.681894439707609e-06,
"loss": 0.0707,
"step": 1785
},
{
"epoch": 0.8125568698817106,
"grad_norm": 1.275188321376173,
"learning_rate": 4.681545492619195e-06,
"loss": 0.0845,
"step": 1786
},
{
"epoch": 0.8130118289353958,
"grad_norm": 1.6216179048744568,
"learning_rate": 4.681196367264736e-06,
"loss": 0.1003,
"step": 1787
},
{
"epoch": 0.813466787989081,
"grad_norm": 1.6523815915742526,
"learning_rate": 4.680847063672761e-06,
"loss": 0.1067,
"step": 1788
},
{
"epoch": 0.8139217470427661,
"grad_norm": 1.2993987478913556,
"learning_rate": 4.680497581871811e-06,
"loss": 0.0935,
"step": 1789
},
{
"epoch": 0.8143767060964513,
"grad_norm": 1.9221257157278642,
"learning_rate": 4.680147921890447e-06,
"loss": 0.1051,
"step": 1790
},
{
"epoch": 0.8148316651501365,
"grad_norm": 1.6348625375140673,
"learning_rate": 4.67979808375724e-06,
"loss": 0.0903,
"step": 1791
},
{
"epoch": 0.8152866242038217,
"grad_norm": 1.591568957822019,
"learning_rate": 4.679448067500777e-06,
"loss": 0.0909,
"step": 1792
},
{
"epoch": 0.8157415832575068,
"grad_norm": 10.097792186896083,
"learning_rate": 4.67909787314966e-06,
"loss": 0.2083,
"step": 1793
},
{
"epoch": 0.816196542311192,
"grad_norm": 1.7756594176954685,
"learning_rate": 4.678747500732505e-06,
"loss": 0.12,
"step": 1794
},
{
"epoch": 0.8166515013648772,
"grad_norm": 1.292124863258569,
"learning_rate": 4.6783969502779455e-06,
"loss": 0.091,
"step": 1795
},
{
"epoch": 0.8171064604185623,
"grad_norm": 1.6688778356922547,
"learning_rate": 4.6780462218146236e-06,
"loss": 0.1165,
"step": 1796
},
{
"epoch": 0.8175614194722475,
"grad_norm": 1.7180044442845732,
"learning_rate": 4.6776953153712005e-06,
"loss": 0.136,
"step": 1797
},
{
"epoch": 0.8180163785259327,
"grad_norm": 1.480949834935634,
"learning_rate": 4.67734423097635e-06,
"loss": 0.1014,
"step": 1798
},
{
"epoch": 0.8184713375796179,
"grad_norm": 1.1428278249187394,
"learning_rate": 4.676992968658762e-06,
"loss": 0.095,
"step": 1799
},
{
"epoch": 0.818926296633303,
"grad_norm": 1.2399013375886503,
"learning_rate": 4.67664152844714e-06,
"loss": 0.0844,
"step": 1800
},
{
"epoch": 0.8193812556869882,
"grad_norm": 1.7138722388873069,
"learning_rate": 4.676289910370202e-06,
"loss": 0.0843,
"step": 1801
},
{
"epoch": 0.8198362147406734,
"grad_norm": 2.1122540273521917,
"learning_rate": 4.675938114456682e-06,
"loss": 0.1242,
"step": 1802
},
{
"epoch": 0.8202911737943585,
"grad_norm": 1.2962455851687862,
"learning_rate": 4.675586140735323e-06,
"loss": 0.0865,
"step": 1803
},
{
"epoch": 0.8207461328480437,
"grad_norm": 1.758157481346009,
"learning_rate": 4.675233989234891e-06,
"loss": 0.11,
"step": 1804
},
{
"epoch": 0.8212010919017289,
"grad_norm": 1.1367235297205203,
"learning_rate": 4.67488165998416e-06,
"loss": 0.0712,
"step": 1805
},
{
"epoch": 0.821656050955414,
"grad_norm": 1.7362201964597195,
"learning_rate": 4.674529153011922e-06,
"loss": 0.125,
"step": 1806
},
{
"epoch": 0.8221110100090991,
"grad_norm": 1.8194542425018903,
"learning_rate": 4.674176468346982e-06,
"loss": 0.1211,
"step": 1807
},
{
"epoch": 0.8225659690627843,
"grad_norm": 1.266991443456481,
"learning_rate": 4.673823606018158e-06,
"loss": 0.0728,
"step": 1808
},
{
"epoch": 0.8230209281164695,
"grad_norm": 1.5932681379034783,
"learning_rate": 4.673470566054288e-06,
"loss": 0.1112,
"step": 1809
},
{
"epoch": 0.8234758871701547,
"grad_norm": 1.6394037488609823,
"learning_rate": 4.673117348484217e-06,
"loss": 0.0904,
"step": 1810
},
{
"epoch": 0.8239308462238398,
"grad_norm": 1.6075069533857789,
"learning_rate": 4.672763953336811e-06,
"loss": 0.0851,
"step": 1811
},
{
"epoch": 0.824385805277525,
"grad_norm": 2.0887533751903953,
"learning_rate": 4.672410380640946e-06,
"loss": 0.1259,
"step": 1812
},
{
"epoch": 0.8248407643312102,
"grad_norm": 1.4047419878725926,
"learning_rate": 4.672056630425516e-06,
"loss": 0.0926,
"step": 1813
},
{
"epoch": 0.8252957233848953,
"grad_norm": 1.3090222394661126,
"learning_rate": 4.671702702719426e-06,
"loss": 0.0815,
"step": 1814
},
{
"epoch": 0.8257506824385805,
"grad_norm": 1.4844416960380937,
"learning_rate": 4.671348597551599e-06,
"loss": 0.1006,
"step": 1815
},
{
"epoch": 0.8262056414922657,
"grad_norm": 1.5208683282093747,
"learning_rate": 4.670994314950971e-06,
"loss": 0.0889,
"step": 1816
},
{
"epoch": 0.8266606005459509,
"grad_norm": 1.4736427666739214,
"learning_rate": 4.6706398549464905e-06,
"loss": 0.0733,
"step": 1817
},
{
"epoch": 0.827115559599636,
"grad_norm": 0.9170331048659165,
"learning_rate": 4.670285217567124e-06,
"loss": 0.0444,
"step": 1818
},
{
"epoch": 0.8275705186533212,
"grad_norm": 1.53493498663163,
"learning_rate": 4.6699304028418516e-06,
"loss": 0.1041,
"step": 1819
},
{
"epoch": 0.8280254777070064,
"grad_norm": 1.8019365470508273,
"learning_rate": 4.669575410799665e-06,
"loss": 0.0978,
"step": 1820
},
{
"epoch": 0.8284804367606915,
"grad_norm": 1.2273449371459113,
"learning_rate": 4.669220241469573e-06,
"loss": 0.0943,
"step": 1821
},
{
"epoch": 0.8289353958143767,
"grad_norm": 1.3479797120899357,
"learning_rate": 4.668864894880599e-06,
"loss": 0.0971,
"step": 1822
},
{
"epoch": 0.8293903548680619,
"grad_norm": 4.546793784746178,
"learning_rate": 4.668509371061781e-06,
"loss": 0.2014,
"step": 1823
},
{
"epoch": 0.8298453139217471,
"grad_norm": 2.1255643817651406,
"learning_rate": 4.668153670042171e-06,
"loss": 0.1161,
"step": 1824
},
{
"epoch": 0.8303002729754322,
"grad_norm": 1.2727939490852902,
"learning_rate": 4.667797791850833e-06,
"loss": 0.0747,
"step": 1825
},
{
"epoch": 0.8307552320291174,
"grad_norm": 1.4060666687371306,
"learning_rate": 4.6674417365168495e-06,
"loss": 0.0844,
"step": 1826
},
{
"epoch": 0.8312101910828026,
"grad_norm": 1.761485619412532,
"learning_rate": 4.667085504069315e-06,
"loss": 0.1116,
"step": 1827
},
{
"epoch": 0.8316651501364877,
"grad_norm": 1.9328273968129432,
"learning_rate": 4.66672909453734e-06,
"loss": 0.1475,
"step": 1828
},
{
"epoch": 0.8321201091901729,
"grad_norm": 8.632824223723155,
"learning_rate": 4.6663725079500485e-06,
"loss": 0.1256,
"step": 1829
},
{
"epoch": 0.8325750682438581,
"grad_norm": 1.6100163480643372,
"learning_rate": 4.666015744336578e-06,
"loss": 0.0792,
"step": 1830
},
{
"epoch": 0.8330300272975433,
"grad_norm": 1.0641733424560582,
"learning_rate": 4.665658803726083e-06,
"loss": 0.0793,
"step": 1831
},
{
"epoch": 0.8334849863512284,
"grad_norm": 1.177632875705685,
"learning_rate": 4.6653016861477315e-06,
"loss": 0.0757,
"step": 1832
},
{
"epoch": 0.8339399454049136,
"grad_norm": 1.3697702067415636,
"learning_rate": 4.664944391630704e-06,
"loss": 0.0762,
"step": 1833
},
{
"epoch": 0.8343949044585988,
"grad_norm": 1.529924651084583,
"learning_rate": 4.664586920204197e-06,
"loss": 0.0942,
"step": 1834
},
{
"epoch": 0.8348498635122839,
"grad_norm": 1.5597346422795266,
"learning_rate": 4.664229271897422e-06,
"loss": 0.0807,
"step": 1835
},
{
"epoch": 0.835304822565969,
"grad_norm": 1.1979216966579072,
"learning_rate": 4.663871446739606e-06,
"loss": 0.1023,
"step": 1836
},
{
"epoch": 0.8357597816196543,
"grad_norm": 1.9885793595161931,
"learning_rate": 4.663513444759986e-06,
"loss": 0.1081,
"step": 1837
},
{
"epoch": 0.8362147406733395,
"grad_norm": 1.803945786045615,
"learning_rate": 4.663155265987818e-06,
"loss": 0.1046,
"step": 1838
},
{
"epoch": 0.8366696997270245,
"grad_norm": 1.4426781581496406,
"learning_rate": 4.66279691045237e-06,
"loss": 0.0862,
"step": 1839
},
{
"epoch": 0.8371246587807097,
"grad_norm": 2.026712182903179,
"learning_rate": 4.662438378182927e-06,
"loss": 0.1318,
"step": 1840
},
{
"epoch": 0.8375796178343949,
"grad_norm": 1.3508804333427062,
"learning_rate": 4.662079669208783e-06,
"loss": 0.0855,
"step": 1841
},
{
"epoch": 0.83803457688808,
"grad_norm": 1.8060788779967734,
"learning_rate": 4.661720783559254e-06,
"loss": 0.118,
"step": 1842
},
{
"epoch": 0.8384895359417652,
"grad_norm": 1.3650594998874732,
"learning_rate": 4.661361721263664e-06,
"loss": 0.0792,
"step": 1843
},
{
"epoch": 0.8389444949954504,
"grad_norm": 1.5308874946054334,
"learning_rate": 4.661002482351355e-06,
"loss": 0.092,
"step": 1844
},
{
"epoch": 0.8393994540491356,
"grad_norm": 1.5787726497224914,
"learning_rate": 4.660643066851682e-06,
"loss": 0.0864,
"step": 1845
},
{
"epoch": 0.8398544131028207,
"grad_norm": 1.3050034033987155,
"learning_rate": 4.6602834747940155e-06,
"loss": 0.0882,
"step": 1846
},
{
"epoch": 0.8403093721565059,
"grad_norm": 1.3933693896920527,
"learning_rate": 4.6599237062077385e-06,
"loss": 0.082,
"step": 1847
},
{
"epoch": 0.8407643312101911,
"grad_norm": 1.4441581935162036,
"learning_rate": 4.65956376112225e-06,
"loss": 0.0845,
"step": 1848
},
{
"epoch": 0.8412192902638762,
"grad_norm": 1.0740044757490639,
"learning_rate": 4.659203639566965e-06,
"loss": 0.0579,
"step": 1849
},
{
"epoch": 0.8416742493175614,
"grad_norm": 1.4897808416015064,
"learning_rate": 4.658843341571308e-06,
"loss": 0.0928,
"step": 1850
},
{
"epoch": 0.8421292083712466,
"grad_norm": 1.2736939992740985,
"learning_rate": 4.6584828671647235e-06,
"loss": 0.0678,
"step": 1851
},
{
"epoch": 0.8425841674249318,
"grad_norm": 1.7454965393572843,
"learning_rate": 4.658122216376666e-06,
"loss": 0.1273,
"step": 1852
},
{
"epoch": 0.8430391264786169,
"grad_norm": 2.203665099645685,
"learning_rate": 4.657761389236607e-06,
"loss": 0.1483,
"step": 1853
},
{
"epoch": 0.8434940855323021,
"grad_norm": 2.2587200410334796,
"learning_rate": 4.657400385774032e-06,
"loss": 0.1392,
"step": 1854
},
{
"epoch": 0.8439490445859873,
"grad_norm": 1.4611165706322784,
"learning_rate": 4.65703920601844e-06,
"loss": 0.108,
"step": 1855
},
{
"epoch": 0.8444040036396724,
"grad_norm": 1.4687908651365826,
"learning_rate": 4.656677849999345e-06,
"loss": 0.0861,
"step": 1856
},
{
"epoch": 0.8448589626933576,
"grad_norm": 1.2067561638456004,
"learning_rate": 4.656316317746275e-06,
"loss": 0.0591,
"step": 1857
},
{
"epoch": 0.8453139217470428,
"grad_norm": 1.4053103637325914,
"learning_rate": 4.655954609288775e-06,
"loss": 0.0913,
"step": 1858
},
{
"epoch": 0.845768880800728,
"grad_norm": 1.5935104104332813,
"learning_rate": 4.655592724656399e-06,
"loss": 0.101,
"step": 1859
},
{
"epoch": 0.8462238398544131,
"grad_norm": 1.1076063342242028,
"learning_rate": 4.655230663878721e-06,
"loss": 0.0682,
"step": 1860
},
{
"epoch": 0.8466787989080983,
"grad_norm": 1.3547571746668823,
"learning_rate": 4.654868426985326e-06,
"loss": 0.0783,
"step": 1861
},
{
"epoch": 0.8471337579617835,
"grad_norm": 2.189936562068025,
"learning_rate": 4.654506014005814e-06,
"loss": 0.1489,
"step": 1862
},
{
"epoch": 0.8475887170154686,
"grad_norm": 1.8695626003234893,
"learning_rate": 4.6541434249698e-06,
"loss": 0.1257,
"step": 1863
},
{
"epoch": 0.8480436760691538,
"grad_norm": 1.0837368326622652,
"learning_rate": 4.6537806599069144e-06,
"loss": 0.0739,
"step": 1864
},
{
"epoch": 0.848498635122839,
"grad_norm": 1.7007189798619473,
"learning_rate": 4.653417718846799e-06,
"loss": 0.1008,
"step": 1865
},
{
"epoch": 0.8489535941765242,
"grad_norm": 2.0060980879747476,
"learning_rate": 4.6530546018191126e-06,
"loss": 0.1085,
"step": 1866
},
{
"epoch": 0.8494085532302093,
"grad_norm": 2.2978969469139594,
"learning_rate": 4.652691308853526e-06,
"loss": 0.0987,
"step": 1867
},
{
"epoch": 0.8498635122838945,
"grad_norm": 1.5780683821835888,
"learning_rate": 4.652327839979729e-06,
"loss": 0.0968,
"step": 1868
},
{
"epoch": 0.8503184713375797,
"grad_norm": 1.4785104975671204,
"learning_rate": 4.651964195227419e-06,
"loss": 0.0789,
"step": 1869
},
{
"epoch": 0.8507734303912647,
"grad_norm": 1.3276789113426002,
"learning_rate": 4.651600374626315e-06,
"loss": 0.1042,
"step": 1870
},
{
"epoch": 0.8512283894449499,
"grad_norm": 1.2972649195872694,
"learning_rate": 4.651236378206144e-06,
"loss": 0.0856,
"step": 1871
},
{
"epoch": 0.8516833484986351,
"grad_norm": 1.7278397992212022,
"learning_rate": 4.650872205996651e-06,
"loss": 0.1148,
"step": 1872
},
{
"epoch": 0.8521383075523203,
"grad_norm": 1.8216586488949666,
"learning_rate": 4.650507858027595e-06,
"loss": 0.1079,
"step": 1873
},
{
"epoch": 0.8525932666060054,
"grad_norm": 1.4139889414338878,
"learning_rate": 4.6501433343287475e-06,
"loss": 0.0875,
"step": 1874
},
{
"epoch": 0.8530482256596906,
"grad_norm": 1.7248584454041247,
"learning_rate": 4.6497786349298975e-06,
"loss": 0.0971,
"step": 1875
},
{
"epoch": 0.8535031847133758,
"grad_norm": 1.4315172966956178,
"learning_rate": 4.649413759860846e-06,
"loss": 0.0786,
"step": 1876
},
{
"epoch": 0.8539581437670609,
"grad_norm": 1.753091182567414,
"learning_rate": 4.649048709151408e-06,
"loss": 0.1209,
"step": 1877
},
{
"epoch": 0.8544131028207461,
"grad_norm": 1.1669060948418768,
"learning_rate": 4.648683482831415e-06,
"loss": 0.0806,
"step": 1878
},
{
"epoch": 0.8548680618744313,
"grad_norm": 1.9265886537142733,
"learning_rate": 4.648318080930711e-06,
"loss": 0.1334,
"step": 1879
},
{
"epoch": 0.8553230209281165,
"grad_norm": 4.314311448137681,
"learning_rate": 4.647952503479154e-06,
"loss": 0.178,
"step": 1880
},
{
"epoch": 0.8557779799818016,
"grad_norm": 1.7124103845535494,
"learning_rate": 4.6475867505066195e-06,
"loss": 0.1141,
"step": 1881
},
{
"epoch": 0.8562329390354868,
"grad_norm": 1.7108302555198733,
"learning_rate": 4.647220822042995e-06,
"loss": 0.123,
"step": 1882
},
{
"epoch": 0.856687898089172,
"grad_norm": 1.5372453395034074,
"learning_rate": 4.64685471811818e-06,
"loss": 0.0801,
"step": 1883
},
{
"epoch": 0.8571428571428571,
"grad_norm": 1.5126705629909598,
"learning_rate": 4.646488438762094e-06,
"loss": 0.1193,
"step": 1884
},
{
"epoch": 0.8575978161965423,
"grad_norm": 1.9269079385312733,
"learning_rate": 4.646121984004666e-06,
"loss": 0.1244,
"step": 1885
},
{
"epoch": 0.8580527752502275,
"grad_norm": 2.2684708844494144,
"learning_rate": 4.64575535387584e-06,
"loss": 0.1279,
"step": 1886
},
{
"epoch": 0.8585077343039127,
"grad_norm": 1.670087782048151,
"learning_rate": 4.645388548405578e-06,
"loss": 0.1023,
"step": 1887
},
{
"epoch": 0.8589626933575978,
"grad_norm": 1.8607124006440674,
"learning_rate": 4.645021567623852e-06,
"loss": 0.1094,
"step": 1888
},
{
"epoch": 0.859417652411283,
"grad_norm": 1.5175623385645085,
"learning_rate": 4.644654411560651e-06,
"loss": 0.0996,
"step": 1889
},
{
"epoch": 0.8598726114649682,
"grad_norm": 1.6936906161685268,
"learning_rate": 4.644287080245975e-06,
"loss": 0.0967,
"step": 1890
},
{
"epoch": 0.8603275705186533,
"grad_norm": 1.702801567909897,
"learning_rate": 4.643919573709843e-06,
"loss": 0.1202,
"step": 1891
},
{
"epoch": 0.8607825295723385,
"grad_norm": 1.8484706111091351,
"learning_rate": 4.6435518919822854e-06,
"loss": 0.1189,
"step": 1892
},
{
"epoch": 0.8612374886260237,
"grad_norm": 1.3074850022658593,
"learning_rate": 4.643184035093348e-06,
"loss": 0.0765,
"step": 1893
},
{
"epoch": 0.8616924476797089,
"grad_norm": 1.5927282725036787,
"learning_rate": 4.642816003073089e-06,
"loss": 0.0905,
"step": 1894
},
{
"epoch": 0.862147406733394,
"grad_norm": 1.2778897141301788,
"learning_rate": 4.6424477959515836e-06,
"loss": 0.0949,
"step": 1895
},
{
"epoch": 0.8626023657870792,
"grad_norm": 1.4290175115369155,
"learning_rate": 4.642079413758919e-06,
"loss": 0.0913,
"step": 1896
},
{
"epoch": 0.8630573248407644,
"grad_norm": 1.4520897103313697,
"learning_rate": 4.641710856525199e-06,
"loss": 0.0896,
"step": 1897
},
{
"epoch": 0.8635122838944495,
"grad_norm": 1.6787474017663324,
"learning_rate": 4.641342124280539e-06,
"loss": 0.1299,
"step": 1898
},
{
"epoch": 0.8639672429481347,
"grad_norm": 1.1624658106387618,
"learning_rate": 4.6409732170550705e-06,
"loss": 0.0734,
"step": 1899
},
{
"epoch": 0.8644222020018199,
"grad_norm": 1.4729283765471304,
"learning_rate": 4.64060413487894e-06,
"loss": 0.1152,
"step": 1900
},
{
"epoch": 0.864877161055505,
"grad_norm": 1.2495961485560472,
"learning_rate": 4.640234877782306e-06,
"loss": 0.07,
"step": 1901
},
{
"epoch": 0.8653321201091901,
"grad_norm": 1.3795532525061756,
"learning_rate": 4.639865445795344e-06,
"loss": 0.0664,
"step": 1902
},
{
"epoch": 0.8657870791628753,
"grad_norm": 1.1223729388488364,
"learning_rate": 4.63949583894824e-06,
"loss": 0.0746,
"step": 1903
},
{
"epoch": 0.8662420382165605,
"grad_norm": 1.8028240933781334,
"learning_rate": 4.639126057271199e-06,
"loss": 0.1168,
"step": 1904
},
{
"epoch": 0.8666969972702456,
"grad_norm": 2.2543814411508585,
"learning_rate": 4.6387561007944355e-06,
"loss": 0.1905,
"step": 1905
},
{
"epoch": 0.8671519563239308,
"grad_norm": 1.6271318494510778,
"learning_rate": 4.638385969548183e-06,
"loss": 0.1309,
"step": 1906
},
{
"epoch": 0.867606915377616,
"grad_norm": 1.2925438472778907,
"learning_rate": 4.638015663562686e-06,
"loss": 0.0935,
"step": 1907
},
{
"epoch": 0.8680618744313012,
"grad_norm": 1.5349623239831232,
"learning_rate": 4.637645182868204e-06,
"loss": 0.0955,
"step": 1908
},
{
"epoch": 0.8685168334849863,
"grad_norm": 1.1090852503032294,
"learning_rate": 4.637274527495011e-06,
"loss": 0.0627,
"step": 1909
},
{
"epoch": 0.8689717925386715,
"grad_norm": 1.3576365082507504,
"learning_rate": 4.6369036974733955e-06,
"loss": 0.0892,
"step": 1910
},
{
"epoch": 0.8694267515923567,
"grad_norm": 1.3013579320639557,
"learning_rate": 4.63653269283366e-06,
"loss": 0.0872,
"step": 1911
},
{
"epoch": 0.8698817106460418,
"grad_norm": 1.3423951865701553,
"learning_rate": 4.636161513606122e-06,
"loss": 0.1124,
"step": 1912
},
{
"epoch": 0.870336669699727,
"grad_norm": 1.5125877550581834,
"learning_rate": 4.6357901598211105e-06,
"loss": 0.1136,
"step": 1913
},
{
"epoch": 0.8707916287534122,
"grad_norm": 1.3908271970593282,
"learning_rate": 4.635418631508974e-06,
"loss": 0.0879,
"step": 1914
},
{
"epoch": 0.8712465878070974,
"grad_norm": 1.269444632546315,
"learning_rate": 4.635046928700069e-06,
"loss": 0.0963,
"step": 1915
},
{
"epoch": 0.8717015468607825,
"grad_norm": 1.5552630147374251,
"learning_rate": 4.634675051424771e-06,
"loss": 0.0938,
"step": 1916
},
{
"epoch": 0.8721565059144677,
"grad_norm": 1.2657711758454817,
"learning_rate": 4.634302999713468e-06,
"loss": 0.0772,
"step": 1917
},
{
"epoch": 0.8726114649681529,
"grad_norm": 2.5181910772046394,
"learning_rate": 4.633930773596563e-06,
"loss": 0.1552,
"step": 1918
},
{
"epoch": 0.873066424021838,
"grad_norm": 1.850598086876328,
"learning_rate": 4.633558373104472e-06,
"loss": 0.1332,
"step": 1919
},
{
"epoch": 0.8735213830755232,
"grad_norm": 1.3941232502211163,
"learning_rate": 4.633185798267625e-06,
"loss": 0.1069,
"step": 1920
},
{
"epoch": 0.8739763421292084,
"grad_norm": 1.7644105148701474,
"learning_rate": 4.632813049116467e-06,
"loss": 0.112,
"step": 1921
},
{
"epoch": 0.8744313011828936,
"grad_norm": 1.6230790274291094,
"learning_rate": 4.63244012568146e-06,
"loss": 0.0926,
"step": 1922
},
{
"epoch": 0.8748862602365787,
"grad_norm": 1.8353813347257433,
"learning_rate": 4.632067027993076e-06,
"loss": 0.1008,
"step": 1923
},
{
"epoch": 0.8753412192902639,
"grad_norm": 1.6963512631134225,
"learning_rate": 4.631693756081802e-06,
"loss": 0.1085,
"step": 1924
},
{
"epoch": 0.8757961783439491,
"grad_norm": 1.3821835844572639,
"learning_rate": 4.631320309978141e-06,
"loss": 0.1048,
"step": 1925
},
{
"epoch": 0.8762511373976342,
"grad_norm": 1.8006435232211586,
"learning_rate": 4.630946689712609e-06,
"loss": 0.1161,
"step": 1926
},
{
"epoch": 0.8767060964513194,
"grad_norm": 1.4677928424313746,
"learning_rate": 4.630572895315737e-06,
"loss": 0.0869,
"step": 1927
},
{
"epoch": 0.8771610555050046,
"grad_norm": 1.0039983293317123,
"learning_rate": 4.63019892681807e-06,
"loss": 0.0677,
"step": 1928
},
{
"epoch": 0.8776160145586898,
"grad_norm": 1.0380886358048869,
"learning_rate": 4.629824784250166e-06,
"loss": 0.0718,
"step": 1929
},
{
"epoch": 0.8780709736123748,
"grad_norm": 1.2258052916757236,
"learning_rate": 4.629450467642599e-06,
"loss": 0.0766,
"step": 1930
},
{
"epoch": 0.87852593266606,
"grad_norm": 1.2915914597688039,
"learning_rate": 4.629075977025957e-06,
"loss": 0.0936,
"step": 1931
},
{
"epoch": 0.8789808917197452,
"grad_norm": 1.5105085754665042,
"learning_rate": 4.62870131243084e-06,
"loss": 0.1132,
"step": 1932
},
{
"epoch": 0.8794358507734303,
"grad_norm": 1.1778695514626771,
"learning_rate": 4.628326473887865e-06,
"loss": 0.0628,
"step": 1933
},
{
"epoch": 0.8798908098271155,
"grad_norm": 1.3631999608539114,
"learning_rate": 4.627951461427663e-06,
"loss": 0.0732,
"step": 1934
},
{
"epoch": 0.8803457688808007,
"grad_norm": 1.2567766444164166,
"learning_rate": 4.627576275080876e-06,
"loss": 0.0753,
"step": 1935
},
{
"epoch": 0.8808007279344859,
"grad_norm": 1.3480297690817413,
"learning_rate": 4.627200914878165e-06,
"loss": 0.0896,
"step": 1936
},
{
"epoch": 0.881255686988171,
"grad_norm": 1.7364720181858189,
"learning_rate": 4.6268253808502005e-06,
"loss": 0.1262,
"step": 1937
},
{
"epoch": 0.8817106460418562,
"grad_norm": 1.0626111071231377,
"learning_rate": 4.626449673027671e-06,
"loss": 0.0606,
"step": 1938
},
{
"epoch": 0.8821656050955414,
"grad_norm": 1.3200320755167188,
"learning_rate": 4.626073791441278e-06,
"loss": 0.0863,
"step": 1939
},
{
"epoch": 0.8826205641492265,
"grad_norm": 1.6285352743033554,
"learning_rate": 4.625697736121735e-06,
"loss": 0.1074,
"step": 1940
},
{
"epoch": 0.8830755232029117,
"grad_norm": 1.9619338739386996,
"learning_rate": 4.6253215070997735e-06,
"loss": 0.144,
"step": 1941
},
{
"epoch": 0.8835304822565969,
"grad_norm": 1.3282837474813922,
"learning_rate": 4.624945104406135e-06,
"loss": 0.0848,
"step": 1942
},
{
"epoch": 0.8839854413102821,
"grad_norm": 1.445768351624459,
"learning_rate": 4.624568528071579e-06,
"loss": 0.0781,
"step": 1943
},
{
"epoch": 0.8844404003639672,
"grad_norm": 1.4862896876471505,
"learning_rate": 4.624191778126879e-06,
"loss": 0.1004,
"step": 1944
},
{
"epoch": 0.8848953594176524,
"grad_norm": 1.764835736083021,
"learning_rate": 4.623814854602818e-06,
"loss": 0.126,
"step": 1945
},
{
"epoch": 0.8853503184713376,
"grad_norm": 1.5473555941944201,
"learning_rate": 4.623437757530198e-06,
"loss": 0.102,
"step": 1946
},
{
"epoch": 0.8858052775250227,
"grad_norm": 1.208960470604565,
"learning_rate": 4.623060486939835e-06,
"loss": 0.0877,
"step": 1947
},
{
"epoch": 0.8862602365787079,
"grad_norm": 2.075304609707155,
"learning_rate": 4.622683042862556e-06,
"loss": 0.1296,
"step": 1948
},
{
"epoch": 0.8867151956323931,
"grad_norm": 1.3616698341531555,
"learning_rate": 4.622305425329205e-06,
"loss": 0.0809,
"step": 1949
},
{
"epoch": 0.8871701546860783,
"grad_norm": 1.037693244784599,
"learning_rate": 4.621927634370638e-06,
"loss": 0.069,
"step": 1950
},
{
"epoch": 0.8876251137397634,
"grad_norm": 1.385175285733036,
"learning_rate": 4.621549670017727e-06,
"loss": 0.0852,
"step": 1951
},
{
"epoch": 0.8880800727934486,
"grad_norm": 1.511337977341827,
"learning_rate": 4.6211715323013595e-06,
"loss": 0.0951,
"step": 1952
},
{
"epoch": 0.8885350318471338,
"grad_norm": 1.3257350384392963,
"learning_rate": 4.6207932212524325e-06,
"loss": 0.1089,
"step": 1953
},
{
"epoch": 0.8889899909008189,
"grad_norm": 1.644272679286999,
"learning_rate": 4.620414736901861e-06,
"loss": 0.1032,
"step": 1954
},
{
"epoch": 0.8894449499545041,
"grad_norm": 1.467980610687172,
"learning_rate": 4.620036079280573e-06,
"loss": 0.1087,
"step": 1955
},
{
"epoch": 0.8898999090081893,
"grad_norm": 1.7890199290097137,
"learning_rate": 4.619657248419511e-06,
"loss": 0.1292,
"step": 1956
},
{
"epoch": 0.8903548680618745,
"grad_norm": 1.2684892744943759,
"learning_rate": 4.61927824434963e-06,
"loss": 0.084,
"step": 1957
},
{
"epoch": 0.8908098271155596,
"grad_norm": 1.5369979197668246,
"learning_rate": 4.6188990671019015e-06,
"loss": 0.1069,
"step": 1958
},
{
"epoch": 0.8912647861692448,
"grad_norm": 1.4903088456543865,
"learning_rate": 4.618519716707311e-06,
"loss": 0.1046,
"step": 1959
},
{
"epoch": 0.89171974522293,
"grad_norm": 1.4234136171740401,
"learning_rate": 4.618140193196856e-06,
"loss": 0.0954,
"step": 1960
},
{
"epoch": 0.892174704276615,
"grad_norm": 1.4256868330149868,
"learning_rate": 4.61776049660155e-06,
"loss": 0.1061,
"step": 1961
},
{
"epoch": 0.8926296633303002,
"grad_norm": 1.0896312490692992,
"learning_rate": 4.61738062695242e-06,
"loss": 0.0685,
"step": 1962
},
{
"epoch": 0.8930846223839854,
"grad_norm": 1.6230164914703122,
"learning_rate": 4.617000584280506e-06,
"loss": 0.078,
"step": 1963
},
{
"epoch": 0.8935395814376706,
"grad_norm": 1.532700386976957,
"learning_rate": 4.616620368616866e-06,
"loss": 0.1004,
"step": 1964
},
{
"epoch": 0.8939945404913557,
"grad_norm": 2.1875161300164088,
"learning_rate": 4.616239979992568e-06,
"loss": 0.1662,
"step": 1965
},
{
"epoch": 0.8944494995450409,
"grad_norm": 1.5587825874123524,
"learning_rate": 4.615859418438695e-06,
"loss": 0.0949,
"step": 1966
},
{
"epoch": 0.8949044585987261,
"grad_norm": 1.0305034082384465,
"learning_rate": 4.615478683986345e-06,
"loss": 0.0661,
"step": 1967
},
{
"epoch": 0.8953594176524113,
"grad_norm": 2.128047847661886,
"learning_rate": 4.6150977766666315e-06,
"loss": 0.1668,
"step": 1968
},
{
"epoch": 0.8958143767060964,
"grad_norm": 3.489557977843618,
"learning_rate": 4.614716696510679e-06,
"loss": 0.1712,
"step": 1969
},
{
"epoch": 0.8962693357597816,
"grad_norm": 1.7769003003393216,
"learning_rate": 4.614335443549628e-06,
"loss": 0.1045,
"step": 1970
},
{
"epoch": 0.8967242948134668,
"grad_norm": 1.1115237845484403,
"learning_rate": 4.613954017814633e-06,
"loss": 0.0622,
"step": 1971
},
{
"epoch": 0.8971792538671519,
"grad_norm": 1.5013208339193664,
"learning_rate": 4.613572419336862e-06,
"loss": 0.0901,
"step": 1972
},
{
"epoch": 0.8976342129208371,
"grad_norm": 1.8914963811365617,
"learning_rate": 4.613190648147497e-06,
"loss": 0.122,
"step": 1973
},
{
"epoch": 0.8980891719745223,
"grad_norm": 1.5463380525844979,
"learning_rate": 4.612808704277736e-06,
"loss": 0.1085,
"step": 1974
},
{
"epoch": 0.8985441310282075,
"grad_norm": 2.025461784041365,
"learning_rate": 4.612426587758789e-06,
"loss": 0.1584,
"step": 1975
},
{
"epoch": 0.8989990900818926,
"grad_norm": 1.3179625167865827,
"learning_rate": 4.612044298621881e-06,
"loss": 0.0899,
"step": 1976
},
{
"epoch": 0.8994540491355778,
"grad_norm": 1.9415947301083212,
"learning_rate": 4.611661836898252e-06,
"loss": 0.1089,
"step": 1977
},
{
"epoch": 0.899909008189263,
"grad_norm": 1.7954614950390768,
"learning_rate": 4.611279202619151e-06,
"loss": 0.1164,
"step": 1978
},
{
"epoch": 0.9003639672429481,
"grad_norm": 1.6871453686213593,
"learning_rate": 4.61089639581585e-06,
"loss": 0.1137,
"step": 1979
},
{
"epoch": 0.9008189262966333,
"grad_norm": 1.7734698119689802,
"learning_rate": 4.610513416519628e-06,
"loss": 0.1057,
"step": 1980
},
{
"epoch": 0.9012738853503185,
"grad_norm": 1.4584208401217962,
"learning_rate": 4.6101302647617806e-06,
"loss": 0.1015,
"step": 1981
},
{
"epoch": 0.9017288444040037,
"grad_norm": 1.6070445694078421,
"learning_rate": 4.609746940573617e-06,
"loss": 0.0938,
"step": 1982
},
{
"epoch": 0.9021838034576888,
"grad_norm": 1.7455436886266977,
"learning_rate": 4.609363443986461e-06,
"loss": 0.1054,
"step": 1983
},
{
"epoch": 0.902638762511374,
"grad_norm": 1.539177132214102,
"learning_rate": 4.60897977503165e-06,
"loss": 0.1065,
"step": 1984
},
{
"epoch": 0.9030937215650592,
"grad_norm": 2.1232448783228466,
"learning_rate": 4.608595933740536e-06,
"loss": 0.1509,
"step": 1985
},
{
"epoch": 0.9035486806187443,
"grad_norm": 1.8306612455581022,
"learning_rate": 4.608211920144485e-06,
"loss": 0.1507,
"step": 1986
},
{
"epoch": 0.9040036396724295,
"grad_norm": 1.8906981305909403,
"learning_rate": 4.607827734274876e-06,
"loss": 0.1279,
"step": 1987
},
{
"epoch": 0.9044585987261147,
"grad_norm": 1.2043844526137453,
"learning_rate": 4.607443376163104e-06,
"loss": 0.0587,
"step": 1988
},
{
"epoch": 0.9049135577797999,
"grad_norm": 1.5805357354670655,
"learning_rate": 4.607058845840576e-06,
"loss": 0.0901,
"step": 1989
},
{
"epoch": 0.905368516833485,
"grad_norm": 1.0726472977315278,
"learning_rate": 4.606674143338714e-06,
"loss": 0.0632,
"step": 1990
},
{
"epoch": 0.9058234758871702,
"grad_norm": 1.2973447399415932,
"learning_rate": 4.606289268688955e-06,
"loss": 0.0695,
"step": 1991
},
{
"epoch": 0.9062784349408554,
"grad_norm": 1.5444078623124102,
"learning_rate": 4.605904221922749e-06,
"loss": 0.097,
"step": 1992
},
{
"epoch": 0.9067333939945404,
"grad_norm": 1.2266104902516182,
"learning_rate": 4.6055190030715605e-06,
"loss": 0.0813,
"step": 1993
},
{
"epoch": 0.9071883530482256,
"grad_norm": 1.367812480965606,
"learning_rate": 4.605133612166868e-06,
"loss": 0.0849,
"step": 1994
},
{
"epoch": 0.9076433121019108,
"grad_norm": 1.477324926137721,
"learning_rate": 4.604748049240162e-06,
"loss": 0.1038,
"step": 1995
},
{
"epoch": 0.908098271155596,
"grad_norm": 1.451806186182134,
"learning_rate": 4.604362314322951e-06,
"loss": 0.0968,
"step": 1996
},
{
"epoch": 0.9085532302092811,
"grad_norm": 1.3200703707312873,
"learning_rate": 4.603976407446756e-06,
"loss": 0.0824,
"step": 1997
},
{
"epoch": 0.9090081892629663,
"grad_norm": 1.4302611512368968,
"learning_rate": 4.603590328643108e-06,
"loss": 0.0697,
"step": 1998
},
{
"epoch": 0.9094631483166515,
"grad_norm": 1.3896738683560679,
"learning_rate": 4.60320407794356e-06,
"loss": 0.0845,
"step": 1999
},
{
"epoch": 0.9099181073703366,
"grad_norm": 1.2636123499087144,
"learning_rate": 4.602817655379672e-06,
"loss": 0.0956,
"step": 2000
},
{
"epoch": 0.9103730664240218,
"grad_norm": 1.6260464547343871,
"learning_rate": 4.602431060983022e-06,
"loss": 0.0964,
"step": 2001
},
{
"epoch": 0.910828025477707,
"grad_norm": 1.6435883240849032,
"learning_rate": 4.6020442947852e-06,
"loss": 0.1204,
"step": 2002
},
{
"epoch": 0.9112829845313922,
"grad_norm": 1.1627579803449954,
"learning_rate": 4.6016573568178105e-06,
"loss": 0.0658,
"step": 2003
},
{
"epoch": 0.9117379435850773,
"grad_norm": 1.4830323644470826,
"learning_rate": 4.601270247112473e-06,
"loss": 0.1393,
"step": 2004
},
{
"epoch": 0.9121929026387625,
"grad_norm": 1.749363317911088,
"learning_rate": 4.60088296570082e-06,
"loss": 0.0958,
"step": 2005
},
{
"epoch": 0.9126478616924477,
"grad_norm": 1.0989290307772166,
"learning_rate": 4.600495512614499e-06,
"loss": 0.0725,
"step": 2006
},
{
"epoch": 0.9131028207461328,
"grad_norm": 1.9104698820808548,
"learning_rate": 4.60010788788517e-06,
"loss": 0.1416,
"step": 2007
},
{
"epoch": 0.913557779799818,
"grad_norm": 1.5990058826046467,
"learning_rate": 4.5997200915445095e-06,
"loss": 0.0988,
"step": 2008
},
{
"epoch": 0.9140127388535032,
"grad_norm": 1.3776065339397323,
"learning_rate": 4.599332123624204e-06,
"loss": 0.0954,
"step": 2009
},
{
"epoch": 0.9144676979071884,
"grad_norm": 1.5565907276733302,
"learning_rate": 4.598943984155959e-06,
"loss": 0.1119,
"step": 2010
},
{
"epoch": 0.9149226569608735,
"grad_norm": 1.3912866638954073,
"learning_rate": 4.598555673171489e-06,
"loss": 0.0786,
"step": 2011
},
{
"epoch": 0.9153776160145587,
"grad_norm": 1.205108886240087,
"learning_rate": 4.5981671907025275e-06,
"loss": 0.0718,
"step": 2012
},
{
"epoch": 0.9158325750682439,
"grad_norm": 1.5559249472503263,
"learning_rate": 4.597778536780818e-06,
"loss": 0.0844,
"step": 2013
},
{
"epoch": 0.916287534121929,
"grad_norm": 1.5416078099963733,
"learning_rate": 4.597389711438121e-06,
"loss": 0.0985,
"step": 2014
},
{
"epoch": 0.9167424931756142,
"grad_norm": 1.1809384394020321,
"learning_rate": 4.597000714706207e-06,
"loss": 0.0586,
"step": 2015
},
{
"epoch": 0.9171974522292994,
"grad_norm": 2.074795994465729,
"learning_rate": 4.596611546616865e-06,
"loss": 0.1433,
"step": 2016
},
{
"epoch": 0.9176524112829846,
"grad_norm": 1.613757709909132,
"learning_rate": 4.596222207201896e-06,
"loss": 0.0977,
"step": 2017
},
{
"epoch": 0.9181073703366697,
"grad_norm": 2.252514307571058,
"learning_rate": 4.595832696493115e-06,
"loss": 0.163,
"step": 2018
},
{
"epoch": 0.9185623293903549,
"grad_norm": 1.3450701602494668,
"learning_rate": 4.59544301452235e-06,
"loss": 0.0838,
"step": 2019
},
{
"epoch": 0.9190172884440401,
"grad_norm": 1.8222310782671887,
"learning_rate": 4.595053161321444e-06,
"loss": 0.0969,
"step": 2020
},
{
"epoch": 0.9194722474977252,
"grad_norm": 1.355290969082531,
"learning_rate": 4.594663136922256e-06,
"loss": 0.0864,
"step": 2021
},
{
"epoch": 0.9199272065514104,
"grad_norm": 1.2303249878491525,
"learning_rate": 4.594272941356655e-06,
"loss": 0.0766,
"step": 2022
},
{
"epoch": 0.9203821656050956,
"grad_norm": 1.8690439214006958,
"learning_rate": 4.593882574656528e-06,
"loss": 0.123,
"step": 2023
},
{
"epoch": 0.9208371246587808,
"grad_norm": 1.602742311595863,
"learning_rate": 4.5934920368537724e-06,
"loss": 0.0911,
"step": 2024
},
{
"epoch": 0.9212920837124658,
"grad_norm": 1.4118302094020563,
"learning_rate": 4.593101327980301e-06,
"loss": 0.0983,
"step": 2025
},
{
"epoch": 0.921747042766151,
"grad_norm": 1.4886683990661254,
"learning_rate": 4.592710448068043e-06,
"loss": 0.1092,
"step": 2026
},
{
"epoch": 0.9222020018198362,
"grad_norm": 1.3005237265557243,
"learning_rate": 4.592319397148936e-06,
"loss": 0.0734,
"step": 2027
},
{
"epoch": 0.9226569608735213,
"grad_norm": 1.5293431637156654,
"learning_rate": 4.5919281752549386e-06,
"loss": 0.0939,
"step": 2028
},
{
"epoch": 0.9231119199272065,
"grad_norm": 1.9189898686105098,
"learning_rate": 4.5915367824180165e-06,
"loss": 0.1316,
"step": 2029
},
{
"epoch": 0.9235668789808917,
"grad_norm": 1.0704164238908853,
"learning_rate": 4.591145218670154e-06,
"loss": 0.0643,
"step": 2030
},
{
"epoch": 0.9240218380345769,
"grad_norm": 1.7428021756867538,
"learning_rate": 4.590753484043348e-06,
"loss": 0.1002,
"step": 2031
},
{
"epoch": 0.924476797088262,
"grad_norm": 2.077744811906698,
"learning_rate": 4.590361578569609e-06,
"loss": 0.1469,
"step": 2032
},
{
"epoch": 0.9249317561419472,
"grad_norm": 1.4526612982168714,
"learning_rate": 4.589969502280962e-06,
"loss": 0.0794,
"step": 2033
},
{
"epoch": 0.9253867151956324,
"grad_norm": 1.127350657053563,
"learning_rate": 4.589577255209445e-06,
"loss": 0.0825,
"step": 2034
},
{
"epoch": 0.9258416742493175,
"grad_norm": 1.5418218878324004,
"learning_rate": 4.589184837387112e-06,
"loss": 0.105,
"step": 2035
},
{
"epoch": 0.9262966333030027,
"grad_norm": 1.3295279516903347,
"learning_rate": 4.588792248846028e-06,
"loss": 0.0806,
"step": 2036
},
{
"epoch": 0.9267515923566879,
"grad_norm": 2.133194300217007,
"learning_rate": 4.588399489618274e-06,
"loss": 0.1101,
"step": 2037
},
{
"epoch": 0.9272065514103731,
"grad_norm": 1.127900412980361,
"learning_rate": 4.588006559735945e-06,
"loss": 0.0822,
"step": 2038
},
{
"epoch": 0.9276615104640582,
"grad_norm": 1.352574531632525,
"learning_rate": 4.587613459231149e-06,
"loss": 0.0883,
"step": 2039
},
{
"epoch": 0.9281164695177434,
"grad_norm": 1.563891551285797,
"learning_rate": 4.5872201881360105e-06,
"loss": 0.108,
"step": 2040
},
{
"epoch": 0.9285714285714286,
"grad_norm": 1.4992265474183826,
"learning_rate": 4.586826746482662e-06,
"loss": 0.1117,
"step": 2041
},
{
"epoch": 0.9290263876251137,
"grad_norm": 1.2789604094060583,
"learning_rate": 4.586433134303257e-06,
"loss": 0.0848,
"step": 2042
},
{
"epoch": 0.9294813466787989,
"grad_norm": 1.5307618549025637,
"learning_rate": 4.586039351629959e-06,
"loss": 0.1006,
"step": 2043
},
{
"epoch": 0.9299363057324841,
"grad_norm": 1.0708631866469716,
"learning_rate": 4.585645398494944e-06,
"loss": 0.0654,
"step": 2044
},
{
"epoch": 0.9303912647861693,
"grad_norm": 1.3019370361509508,
"learning_rate": 4.585251274930406e-06,
"loss": 0.0953,
"step": 2045
},
{
"epoch": 0.9308462238398544,
"grad_norm": 1.9864224586066603,
"learning_rate": 4.584856980968552e-06,
"loss": 0.1184,
"step": 2046
},
{
"epoch": 0.9313011828935396,
"grad_norm": 1.405575544197549,
"learning_rate": 4.584462516641599e-06,
"loss": 0.0941,
"step": 2047
},
{
"epoch": 0.9317561419472248,
"grad_norm": 1.4689077044508474,
"learning_rate": 4.584067881981784e-06,
"loss": 0.0895,
"step": 2048
},
{
"epoch": 0.9322111010009099,
"grad_norm": 1.135702737688899,
"learning_rate": 4.583673077021352e-06,
"loss": 0.0632,
"step": 2049
},
{
"epoch": 0.9326660600545951,
"grad_norm": 1.4668776418008769,
"learning_rate": 4.583278101792567e-06,
"loss": 0.1013,
"step": 2050
},
{
"epoch": 0.9331210191082803,
"grad_norm": 1.4404528025951508,
"learning_rate": 4.582882956327704e-06,
"loss": 0.0827,
"step": 2051
},
{
"epoch": 0.9335759781619655,
"grad_norm": 1.8675992636120633,
"learning_rate": 4.58248764065905e-06,
"loss": 0.1034,
"step": 2052
},
{
"epoch": 0.9340309372156506,
"grad_norm": 1.3878361039518945,
"learning_rate": 4.582092154818912e-06,
"loss": 0.099,
"step": 2053
},
{
"epoch": 0.9344858962693358,
"grad_norm": 1.562944151678329,
"learning_rate": 4.581696498839605e-06,
"loss": 0.1111,
"step": 2054
},
{
"epoch": 0.934940855323021,
"grad_norm": 1.3973553853526035,
"learning_rate": 4.581300672753462e-06,
"loss": 0.0983,
"step": 2055
},
{
"epoch": 0.935395814376706,
"grad_norm": 1.2166463674221564,
"learning_rate": 4.580904676592826e-06,
"loss": 0.066,
"step": 2056
},
{
"epoch": 0.9358507734303912,
"grad_norm": 1.2616231646039247,
"learning_rate": 4.580508510390057e-06,
"loss": 0.064,
"step": 2057
},
{
"epoch": 0.9363057324840764,
"grad_norm": 1.9200531356431896,
"learning_rate": 4.580112174177529e-06,
"loss": 0.1314,
"step": 2058
},
{
"epoch": 0.9367606915377616,
"grad_norm": 1.3598186993146166,
"learning_rate": 4.5797156679876274e-06,
"loss": 0.081,
"step": 2059
},
{
"epoch": 0.9372156505914467,
"grad_norm": 1.4701853511040646,
"learning_rate": 4.5793189918527524e-06,
"loss": 0.1156,
"step": 2060
},
{
"epoch": 0.9376706096451319,
"grad_norm": 1.4188407331643451,
"learning_rate": 4.5789221458053205e-06,
"loss": 0.0933,
"step": 2061
},
{
"epoch": 0.9381255686988171,
"grad_norm": 1.907331731563626,
"learning_rate": 4.578525129877759e-06,
"loss": 0.0997,
"step": 2062
},
{
"epoch": 0.9385805277525022,
"grad_norm": 1.3277724749366069,
"learning_rate": 4.5781279441025105e-06,
"loss": 0.1062,
"step": 2063
},
{
"epoch": 0.9390354868061874,
"grad_norm": 1.5198259374517775,
"learning_rate": 4.577730588512031e-06,
"loss": 0.0935,
"step": 2064
},
{
"epoch": 0.9394904458598726,
"grad_norm": 1.8061595559890593,
"learning_rate": 4.577333063138791e-06,
"loss": 0.1106,
"step": 2065
},
{
"epoch": 0.9399454049135578,
"grad_norm": 1.2937016104475862,
"learning_rate": 4.576935368015274e-06,
"loss": 0.0754,
"step": 2066
},
{
"epoch": 0.9404003639672429,
"grad_norm": 1.739125309652933,
"learning_rate": 4.576537503173978e-06,
"loss": 0.1485,
"step": 2067
},
{
"epoch": 0.9408553230209281,
"grad_norm": 1.7162057258895547,
"learning_rate": 4.576139468647415e-06,
"loss": 0.1067,
"step": 2068
},
{
"epoch": 0.9413102820746133,
"grad_norm": 1.2858589124219557,
"learning_rate": 4.575741264468111e-06,
"loss": 0.0721,
"step": 2069
},
{
"epoch": 0.9417652411282984,
"grad_norm": 1.7879372382669043,
"learning_rate": 4.575342890668603e-06,
"loss": 0.1117,
"step": 2070
},
{
"epoch": 0.9422202001819836,
"grad_norm": 2.2199652906323903,
"learning_rate": 4.574944347281448e-06,
"loss": 0.1667,
"step": 2071
},
{
"epoch": 0.9426751592356688,
"grad_norm": 1.8603229351203365,
"learning_rate": 4.5745456343392114e-06,
"loss": 0.0966,
"step": 2072
},
{
"epoch": 0.943130118289354,
"grad_norm": 1.519516751229683,
"learning_rate": 4.574146751874473e-06,
"loss": 0.1081,
"step": 2073
},
{
"epoch": 0.9435850773430391,
"grad_norm": 1.550278973906248,
"learning_rate": 4.57374769991983e-06,
"loss": 0.1,
"step": 2074
},
{
"epoch": 0.9440400363967243,
"grad_norm": 1.2078297904504105,
"learning_rate": 4.573348478507888e-06,
"loss": 0.0778,
"step": 2075
},
{
"epoch": 0.9444949954504095,
"grad_norm": 2.3422905994337397,
"learning_rate": 4.5729490876712725e-06,
"loss": 0.183,
"step": 2076
},
{
"epoch": 0.9449499545040946,
"grad_norm": 1.4654776784670076,
"learning_rate": 4.572549527442619e-06,
"loss": 0.1011,
"step": 2077
},
{
"epoch": 0.9454049135577798,
"grad_norm": 1.9579159955836072,
"learning_rate": 4.572149797854578e-06,
"loss": 0.0992,
"step": 2078
},
{
"epoch": 0.945859872611465,
"grad_norm": 1.2715287260379102,
"learning_rate": 4.571749898939813e-06,
"loss": 0.0774,
"step": 2079
},
{
"epoch": 0.9463148316651502,
"grad_norm": 1.3137532408543675,
"learning_rate": 4.5713498307310024e-06,
"loss": 0.0767,
"step": 2080
},
{
"epoch": 0.9467697907188353,
"grad_norm": 1.1469787080077873,
"learning_rate": 4.570949593260837e-06,
"loss": 0.0657,
"step": 2081
},
{
"epoch": 0.9472247497725205,
"grad_norm": 1.4149315544143606,
"learning_rate": 4.570549186562024e-06,
"loss": 0.0973,
"step": 2082
},
{
"epoch": 0.9476797088262057,
"grad_norm": 1.653104733877059,
"learning_rate": 4.570148610667281e-06,
"loss": 0.1008,
"step": 2083
},
{
"epoch": 0.9481346678798908,
"grad_norm": 2.1146488269160493,
"learning_rate": 4.569747865609343e-06,
"loss": 0.137,
"step": 2084
},
{
"epoch": 0.948589626933576,
"grad_norm": 1.8861095602948619,
"learning_rate": 4.569346951420957e-06,
"loss": 0.1312,
"step": 2085
},
{
"epoch": 0.9490445859872612,
"grad_norm": 1.7581609580097275,
"learning_rate": 4.568945868134882e-06,
"loss": 0.1201,
"step": 2086
},
{
"epoch": 0.9494995450409464,
"grad_norm": 1.6957832968300124,
"learning_rate": 4.568544615783894e-06,
"loss": 0.1165,
"step": 2087
},
{
"epoch": 0.9499545040946314,
"grad_norm": 1.2934181985102529,
"learning_rate": 4.568143194400782e-06,
"loss": 0.0779,
"step": 2088
},
{
"epoch": 0.9504094631483166,
"grad_norm": 1.453041608889173,
"learning_rate": 4.567741604018348e-06,
"loss": 0.0931,
"step": 2089
},
{
"epoch": 0.9508644222020018,
"grad_norm": 1.9468778720901316,
"learning_rate": 4.567339844669407e-06,
"loss": 0.123,
"step": 2090
},
{
"epoch": 0.9513193812556869,
"grad_norm": 1.236496096780351,
"learning_rate": 4.566937916386791e-06,
"loss": 0.0929,
"step": 2091
},
{
"epoch": 0.9517743403093721,
"grad_norm": 1.2079482887760136,
"learning_rate": 4.566535819203342e-06,
"loss": 0.0836,
"step": 2092
},
{
"epoch": 0.9522292993630573,
"grad_norm": 1.6071846832424066,
"learning_rate": 4.566133553151918e-06,
"loss": 0.116,
"step": 2093
},
{
"epoch": 0.9526842584167425,
"grad_norm": 1.5917288384663462,
"learning_rate": 4.565731118265392e-06,
"loss": 0.1023,
"step": 2094
},
{
"epoch": 0.9531392174704276,
"grad_norm": 1.9630737595748788,
"learning_rate": 4.5653285145766465e-06,
"loss": 0.1162,
"step": 2095
},
{
"epoch": 0.9535941765241128,
"grad_norm": 0.9081536366325619,
"learning_rate": 4.564925742118583e-06,
"loss": 0.0504,
"step": 2096
},
{
"epoch": 0.954049135577798,
"grad_norm": 1.7249991192643004,
"learning_rate": 4.564522800924111e-06,
"loss": 0.1385,
"step": 2097
},
{
"epoch": 0.9545040946314831,
"grad_norm": 1.2321759022595342,
"learning_rate": 4.56411969102616e-06,
"loss": 0.0828,
"step": 2098
},
{
"epoch": 0.9549590536851683,
"grad_norm": 1.6485112601533525,
"learning_rate": 4.5637164124576695e-06,
"loss": 0.1108,
"step": 2099
},
{
"epoch": 0.9554140127388535,
"grad_norm": 1.5011949500492594,
"learning_rate": 4.563312965251594e-06,
"loss": 0.0789,
"step": 2100
},
{
"epoch": 0.9558689717925387,
"grad_norm": 1.717044211260813,
"learning_rate": 4.562909349440899e-06,
"loss": 0.101,
"step": 2101
},
{
"epoch": 0.9563239308462238,
"grad_norm": 1.9690794320469502,
"learning_rate": 4.5625055650585695e-06,
"loss": 0.0887,
"step": 2102
},
{
"epoch": 0.956778889899909,
"grad_norm": 1.541908934769312,
"learning_rate": 4.562101612137599e-06,
"loss": 0.1032,
"step": 2103
},
{
"epoch": 0.9572338489535942,
"grad_norm": 1.1708242416085117,
"learning_rate": 4.561697490710998e-06,
"loss": 0.0786,
"step": 2104
},
{
"epoch": 0.9576888080072793,
"grad_norm": 1.7420543259304238,
"learning_rate": 4.561293200811787e-06,
"loss": 0.1027,
"step": 2105
},
{
"epoch": 0.9581437670609645,
"grad_norm": 1.8800626433332384,
"learning_rate": 4.560888742473005e-06,
"loss": 0.126,
"step": 2106
},
{
"epoch": 0.9585987261146497,
"grad_norm": 1.7463314395249052,
"learning_rate": 4.560484115727703e-06,
"loss": 0.1144,
"step": 2107
},
{
"epoch": 0.9590536851683349,
"grad_norm": 1.3208911722315206,
"learning_rate": 4.560079320608942e-06,
"loss": 0.0831,
"step": 2108
},
{
"epoch": 0.95950864422202,
"grad_norm": 1.1738917496826642,
"learning_rate": 4.5596743571498035e-06,
"loss": 0.0845,
"step": 2109
},
{
"epoch": 0.9599636032757052,
"grad_norm": 1.5293179012953557,
"learning_rate": 4.5592692253833775e-06,
"loss": 0.0872,
"step": 2110
},
{
"epoch": 0.9604185623293904,
"grad_norm": 1.3928507698223558,
"learning_rate": 4.5588639253427705e-06,
"loss": 0.0768,
"step": 2111
},
{
"epoch": 0.9608735213830755,
"grad_norm": 1.3009781755226655,
"learning_rate": 4.558458457061101e-06,
"loss": 0.0775,
"step": 2112
},
{
"epoch": 0.9613284804367607,
"grad_norm": 1.4250614240937063,
"learning_rate": 4.5580528205715024e-06,
"loss": 0.0891,
"step": 2113
},
{
"epoch": 0.9617834394904459,
"grad_norm": 1.9113281468795669,
"learning_rate": 4.557647015907121e-06,
"loss": 0.1099,
"step": 2114
},
{
"epoch": 0.9622383985441311,
"grad_norm": 1.2894594976227116,
"learning_rate": 4.557241043101118e-06,
"loss": 0.0779,
"step": 2115
},
{
"epoch": 0.9626933575978162,
"grad_norm": 1.6810992823881177,
"learning_rate": 4.556834902186667e-06,
"loss": 0.1397,
"step": 2116
},
{
"epoch": 0.9631483166515014,
"grad_norm": 1.5393066887459634,
"learning_rate": 4.556428593196956e-06,
"loss": 0.0935,
"step": 2117
},
{
"epoch": 0.9636032757051866,
"grad_norm": 2.3178365688110505,
"learning_rate": 4.556022116165189e-06,
"loss": 0.1796,
"step": 2118
},
{
"epoch": 0.9640582347588716,
"grad_norm": 1.646496340430589,
"learning_rate": 4.555615471124578e-06,
"loss": 0.0953,
"step": 2119
},
{
"epoch": 0.9645131938125568,
"grad_norm": 1.5571147184879746,
"learning_rate": 4.555208658108354e-06,
"loss": 0.084,
"step": 2120
},
{
"epoch": 0.964968152866242,
"grad_norm": 1.4911160901088942,
"learning_rate": 4.55480167714976e-06,
"loss": 0.0834,
"step": 2121
},
{
"epoch": 0.9654231119199272,
"grad_norm": 1.3047219270900265,
"learning_rate": 4.554394528282052e-06,
"loss": 0.1113,
"step": 2122
},
{
"epoch": 0.9658780709736123,
"grad_norm": 1.2420084154140223,
"learning_rate": 4.553987211538501e-06,
"loss": 0.0754,
"step": 2123
},
{
"epoch": 0.9663330300272975,
"grad_norm": 1.7628843384857757,
"learning_rate": 4.5535797269523906e-06,
"loss": 0.1157,
"step": 2124
},
{
"epoch": 0.9667879890809827,
"grad_norm": 1.3728918544083577,
"learning_rate": 4.55317207455702e-06,
"loss": 0.0886,
"step": 2125
},
{
"epoch": 0.9672429481346679,
"grad_norm": 1.3436653964528689,
"learning_rate": 4.552764254385697e-06,
"loss": 0.1031,
"step": 2126
},
{
"epoch": 0.967697907188353,
"grad_norm": 1.3924013137185407,
"learning_rate": 4.552356266471751e-06,
"loss": 0.0802,
"step": 2127
},
{
"epoch": 0.9681528662420382,
"grad_norm": 1.5484652367308942,
"learning_rate": 4.55194811084852e-06,
"loss": 0.1083,
"step": 2128
},
{
"epoch": 0.9686078252957234,
"grad_norm": 2.249626354585024,
"learning_rate": 4.551539787549354e-06,
"loss": 0.141,
"step": 2129
},
{
"epoch": 0.9690627843494085,
"grad_norm": 1.1787037168147345,
"learning_rate": 4.551131296607623e-06,
"loss": 0.0661,
"step": 2130
},
{
"epoch": 0.9695177434030937,
"grad_norm": 1.3998025414191242,
"learning_rate": 4.550722638056703e-06,
"loss": 0.0778,
"step": 2131
},
{
"epoch": 0.9699727024567789,
"grad_norm": 1.3210786504328669,
"learning_rate": 4.550313811929993e-06,
"loss": 0.0768,
"step": 2132
},
{
"epoch": 0.9704276615104641,
"grad_norm": 1.5377350977690776,
"learning_rate": 4.549904818260895e-06,
"loss": 0.0925,
"step": 2133
},
{
"epoch": 0.9708826205641492,
"grad_norm": 1.6844155831955263,
"learning_rate": 4.549495657082834e-06,
"loss": 0.1208,
"step": 2134
},
{
"epoch": 0.9713375796178344,
"grad_norm": 1.4036987190850851,
"learning_rate": 4.549086328429242e-06,
"loss": 0.0736,
"step": 2135
},
{
"epoch": 0.9717925386715196,
"grad_norm": 1.7761277537947966,
"learning_rate": 4.548676832333569e-06,
"loss": 0.1008,
"step": 2136
},
{
"epoch": 0.9722474977252047,
"grad_norm": 1.455686991970635,
"learning_rate": 4.548267168829279e-06,
"loss": 0.0829,
"step": 2137
},
{
"epoch": 0.9727024567788899,
"grad_norm": 1.7910623421009375,
"learning_rate": 4.547857337949844e-06,
"loss": 0.0997,
"step": 2138
},
{
"epoch": 0.9731574158325751,
"grad_norm": 1.3409903276734334,
"learning_rate": 4.5474473397287556e-06,
"loss": 0.0757,
"step": 2139
},
{
"epoch": 0.9736123748862603,
"grad_norm": 1.2780097230437193,
"learning_rate": 4.547037174199517e-06,
"loss": 0.0828,
"step": 2140
},
{
"epoch": 0.9740673339399454,
"grad_norm": 1.9829718978602024,
"learning_rate": 4.546626841395645e-06,
"loss": 0.1136,
"step": 2141
},
{
"epoch": 0.9745222929936306,
"grad_norm": 2.3916912753219504,
"learning_rate": 4.54621634135067e-06,
"loss": 0.106,
"step": 2142
},
{
"epoch": 0.9749772520473158,
"grad_norm": 1.7857367523019798,
"learning_rate": 4.545805674098136e-06,
"loss": 0.1179,
"step": 2143
},
{
"epoch": 0.9754322111010009,
"grad_norm": 1.8192016060510356,
"learning_rate": 4.545394839671601e-06,
"loss": 0.1153,
"step": 2144
},
{
"epoch": 0.9758871701546861,
"grad_norm": 1.7666738715643184,
"learning_rate": 4.544983838104637e-06,
"loss": 0.1155,
"step": 2145
},
{
"epoch": 0.9763421292083713,
"grad_norm": 1.669261781086701,
"learning_rate": 4.544572669430828e-06,
"loss": 0.1137,
"step": 2146
},
{
"epoch": 0.9767970882620565,
"grad_norm": 2.0898641673496603,
"learning_rate": 4.544161333683775e-06,
"loss": 0.1475,
"step": 2147
},
{
"epoch": 0.9772520473157416,
"grad_norm": 1.520441107938051,
"learning_rate": 4.543749830897088e-06,
"loss": 0.1196,
"step": 2148
},
{
"epoch": 0.9777070063694268,
"grad_norm": 1.778574007165815,
"learning_rate": 4.543338161104395e-06,
"loss": 0.1186,
"step": 2149
},
{
"epoch": 0.978161965423112,
"grad_norm": 1.4187578330640795,
"learning_rate": 4.542926324339335e-06,
"loss": 0.085,
"step": 2150
},
{
"epoch": 0.978616924476797,
"grad_norm": 1.7565219742471405,
"learning_rate": 4.542514320635561e-06,
"loss": 0.1288,
"step": 2151
},
{
"epoch": 0.9790718835304822,
"grad_norm": 1.8973174985472165,
"learning_rate": 4.542102150026741e-06,
"loss": 0.1309,
"step": 2152
},
{
"epoch": 0.9795268425841674,
"grad_norm": 1.8442728665682142,
"learning_rate": 4.541689812546556e-06,
"loss": 0.122,
"step": 2153
},
{
"epoch": 0.9799818016378526,
"grad_norm": 1.697144335252138,
"learning_rate": 4.541277308228698e-06,
"loss": 0.1176,
"step": 2154
},
{
"epoch": 0.9804367606915377,
"grad_norm": 1.435709999735577,
"learning_rate": 4.540864637106879e-06,
"loss": 0.0963,
"step": 2155
},
{
"epoch": 0.9808917197452229,
"grad_norm": 1.527626504741261,
"learning_rate": 4.540451799214817e-06,
"loss": 0.0698,
"step": 2156
},
{
"epoch": 0.9813466787989081,
"grad_norm": 1.3910853805372232,
"learning_rate": 4.540038794586248e-06,
"loss": 0.0872,
"step": 2157
},
{
"epoch": 0.9818016378525932,
"grad_norm": 1.2876712484569046,
"learning_rate": 4.539625623254923e-06,
"loss": 0.0746,
"step": 2158
},
{
"epoch": 0.9822565969062784,
"grad_norm": 1.9040440120946815,
"learning_rate": 4.539212285254601e-06,
"loss": 0.1374,
"step": 2159
},
{
"epoch": 0.9827115559599636,
"grad_norm": 1.368387986825355,
"learning_rate": 4.5387987806190615e-06,
"loss": 0.077,
"step": 2160
},
{
"epoch": 0.9831665150136488,
"grad_norm": 1.5628650011793674,
"learning_rate": 4.538385109382093e-06,
"loss": 0.1122,
"step": 2161
},
{
"epoch": 0.9836214740673339,
"grad_norm": 1.536108666612923,
"learning_rate": 4.537971271577498e-06,
"loss": 0.113,
"step": 2162
},
{
"epoch": 0.9840764331210191,
"grad_norm": 1.1870522930285077,
"learning_rate": 4.537557267239093e-06,
"loss": 0.0829,
"step": 2163
},
{
"epoch": 0.9845313921747043,
"grad_norm": 1.1107704199660413,
"learning_rate": 4.537143096400712e-06,
"loss": 0.0727,
"step": 2164
},
{
"epoch": 0.9849863512283894,
"grad_norm": 1.6702825882583634,
"learning_rate": 4.536728759096195e-06,
"loss": 0.1012,
"step": 2165
},
{
"epoch": 0.9854413102820746,
"grad_norm": 1.4945842814197994,
"learning_rate": 4.536314255359402e-06,
"loss": 0.0859,
"step": 2166
},
{
"epoch": 0.9858962693357598,
"grad_norm": 1.5267042639312454,
"learning_rate": 4.535899585224204e-06,
"loss": 0.0904,
"step": 2167
},
{
"epoch": 0.986351228389445,
"grad_norm": 1.1363362545968427,
"learning_rate": 4.535484748724486e-06,
"loss": 0.0713,
"step": 2168
},
{
"epoch": 0.9868061874431301,
"grad_norm": 1.1419506135274826,
"learning_rate": 4.535069745894147e-06,
"loss": 0.092,
"step": 2169
},
{
"epoch": 0.9872611464968153,
"grad_norm": 1.4152116084634339,
"learning_rate": 4.534654576767098e-06,
"loss": 0.0964,
"step": 2170
},
{
"epoch": 0.9877161055505005,
"grad_norm": 1.8913935291732986,
"learning_rate": 4.534239241377266e-06,
"loss": 0.1623,
"step": 2171
},
{
"epoch": 0.9881710646041856,
"grad_norm": 1.7108674711326843,
"learning_rate": 4.5338237397585895e-06,
"loss": 0.1366,
"step": 2172
},
{
"epoch": 0.9886260236578708,
"grad_norm": 1.867375764653619,
"learning_rate": 4.533408071945021e-06,
"loss": 0.0921,
"step": 2173
},
{
"epoch": 0.989080982711556,
"grad_norm": 1.6894369905832938,
"learning_rate": 4.532992237970528e-06,
"loss": 0.1123,
"step": 2174
},
{
"epoch": 0.9895359417652412,
"grad_norm": 1.2564113420047616,
"learning_rate": 4.532576237869091e-06,
"loss": 0.0639,
"step": 2175
},
{
"epoch": 0.9899909008189263,
"grad_norm": 1.1715905637128303,
"learning_rate": 4.5321600716747025e-06,
"loss": 0.0671,
"step": 2176
},
{
"epoch": 0.9904458598726115,
"grad_norm": 1.4423653806452021,
"learning_rate": 4.531743739421369e-06,
"loss": 0.0846,
"step": 2177
},
{
"epoch": 0.9909008189262967,
"grad_norm": 1.3936572340730857,
"learning_rate": 4.531327241143114e-06,
"loss": 0.0757,
"step": 2178
},
{
"epoch": 0.9913557779799818,
"grad_norm": 1.6605247652982158,
"learning_rate": 4.530910576873969e-06,
"loss": 0.0911,
"step": 2179
},
{
"epoch": 0.991810737033667,
"grad_norm": 1.4040852794165468,
"learning_rate": 4.530493746647984e-06,
"loss": 0.0822,
"step": 2180
},
{
"epoch": 0.9922656960873522,
"grad_norm": 1.5953857991116012,
"learning_rate": 4.530076750499219e-06,
"loss": 0.1092,
"step": 2181
},
{
"epoch": 0.9927206551410374,
"grad_norm": 1.683735567141584,
"learning_rate": 4.52965958846175e-06,
"loss": 0.1118,
"step": 2182
},
{
"epoch": 0.9931756141947224,
"grad_norm": 1.2145587847450323,
"learning_rate": 4.529242260569665e-06,
"loss": 0.0771,
"step": 2183
},
{
"epoch": 0.9936305732484076,
"grad_norm": 2.1272377858020364,
"learning_rate": 4.528824766857067e-06,
"loss": 0.1349,
"step": 2184
},
{
"epoch": 0.9940855323020928,
"grad_norm": 1.6139933135537738,
"learning_rate": 4.5284071073580715e-06,
"loss": 0.1055,
"step": 2185
},
{
"epoch": 0.9945404913557779,
"grad_norm": 1.4819652610036196,
"learning_rate": 4.527989282106807e-06,
"loss": 0.0803,
"step": 2186
},
{
"epoch": 0.9949954504094631,
"grad_norm": 2.09368743936611,
"learning_rate": 4.527571291137416e-06,
"loss": 0.1076,
"step": 2187
},
{
"epoch": 0.9954504094631483,
"grad_norm": 1.7222186608794936,
"learning_rate": 4.527153134484056e-06,
"loss": 0.1173,
"step": 2188
},
{
"epoch": 0.9959053685168335,
"grad_norm": 1.3187846074985496,
"learning_rate": 4.5267348121808965e-06,
"loss": 0.0808,
"step": 2189
},
{
"epoch": 0.9963603275705186,
"grad_norm": 1.346265507805684,
"learning_rate": 4.526316324262121e-06,
"loss": 0.112,
"step": 2190
},
{
"epoch": 0.9968152866242038,
"grad_norm": 1.341519514219881,
"learning_rate": 4.525897670761926e-06,
"loss": 0.0725,
"step": 2191
},
{
"epoch": 0.997270245677889,
"grad_norm": 1.5713885768234184,
"learning_rate": 4.525478851714522e-06,
"loss": 0.0977,
"step": 2192
},
{
"epoch": 0.9977252047315741,
"grad_norm": 1.5698524319504634,
"learning_rate": 4.525059867154133e-06,
"loss": 0.0995,
"step": 2193
},
{
"epoch": 0.9981801637852593,
"grad_norm": 1.3144114191589358,
"learning_rate": 4.5246407171149975e-06,
"loss": 0.0923,
"step": 2194
},
{
"epoch": 0.9986351228389445,
"grad_norm": 1.5066334421883691,
"learning_rate": 4.5242214016313655e-06,
"loss": 0.0944,
"step": 2195
},
{
"epoch": 0.9990900818926297,
"grad_norm": 1.0767315858549367,
"learning_rate": 4.523801920737501e-06,
"loss": 0.0692,
"step": 2196
},
{
"epoch": 0.9995450409463148,
"grad_norm": 1.362736303528153,
"learning_rate": 4.523382274467684e-06,
"loss": 0.0916,
"step": 2197
},
{
"epoch": 1.0,
"grad_norm": 1.0570283557217577,
"learning_rate": 4.522962462856206e-06,
"loss": 0.0478,
"step": 2198
},
{
"epoch": 1.000454959053685,
"grad_norm": 0.7592988875540077,
"learning_rate": 4.522542485937369e-06,
"loss": 0.0248,
"step": 2199
},
{
"epoch": 1.0009099181073704,
"grad_norm": 1.368980443137241,
"learning_rate": 4.522122343745495e-06,
"loss": 0.0579,
"step": 2200
},
{
"epoch": 1.0013648771610555,
"grad_norm": 1.0628585042902956,
"learning_rate": 4.521702036314915e-06,
"loss": 0.0561,
"step": 2201
},
{
"epoch": 1.0018198362147406,
"grad_norm": 0.7147794512980257,
"learning_rate": 4.521281563679973e-06,
"loss": 0.0292,
"step": 2202
},
{
"epoch": 1.0022747952684259,
"grad_norm": 1.1709050671007968,
"learning_rate": 4.5208609258750314e-06,
"loss": 0.0464,
"step": 2203
},
{
"epoch": 1.002729754322111,
"grad_norm": 0.9609189064016738,
"learning_rate": 4.52044012293446e-06,
"loss": 0.0451,
"step": 2204
},
{
"epoch": 1.0031847133757963,
"grad_norm": 1.354272672445694,
"learning_rate": 4.520019154892646e-06,
"loss": 0.0555,
"step": 2205
},
{
"epoch": 1.0036396724294814,
"grad_norm": 1.2952803538667241,
"learning_rate": 4.519598021783989e-06,
"loss": 0.0539,
"step": 2206
},
{
"epoch": 1.0040946314831665,
"grad_norm": 1.5494612726642303,
"learning_rate": 4.519176723642903e-06,
"loss": 0.0762,
"step": 2207
},
{
"epoch": 1.0045495905368518,
"grad_norm": 1.4196552945601155,
"learning_rate": 4.518755260503813e-06,
"loss": 0.0606,
"step": 2208
},
{
"epoch": 1.0050045495905369,
"grad_norm": 0.9459922110368411,
"learning_rate": 4.51833363240116e-06,
"loss": 0.0401,
"step": 2209
},
{
"epoch": 1.005459508644222,
"grad_norm": 0.9836663939604657,
"learning_rate": 4.517911839369398e-06,
"loss": 0.0414,
"step": 2210
},
{
"epoch": 1.0059144676979073,
"grad_norm": 1.0781133888503407,
"learning_rate": 4.517489881442993e-06,
"loss": 0.0347,
"step": 2211
},
{
"epoch": 1.0063694267515924,
"grad_norm": 0.8497781985930463,
"learning_rate": 4.517067758656424e-06,
"loss": 0.0266,
"step": 2212
},
{
"epoch": 1.0068243858052774,
"grad_norm": 1.113725479756549,
"learning_rate": 4.516645471044188e-06,
"loss": 0.0328,
"step": 2213
},
{
"epoch": 1.0072793448589628,
"grad_norm": 1.0208334208547818,
"learning_rate": 4.516223018640791e-06,
"loss": 0.0373,
"step": 2214
},
{
"epoch": 1.0077343039126478,
"grad_norm": 1.3882067298362941,
"learning_rate": 4.515800401480754e-06,
"loss": 0.0457,
"step": 2215
},
{
"epoch": 1.008189262966333,
"grad_norm": 1.0420659203678968,
"learning_rate": 4.515377619598612e-06,
"loss": 0.0462,
"step": 2216
},
{
"epoch": 1.0086442220200182,
"grad_norm": 1.3078136737284674,
"learning_rate": 4.514954673028913e-06,
"loss": 0.0508,
"step": 2217
},
{
"epoch": 1.0090991810737033,
"grad_norm": 1.1934759478904275,
"learning_rate": 4.5145315618062155e-06,
"loss": 0.0448,
"step": 2218
},
{
"epoch": 1.0095541401273886,
"grad_norm": 1.2215784077070255,
"learning_rate": 4.514108285965098e-06,
"loss": 0.0427,
"step": 2219
},
{
"epoch": 1.0100090991810737,
"grad_norm": 1.178079318605842,
"learning_rate": 4.513684845540146e-06,
"loss": 0.0481,
"step": 2220
}
],
"logging_steps": 1,
"max_steps": 10990,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 555,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 14593065984000.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}