redsgnaoh's picture
Upload folder using huggingface_hub
cd7b244 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5050045495905369,
"eval_steps": 500,
"global_step": 1110,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00045495905368516835,
"grad_norm": 2.3685307115973546,
"learning_rate": 5e-06,
"loss": 0.0587,
"step": 1
},
{
"epoch": 0.0009099181073703367,
"grad_norm": 3.207290006513166,
"learning_rate": 4.999999897855645e-06,
"loss": 0.0976,
"step": 2
},
{
"epoch": 0.001364877161055505,
"grad_norm": 3.061584755625611,
"learning_rate": 4.9999995914225884e-06,
"loss": 0.1138,
"step": 3
},
{
"epoch": 0.0018198362147406734,
"grad_norm": 2.4708172493174265,
"learning_rate": 4.999999080700855e-06,
"loss": 0.102,
"step": 4
},
{
"epoch": 0.0022747952684258415,
"grad_norm": 2.7122863978048204,
"learning_rate": 4.999998365690486e-06,
"loss": 0.0899,
"step": 5
},
{
"epoch": 0.00272975432211101,
"grad_norm": 2.1348308028500367,
"learning_rate": 4.999997446391542e-06,
"loss": 0.0589,
"step": 6
},
{
"epoch": 0.0031847133757961785,
"grad_norm": 1.9525029408374595,
"learning_rate": 4.999996322804095e-06,
"loss": 0.0692,
"step": 7
},
{
"epoch": 0.003639672429481347,
"grad_norm": 2.4972521600201087,
"learning_rate": 4.999994994928239e-06,
"loss": 0.094,
"step": 8
},
{
"epoch": 0.004094631483166515,
"grad_norm": 1.3057783939017902,
"learning_rate": 4.999993462764082e-06,
"loss": 0.0401,
"step": 9
},
{
"epoch": 0.004549590536851683,
"grad_norm": 1.8178622655461494,
"learning_rate": 4.999991726311749e-06,
"loss": 0.0508,
"step": 10
},
{
"epoch": 0.005004549590536852,
"grad_norm": 1.8904298363447831,
"learning_rate": 4.999989785571382e-06,
"loss": 0.0466,
"step": 11
},
{
"epoch": 0.00545950864422202,
"grad_norm": 2.397431505721498,
"learning_rate": 4.999987640543139e-06,
"loss": 0.0684,
"step": 12
},
{
"epoch": 0.005914467697907188,
"grad_norm": 2.121710266227225,
"learning_rate": 4.999985291227196e-06,
"loss": 0.0729,
"step": 13
},
{
"epoch": 0.006369426751592357,
"grad_norm": 2.9696000985831614,
"learning_rate": 4.999982737623746e-06,
"loss": 0.0922,
"step": 14
},
{
"epoch": 0.006824385805277525,
"grad_norm": 2.270433126704546,
"learning_rate": 4.999979979732995e-06,
"loss": 0.0946,
"step": 15
},
{
"epoch": 0.007279344858962694,
"grad_norm": 1.9380248124362378,
"learning_rate": 4.999977017555171e-06,
"loss": 0.0578,
"step": 16
},
{
"epoch": 0.0077343039126478615,
"grad_norm": 2.6281882171357958,
"learning_rate": 4.999973851090514e-06,
"loss": 0.1147,
"step": 17
},
{
"epoch": 0.00818926296633303,
"grad_norm": 2.40029765076707,
"learning_rate": 4.999970480339284e-06,
"loss": 0.0906,
"step": 18
},
{
"epoch": 0.008644222020018199,
"grad_norm": 2.889640814144301,
"learning_rate": 4.9999669053017564e-06,
"loss": 0.0792,
"step": 19
},
{
"epoch": 0.009099181073703366,
"grad_norm": 2.3110994220860883,
"learning_rate": 4.9999631259782235e-06,
"loss": 0.0751,
"step": 20
},
{
"epoch": 0.009554140127388535,
"grad_norm": 2.6890244705482806,
"learning_rate": 4.999959142368993e-06,
"loss": 0.0966,
"step": 21
},
{
"epoch": 0.010009099181073703,
"grad_norm": 2.2488041264680563,
"learning_rate": 4.999954954474391e-06,
"loss": 0.0714,
"step": 22
},
{
"epoch": 0.010464058234758872,
"grad_norm": 2.0642223983397883,
"learning_rate": 4.9999505622947594e-06,
"loss": 0.0881,
"step": 23
},
{
"epoch": 0.01091901728844404,
"grad_norm": 2.384727655713489,
"learning_rate": 4.999945965830458e-06,
"loss": 0.0992,
"step": 24
},
{
"epoch": 0.011373976342129208,
"grad_norm": 2.2739375250381504,
"learning_rate": 4.999941165081863e-06,
"loss": 0.0831,
"step": 25
},
{
"epoch": 0.011828935395814377,
"grad_norm": 1.6418905911049972,
"learning_rate": 4.999936160049364e-06,
"loss": 0.0662,
"step": 26
},
{
"epoch": 0.012283894449499545,
"grad_norm": 2.029045596294324,
"learning_rate": 4.999930950733373e-06,
"loss": 0.097,
"step": 27
},
{
"epoch": 0.012738853503184714,
"grad_norm": 2.2833378337725287,
"learning_rate": 4.999925537134312e-06,
"loss": 0.0823,
"step": 28
},
{
"epoch": 0.013193812556869881,
"grad_norm": 2.611896749496796,
"learning_rate": 4.9999199192526286e-06,
"loss": 0.1115,
"step": 29
},
{
"epoch": 0.01364877161055505,
"grad_norm": 2.4812612616344865,
"learning_rate": 4.9999140970887775e-06,
"loss": 0.0854,
"step": 30
},
{
"epoch": 0.014103730664240218,
"grad_norm": 2.0837983680092904,
"learning_rate": 4.999908070643236e-06,
"loss": 0.0837,
"step": 31
},
{
"epoch": 0.014558689717925387,
"grad_norm": 2.0812008840647827,
"learning_rate": 4.999901839916495e-06,
"loss": 0.064,
"step": 32
},
{
"epoch": 0.015013648771610554,
"grad_norm": 1.5275195881020318,
"learning_rate": 4.999895404909067e-06,
"loss": 0.0582,
"step": 33
},
{
"epoch": 0.015468607825295723,
"grad_norm": 2.703502541064391,
"learning_rate": 4.999888765621476e-06,
"loss": 0.1102,
"step": 34
},
{
"epoch": 0.01592356687898089,
"grad_norm": 1.7231856796809104,
"learning_rate": 4.999881922054264e-06,
"loss": 0.0571,
"step": 35
},
{
"epoch": 0.01637852593266606,
"grad_norm": 1.6472076658400754,
"learning_rate": 4.999874874207991e-06,
"loss": 0.0536,
"step": 36
},
{
"epoch": 0.01683348498635123,
"grad_norm": 2.902300005488672,
"learning_rate": 4.999867622083232e-06,
"loss": 0.1302,
"step": 37
},
{
"epoch": 0.017288444040036398,
"grad_norm": 1.9543380822482044,
"learning_rate": 4.99986016568058e-06,
"loss": 0.0983,
"step": 38
},
{
"epoch": 0.017743403093721567,
"grad_norm": 1.814859572890468,
"learning_rate": 4.999852505000646e-06,
"loss": 0.0717,
"step": 39
},
{
"epoch": 0.018198362147406732,
"grad_norm": 1.882630749677819,
"learning_rate": 4.999844640044053e-06,
"loss": 0.07,
"step": 40
},
{
"epoch": 0.0186533212010919,
"grad_norm": 2.4063115131397823,
"learning_rate": 4.999836570811445e-06,
"loss": 0.0873,
"step": 41
},
{
"epoch": 0.01910828025477707,
"grad_norm": 2.9701013712692035,
"learning_rate": 4.999828297303483e-06,
"loss": 0.0957,
"step": 42
},
{
"epoch": 0.019563239308462238,
"grad_norm": 2.207833234895104,
"learning_rate": 4.9998198195208405e-06,
"loss": 0.0879,
"step": 43
},
{
"epoch": 0.020018198362147407,
"grad_norm": 2.168760551509319,
"learning_rate": 4.999811137464212e-06,
"loss": 0.0967,
"step": 44
},
{
"epoch": 0.020473157415832575,
"grad_norm": 2.12859962179133,
"learning_rate": 4.999802251134307e-06,
"loss": 0.1028,
"step": 45
},
{
"epoch": 0.020928116469517744,
"grad_norm": 1.8067595132130894,
"learning_rate": 4.99979316053185e-06,
"loss": 0.0778,
"step": 46
},
{
"epoch": 0.021383075523202913,
"grad_norm": 3.8815722657740594,
"learning_rate": 4.999783865657585e-06,
"loss": 0.1812,
"step": 47
},
{
"epoch": 0.02183803457688808,
"grad_norm": 4.142186542548352,
"learning_rate": 4.999774366512272e-06,
"loss": 0.1981,
"step": 48
},
{
"epoch": 0.022292993630573247,
"grad_norm": 2.4946427215064015,
"learning_rate": 4.9997646630966865e-06,
"loss": 0.0866,
"step": 49
},
{
"epoch": 0.022747952684258416,
"grad_norm": 2.219814267860857,
"learning_rate": 4.999754755411621e-06,
"loss": 0.0767,
"step": 50
},
{
"epoch": 0.023202911737943584,
"grad_norm": 1.7512451842619647,
"learning_rate": 4.9997446434578865e-06,
"loss": 0.0709,
"step": 51
},
{
"epoch": 0.023657870791628753,
"grad_norm": 1.9267762038567948,
"learning_rate": 4.999734327236307e-06,
"loss": 0.0791,
"step": 52
},
{
"epoch": 0.024112829845313922,
"grad_norm": 1.3192434416131813,
"learning_rate": 4.999723806747728e-06,
"loss": 0.0611,
"step": 53
},
{
"epoch": 0.02456778889899909,
"grad_norm": 2.0553891309583787,
"learning_rate": 4.99971308199301e-06,
"loss": 0.0708,
"step": 54
},
{
"epoch": 0.02502274795268426,
"grad_norm": 1.6809260342794263,
"learning_rate": 4.999702152973025e-06,
"loss": 0.0662,
"step": 55
},
{
"epoch": 0.025477707006369428,
"grad_norm": 2.0087287549898716,
"learning_rate": 4.9996910196886694e-06,
"loss": 0.0795,
"step": 56
},
{
"epoch": 0.025932666060054597,
"grad_norm": 1.3268510730840513,
"learning_rate": 4.999679682140852e-06,
"loss": 0.0422,
"step": 57
},
{
"epoch": 0.026387625113739762,
"grad_norm": 2.646053521216802,
"learning_rate": 4.999668140330499e-06,
"loss": 0.1284,
"step": 58
},
{
"epoch": 0.02684258416742493,
"grad_norm": 1.5857988579934552,
"learning_rate": 4.999656394258555e-06,
"loss": 0.0647,
"step": 59
},
{
"epoch": 0.0272975432211101,
"grad_norm": 1.756551616255058,
"learning_rate": 4.999644443925978e-06,
"loss": 0.078,
"step": 60
},
{
"epoch": 0.027752502274795268,
"grad_norm": 2.2102751228780546,
"learning_rate": 4.999632289333746e-06,
"loss": 0.0785,
"step": 61
},
{
"epoch": 0.028207461328480437,
"grad_norm": 2.338156657994438,
"learning_rate": 4.999619930482852e-06,
"loss": 0.0835,
"step": 62
},
{
"epoch": 0.028662420382165606,
"grad_norm": 2.0921557148636616,
"learning_rate": 4.999607367374304e-06,
"loss": 0.0974,
"step": 63
},
{
"epoch": 0.029117379435850774,
"grad_norm": 1.7535396635399074,
"learning_rate": 4.999594600009131e-06,
"loss": 0.0605,
"step": 64
},
{
"epoch": 0.029572338489535943,
"grad_norm": 2.2055708873696585,
"learning_rate": 4.999581628388375e-06,
"loss": 0.0946,
"step": 65
},
{
"epoch": 0.03002729754322111,
"grad_norm": 2.5001955714674216,
"learning_rate": 4.999568452513097e-06,
"loss": 0.1549,
"step": 66
},
{
"epoch": 0.030482256596906277,
"grad_norm": 2.417716838936908,
"learning_rate": 4.9995550723843726e-06,
"loss": 0.0953,
"step": 67
},
{
"epoch": 0.030937215650591446,
"grad_norm": 1.9976883408624455,
"learning_rate": 4.999541488003295e-06,
"loss": 0.0772,
"step": 68
},
{
"epoch": 0.03139217470427662,
"grad_norm": 1.9326277047503455,
"learning_rate": 4.999527699370975e-06,
"loss": 0.0764,
"step": 69
},
{
"epoch": 0.03184713375796178,
"grad_norm": 2.0337761312716527,
"learning_rate": 4.99951370648854e-06,
"loss": 0.0659,
"step": 70
},
{
"epoch": 0.03230209281164695,
"grad_norm": 1.895878774895592,
"learning_rate": 4.999499509357132e-06,
"loss": 0.0815,
"step": 71
},
{
"epoch": 0.03275705186533212,
"grad_norm": 2.0909717848011313,
"learning_rate": 4.999485107977912e-06,
"loss": 0.084,
"step": 72
},
{
"epoch": 0.033212010919017286,
"grad_norm": 1.5271836426577585,
"learning_rate": 4.999470502352057e-06,
"loss": 0.0645,
"step": 73
},
{
"epoch": 0.03366696997270246,
"grad_norm": 2.4817155636981223,
"learning_rate": 4.999455692480759e-06,
"loss": 0.1008,
"step": 74
},
{
"epoch": 0.034121929026387623,
"grad_norm": 1.6027477251164817,
"learning_rate": 4.999440678365229e-06,
"loss": 0.0722,
"step": 75
},
{
"epoch": 0.034576888080072796,
"grad_norm": 2.164861284274037,
"learning_rate": 4.999425460006695e-06,
"loss": 0.0876,
"step": 76
},
{
"epoch": 0.03503184713375796,
"grad_norm": 1.8147143711706584,
"learning_rate": 4.9994100374063995e-06,
"loss": 0.0739,
"step": 77
},
{
"epoch": 0.03548680618744313,
"grad_norm": 2.379478288499757,
"learning_rate": 4.9993944105656035e-06,
"loss": 0.1158,
"step": 78
},
{
"epoch": 0.0359417652411283,
"grad_norm": 1.7238147576191318,
"learning_rate": 4.999378579485582e-06,
"loss": 0.0749,
"step": 79
},
{
"epoch": 0.036396724294813464,
"grad_norm": 2.1444185576728323,
"learning_rate": 4.999362544167632e-06,
"loss": 0.0937,
"step": 80
},
{
"epoch": 0.036851683348498636,
"grad_norm": 1.18142283635082,
"learning_rate": 4.99934630461306e-06,
"loss": 0.0569,
"step": 81
},
{
"epoch": 0.0373066424021838,
"grad_norm": 2.3599788407160456,
"learning_rate": 4.999329860823197e-06,
"loss": 0.0848,
"step": 82
},
{
"epoch": 0.03776160145586897,
"grad_norm": 1.851574920799011,
"learning_rate": 4.999313212799383e-06,
"loss": 0.0882,
"step": 83
},
{
"epoch": 0.03821656050955414,
"grad_norm": 2.144291660745484,
"learning_rate": 4.99929636054298e-06,
"loss": 0.0881,
"step": 84
},
{
"epoch": 0.03867151956323931,
"grad_norm": 2.083071837291781,
"learning_rate": 4.999279304055366e-06,
"loss": 0.1109,
"step": 85
},
{
"epoch": 0.039126478616924476,
"grad_norm": 2.245491182317419,
"learning_rate": 4.999262043337933e-06,
"loss": 0.0933,
"step": 86
},
{
"epoch": 0.03958143767060965,
"grad_norm": 2.076902724310137,
"learning_rate": 4.999244578392094e-06,
"loss": 0.1004,
"step": 87
},
{
"epoch": 0.040036396724294813,
"grad_norm": 2.213157445111281,
"learning_rate": 4.9992269092192736e-06,
"loss": 0.1048,
"step": 88
},
{
"epoch": 0.04049135577797998,
"grad_norm": 1.8088256581500983,
"learning_rate": 4.9992090358209166e-06,
"loss": 0.0803,
"step": 89
},
{
"epoch": 0.04094631483166515,
"grad_norm": 1.6952266837081935,
"learning_rate": 4.9991909581984835e-06,
"loss": 0.0707,
"step": 90
},
{
"epoch": 0.041401273885350316,
"grad_norm": 1.2806634047624867,
"learning_rate": 4.999172676353451e-06,
"loss": 0.0405,
"step": 91
},
{
"epoch": 0.04185623293903549,
"grad_norm": 1.537222164184117,
"learning_rate": 4.999154190287314e-06,
"loss": 0.0678,
"step": 92
},
{
"epoch": 0.042311191992720654,
"grad_norm": 2.152654560935853,
"learning_rate": 4.999135500001583e-06,
"loss": 0.1323,
"step": 93
},
{
"epoch": 0.042766151046405826,
"grad_norm": 1.7293087783358614,
"learning_rate": 4.9991166054977844e-06,
"loss": 0.0851,
"step": 94
},
{
"epoch": 0.04322111010009099,
"grad_norm": 2.689089264886033,
"learning_rate": 4.999097506777463e-06,
"loss": 0.1018,
"step": 95
},
{
"epoch": 0.04367606915377616,
"grad_norm": 1.8242860351920025,
"learning_rate": 4.999078203842179e-06,
"loss": 0.1063,
"step": 96
},
{
"epoch": 0.04413102820746133,
"grad_norm": 1.5249963877818449,
"learning_rate": 4.999058696693511e-06,
"loss": 0.0593,
"step": 97
},
{
"epoch": 0.044585987261146494,
"grad_norm": 1.668772591755926,
"learning_rate": 4.99903898533305e-06,
"loss": 0.0709,
"step": 98
},
{
"epoch": 0.045040946314831666,
"grad_norm": 1.8521288885149407,
"learning_rate": 4.99901906976241e-06,
"loss": 0.0842,
"step": 99
},
{
"epoch": 0.04549590536851683,
"grad_norm": 2.106435857041323,
"learning_rate": 4.998998949983217e-06,
"loss": 0.0921,
"step": 100
},
{
"epoch": 0.045950864422202004,
"grad_norm": 2.104450695294598,
"learning_rate": 4.998978625997115e-06,
"loss": 0.1082,
"step": 101
},
{
"epoch": 0.04640582347588717,
"grad_norm": 2.1381043167125466,
"learning_rate": 4.998958097805765e-06,
"loss": 0.0966,
"step": 102
},
{
"epoch": 0.04686078252957234,
"grad_norm": 1.6962878781771613,
"learning_rate": 4.9989373654108445e-06,
"loss": 0.0721,
"step": 103
},
{
"epoch": 0.047315741583257506,
"grad_norm": 26.768545049591438,
"learning_rate": 4.9989164288140465e-06,
"loss": 0.362,
"step": 104
},
{
"epoch": 0.04777070063694268,
"grad_norm": 2.63813062408578,
"learning_rate": 4.998895288017085e-06,
"loss": 0.1373,
"step": 105
},
{
"epoch": 0.048225659690627844,
"grad_norm": 1.828826426920959,
"learning_rate": 4.998873943021684e-06,
"loss": 0.0743,
"step": 106
},
{
"epoch": 0.04868061874431301,
"grad_norm": 1.524672393516503,
"learning_rate": 4.998852393829589e-06,
"loss": 0.0693,
"step": 107
},
{
"epoch": 0.04913557779799818,
"grad_norm": 3.0873114713096683,
"learning_rate": 4.9988306404425625e-06,
"loss": 0.1492,
"step": 108
},
{
"epoch": 0.049590536851683346,
"grad_norm": 1.7541988764209069,
"learning_rate": 4.99880868286238e-06,
"loss": 0.0941,
"step": 109
},
{
"epoch": 0.05004549590536852,
"grad_norm": 2.3475973125438103,
"learning_rate": 4.998786521090836e-06,
"loss": 0.0925,
"step": 110
},
{
"epoch": 0.050500454959053684,
"grad_norm": 2.1297159392440452,
"learning_rate": 4.9987641551297426e-06,
"loss": 0.1209,
"step": 111
},
{
"epoch": 0.050955414012738856,
"grad_norm": 1.8188477873711246,
"learning_rate": 4.998741584980926e-06,
"loss": 0.1191,
"step": 112
},
{
"epoch": 0.05141037306642402,
"grad_norm": 2.0744703068317474,
"learning_rate": 4.9987188106462314e-06,
"loss": 0.0958,
"step": 113
},
{
"epoch": 0.051865332120109194,
"grad_norm": 1.67585557445257,
"learning_rate": 4.99869583212752e-06,
"loss": 0.0759,
"step": 114
},
{
"epoch": 0.05232029117379436,
"grad_norm": 2.9423649270306456,
"learning_rate": 4.9986726494266694e-06,
"loss": 0.1628,
"step": 115
},
{
"epoch": 0.052775250227479524,
"grad_norm": 1.9805897541793653,
"learning_rate": 4.998649262545574e-06,
"loss": 0.0865,
"step": 116
},
{
"epoch": 0.053230209281164696,
"grad_norm": 1.862673950464683,
"learning_rate": 4.998625671486144e-06,
"loss": 0.0841,
"step": 117
},
{
"epoch": 0.05368516833484986,
"grad_norm": 1.6852737490573195,
"learning_rate": 4.998601876250308e-06,
"loss": 0.0801,
"step": 118
},
{
"epoch": 0.054140127388535034,
"grad_norm": 1.8645780399689873,
"learning_rate": 4.998577876840011e-06,
"loss": 0.0822,
"step": 119
},
{
"epoch": 0.0545950864422202,
"grad_norm": 1.7705796593126653,
"learning_rate": 4.9985536732572124e-06,
"loss": 0.0836,
"step": 120
},
{
"epoch": 0.05505004549590537,
"grad_norm": 1.4380115814084553,
"learning_rate": 4.998529265503891e-06,
"loss": 0.0714,
"step": 121
},
{
"epoch": 0.055505004549590536,
"grad_norm": 1.841019746353449,
"learning_rate": 4.9985046535820416e-06,
"loss": 0.0925,
"step": 122
},
{
"epoch": 0.05595996360327571,
"grad_norm": 2.13633472088372,
"learning_rate": 4.998479837493675e-06,
"loss": 0.1098,
"step": 123
},
{
"epoch": 0.056414922656960874,
"grad_norm": 1.6795956051728682,
"learning_rate": 4.9984548172408195e-06,
"loss": 0.0623,
"step": 124
},
{
"epoch": 0.05686988171064604,
"grad_norm": 7.146738489798405,
"learning_rate": 4.998429592825519e-06,
"loss": 0.1803,
"step": 125
},
{
"epoch": 0.05732484076433121,
"grad_norm": 2.17497011974541,
"learning_rate": 4.998404164249835e-06,
"loss": 0.1209,
"step": 126
},
{
"epoch": 0.05777979981801638,
"grad_norm": 1.9663385354035616,
"learning_rate": 4.998378531515845e-06,
"loss": 0.0704,
"step": 127
},
{
"epoch": 0.05823475887170155,
"grad_norm": 2.398444068788508,
"learning_rate": 4.998352694625645e-06,
"loss": 0.0819,
"step": 128
},
{
"epoch": 0.058689717925386714,
"grad_norm": 1.5854929257305652,
"learning_rate": 4.998326653581343e-06,
"loss": 0.0775,
"step": 129
},
{
"epoch": 0.059144676979071886,
"grad_norm": 1.8831317521751245,
"learning_rate": 4.998300408385072e-06,
"loss": 0.0895,
"step": 130
},
{
"epoch": 0.05959963603275705,
"grad_norm": 2.624836374744882,
"learning_rate": 4.998273959038972e-06,
"loss": 0.1398,
"step": 131
},
{
"epoch": 0.06005459508644222,
"grad_norm": 1.8281764860819427,
"learning_rate": 4.998247305545207e-06,
"loss": 0.0979,
"step": 132
},
{
"epoch": 0.06050955414012739,
"grad_norm": 1.4175605750366638,
"learning_rate": 4.998220447905953e-06,
"loss": 0.0674,
"step": 133
},
{
"epoch": 0.060964513193812554,
"grad_norm": 2.0007328792439307,
"learning_rate": 4.998193386123408e-06,
"loss": 0.1082,
"step": 134
},
{
"epoch": 0.061419472247497726,
"grad_norm": 2.2534593276871355,
"learning_rate": 4.99816612019978e-06,
"loss": 0.1165,
"step": 135
},
{
"epoch": 0.06187443130118289,
"grad_norm": 7.223128092677242,
"learning_rate": 4.998138650137298e-06,
"loss": 0.1547,
"step": 136
},
{
"epoch": 0.062329390354868064,
"grad_norm": 2.0541187438324178,
"learning_rate": 4.998110975938208e-06,
"loss": 0.1153,
"step": 137
},
{
"epoch": 0.06278434940855324,
"grad_norm": 2.900003934434033,
"learning_rate": 4.998083097604769e-06,
"loss": 0.1227,
"step": 138
},
{
"epoch": 0.0632393084622384,
"grad_norm": 2.9930382656276655,
"learning_rate": 4.998055015139261e-06,
"loss": 0.0671,
"step": 139
},
{
"epoch": 0.06369426751592357,
"grad_norm": 1.8183166737473904,
"learning_rate": 4.998026728543979e-06,
"loss": 0.0879,
"step": 140
},
{
"epoch": 0.06414922656960874,
"grad_norm": 1.750231162848612,
"learning_rate": 4.997998237821233e-06,
"loss": 0.0973,
"step": 141
},
{
"epoch": 0.0646041856232939,
"grad_norm": 1.531092755332603,
"learning_rate": 4.997969542973352e-06,
"loss": 0.0755,
"step": 142
},
{
"epoch": 0.06505914467697907,
"grad_norm": 2.106588666489457,
"learning_rate": 4.997940644002681e-06,
"loss": 0.1014,
"step": 143
},
{
"epoch": 0.06551410373066424,
"grad_norm": 2.4260145417995513,
"learning_rate": 4.997911540911581e-06,
"loss": 0.0992,
"step": 144
},
{
"epoch": 0.06596906278434941,
"grad_norm": 1.9957158387709846,
"learning_rate": 4.99788223370243e-06,
"loss": 0.1074,
"step": 145
},
{
"epoch": 0.06642402183803457,
"grad_norm": 2.7359115449729385,
"learning_rate": 4.9978527223776245e-06,
"loss": 0.1298,
"step": 146
},
{
"epoch": 0.06687898089171974,
"grad_norm": 1.4774963397056595,
"learning_rate": 4.9978230069395735e-06,
"loss": 0.0725,
"step": 147
},
{
"epoch": 0.06733393994540492,
"grad_norm": 2.4431671333335188,
"learning_rate": 4.9977930873907065e-06,
"loss": 0.0983,
"step": 148
},
{
"epoch": 0.06778889899909009,
"grad_norm": 1.9906443670591782,
"learning_rate": 4.997762963733468e-06,
"loss": 0.1039,
"step": 149
},
{
"epoch": 0.06824385805277525,
"grad_norm": 2.0201798980001517,
"learning_rate": 4.997732635970321e-06,
"loss": 0.085,
"step": 150
},
{
"epoch": 0.06869881710646042,
"grad_norm": 1.7461931203369137,
"learning_rate": 4.9977021041037425e-06,
"loss": 0.0884,
"step": 151
},
{
"epoch": 0.06915377616014559,
"grad_norm": 2.339191302020108,
"learning_rate": 4.9976713681362265e-06,
"loss": 0.1159,
"step": 152
},
{
"epoch": 0.06960873521383075,
"grad_norm": 2.314166753359135,
"learning_rate": 4.997640428070286e-06,
"loss": 0.1338,
"step": 153
},
{
"epoch": 0.07006369426751592,
"grad_norm": 1.5963391451568967,
"learning_rate": 4.99760928390845e-06,
"loss": 0.0575,
"step": 154
},
{
"epoch": 0.0705186533212011,
"grad_norm": 1.7788915412646347,
"learning_rate": 4.997577935653262e-06,
"loss": 0.08,
"step": 155
},
{
"epoch": 0.07097361237488627,
"grad_norm": 1.5840889143049688,
"learning_rate": 4.9975463833072835e-06,
"loss": 0.0709,
"step": 156
},
{
"epoch": 0.07142857142857142,
"grad_norm": 2.1242834812157962,
"learning_rate": 4.997514626873093e-06,
"loss": 0.1078,
"step": 157
},
{
"epoch": 0.0718835304822566,
"grad_norm": 1.7256733994251798,
"learning_rate": 4.997482666353287e-06,
"loss": 0.0678,
"step": 158
},
{
"epoch": 0.07233848953594177,
"grad_norm": 2.2088750555704073,
"learning_rate": 4.997450501750476e-06,
"loss": 0.0981,
"step": 159
},
{
"epoch": 0.07279344858962693,
"grad_norm": 1.817598507902073,
"learning_rate": 4.997418133067288e-06,
"loss": 0.0829,
"step": 160
},
{
"epoch": 0.0732484076433121,
"grad_norm": 1.9174894618752205,
"learning_rate": 4.997385560306368e-06,
"loss": 0.0922,
"step": 161
},
{
"epoch": 0.07370336669699727,
"grad_norm": 1.7975593397664607,
"learning_rate": 4.997352783470379e-06,
"loss": 0.093,
"step": 162
},
{
"epoch": 0.07415832575068244,
"grad_norm": 2.1789877377155147,
"learning_rate": 4.997319802561997e-06,
"loss": 0.1044,
"step": 163
},
{
"epoch": 0.0746132848043676,
"grad_norm": 1.5046722090412417,
"learning_rate": 4.9972866175839196e-06,
"loss": 0.0806,
"step": 164
},
{
"epoch": 0.07506824385805277,
"grad_norm": 1.828261506678391,
"learning_rate": 4.9972532285388575e-06,
"loss": 0.1018,
"step": 165
},
{
"epoch": 0.07552320291173795,
"grad_norm": 1.853289616987827,
"learning_rate": 4.997219635429538e-06,
"loss": 0.1177,
"step": 166
},
{
"epoch": 0.07597816196542312,
"grad_norm": 1.9172069323651033,
"learning_rate": 4.997185838258709e-06,
"loss": 0.0817,
"step": 167
},
{
"epoch": 0.07643312101910828,
"grad_norm": 1.6956924002006215,
"learning_rate": 4.997151837029129e-06,
"loss": 0.0679,
"step": 168
},
{
"epoch": 0.07688808007279345,
"grad_norm": 1.8575330553269362,
"learning_rate": 4.997117631743579e-06,
"loss": 0.0855,
"step": 169
},
{
"epoch": 0.07734303912647862,
"grad_norm": 1.7266908578071283,
"learning_rate": 4.997083222404852e-06,
"loss": 0.0625,
"step": 170
},
{
"epoch": 0.07779799818016378,
"grad_norm": 1.6397125044179104,
"learning_rate": 4.997048609015762e-06,
"loss": 0.0751,
"step": 171
},
{
"epoch": 0.07825295723384895,
"grad_norm": 1.5340896344557344,
"learning_rate": 4.997013791579136e-06,
"loss": 0.0786,
"step": 172
},
{
"epoch": 0.07870791628753412,
"grad_norm": 1.9189331650587453,
"learning_rate": 4.996978770097819e-06,
"loss": 0.0953,
"step": 173
},
{
"epoch": 0.0791628753412193,
"grad_norm": 1.7773721601434869,
"learning_rate": 4.996943544574673e-06,
"loss": 0.083,
"step": 174
},
{
"epoch": 0.07961783439490445,
"grad_norm": 1.7663708027835396,
"learning_rate": 4.996908115012576e-06,
"loss": 0.0711,
"step": 175
},
{
"epoch": 0.08007279344858963,
"grad_norm": 2.0988130747441462,
"learning_rate": 4.996872481414425e-06,
"loss": 0.1068,
"step": 176
},
{
"epoch": 0.0805277525022748,
"grad_norm": 3.491649419917669,
"learning_rate": 4.9968366437831305e-06,
"loss": 0.1596,
"step": 177
},
{
"epoch": 0.08098271155595996,
"grad_norm": 0.9772529604089312,
"learning_rate": 4.99680060212162e-06,
"loss": 0.0469,
"step": 178
},
{
"epoch": 0.08143767060964513,
"grad_norm": 1.411497576217555,
"learning_rate": 4.996764356432841e-06,
"loss": 0.0799,
"step": 179
},
{
"epoch": 0.0818926296633303,
"grad_norm": 1.9634897057091474,
"learning_rate": 4.996727906719754e-06,
"loss": 0.0818,
"step": 180
},
{
"epoch": 0.08234758871701547,
"grad_norm": 1.8622777856402457,
"learning_rate": 4.9966912529853365e-06,
"loss": 0.0654,
"step": 181
},
{
"epoch": 0.08280254777070063,
"grad_norm": 1.6338074095796988,
"learning_rate": 4.996654395232585e-06,
"loss": 0.0744,
"step": 182
},
{
"epoch": 0.0832575068243858,
"grad_norm": 1.534919993971643,
"learning_rate": 4.996617333464512e-06,
"loss": 0.0639,
"step": 183
},
{
"epoch": 0.08371246587807098,
"grad_norm": 1.7391379315757225,
"learning_rate": 4.996580067684145e-06,
"loss": 0.0715,
"step": 184
},
{
"epoch": 0.08416742493175614,
"grad_norm": 1.7215093643580193,
"learning_rate": 4.996542597894528e-06,
"loss": 0.1192,
"step": 185
},
{
"epoch": 0.08462238398544131,
"grad_norm": 2.041088124472192,
"learning_rate": 4.996504924098726e-06,
"loss": 0.1078,
"step": 186
},
{
"epoch": 0.08507734303912648,
"grad_norm": 1.7083926900772908,
"learning_rate": 4.9964670462998145e-06,
"loss": 0.0922,
"step": 187
},
{
"epoch": 0.08553230209281165,
"grad_norm": 1.9950587953196364,
"learning_rate": 4.99642896450089e-06,
"loss": 0.125,
"step": 188
},
{
"epoch": 0.08598726114649681,
"grad_norm": 2.2702904646099022,
"learning_rate": 4.9963906787050656e-06,
"loss": 0.1318,
"step": 189
},
{
"epoch": 0.08644222020018198,
"grad_norm": 1.5062676480402928,
"learning_rate": 4.996352188915467e-06,
"loss": 0.0621,
"step": 190
},
{
"epoch": 0.08689717925386715,
"grad_norm": 2.6764229211241153,
"learning_rate": 4.996313495135242e-06,
"loss": 0.1112,
"step": 191
},
{
"epoch": 0.08735213830755233,
"grad_norm": 2.276483991348045,
"learning_rate": 4.9962745973675505e-06,
"loss": 0.1219,
"step": 192
},
{
"epoch": 0.08780709736123748,
"grad_norm": 1.4375762261827663,
"learning_rate": 4.996235495615572e-06,
"loss": 0.0641,
"step": 193
},
{
"epoch": 0.08826205641492266,
"grad_norm": 2.3164336329931094,
"learning_rate": 4.996196189882503e-06,
"loss": 0.1176,
"step": 194
},
{
"epoch": 0.08871701546860783,
"grad_norm": 2.225732764096407,
"learning_rate": 4.996156680171552e-06,
"loss": 0.1096,
"step": 195
},
{
"epoch": 0.08917197452229299,
"grad_norm": 1.8464739663611849,
"learning_rate": 4.996116966485951e-06,
"loss": 0.0817,
"step": 196
},
{
"epoch": 0.08962693357597816,
"grad_norm": 1.9290667932284378,
"learning_rate": 4.996077048828944e-06,
"loss": 0.1106,
"step": 197
},
{
"epoch": 0.09008189262966333,
"grad_norm": 1.6322378586848272,
"learning_rate": 4.996036927203793e-06,
"loss": 0.0972,
"step": 198
},
{
"epoch": 0.0905368516833485,
"grad_norm": 2.2100804969645416,
"learning_rate": 4.995996601613775e-06,
"loss": 0.0944,
"step": 199
},
{
"epoch": 0.09099181073703366,
"grad_norm": 1.5641835045850314,
"learning_rate": 4.9959560720621875e-06,
"loss": 0.0896,
"step": 200
},
{
"epoch": 0.09144676979071883,
"grad_norm": 2.2116837789953117,
"learning_rate": 4.995915338552341e-06,
"loss": 0.1331,
"step": 201
},
{
"epoch": 0.09190172884440401,
"grad_norm": 1.8792253280188753,
"learning_rate": 4.995874401087565e-06,
"loss": 0.0967,
"step": 202
},
{
"epoch": 0.09235668789808917,
"grad_norm": 2.167978668790899,
"learning_rate": 4.9958332596712035e-06,
"loss": 0.1141,
"step": 203
},
{
"epoch": 0.09281164695177434,
"grad_norm": 1.8621318139110883,
"learning_rate": 4.99579191430662e-06,
"loss": 0.0972,
"step": 204
},
{
"epoch": 0.09326660600545951,
"grad_norm": 1.8429430162012657,
"learning_rate": 4.995750364997192e-06,
"loss": 0.0967,
"step": 205
},
{
"epoch": 0.09372156505914468,
"grad_norm": 1.5424629326591568,
"learning_rate": 4.995708611746314e-06,
"loss": 0.0814,
"step": 206
},
{
"epoch": 0.09417652411282984,
"grad_norm": 2.0700985381007904,
"learning_rate": 4.995666654557399e-06,
"loss": 0.1038,
"step": 207
},
{
"epoch": 0.09463148316651501,
"grad_norm": 1.8765344045928045,
"learning_rate": 4.995624493433876e-06,
"loss": 0.1075,
"step": 208
},
{
"epoch": 0.09508644222020018,
"grad_norm": 1.8732891178471252,
"learning_rate": 4.995582128379189e-06,
"loss": 0.1001,
"step": 209
},
{
"epoch": 0.09554140127388536,
"grad_norm": 2.1418545940903373,
"learning_rate": 4.9955395593968e-06,
"loss": 0.1463,
"step": 210
},
{
"epoch": 0.09599636032757052,
"grad_norm": 1.905821465202796,
"learning_rate": 4.99549678649019e-06,
"loss": 0.0848,
"step": 211
},
{
"epoch": 0.09645131938125569,
"grad_norm": 1.7581366634538098,
"learning_rate": 4.99545380966285e-06,
"loss": 0.0976,
"step": 212
},
{
"epoch": 0.09690627843494086,
"grad_norm": 2.133882292644339,
"learning_rate": 4.995410628918294e-06,
"loss": 0.1036,
"step": 213
},
{
"epoch": 0.09736123748862602,
"grad_norm": 1.6491455235555508,
"learning_rate": 4.995367244260052e-06,
"loss": 0.1,
"step": 214
},
{
"epoch": 0.09781619654231119,
"grad_norm": 1.372315749578445,
"learning_rate": 4.995323655691667e-06,
"loss": 0.0543,
"step": 215
},
{
"epoch": 0.09827115559599636,
"grad_norm": 2.2929084487384297,
"learning_rate": 4.995279863216702e-06,
"loss": 0.1005,
"step": 216
},
{
"epoch": 0.09872611464968153,
"grad_norm": 1.8371182479654964,
"learning_rate": 4.995235866838735e-06,
"loss": 0.096,
"step": 217
},
{
"epoch": 0.09918107370336669,
"grad_norm": 1.4189314035725125,
"learning_rate": 4.995191666561361e-06,
"loss": 0.0707,
"step": 218
},
{
"epoch": 0.09963603275705187,
"grad_norm": 1.4036483642687965,
"learning_rate": 4.995147262388192e-06,
"loss": 0.0689,
"step": 219
},
{
"epoch": 0.10009099181073704,
"grad_norm": 1.7382878807357938,
"learning_rate": 4.995102654322858e-06,
"loss": 0.0829,
"step": 220
},
{
"epoch": 0.1005459508644222,
"grad_norm": 1.3102015447280675,
"learning_rate": 4.995057842369002e-06,
"loss": 0.0548,
"step": 221
},
{
"epoch": 0.10100090991810737,
"grad_norm": 1.8490525072637034,
"learning_rate": 4.995012826530287e-06,
"loss": 0.1044,
"step": 222
},
{
"epoch": 0.10145586897179254,
"grad_norm": 2.802543488000276,
"learning_rate": 4.99496760681039e-06,
"loss": 0.1393,
"step": 223
},
{
"epoch": 0.10191082802547771,
"grad_norm": 2.4234245545914295,
"learning_rate": 4.994922183213009e-06,
"loss": 0.1325,
"step": 224
},
{
"epoch": 0.10236578707916287,
"grad_norm": 1.1495372549504432,
"learning_rate": 4.9948765557418535e-06,
"loss": 0.0585,
"step": 225
},
{
"epoch": 0.10282074613284804,
"grad_norm": 2.1666263724534267,
"learning_rate": 4.994830724400653e-06,
"loss": 0.1063,
"step": 226
},
{
"epoch": 0.10327570518653321,
"grad_norm": 1.7066677970234532,
"learning_rate": 4.994784689193151e-06,
"loss": 0.1002,
"step": 227
},
{
"epoch": 0.10373066424021839,
"grad_norm": 1.5304723941528642,
"learning_rate": 4.994738450123111e-06,
"loss": 0.0825,
"step": 228
},
{
"epoch": 0.10418562329390355,
"grad_norm": 2.1125485884299486,
"learning_rate": 4.994692007194312e-06,
"loss": 0.1089,
"step": 229
},
{
"epoch": 0.10464058234758872,
"grad_norm": 1.4297773182355138,
"learning_rate": 4.994645360410547e-06,
"loss": 0.0855,
"step": 230
},
{
"epoch": 0.10509554140127389,
"grad_norm": 1.741498602747005,
"learning_rate": 4.99459850977563e-06,
"loss": 0.0884,
"step": 231
},
{
"epoch": 0.10555050045495905,
"grad_norm": 1.6875366585424447,
"learning_rate": 4.994551455293388e-06,
"loss": 0.068,
"step": 232
},
{
"epoch": 0.10600545950864422,
"grad_norm": 2.03347527932056,
"learning_rate": 4.9945041969676654e-06,
"loss": 0.0997,
"step": 233
},
{
"epoch": 0.10646041856232939,
"grad_norm": 1.5553350034126536,
"learning_rate": 4.994456734802325e-06,
"loss": 0.0709,
"step": 234
},
{
"epoch": 0.10691537761601456,
"grad_norm": 1.354348073951093,
"learning_rate": 4.994409068801247e-06,
"loss": 0.0858,
"step": 235
},
{
"epoch": 0.10737033666969972,
"grad_norm": 1.6048007960766557,
"learning_rate": 4.994361198968323e-06,
"loss": 0.0891,
"step": 236
},
{
"epoch": 0.1078252957233849,
"grad_norm": 2.3380973830643663,
"learning_rate": 4.994313125307466e-06,
"loss": 0.116,
"step": 237
},
{
"epoch": 0.10828025477707007,
"grad_norm": 1.68606521406513,
"learning_rate": 4.994264847822605e-06,
"loss": 0.09,
"step": 238
},
{
"epoch": 0.10873521383075523,
"grad_norm": 2.0274881934833715,
"learning_rate": 4.994216366517684e-06,
"loss": 0.0856,
"step": 239
},
{
"epoch": 0.1091901728844404,
"grad_norm": 1.9224041067300894,
"learning_rate": 4.994167681396667e-06,
"loss": 0.1032,
"step": 240
},
{
"epoch": 0.10964513193812557,
"grad_norm": 2.213562554498921,
"learning_rate": 4.994118792463529e-06,
"loss": 0.1125,
"step": 241
},
{
"epoch": 0.11010009099181074,
"grad_norm": 2.396477374166045,
"learning_rate": 4.994069699722267e-06,
"loss": 0.16,
"step": 242
},
{
"epoch": 0.1105550500454959,
"grad_norm": 1.6621616457271884,
"learning_rate": 4.994020403176893e-06,
"loss": 0.1088,
"step": 243
},
{
"epoch": 0.11101000909918107,
"grad_norm": 2.0137991000965862,
"learning_rate": 4.9939709028314345e-06,
"loss": 0.1203,
"step": 244
},
{
"epoch": 0.11146496815286625,
"grad_norm": 1.731498246221376,
"learning_rate": 4.993921198689935e-06,
"loss": 0.0779,
"step": 245
},
{
"epoch": 0.11191992720655142,
"grad_norm": 1.53319841517271,
"learning_rate": 4.993871290756459e-06,
"loss": 0.0859,
"step": 246
},
{
"epoch": 0.11237488626023658,
"grad_norm": 1.5738861001818754,
"learning_rate": 4.9938211790350835e-06,
"loss": 0.0822,
"step": 247
},
{
"epoch": 0.11282984531392175,
"grad_norm": 1.795556137822037,
"learning_rate": 4.993770863529902e-06,
"loss": 0.1082,
"step": 248
},
{
"epoch": 0.11328480436760692,
"grad_norm": 1.753136266606954,
"learning_rate": 4.993720344245029e-06,
"loss": 0.0826,
"step": 249
},
{
"epoch": 0.11373976342129208,
"grad_norm": 1.724266476242851,
"learning_rate": 4.99366962118459e-06,
"loss": 0.0851,
"step": 250
},
{
"epoch": 0.11419472247497725,
"grad_norm": 1.8081901179247517,
"learning_rate": 4.99361869435273e-06,
"loss": 0.0965,
"step": 251
},
{
"epoch": 0.11464968152866242,
"grad_norm": 2.064401083784083,
"learning_rate": 4.993567563753613e-06,
"loss": 0.0881,
"step": 252
},
{
"epoch": 0.1151046405823476,
"grad_norm": 1.6354098857617054,
"learning_rate": 4.993516229391414e-06,
"loss": 0.0933,
"step": 253
},
{
"epoch": 0.11555959963603275,
"grad_norm": 1.2711881947711132,
"learning_rate": 4.993464691270331e-06,
"loss": 0.0595,
"step": 254
},
{
"epoch": 0.11601455868971793,
"grad_norm": 1.5847340722430843,
"learning_rate": 4.993412949394572e-06,
"loss": 0.0812,
"step": 255
},
{
"epoch": 0.1164695177434031,
"grad_norm": 1.5774467606957123,
"learning_rate": 4.993361003768369e-06,
"loss": 0.081,
"step": 256
},
{
"epoch": 0.11692447679708826,
"grad_norm": 1.3573852133613107,
"learning_rate": 4.993308854395963e-06,
"loss": 0.0812,
"step": 257
},
{
"epoch": 0.11737943585077343,
"grad_norm": 1.5273272920136396,
"learning_rate": 4.993256501281618e-06,
"loss": 0.0634,
"step": 258
},
{
"epoch": 0.1178343949044586,
"grad_norm": 1.8382646613112785,
"learning_rate": 4.993203944429611e-06,
"loss": 0.1145,
"step": 259
},
{
"epoch": 0.11828935395814377,
"grad_norm": 1.5747608705636602,
"learning_rate": 4.993151183844236e-06,
"loss": 0.0801,
"step": 260
},
{
"epoch": 0.11874431301182893,
"grad_norm": 1.7065433305132354,
"learning_rate": 4.9930982195298065e-06,
"loss": 0.0742,
"step": 261
},
{
"epoch": 0.1191992720655141,
"grad_norm": 1.709109441111134,
"learning_rate": 4.9930450514906484e-06,
"loss": 0.1028,
"step": 262
},
{
"epoch": 0.11965423111919928,
"grad_norm": 1.6959707782927067,
"learning_rate": 4.9929916797311075e-06,
"loss": 0.0791,
"step": 263
},
{
"epoch": 0.12010919017288443,
"grad_norm": 2.374639715905283,
"learning_rate": 4.992938104255545e-06,
"loss": 0.1477,
"step": 264
},
{
"epoch": 0.1205641492265696,
"grad_norm": 1.6263809057131815,
"learning_rate": 4.992884325068339e-06,
"loss": 0.0916,
"step": 265
},
{
"epoch": 0.12101910828025478,
"grad_norm": 1.6207164559915699,
"learning_rate": 4.992830342173882e-06,
"loss": 0.1068,
"step": 266
},
{
"epoch": 0.12147406733393995,
"grad_norm": 2.0552449766971823,
"learning_rate": 4.992776155576589e-06,
"loss": 0.1145,
"step": 267
},
{
"epoch": 0.12192902638762511,
"grad_norm": 1.6692049909432523,
"learning_rate": 4.992721765280884e-06,
"loss": 0.1172,
"step": 268
},
{
"epoch": 0.12238398544131028,
"grad_norm": 2.456621954888186,
"learning_rate": 4.992667171291215e-06,
"loss": 0.1267,
"step": 269
},
{
"epoch": 0.12283894449499545,
"grad_norm": 1.5125250812884448,
"learning_rate": 4.992612373612042e-06,
"loss": 0.0661,
"step": 270
},
{
"epoch": 0.12329390354868063,
"grad_norm": 2.0952324870431553,
"learning_rate": 4.99255737224784e-06,
"loss": 0.0917,
"step": 271
},
{
"epoch": 0.12374886260236578,
"grad_norm": 1.4094336450761362,
"learning_rate": 4.9925021672031075e-06,
"loss": 0.0905,
"step": 272
},
{
"epoch": 0.12420382165605096,
"grad_norm": 2.239902062561175,
"learning_rate": 4.992446758482353e-06,
"loss": 0.0995,
"step": 273
},
{
"epoch": 0.12465878070973613,
"grad_norm": 2.696125395972354,
"learning_rate": 4.992391146090106e-06,
"loss": 0.1613,
"step": 274
},
{
"epoch": 0.1251137397634213,
"grad_norm": 1.4853155964847005,
"learning_rate": 4.99233533003091e-06,
"loss": 0.0826,
"step": 275
},
{
"epoch": 0.12556869881710647,
"grad_norm": 1.5393545957542452,
"learning_rate": 4.992279310309326e-06,
"loss": 0.1128,
"step": 276
},
{
"epoch": 0.12602365787079162,
"grad_norm": 2.4236941073693283,
"learning_rate": 4.9922230869299316e-06,
"loss": 0.1607,
"step": 277
},
{
"epoch": 0.1264786169244768,
"grad_norm": 1.6611888199243576,
"learning_rate": 4.992166659897321e-06,
"loss": 0.1005,
"step": 278
},
{
"epoch": 0.12693357597816196,
"grad_norm": 1.3896864345667146,
"learning_rate": 4.992110029216106e-06,
"loss": 0.079,
"step": 279
},
{
"epoch": 0.12738853503184713,
"grad_norm": 1.3647278081745937,
"learning_rate": 4.992053194890914e-06,
"loss": 0.0767,
"step": 280
},
{
"epoch": 0.1278434940855323,
"grad_norm": 2.0323876810575525,
"learning_rate": 4.991996156926388e-06,
"loss": 0.101,
"step": 281
},
{
"epoch": 0.12829845313921748,
"grad_norm": 1.948481701516796,
"learning_rate": 4.9919389153271904e-06,
"loss": 0.106,
"step": 282
},
{
"epoch": 0.12875341219290265,
"grad_norm": 1.3512588403363923,
"learning_rate": 4.991881470097998e-06,
"loss": 0.0897,
"step": 283
},
{
"epoch": 0.1292083712465878,
"grad_norm": 1.4862053800013564,
"learning_rate": 4.991823821243505e-06,
"loss": 0.0898,
"step": 284
},
{
"epoch": 0.12966333030027297,
"grad_norm": 2.287612016528911,
"learning_rate": 4.991765968768422e-06,
"loss": 0.1048,
"step": 285
},
{
"epoch": 0.13011828935395814,
"grad_norm": 1.8190624177647585,
"learning_rate": 4.991707912677477e-06,
"loss": 0.076,
"step": 286
},
{
"epoch": 0.1305732484076433,
"grad_norm": 1.4178411985180965,
"learning_rate": 4.991649652975414e-06,
"loss": 0.062,
"step": 287
},
{
"epoch": 0.13102820746132848,
"grad_norm": 1.7010811854624341,
"learning_rate": 4.991591189666994e-06,
"loss": 0.0928,
"step": 288
},
{
"epoch": 0.13148316651501366,
"grad_norm": 1.7824920481002249,
"learning_rate": 4.991532522756993e-06,
"loss": 0.09,
"step": 289
},
{
"epoch": 0.13193812556869883,
"grad_norm": 1.12093519239752,
"learning_rate": 4.991473652250207e-06,
"loss": 0.0564,
"step": 290
},
{
"epoch": 0.13239308462238397,
"grad_norm": 1.4956629959050902,
"learning_rate": 4.991414578151445e-06,
"loss": 0.0777,
"step": 291
},
{
"epoch": 0.13284804367606914,
"grad_norm": 3.467748085139679,
"learning_rate": 4.991355300465535e-06,
"loss": 0.193,
"step": 292
},
{
"epoch": 0.13330300272975432,
"grad_norm": 1.746518786410603,
"learning_rate": 4.99129581919732e-06,
"loss": 0.0862,
"step": 293
},
{
"epoch": 0.1337579617834395,
"grad_norm": 1.3513400373127227,
"learning_rate": 4.9912361343516616e-06,
"loss": 0.0588,
"step": 294
},
{
"epoch": 0.13421292083712466,
"grad_norm": 1.7841617467512154,
"learning_rate": 4.991176245933437e-06,
"loss": 0.0982,
"step": 295
},
{
"epoch": 0.13466787989080983,
"grad_norm": 1.6650575824861316,
"learning_rate": 4.9911161539475385e-06,
"loss": 0.0868,
"step": 296
},
{
"epoch": 0.135122838944495,
"grad_norm": 2.0850606622795667,
"learning_rate": 4.991055858398879e-06,
"loss": 0.1087,
"step": 297
},
{
"epoch": 0.13557779799818018,
"grad_norm": 2.27094495258401,
"learning_rate": 4.990995359292384e-06,
"loss": 0.1177,
"step": 298
},
{
"epoch": 0.13603275705186532,
"grad_norm": 1.8175215978998918,
"learning_rate": 4.990934656632997e-06,
"loss": 0.1029,
"step": 299
},
{
"epoch": 0.1364877161055505,
"grad_norm": 1.9580713421337124,
"learning_rate": 4.990873750425679e-06,
"loss": 0.0842,
"step": 300
},
{
"epoch": 0.13694267515923567,
"grad_norm": 1.5378181370134305,
"learning_rate": 4.990812640675406e-06,
"loss": 0.0813,
"step": 301
},
{
"epoch": 0.13739763421292084,
"grad_norm": 1.4646500614646956,
"learning_rate": 4.990751327387174e-06,
"loss": 0.0642,
"step": 302
},
{
"epoch": 0.137852593266606,
"grad_norm": 1.7132953215338962,
"learning_rate": 4.99068981056599e-06,
"loss": 0.0921,
"step": 303
},
{
"epoch": 0.13830755232029118,
"grad_norm": 2.020828034549401,
"learning_rate": 4.990628090216885e-06,
"loss": 0.1164,
"step": 304
},
{
"epoch": 0.13876251137397635,
"grad_norm": 1.4167009033800524,
"learning_rate": 4.990566166344898e-06,
"loss": 0.0695,
"step": 305
},
{
"epoch": 0.1392174704276615,
"grad_norm": 1.743676237886539,
"learning_rate": 4.990504038955092e-06,
"loss": 0.1083,
"step": 306
},
{
"epoch": 0.13967242948134667,
"grad_norm": 1.8343720931834766,
"learning_rate": 4.990441708052542e-06,
"loss": 0.0985,
"step": 307
},
{
"epoch": 0.14012738853503184,
"grad_norm": 1.4113998497835858,
"learning_rate": 4.9903791736423435e-06,
"loss": 0.081,
"step": 308
},
{
"epoch": 0.14058234758871702,
"grad_norm": 1.8830612535708886,
"learning_rate": 4.9903164357296044e-06,
"loss": 0.0954,
"step": 309
},
{
"epoch": 0.1410373066424022,
"grad_norm": 1.4208829323408247,
"learning_rate": 4.990253494319453e-06,
"loss": 0.0919,
"step": 310
},
{
"epoch": 0.14149226569608736,
"grad_norm": 1.3671067756437636,
"learning_rate": 4.990190349417032e-06,
"loss": 0.0928,
"step": 311
},
{
"epoch": 0.14194722474977253,
"grad_norm": 1.965673083316737,
"learning_rate": 4.990127001027501e-06,
"loss": 0.0849,
"step": 312
},
{
"epoch": 0.14240218380345768,
"grad_norm": 1.3933093475773835,
"learning_rate": 4.990063449156037e-06,
"loss": 0.0735,
"step": 313
},
{
"epoch": 0.14285714285714285,
"grad_norm": 1.8960360183192995,
"learning_rate": 4.989999693807832e-06,
"loss": 0.1141,
"step": 314
},
{
"epoch": 0.14331210191082802,
"grad_norm": 1.8316795975938271,
"learning_rate": 4.989935734988098e-06,
"loss": 0.1084,
"step": 315
},
{
"epoch": 0.1437670609645132,
"grad_norm": 1.6451238367574679,
"learning_rate": 4.98987157270206e-06,
"loss": 0.0739,
"step": 316
},
{
"epoch": 0.14422202001819837,
"grad_norm": 2.0644883617404854,
"learning_rate": 4.989807206954961e-06,
"loss": 0.1125,
"step": 317
},
{
"epoch": 0.14467697907188354,
"grad_norm": 1.322196438354388,
"learning_rate": 4.9897426377520605e-06,
"loss": 0.0792,
"step": 318
},
{
"epoch": 0.1451319381255687,
"grad_norm": 2.568915637493138,
"learning_rate": 4.989677865098636e-06,
"loss": 0.1236,
"step": 319
},
{
"epoch": 0.14558689717925385,
"grad_norm": 1.1659492648591403,
"learning_rate": 4.989612888999978e-06,
"loss": 0.0624,
"step": 320
},
{
"epoch": 0.14604185623293903,
"grad_norm": 1.431829324891758,
"learning_rate": 4.9895477094614e-06,
"loss": 0.0855,
"step": 321
},
{
"epoch": 0.1464968152866242,
"grad_norm": 1.1704367288212936,
"learning_rate": 4.989482326488225e-06,
"loss": 0.0741,
"step": 322
},
{
"epoch": 0.14695177434030937,
"grad_norm": 1.6170438514885752,
"learning_rate": 4.989416740085796e-06,
"loss": 0.1057,
"step": 323
},
{
"epoch": 0.14740673339399454,
"grad_norm": 1.639627544263893,
"learning_rate": 4.9893509502594735e-06,
"loss": 0.0784,
"step": 324
},
{
"epoch": 0.14786169244767972,
"grad_norm": 1.6437318926278874,
"learning_rate": 4.9892849570146335e-06,
"loss": 0.1105,
"step": 325
},
{
"epoch": 0.1483166515013649,
"grad_norm": 1.6588510281862943,
"learning_rate": 4.989218760356668e-06,
"loss": 0.106,
"step": 326
},
{
"epoch": 0.14877161055505003,
"grad_norm": 1.692767253326721,
"learning_rate": 4.989152360290987e-06,
"loss": 0.1068,
"step": 327
},
{
"epoch": 0.1492265696087352,
"grad_norm": 2.117777475502305,
"learning_rate": 4.989085756823015e-06,
"loss": 0.1274,
"step": 328
},
{
"epoch": 0.14968152866242038,
"grad_norm": 1.6877038030416243,
"learning_rate": 4.989018949958197e-06,
"loss": 0.1001,
"step": 329
},
{
"epoch": 0.15013648771610555,
"grad_norm": 2.018139319167573,
"learning_rate": 4.98895193970199e-06,
"loss": 0.0726,
"step": 330
},
{
"epoch": 0.15059144676979072,
"grad_norm": 1.7601822979826238,
"learning_rate": 4.9888847260598705e-06,
"loss": 0.0884,
"step": 331
},
{
"epoch": 0.1510464058234759,
"grad_norm": 2.153451550499006,
"learning_rate": 4.98881730903733e-06,
"loss": 0.1263,
"step": 332
},
{
"epoch": 0.15150136487716107,
"grad_norm": 1.7037846763057773,
"learning_rate": 4.98874968863988e-06,
"loss": 0.1017,
"step": 333
},
{
"epoch": 0.15195632393084624,
"grad_norm": 1.6373036503866722,
"learning_rate": 4.988681864873044e-06,
"loss": 0.0936,
"step": 334
},
{
"epoch": 0.15241128298453138,
"grad_norm": 1.5043938510579566,
"learning_rate": 4.988613837742364e-06,
"loss": 0.0841,
"step": 335
},
{
"epoch": 0.15286624203821655,
"grad_norm": 1.9480098961832564,
"learning_rate": 4.9885456072534015e-06,
"loss": 0.093,
"step": 336
},
{
"epoch": 0.15332120109190173,
"grad_norm": 2.0743334215437845,
"learning_rate": 4.988477173411728e-06,
"loss": 0.1001,
"step": 337
},
{
"epoch": 0.1537761601455869,
"grad_norm": 1.3686100112269117,
"learning_rate": 4.988408536222939e-06,
"loss": 0.0706,
"step": 338
},
{
"epoch": 0.15423111919927207,
"grad_norm": 1.7072624744285279,
"learning_rate": 4.9883396956926416e-06,
"loss": 0.0883,
"step": 339
},
{
"epoch": 0.15468607825295724,
"grad_norm": 1.2178991309049074,
"learning_rate": 4.988270651826462e-06,
"loss": 0.066,
"step": 340
},
{
"epoch": 0.15514103730664242,
"grad_norm": 1.5734145514066031,
"learning_rate": 4.988201404630041e-06,
"loss": 0.0818,
"step": 341
},
{
"epoch": 0.15559599636032756,
"grad_norm": 1.4266019263972631,
"learning_rate": 4.988131954109038e-06,
"loss": 0.0835,
"step": 342
},
{
"epoch": 0.15605095541401273,
"grad_norm": 2.2620036917930633,
"learning_rate": 4.988062300269128e-06,
"loss": 0.1374,
"step": 343
},
{
"epoch": 0.1565059144676979,
"grad_norm": 1.4975643248719304,
"learning_rate": 4.987992443116003e-06,
"loss": 0.0817,
"step": 344
},
{
"epoch": 0.15696087352138308,
"grad_norm": 1.723916950757741,
"learning_rate": 4.987922382655372e-06,
"loss": 0.086,
"step": 345
},
{
"epoch": 0.15741583257506825,
"grad_norm": 2.50033376989197,
"learning_rate": 4.987852118892958e-06,
"loss": 0.1498,
"step": 346
},
{
"epoch": 0.15787079162875342,
"grad_norm": 2.0776125106096934,
"learning_rate": 4.987781651834503e-06,
"loss": 0.1258,
"step": 347
},
{
"epoch": 0.1583257506824386,
"grad_norm": 2.186488732885297,
"learning_rate": 4.987710981485768e-06,
"loss": 0.1203,
"step": 348
},
{
"epoch": 0.15878070973612374,
"grad_norm": 2.0497982262406698,
"learning_rate": 4.987640107852525e-06,
"loss": 0.1365,
"step": 349
},
{
"epoch": 0.1592356687898089,
"grad_norm": 1.394060418907116,
"learning_rate": 4.987569030940567e-06,
"loss": 0.0811,
"step": 350
},
{
"epoch": 0.15969062784349408,
"grad_norm": 1.5257209721345255,
"learning_rate": 4.987497750755702e-06,
"loss": 0.0665,
"step": 351
},
{
"epoch": 0.16014558689717925,
"grad_norm": 2.328076306378438,
"learning_rate": 4.987426267303753e-06,
"loss": 0.1186,
"step": 352
},
{
"epoch": 0.16060054595086443,
"grad_norm": 1.8266119344469305,
"learning_rate": 4.987354580590563e-06,
"loss": 0.1011,
"step": 353
},
{
"epoch": 0.1610555050045496,
"grad_norm": 1.7369452160483552,
"learning_rate": 4.987282690621991e-06,
"loss": 0.117,
"step": 354
},
{
"epoch": 0.16151046405823477,
"grad_norm": 1.8346392689418392,
"learning_rate": 4.987210597403907e-06,
"loss": 0.1,
"step": 355
},
{
"epoch": 0.16196542311191992,
"grad_norm": 1.9402353280122917,
"learning_rate": 4.987138300942208e-06,
"loss": 0.0949,
"step": 356
},
{
"epoch": 0.1624203821656051,
"grad_norm": 1.4819316275042067,
"learning_rate": 4.987065801242798e-06,
"loss": 0.0855,
"step": 357
},
{
"epoch": 0.16287534121929026,
"grad_norm": 1.8440191145455884,
"learning_rate": 4.986993098311601e-06,
"loss": 0.1057,
"step": 358
},
{
"epoch": 0.16333030027297543,
"grad_norm": 1.712390016283102,
"learning_rate": 4.986920192154561e-06,
"loss": 0.0917,
"step": 359
},
{
"epoch": 0.1637852593266606,
"grad_norm": 1.2697535382377623,
"learning_rate": 4.986847082777632e-06,
"loss": 0.0729,
"step": 360
},
{
"epoch": 0.16424021838034578,
"grad_norm": 1.5330396115730802,
"learning_rate": 4.986773770186791e-06,
"loss": 0.0966,
"step": 361
},
{
"epoch": 0.16469517743403095,
"grad_norm": 2.359233717201702,
"learning_rate": 4.986700254388027e-06,
"loss": 0.1308,
"step": 362
},
{
"epoch": 0.1651501364877161,
"grad_norm": 1.330733109747955,
"learning_rate": 4.986626535387349e-06,
"loss": 0.0728,
"step": 363
},
{
"epoch": 0.16560509554140126,
"grad_norm": 1.7398719883146694,
"learning_rate": 4.9865526131907795e-06,
"loss": 0.0893,
"step": 364
},
{
"epoch": 0.16606005459508644,
"grad_norm": 2.018839749017437,
"learning_rate": 4.9864784878043595e-06,
"loss": 0.1268,
"step": 365
},
{
"epoch": 0.1665150136487716,
"grad_norm": 2.439244123753763,
"learning_rate": 4.986404159234146e-06,
"loss": 0.1047,
"step": 366
},
{
"epoch": 0.16696997270245678,
"grad_norm": 1.4077243142655576,
"learning_rate": 4.986329627486213e-06,
"loss": 0.07,
"step": 367
},
{
"epoch": 0.16742493175614195,
"grad_norm": 2.0634194365835583,
"learning_rate": 4.986254892566652e-06,
"loss": 0.1199,
"step": 368
},
{
"epoch": 0.16787989080982713,
"grad_norm": 1.507898380305614,
"learning_rate": 4.9861799544815684e-06,
"loss": 0.0798,
"step": 369
},
{
"epoch": 0.16833484986351227,
"grad_norm": 1.5689447325912511,
"learning_rate": 4.986104813237086e-06,
"loss": 0.0872,
"step": 370
},
{
"epoch": 0.16878980891719744,
"grad_norm": 1.5434828853102547,
"learning_rate": 4.986029468839346e-06,
"loss": 0.0756,
"step": 371
},
{
"epoch": 0.16924476797088261,
"grad_norm": 1.9546839136865664,
"learning_rate": 4.985953921294505e-06,
"loss": 0.129,
"step": 372
},
{
"epoch": 0.1696997270245678,
"grad_norm": 1.4457360634551049,
"learning_rate": 4.985878170608736e-06,
"loss": 0.0651,
"step": 373
},
{
"epoch": 0.17015468607825296,
"grad_norm": 1.7053082159754585,
"learning_rate": 4.985802216788228e-06,
"loss": 0.0786,
"step": 374
},
{
"epoch": 0.17060964513193813,
"grad_norm": 2.0831330601859643,
"learning_rate": 4.98572605983919e-06,
"loss": 0.1087,
"step": 375
},
{
"epoch": 0.1710646041856233,
"grad_norm": 1.3106266925763963,
"learning_rate": 4.985649699767842e-06,
"loss": 0.0666,
"step": 376
},
{
"epoch": 0.17151956323930848,
"grad_norm": 1.5931730936354696,
"learning_rate": 4.985573136580427e-06,
"loss": 0.1015,
"step": 377
},
{
"epoch": 0.17197452229299362,
"grad_norm": 1.3398175715153904,
"learning_rate": 4.9854963702832e-06,
"loss": 0.0706,
"step": 378
},
{
"epoch": 0.1724294813466788,
"grad_norm": 1.4932070031671647,
"learning_rate": 4.985419400882433e-06,
"loss": 0.1009,
"step": 379
},
{
"epoch": 0.17288444040036396,
"grad_norm": 2.05809614886543,
"learning_rate": 4.985342228384418e-06,
"loss": 0.1373,
"step": 380
},
{
"epoch": 0.17333939945404914,
"grad_norm": 25.314485102746445,
"learning_rate": 4.985264852795459e-06,
"loss": 0.529,
"step": 381
},
{
"epoch": 0.1737943585077343,
"grad_norm": 1.3496622625056474,
"learning_rate": 4.98518727412188e-06,
"loss": 0.0792,
"step": 382
},
{
"epoch": 0.17424931756141948,
"grad_norm": 2.042157493841037,
"learning_rate": 4.98510949237002e-06,
"loss": 0.1127,
"step": 383
},
{
"epoch": 0.17470427661510465,
"grad_norm": 2.093747109047391,
"learning_rate": 4.985031507546234e-06,
"loss": 0.0931,
"step": 384
},
{
"epoch": 0.1751592356687898,
"grad_norm": 2.620290737475778,
"learning_rate": 4.984953319656896e-06,
"loss": 0.1258,
"step": 385
},
{
"epoch": 0.17561419472247497,
"grad_norm": 1.7812499192074571,
"learning_rate": 4.984874928708395e-06,
"loss": 0.0934,
"step": 386
},
{
"epoch": 0.17606915377616014,
"grad_norm": 1.9861134139953058,
"learning_rate": 4.984796334707136e-06,
"loss": 0.1105,
"step": 387
},
{
"epoch": 0.17652411282984531,
"grad_norm": 9.71210910528449,
"learning_rate": 4.984717537659542e-06,
"loss": 0.119,
"step": 388
},
{
"epoch": 0.1769790718835305,
"grad_norm": 1.2902315877318344,
"learning_rate": 4.984638537572052e-06,
"loss": 0.0591,
"step": 389
},
{
"epoch": 0.17743403093721566,
"grad_norm": 1.693249076147672,
"learning_rate": 4.984559334451121e-06,
"loss": 0.0906,
"step": 390
},
{
"epoch": 0.17788898999090083,
"grad_norm": 1.7045791781932,
"learning_rate": 4.984479928303221e-06,
"loss": 0.066,
"step": 391
},
{
"epoch": 0.17834394904458598,
"grad_norm": 1.588345004423415,
"learning_rate": 4.984400319134841e-06,
"loss": 0.079,
"step": 392
},
{
"epoch": 0.17879890809827115,
"grad_norm": 2.8167066456613368,
"learning_rate": 4.984320506952487e-06,
"loss": 0.1743,
"step": 393
},
{
"epoch": 0.17925386715195632,
"grad_norm": 1.8409665855781128,
"learning_rate": 4.9842404917626796e-06,
"loss": 0.1009,
"step": 394
},
{
"epoch": 0.1797088262056415,
"grad_norm": 1.5444918002986228,
"learning_rate": 4.984160273571959e-06,
"loss": 0.0952,
"step": 395
},
{
"epoch": 0.18016378525932666,
"grad_norm": 1.9718645058282944,
"learning_rate": 4.9840798523868785e-06,
"loss": 0.1217,
"step": 396
},
{
"epoch": 0.18061874431301184,
"grad_norm": 1.669853882784426,
"learning_rate": 4.983999228214011e-06,
"loss": 0.083,
"step": 397
},
{
"epoch": 0.181073703366697,
"grad_norm": 1.5445667787054873,
"learning_rate": 4.983918401059943e-06,
"loss": 0.0838,
"step": 398
},
{
"epoch": 0.18152866242038215,
"grad_norm": 1.8477622601816133,
"learning_rate": 4.983837370931282e-06,
"loss": 0.1199,
"step": 399
},
{
"epoch": 0.18198362147406733,
"grad_norm": 2.295804335093856,
"learning_rate": 4.983756137834647e-06,
"loss": 0.1561,
"step": 400
},
{
"epoch": 0.1824385805277525,
"grad_norm": 2.1902816453958933,
"learning_rate": 4.9836747017766765e-06,
"loss": 0.1014,
"step": 401
},
{
"epoch": 0.18289353958143767,
"grad_norm": 1.7414949549224419,
"learning_rate": 4.983593062764027e-06,
"loss": 0.1046,
"step": 402
},
{
"epoch": 0.18334849863512284,
"grad_norm": 3.529761555914209,
"learning_rate": 4.983511220803367e-06,
"loss": 0.1573,
"step": 403
},
{
"epoch": 0.18380345768880801,
"grad_norm": 1.5931351386368249,
"learning_rate": 4.983429175901386e-06,
"loss": 0.0896,
"step": 404
},
{
"epoch": 0.1842584167424932,
"grad_norm": 1.4617184144821196,
"learning_rate": 4.983346928064788e-06,
"loss": 0.0698,
"step": 405
},
{
"epoch": 0.18471337579617833,
"grad_norm": 1.564679441746091,
"learning_rate": 4.9832644773002935e-06,
"loss": 0.0955,
"step": 406
},
{
"epoch": 0.1851683348498635,
"grad_norm": 1.4077890282448986,
"learning_rate": 4.98318182361464e-06,
"loss": 0.0887,
"step": 407
},
{
"epoch": 0.18562329390354868,
"grad_norm": 1.6028267121804223,
"learning_rate": 4.9830989670145825e-06,
"loss": 0.0989,
"step": 408
},
{
"epoch": 0.18607825295723385,
"grad_norm": 1.8479648547197383,
"learning_rate": 4.9830159075068905e-06,
"loss": 0.1009,
"step": 409
},
{
"epoch": 0.18653321201091902,
"grad_norm": 1.8145495712184487,
"learning_rate": 4.9829326450983514e-06,
"loss": 0.1125,
"step": 410
},
{
"epoch": 0.1869881710646042,
"grad_norm": 1.839873930402737,
"learning_rate": 4.98284917979577e-06,
"loss": 0.0975,
"step": 411
},
{
"epoch": 0.18744313011828936,
"grad_norm": 2.3433237407057863,
"learning_rate": 4.9827655116059656e-06,
"loss": 0.1061,
"step": 412
},
{
"epoch": 0.18789808917197454,
"grad_norm": 1.479552769836274,
"learning_rate": 4.9826816405357755e-06,
"loss": 0.105,
"step": 413
},
{
"epoch": 0.18835304822565968,
"grad_norm": 1.0380040250679141,
"learning_rate": 4.982597566592054e-06,
"loss": 0.0522,
"step": 414
},
{
"epoch": 0.18880800727934485,
"grad_norm": 2.2146611071914744,
"learning_rate": 4.982513289781671e-06,
"loss": 0.1403,
"step": 415
},
{
"epoch": 0.18926296633303002,
"grad_norm": 1.4265466923705232,
"learning_rate": 4.982428810111512e-06,
"loss": 0.0846,
"step": 416
},
{
"epoch": 0.1897179253867152,
"grad_norm": 1.4254072959974569,
"learning_rate": 4.9823441275884814e-06,
"loss": 0.0787,
"step": 417
},
{
"epoch": 0.19017288444040037,
"grad_norm": 2.353200458571576,
"learning_rate": 4.982259242219499e-06,
"loss": 0.1114,
"step": 418
},
{
"epoch": 0.19062784349408554,
"grad_norm": 1.3512279730893322,
"learning_rate": 4.9821741540115006e-06,
"loss": 0.0678,
"step": 419
},
{
"epoch": 0.1910828025477707,
"grad_norm": 1.728060266498106,
"learning_rate": 4.982088862971441e-06,
"loss": 0.1129,
"step": 420
},
{
"epoch": 0.19153776160145586,
"grad_norm": 1.8022543001727114,
"learning_rate": 4.982003369106287e-06,
"loss": 0.1036,
"step": 421
},
{
"epoch": 0.19199272065514103,
"grad_norm": 1.2312712834502222,
"learning_rate": 4.981917672423028e-06,
"loss": 0.065,
"step": 422
},
{
"epoch": 0.1924476797088262,
"grad_norm": 1.6183848549336255,
"learning_rate": 4.981831772928664e-06,
"loss": 0.0934,
"step": 423
},
{
"epoch": 0.19290263876251137,
"grad_norm": 2.001713262915152,
"learning_rate": 4.981745670630216e-06,
"loss": 0.1356,
"step": 424
},
{
"epoch": 0.19335759781619655,
"grad_norm": 2.0057745044552995,
"learning_rate": 4.981659365534718e-06,
"loss": 0.1285,
"step": 425
},
{
"epoch": 0.19381255686988172,
"grad_norm": 2.299079022869691,
"learning_rate": 4.981572857649225e-06,
"loss": 0.1195,
"step": 426
},
{
"epoch": 0.1942675159235669,
"grad_norm": 1.6869951958248894,
"learning_rate": 4.981486146980804e-06,
"loss": 0.0877,
"step": 427
},
{
"epoch": 0.19472247497725204,
"grad_norm": 1.9301190501764922,
"learning_rate": 4.9813992335365415e-06,
"loss": 0.0977,
"step": 428
},
{
"epoch": 0.1951774340309372,
"grad_norm": 1.6227704434432904,
"learning_rate": 4.98131211732354e-06,
"loss": 0.1035,
"step": 429
},
{
"epoch": 0.19563239308462238,
"grad_norm": 1.632769015838627,
"learning_rate": 4.981224798348917e-06,
"loss": 0.0833,
"step": 430
},
{
"epoch": 0.19608735213830755,
"grad_norm": 2.3862639707091082,
"learning_rate": 4.981137276619809e-06,
"loss": 0.1419,
"step": 431
},
{
"epoch": 0.19654231119199272,
"grad_norm": 1.2625986411158334,
"learning_rate": 4.9810495521433675e-06,
"loss": 0.078,
"step": 432
},
{
"epoch": 0.1969972702456779,
"grad_norm": 2.5081068393508157,
"learning_rate": 4.9809616249267616e-06,
"loss": 0.1478,
"step": 433
},
{
"epoch": 0.19745222929936307,
"grad_norm": 1.9644808854065114,
"learning_rate": 4.980873494977174e-06,
"loss": 0.121,
"step": 434
},
{
"epoch": 0.1979071883530482,
"grad_norm": 1.647433915922947,
"learning_rate": 4.98078516230181e-06,
"loss": 0.0865,
"step": 435
},
{
"epoch": 0.19836214740673339,
"grad_norm": 1.5774273491436515,
"learning_rate": 4.980696626907884e-06,
"loss": 0.0887,
"step": 436
},
{
"epoch": 0.19881710646041856,
"grad_norm": 1.5604062690588907,
"learning_rate": 4.980607888802633e-06,
"loss": 0.1,
"step": 437
},
{
"epoch": 0.19927206551410373,
"grad_norm": 1.548442809835796,
"learning_rate": 4.980518947993307e-06,
"loss": 0.1005,
"step": 438
},
{
"epoch": 0.1997270245677889,
"grad_norm": 1.6276180373825353,
"learning_rate": 4.980429804487176e-06,
"loss": 0.1006,
"step": 439
},
{
"epoch": 0.20018198362147407,
"grad_norm": 1.5718547041391637,
"learning_rate": 4.980340458291521e-06,
"loss": 0.0858,
"step": 440
},
{
"epoch": 0.20063694267515925,
"grad_norm": 1.3679183632524226,
"learning_rate": 4.980250909413646e-06,
"loss": 0.0901,
"step": 441
},
{
"epoch": 0.2010919017288444,
"grad_norm": 1.7491296961984788,
"learning_rate": 4.980161157860867e-06,
"loss": 0.0888,
"step": 442
},
{
"epoch": 0.20154686078252956,
"grad_norm": 2.0306839493761446,
"learning_rate": 4.980071203640519e-06,
"loss": 0.0893,
"step": 443
},
{
"epoch": 0.20200181983621474,
"grad_norm": 1.353153596211688,
"learning_rate": 4.979981046759952e-06,
"loss": 0.0753,
"step": 444
},
{
"epoch": 0.2024567788898999,
"grad_norm": 1.969605104045741,
"learning_rate": 4.979890687226533e-06,
"loss": 0.1033,
"step": 445
},
{
"epoch": 0.20291173794358508,
"grad_norm": 2.085518332646124,
"learning_rate": 4.979800125047647e-06,
"loss": 0.0979,
"step": 446
},
{
"epoch": 0.20336669699727025,
"grad_norm": 1.6181669031153556,
"learning_rate": 4.979709360230692e-06,
"loss": 0.0969,
"step": 447
},
{
"epoch": 0.20382165605095542,
"grad_norm": 1.6760914355637484,
"learning_rate": 4.979618392783087e-06,
"loss": 0.0883,
"step": 448
},
{
"epoch": 0.20427661510464057,
"grad_norm": 1.2907730003800948,
"learning_rate": 4.979527222712266e-06,
"loss": 0.0775,
"step": 449
},
{
"epoch": 0.20473157415832574,
"grad_norm": 1.241096973502198,
"learning_rate": 4.9794358500256765e-06,
"loss": 0.0599,
"step": 450
},
{
"epoch": 0.2051865332120109,
"grad_norm": 1.579037640818148,
"learning_rate": 4.979344274730786e-06,
"loss": 0.0831,
"step": 451
},
{
"epoch": 0.20564149226569609,
"grad_norm": 2.225915719971972,
"learning_rate": 4.979252496835079e-06,
"loss": 0.1116,
"step": 452
},
{
"epoch": 0.20609645131938126,
"grad_norm": 2.3031173397129923,
"learning_rate": 4.979160516346054e-06,
"loss": 0.1536,
"step": 453
},
{
"epoch": 0.20655141037306643,
"grad_norm": 27.297310781833385,
"learning_rate": 4.979068333271227e-06,
"loss": 0.9223,
"step": 454
},
{
"epoch": 0.2070063694267516,
"grad_norm": 2.4041431299507607,
"learning_rate": 4.978975947618131e-06,
"loss": 0.1184,
"step": 455
},
{
"epoch": 0.20746132848043677,
"grad_norm": 1.6683861662324915,
"learning_rate": 4.978883359394316e-06,
"loss": 0.1301,
"step": 456
},
{
"epoch": 0.20791628753412192,
"grad_norm": 1.9056814965685545,
"learning_rate": 4.978790568607347e-06,
"loss": 0.1001,
"step": 457
},
{
"epoch": 0.2083712465878071,
"grad_norm": 1.9713836323302738,
"learning_rate": 4.9786975752648076e-06,
"loss": 0.1174,
"step": 458
},
{
"epoch": 0.20882620564149226,
"grad_norm": 1.598376196967646,
"learning_rate": 4.978604379374295e-06,
"loss": 0.0986,
"step": 459
},
{
"epoch": 0.20928116469517744,
"grad_norm": 1.5517923833736031,
"learning_rate": 4.978510980943427e-06,
"loss": 0.0807,
"step": 460
},
{
"epoch": 0.2097361237488626,
"grad_norm": 2.004418653450344,
"learning_rate": 4.978417379979834e-06,
"loss": 0.1065,
"step": 461
},
{
"epoch": 0.21019108280254778,
"grad_norm": 1.7753220163198007,
"learning_rate": 4.978323576491165e-06,
"loss": 0.0987,
"step": 462
},
{
"epoch": 0.21064604185623295,
"grad_norm": 1.7384737383317277,
"learning_rate": 4.978229570485085e-06,
"loss": 0.1048,
"step": 463
},
{
"epoch": 0.2111010009099181,
"grad_norm": 1.5352099211420311,
"learning_rate": 4.978135361969276e-06,
"loss": 0.0983,
"step": 464
},
{
"epoch": 0.21155595996360327,
"grad_norm": 1.6028799125387194,
"learning_rate": 4.9780409509514375e-06,
"loss": 0.091,
"step": 465
},
{
"epoch": 0.21201091901728844,
"grad_norm": 1.9664054893168261,
"learning_rate": 4.977946337439282e-06,
"loss": 0.1495,
"step": 466
},
{
"epoch": 0.2124658780709736,
"grad_norm": 1.7122667851036462,
"learning_rate": 4.9778515214405436e-06,
"loss": 0.1139,
"step": 467
},
{
"epoch": 0.21292083712465878,
"grad_norm": 1.7566455248377864,
"learning_rate": 4.977756502962967e-06,
"loss": 0.1097,
"step": 468
},
{
"epoch": 0.21337579617834396,
"grad_norm": 1.1350501611425003,
"learning_rate": 4.97766128201432e-06,
"loss": 0.0629,
"step": 469
},
{
"epoch": 0.21383075523202913,
"grad_norm": 1.2023067292666059,
"learning_rate": 4.977565858602381e-06,
"loss": 0.0782,
"step": 470
},
{
"epoch": 0.21428571428571427,
"grad_norm": 1.628252441426902,
"learning_rate": 4.977470232734949e-06,
"loss": 0.0987,
"step": 471
},
{
"epoch": 0.21474067333939945,
"grad_norm": 1.724322735405813,
"learning_rate": 4.977374404419838e-06,
"loss": 0.0903,
"step": 472
},
{
"epoch": 0.21519563239308462,
"grad_norm": 1.470263169494043,
"learning_rate": 4.977278373664877e-06,
"loss": 0.0882,
"step": 473
},
{
"epoch": 0.2156505914467698,
"grad_norm": 2.599396527432543,
"learning_rate": 4.977182140477916e-06,
"loss": 0.1209,
"step": 474
},
{
"epoch": 0.21610555050045496,
"grad_norm": 1.6800447119151198,
"learning_rate": 4.977085704866817e-06,
"loss": 0.0776,
"step": 475
},
{
"epoch": 0.21656050955414013,
"grad_norm": 1.5595540666125045,
"learning_rate": 4.97698906683946e-06,
"loss": 0.103,
"step": 476
},
{
"epoch": 0.2170154686078253,
"grad_norm": 2.248635180290087,
"learning_rate": 4.9768922264037435e-06,
"loss": 0.1388,
"step": 477
},
{
"epoch": 0.21747042766151045,
"grad_norm": 1.1547627152960565,
"learning_rate": 4.976795183567579e-06,
"loss": 0.0624,
"step": 478
},
{
"epoch": 0.21792538671519562,
"grad_norm": 1.56353757750327,
"learning_rate": 4.976697938338898e-06,
"loss": 0.0856,
"step": 479
},
{
"epoch": 0.2183803457688808,
"grad_norm": 1.2335181237621284,
"learning_rate": 4.976600490725645e-06,
"loss": 0.0644,
"step": 480
},
{
"epoch": 0.21883530482256597,
"grad_norm": 1.900991648340467,
"learning_rate": 4.976502840735785e-06,
"loss": 0.153,
"step": 481
},
{
"epoch": 0.21929026387625114,
"grad_norm": 1.3078243371858722,
"learning_rate": 4.976404988377297e-06,
"loss": 0.0621,
"step": 482
},
{
"epoch": 0.2197452229299363,
"grad_norm": 2.0047686247285923,
"learning_rate": 4.976306933658176e-06,
"loss": 0.1136,
"step": 483
},
{
"epoch": 0.22020018198362148,
"grad_norm": 1.8552855878852923,
"learning_rate": 4.976208676586435e-06,
"loss": 0.1284,
"step": 484
},
{
"epoch": 0.22065514103730663,
"grad_norm": 1.8525936784229493,
"learning_rate": 4.976110217170104e-06,
"loss": 0.0917,
"step": 485
},
{
"epoch": 0.2211101000909918,
"grad_norm": 1.4658188242525991,
"learning_rate": 4.976011555417228e-06,
"loss": 0.0749,
"step": 486
},
{
"epoch": 0.22156505914467697,
"grad_norm": 1.1511032936840262,
"learning_rate": 4.975912691335869e-06,
"loss": 0.0761,
"step": 487
},
{
"epoch": 0.22202001819836215,
"grad_norm": 1.458580259230844,
"learning_rate": 4.975813624934106e-06,
"loss": 0.0768,
"step": 488
},
{
"epoch": 0.22247497725204732,
"grad_norm": 1.5627508232221192,
"learning_rate": 4.975714356220035e-06,
"loss": 0.0823,
"step": 489
},
{
"epoch": 0.2229299363057325,
"grad_norm": 1.075721834306004,
"learning_rate": 4.975614885201766e-06,
"loss": 0.0504,
"step": 490
},
{
"epoch": 0.22338489535941766,
"grad_norm": 1.6198884733457342,
"learning_rate": 4.975515211887429e-06,
"loss": 0.1024,
"step": 491
},
{
"epoch": 0.22383985441310283,
"grad_norm": 1.6346417323820548,
"learning_rate": 4.9754153362851684e-06,
"loss": 0.0851,
"step": 492
},
{
"epoch": 0.22429481346678798,
"grad_norm": 2.448143027911265,
"learning_rate": 4.975315258403145e-06,
"loss": 0.1479,
"step": 493
},
{
"epoch": 0.22474977252047315,
"grad_norm": 1.6016068432961146,
"learning_rate": 4.975214978249537e-06,
"loss": 0.0886,
"step": 494
},
{
"epoch": 0.22520473157415832,
"grad_norm": 1.4721161321318619,
"learning_rate": 4.975114495832539e-06,
"loss": 0.0976,
"step": 495
},
{
"epoch": 0.2256596906278435,
"grad_norm": 1.7625335294527533,
"learning_rate": 4.975013811160362e-06,
"loss": 0.0898,
"step": 496
},
{
"epoch": 0.22611464968152867,
"grad_norm": 1.9298670425360585,
"learning_rate": 4.974912924241233e-06,
"loss": 0.1027,
"step": 497
},
{
"epoch": 0.22656960873521384,
"grad_norm": 1.4996755802132458,
"learning_rate": 4.974811835083397e-06,
"loss": 0.0978,
"step": 498
},
{
"epoch": 0.227024567788899,
"grad_norm": 2.1147277125940955,
"learning_rate": 4.974710543695114e-06,
"loss": 0.1063,
"step": 499
},
{
"epoch": 0.22747952684258416,
"grad_norm": 2.529920688558412,
"learning_rate": 4.974609050084661e-06,
"loss": 0.1476,
"step": 500
},
{
"epoch": 0.22793448589626933,
"grad_norm": 2.14209787933433,
"learning_rate": 4.974507354260332e-06,
"loss": 0.1261,
"step": 501
},
{
"epoch": 0.2283894449499545,
"grad_norm": 1.9058176611193165,
"learning_rate": 4.974405456230436e-06,
"loss": 0.1203,
"step": 502
},
{
"epoch": 0.22884440400363967,
"grad_norm": 1.8980074058725056,
"learning_rate": 4.974303356003301e-06,
"loss": 0.0996,
"step": 503
},
{
"epoch": 0.22929936305732485,
"grad_norm": 1.4579903539692274,
"learning_rate": 4.974201053587268e-06,
"loss": 0.0943,
"step": 504
},
{
"epoch": 0.22975432211101002,
"grad_norm": 1.3940386820106656,
"learning_rate": 4.9740985489907005e-06,
"loss": 0.0663,
"step": 505
},
{
"epoch": 0.2302092811646952,
"grad_norm": 2.441971054754706,
"learning_rate": 4.973995842221971e-06,
"loss": 0.1245,
"step": 506
},
{
"epoch": 0.23066424021838033,
"grad_norm": 1.919620601900113,
"learning_rate": 4.973892933289476e-06,
"loss": 0.1159,
"step": 507
},
{
"epoch": 0.2311191992720655,
"grad_norm": 1.672712776153676,
"learning_rate": 4.97378982220162e-06,
"loss": 0.0981,
"step": 508
},
{
"epoch": 0.23157415832575068,
"grad_norm": 1.2125382683302124,
"learning_rate": 4.973686508966832e-06,
"loss": 0.0601,
"step": 509
},
{
"epoch": 0.23202911737943585,
"grad_norm": 1.222443145221144,
"learning_rate": 4.973582993593554e-06,
"loss": 0.0715,
"step": 510
},
{
"epoch": 0.23248407643312102,
"grad_norm": 1.5223951861259333,
"learning_rate": 4.973479276090244e-06,
"loss": 0.0795,
"step": 511
},
{
"epoch": 0.2329390354868062,
"grad_norm": 1.2392582362318094,
"learning_rate": 4.973375356465378e-06,
"loss": 0.061,
"step": 512
},
{
"epoch": 0.23339399454049137,
"grad_norm": 1.7285156139774616,
"learning_rate": 4.973271234727447e-06,
"loss": 0.1201,
"step": 513
},
{
"epoch": 0.2338489535941765,
"grad_norm": 1.4723786585295477,
"learning_rate": 4.97316691088496e-06,
"loss": 0.0885,
"step": 514
},
{
"epoch": 0.23430391264786168,
"grad_norm": 2.25192801645438,
"learning_rate": 4.973062384946442e-06,
"loss": 0.135,
"step": 515
},
{
"epoch": 0.23475887170154686,
"grad_norm": 1.1373098395352674,
"learning_rate": 4.9729576569204345e-06,
"loss": 0.0728,
"step": 516
},
{
"epoch": 0.23521383075523203,
"grad_norm": 1.5300830315604266,
"learning_rate": 4.972852726815495e-06,
"loss": 0.0941,
"step": 517
},
{
"epoch": 0.2356687898089172,
"grad_norm": 1.8026113068627658,
"learning_rate": 4.972747594640197e-06,
"loss": 0.1247,
"step": 518
},
{
"epoch": 0.23612374886260237,
"grad_norm": 1.794104737159684,
"learning_rate": 4.9726422604031335e-06,
"loss": 0.095,
"step": 519
},
{
"epoch": 0.23657870791628755,
"grad_norm": 1.1504559186965777,
"learning_rate": 4.97253672411291e-06,
"loss": 0.0674,
"step": 520
},
{
"epoch": 0.2370336669699727,
"grad_norm": 1.4316672986650767,
"learning_rate": 4.972430985778152e-06,
"loss": 0.0702,
"step": 521
},
{
"epoch": 0.23748862602365786,
"grad_norm": 1.5328603666600327,
"learning_rate": 4.972325045407499e-06,
"loss": 0.0675,
"step": 522
},
{
"epoch": 0.23794358507734303,
"grad_norm": 3.2405357176859857,
"learning_rate": 4.972218903009608e-06,
"loss": 0.1212,
"step": 523
},
{
"epoch": 0.2383985441310282,
"grad_norm": 1.5109558607242208,
"learning_rate": 4.972112558593153e-06,
"loss": 0.0938,
"step": 524
},
{
"epoch": 0.23885350318471338,
"grad_norm": 1.264935168060258,
"learning_rate": 4.972006012166823e-06,
"loss": 0.0742,
"step": 525
},
{
"epoch": 0.23930846223839855,
"grad_norm": 1.3461924059029533,
"learning_rate": 4.971899263739326e-06,
"loss": 0.0844,
"step": 526
},
{
"epoch": 0.23976342129208372,
"grad_norm": 1.7441591810954875,
"learning_rate": 4.971792313319384e-06,
"loss": 0.1139,
"step": 527
},
{
"epoch": 0.24021838034576887,
"grad_norm": 1.7027600325330141,
"learning_rate": 4.971685160915737e-06,
"loss": 0.0867,
"step": 528
},
{
"epoch": 0.24067333939945404,
"grad_norm": 1.6301828004618641,
"learning_rate": 4.971577806537139e-06,
"loss": 0.0943,
"step": 529
},
{
"epoch": 0.2411282984531392,
"grad_norm": 1.6173281507194255,
"learning_rate": 4.971470250192366e-06,
"loss": 0.1052,
"step": 530
},
{
"epoch": 0.24158325750682438,
"grad_norm": 17.712189021618492,
"learning_rate": 4.9713624918902045e-06,
"loss": 0.3191,
"step": 531
},
{
"epoch": 0.24203821656050956,
"grad_norm": 2.336934606774547,
"learning_rate": 4.971254531639461e-06,
"loss": 0.1347,
"step": 532
},
{
"epoch": 0.24249317561419473,
"grad_norm": 1.8922827015678323,
"learning_rate": 4.971146369448957e-06,
"loss": 0.1144,
"step": 533
},
{
"epoch": 0.2429481346678799,
"grad_norm": 1.7408688040721931,
"learning_rate": 4.971038005327532e-06,
"loss": 0.1143,
"step": 534
},
{
"epoch": 0.24340309372156507,
"grad_norm": 1.9327103804196282,
"learning_rate": 4.970929439284039e-06,
"loss": 0.1377,
"step": 535
},
{
"epoch": 0.24385805277525022,
"grad_norm": 2.0181579320929224,
"learning_rate": 4.970820671327351e-06,
"loss": 0.1259,
"step": 536
},
{
"epoch": 0.2443130118289354,
"grad_norm": 1.1056426992050885,
"learning_rate": 4.9707117014663565e-06,
"loss": 0.0633,
"step": 537
},
{
"epoch": 0.24476797088262056,
"grad_norm": 1.853338129642874,
"learning_rate": 4.97060252970996e-06,
"loss": 0.1215,
"step": 538
},
{
"epoch": 0.24522292993630573,
"grad_norm": 1.6843406450831364,
"learning_rate": 4.970493156067081e-06,
"loss": 0.1016,
"step": 539
},
{
"epoch": 0.2456778889899909,
"grad_norm": 1.1701908663612965,
"learning_rate": 4.970383580546658e-06,
"loss": 0.0731,
"step": 540
},
{
"epoch": 0.24613284804367608,
"grad_norm": 1.7890527407391215,
"learning_rate": 4.970273803157645e-06,
"loss": 0.1097,
"step": 541
},
{
"epoch": 0.24658780709736125,
"grad_norm": 1.4169073671700831,
"learning_rate": 4.970163823909013e-06,
"loss": 0.0845,
"step": 542
},
{
"epoch": 0.2470427661510464,
"grad_norm": 1.5828589024944335,
"learning_rate": 4.970053642809748e-06,
"loss": 0.0921,
"step": 543
},
{
"epoch": 0.24749772520473157,
"grad_norm": 1.6370747251722932,
"learning_rate": 4.969943259868853e-06,
"loss": 0.1088,
"step": 544
},
{
"epoch": 0.24795268425841674,
"grad_norm": 2.023470308157194,
"learning_rate": 4.969832675095351e-06,
"loss": 0.1052,
"step": 545
},
{
"epoch": 0.2484076433121019,
"grad_norm": 1.7462230999429424,
"learning_rate": 4.969721888498275e-06,
"loss": 0.1141,
"step": 546
},
{
"epoch": 0.24886260236578708,
"grad_norm": 1.428774250085193,
"learning_rate": 4.96961090008668e-06,
"loss": 0.0824,
"step": 547
},
{
"epoch": 0.24931756141947226,
"grad_norm": 1.6447081301063733,
"learning_rate": 4.969499709869635e-06,
"loss": 0.1324,
"step": 548
},
{
"epoch": 0.24977252047315743,
"grad_norm": 2.0250820847646054,
"learning_rate": 4.969388317856225e-06,
"loss": 0.1122,
"step": 549
},
{
"epoch": 0.2502274795268426,
"grad_norm": 2.060820071851061,
"learning_rate": 4.969276724055554e-06,
"loss": 0.128,
"step": 550
},
{
"epoch": 0.25068243858052774,
"grad_norm": 1.8421595012757042,
"learning_rate": 4.969164928476741e-06,
"loss": 0.0929,
"step": 551
},
{
"epoch": 0.25113739763421294,
"grad_norm": 1.8378761522798848,
"learning_rate": 4.969052931128919e-06,
"loss": 0.1038,
"step": 552
},
{
"epoch": 0.2515923566878981,
"grad_norm": 1.4559119574869848,
"learning_rate": 4.968940732021243e-06,
"loss": 0.0884,
"step": 553
},
{
"epoch": 0.25204731574158323,
"grad_norm": 1.9971887851212364,
"learning_rate": 4.9688283311628795e-06,
"loss": 0.1353,
"step": 554
},
{
"epoch": 0.25250227479526843,
"grad_norm": 1.7386639848323409,
"learning_rate": 4.968715728563014e-06,
"loss": 0.1025,
"step": 555
},
{
"epoch": 0.2529572338489536,
"grad_norm": 1.260155855896464,
"learning_rate": 4.968602924230847e-06,
"loss": 0.0684,
"step": 556
},
{
"epoch": 0.2534121929026388,
"grad_norm": 2.3395689748358843,
"learning_rate": 4.968489918175598e-06,
"loss": 0.1151,
"step": 557
},
{
"epoch": 0.2538671519563239,
"grad_norm": 2.0737729432038137,
"learning_rate": 4.9683767104065014e-06,
"loss": 0.107,
"step": 558
},
{
"epoch": 0.2543221110100091,
"grad_norm": 1.4554456387078378,
"learning_rate": 4.968263300932806e-06,
"loss": 0.0674,
"step": 559
},
{
"epoch": 0.25477707006369427,
"grad_norm": 1.236095562563839,
"learning_rate": 4.968149689763781e-06,
"loss": 0.0771,
"step": 560
},
{
"epoch": 0.2552320291173794,
"grad_norm": 1.6261579693523964,
"learning_rate": 4.968035876908708e-06,
"loss": 0.1033,
"step": 561
},
{
"epoch": 0.2556869881710646,
"grad_norm": 1.8267174614929946,
"learning_rate": 4.967921862376889e-06,
"loss": 0.1153,
"step": 562
},
{
"epoch": 0.25614194722474976,
"grad_norm": 1.9897704292294367,
"learning_rate": 4.9678076461776415e-06,
"loss": 0.1168,
"step": 563
},
{
"epoch": 0.25659690627843496,
"grad_norm": 1.9727936679798233,
"learning_rate": 4.9676932283202965e-06,
"loss": 0.1389,
"step": 564
},
{
"epoch": 0.2570518653321201,
"grad_norm": 1.8484690700205213,
"learning_rate": 4.967578608814205e-06,
"loss": 0.1024,
"step": 565
},
{
"epoch": 0.2575068243858053,
"grad_norm": 1.4833575893287436,
"learning_rate": 4.9674637876687345e-06,
"loss": 0.0959,
"step": 566
},
{
"epoch": 0.25796178343949044,
"grad_norm": 1.0731244531443167,
"learning_rate": 4.967348764893265e-06,
"loss": 0.0652,
"step": 567
},
{
"epoch": 0.2584167424931756,
"grad_norm": 1.882586364820984,
"learning_rate": 4.967233540497197e-06,
"loss": 0.0887,
"step": 568
},
{
"epoch": 0.2588717015468608,
"grad_norm": 1.5585900206462215,
"learning_rate": 4.967118114489946e-06,
"loss": 0.0705,
"step": 569
},
{
"epoch": 0.25932666060054593,
"grad_norm": 1.4304247727655925,
"learning_rate": 4.967002486880944e-06,
"loss": 0.0689,
"step": 570
},
{
"epoch": 0.25978161965423113,
"grad_norm": 1.996611084455256,
"learning_rate": 4.966886657679641e-06,
"loss": 0.1134,
"step": 571
},
{
"epoch": 0.2602365787079163,
"grad_norm": 2.573142554440562,
"learning_rate": 4.966770626895499e-06,
"loss": 0.137,
"step": 572
},
{
"epoch": 0.2606915377616015,
"grad_norm": 1.7759211248358038,
"learning_rate": 4.966654394538002e-06,
"loss": 0.097,
"step": 573
},
{
"epoch": 0.2611464968152866,
"grad_norm": 1.3021079669208342,
"learning_rate": 4.966537960616646e-06,
"loss": 0.0774,
"step": 574
},
{
"epoch": 0.26160145586897177,
"grad_norm": 2.328733131052364,
"learning_rate": 4.9664213251409486e-06,
"loss": 0.1105,
"step": 575
},
{
"epoch": 0.26205641492265697,
"grad_norm": 2.281267812919593,
"learning_rate": 4.9663044881204375e-06,
"loss": 0.1556,
"step": 576
},
{
"epoch": 0.2625113739763421,
"grad_norm": 1.7215892787568372,
"learning_rate": 4.9661874495646615e-06,
"loss": 0.0917,
"step": 577
},
{
"epoch": 0.2629663330300273,
"grad_norm": 1.3072003221216781,
"learning_rate": 4.9660702094831845e-06,
"loss": 0.0818,
"step": 578
},
{
"epoch": 0.26342129208371245,
"grad_norm": 2.141135787879026,
"learning_rate": 4.965952767885587e-06,
"loss": 0.1187,
"step": 579
},
{
"epoch": 0.26387625113739765,
"grad_norm": 2.3440295569320857,
"learning_rate": 4.965835124781465e-06,
"loss": 0.1336,
"step": 580
},
{
"epoch": 0.2643312101910828,
"grad_norm": 1.2377586425554465,
"learning_rate": 4.965717280180432e-06,
"loss": 0.0771,
"step": 581
},
{
"epoch": 0.26478616924476794,
"grad_norm": 1.5553208083958672,
"learning_rate": 4.965599234092118e-06,
"loss": 0.0906,
"step": 582
},
{
"epoch": 0.26524112829845314,
"grad_norm": 1.676762616981095,
"learning_rate": 4.96548098652617e-06,
"loss": 0.1091,
"step": 583
},
{
"epoch": 0.2656960873521383,
"grad_norm": 1.8329426527347645,
"learning_rate": 4.965362537492249e-06,
"loss": 0.1171,
"step": 584
},
{
"epoch": 0.2661510464058235,
"grad_norm": 1.2752855217123082,
"learning_rate": 4.9652438870000356e-06,
"loss": 0.0726,
"step": 585
},
{
"epoch": 0.26660600545950863,
"grad_norm": 1.188941544645384,
"learning_rate": 4.965125035059224e-06,
"loss": 0.0801,
"step": 586
},
{
"epoch": 0.26706096451319383,
"grad_norm": 1.4654127807937742,
"learning_rate": 4.965005981679527e-06,
"loss": 0.0839,
"step": 587
},
{
"epoch": 0.267515923566879,
"grad_norm": 2.0288718475884107,
"learning_rate": 4.964886726870673e-06,
"loss": 0.1239,
"step": 588
},
{
"epoch": 0.2679708826205642,
"grad_norm": 1.972686660841513,
"learning_rate": 4.964767270642407e-06,
"loss": 0.1004,
"step": 589
},
{
"epoch": 0.2684258416742493,
"grad_norm": 1.6499743360699521,
"learning_rate": 4.964647613004491e-06,
"loss": 0.0976,
"step": 590
},
{
"epoch": 0.26888080072793447,
"grad_norm": 1.5661213245685233,
"learning_rate": 4.964527753966702e-06,
"loss": 0.0818,
"step": 591
},
{
"epoch": 0.26933575978161967,
"grad_norm": 1.387453226127614,
"learning_rate": 4.964407693538834e-06,
"loss": 0.0813,
"step": 592
},
{
"epoch": 0.2697907188353048,
"grad_norm": 1.8652006740776592,
"learning_rate": 4.9642874317307e-06,
"loss": 0.1092,
"step": 593
},
{
"epoch": 0.27024567788899,
"grad_norm": 1.6739291749648295,
"learning_rate": 4.964166968552124e-06,
"loss": 0.1262,
"step": 594
},
{
"epoch": 0.27070063694267515,
"grad_norm": 1.4965319066427345,
"learning_rate": 4.9640463040129525e-06,
"loss": 0.0749,
"step": 595
},
{
"epoch": 0.27115559599636035,
"grad_norm": 1.483777185503557,
"learning_rate": 4.963925438123044e-06,
"loss": 0.075,
"step": 596
},
{
"epoch": 0.2716105550500455,
"grad_norm": 1.646106287941782,
"learning_rate": 4.963804370892276e-06,
"loss": 0.0948,
"step": 597
},
{
"epoch": 0.27206551410373064,
"grad_norm": 1.8923424637891237,
"learning_rate": 4.9636831023305405e-06,
"loss": 0.1296,
"step": 598
},
{
"epoch": 0.27252047315741584,
"grad_norm": 1.453967822900046,
"learning_rate": 4.963561632447748e-06,
"loss": 0.0777,
"step": 599
},
{
"epoch": 0.272975432211101,
"grad_norm": 1.2633146266239919,
"learning_rate": 4.9634399612538255e-06,
"loss": 0.0704,
"step": 600
},
{
"epoch": 0.2734303912647862,
"grad_norm": 24.856853600017228,
"learning_rate": 4.963318088758714e-06,
"loss": 0.4372,
"step": 601
},
{
"epoch": 0.27388535031847133,
"grad_norm": 1.6301604814034822,
"learning_rate": 4.963196014972371e-06,
"loss": 0.0879,
"step": 602
},
{
"epoch": 0.27434030937215653,
"grad_norm": 1.556460730817159,
"learning_rate": 4.963073739904775e-06,
"loss": 0.0893,
"step": 603
},
{
"epoch": 0.2747952684258417,
"grad_norm": 1.657318032059153,
"learning_rate": 4.962951263565915e-06,
"loss": 0.0933,
"step": 604
},
{
"epoch": 0.2752502274795268,
"grad_norm": 2.273490391362205,
"learning_rate": 4.962828585965801e-06,
"loss": 0.1038,
"step": 605
},
{
"epoch": 0.275705186533212,
"grad_norm": 1.5114052665682505,
"learning_rate": 4.962705707114457e-06,
"loss": 0.097,
"step": 606
},
{
"epoch": 0.27616014558689717,
"grad_norm": 1.7683179621585026,
"learning_rate": 4.962582627021923e-06,
"loss": 0.1127,
"step": 607
},
{
"epoch": 0.27661510464058237,
"grad_norm": 1.8859941959717001,
"learning_rate": 4.962459345698258e-06,
"loss": 0.1152,
"step": 608
},
{
"epoch": 0.2770700636942675,
"grad_norm": 1.9839838015935523,
"learning_rate": 4.962335863153537e-06,
"loss": 0.1198,
"step": 609
},
{
"epoch": 0.2775250227479527,
"grad_norm": 1.3671283570292578,
"learning_rate": 4.962212179397847e-06,
"loss": 0.0876,
"step": 610
},
{
"epoch": 0.27797998180163785,
"grad_norm": 1.4623540558631782,
"learning_rate": 4.962088294441299e-06,
"loss": 0.0754,
"step": 611
},
{
"epoch": 0.278434940855323,
"grad_norm": 2.3501285954750806,
"learning_rate": 4.9619642082940135e-06,
"loss": 0.1,
"step": 612
},
{
"epoch": 0.2788898999090082,
"grad_norm": 1.6593172768016098,
"learning_rate": 4.9618399209661305e-06,
"loss": 0.0918,
"step": 613
},
{
"epoch": 0.27934485896269334,
"grad_norm": 1.4913746956676242,
"learning_rate": 4.961715432467807e-06,
"loss": 0.0788,
"step": 614
},
{
"epoch": 0.27979981801637854,
"grad_norm": 1.3335438953393988,
"learning_rate": 4.961590742809216e-06,
"loss": 0.0743,
"step": 615
},
{
"epoch": 0.2802547770700637,
"grad_norm": 1.4631866469804606,
"learning_rate": 4.961465852000545e-06,
"loss": 0.0869,
"step": 616
},
{
"epoch": 0.2807097361237489,
"grad_norm": 1.8021656107937525,
"learning_rate": 4.961340760052001e-06,
"loss": 0.0906,
"step": 617
},
{
"epoch": 0.28116469517743403,
"grad_norm": 1.74213914067233,
"learning_rate": 4.961215466973806e-06,
"loss": 0.0926,
"step": 618
},
{
"epoch": 0.2816196542311192,
"grad_norm": 2.764803909834576,
"learning_rate": 4.961089972776197e-06,
"loss": 0.1823,
"step": 619
},
{
"epoch": 0.2820746132848044,
"grad_norm": 1.3665676735119967,
"learning_rate": 4.9609642774694285e-06,
"loss": 0.0734,
"step": 620
},
{
"epoch": 0.2825295723384895,
"grad_norm": 1.9426323562959267,
"learning_rate": 4.960838381063774e-06,
"loss": 0.0972,
"step": 621
},
{
"epoch": 0.2829845313921747,
"grad_norm": 2.3374254341147322,
"learning_rate": 4.960712283569521e-06,
"loss": 0.1411,
"step": 622
},
{
"epoch": 0.28343949044585987,
"grad_norm": 2.2747894788958543,
"learning_rate": 4.960585984996971e-06,
"loss": 0.1033,
"step": 623
},
{
"epoch": 0.28389444949954507,
"grad_norm": 1.7445142059152803,
"learning_rate": 4.960459485356447e-06,
"loss": 0.1222,
"step": 624
},
{
"epoch": 0.2843494085532302,
"grad_norm": 1.5220008831965313,
"learning_rate": 4.960332784658285e-06,
"loss": 0.1027,
"step": 625
},
{
"epoch": 0.28480436760691535,
"grad_norm": 2.1347326062219034,
"learning_rate": 4.960205882912839e-06,
"loss": 0.1237,
"step": 626
},
{
"epoch": 0.28525932666060055,
"grad_norm": 2.5984695620436002,
"learning_rate": 4.9600787801304785e-06,
"loss": 0.1871,
"step": 627
},
{
"epoch": 0.2857142857142857,
"grad_norm": 2.1207792848317375,
"learning_rate": 4.959951476321589e-06,
"loss": 0.1205,
"step": 628
},
{
"epoch": 0.2861692447679709,
"grad_norm": 1.1897630810057305,
"learning_rate": 4.959823971496575e-06,
"loss": 0.0773,
"step": 629
},
{
"epoch": 0.28662420382165604,
"grad_norm": 3.4920069239312976,
"learning_rate": 4.959696265665853e-06,
"loss": 0.1897,
"step": 630
},
{
"epoch": 0.28707916287534124,
"grad_norm": 1.425742783647833,
"learning_rate": 4.959568358839862e-06,
"loss": 0.0635,
"step": 631
},
{
"epoch": 0.2875341219290264,
"grad_norm": 1.330689822741385,
"learning_rate": 4.95944025102905e-06,
"loss": 0.0722,
"step": 632
},
{
"epoch": 0.28798908098271153,
"grad_norm": 1.99039564333339,
"learning_rate": 4.959311942243888e-06,
"loss": 0.1158,
"step": 633
},
{
"epoch": 0.28844404003639673,
"grad_norm": 1.593751969696495,
"learning_rate": 4.95918343249486e-06,
"loss": 0.0861,
"step": 634
},
{
"epoch": 0.2888989990900819,
"grad_norm": 1.8945402616067804,
"learning_rate": 4.959054721792469e-06,
"loss": 0.1171,
"step": 635
},
{
"epoch": 0.2893539581437671,
"grad_norm": 1.4569740573581391,
"learning_rate": 4.958925810147231e-06,
"loss": 0.0777,
"step": 636
},
{
"epoch": 0.2898089171974522,
"grad_norm": 1.7102068304451903,
"learning_rate": 4.958796697569679e-06,
"loss": 0.0872,
"step": 637
},
{
"epoch": 0.2902638762511374,
"grad_norm": 1.5378977203553044,
"learning_rate": 4.958667384070365e-06,
"loss": 0.0796,
"step": 638
},
{
"epoch": 0.29071883530482256,
"grad_norm": 1.9723232607058794,
"learning_rate": 4.958537869659855e-06,
"loss": 0.1204,
"step": 639
},
{
"epoch": 0.2911737943585077,
"grad_norm": 1.4856408560761394,
"learning_rate": 4.958408154348734e-06,
"loss": 0.0763,
"step": 640
},
{
"epoch": 0.2916287534121929,
"grad_norm": 1.7342797592944788,
"learning_rate": 4.9582782381476e-06,
"loss": 0.1104,
"step": 641
},
{
"epoch": 0.29208371246587805,
"grad_norm": 2.179383476129295,
"learning_rate": 4.958148121067071e-06,
"loss": 0.1694,
"step": 642
},
{
"epoch": 0.29253867151956325,
"grad_norm": 1.8609060135735762,
"learning_rate": 4.9580178031177775e-06,
"loss": 0.1303,
"step": 643
},
{
"epoch": 0.2929936305732484,
"grad_norm": 1.4742279064065518,
"learning_rate": 4.9578872843103694e-06,
"loss": 0.1001,
"step": 644
},
{
"epoch": 0.2934485896269336,
"grad_norm": 1.7670333338462736,
"learning_rate": 4.957756564655513e-06,
"loss": 0.1022,
"step": 645
},
{
"epoch": 0.29390354868061874,
"grad_norm": 1.6630538784639108,
"learning_rate": 4.957625644163888e-06,
"loss": 0.1055,
"step": 646
},
{
"epoch": 0.2943585077343039,
"grad_norm": 1.9118546637397547,
"learning_rate": 4.957494522846194e-06,
"loss": 0.1029,
"step": 647
},
{
"epoch": 0.2948134667879891,
"grad_norm": 1.7468783195584092,
"learning_rate": 4.957363200713146e-06,
"loss": 0.13,
"step": 648
},
{
"epoch": 0.29526842584167423,
"grad_norm": 1.4923304655802225,
"learning_rate": 4.957231677775475e-06,
"loss": 0.0846,
"step": 649
},
{
"epoch": 0.29572338489535943,
"grad_norm": 2.0864859163635407,
"learning_rate": 4.957099954043928e-06,
"loss": 0.1363,
"step": 650
},
{
"epoch": 0.2961783439490446,
"grad_norm": 1.467640729386297,
"learning_rate": 4.956968029529269e-06,
"loss": 0.113,
"step": 651
},
{
"epoch": 0.2966333030027298,
"grad_norm": 1.5940129351295147,
"learning_rate": 4.956835904242277e-06,
"loss": 0.1121,
"step": 652
},
{
"epoch": 0.2970882620564149,
"grad_norm": 1.305300483782713,
"learning_rate": 4.9567035781937516e-06,
"loss": 0.0569,
"step": 653
},
{
"epoch": 0.29754322111010006,
"grad_norm": 1.8626374769697236,
"learning_rate": 4.9565710513945024e-06,
"loss": 0.095,
"step": 654
},
{
"epoch": 0.29799818016378526,
"grad_norm": 1.9350135167075724,
"learning_rate": 4.956438323855362e-06,
"loss": 0.11,
"step": 655
},
{
"epoch": 0.2984531392174704,
"grad_norm": 1.7292500874953625,
"learning_rate": 4.956305395587174e-06,
"loss": 0.1259,
"step": 656
},
{
"epoch": 0.2989080982711556,
"grad_norm": 1.7021672274359103,
"learning_rate": 4.956172266600802e-06,
"loss": 0.0857,
"step": 657
},
{
"epoch": 0.29936305732484075,
"grad_norm": 1.2481942065304896,
"learning_rate": 4.956038936907125e-06,
"loss": 0.0776,
"step": 658
},
{
"epoch": 0.29981801637852595,
"grad_norm": 1.4091727470459356,
"learning_rate": 4.955905406517036e-06,
"loss": 0.0706,
"step": 659
},
{
"epoch": 0.3002729754322111,
"grad_norm": 1.8640524340898077,
"learning_rate": 4.95577167544145e-06,
"loss": 0.1176,
"step": 660
},
{
"epoch": 0.30072793448589624,
"grad_norm": 2.0619543797721698,
"learning_rate": 4.955637743691291e-06,
"loss": 0.1148,
"step": 661
},
{
"epoch": 0.30118289353958144,
"grad_norm": 1.9364848961200234,
"learning_rate": 4.955503611277506e-06,
"loss": 0.0964,
"step": 662
},
{
"epoch": 0.3016378525932666,
"grad_norm": 1.5509916734065172,
"learning_rate": 4.955369278211055e-06,
"loss": 0.0824,
"step": 663
},
{
"epoch": 0.3020928116469518,
"grad_norm": 1.8848317603882998,
"learning_rate": 4.955234744502914e-06,
"loss": 0.1,
"step": 664
},
{
"epoch": 0.30254777070063693,
"grad_norm": 1.7147002197137917,
"learning_rate": 4.955100010164079e-06,
"loss": 0.1042,
"step": 665
},
{
"epoch": 0.30300272975432213,
"grad_norm": 1.8287392204283686,
"learning_rate": 4.954965075205557e-06,
"loss": 0.0894,
"step": 666
},
{
"epoch": 0.3034576888080073,
"grad_norm": 3.2978505813072765,
"learning_rate": 4.9548299396383755e-06,
"loss": 0.1555,
"step": 667
},
{
"epoch": 0.3039126478616925,
"grad_norm": 1.733214316892207,
"learning_rate": 4.954694603473578e-06,
"loss": 0.0848,
"step": 668
},
{
"epoch": 0.3043676069153776,
"grad_norm": 2.1290440022616917,
"learning_rate": 4.954559066722222e-06,
"loss": 0.1329,
"step": 669
},
{
"epoch": 0.30482256596906276,
"grad_norm": 1.7482728884321743,
"learning_rate": 4.954423329395385e-06,
"loss": 0.1135,
"step": 670
},
{
"epoch": 0.30527752502274796,
"grad_norm": 1.8272762006745102,
"learning_rate": 4.954287391504156e-06,
"loss": 0.1233,
"step": 671
},
{
"epoch": 0.3057324840764331,
"grad_norm": 2.276356474817249,
"learning_rate": 4.9541512530596455e-06,
"loss": 0.1426,
"step": 672
},
{
"epoch": 0.3061874431301183,
"grad_norm": 1.5212465132609405,
"learning_rate": 4.954014914072978e-06,
"loss": 0.0908,
"step": 673
},
{
"epoch": 0.30664240218380345,
"grad_norm": 1.7081770141846233,
"learning_rate": 4.9538783745552934e-06,
"loss": 0.1069,
"step": 674
},
{
"epoch": 0.30709736123748865,
"grad_norm": 2.2065783569813755,
"learning_rate": 4.95374163451775e-06,
"loss": 0.1303,
"step": 675
},
{
"epoch": 0.3075523202911738,
"grad_norm": 1.9717809133208803,
"learning_rate": 4.953604693971521e-06,
"loss": 0.0969,
"step": 676
},
{
"epoch": 0.30800727934485894,
"grad_norm": 1.5094990032560427,
"learning_rate": 4.953467552927798e-06,
"loss": 0.059,
"step": 677
},
{
"epoch": 0.30846223839854414,
"grad_norm": 2.5084055121202726,
"learning_rate": 4.9533302113977845e-06,
"loss": 0.141,
"step": 678
},
{
"epoch": 0.3089171974522293,
"grad_norm": 2.1105100650062814,
"learning_rate": 4.9531926693927055e-06,
"loss": 0.1162,
"step": 679
},
{
"epoch": 0.3093721565059145,
"grad_norm": 1.9374617838160508,
"learning_rate": 4.953054926923801e-06,
"loss": 0.1119,
"step": 680
},
{
"epoch": 0.30982711555959963,
"grad_norm": 2.266159358282095,
"learning_rate": 4.952916984002325e-06,
"loss": 0.1188,
"step": 681
},
{
"epoch": 0.31028207461328483,
"grad_norm": 2.1490900129362243,
"learning_rate": 4.95277884063955e-06,
"loss": 0.1337,
"step": 682
},
{
"epoch": 0.31073703366697,
"grad_norm": 1.5330806658735066,
"learning_rate": 4.952640496846766e-06,
"loss": 0.109,
"step": 683
},
{
"epoch": 0.3111919927206551,
"grad_norm": 1.41231573264733,
"learning_rate": 4.952501952635276e-06,
"loss": 0.0837,
"step": 684
},
{
"epoch": 0.3116469517743403,
"grad_norm": 1.993511064296186,
"learning_rate": 4.952363208016402e-06,
"loss": 0.1272,
"step": 685
},
{
"epoch": 0.31210191082802546,
"grad_norm": 1.6098606771380728,
"learning_rate": 4.952224263001482e-06,
"loss": 0.0816,
"step": 686
},
{
"epoch": 0.31255686988171066,
"grad_norm": 1.2309412681015492,
"learning_rate": 4.952085117601868e-06,
"loss": 0.0692,
"step": 687
},
{
"epoch": 0.3130118289353958,
"grad_norm": 1.7997377974129165,
"learning_rate": 4.951945771828933e-06,
"loss": 0.1322,
"step": 688
},
{
"epoch": 0.313466787989081,
"grad_norm": 1.3223154067967124,
"learning_rate": 4.951806225694061e-06,
"loss": 0.0979,
"step": 689
},
{
"epoch": 0.31392174704276615,
"grad_norm": 1.9747397800251965,
"learning_rate": 4.951666479208658e-06,
"loss": 0.1184,
"step": 690
},
{
"epoch": 0.3143767060964513,
"grad_norm": 1.4466542632801185,
"learning_rate": 4.951526532384141e-06,
"loss": 0.085,
"step": 691
},
{
"epoch": 0.3148316651501365,
"grad_norm": 1.8649877852775587,
"learning_rate": 4.951386385231946e-06,
"loss": 0.1011,
"step": 692
},
{
"epoch": 0.31528662420382164,
"grad_norm": 1.2680670071467166,
"learning_rate": 4.951246037763528e-06,
"loss": 0.0748,
"step": 693
},
{
"epoch": 0.31574158325750684,
"grad_norm": 1.5151831279551418,
"learning_rate": 4.9511054899903524e-06,
"loss": 0.0874,
"step": 694
},
{
"epoch": 0.316196542311192,
"grad_norm": 1.6436638497099227,
"learning_rate": 4.950964741923905e-06,
"loss": 0.0982,
"step": 695
},
{
"epoch": 0.3166515013648772,
"grad_norm": 1.5379093700813176,
"learning_rate": 4.950823793575688e-06,
"loss": 0.0857,
"step": 696
},
{
"epoch": 0.31710646041856233,
"grad_norm": 2.4063943761092452,
"learning_rate": 4.950682644957218e-06,
"loss": 0.1253,
"step": 697
},
{
"epoch": 0.3175614194722475,
"grad_norm": 2.5063143673804844,
"learning_rate": 4.9505412960800295e-06,
"loss": 0.1511,
"step": 698
},
{
"epoch": 0.3180163785259327,
"grad_norm": 1.722833309256951,
"learning_rate": 4.950399746955673e-06,
"loss": 0.0999,
"step": 699
},
{
"epoch": 0.3184713375796178,
"grad_norm": 1.8190148406823232,
"learning_rate": 4.950257997595716e-06,
"loss": 0.0895,
"step": 700
},
{
"epoch": 0.318926296633303,
"grad_norm": 1.9186747250049239,
"learning_rate": 4.950116048011739e-06,
"loss": 0.0964,
"step": 701
},
{
"epoch": 0.31938125568698816,
"grad_norm": 1.372930302125184,
"learning_rate": 4.949973898215344e-06,
"loss": 0.0589,
"step": 702
},
{
"epoch": 0.31983621474067336,
"grad_norm": 1.9707430002902289,
"learning_rate": 4.949831548218146e-06,
"loss": 0.1054,
"step": 703
},
{
"epoch": 0.3202911737943585,
"grad_norm": 2.0845604349239832,
"learning_rate": 4.949688998031777e-06,
"loss": 0.1105,
"step": 704
},
{
"epoch": 0.32074613284804365,
"grad_norm": 1.4969274131429369,
"learning_rate": 4.949546247667886e-06,
"loss": 0.0814,
"step": 705
},
{
"epoch": 0.32120109190172885,
"grad_norm": 1.9940826155791407,
"learning_rate": 4.949403297138137e-06,
"loss": 0.1064,
"step": 706
},
{
"epoch": 0.321656050955414,
"grad_norm": 1.7246519891154302,
"learning_rate": 4.949260146454212e-06,
"loss": 0.1093,
"step": 707
},
{
"epoch": 0.3221110100090992,
"grad_norm": 1.6890948945842699,
"learning_rate": 4.94911679562781e-06,
"loss": 0.0888,
"step": 708
},
{
"epoch": 0.32256596906278434,
"grad_norm": 2.0455963687465837,
"learning_rate": 4.948973244670643e-06,
"loss": 0.1019,
"step": 709
},
{
"epoch": 0.32302092811646954,
"grad_norm": 1.7678121189421865,
"learning_rate": 4.948829493594441e-06,
"loss": 0.0961,
"step": 710
},
{
"epoch": 0.3234758871701547,
"grad_norm": 1.3731566726245188,
"learning_rate": 4.9486855424109524e-06,
"loss": 0.072,
"step": 711
},
{
"epoch": 0.32393084622383983,
"grad_norm": 1.4962983653581472,
"learning_rate": 4.948541391131939e-06,
"loss": 0.0905,
"step": 712
},
{
"epoch": 0.32438580527752503,
"grad_norm": 1.4198695601427125,
"learning_rate": 4.948397039769181e-06,
"loss": 0.0616,
"step": 713
},
{
"epoch": 0.3248407643312102,
"grad_norm": 1.131377673368795,
"learning_rate": 4.948252488334474e-06,
"loss": 0.0526,
"step": 714
},
{
"epoch": 0.3252957233848954,
"grad_norm": 1.1969683311404917,
"learning_rate": 4.948107736839629e-06,
"loss": 0.0763,
"step": 715
},
{
"epoch": 0.3257506824385805,
"grad_norm": 1.6793927846583725,
"learning_rate": 4.947962785296476e-06,
"loss": 0.1153,
"step": 716
},
{
"epoch": 0.3262056414922657,
"grad_norm": 2.070694963019659,
"learning_rate": 4.9478176337168594e-06,
"loss": 0.1153,
"step": 717
},
{
"epoch": 0.32666060054595086,
"grad_norm": 2.7729923804058516,
"learning_rate": 4.9476722821126386e-06,
"loss": 0.171,
"step": 718
},
{
"epoch": 0.327115559599636,
"grad_norm": 1.4442284620787837,
"learning_rate": 4.9475267304956945e-06,
"loss": 0.0997,
"step": 719
},
{
"epoch": 0.3275705186533212,
"grad_norm": 2.0979816044129413,
"learning_rate": 4.947380978877917e-06,
"loss": 0.1138,
"step": 720
},
{
"epoch": 0.32802547770700635,
"grad_norm": 1.9982881232916472,
"learning_rate": 4.947235027271219e-06,
"loss": 0.1402,
"step": 721
},
{
"epoch": 0.32848043676069155,
"grad_norm": 1.3317844805683108,
"learning_rate": 4.9470888756875265e-06,
"loss": 0.0707,
"step": 722
},
{
"epoch": 0.3289353958143767,
"grad_norm": 1.4665146144499257,
"learning_rate": 4.946942524138782e-06,
"loss": 0.075,
"step": 723
},
{
"epoch": 0.3293903548680619,
"grad_norm": 1.6321427811402383,
"learning_rate": 4.946795972636944e-06,
"loss": 0.0971,
"step": 724
},
{
"epoch": 0.32984531392174704,
"grad_norm": 1.9541110640157349,
"learning_rate": 4.94664922119399e-06,
"loss": 0.1347,
"step": 725
},
{
"epoch": 0.3303002729754322,
"grad_norm": 1.664760132709453,
"learning_rate": 4.94650226982191e-06,
"loss": 0.0959,
"step": 726
},
{
"epoch": 0.3307552320291174,
"grad_norm": 2.509161708357272,
"learning_rate": 4.9463551185327115e-06,
"loss": 0.1885,
"step": 727
},
{
"epoch": 0.33121019108280253,
"grad_norm": 1.7296886670922147,
"learning_rate": 4.946207767338422e-06,
"loss": 0.0867,
"step": 728
},
{
"epoch": 0.33166515013648773,
"grad_norm": 1.5254904811287948,
"learning_rate": 4.9460602162510805e-06,
"loss": 0.09,
"step": 729
},
{
"epoch": 0.3321201091901729,
"grad_norm": 1.3404896968358107,
"learning_rate": 4.945912465282744e-06,
"loss": 0.0782,
"step": 730
},
{
"epoch": 0.3325750682438581,
"grad_norm": 1.79952897501454,
"learning_rate": 4.945764514445487e-06,
"loss": 0.1444,
"step": 731
},
{
"epoch": 0.3330300272975432,
"grad_norm": 2.48899319031489,
"learning_rate": 4.9456163637513986e-06,
"loss": 0.1136,
"step": 732
},
{
"epoch": 0.33348498635122836,
"grad_norm": 1.8285171425829347,
"learning_rate": 4.945468013212585e-06,
"loss": 0.1052,
"step": 733
},
{
"epoch": 0.33393994540491356,
"grad_norm": 1.7843881981445446,
"learning_rate": 4.945319462841169e-06,
"loss": 0.1116,
"step": 734
},
{
"epoch": 0.3343949044585987,
"grad_norm": 2.181301353034186,
"learning_rate": 4.94517071264929e-06,
"loss": 0.1251,
"step": 735
},
{
"epoch": 0.3348498635122839,
"grad_norm": 1.2980326592722402,
"learning_rate": 4.945021762649102e-06,
"loss": 0.0648,
"step": 736
},
{
"epoch": 0.33530482256596905,
"grad_norm": 1.3874782347309536,
"learning_rate": 4.9448726128527776e-06,
"loss": 0.0978,
"step": 737
},
{
"epoch": 0.33575978161965425,
"grad_norm": 1.8955499231356112,
"learning_rate": 4.944723263272504e-06,
"loss": 0.0998,
"step": 738
},
{
"epoch": 0.3362147406733394,
"grad_norm": 1.6102418502733031,
"learning_rate": 4.944573713920485e-06,
"loss": 0.1055,
"step": 739
},
{
"epoch": 0.33666969972702454,
"grad_norm": 3.355056116777925,
"learning_rate": 4.944423964808943e-06,
"loss": 0.1831,
"step": 740
},
{
"epoch": 0.33712465878070974,
"grad_norm": 1.507329867530008,
"learning_rate": 4.944274015950113e-06,
"loss": 0.0889,
"step": 741
},
{
"epoch": 0.3375796178343949,
"grad_norm": 1.610548678904166,
"learning_rate": 4.944123867356249e-06,
"loss": 0.0752,
"step": 742
},
{
"epoch": 0.3380345768880801,
"grad_norm": 1.918715600058829,
"learning_rate": 4.943973519039619e-06,
"loss": 0.1335,
"step": 743
},
{
"epoch": 0.33848953594176523,
"grad_norm": 1.3921163271356483,
"learning_rate": 4.943822971012511e-06,
"loss": 0.0727,
"step": 744
},
{
"epoch": 0.33894449499545043,
"grad_norm": 1.2023922578586952,
"learning_rate": 4.943672223287226e-06,
"loss": 0.0628,
"step": 745
},
{
"epoch": 0.3393994540491356,
"grad_norm": 2.2794421985003317,
"learning_rate": 4.9435212758760815e-06,
"loss": 0.1404,
"step": 746
},
{
"epoch": 0.3398544131028208,
"grad_norm": 1.3986125533304865,
"learning_rate": 4.943370128791413e-06,
"loss": 0.0787,
"step": 747
},
{
"epoch": 0.3403093721565059,
"grad_norm": 1.5259961799310353,
"learning_rate": 4.943218782045574e-06,
"loss": 0.1079,
"step": 748
},
{
"epoch": 0.34076433121019106,
"grad_norm": 1.8181192019120165,
"learning_rate": 4.943067235650927e-06,
"loss": 0.1195,
"step": 749
},
{
"epoch": 0.34121929026387626,
"grad_norm": 1.831268771798402,
"learning_rate": 4.942915489619859e-06,
"loss": 0.1065,
"step": 750
},
{
"epoch": 0.3416742493175614,
"grad_norm": 1.7306841826817951,
"learning_rate": 4.9427635439647704e-06,
"loss": 0.1232,
"step": 751
},
{
"epoch": 0.3421292083712466,
"grad_norm": 1.7076927486745839,
"learning_rate": 4.942611398698075e-06,
"loss": 0.0912,
"step": 752
},
{
"epoch": 0.34258416742493175,
"grad_norm": 1.7425991433970283,
"learning_rate": 4.942459053832208e-06,
"loss": 0.0997,
"step": 753
},
{
"epoch": 0.34303912647861695,
"grad_norm": 1.809200639541382,
"learning_rate": 4.942306509379617e-06,
"loss": 0.1085,
"step": 754
},
{
"epoch": 0.3434940855323021,
"grad_norm": 1.293751880354007,
"learning_rate": 4.942153765352767e-06,
"loss": 0.0966,
"step": 755
},
{
"epoch": 0.34394904458598724,
"grad_norm": 1.2918089478267207,
"learning_rate": 4.94200082176414e-06,
"loss": 0.078,
"step": 756
},
{
"epoch": 0.34440400363967244,
"grad_norm": 1.5059276244213293,
"learning_rate": 4.941847678626234e-06,
"loss": 0.0805,
"step": 757
},
{
"epoch": 0.3448589626933576,
"grad_norm": 1.4851814064844335,
"learning_rate": 4.941694335951563e-06,
"loss": 0.0983,
"step": 758
},
{
"epoch": 0.3453139217470428,
"grad_norm": 1.8989617812022122,
"learning_rate": 4.9415407937526575e-06,
"loss": 0.1107,
"step": 759
},
{
"epoch": 0.34576888080072793,
"grad_norm": 1.8347292963195811,
"learning_rate": 4.9413870520420635e-06,
"loss": 0.1237,
"step": 760
},
{
"epoch": 0.34622383985441313,
"grad_norm": 1.5924498433598573,
"learning_rate": 4.941233110832346e-06,
"loss": 0.0735,
"step": 761
},
{
"epoch": 0.3466787989080983,
"grad_norm": 2.3326854621993984,
"learning_rate": 4.941078970136082e-06,
"loss": 0.1295,
"step": 762
},
{
"epoch": 0.3471337579617834,
"grad_norm": 1.7112828341096407,
"learning_rate": 4.940924629965869e-06,
"loss": 0.1162,
"step": 763
},
{
"epoch": 0.3475887170154686,
"grad_norm": 1.5436956280322631,
"learning_rate": 4.940770090334319e-06,
"loss": 0.0861,
"step": 764
},
{
"epoch": 0.34804367606915376,
"grad_norm": 1.6236751771508604,
"learning_rate": 4.940615351254059e-06,
"loss": 0.0968,
"step": 765
},
{
"epoch": 0.34849863512283896,
"grad_norm": 1.0400997330052792,
"learning_rate": 4.940460412737734e-06,
"loss": 0.0711,
"step": 766
},
{
"epoch": 0.3489535941765241,
"grad_norm": 1.623731539624473,
"learning_rate": 4.940305274798005e-06,
"loss": 0.0929,
"step": 767
},
{
"epoch": 0.3494085532302093,
"grad_norm": 1.3764287278870393,
"learning_rate": 4.940149937447549e-06,
"loss": 0.1002,
"step": 768
},
{
"epoch": 0.34986351228389445,
"grad_norm": 1.1571526873015439,
"learning_rate": 4.939994400699061e-06,
"loss": 0.0659,
"step": 769
},
{
"epoch": 0.3503184713375796,
"grad_norm": 1.3670356182264325,
"learning_rate": 4.939838664565248e-06,
"loss": 0.0991,
"step": 770
},
{
"epoch": 0.3507734303912648,
"grad_norm": 1.2532975621868427,
"learning_rate": 4.939682729058839e-06,
"loss": 0.0713,
"step": 771
},
{
"epoch": 0.35122838944494994,
"grad_norm": 1.3003896066972325,
"learning_rate": 4.939526594192574e-06,
"loss": 0.0784,
"step": 772
},
{
"epoch": 0.35168334849863514,
"grad_norm": 1.4253255736587618,
"learning_rate": 4.939370259979213e-06,
"loss": 0.0826,
"step": 773
},
{
"epoch": 0.3521383075523203,
"grad_norm": 2.0399381310170766,
"learning_rate": 4.9392137264315295e-06,
"loss": 0.1293,
"step": 774
},
{
"epoch": 0.3525932666060055,
"grad_norm": 1.938165172266556,
"learning_rate": 4.939056993562316e-06,
"loss": 0.1407,
"step": 775
},
{
"epoch": 0.35304822565969063,
"grad_norm": 1.5665447950299711,
"learning_rate": 4.9389000613843805e-06,
"loss": 0.0942,
"step": 776
},
{
"epoch": 0.3535031847133758,
"grad_norm": 1.6514430942693614,
"learning_rate": 4.938742929910546e-06,
"loss": 0.0927,
"step": 777
},
{
"epoch": 0.353958143767061,
"grad_norm": 1.0136329941515525,
"learning_rate": 4.938585599153652e-06,
"loss": 0.0676,
"step": 778
},
{
"epoch": 0.3544131028207461,
"grad_norm": 1.6808166258098367,
"learning_rate": 4.938428069126555e-06,
"loss": 0.1029,
"step": 779
},
{
"epoch": 0.3548680618744313,
"grad_norm": 1.6649052760273926,
"learning_rate": 4.9382703398421285e-06,
"loss": 0.0952,
"step": 780
},
{
"epoch": 0.35532302092811646,
"grad_norm": 1.734423574608651,
"learning_rate": 4.938112411313261e-06,
"loss": 0.1098,
"step": 781
},
{
"epoch": 0.35577797998180166,
"grad_norm": 1.5154424391674823,
"learning_rate": 4.937954283552858e-06,
"loss": 0.0808,
"step": 782
},
{
"epoch": 0.3562329390354868,
"grad_norm": 1.6988796126790968,
"learning_rate": 4.93779595657384e-06,
"loss": 0.1066,
"step": 783
},
{
"epoch": 0.35668789808917195,
"grad_norm": 2.050921985283142,
"learning_rate": 4.937637430389145e-06,
"loss": 0.1184,
"step": 784
},
{
"epoch": 0.35714285714285715,
"grad_norm": 1.5678672253769157,
"learning_rate": 4.937478705011729e-06,
"loss": 0.0709,
"step": 785
},
{
"epoch": 0.3575978161965423,
"grad_norm": 1.5215473079480804,
"learning_rate": 4.937319780454559e-06,
"loss": 0.1086,
"step": 786
},
{
"epoch": 0.3580527752502275,
"grad_norm": 1.4009067409412712,
"learning_rate": 4.937160656730625e-06,
"loss": 0.1004,
"step": 787
},
{
"epoch": 0.35850773430391264,
"grad_norm": 1.538795370618956,
"learning_rate": 4.9370013338529274e-06,
"loss": 0.0897,
"step": 788
},
{
"epoch": 0.35896269335759784,
"grad_norm": 1.3446100123630027,
"learning_rate": 4.936841811834486e-06,
"loss": 0.0907,
"step": 789
},
{
"epoch": 0.359417652411283,
"grad_norm": 1.9381081676057568,
"learning_rate": 4.936682090688337e-06,
"loss": 0.1534,
"step": 790
},
{
"epoch": 0.35987261146496813,
"grad_norm": 1.787589837431021,
"learning_rate": 4.936522170427531e-06,
"loss": 0.0919,
"step": 791
},
{
"epoch": 0.36032757051865333,
"grad_norm": 1.7189621906826116,
"learning_rate": 4.936362051065136e-06,
"loss": 0.0799,
"step": 792
},
{
"epoch": 0.3607825295723385,
"grad_norm": 1.615638183805568,
"learning_rate": 4.936201732614238e-06,
"loss": 0.0898,
"step": 793
},
{
"epoch": 0.3612374886260237,
"grad_norm": 1.899483445293266,
"learning_rate": 4.9360412150879355e-06,
"loss": 0.1086,
"step": 794
},
{
"epoch": 0.3616924476797088,
"grad_norm": 1.8831302635176637,
"learning_rate": 4.935880498499346e-06,
"loss": 0.0951,
"step": 795
},
{
"epoch": 0.362147406733394,
"grad_norm": 2.0172166216160594,
"learning_rate": 4.935719582861604e-06,
"loss": 0.0983,
"step": 796
},
{
"epoch": 0.36260236578707916,
"grad_norm": 1.7713001106130557,
"learning_rate": 4.935558468187855e-06,
"loss": 0.1177,
"step": 797
},
{
"epoch": 0.3630573248407643,
"grad_norm": 2.049007453668216,
"learning_rate": 4.935397154491268e-06,
"loss": 0.1349,
"step": 798
},
{
"epoch": 0.3635122838944495,
"grad_norm": 2.02340700279538,
"learning_rate": 4.935235641785023e-06,
"loss": 0.1419,
"step": 799
},
{
"epoch": 0.36396724294813465,
"grad_norm": 1.5504094804690502,
"learning_rate": 4.935073930082319e-06,
"loss": 0.1141,
"step": 800
},
{
"epoch": 0.36442220200181985,
"grad_norm": 1.3892292745868653,
"learning_rate": 4.93491201939637e-06,
"loss": 0.0859,
"step": 801
},
{
"epoch": 0.364877161055505,
"grad_norm": 1.636711407623354,
"learning_rate": 4.934749909740408e-06,
"loss": 0.1168,
"step": 802
},
{
"epoch": 0.3653321201091902,
"grad_norm": 1.5867549476191922,
"learning_rate": 4.934587601127677e-06,
"loss": 0.0941,
"step": 803
},
{
"epoch": 0.36578707916287534,
"grad_norm": 1.5019646850922737,
"learning_rate": 4.934425093571442e-06,
"loss": 0.0931,
"step": 804
},
{
"epoch": 0.3662420382165605,
"grad_norm": 1.5412581659446851,
"learning_rate": 4.934262387084984e-06,
"loss": 0.0931,
"step": 805
},
{
"epoch": 0.3666969972702457,
"grad_norm": 1.3579602631174856,
"learning_rate": 4.934099481681595e-06,
"loss": 0.0745,
"step": 806
},
{
"epoch": 0.36715195632393083,
"grad_norm": 1.800459979497766,
"learning_rate": 4.933936377374589e-06,
"loss": 0.1072,
"step": 807
},
{
"epoch": 0.36760691537761603,
"grad_norm": 1.1946995764469395,
"learning_rate": 4.933773074177293e-06,
"loss": 0.0848,
"step": 808
},
{
"epoch": 0.3680618744313012,
"grad_norm": 1.6651644751131276,
"learning_rate": 4.933609572103053e-06,
"loss": 0.0965,
"step": 809
},
{
"epoch": 0.3685168334849864,
"grad_norm": 1.913995880200427,
"learning_rate": 4.933445871165229e-06,
"loss": 0.1315,
"step": 810
},
{
"epoch": 0.3689717925386715,
"grad_norm": 1.5517430124798408,
"learning_rate": 4.933281971377197e-06,
"loss": 0.0856,
"step": 811
},
{
"epoch": 0.36942675159235666,
"grad_norm": 1.474632001508129,
"learning_rate": 4.933117872752352e-06,
"loss": 0.0989,
"step": 812
},
{
"epoch": 0.36988171064604186,
"grad_norm": 1.8862093944877263,
"learning_rate": 4.932953575304102e-06,
"loss": 0.1087,
"step": 813
},
{
"epoch": 0.370336669699727,
"grad_norm": 1.6830668966166524,
"learning_rate": 4.932789079045873e-06,
"loss": 0.1213,
"step": 814
},
{
"epoch": 0.3707916287534122,
"grad_norm": 1.7198476556190763,
"learning_rate": 4.932624383991106e-06,
"loss": 0.1215,
"step": 815
},
{
"epoch": 0.37124658780709735,
"grad_norm": 2.109229814604393,
"learning_rate": 4.9324594901532605e-06,
"loss": 0.1337,
"step": 816
},
{
"epoch": 0.37170154686078255,
"grad_norm": 1.4154701665481155,
"learning_rate": 4.93229439754581e-06,
"loss": 0.0944,
"step": 817
},
{
"epoch": 0.3721565059144677,
"grad_norm": 1.973608289061544,
"learning_rate": 4.932129106182246e-06,
"loss": 0.0901,
"step": 818
},
{
"epoch": 0.37261146496815284,
"grad_norm": 1.651833939526615,
"learning_rate": 4.931963616076075e-06,
"loss": 0.0876,
"step": 819
},
{
"epoch": 0.37306642402183804,
"grad_norm": 1.3876140677966586,
"learning_rate": 4.93179792724082e-06,
"loss": 0.0791,
"step": 820
},
{
"epoch": 0.3735213830755232,
"grad_norm": 1.4201117298181156,
"learning_rate": 4.9316320396900195e-06,
"loss": 0.0857,
"step": 821
},
{
"epoch": 0.3739763421292084,
"grad_norm": 2.158894018361071,
"learning_rate": 4.9314659534372305e-06,
"loss": 0.1499,
"step": 822
},
{
"epoch": 0.37443130118289353,
"grad_norm": 1.2722019893377066,
"learning_rate": 4.931299668496024e-06,
"loss": 0.0626,
"step": 823
},
{
"epoch": 0.37488626023657873,
"grad_norm": 1.5889108253283166,
"learning_rate": 4.931133184879988e-06,
"loss": 0.1003,
"step": 824
},
{
"epoch": 0.37534121929026387,
"grad_norm": 1.133918642525753,
"learning_rate": 4.930966502602727e-06,
"loss": 0.0714,
"step": 825
},
{
"epoch": 0.37579617834394907,
"grad_norm": 2.1296168633446615,
"learning_rate": 4.930799621677862e-06,
"loss": 0.1276,
"step": 826
},
{
"epoch": 0.3762511373976342,
"grad_norm": 2.018575113751553,
"learning_rate": 4.93063254211903e-06,
"loss": 0.134,
"step": 827
},
{
"epoch": 0.37670609645131936,
"grad_norm": 1.2247931548507431,
"learning_rate": 4.930465263939882e-06,
"loss": 0.0617,
"step": 828
},
{
"epoch": 0.37716105550500456,
"grad_norm": 2.032637719937323,
"learning_rate": 4.9302977871540894e-06,
"loss": 0.1191,
"step": 829
},
{
"epoch": 0.3776160145586897,
"grad_norm": 1.8922514826155596,
"learning_rate": 4.930130111775336e-06,
"loss": 0.1136,
"step": 830
},
{
"epoch": 0.3780709736123749,
"grad_norm": 1.2345527477299194,
"learning_rate": 4.9299622378173245e-06,
"loss": 0.0613,
"step": 831
},
{
"epoch": 0.37852593266606005,
"grad_norm": 2.2369584057058693,
"learning_rate": 4.929794165293773e-06,
"loss": 0.1384,
"step": 832
},
{
"epoch": 0.37898089171974525,
"grad_norm": 1.2980952577352378,
"learning_rate": 4.9296258942184145e-06,
"loss": 0.0889,
"step": 833
},
{
"epoch": 0.3794358507734304,
"grad_norm": 2.116237658876168,
"learning_rate": 4.929457424605e-06,
"loss": 0.1156,
"step": 834
},
{
"epoch": 0.37989080982711554,
"grad_norm": 1.820103679143319,
"learning_rate": 4.929288756467296e-06,
"loss": 0.1224,
"step": 835
},
{
"epoch": 0.38034576888080074,
"grad_norm": 1.6658306682266317,
"learning_rate": 4.929119889819086e-06,
"loss": 0.0871,
"step": 836
},
{
"epoch": 0.3808007279344859,
"grad_norm": 2.7831412779318128,
"learning_rate": 4.928950824674169e-06,
"loss": 0.1447,
"step": 837
},
{
"epoch": 0.3812556869881711,
"grad_norm": 1.460745158832598,
"learning_rate": 4.928781561046359e-06,
"loss": 0.0902,
"step": 838
},
{
"epoch": 0.3817106460418562,
"grad_norm": 1.544649379546627,
"learning_rate": 4.928612098949488e-06,
"loss": 0.0995,
"step": 839
},
{
"epoch": 0.3821656050955414,
"grad_norm": 1.583411250445995,
"learning_rate": 4.9284424383974026e-06,
"loss": 0.1007,
"step": 840
},
{
"epoch": 0.38262056414922657,
"grad_norm": 1.2960669635575661,
"learning_rate": 4.928272579403969e-06,
"loss": 0.0679,
"step": 841
},
{
"epoch": 0.3830755232029117,
"grad_norm": 1.4865280371498417,
"learning_rate": 4.928102521983067e-06,
"loss": 0.1208,
"step": 842
},
{
"epoch": 0.3835304822565969,
"grad_norm": 2.1345090660254145,
"learning_rate": 4.9279322661485906e-06,
"loss": 0.1489,
"step": 843
},
{
"epoch": 0.38398544131028206,
"grad_norm": 1.705469805887344,
"learning_rate": 4.927761811914455e-06,
"loss": 0.1084,
"step": 844
},
{
"epoch": 0.38444040036396726,
"grad_norm": 1.358954041720105,
"learning_rate": 4.927591159294587e-06,
"loss": 0.0827,
"step": 845
},
{
"epoch": 0.3848953594176524,
"grad_norm": 1.8335314647218843,
"learning_rate": 4.927420308302933e-06,
"loss": 0.102,
"step": 846
},
{
"epoch": 0.3853503184713376,
"grad_norm": 1.710141204765745,
"learning_rate": 4.927249258953454e-06,
"loss": 0.1091,
"step": 847
},
{
"epoch": 0.38580527752502275,
"grad_norm": 1.7784989569871608,
"learning_rate": 4.927078011260126e-06,
"loss": 0.1094,
"step": 848
},
{
"epoch": 0.3862602365787079,
"grad_norm": 1.9072996593932403,
"learning_rate": 4.926906565236943e-06,
"loss": 0.1255,
"step": 849
},
{
"epoch": 0.3867151956323931,
"grad_norm": 1.7435526255624214,
"learning_rate": 4.926734920897916e-06,
"loss": 0.1076,
"step": 850
},
{
"epoch": 0.38717015468607824,
"grad_norm": 1.3254342460194672,
"learning_rate": 4.926563078257071e-06,
"loss": 0.099,
"step": 851
},
{
"epoch": 0.38762511373976344,
"grad_norm": 1.0985508710385608,
"learning_rate": 4.926391037328448e-06,
"loss": 0.0848,
"step": 852
},
{
"epoch": 0.3880800727934486,
"grad_norm": 1.6344858491886853,
"learning_rate": 4.926218798126108e-06,
"loss": 0.1102,
"step": 853
},
{
"epoch": 0.3885350318471338,
"grad_norm": 1.694464350768917,
"learning_rate": 4.926046360664124e-06,
"loss": 0.0868,
"step": 854
},
{
"epoch": 0.3889899909008189,
"grad_norm": 1.865189060623283,
"learning_rate": 4.925873724956588e-06,
"loss": 0.1152,
"step": 855
},
{
"epoch": 0.38944494995450407,
"grad_norm": 1.794490671041637,
"learning_rate": 4.9257008910176065e-06,
"loss": 0.1443,
"step": 856
},
{
"epoch": 0.38989990900818927,
"grad_norm": 1.6294296423553156,
"learning_rate": 4.925527858861302e-06,
"loss": 0.092,
"step": 857
},
{
"epoch": 0.3903548680618744,
"grad_norm": 1.7424555145921712,
"learning_rate": 4.925354628501814e-06,
"loss": 0.1002,
"step": 858
},
{
"epoch": 0.3908098271155596,
"grad_norm": 2.309513172607415,
"learning_rate": 4.925181199953299e-06,
"loss": 0.1288,
"step": 859
},
{
"epoch": 0.39126478616924476,
"grad_norm": 1.3668641274774587,
"learning_rate": 4.9250075732299285e-06,
"loss": 0.0903,
"step": 860
},
{
"epoch": 0.39171974522292996,
"grad_norm": 1.7785057619158235,
"learning_rate": 4.92483374834589e-06,
"loss": 0.1181,
"step": 861
},
{
"epoch": 0.3921747042766151,
"grad_norm": 1.5234971151354315,
"learning_rate": 4.9246597253153884e-06,
"loss": 0.0935,
"step": 862
},
{
"epoch": 0.39262966333030025,
"grad_norm": 1.1791645313929775,
"learning_rate": 4.924485504152644e-06,
"loss": 0.0822,
"step": 863
},
{
"epoch": 0.39308462238398545,
"grad_norm": 1.5983057485508323,
"learning_rate": 4.924311084871892e-06,
"loss": 0.0966,
"step": 864
},
{
"epoch": 0.3935395814376706,
"grad_norm": 1.6634965227764558,
"learning_rate": 4.924136467487387e-06,
"loss": 0.0759,
"step": 865
},
{
"epoch": 0.3939945404913558,
"grad_norm": 1.5231170961334706,
"learning_rate": 4.923961652013397e-06,
"loss": 0.0881,
"step": 866
},
{
"epoch": 0.39444949954504094,
"grad_norm": 1.4495990250164725,
"learning_rate": 4.923786638464207e-06,
"loss": 0.0941,
"step": 867
},
{
"epoch": 0.39490445859872614,
"grad_norm": 1.3390712595063252,
"learning_rate": 4.9236114268541196e-06,
"loss": 0.0846,
"step": 868
},
{
"epoch": 0.3953594176524113,
"grad_norm": 1.627122973701433,
"learning_rate": 4.923436017197451e-06,
"loss": 0.0819,
"step": 869
},
{
"epoch": 0.3958143767060964,
"grad_norm": 1.3377642278691055,
"learning_rate": 4.923260409508535e-06,
"loss": 0.088,
"step": 870
},
{
"epoch": 0.3962693357597816,
"grad_norm": 1.9694748985572026,
"learning_rate": 4.9230846038017214e-06,
"loss": 0.151,
"step": 871
},
{
"epoch": 0.39672429481346677,
"grad_norm": 1.4923965061921258,
"learning_rate": 4.922908600091378e-06,
"loss": 0.0795,
"step": 872
},
{
"epoch": 0.39717925386715197,
"grad_norm": 1.8057120373297069,
"learning_rate": 4.9227323983918835e-06,
"loss": 0.1439,
"step": 873
},
{
"epoch": 0.3976342129208371,
"grad_norm": 1.226146313826682,
"learning_rate": 4.922555998717639e-06,
"loss": 0.0845,
"step": 874
},
{
"epoch": 0.3980891719745223,
"grad_norm": 1.4188073442884932,
"learning_rate": 4.922379401083058e-06,
"loss": 0.0723,
"step": 875
},
{
"epoch": 0.39854413102820746,
"grad_norm": 1.6044422866063657,
"learning_rate": 4.922202605502573e-06,
"loss": 0.0981,
"step": 876
},
{
"epoch": 0.3989990900818926,
"grad_norm": 1.645096377490142,
"learning_rate": 4.922025611990629e-06,
"loss": 0.0882,
"step": 877
},
{
"epoch": 0.3994540491355778,
"grad_norm": 1.4988618969542298,
"learning_rate": 4.92184842056169e-06,
"loss": 0.0914,
"step": 878
},
{
"epoch": 0.39990900818926295,
"grad_norm": 1.4716766649704647,
"learning_rate": 4.921671031230235e-06,
"loss": 0.0843,
"step": 879
},
{
"epoch": 0.40036396724294815,
"grad_norm": 1.8151437273817552,
"learning_rate": 4.921493444010759e-06,
"loss": 0.1115,
"step": 880
},
{
"epoch": 0.4008189262966333,
"grad_norm": 1.3841092562389385,
"learning_rate": 4.921315658917774e-06,
"loss": 0.0821,
"step": 881
},
{
"epoch": 0.4012738853503185,
"grad_norm": 1.5281014710080694,
"learning_rate": 4.921137675965809e-06,
"loss": 0.0894,
"step": 882
},
{
"epoch": 0.40172884440400364,
"grad_norm": 1.1860457913745353,
"learning_rate": 4.920959495169406e-06,
"loss": 0.0819,
"step": 883
},
{
"epoch": 0.4021838034576888,
"grad_norm": 1.9670434695091386,
"learning_rate": 4.920781116543126e-06,
"loss": 0.1198,
"step": 884
},
{
"epoch": 0.402638762511374,
"grad_norm": 1.4837005110977715,
"learning_rate": 4.920602540101546e-06,
"loss": 0.0871,
"step": 885
},
{
"epoch": 0.4030937215650591,
"grad_norm": 1.8269163623820734,
"learning_rate": 4.920423765859257e-06,
"loss": 0.0956,
"step": 886
},
{
"epoch": 0.4035486806187443,
"grad_norm": 1.6998774179110374,
"learning_rate": 4.920244793830869e-06,
"loss": 0.0973,
"step": 887
},
{
"epoch": 0.40400363967242947,
"grad_norm": 1.6596471546846747,
"learning_rate": 4.920065624031006e-06,
"loss": 0.1085,
"step": 888
},
{
"epoch": 0.40445859872611467,
"grad_norm": 1.4077908132773769,
"learning_rate": 4.919886256474309e-06,
"loss": 0.0904,
"step": 889
},
{
"epoch": 0.4049135577797998,
"grad_norm": 1.7022215596121757,
"learning_rate": 4.919706691175435e-06,
"loss": 0.091,
"step": 890
},
{
"epoch": 0.40536851683348496,
"grad_norm": 2.1232813584307455,
"learning_rate": 4.919526928149058e-06,
"loss": 0.1366,
"step": 891
},
{
"epoch": 0.40582347588717016,
"grad_norm": 1.6341211456957871,
"learning_rate": 4.919346967409867e-06,
"loss": 0.1108,
"step": 892
},
{
"epoch": 0.4062784349408553,
"grad_norm": 1.5324489468460818,
"learning_rate": 4.919166808972567e-06,
"loss": 0.1228,
"step": 893
},
{
"epoch": 0.4067333939945405,
"grad_norm": 2.099437608372934,
"learning_rate": 4.918986452851881e-06,
"loss": 0.1245,
"step": 894
},
{
"epoch": 0.40718835304822565,
"grad_norm": 1.3588941988828955,
"learning_rate": 4.918805899062545e-06,
"loss": 0.0621,
"step": 895
},
{
"epoch": 0.40764331210191085,
"grad_norm": 0.8277266375645331,
"learning_rate": 4.9186251476193146e-06,
"loss": 0.0499,
"step": 896
},
{
"epoch": 0.408098271155596,
"grad_norm": 1.7852175335240448,
"learning_rate": 4.918444198536959e-06,
"loss": 0.1206,
"step": 897
},
{
"epoch": 0.40855323020928114,
"grad_norm": 1.5382745011065326,
"learning_rate": 4.918263051830267e-06,
"loss": 0.1081,
"step": 898
},
{
"epoch": 0.40900818926296634,
"grad_norm": 1.621296590196374,
"learning_rate": 4.918081707514037e-06,
"loss": 0.0881,
"step": 899
},
{
"epoch": 0.4094631483166515,
"grad_norm": 2.178092466242458,
"learning_rate": 4.917900165603091e-06,
"loss": 0.1364,
"step": 900
},
{
"epoch": 0.4099181073703367,
"grad_norm": 1.5880350908655525,
"learning_rate": 4.9177184261122624e-06,
"loss": 0.1073,
"step": 901
},
{
"epoch": 0.4103730664240218,
"grad_norm": 1.8483741427612825,
"learning_rate": 4.917536489056402e-06,
"loss": 0.0972,
"step": 902
},
{
"epoch": 0.410828025477707,
"grad_norm": 1.5893537500919641,
"learning_rate": 4.9173543544503775e-06,
"loss": 0.0851,
"step": 903
},
{
"epoch": 0.41128298453139217,
"grad_norm": 1.144493331243443,
"learning_rate": 4.917172022309072e-06,
"loss": 0.0637,
"step": 904
},
{
"epoch": 0.41173794358507737,
"grad_norm": 1.139422632834299,
"learning_rate": 4.916989492647385e-06,
"loss": 0.065,
"step": 905
},
{
"epoch": 0.4121929026387625,
"grad_norm": 1.2858602055549935,
"learning_rate": 4.916806765480231e-06,
"loss": 0.079,
"step": 906
},
{
"epoch": 0.41264786169244766,
"grad_norm": 1.9716514818564959,
"learning_rate": 4.9166238408225416e-06,
"loss": 0.161,
"step": 907
},
{
"epoch": 0.41310282074613286,
"grad_norm": 1.6206512831659239,
"learning_rate": 4.916440718689267e-06,
"loss": 0.0958,
"step": 908
},
{
"epoch": 0.413557779799818,
"grad_norm": 1.2472167749456646,
"learning_rate": 4.916257399095369e-06,
"loss": 0.0705,
"step": 909
},
{
"epoch": 0.4140127388535032,
"grad_norm": 1.1891048303298737,
"learning_rate": 4.916073882055827e-06,
"loss": 0.0671,
"step": 910
},
{
"epoch": 0.41446769790718835,
"grad_norm": 1.9533245506572903,
"learning_rate": 4.91589016758564e-06,
"loss": 0.1203,
"step": 911
},
{
"epoch": 0.41492265696087355,
"grad_norm": 1.7223916244259532,
"learning_rate": 4.915706255699817e-06,
"loss": 0.1171,
"step": 912
},
{
"epoch": 0.4153776160145587,
"grad_norm": 2.042050502050582,
"learning_rate": 4.915522146413389e-06,
"loss": 0.152,
"step": 913
},
{
"epoch": 0.41583257506824384,
"grad_norm": 1.5213892799482642,
"learning_rate": 4.9153378397413985e-06,
"loss": 0.1011,
"step": 914
},
{
"epoch": 0.41628753412192904,
"grad_norm": 1.8893914267841023,
"learning_rate": 4.915153335698908e-06,
"loss": 0.1133,
"step": 915
},
{
"epoch": 0.4167424931756142,
"grad_norm": 1.7882796521112458,
"learning_rate": 4.914968634300994e-06,
"loss": 0.1081,
"step": 916
},
{
"epoch": 0.4171974522292994,
"grad_norm": 1.186974851727905,
"learning_rate": 4.914783735562748e-06,
"loss": 0.0791,
"step": 917
},
{
"epoch": 0.4176524112829845,
"grad_norm": 1.3276822787818023,
"learning_rate": 4.914598639499281e-06,
"loss": 0.0929,
"step": 918
},
{
"epoch": 0.4181073703366697,
"grad_norm": 1.3143453344689244,
"learning_rate": 4.914413346125717e-06,
"loss": 0.0907,
"step": 919
},
{
"epoch": 0.41856232939035487,
"grad_norm": 1.2706441279848544,
"learning_rate": 4.914227855457199e-06,
"loss": 0.0797,
"step": 920
},
{
"epoch": 0.41901728844404,
"grad_norm": 1.8437493208675002,
"learning_rate": 4.914042167508881e-06,
"loss": 0.0851,
"step": 921
},
{
"epoch": 0.4194722474977252,
"grad_norm": 1.4975873837594447,
"learning_rate": 4.9138562822959416e-06,
"loss": 0.0735,
"step": 922
},
{
"epoch": 0.41992720655141036,
"grad_norm": 1.8590378932388973,
"learning_rate": 4.913670199833566e-06,
"loss": 0.0955,
"step": 923
},
{
"epoch": 0.42038216560509556,
"grad_norm": 1.6110342357827778,
"learning_rate": 4.913483920136961e-06,
"loss": 0.0904,
"step": 924
},
{
"epoch": 0.4208371246587807,
"grad_norm": 1.761284240310015,
"learning_rate": 4.91329744322135e-06,
"loss": 0.0967,
"step": 925
},
{
"epoch": 0.4212920837124659,
"grad_norm": 1.3709410104557458,
"learning_rate": 4.913110769101971e-06,
"loss": 0.0872,
"step": 926
},
{
"epoch": 0.42174704276615105,
"grad_norm": 1.6539854986144262,
"learning_rate": 4.912923897794077e-06,
"loss": 0.0982,
"step": 927
},
{
"epoch": 0.4222020018198362,
"grad_norm": 1.6465498130671066,
"learning_rate": 4.912736829312938e-06,
"loss": 0.1093,
"step": 928
},
{
"epoch": 0.4226569608735214,
"grad_norm": 1.8873864205133448,
"learning_rate": 4.912549563673842e-06,
"loss": 0.1239,
"step": 929
},
{
"epoch": 0.42311191992720654,
"grad_norm": 1.5496708014603886,
"learning_rate": 4.912362100892091e-06,
"loss": 0.1273,
"step": 930
},
{
"epoch": 0.42356687898089174,
"grad_norm": 1.1519662533075623,
"learning_rate": 4.912174440983002e-06,
"loss": 0.0729,
"step": 931
},
{
"epoch": 0.4240218380345769,
"grad_norm": 1.6674274772885138,
"learning_rate": 4.911986583961912e-06,
"loss": 0.1107,
"step": 932
},
{
"epoch": 0.4244767970882621,
"grad_norm": 1.8943327104641587,
"learning_rate": 4.91179852984417e-06,
"loss": 0.0989,
"step": 933
},
{
"epoch": 0.4249317561419472,
"grad_norm": 1.3387420389544245,
"learning_rate": 4.911610278645144e-06,
"loss": 0.0873,
"step": 934
},
{
"epoch": 0.42538671519563237,
"grad_norm": 1.3086866571732974,
"learning_rate": 4.911421830380217e-06,
"loss": 0.0767,
"step": 935
},
{
"epoch": 0.42584167424931757,
"grad_norm": 2.04544186641041,
"learning_rate": 4.911233185064788e-06,
"loss": 0.1285,
"step": 936
},
{
"epoch": 0.4262966333030027,
"grad_norm": 1.6906012723967403,
"learning_rate": 4.911044342714272e-06,
"loss": 0.0997,
"step": 937
},
{
"epoch": 0.4267515923566879,
"grad_norm": 1.439162135385858,
"learning_rate": 4.9108553033440995e-06,
"loss": 0.0744,
"step": 938
},
{
"epoch": 0.42720655141037306,
"grad_norm": 1.2593154408057343,
"learning_rate": 4.91066606696972e-06,
"loss": 0.074,
"step": 939
},
{
"epoch": 0.42766151046405826,
"grad_norm": 1.7514521824191083,
"learning_rate": 4.910476633606597e-06,
"loss": 0.0971,
"step": 940
},
{
"epoch": 0.4281164695177434,
"grad_norm": 1.5625231909908295,
"learning_rate": 4.9102870032702075e-06,
"loss": 0.0689,
"step": 941
},
{
"epoch": 0.42857142857142855,
"grad_norm": 1.5194579023544843,
"learning_rate": 4.910097175976049e-06,
"loss": 0.0824,
"step": 942
},
{
"epoch": 0.42902638762511375,
"grad_norm": 1.4223453649486908,
"learning_rate": 4.909907151739634e-06,
"loss": 0.0747,
"step": 943
},
{
"epoch": 0.4294813466787989,
"grad_norm": 2.2121264200483393,
"learning_rate": 4.909716930576489e-06,
"loss": 0.1463,
"step": 944
},
{
"epoch": 0.4299363057324841,
"grad_norm": 1.5012792406542972,
"learning_rate": 4.909526512502158e-06,
"loss": 0.1241,
"step": 945
},
{
"epoch": 0.43039126478616924,
"grad_norm": 1.6714102508168673,
"learning_rate": 4.9093358975322025e-06,
"loss": 0.1045,
"step": 946
},
{
"epoch": 0.43084622383985444,
"grad_norm": 1.5613346147429912,
"learning_rate": 4.909145085682198e-06,
"loss": 0.1105,
"step": 947
},
{
"epoch": 0.4313011828935396,
"grad_norm": 1.4864622392832871,
"learning_rate": 4.908954076967737e-06,
"loss": 0.0831,
"step": 948
},
{
"epoch": 0.4317561419472247,
"grad_norm": 1.5530391149425158,
"learning_rate": 4.908762871404427e-06,
"loss": 0.1345,
"step": 949
},
{
"epoch": 0.4322111010009099,
"grad_norm": 1.5444429676980205,
"learning_rate": 4.908571469007893e-06,
"loss": 0.0886,
"step": 950
},
{
"epoch": 0.43266606005459507,
"grad_norm": 1.8034818342216412,
"learning_rate": 4.908379869793776e-06,
"loss": 0.1046,
"step": 951
},
{
"epoch": 0.43312101910828027,
"grad_norm": 1.3153452614362922,
"learning_rate": 4.908188073777732e-06,
"loss": 0.0715,
"step": 952
},
{
"epoch": 0.4335759781619654,
"grad_norm": 2.0825682650521857,
"learning_rate": 4.9079960809754334e-06,
"loss": 0.135,
"step": 953
},
{
"epoch": 0.4340309372156506,
"grad_norm": 1.3431541090651076,
"learning_rate": 4.90780389140257e-06,
"loss": 0.0812,
"step": 954
},
{
"epoch": 0.43448589626933576,
"grad_norm": 2.018134282960315,
"learning_rate": 4.907611505074846e-06,
"loss": 0.1001,
"step": 955
},
{
"epoch": 0.4349408553230209,
"grad_norm": 1.8270847906398506,
"learning_rate": 4.907418922007983e-06,
"loss": 0.1054,
"step": 956
},
{
"epoch": 0.4353958143767061,
"grad_norm": 1.5502670619333374,
"learning_rate": 4.907226142217717e-06,
"loss": 0.0832,
"step": 957
},
{
"epoch": 0.43585077343039125,
"grad_norm": 1.5099564094926066,
"learning_rate": 4.9070331657198015e-06,
"loss": 0.093,
"step": 958
},
{
"epoch": 0.43630573248407645,
"grad_norm": 1.6580816557213998,
"learning_rate": 4.906839992530006e-06,
"loss": 0.1133,
"step": 959
},
{
"epoch": 0.4367606915377616,
"grad_norm": 1.9468112171012433,
"learning_rate": 4.906646622664115e-06,
"loss": 0.1122,
"step": 960
},
{
"epoch": 0.4372156505914468,
"grad_norm": 1.3246750710377195,
"learning_rate": 4.906453056137931e-06,
"loss": 0.0572,
"step": 961
},
{
"epoch": 0.43767060964513194,
"grad_norm": 2.1577598041780846,
"learning_rate": 4.90625929296727e-06,
"loss": 0.1419,
"step": 962
},
{
"epoch": 0.4381255686988171,
"grad_norm": 1.3649728107391488,
"learning_rate": 4.9060653331679665e-06,
"loss": 0.1026,
"step": 963
},
{
"epoch": 0.4385805277525023,
"grad_norm": 1.7954750394301047,
"learning_rate": 4.90587117675587e-06,
"loss": 0.124,
"step": 964
},
{
"epoch": 0.4390354868061874,
"grad_norm": 1.6192897762023186,
"learning_rate": 4.905676823746846e-06,
"loss": 0.102,
"step": 965
},
{
"epoch": 0.4394904458598726,
"grad_norm": 1.183156466195084,
"learning_rate": 4.9054822741567745e-06,
"loss": 0.0741,
"step": 966
},
{
"epoch": 0.43994540491355777,
"grad_norm": 1.791057313794206,
"learning_rate": 4.905287528001555e-06,
"loss": 0.0986,
"step": 967
},
{
"epoch": 0.44040036396724297,
"grad_norm": 1.5587372758795195,
"learning_rate": 4.905092585297102e-06,
"loss": 0.0959,
"step": 968
},
{
"epoch": 0.4408553230209281,
"grad_norm": 1.9086814389692623,
"learning_rate": 4.904897446059344e-06,
"loss": 0.1124,
"step": 969
},
{
"epoch": 0.44131028207461326,
"grad_norm": 1.5518685718016205,
"learning_rate": 4.9047021103042255e-06,
"loss": 0.0802,
"step": 970
},
{
"epoch": 0.44176524112829846,
"grad_norm": 1.5626634869227398,
"learning_rate": 4.904506578047712e-06,
"loss": 0.0966,
"step": 971
},
{
"epoch": 0.4422202001819836,
"grad_norm": 1.6777151282946248,
"learning_rate": 4.9043108493057785e-06,
"loss": 0.0946,
"step": 972
},
{
"epoch": 0.4426751592356688,
"grad_norm": 1.3918546303467518,
"learning_rate": 4.904114924094421e-06,
"loss": 0.0776,
"step": 973
},
{
"epoch": 0.44313011828935395,
"grad_norm": 1.7054781101293177,
"learning_rate": 4.903918802429648e-06,
"loss": 0.1076,
"step": 974
},
{
"epoch": 0.44358507734303915,
"grad_norm": 0.9435161970580179,
"learning_rate": 4.9037224843274875e-06,
"loss": 0.055,
"step": 975
},
{
"epoch": 0.4440400363967243,
"grad_norm": 1.8279732096534727,
"learning_rate": 4.903525969803979e-06,
"loss": 0.144,
"step": 976
},
{
"epoch": 0.44449499545040944,
"grad_norm": 1.5827975534285916,
"learning_rate": 4.903329258875184e-06,
"loss": 0.0876,
"step": 977
},
{
"epoch": 0.44494995450409464,
"grad_norm": 1.5817514212508765,
"learning_rate": 4.903132351557175e-06,
"loss": 0.1003,
"step": 978
},
{
"epoch": 0.4454049135577798,
"grad_norm": 1.55794858043461,
"learning_rate": 4.902935247866043e-06,
"loss": 0.0901,
"step": 979
},
{
"epoch": 0.445859872611465,
"grad_norm": 1.7648097170403771,
"learning_rate": 4.9027379478178935e-06,
"loss": 0.1117,
"step": 980
},
{
"epoch": 0.4463148316651501,
"grad_norm": 1.4493752053158233,
"learning_rate": 4.90254045142885e-06,
"loss": 0.0824,
"step": 981
},
{
"epoch": 0.4467697907188353,
"grad_norm": 1.4618354488172722,
"learning_rate": 4.90234275871505e-06,
"loss": 0.08,
"step": 982
},
{
"epoch": 0.44722474977252047,
"grad_norm": 2.314057245131694,
"learning_rate": 4.9021448696926486e-06,
"loss": 0.1437,
"step": 983
},
{
"epoch": 0.44767970882620567,
"grad_norm": 1.2365214796695643,
"learning_rate": 4.901946784377816e-06,
"loss": 0.0955,
"step": 984
},
{
"epoch": 0.4481346678798908,
"grad_norm": 1.2633152164234291,
"learning_rate": 4.90174850278674e-06,
"loss": 0.0803,
"step": 985
},
{
"epoch": 0.44858962693357596,
"grad_norm": 1.5083171008818446,
"learning_rate": 4.901550024935623e-06,
"loss": 0.0942,
"step": 986
},
{
"epoch": 0.44904458598726116,
"grad_norm": 1.1583463791947812,
"learning_rate": 4.901351350840683e-06,
"loss": 0.0786,
"step": 987
},
{
"epoch": 0.4494995450409463,
"grad_norm": 1.343367085202188,
"learning_rate": 4.901152480518155e-06,
"loss": 0.0724,
"step": 988
},
{
"epoch": 0.4499545040946315,
"grad_norm": 1.1159650914918346,
"learning_rate": 4.900953413984289e-06,
"loss": 0.0681,
"step": 989
},
{
"epoch": 0.45040946314831665,
"grad_norm": 2.0950998044271025,
"learning_rate": 4.900754151255353e-06,
"loss": 0.1541,
"step": 990
},
{
"epoch": 0.45086442220200185,
"grad_norm": 1.4260341278646986,
"learning_rate": 4.9005546923476305e-06,
"loss": 0.0707,
"step": 991
},
{
"epoch": 0.451319381255687,
"grad_norm": 1.6502415030386688,
"learning_rate": 4.9003550372774185e-06,
"loss": 0.1111,
"step": 992
},
{
"epoch": 0.45177434030937214,
"grad_norm": 1.280806174818392,
"learning_rate": 4.900155186061033e-06,
"loss": 0.0789,
"step": 993
},
{
"epoch": 0.45222929936305734,
"grad_norm": 1.9745186799391785,
"learning_rate": 4.8999551387148045e-06,
"loss": 0.1125,
"step": 994
},
{
"epoch": 0.4526842584167425,
"grad_norm": 1.2542781615680096,
"learning_rate": 4.89975489525508e-06,
"loss": 0.0814,
"step": 995
},
{
"epoch": 0.4531392174704277,
"grad_norm": 1.5218729573521388,
"learning_rate": 4.899554455698223e-06,
"loss": 0.0849,
"step": 996
},
{
"epoch": 0.4535941765241128,
"grad_norm": 1.4911465655176248,
"learning_rate": 4.899353820060612e-06,
"loss": 0.0887,
"step": 997
},
{
"epoch": 0.454049135577798,
"grad_norm": 1.8552177664529743,
"learning_rate": 4.899152988358643e-06,
"loss": 0.1153,
"step": 998
},
{
"epoch": 0.45450409463148317,
"grad_norm": 1.3462289694693903,
"learning_rate": 4.898951960608725e-06,
"loss": 0.0768,
"step": 999
},
{
"epoch": 0.4549590536851683,
"grad_norm": 1.5105165626051191,
"learning_rate": 4.8987507368272865e-06,
"loss": 0.0916,
"step": 1000
},
{
"epoch": 0.4554140127388535,
"grad_norm": 1.7874012401425645,
"learning_rate": 4.898549317030772e-06,
"loss": 0.1228,
"step": 1001
},
{
"epoch": 0.45586897179253866,
"grad_norm": 1.8678564128703685,
"learning_rate": 4.898347701235637e-06,
"loss": 0.1226,
"step": 1002
},
{
"epoch": 0.45632393084622386,
"grad_norm": 1.9367180322034927,
"learning_rate": 4.89814588945836e-06,
"loss": 0.1239,
"step": 1003
},
{
"epoch": 0.456778889899909,
"grad_norm": 1.8462049373063074,
"learning_rate": 4.89794388171543e-06,
"loss": 0.1106,
"step": 1004
},
{
"epoch": 0.4572338489535942,
"grad_norm": 1.7977459529642075,
"learning_rate": 4.897741678023356e-06,
"loss": 0.1137,
"step": 1005
},
{
"epoch": 0.45768880800727935,
"grad_norm": 1.4317415496884898,
"learning_rate": 4.897539278398659e-06,
"loss": 0.0835,
"step": 1006
},
{
"epoch": 0.4581437670609645,
"grad_norm": 1.947224769167489,
"learning_rate": 4.8973366828578804e-06,
"loss": 0.1087,
"step": 1007
},
{
"epoch": 0.4585987261146497,
"grad_norm": 1.6840082807319827,
"learning_rate": 4.897133891417574e-06,
"loss": 0.1004,
"step": 1008
},
{
"epoch": 0.45905368516833484,
"grad_norm": 1.6722996299672828,
"learning_rate": 4.896930904094311e-06,
"loss": 0.0869,
"step": 1009
},
{
"epoch": 0.45950864422202004,
"grad_norm": 2.2431321251776986,
"learning_rate": 4.896727720904679e-06,
"loss": 0.121,
"step": 1010
},
{
"epoch": 0.4599636032757052,
"grad_norm": 1.2761704386307018,
"learning_rate": 4.896524341865282e-06,
"loss": 0.0736,
"step": 1011
},
{
"epoch": 0.4604185623293904,
"grad_norm": 1.6413390038739506,
"learning_rate": 4.896320766992737e-06,
"loss": 0.1286,
"step": 1012
},
{
"epoch": 0.4608735213830755,
"grad_norm": 1.5251335582402008,
"learning_rate": 4.896116996303682e-06,
"loss": 0.0989,
"step": 1013
},
{
"epoch": 0.46132848043676067,
"grad_norm": 1.8038369878473837,
"learning_rate": 4.895913029814766e-06,
"loss": 0.097,
"step": 1014
},
{
"epoch": 0.46178343949044587,
"grad_norm": 2.012861641550116,
"learning_rate": 4.895708867542658e-06,
"loss": 0.1111,
"step": 1015
},
{
"epoch": 0.462238398544131,
"grad_norm": 1.7366035889417508,
"learning_rate": 4.895504509504039e-06,
"loss": 0.1029,
"step": 1016
},
{
"epoch": 0.4626933575978162,
"grad_norm": 1.3763665767496873,
"learning_rate": 4.89529995571561e-06,
"loss": 0.0938,
"step": 1017
},
{
"epoch": 0.46314831665150136,
"grad_norm": 1.6906151679744952,
"learning_rate": 4.895095206194086e-06,
"loss": 0.1085,
"step": 1018
},
{
"epoch": 0.46360327570518656,
"grad_norm": 1.5053749521419235,
"learning_rate": 4.894890260956198e-06,
"loss": 0.0884,
"step": 1019
},
{
"epoch": 0.4640582347588717,
"grad_norm": 1.5334372638839222,
"learning_rate": 4.8946851200186925e-06,
"loss": 0.1015,
"step": 1020
},
{
"epoch": 0.46451319381255685,
"grad_norm": 1.576638091265577,
"learning_rate": 4.894479783398334e-06,
"loss": 0.0903,
"step": 1021
},
{
"epoch": 0.46496815286624205,
"grad_norm": 1.7368682352331435,
"learning_rate": 4.8942742511119004e-06,
"loss": 0.1029,
"step": 1022
},
{
"epoch": 0.4654231119199272,
"grad_norm": 3.9669130222003455,
"learning_rate": 4.894068523176187e-06,
"loss": 0.2383,
"step": 1023
},
{
"epoch": 0.4658780709736124,
"grad_norm": 1.5974114766744798,
"learning_rate": 4.8938625996080056e-06,
"loss": 0.1116,
"step": 1024
},
{
"epoch": 0.46633303002729753,
"grad_norm": 1.1252846797063132,
"learning_rate": 4.893656480424184e-06,
"loss": 0.0673,
"step": 1025
},
{
"epoch": 0.46678798908098273,
"grad_norm": 1.5329254322284862,
"learning_rate": 4.893450165641564e-06,
"loss": 0.1066,
"step": 1026
},
{
"epoch": 0.4672429481346679,
"grad_norm": 1.3116647286111784,
"learning_rate": 4.893243655277005e-06,
"loss": 0.086,
"step": 1027
},
{
"epoch": 0.467697907188353,
"grad_norm": 1.5621452726926597,
"learning_rate": 4.893036949347383e-06,
"loss": 0.0937,
"step": 1028
},
{
"epoch": 0.4681528662420382,
"grad_norm": 1.44299341979305,
"learning_rate": 4.892830047869588e-06,
"loss": 0.0922,
"step": 1029
},
{
"epoch": 0.46860782529572337,
"grad_norm": 1.2004173985623205,
"learning_rate": 4.892622950860527e-06,
"loss": 0.0545,
"step": 1030
},
{
"epoch": 0.46906278434940857,
"grad_norm": 1.2933675353670258,
"learning_rate": 4.892415658337123e-06,
"loss": 0.0938,
"step": 1031
},
{
"epoch": 0.4695177434030937,
"grad_norm": 1.3899639516557423,
"learning_rate": 4.892208170316317e-06,
"loss": 0.0807,
"step": 1032
},
{
"epoch": 0.4699727024567789,
"grad_norm": 1.2103198454795117,
"learning_rate": 4.892000486815062e-06,
"loss": 0.0724,
"step": 1033
},
{
"epoch": 0.47042766151046406,
"grad_norm": 1.4625912187815495,
"learning_rate": 4.891792607850328e-06,
"loss": 0.0944,
"step": 1034
},
{
"epoch": 0.4708826205641492,
"grad_norm": 2.3778377956475074,
"learning_rate": 4.891584533439104e-06,
"loss": 0.1301,
"step": 1035
},
{
"epoch": 0.4713375796178344,
"grad_norm": 1.6240877825800288,
"learning_rate": 4.891376263598393e-06,
"loss": 0.1056,
"step": 1036
},
{
"epoch": 0.47179253867151955,
"grad_norm": 1.377205820937822,
"learning_rate": 4.891167798345213e-06,
"loss": 0.0879,
"step": 1037
},
{
"epoch": 0.47224749772520475,
"grad_norm": 1.918358313853146,
"learning_rate": 4.890959137696598e-06,
"loss": 0.1218,
"step": 1038
},
{
"epoch": 0.4727024567788899,
"grad_norm": 1.9802948601827106,
"learning_rate": 4.890750281669601e-06,
"loss": 0.0966,
"step": 1039
},
{
"epoch": 0.4731574158325751,
"grad_norm": 1.209426799273833,
"learning_rate": 4.890541230281287e-06,
"loss": 0.0687,
"step": 1040
},
{
"epoch": 0.47361237488626023,
"grad_norm": 1.714672711362897,
"learning_rate": 4.8903319835487385e-06,
"loss": 0.1119,
"step": 1041
},
{
"epoch": 0.4740673339399454,
"grad_norm": 1.8426958086935912,
"learning_rate": 4.890122541489056e-06,
"loss": 0.1071,
"step": 1042
},
{
"epoch": 0.4745222929936306,
"grad_norm": 1.5412332450392434,
"learning_rate": 4.889912904119353e-06,
"loss": 0.1194,
"step": 1043
},
{
"epoch": 0.4749772520473157,
"grad_norm": 1.5900743055736573,
"learning_rate": 4.88970307145676e-06,
"loss": 0.0905,
"step": 1044
},
{
"epoch": 0.4754322111010009,
"grad_norm": 1.299438309320783,
"learning_rate": 4.889493043518423e-06,
"loss": 0.0782,
"step": 1045
},
{
"epoch": 0.47588717015468607,
"grad_norm": 1.2775434133946648,
"learning_rate": 4.889282820321506e-06,
"loss": 0.067,
"step": 1046
},
{
"epoch": 0.47634212920837127,
"grad_norm": 2.0181187729173313,
"learning_rate": 4.889072401883187e-06,
"loss": 0.1039,
"step": 1047
},
{
"epoch": 0.4767970882620564,
"grad_norm": 1.3673144633984753,
"learning_rate": 4.88886178822066e-06,
"loss": 0.0871,
"step": 1048
},
{
"epoch": 0.47725204731574156,
"grad_norm": 1.5512598399498212,
"learning_rate": 4.888650979351136e-06,
"loss": 0.0936,
"step": 1049
},
{
"epoch": 0.47770700636942676,
"grad_norm": 1.8862924775266208,
"learning_rate": 4.888439975291841e-06,
"loss": 0.149,
"step": 1050
},
{
"epoch": 0.4781619654231119,
"grad_norm": 1.527860807788029,
"learning_rate": 4.888228776060017e-06,
"loss": 0.0981,
"step": 1051
},
{
"epoch": 0.4786169244767971,
"grad_norm": 1.635801739367282,
"learning_rate": 4.888017381672923e-06,
"loss": 0.1004,
"step": 1052
},
{
"epoch": 0.47907188353048225,
"grad_norm": 1.496869794404093,
"learning_rate": 4.887805792147832e-06,
"loss": 0.0921,
"step": 1053
},
{
"epoch": 0.47952684258416745,
"grad_norm": 1.729233289880027,
"learning_rate": 4.887594007502036e-06,
"loss": 0.089,
"step": 1054
},
{
"epoch": 0.4799818016378526,
"grad_norm": 1.9599768924005974,
"learning_rate": 4.887382027752838e-06,
"loss": 0.1029,
"step": 1055
},
{
"epoch": 0.48043676069153773,
"grad_norm": 1.6584360062505734,
"learning_rate": 4.8871698529175636e-06,
"loss": 0.1173,
"step": 1056
},
{
"epoch": 0.48089171974522293,
"grad_norm": 1.631421092772313,
"learning_rate": 4.886957483013549e-06,
"loss": 0.1231,
"step": 1057
},
{
"epoch": 0.4813466787989081,
"grad_norm": 2.3766899063373996,
"learning_rate": 4.886744918058149e-06,
"loss": 0.13,
"step": 1058
},
{
"epoch": 0.4818016378525933,
"grad_norm": 1.7346716794855597,
"learning_rate": 4.886532158068732e-06,
"loss": 0.0938,
"step": 1059
},
{
"epoch": 0.4822565969062784,
"grad_norm": 1.5214305907929453,
"learning_rate": 4.886319203062683e-06,
"loss": 0.0761,
"step": 1060
},
{
"epoch": 0.4827115559599636,
"grad_norm": 1.6073102647133055,
"learning_rate": 4.886106053057408e-06,
"loss": 0.0818,
"step": 1061
},
{
"epoch": 0.48316651501364877,
"grad_norm": 1.803380712114119,
"learning_rate": 4.88589270807032e-06,
"loss": 0.1231,
"step": 1062
},
{
"epoch": 0.48362147406733397,
"grad_norm": 1.5275199982317587,
"learning_rate": 4.885679168118855e-06,
"loss": 0.1105,
"step": 1063
},
{
"epoch": 0.4840764331210191,
"grad_norm": 1.8472965185652206,
"learning_rate": 4.8854654332204635e-06,
"loss": 0.1324,
"step": 1064
},
{
"epoch": 0.48453139217470426,
"grad_norm": 1.41701925154465,
"learning_rate": 4.885251503392607e-06,
"loss": 0.0767,
"step": 1065
},
{
"epoch": 0.48498635122838946,
"grad_norm": 2.00437974621472,
"learning_rate": 4.885037378652771e-06,
"loss": 0.1336,
"step": 1066
},
{
"epoch": 0.4854413102820746,
"grad_norm": 1.4895968911800157,
"learning_rate": 4.884823059018451e-06,
"loss": 0.0726,
"step": 1067
},
{
"epoch": 0.4858962693357598,
"grad_norm": 1.5673178312119351,
"learning_rate": 4.88460854450716e-06,
"loss": 0.0843,
"step": 1068
},
{
"epoch": 0.48635122838944495,
"grad_norm": 1.1450505304026162,
"learning_rate": 4.884393835136427e-06,
"loss": 0.073,
"step": 1069
},
{
"epoch": 0.48680618744313015,
"grad_norm": 1.5223195045028948,
"learning_rate": 4.884178930923799e-06,
"loss": 0.0823,
"step": 1070
},
{
"epoch": 0.4872611464968153,
"grad_norm": 1.912651615279676,
"learning_rate": 4.883963831886834e-06,
"loss": 0.0989,
"step": 1071
},
{
"epoch": 0.48771610555050043,
"grad_norm": 1.6904540179044927,
"learning_rate": 4.8837485380431115e-06,
"loss": 0.0981,
"step": 1072
},
{
"epoch": 0.48817106460418563,
"grad_norm": 1.4559744514600277,
"learning_rate": 4.883533049410223e-06,
"loss": 0.0874,
"step": 1073
},
{
"epoch": 0.4886260236578708,
"grad_norm": 1.9041018278788933,
"learning_rate": 4.8833173660057785e-06,
"loss": 0.1065,
"step": 1074
},
{
"epoch": 0.489080982711556,
"grad_norm": 1.582657768337463,
"learning_rate": 4.8831014878474004e-06,
"loss": 0.0993,
"step": 1075
},
{
"epoch": 0.4895359417652411,
"grad_norm": 1.487895945323618,
"learning_rate": 4.882885414952732e-06,
"loss": 0.0887,
"step": 1076
},
{
"epoch": 0.4899909008189263,
"grad_norm": 1.1105199391014717,
"learning_rate": 4.882669147339428e-06,
"loss": 0.0521,
"step": 1077
},
{
"epoch": 0.49044585987261147,
"grad_norm": 1.3448385373486804,
"learning_rate": 4.882452685025161e-06,
"loss": 0.0606,
"step": 1078
},
{
"epoch": 0.4909008189262966,
"grad_norm": 1.9169790386878416,
"learning_rate": 4.88223602802762e-06,
"loss": 0.1103,
"step": 1079
},
{
"epoch": 0.4913557779799818,
"grad_norm": 1.4350936971881065,
"learning_rate": 4.882019176364509e-06,
"loss": 0.1052,
"step": 1080
},
{
"epoch": 0.49181073703366696,
"grad_norm": 1.9005260167330429,
"learning_rate": 4.881802130053548e-06,
"loss": 0.1217,
"step": 1081
},
{
"epoch": 0.49226569608735216,
"grad_norm": 1.4814940279383466,
"learning_rate": 4.881584889112473e-06,
"loss": 0.079,
"step": 1082
},
{
"epoch": 0.4927206551410373,
"grad_norm": 1.7134074599855604,
"learning_rate": 4.881367453559036e-06,
"loss": 0.1025,
"step": 1083
},
{
"epoch": 0.4931756141947225,
"grad_norm": 1.2847311247280295,
"learning_rate": 4.881149823411005e-06,
"loss": 0.0587,
"step": 1084
},
{
"epoch": 0.49363057324840764,
"grad_norm": 1.196984822353409,
"learning_rate": 4.880931998686162e-06,
"loss": 0.0779,
"step": 1085
},
{
"epoch": 0.4940855323020928,
"grad_norm": 2.247552936990941,
"learning_rate": 4.880713979402311e-06,
"loss": 0.1534,
"step": 1086
},
{
"epoch": 0.494540491355778,
"grad_norm": 2.5523444538687645,
"learning_rate": 4.880495765577263e-06,
"loss": 0.146,
"step": 1087
},
{
"epoch": 0.49499545040946313,
"grad_norm": 1.7690099480339412,
"learning_rate": 4.880277357228852e-06,
"loss": 0.084,
"step": 1088
},
{
"epoch": 0.49545040946314833,
"grad_norm": 1.2117156565437108,
"learning_rate": 4.880058754374923e-06,
"loss": 0.0833,
"step": 1089
},
{
"epoch": 0.4959053685168335,
"grad_norm": 1.5484757487864966,
"learning_rate": 4.879839957033343e-06,
"loss": 0.0938,
"step": 1090
},
{
"epoch": 0.4963603275705187,
"grad_norm": 1.5534223234923523,
"learning_rate": 4.879620965221987e-06,
"loss": 0.09,
"step": 1091
},
{
"epoch": 0.4968152866242038,
"grad_norm": 1.3405465803260945,
"learning_rate": 4.879401778958755e-06,
"loss": 0.0784,
"step": 1092
},
{
"epoch": 0.49727024567788897,
"grad_norm": 1.3343510524547628,
"learning_rate": 4.8791823982615525e-06,
"loss": 0.064,
"step": 1093
},
{
"epoch": 0.49772520473157417,
"grad_norm": 1.2315640234775116,
"learning_rate": 4.878962823148308e-06,
"loss": 0.067,
"step": 1094
},
{
"epoch": 0.4981801637852593,
"grad_norm": 1.654273388728327,
"learning_rate": 4.878743053636968e-06,
"loss": 0.0964,
"step": 1095
},
{
"epoch": 0.4986351228389445,
"grad_norm": 1.3344367681027707,
"learning_rate": 4.878523089745485e-06,
"loss": 0.0865,
"step": 1096
},
{
"epoch": 0.49909008189262966,
"grad_norm": 1.0737534169537484,
"learning_rate": 4.878302931491837e-06,
"loss": 0.0722,
"step": 1097
},
{
"epoch": 0.49954504094631486,
"grad_norm": 1.2217058614506033,
"learning_rate": 4.8780825788940145e-06,
"loss": 0.0531,
"step": 1098
},
{
"epoch": 0.5,
"grad_norm": 1.765512273684173,
"learning_rate": 4.877862031970023e-06,
"loss": 0.1016,
"step": 1099
},
{
"epoch": 0.5004549590536852,
"grad_norm": 2.1360497116346444,
"learning_rate": 4.8776412907378845e-06,
"loss": 0.1095,
"step": 1100
},
{
"epoch": 0.5009099181073703,
"grad_norm": 1.5928570797543171,
"learning_rate": 4.877420355215637e-06,
"loss": 0.0909,
"step": 1101
},
{
"epoch": 0.5013648771610555,
"grad_norm": 1.9221830556747463,
"learning_rate": 4.877199225421334e-06,
"loss": 0.123,
"step": 1102
},
{
"epoch": 0.5018198362147407,
"grad_norm": 1.967973587212139,
"learning_rate": 4.8769779013730454e-06,
"loss": 0.1535,
"step": 1103
},
{
"epoch": 0.5022747952684259,
"grad_norm": 2.02512821365078,
"learning_rate": 4.876756383088858e-06,
"loss": 0.1173,
"step": 1104
},
{
"epoch": 0.502729754322111,
"grad_norm": 1.3904167109659709,
"learning_rate": 4.876534670586872e-06,
"loss": 0.0839,
"step": 1105
},
{
"epoch": 0.5031847133757962,
"grad_norm": 1.4435165077122623,
"learning_rate": 4.8763127638852045e-06,
"loss": 0.0924,
"step": 1106
},
{
"epoch": 0.5036396724294814,
"grad_norm": 1.7029448773247835,
"learning_rate": 4.87609066300199e-06,
"loss": 0.1076,
"step": 1107
},
{
"epoch": 0.5040946314831665,
"grad_norm": 1.750067106251082,
"learning_rate": 4.875868367955376e-06,
"loss": 0.1077,
"step": 1108
},
{
"epoch": 0.5045495905368517,
"grad_norm": 1.9748651822243342,
"learning_rate": 4.87564587876353e-06,
"loss": 0.1294,
"step": 1109
},
{
"epoch": 0.5050045495905369,
"grad_norm": 1.7656971074259822,
"learning_rate": 4.87542319544463e-06,
"loss": 0.0974,
"step": 1110
}
],
"logging_steps": 1,
"max_steps": 10990,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 555,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7279902056448.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}