OpenThinker3-1.5B / trainer_state.json
cjiao's picture
End of training
2feeed7 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9983155530600785,
"eval_steps": 500,
"global_step": 1335,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0022459292532285235,
"grad_norm": 1.9819730520248413,
"learning_rate": 1.1940298507462686e-06,
"loss": 0.8684,
"step": 1
},
{
"epoch": 0.004491858506457047,
"grad_norm": 2.0259573459625244,
"learning_rate": 2.3880597014925373e-06,
"loss": 0.8821,
"step": 2
},
{
"epoch": 0.00673778775968557,
"grad_norm": 2.085845947265625,
"learning_rate": 3.582089552238806e-06,
"loss": 0.9416,
"step": 3
},
{
"epoch": 0.008983717012914094,
"grad_norm": 1.9758139848709106,
"learning_rate": 4.7761194029850745e-06,
"loss": 0.9074,
"step": 4
},
{
"epoch": 0.011229646266142616,
"grad_norm": 1.7997628450393677,
"learning_rate": 5.970149253731343e-06,
"loss": 0.9069,
"step": 5
},
{
"epoch": 0.01347557551937114,
"grad_norm": 1.4710899591445923,
"learning_rate": 7.164179104477612e-06,
"loss": 0.8754,
"step": 6
},
{
"epoch": 0.015721504772599662,
"grad_norm": 1.4286643266677856,
"learning_rate": 8.35820895522388e-06,
"loss": 0.8896,
"step": 7
},
{
"epoch": 0.017967434025828188,
"grad_norm": 1.2477623224258423,
"learning_rate": 9.552238805970149e-06,
"loss": 0.8545,
"step": 8
},
{
"epoch": 0.02021336327905671,
"grad_norm": 1.2966960668563843,
"learning_rate": 1.074626865671642e-05,
"loss": 0.839,
"step": 9
},
{
"epoch": 0.022459292532285232,
"grad_norm": 1.4356369972229004,
"learning_rate": 1.1940298507462686e-05,
"loss": 0.8639,
"step": 10
},
{
"epoch": 0.024705221785513758,
"grad_norm": 1.1653496026992798,
"learning_rate": 1.3134328358208957e-05,
"loss": 0.8142,
"step": 11
},
{
"epoch": 0.02695115103874228,
"grad_norm": 0.9282035231590271,
"learning_rate": 1.4328358208955224e-05,
"loss": 0.8022,
"step": 12
},
{
"epoch": 0.029197080291970802,
"grad_norm": 1.086950421333313,
"learning_rate": 1.5522388059701494e-05,
"loss": 0.7908,
"step": 13
},
{
"epoch": 0.031443009545199324,
"grad_norm": 0.7987905144691467,
"learning_rate": 1.671641791044776e-05,
"loss": 0.7838,
"step": 14
},
{
"epoch": 0.033688938798427846,
"grad_norm": 0.7030066847801208,
"learning_rate": 1.791044776119403e-05,
"loss": 0.7856,
"step": 15
},
{
"epoch": 0.035934868051656375,
"grad_norm": 0.7216812372207642,
"learning_rate": 1.9104477611940298e-05,
"loss": 0.7666,
"step": 16
},
{
"epoch": 0.0381807973048849,
"grad_norm": 0.7004393935203552,
"learning_rate": 2.029850746268657e-05,
"loss": 0.7602,
"step": 17
},
{
"epoch": 0.04042672655811342,
"grad_norm": 0.5651209950447083,
"learning_rate": 2.149253731343284e-05,
"loss": 0.7637,
"step": 18
},
{
"epoch": 0.04267265581134194,
"grad_norm": 0.5799914598464966,
"learning_rate": 2.2686567164179106e-05,
"loss": 0.7357,
"step": 19
},
{
"epoch": 0.044918585064570464,
"grad_norm": 0.531233549118042,
"learning_rate": 2.3880597014925373e-05,
"loss": 0.7552,
"step": 20
},
{
"epoch": 0.047164514317798986,
"grad_norm": 0.5684418678283691,
"learning_rate": 2.5074626865671646e-05,
"loss": 0.7671,
"step": 21
},
{
"epoch": 0.049410443571027515,
"grad_norm": 0.4873054623603821,
"learning_rate": 2.6268656716417913e-05,
"loss": 0.737,
"step": 22
},
{
"epoch": 0.05165637282425604,
"grad_norm": 0.49275314807891846,
"learning_rate": 2.746268656716418e-05,
"loss": 0.7362,
"step": 23
},
{
"epoch": 0.05390230207748456,
"grad_norm": 0.47075843811035156,
"learning_rate": 2.8656716417910447e-05,
"loss": 0.7234,
"step": 24
},
{
"epoch": 0.05614823133071308,
"grad_norm": 0.3865251839160919,
"learning_rate": 2.985074626865672e-05,
"loss": 0.7169,
"step": 25
},
{
"epoch": 0.058394160583941604,
"grad_norm": 0.4154004156589508,
"learning_rate": 3.104477611940299e-05,
"loss": 0.7119,
"step": 26
},
{
"epoch": 0.060640089837170126,
"grad_norm": 0.37092125415802,
"learning_rate": 3.2238805970149255e-05,
"loss": 0.7138,
"step": 27
},
{
"epoch": 0.06288601909039865,
"grad_norm": 0.3488411605358124,
"learning_rate": 3.343283582089552e-05,
"loss": 0.7216,
"step": 28
},
{
"epoch": 0.06513194834362718,
"grad_norm": 0.32693928480148315,
"learning_rate": 3.462686567164179e-05,
"loss": 0.6925,
"step": 29
},
{
"epoch": 0.06737787759685569,
"grad_norm": 0.34904295206069946,
"learning_rate": 3.582089552238806e-05,
"loss": 0.7105,
"step": 30
},
{
"epoch": 0.06962380685008422,
"grad_norm": 0.32673367857933044,
"learning_rate": 3.701492537313433e-05,
"loss": 0.696,
"step": 31
},
{
"epoch": 0.07186973610331275,
"grad_norm": 0.32177790999412537,
"learning_rate": 3.8208955223880596e-05,
"loss": 0.7064,
"step": 32
},
{
"epoch": 0.07411566535654127,
"grad_norm": 0.3286134600639343,
"learning_rate": 3.940298507462687e-05,
"loss": 0.7091,
"step": 33
},
{
"epoch": 0.0763615946097698,
"grad_norm": 0.3438747525215149,
"learning_rate": 4.059701492537314e-05,
"loss": 0.7149,
"step": 34
},
{
"epoch": 0.07860752386299831,
"grad_norm": 0.29362648725509644,
"learning_rate": 4.1791044776119404e-05,
"loss": 0.685,
"step": 35
},
{
"epoch": 0.08085345311622684,
"grad_norm": 0.30074256658554077,
"learning_rate": 4.298507462686568e-05,
"loss": 0.7011,
"step": 36
},
{
"epoch": 0.08309938236945537,
"grad_norm": 0.3120618462562561,
"learning_rate": 4.4179104477611944e-05,
"loss": 0.684,
"step": 37
},
{
"epoch": 0.08534531162268388,
"grad_norm": 0.2569892406463623,
"learning_rate": 4.537313432835821e-05,
"loss": 0.684,
"step": 38
},
{
"epoch": 0.08759124087591241,
"grad_norm": 0.28327882289886475,
"learning_rate": 4.6567164179104485e-05,
"loss": 0.6968,
"step": 39
},
{
"epoch": 0.08983717012914093,
"grad_norm": 0.26424843072891235,
"learning_rate": 4.7761194029850745e-05,
"loss": 0.6915,
"step": 40
},
{
"epoch": 0.09208309938236946,
"grad_norm": 0.2620261609554291,
"learning_rate": 4.895522388059702e-05,
"loss": 0.6744,
"step": 41
},
{
"epoch": 0.09432902863559797,
"grad_norm": 0.32121092081069946,
"learning_rate": 5.014925373134329e-05,
"loss": 0.6746,
"step": 42
},
{
"epoch": 0.0965749578888265,
"grad_norm": 0.3997937738895416,
"learning_rate": 5.134328358208955e-05,
"loss": 0.6806,
"step": 43
},
{
"epoch": 0.09882088714205503,
"grad_norm": 0.3264799416065216,
"learning_rate": 5.2537313432835826e-05,
"loss": 0.6729,
"step": 44
},
{
"epoch": 0.10106681639528355,
"grad_norm": 0.33052176237106323,
"learning_rate": 5.37313432835821e-05,
"loss": 0.6758,
"step": 45
},
{
"epoch": 0.10331274564851207,
"grad_norm": 0.43345314264297485,
"learning_rate": 5.492537313432836e-05,
"loss": 0.6767,
"step": 46
},
{
"epoch": 0.10555867490174059,
"grad_norm": 0.37080681324005127,
"learning_rate": 5.6119402985074634e-05,
"loss": 0.6526,
"step": 47
},
{
"epoch": 0.10780460415496912,
"grad_norm": 0.381356418132782,
"learning_rate": 5.7313432835820894e-05,
"loss": 0.6739,
"step": 48
},
{
"epoch": 0.11005053340819765,
"grad_norm": 0.36677348613739014,
"learning_rate": 5.850746268656717e-05,
"loss": 0.6782,
"step": 49
},
{
"epoch": 0.11229646266142616,
"grad_norm": 0.40393349528312683,
"learning_rate": 5.970149253731344e-05,
"loss": 0.6528,
"step": 50
},
{
"epoch": 0.11454239191465469,
"grad_norm": 0.5078914165496826,
"learning_rate": 6.08955223880597e-05,
"loss": 0.6697,
"step": 51
},
{
"epoch": 0.11678832116788321,
"grad_norm": 0.742857813835144,
"learning_rate": 6.208955223880598e-05,
"loss": 0.6341,
"step": 52
},
{
"epoch": 0.11903425042111174,
"grad_norm": 0.8604367971420288,
"learning_rate": 6.328358208955224e-05,
"loss": 0.662,
"step": 53
},
{
"epoch": 0.12128017967434025,
"grad_norm": 0.7391287684440613,
"learning_rate": 6.447761194029851e-05,
"loss": 0.6696,
"step": 54
},
{
"epoch": 0.12352610892756878,
"grad_norm": 0.5703966617584229,
"learning_rate": 6.567164179104479e-05,
"loss": 0.6619,
"step": 55
},
{
"epoch": 0.1257720381807973,
"grad_norm": 0.7210264801979065,
"learning_rate": 6.686567164179104e-05,
"loss": 0.6647,
"step": 56
},
{
"epoch": 0.12801796743402583,
"grad_norm": 0.8133912682533264,
"learning_rate": 6.805970149253732e-05,
"loss": 0.658,
"step": 57
},
{
"epoch": 0.13026389668725435,
"grad_norm": 0.9062953591346741,
"learning_rate": 6.925373134328358e-05,
"loss": 0.6732,
"step": 58
},
{
"epoch": 0.13250982594048288,
"grad_norm": 0.9497516751289368,
"learning_rate": 7.044776119402986e-05,
"loss": 0.6743,
"step": 59
},
{
"epoch": 0.13475575519371139,
"grad_norm": 0.5923281908035278,
"learning_rate": 7.164179104477612e-05,
"loss": 0.6609,
"step": 60
},
{
"epoch": 0.13700168444693991,
"grad_norm": 0.839241087436676,
"learning_rate": 7.283582089552239e-05,
"loss": 0.6673,
"step": 61
},
{
"epoch": 0.13924761370016844,
"grad_norm": 0.9110313653945923,
"learning_rate": 7.402985074626866e-05,
"loss": 0.6795,
"step": 62
},
{
"epoch": 0.14149354295339697,
"grad_norm": 0.6465680599212646,
"learning_rate": 7.522388059701494e-05,
"loss": 0.6634,
"step": 63
},
{
"epoch": 0.1437394722066255,
"grad_norm": 0.5419987440109253,
"learning_rate": 7.641791044776119e-05,
"loss": 0.6489,
"step": 64
},
{
"epoch": 0.145985401459854,
"grad_norm": 0.6124593019485474,
"learning_rate": 7.761194029850747e-05,
"loss": 0.6617,
"step": 65
},
{
"epoch": 0.14823133071308253,
"grad_norm": 0.5836852788925171,
"learning_rate": 7.880597014925374e-05,
"loss": 0.6319,
"step": 66
},
{
"epoch": 0.15047725996631106,
"grad_norm": 0.6319289207458496,
"learning_rate": 8e-05,
"loss": 0.6504,
"step": 67
},
{
"epoch": 0.1527231892195396,
"grad_norm": 0.6081493496894836,
"learning_rate": 8.119402985074627e-05,
"loss": 0.65,
"step": 68
},
{
"epoch": 0.15496911847276812,
"grad_norm": 0.5973412394523621,
"learning_rate": 8.238805970149255e-05,
"loss": 0.6449,
"step": 69
},
{
"epoch": 0.15721504772599662,
"grad_norm": 0.6423139572143555,
"learning_rate": 8.358208955223881e-05,
"loss": 0.6584,
"step": 70
},
{
"epoch": 0.15946097697922515,
"grad_norm": 0.7579260468482971,
"learning_rate": 8.477611940298507e-05,
"loss": 0.6472,
"step": 71
},
{
"epoch": 0.16170690623245368,
"grad_norm": 0.8475743532180786,
"learning_rate": 8.597014925373135e-05,
"loss": 0.6405,
"step": 72
},
{
"epoch": 0.1639528354856822,
"grad_norm": 0.5964512228965759,
"learning_rate": 8.716417910447762e-05,
"loss": 0.633,
"step": 73
},
{
"epoch": 0.16619876473891074,
"grad_norm": 0.47265729308128357,
"learning_rate": 8.835820895522389e-05,
"loss": 0.6453,
"step": 74
},
{
"epoch": 0.16844469399213924,
"grad_norm": 0.7188097238540649,
"learning_rate": 8.955223880597014e-05,
"loss": 0.6603,
"step": 75
},
{
"epoch": 0.17069062324536777,
"grad_norm": 0.49939826130867004,
"learning_rate": 9.074626865671642e-05,
"loss": 0.6339,
"step": 76
},
{
"epoch": 0.1729365524985963,
"grad_norm": 0.5468081831932068,
"learning_rate": 9.194029850746269e-05,
"loss": 0.639,
"step": 77
},
{
"epoch": 0.17518248175182483,
"grad_norm": 0.6105530858039856,
"learning_rate": 9.313432835820897e-05,
"loss": 0.6537,
"step": 78
},
{
"epoch": 0.17742841100505333,
"grad_norm": 0.48114606738090515,
"learning_rate": 9.432835820895524e-05,
"loss": 0.6579,
"step": 79
},
{
"epoch": 0.17967434025828186,
"grad_norm": 0.6263488531112671,
"learning_rate": 9.552238805970149e-05,
"loss": 0.6292,
"step": 80
},
{
"epoch": 0.18192026951151039,
"grad_norm": 0.5369325280189514,
"learning_rate": 9.671641791044777e-05,
"loss": 0.6608,
"step": 81
},
{
"epoch": 0.18416619876473891,
"grad_norm": 0.7140039801597595,
"learning_rate": 9.791044776119404e-05,
"loss": 0.6339,
"step": 82
},
{
"epoch": 0.18641212801796744,
"grad_norm": 0.9011125564575195,
"learning_rate": 9.91044776119403e-05,
"loss": 0.6342,
"step": 83
},
{
"epoch": 0.18865805727119594,
"grad_norm": 1.1369616985321045,
"learning_rate": 0.00010029850746268659,
"loss": 0.6442,
"step": 84
},
{
"epoch": 0.19090398652442447,
"grad_norm": 1.0306285619735718,
"learning_rate": 0.00010149253731343285,
"loss": 0.6419,
"step": 85
},
{
"epoch": 0.193149915777653,
"grad_norm": 0.8979660272598267,
"learning_rate": 0.0001026865671641791,
"loss": 0.632,
"step": 86
},
{
"epoch": 0.19539584503088153,
"grad_norm": 0.6676183342933655,
"learning_rate": 0.00010388059701492539,
"loss": 0.6386,
"step": 87
},
{
"epoch": 0.19764177428411006,
"grad_norm": 0.7217721939086914,
"learning_rate": 0.00010507462686567165,
"loss": 0.6546,
"step": 88
},
{
"epoch": 0.19988770353733856,
"grad_norm": 0.7290446162223816,
"learning_rate": 0.00010626865671641792,
"loss": 0.6328,
"step": 89
},
{
"epoch": 0.2021336327905671,
"grad_norm": 0.8381432890892029,
"learning_rate": 0.0001074626865671642,
"loss": 0.6311,
"step": 90
},
{
"epoch": 0.20437956204379562,
"grad_norm": 1.0938982963562012,
"learning_rate": 0.00010865671641791045,
"loss": 0.6559,
"step": 91
},
{
"epoch": 0.20662549129702415,
"grad_norm": 0.8039063215255737,
"learning_rate": 0.00010985074626865672,
"loss": 0.636,
"step": 92
},
{
"epoch": 0.20887142055025268,
"grad_norm": 0.7171061635017395,
"learning_rate": 0.000111044776119403,
"loss": 0.6456,
"step": 93
},
{
"epoch": 0.21111734980348118,
"grad_norm": 0.7186174988746643,
"learning_rate": 0.00011223880597014927,
"loss": 0.6285,
"step": 94
},
{
"epoch": 0.2133632790567097,
"grad_norm": 0.6290779113769531,
"learning_rate": 0.00011343283582089553,
"loss": 0.6336,
"step": 95
},
{
"epoch": 0.21560920830993824,
"grad_norm": 0.7359249591827393,
"learning_rate": 0.00011462686567164179,
"loss": 0.6542,
"step": 96
},
{
"epoch": 0.21785513756316677,
"grad_norm": 0.775365948677063,
"learning_rate": 0.00011582089552238807,
"loss": 0.6369,
"step": 97
},
{
"epoch": 0.2201010668163953,
"grad_norm": 0.8260976076126099,
"learning_rate": 0.00011701492537313434,
"loss": 0.6142,
"step": 98
},
{
"epoch": 0.2223469960696238,
"grad_norm": 0.704872727394104,
"learning_rate": 0.00011820895522388062,
"loss": 0.6473,
"step": 99
},
{
"epoch": 0.22459292532285233,
"grad_norm": 0.5987293124198914,
"learning_rate": 0.00011940298507462688,
"loss": 0.6458,
"step": 100
},
{
"epoch": 0.22683885457608086,
"grad_norm": 0.7472802400588989,
"learning_rate": 0.00012059701492537314,
"loss": 0.6235,
"step": 101
},
{
"epoch": 0.22908478382930939,
"grad_norm": 0.7303177118301392,
"learning_rate": 0.0001217910447761194,
"loss": 0.6432,
"step": 102
},
{
"epoch": 0.2313307130825379,
"grad_norm": 0.5669957995414734,
"learning_rate": 0.00012298507462686568,
"loss": 0.6276,
"step": 103
},
{
"epoch": 0.23357664233576642,
"grad_norm": 0.47117286920547485,
"learning_rate": 0.00012417910447761195,
"loss": 0.6429,
"step": 104
},
{
"epoch": 0.23582257158899494,
"grad_norm": 0.6563988327980042,
"learning_rate": 0.00012537313432835822,
"loss": 0.6276,
"step": 105
},
{
"epoch": 0.23806850084222347,
"grad_norm": 0.5849066972732544,
"learning_rate": 0.00012656716417910448,
"loss": 0.6309,
"step": 106
},
{
"epoch": 0.240314430095452,
"grad_norm": 0.7347849607467651,
"learning_rate": 0.00012776119402985075,
"loss": 0.6382,
"step": 107
},
{
"epoch": 0.2425603593486805,
"grad_norm": 0.6520137190818787,
"learning_rate": 0.00012895522388059702,
"loss": 0.6386,
"step": 108
},
{
"epoch": 0.24480628860190903,
"grad_norm": 0.60540372133255,
"learning_rate": 0.00013014925373134329,
"loss": 0.613,
"step": 109
},
{
"epoch": 0.24705221785513756,
"grad_norm": 0.7710558176040649,
"learning_rate": 0.00013134328358208958,
"loss": 0.6104,
"step": 110
},
{
"epoch": 0.2492981471083661,
"grad_norm": 0.6582499742507935,
"learning_rate": 0.00013253731343283582,
"loss": 0.628,
"step": 111
},
{
"epoch": 0.2515440763615946,
"grad_norm": 0.6089588403701782,
"learning_rate": 0.00013373134328358209,
"loss": 0.6313,
"step": 112
},
{
"epoch": 0.2537900056148231,
"grad_norm": 0.5754179358482361,
"learning_rate": 0.00013492537313432838,
"loss": 0.6283,
"step": 113
},
{
"epoch": 0.25603593486805165,
"grad_norm": 0.617273211479187,
"learning_rate": 0.00013611940298507465,
"loss": 0.6187,
"step": 114
},
{
"epoch": 0.2582818641212802,
"grad_norm": 0.6104961037635803,
"learning_rate": 0.00013731343283582091,
"loss": 0.6267,
"step": 115
},
{
"epoch": 0.2605277933745087,
"grad_norm": 0.691856861114502,
"learning_rate": 0.00013850746268656715,
"loss": 0.6202,
"step": 116
},
{
"epoch": 0.26277372262773724,
"grad_norm": 0.8089864253997803,
"learning_rate": 0.00013970149253731345,
"loss": 0.635,
"step": 117
},
{
"epoch": 0.26501965188096577,
"grad_norm": 1.1346023082733154,
"learning_rate": 0.00014089552238805972,
"loss": 0.6462,
"step": 118
},
{
"epoch": 0.2672655811341943,
"grad_norm": 0.8319297432899475,
"learning_rate": 0.00014208955223880598,
"loss": 0.6179,
"step": 119
},
{
"epoch": 0.26951151038742277,
"grad_norm": 0.5904942154884338,
"learning_rate": 0.00014328358208955225,
"loss": 0.629,
"step": 120
},
{
"epoch": 0.2717574396406513,
"grad_norm": 0.5950160026550293,
"learning_rate": 0.00014447761194029852,
"loss": 0.6245,
"step": 121
},
{
"epoch": 0.27400336889387983,
"grad_norm": 0.6426451802253723,
"learning_rate": 0.00014567164179104478,
"loss": 0.614,
"step": 122
},
{
"epoch": 0.27624929814710836,
"grad_norm": 0.6028596758842468,
"learning_rate": 0.00014686567164179105,
"loss": 0.6127,
"step": 123
},
{
"epoch": 0.2784952274003369,
"grad_norm": 0.6075330972671509,
"learning_rate": 0.00014805970149253732,
"loss": 0.6283,
"step": 124
},
{
"epoch": 0.2807411566535654,
"grad_norm": 0.6084921360015869,
"learning_rate": 0.0001492537313432836,
"loss": 0.6351,
"step": 125
},
{
"epoch": 0.28298708590679394,
"grad_norm": 0.627112865447998,
"learning_rate": 0.00015044776119402988,
"loss": 0.6393,
"step": 126
},
{
"epoch": 0.2852330151600225,
"grad_norm": 0.6501988172531128,
"learning_rate": 0.00015164179104477612,
"loss": 0.6097,
"step": 127
},
{
"epoch": 0.287478944413251,
"grad_norm": 0.6280235648155212,
"learning_rate": 0.00015283582089552238,
"loss": 0.6281,
"step": 128
},
{
"epoch": 0.28972487366647953,
"grad_norm": 0.49232786893844604,
"learning_rate": 0.00015402985074626868,
"loss": 0.6341,
"step": 129
},
{
"epoch": 0.291970802919708,
"grad_norm": 0.5303974747657776,
"learning_rate": 0.00015522388059701495,
"loss": 0.6098,
"step": 130
},
{
"epoch": 0.29421673217293653,
"grad_norm": 0.5729207992553711,
"learning_rate": 0.0001564179104477612,
"loss": 0.617,
"step": 131
},
{
"epoch": 0.29646266142616506,
"grad_norm": 0.6265519857406616,
"learning_rate": 0.00015761194029850748,
"loss": 0.5968,
"step": 132
},
{
"epoch": 0.2987085906793936,
"grad_norm": 0.6463232636451721,
"learning_rate": 0.00015880597014925375,
"loss": 0.6391,
"step": 133
},
{
"epoch": 0.3009545199326221,
"grad_norm": 0.593257486820221,
"learning_rate": 0.00016,
"loss": 0.6189,
"step": 134
},
{
"epoch": 0.30320044918585065,
"grad_norm": 0.5925970077514648,
"learning_rate": 0.00015999972630083387,
"loss": 0.6139,
"step": 135
},
{
"epoch": 0.3054463784390792,
"grad_norm": 0.6394967436790466,
"learning_rate": 0.00015999890520520824,
"loss": 0.6038,
"step": 136
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.649044394493103,
"learning_rate": 0.00015999753671874147,
"loss": 0.614,
"step": 137
},
{
"epoch": 0.30993823694553624,
"grad_norm": 0.6179019808769226,
"learning_rate": 0.00015999562085079733,
"loss": 0.6171,
"step": 138
},
{
"epoch": 0.3121841661987647,
"grad_norm": 0.5114040374755859,
"learning_rate": 0.0001599931576144852,
"loss": 0.6076,
"step": 139
},
{
"epoch": 0.31443009545199324,
"grad_norm": 0.5721436142921448,
"learning_rate": 0.00015999014702665964,
"loss": 0.6173,
"step": 140
},
{
"epoch": 0.31667602470522177,
"grad_norm": 0.7164266109466553,
"learning_rate": 0.00015998658910792058,
"loss": 0.611,
"step": 141
},
{
"epoch": 0.3189219539584503,
"grad_norm": 0.8134217858314514,
"learning_rate": 0.00015998248388261302,
"loss": 0.6296,
"step": 142
},
{
"epoch": 0.32116788321167883,
"grad_norm": 0.828131377696991,
"learning_rate": 0.00015997783137882682,
"loss": 0.6331,
"step": 143
},
{
"epoch": 0.32341381246490736,
"grad_norm": 0.7628505825996399,
"learning_rate": 0.00015997263162839667,
"loss": 0.6524,
"step": 144
},
{
"epoch": 0.3256597417181359,
"grad_norm": 0.6403250098228455,
"learning_rate": 0.0001599668846669018,
"loss": 0.6097,
"step": 145
},
{
"epoch": 0.3279056709713644,
"grad_norm": 0.5496403574943542,
"learning_rate": 0.00015996059053366562,
"loss": 0.6187,
"step": 146
},
{
"epoch": 0.33015160022459294,
"grad_norm": 0.6352928876876831,
"learning_rate": 0.0001599537492717556,
"loss": 0.619,
"step": 147
},
{
"epoch": 0.3323975294778215,
"grad_norm": 0.6073532104492188,
"learning_rate": 0.00015994636092798295,
"loss": 0.6218,
"step": 148
},
{
"epoch": 0.33464345873104995,
"grad_norm": 0.40914225578308105,
"learning_rate": 0.00015993842555290226,
"loss": 0.6161,
"step": 149
},
{
"epoch": 0.3368893879842785,
"grad_norm": 0.4364437758922577,
"learning_rate": 0.0001599299432008112,
"loss": 0.637,
"step": 150
},
{
"epoch": 0.339135317237507,
"grad_norm": 0.5311095118522644,
"learning_rate": 0.00015992091392975002,
"loss": 0.5972,
"step": 151
},
{
"epoch": 0.34138124649073553,
"grad_norm": 0.545671284198761,
"learning_rate": 0.00015991133780150136,
"loss": 0.6103,
"step": 152
},
{
"epoch": 0.34362717574396406,
"grad_norm": 0.4276280105113983,
"learning_rate": 0.00015990121488158968,
"loss": 0.6148,
"step": 153
},
{
"epoch": 0.3458731049971926,
"grad_norm": 0.4059518575668335,
"learning_rate": 0.00015989054523928085,
"loss": 0.6332,
"step": 154
},
{
"epoch": 0.3481190342504211,
"grad_norm": 0.42028188705444336,
"learning_rate": 0.00015987932894758164,
"loss": 0.5972,
"step": 155
},
{
"epoch": 0.35036496350364965,
"grad_norm": 0.44919151067733765,
"learning_rate": 0.00015986756608323932,
"loss": 0.6017,
"step": 156
},
{
"epoch": 0.3526108927568782,
"grad_norm": 0.3990235924720764,
"learning_rate": 0.00015985525672674103,
"loss": 0.6146,
"step": 157
},
{
"epoch": 0.35485682201010665,
"grad_norm": 0.42787206172943115,
"learning_rate": 0.0001598424009623133,
"loss": 0.6199,
"step": 158
},
{
"epoch": 0.3571027512633352,
"grad_norm": 0.44234439730644226,
"learning_rate": 0.00015982899887792145,
"loss": 0.6279,
"step": 159
},
{
"epoch": 0.3593486805165637,
"grad_norm": 0.4152110815048218,
"learning_rate": 0.00015981505056526893,
"loss": 0.6032,
"step": 160
},
{
"epoch": 0.36159460976979224,
"grad_norm": 0.36194872856140137,
"learning_rate": 0.0001598005561197968,
"loss": 0.6286,
"step": 161
},
{
"epoch": 0.36384053902302077,
"grad_norm": 0.4214819669723511,
"learning_rate": 0.00015978551564068295,
"loss": 0.6006,
"step": 162
},
{
"epoch": 0.3660864682762493,
"grad_norm": 0.41948559880256653,
"learning_rate": 0.00015976992923084161,
"loss": 0.615,
"step": 163
},
{
"epoch": 0.36833239752947783,
"grad_norm": 0.44141775369644165,
"learning_rate": 0.00015975379699692245,
"loss": 0.6236,
"step": 164
},
{
"epoch": 0.37057832678270636,
"grad_norm": 0.47903239727020264,
"learning_rate": 0.00015973711904930993,
"loss": 0.5979,
"step": 165
},
{
"epoch": 0.3728242560359349,
"grad_norm": 0.45099982619285583,
"learning_rate": 0.00015971989550212255,
"loss": 0.6229,
"step": 166
},
{
"epoch": 0.3750701852891634,
"grad_norm": 0.4214828312397003,
"learning_rate": 0.00015970212647321207,
"loss": 0.6146,
"step": 167
},
{
"epoch": 0.3773161145423919,
"grad_norm": 0.43835896253585815,
"learning_rate": 0.00015968381208416273,
"loss": 0.6162,
"step": 168
},
{
"epoch": 0.3795620437956204,
"grad_norm": 0.4372192621231079,
"learning_rate": 0.00015966495246029033,
"loss": 0.6152,
"step": 169
},
{
"epoch": 0.38180797304884895,
"grad_norm": 0.45570138096809387,
"learning_rate": 0.00015964554773064148,
"loss": 0.6107,
"step": 170
},
{
"epoch": 0.3840539023020775,
"grad_norm": 0.5014758706092834,
"learning_rate": 0.0001596255980279926,
"loss": 0.5861,
"step": 171
},
{
"epoch": 0.386299831555306,
"grad_norm": 0.46727222204208374,
"learning_rate": 0.00015960510348884914,
"loss": 0.6104,
"step": 172
},
{
"epoch": 0.38854576080853453,
"grad_norm": 0.5081140398979187,
"learning_rate": 0.00015958406425344455,
"loss": 0.5948,
"step": 173
},
{
"epoch": 0.39079169006176306,
"grad_norm": 0.4470350742340088,
"learning_rate": 0.00015956248046573938,
"loss": 0.5924,
"step": 174
},
{
"epoch": 0.3930376193149916,
"grad_norm": 0.36340662837028503,
"learning_rate": 0.00015954035227342019,
"loss": 0.5972,
"step": 175
},
{
"epoch": 0.3952835485682201,
"grad_norm": 0.34276771545410156,
"learning_rate": 0.00015951767982789875,
"loss": 0.5955,
"step": 176
},
{
"epoch": 0.39752947782144865,
"grad_norm": 0.35867977142333984,
"learning_rate": 0.00015949446328431075,
"loss": 0.611,
"step": 177
},
{
"epoch": 0.3997754070746771,
"grad_norm": 0.3728366792201996,
"learning_rate": 0.00015947070280151492,
"loss": 0.6117,
"step": 178
},
{
"epoch": 0.40202133632790565,
"grad_norm": 0.32302939891815186,
"learning_rate": 0.00015944639854209184,
"loss": 0.6225,
"step": 179
},
{
"epoch": 0.4042672655811342,
"grad_norm": 0.33579641580581665,
"learning_rate": 0.00015942155067234293,
"loss": 0.5915,
"step": 180
},
{
"epoch": 0.4065131948343627,
"grad_norm": 0.29243412613868713,
"learning_rate": 0.00015939615936228922,
"loss": 0.5915,
"step": 181
},
{
"epoch": 0.40875912408759124,
"grad_norm": 0.32980793714523315,
"learning_rate": 0.00015937022478567023,
"loss": 0.6172,
"step": 182
},
{
"epoch": 0.41100505334081977,
"grad_norm": 0.30575114488601685,
"learning_rate": 0.0001593437471199427,
"loss": 0.5958,
"step": 183
},
{
"epoch": 0.4132509825940483,
"grad_norm": 0.3298383951187134,
"learning_rate": 0.00015931672654627958,
"loss": 0.5949,
"step": 184
},
{
"epoch": 0.41549691184727683,
"grad_norm": 0.330642431974411,
"learning_rate": 0.00015928916324956855,
"loss": 0.5929,
"step": 185
},
{
"epoch": 0.41774284110050536,
"grad_norm": 0.33059626817703247,
"learning_rate": 0.00015926105741841088,
"loss": 0.609,
"step": 186
},
{
"epoch": 0.41998877035373383,
"grad_norm": 0.3915445804595947,
"learning_rate": 0.00015923240924512014,
"loss": 0.6045,
"step": 187
},
{
"epoch": 0.42223469960696236,
"grad_norm": 0.3589101731777191,
"learning_rate": 0.00015920321892572088,
"loss": 0.6175,
"step": 188
},
{
"epoch": 0.4244806288601909,
"grad_norm": 0.399964302778244,
"learning_rate": 0.00015917348665994723,
"loss": 0.6157,
"step": 189
},
{
"epoch": 0.4267265581134194,
"grad_norm": 0.3923908770084381,
"learning_rate": 0.0001591432126512416,
"loss": 0.6041,
"step": 190
},
{
"epoch": 0.42897248736664795,
"grad_norm": 0.38844165205955505,
"learning_rate": 0.0001591123971067533,
"loss": 0.5865,
"step": 191
},
{
"epoch": 0.4312184166198765,
"grad_norm": 0.41744178533554077,
"learning_rate": 0.00015908104023733697,
"loss": 0.5823,
"step": 192
},
{
"epoch": 0.433464345873105,
"grad_norm": 0.3478281795978546,
"learning_rate": 0.0001590491422575514,
"loss": 0.6064,
"step": 193
},
{
"epoch": 0.43571027512633353,
"grad_norm": 0.38580065965652466,
"learning_rate": 0.00015901670338565785,
"loss": 0.6119,
"step": 194
},
{
"epoch": 0.43795620437956206,
"grad_norm": 0.5283933877944946,
"learning_rate": 0.0001589837238436186,
"loss": 0.5945,
"step": 195
},
{
"epoch": 0.4402021336327906,
"grad_norm": 0.48087194561958313,
"learning_rate": 0.00015895020385709553,
"loss": 0.6058,
"step": 196
},
{
"epoch": 0.44244806288601907,
"grad_norm": 0.35071608424186707,
"learning_rate": 0.00015891614365544837,
"loss": 0.5672,
"step": 197
},
{
"epoch": 0.4446939921392476,
"grad_norm": 0.3820844888687134,
"learning_rate": 0.0001588815434717334,
"loss": 0.5898,
"step": 198
},
{
"epoch": 0.4469399213924761,
"grad_norm": 0.3622789680957794,
"learning_rate": 0.0001588464035427016,
"loss": 0.5842,
"step": 199
},
{
"epoch": 0.44918585064570465,
"grad_norm": 0.348568856716156,
"learning_rate": 0.00015881072410879726,
"loss": 0.6025,
"step": 200
},
{
"epoch": 0.4514317798989332,
"grad_norm": 0.36718496680259705,
"learning_rate": 0.00015877450541415615,
"loss": 0.5888,
"step": 201
},
{
"epoch": 0.4536777091521617,
"grad_norm": 0.39695170521736145,
"learning_rate": 0.0001587377477066039,
"loss": 0.6159,
"step": 202
},
{
"epoch": 0.45592363840539024,
"grad_norm": 0.4380107522010803,
"learning_rate": 0.0001587004512376544,
"loss": 0.6001,
"step": 203
},
{
"epoch": 0.45816956765861877,
"grad_norm": 0.40494075417518616,
"learning_rate": 0.00015866261626250794,
"loss": 0.6016,
"step": 204
},
{
"epoch": 0.4604154969118473,
"grad_norm": 0.3275372385978699,
"learning_rate": 0.00015862424304004954,
"loss": 0.5918,
"step": 205
},
{
"epoch": 0.4626614261650758,
"grad_norm": 0.3288284242153168,
"learning_rate": 0.00015858533183284718,
"loss": 0.608,
"step": 206
},
{
"epoch": 0.4649073554183043,
"grad_norm": 0.32171040773391724,
"learning_rate": 0.00015854588290714999,
"loss": 0.5816,
"step": 207
},
{
"epoch": 0.46715328467153283,
"grad_norm": 0.3992040157318115,
"learning_rate": 0.00015850589653288642,
"loss": 0.591,
"step": 208
},
{
"epoch": 0.46939921392476136,
"grad_norm": 0.38158226013183594,
"learning_rate": 0.00015846537298366242,
"loss": 0.5831,
"step": 209
},
{
"epoch": 0.4716451431779899,
"grad_norm": 0.32366326451301575,
"learning_rate": 0.0001584243125367595,
"loss": 0.5822,
"step": 210
},
{
"epoch": 0.4738910724312184,
"grad_norm": 0.41187676787376404,
"learning_rate": 0.00015838271547313293,
"loss": 0.6027,
"step": 211
},
{
"epoch": 0.47613700168444695,
"grad_norm": 0.48473531007766724,
"learning_rate": 0.00015834058207740974,
"loss": 0.5819,
"step": 212
},
{
"epoch": 0.4783829309376755,
"grad_norm": 0.3934939205646515,
"learning_rate": 0.00015829791263788682,
"loss": 0.6042,
"step": 213
},
{
"epoch": 0.480628860190904,
"grad_norm": 0.32344624400138855,
"learning_rate": 0.00015825470744652894,
"loss": 0.5717,
"step": 214
},
{
"epoch": 0.48287478944413254,
"grad_norm": 0.27189725637435913,
"learning_rate": 0.0001582109667989667,
"loss": 0.6015,
"step": 215
},
{
"epoch": 0.485120718697361,
"grad_norm": 0.349128395318985,
"learning_rate": 0.00015816669099449454,
"loss": 0.6037,
"step": 216
},
{
"epoch": 0.48736664795058954,
"grad_norm": 0.3456957936286926,
"learning_rate": 0.00015812188033606877,
"loss": 0.5974,
"step": 217
},
{
"epoch": 0.48961257720381807,
"grad_norm": 0.29926273226737976,
"learning_rate": 0.00015807653513030538,
"loss": 0.6,
"step": 218
},
{
"epoch": 0.4918585064570466,
"grad_norm": 0.3260749280452728,
"learning_rate": 0.00015803065568747798,
"loss": 0.5955,
"step": 219
},
{
"epoch": 0.4941044357102751,
"grad_norm": 0.4071785509586334,
"learning_rate": 0.00015798424232151573,
"loss": 0.5899,
"step": 220
},
{
"epoch": 0.49635036496350365,
"grad_norm": 0.37568220496177673,
"learning_rate": 0.00015793729535000108,
"loss": 0.6008,
"step": 221
},
{
"epoch": 0.4985962942167322,
"grad_norm": 0.4158768355846405,
"learning_rate": 0.00015788981509416773,
"loss": 0.5897,
"step": 222
},
{
"epoch": 0.5008422234699607,
"grad_norm": 0.44514065980911255,
"learning_rate": 0.00015784180187889833,
"loss": 0.5807,
"step": 223
},
{
"epoch": 0.5030881527231892,
"grad_norm": 0.37475013732910156,
"learning_rate": 0.00015779325603272232,
"loss": 0.586,
"step": 224
},
{
"epoch": 0.5053340819764177,
"grad_norm": 0.4093579649925232,
"learning_rate": 0.0001577441778878136,
"loss": 0.5966,
"step": 225
},
{
"epoch": 0.5075800112296462,
"grad_norm": 0.4048860967159271,
"learning_rate": 0.00015769456777998842,
"loss": 0.6107,
"step": 226
},
{
"epoch": 0.5098259404828748,
"grad_norm": 0.31557515263557434,
"learning_rate": 0.00015764442604870285,
"loss": 0.609,
"step": 227
},
{
"epoch": 0.5120718697361033,
"grad_norm": 0.33514130115509033,
"learning_rate": 0.0001575937530370507,
"loss": 0.5866,
"step": 228
},
{
"epoch": 0.5143177989893318,
"grad_norm": 0.3601367771625519,
"learning_rate": 0.0001575425490917609,
"loss": 0.586,
"step": 229
},
{
"epoch": 0.5165637282425604,
"grad_norm": 0.3701965808868408,
"learning_rate": 0.00015749081456319544,
"loss": 0.5755,
"step": 230
},
{
"epoch": 0.5188096574957889,
"grad_norm": 0.3042786419391632,
"learning_rate": 0.0001574385498053468,
"loss": 0.5978,
"step": 231
},
{
"epoch": 0.5210555867490174,
"grad_norm": 0.33692997694015503,
"learning_rate": 0.00015738575517583542,
"loss": 0.6078,
"step": 232
},
{
"epoch": 0.523301516002246,
"grad_norm": 0.36644524335861206,
"learning_rate": 0.00015733243103590748,
"loss": 0.575,
"step": 233
},
{
"epoch": 0.5255474452554745,
"grad_norm": 0.3802548050880432,
"learning_rate": 0.00015727857775043227,
"loss": 0.6041,
"step": 234
},
{
"epoch": 0.527793374508703,
"grad_norm": 0.37767213582992554,
"learning_rate": 0.00015722419568789983,
"loss": 0.591,
"step": 235
},
{
"epoch": 0.5300393037619315,
"grad_norm": 0.38524535298347473,
"learning_rate": 0.00015716928522041825,
"loss": 0.601,
"step": 236
},
{
"epoch": 0.5322852330151601,
"grad_norm": 0.4806888699531555,
"learning_rate": 0.00015711384672371126,
"loss": 0.5935,
"step": 237
},
{
"epoch": 0.5345311622683886,
"grad_norm": 0.3960248827934265,
"learning_rate": 0.0001570578805771156,
"loss": 0.5789,
"step": 238
},
{
"epoch": 0.5367770915216171,
"grad_norm": 0.29831525683403015,
"learning_rate": 0.00015700138716357852,
"loss": 0.5917,
"step": 239
},
{
"epoch": 0.5390230207748455,
"grad_norm": 0.30690401792526245,
"learning_rate": 0.00015694436686965497,
"loss": 0.5819,
"step": 240
},
{
"epoch": 0.5412689500280741,
"grad_norm": 0.30107825994491577,
"learning_rate": 0.00015688682008550514,
"loss": 0.5965,
"step": 241
},
{
"epoch": 0.5435148792813026,
"grad_norm": 0.30696406960487366,
"learning_rate": 0.0001568287472048917,
"loss": 0.6025,
"step": 242
},
{
"epoch": 0.5457608085345311,
"grad_norm": 0.32280731201171875,
"learning_rate": 0.00015677014862517714,
"loss": 0.5868,
"step": 243
},
{
"epoch": 0.5480067377877597,
"grad_norm": 0.31739377975463867,
"learning_rate": 0.000156711024747321,
"loss": 0.5898,
"step": 244
},
{
"epoch": 0.5502526670409882,
"grad_norm": 0.3620510995388031,
"learning_rate": 0.0001566513759758772,
"loss": 0.5621,
"step": 245
},
{
"epoch": 0.5524985962942167,
"grad_norm": 0.26646366715431213,
"learning_rate": 0.00015659120271899118,
"loss": 0.5731,
"step": 246
},
{
"epoch": 0.5547445255474452,
"grad_norm": 0.3814524710178375,
"learning_rate": 0.00015653050538839722,
"loss": 0.5947,
"step": 247
},
{
"epoch": 0.5569904548006738,
"grad_norm": 0.4031396210193634,
"learning_rate": 0.00015646928439941557,
"loss": 0.612,
"step": 248
},
{
"epoch": 0.5592363840539023,
"grad_norm": 0.38268253207206726,
"learning_rate": 0.00015640754017094954,
"loss": 0.5792,
"step": 249
},
{
"epoch": 0.5614823133071308,
"grad_norm": 0.37941139936447144,
"learning_rate": 0.0001563452731254827,
"loss": 0.6071,
"step": 250
},
{
"epoch": 0.5637282425603594,
"grad_norm": 0.3618276119232178,
"learning_rate": 0.00015628248368907603,
"loss": 0.5776,
"step": 251
},
{
"epoch": 0.5659741718135879,
"grad_norm": 0.3906313180923462,
"learning_rate": 0.000156219172291365,
"loss": 0.5732,
"step": 252
},
{
"epoch": 0.5682201010668164,
"grad_norm": 0.4234972894191742,
"learning_rate": 0.0001561553393655564,
"loss": 0.5674,
"step": 253
},
{
"epoch": 0.570466030320045,
"grad_norm": 0.4400922954082489,
"learning_rate": 0.00015609098534842582,
"loss": 0.5894,
"step": 254
},
{
"epoch": 0.5727119595732735,
"grad_norm": 0.38799750804901123,
"learning_rate": 0.0001560261106803142,
"loss": 0.5833,
"step": 255
},
{
"epoch": 0.574957888826502,
"grad_norm": 0.31524044275283813,
"learning_rate": 0.00015596071580512515,
"loss": 0.5841,
"step": 256
},
{
"epoch": 0.5772038180797305,
"grad_norm": 0.3451038599014282,
"learning_rate": 0.00015589480117032174,
"loss": 0.6003,
"step": 257
},
{
"epoch": 0.5794497473329591,
"grad_norm": 0.3648560047149658,
"learning_rate": 0.00015582836722692346,
"loss": 0.5787,
"step": 258
},
{
"epoch": 0.5816956765861875,
"grad_norm": 0.37476226687431335,
"learning_rate": 0.00015576141442950317,
"loss": 0.5719,
"step": 259
},
{
"epoch": 0.583941605839416,
"grad_norm": 0.33187106251716614,
"learning_rate": 0.00015569394323618403,
"loss": 0.5785,
"step": 260
},
{
"epoch": 0.5861875350926445,
"grad_norm": 0.36073818802833557,
"learning_rate": 0.00015562595410863626,
"loss": 0.5965,
"step": 261
},
{
"epoch": 0.5884334643458731,
"grad_norm": 0.3586486577987671,
"learning_rate": 0.00015555744751207404,
"loss": 0.5857,
"step": 262
},
{
"epoch": 0.5906793935991016,
"grad_norm": 0.44820883870124817,
"learning_rate": 0.0001554884239152523,
"loss": 0.5804,
"step": 263
},
{
"epoch": 0.5929253228523301,
"grad_norm": 0.43128344416618347,
"learning_rate": 0.00015541888379046366,
"loss": 0.5613,
"step": 264
},
{
"epoch": 0.5951712521055587,
"grad_norm": 0.38606396317481995,
"learning_rate": 0.0001553488276135349,
"loss": 0.5958,
"step": 265
},
{
"epoch": 0.5974171813587872,
"grad_norm": 0.36493563652038574,
"learning_rate": 0.0001552782558638239,
"loss": 0.5663,
"step": 266
},
{
"epoch": 0.5996631106120157,
"grad_norm": 0.40545809268951416,
"learning_rate": 0.00015520716902421648,
"loss": 0.5934,
"step": 267
},
{
"epoch": 0.6019090398652442,
"grad_norm": 0.42288488149642944,
"learning_rate": 0.00015513556758112282,
"loss": 0.5729,
"step": 268
},
{
"epoch": 0.6041549691184728,
"grad_norm": 0.2895568311214447,
"learning_rate": 0.00015506345202447432,
"loss": 0.6046,
"step": 269
},
{
"epoch": 0.6064008983717013,
"grad_norm": 0.3440837562084198,
"learning_rate": 0.00015499082284772017,
"loss": 0.5654,
"step": 270
},
{
"epoch": 0.6086468276249298,
"grad_norm": 0.36002352833747864,
"learning_rate": 0.00015491768054782395,
"loss": 0.5923,
"step": 271
},
{
"epoch": 0.6108927568781584,
"grad_norm": 0.28700196743011475,
"learning_rate": 0.00015484402562526036,
"loss": 0.5826,
"step": 272
},
{
"epoch": 0.6131386861313869,
"grad_norm": 0.32599133253097534,
"learning_rate": 0.0001547698585840117,
"loss": 0.5783,
"step": 273
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.37215182185173035,
"learning_rate": 0.00015469517993156435,
"loss": 0.583,
"step": 274
},
{
"epoch": 0.617630544637844,
"grad_norm": 0.3325370252132416,
"learning_rate": 0.0001546199901789055,
"loss": 0.582,
"step": 275
},
{
"epoch": 0.6198764738910725,
"grad_norm": 0.3477807939052582,
"learning_rate": 0.00015454428984051937,
"loss": 0.5726,
"step": 276
},
{
"epoch": 0.622122403144301,
"grad_norm": 0.37678489089012146,
"learning_rate": 0.000154468079434384,
"loss": 0.5786,
"step": 277
},
{
"epoch": 0.6243683323975294,
"grad_norm": 0.3045758008956909,
"learning_rate": 0.00015439135948196756,
"loss": 0.5829,
"step": 278
},
{
"epoch": 0.626614261650758,
"grad_norm": 0.3221797049045563,
"learning_rate": 0.0001543141305082246,
"loss": 0.5811,
"step": 279
},
{
"epoch": 0.6288601909039865,
"grad_norm": 0.35202842950820923,
"learning_rate": 0.00015423639304159288,
"loss": 0.5655,
"step": 280
},
{
"epoch": 0.631106120157215,
"grad_norm": 0.2838123142719269,
"learning_rate": 0.00015415814761398936,
"loss": 0.5991,
"step": 281
},
{
"epoch": 0.6333520494104435,
"grad_norm": 0.33944493532180786,
"learning_rate": 0.0001540793947608067,
"loss": 0.5764,
"step": 282
},
{
"epoch": 0.6355979786636721,
"grad_norm": 0.29667678475379944,
"learning_rate": 0.0001540001350209097,
"loss": 0.5745,
"step": 283
},
{
"epoch": 0.6378439079169006,
"grad_norm": 0.37716934084892273,
"learning_rate": 0.00015392036893663148,
"loss": 0.5739,
"step": 284
},
{
"epoch": 0.6400898371701291,
"grad_norm": 0.3955274522304535,
"learning_rate": 0.00015384009705376978,
"loss": 0.574,
"step": 285
},
{
"epoch": 0.6423357664233577,
"grad_norm": 0.29740408062934875,
"learning_rate": 0.00015375931992158331,
"loss": 0.567,
"step": 286
},
{
"epoch": 0.6445816956765862,
"grad_norm": 0.3198919892311096,
"learning_rate": 0.0001536780380927879,
"loss": 0.5672,
"step": 287
},
{
"epoch": 0.6468276249298147,
"grad_norm": 0.3355892598628998,
"learning_rate": 0.0001535962521235528,
"loss": 0.57,
"step": 288
},
{
"epoch": 0.6490735541830432,
"grad_norm": 0.32803425192832947,
"learning_rate": 0.00015351396257349675,
"loss": 0.5839,
"step": 289
},
{
"epoch": 0.6513194834362718,
"grad_norm": 0.3538999557495117,
"learning_rate": 0.00015343117000568432,
"loss": 0.5864,
"step": 290
},
{
"epoch": 0.6535654126895003,
"grad_norm": 0.3156984746456146,
"learning_rate": 0.00015334787498662192,
"loss": 0.5872,
"step": 291
},
{
"epoch": 0.6558113419427288,
"grad_norm": 0.336056113243103,
"learning_rate": 0.00015326407808625395,
"loss": 0.578,
"step": 292
},
{
"epoch": 0.6580572711959574,
"grad_norm": 0.3894708454608917,
"learning_rate": 0.00015317977987795898,
"loss": 0.5682,
"step": 293
},
{
"epoch": 0.6603032004491859,
"grad_norm": 0.3500683605670929,
"learning_rate": 0.00015309498093854577,
"loss": 0.5934,
"step": 294
},
{
"epoch": 0.6625491297024144,
"grad_norm": 0.331767201423645,
"learning_rate": 0.00015300968184824926,
"loss": 0.5781,
"step": 295
},
{
"epoch": 0.664795058955643,
"grad_norm": 0.4042721092700958,
"learning_rate": 0.0001529238831907267,
"loss": 0.5811,
"step": 296
},
{
"epoch": 0.6670409882088714,
"grad_norm": 0.2907451093196869,
"learning_rate": 0.00015283758555305362,
"loss": 0.5925,
"step": 297
},
{
"epoch": 0.6692869174620999,
"grad_norm": 0.28044381737709045,
"learning_rate": 0.0001527507895257198,
"loss": 0.5717,
"step": 298
},
{
"epoch": 0.6715328467153284,
"grad_norm": 0.2812747359275818,
"learning_rate": 0.00015266349570262528,
"loss": 0.5796,
"step": 299
},
{
"epoch": 0.673778775968557,
"grad_norm": 0.28039273619651794,
"learning_rate": 0.00015257570468107617,
"loss": 0.5682,
"step": 300
},
{
"epoch": 0.6760247052217855,
"grad_norm": 0.2821033000946045,
"learning_rate": 0.00015248741706178073,
"loss": 0.5939,
"step": 301
},
{
"epoch": 0.678270634475014,
"grad_norm": 0.31085771322250366,
"learning_rate": 0.0001523986334488452,
"loss": 0.5829,
"step": 302
},
{
"epoch": 0.6805165637282425,
"grad_norm": 0.31658798456192017,
"learning_rate": 0.00015230935444976955,
"loss": 0.6073,
"step": 303
},
{
"epoch": 0.6827624929814711,
"grad_norm": 0.28057488799095154,
"learning_rate": 0.00015221958067544348,
"loss": 0.5888,
"step": 304
},
{
"epoch": 0.6850084222346996,
"grad_norm": 0.29499179124832153,
"learning_rate": 0.00015212931274014214,
"loss": 0.5713,
"step": 305
},
{
"epoch": 0.6872543514879281,
"grad_norm": 0.31696656346321106,
"learning_rate": 0.00015203855126152204,
"loss": 0.5956,
"step": 306
},
{
"epoch": 0.6895002807411567,
"grad_norm": 0.2905656695365906,
"learning_rate": 0.00015194729686061672,
"loss": 0.56,
"step": 307
},
{
"epoch": 0.6917462099943852,
"grad_norm": 0.33711618185043335,
"learning_rate": 0.00015185555016183246,
"loss": 0.5816,
"step": 308
},
{
"epoch": 0.6939921392476137,
"grad_norm": 0.3962436616420746,
"learning_rate": 0.00015176331179294416,
"loss": 0.5933,
"step": 309
},
{
"epoch": 0.6962380685008422,
"grad_norm": 0.2827875316143036,
"learning_rate": 0.00015167058238509093,
"loss": 0.5529,
"step": 310
},
{
"epoch": 0.6984839977540708,
"grad_norm": 0.252986878156662,
"learning_rate": 0.00015157736257277182,
"loss": 0.5915,
"step": 311
},
{
"epoch": 0.7007299270072993,
"grad_norm": 0.28363773226737976,
"learning_rate": 0.00015148365299384145,
"loss": 0.5621,
"step": 312
},
{
"epoch": 0.7029758562605278,
"grad_norm": 0.26527139544487,
"learning_rate": 0.00015138945428950566,
"loss": 0.5791,
"step": 313
},
{
"epoch": 0.7052217855137564,
"grad_norm": 0.24393455684185028,
"learning_rate": 0.0001512947671043171,
"loss": 0.5549,
"step": 314
},
{
"epoch": 0.7074677147669849,
"grad_norm": 0.24904131889343262,
"learning_rate": 0.00015119959208617092,
"loss": 0.5627,
"step": 315
},
{
"epoch": 0.7097136440202133,
"grad_norm": 0.3018868863582611,
"learning_rate": 0.00015110392988630016,
"loss": 0.5802,
"step": 316
},
{
"epoch": 0.7119595732734418,
"grad_norm": 0.34517163038253784,
"learning_rate": 0.0001510077811592714,
"loss": 0.5831,
"step": 317
},
{
"epoch": 0.7142055025266704,
"grad_norm": 0.3295687437057495,
"learning_rate": 0.00015091114656298033,
"loss": 0.5978,
"step": 318
},
{
"epoch": 0.7164514317798989,
"grad_norm": 0.3116067945957184,
"learning_rate": 0.00015081402675864717,
"loss": 0.58,
"step": 319
},
{
"epoch": 0.7186973610331274,
"grad_norm": 0.2843012809753418,
"learning_rate": 0.00015071642241081212,
"loss": 0.5837,
"step": 320
},
{
"epoch": 0.720943290286356,
"grad_norm": 0.27185961604118347,
"learning_rate": 0.00015061833418733095,
"loss": 0.5746,
"step": 321
},
{
"epoch": 0.7231892195395845,
"grad_norm": 0.26890790462493896,
"learning_rate": 0.00015051976275937023,
"loss": 0.5642,
"step": 322
},
{
"epoch": 0.725435148792813,
"grad_norm": 0.29379114508628845,
"learning_rate": 0.00015042070880140292,
"loss": 0.5796,
"step": 323
},
{
"epoch": 0.7276810780460415,
"grad_norm": 0.2906297743320465,
"learning_rate": 0.0001503211729912037,
"loss": 0.5666,
"step": 324
},
{
"epoch": 0.7299270072992701,
"grad_norm": 0.2815559506416321,
"learning_rate": 0.00015022115600984423,
"loss": 0.5582,
"step": 325
},
{
"epoch": 0.7321729365524986,
"grad_norm": 0.3286380469799042,
"learning_rate": 0.0001501206585416886,
"loss": 0.5462,
"step": 326
},
{
"epoch": 0.7344188658057271,
"grad_norm": 0.3522341549396515,
"learning_rate": 0.00015001968127438872,
"loss": 0.5654,
"step": 327
},
{
"epoch": 0.7366647950589557,
"grad_norm": 0.33905208110809326,
"learning_rate": 0.00014991822489887938,
"loss": 0.5606,
"step": 328
},
{
"epoch": 0.7389107243121842,
"grad_norm": 0.29921072721481323,
"learning_rate": 0.00014981629010937372,
"loss": 0.5772,
"step": 329
},
{
"epoch": 0.7411566535654127,
"grad_norm": 0.2822812497615814,
"learning_rate": 0.00014971387760335841,
"loss": 0.5772,
"step": 330
},
{
"epoch": 0.7434025828186412,
"grad_norm": 0.3244154155254364,
"learning_rate": 0.0001496109880815889,
"loss": 0.5736,
"step": 331
},
{
"epoch": 0.7456485120718698,
"grad_norm": 0.3305480480194092,
"learning_rate": 0.0001495076222480846,
"loss": 0.586,
"step": 332
},
{
"epoch": 0.7478944413250983,
"grad_norm": 0.3018239140510559,
"learning_rate": 0.00014940378081012407,
"loss": 0.579,
"step": 333
},
{
"epoch": 0.7501403705783268,
"grad_norm": 0.3692958652973175,
"learning_rate": 0.00014929946447824014,
"loss": 0.5767,
"step": 334
},
{
"epoch": 0.7523862998315554,
"grad_norm": 0.3724178373813629,
"learning_rate": 0.00014919467396621523,
"loss": 0.5721,
"step": 335
},
{
"epoch": 0.7546322290847838,
"grad_norm": 0.3226647973060608,
"learning_rate": 0.00014908940999107615,
"loss": 0.553,
"step": 336
},
{
"epoch": 0.7568781583380123,
"grad_norm": 0.28518086671829224,
"learning_rate": 0.00014898367327308945,
"loss": 0.566,
"step": 337
},
{
"epoch": 0.7591240875912408,
"grad_norm": 0.2642190158367157,
"learning_rate": 0.0001488774645357565,
"loss": 0.5732,
"step": 338
},
{
"epoch": 0.7613700168444694,
"grad_norm": 0.2713199555873871,
"learning_rate": 0.0001487707845058083,
"loss": 0.5679,
"step": 339
},
{
"epoch": 0.7636159460976979,
"grad_norm": 0.28339532017707825,
"learning_rate": 0.00014866363391320076,
"loss": 0.5664,
"step": 340
},
{
"epoch": 0.7658618753509264,
"grad_norm": 0.26976078748703003,
"learning_rate": 0.0001485560134911096,
"loss": 0.5917,
"step": 341
},
{
"epoch": 0.768107804604155,
"grad_norm": 0.31055644154548645,
"learning_rate": 0.00014844792397592524,
"loss": 0.5609,
"step": 342
},
{
"epoch": 0.7703537338573835,
"grad_norm": 0.28089481592178345,
"learning_rate": 0.000148339366107248,
"loss": 0.5553,
"step": 343
},
{
"epoch": 0.772599663110612,
"grad_norm": 0.3059735894203186,
"learning_rate": 0.00014823034062788282,
"loss": 0.5827,
"step": 344
},
{
"epoch": 0.7748455923638405,
"grad_norm": 0.3540654480457306,
"learning_rate": 0.00014812084828383425,
"loss": 0.5417,
"step": 345
},
{
"epoch": 0.7770915216170691,
"grad_norm": 0.3125968277454376,
"learning_rate": 0.0001480108898243014,
"loss": 0.5676,
"step": 346
},
{
"epoch": 0.7793374508702976,
"grad_norm": 0.2534315884113312,
"learning_rate": 0.0001479004660016727,
"loss": 0.5724,
"step": 347
},
{
"epoch": 0.7815833801235261,
"grad_norm": 0.30985814332962036,
"learning_rate": 0.0001477895775715209,
"loss": 0.5682,
"step": 348
},
{
"epoch": 0.7838293093767547,
"grad_norm": 0.334831178188324,
"learning_rate": 0.00014767822529259772,
"loss": 0.5653,
"step": 349
},
{
"epoch": 0.7860752386299832,
"grad_norm": 0.29639920592308044,
"learning_rate": 0.00014756640992682883,
"loss": 0.5959,
"step": 350
},
{
"epoch": 0.7883211678832117,
"grad_norm": 0.33278346061706543,
"learning_rate": 0.00014745413223930858,
"loss": 0.57,
"step": 351
},
{
"epoch": 0.7905670971364402,
"grad_norm": 0.26434555649757385,
"learning_rate": 0.00014734139299829466,
"loss": 0.5847,
"step": 352
},
{
"epoch": 0.7928130263896688,
"grad_norm": 0.295564204454422,
"learning_rate": 0.00014722819297520296,
"loss": 0.5345,
"step": 353
},
{
"epoch": 0.7950589556428973,
"grad_norm": 0.32043787837028503,
"learning_rate": 0.00014711453294460235,
"loss": 0.5751,
"step": 354
},
{
"epoch": 0.7973048848961257,
"grad_norm": 0.35145339369773865,
"learning_rate": 0.00014700041368420914,
"loss": 0.5782,
"step": 355
},
{
"epoch": 0.7995508141493542,
"grad_norm": 0.2663813531398773,
"learning_rate": 0.00014688583597488204,
"loss": 0.5457,
"step": 356
},
{
"epoch": 0.8017967434025828,
"grad_norm": 0.3394940197467804,
"learning_rate": 0.00014677080060061662,
"loss": 0.5669,
"step": 357
},
{
"epoch": 0.8040426726558113,
"grad_norm": 0.28702473640441895,
"learning_rate": 0.00014665530834854002,
"loss": 0.5715,
"step": 358
},
{
"epoch": 0.8062886019090398,
"grad_norm": 0.3419654071331024,
"learning_rate": 0.0001465393600089056,
"loss": 0.5804,
"step": 359
},
{
"epoch": 0.8085345311622684,
"grad_norm": 0.35292762517929077,
"learning_rate": 0.00014642295637508742,
"loss": 0.5666,
"step": 360
},
{
"epoch": 0.8107804604154969,
"grad_norm": 0.31325843930244446,
"learning_rate": 0.00014630609824357494,
"loss": 0.5857,
"step": 361
},
{
"epoch": 0.8130263896687254,
"grad_norm": 0.27262774109840393,
"learning_rate": 0.00014618878641396748,
"loss": 0.5797,
"step": 362
},
{
"epoch": 0.815272318921954,
"grad_norm": 0.2780674397945404,
"learning_rate": 0.00014607102168896882,
"loss": 0.5552,
"step": 363
},
{
"epoch": 0.8175182481751825,
"grad_norm": 0.2732245922088623,
"learning_rate": 0.00014595280487438158,
"loss": 0.5716,
"step": 364
},
{
"epoch": 0.819764177428411,
"grad_norm": 0.33612555265426636,
"learning_rate": 0.0001458341367791019,
"loss": 0.5756,
"step": 365
},
{
"epoch": 0.8220101066816395,
"grad_norm": 0.267904669046402,
"learning_rate": 0.0001457150182151137,
"loss": 0.5694,
"step": 366
},
{
"epoch": 0.8242560359348681,
"grad_norm": 0.2547987401485443,
"learning_rate": 0.0001455954499974833,
"loss": 0.5718,
"step": 367
},
{
"epoch": 0.8265019651880966,
"grad_norm": 0.2813619375228882,
"learning_rate": 0.00014547543294435376,
"loss": 0.5521,
"step": 368
},
{
"epoch": 0.8287478944413251,
"grad_norm": 0.2692398428916931,
"learning_rate": 0.0001453549678769392,
"loss": 0.5644,
"step": 369
},
{
"epoch": 0.8309938236945537,
"grad_norm": 0.24875199794769287,
"learning_rate": 0.0001452340556195194,
"loss": 0.5563,
"step": 370
},
{
"epoch": 0.8332397529477822,
"grad_norm": 0.24863849580287933,
"learning_rate": 0.00014511269699943392,
"loss": 0.5479,
"step": 371
},
{
"epoch": 0.8354856822010107,
"grad_norm": 0.2492000311613083,
"learning_rate": 0.00014499089284707658,
"loss": 0.5742,
"step": 372
},
{
"epoch": 0.8377316114542392,
"grad_norm": 0.2373623251914978,
"learning_rate": 0.0001448686439958898,
"loss": 0.5688,
"step": 373
},
{
"epoch": 0.8399775407074677,
"grad_norm": 0.265248566865921,
"learning_rate": 0.00014474595128235876,
"loss": 0.5616,
"step": 374
},
{
"epoch": 0.8422234699606962,
"grad_norm": 0.2871013879776001,
"learning_rate": 0.00014462281554600577,
"loss": 0.556,
"step": 375
},
{
"epoch": 0.8444693992139247,
"grad_norm": 0.31418806314468384,
"learning_rate": 0.00014449923762938462,
"loss": 0.5644,
"step": 376
},
{
"epoch": 0.8467153284671532,
"grad_norm": 0.3332020044326782,
"learning_rate": 0.00014437521837807455,
"loss": 0.5611,
"step": 377
},
{
"epoch": 0.8489612577203818,
"grad_norm": 0.2672823965549469,
"learning_rate": 0.00014425075864067473,
"loss": 0.5575,
"step": 378
},
{
"epoch": 0.8512071869736103,
"grad_norm": 0.23632559180259705,
"learning_rate": 0.00014412585926879833,
"loss": 0.578,
"step": 379
},
{
"epoch": 0.8534531162268388,
"grad_norm": 0.31967830657958984,
"learning_rate": 0.00014400052111706668,
"loss": 0.5738,
"step": 380
},
{
"epoch": 0.8556990454800674,
"grad_norm": 0.3274000287055969,
"learning_rate": 0.0001438747450431035,
"loss": 0.5606,
"step": 381
},
{
"epoch": 0.8579449747332959,
"grad_norm": 0.32115650177001953,
"learning_rate": 0.00014374853190752892,
"loss": 0.601,
"step": 382
},
{
"epoch": 0.8601909039865244,
"grad_norm": 0.3195722997188568,
"learning_rate": 0.00014362188257395367,
"loss": 0.5794,
"step": 383
},
{
"epoch": 0.862436833239753,
"grad_norm": 0.32217174768447876,
"learning_rate": 0.00014349479790897325,
"loss": 0.5687,
"step": 384
},
{
"epoch": 0.8646827624929815,
"grad_norm": 0.3338417410850525,
"learning_rate": 0.00014336727878216178,
"loss": 0.5513,
"step": 385
},
{
"epoch": 0.86692869174621,
"grad_norm": 0.2939014732837677,
"learning_rate": 0.00014323932606606624,
"loss": 0.5845,
"step": 386
},
{
"epoch": 0.8691746209994385,
"grad_norm": 0.34269392490386963,
"learning_rate": 0.00014311094063620036,
"loss": 0.5721,
"step": 387
},
{
"epoch": 0.8714205502526671,
"grad_norm": 0.3684992492198944,
"learning_rate": 0.00014298212337103888,
"loss": 0.5924,
"step": 388
},
{
"epoch": 0.8736664795058956,
"grad_norm": 0.27671441435813904,
"learning_rate": 0.0001428528751520112,
"loss": 0.5536,
"step": 389
},
{
"epoch": 0.8759124087591241,
"grad_norm": 0.3508271276950836,
"learning_rate": 0.0001427231968634955,
"loss": 0.5499,
"step": 390
},
{
"epoch": 0.8781583380123527,
"grad_norm": 0.3735405504703522,
"learning_rate": 0.00014259308939281292,
"loss": 0.5472,
"step": 391
},
{
"epoch": 0.8804042672655812,
"grad_norm": 0.30313754081726074,
"learning_rate": 0.00014246255363022095,
"loss": 0.5598,
"step": 392
},
{
"epoch": 0.8826501965188096,
"grad_norm": 0.28613924980163574,
"learning_rate": 0.00014233159046890792,
"loss": 0.5589,
"step": 393
},
{
"epoch": 0.8848961257720381,
"grad_norm": 0.3396552503108978,
"learning_rate": 0.00014220020080498648,
"loss": 0.5722,
"step": 394
},
{
"epoch": 0.8871420550252667,
"grad_norm": 0.24562208354473114,
"learning_rate": 0.00014206838553748773,
"loss": 0.5617,
"step": 395
},
{
"epoch": 0.8893879842784952,
"grad_norm": 0.26731806993484497,
"learning_rate": 0.00014193614556835482,
"loss": 0.5876,
"step": 396
},
{
"epoch": 0.8916339135317237,
"grad_norm": 0.30024391412734985,
"learning_rate": 0.00014180348180243706,
"loss": 0.5457,
"step": 397
},
{
"epoch": 0.8938798427849522,
"grad_norm": 0.23074807226657867,
"learning_rate": 0.0001416703951474834,
"loss": 0.5767,
"step": 398
},
{
"epoch": 0.8961257720381808,
"grad_norm": 0.2882399260997772,
"learning_rate": 0.00014153688651413662,
"loss": 0.548,
"step": 399
},
{
"epoch": 0.8983717012914093,
"grad_norm": 0.30070793628692627,
"learning_rate": 0.00014140295681592667,
"loss": 0.5483,
"step": 400
},
{
"epoch": 0.9006176305446378,
"grad_norm": 0.261349081993103,
"learning_rate": 0.00014126860696926473,
"loss": 0.5568,
"step": 401
},
{
"epoch": 0.9028635597978664,
"grad_norm": 0.2514486610889435,
"learning_rate": 0.00014113383789343686,
"loss": 0.5656,
"step": 402
},
{
"epoch": 0.9051094890510949,
"grad_norm": 0.28470492362976074,
"learning_rate": 0.00014099865051059765,
"loss": 0.5877,
"step": 403
},
{
"epoch": 0.9073554183043234,
"grad_norm": 0.28581055998802185,
"learning_rate": 0.00014086304574576394,
"loss": 0.5703,
"step": 404
},
{
"epoch": 0.909601347557552,
"grad_norm": 0.22891870141029358,
"learning_rate": 0.00014072702452680848,
"loss": 0.5631,
"step": 405
},
{
"epoch": 0.9118472768107805,
"grad_norm": 0.27686670422554016,
"learning_rate": 0.00014059058778445363,
"loss": 0.542,
"step": 406
},
{
"epoch": 0.914093206064009,
"grad_norm": 0.27244243025779724,
"learning_rate": 0.000140453736452265,
"loss": 0.5444,
"step": 407
},
{
"epoch": 0.9163391353172375,
"grad_norm": 0.2376582771539688,
"learning_rate": 0.00014031647146664494,
"loss": 0.5624,
"step": 408
},
{
"epoch": 0.9185850645704661,
"grad_norm": 0.29739177227020264,
"learning_rate": 0.00014017879376682627,
"loss": 0.5579,
"step": 409
},
{
"epoch": 0.9208309938236946,
"grad_norm": 0.24522463977336884,
"learning_rate": 0.00014004070429486575,
"loss": 0.5778,
"step": 410
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.28420501947402954,
"learning_rate": 0.00013990220399563775,
"loss": 0.582,
"step": 411
},
{
"epoch": 0.9253228523301515,
"grad_norm": 0.3128701150417328,
"learning_rate": 0.0001397632938168277,
"loss": 0.5597,
"step": 412
},
{
"epoch": 0.9275687815833801,
"grad_norm": 0.2503584921360016,
"learning_rate": 0.0001396239747089255,
"loss": 0.557,
"step": 413
},
{
"epoch": 0.9298147108366086,
"grad_norm": 0.2346629947423935,
"learning_rate": 0.00013948424762521937,
"loss": 0.5567,
"step": 414
},
{
"epoch": 0.9320606400898371,
"grad_norm": 0.26330509781837463,
"learning_rate": 0.00013934411352178888,
"loss": 0.5556,
"step": 415
},
{
"epoch": 0.9343065693430657,
"grad_norm": 0.2683373689651489,
"learning_rate": 0.00013920357335749873,
"loss": 0.5585,
"step": 416
},
{
"epoch": 0.9365524985962942,
"grad_norm": 0.21568100154399872,
"learning_rate": 0.0001390626280939921,
"loss": 0.5837,
"step": 417
},
{
"epoch": 0.9387984278495227,
"grad_norm": 0.2531348168849945,
"learning_rate": 0.00013892127869568396,
"loss": 0.5505,
"step": 418
},
{
"epoch": 0.9410443571027512,
"grad_norm": 0.2542068362236023,
"learning_rate": 0.00013877952612975465,
"loss": 0.5834,
"step": 419
},
{
"epoch": 0.9432902863559798,
"grad_norm": 0.22806115448474884,
"learning_rate": 0.00013863737136614318,
"loss": 0.5648,
"step": 420
},
{
"epoch": 0.9455362156092083,
"grad_norm": 0.3068370819091797,
"learning_rate": 0.00013849481537754054,
"loss": 0.5488,
"step": 421
},
{
"epoch": 0.9477821448624368,
"grad_norm": 0.31988707184791565,
"learning_rate": 0.00013835185913938305,
"loss": 0.5679,
"step": 422
},
{
"epoch": 0.9500280741156654,
"grad_norm": 0.2480153888463974,
"learning_rate": 0.00013820850362984585,
"loss": 0.5481,
"step": 423
},
{
"epoch": 0.9522740033688939,
"grad_norm": 0.2835778295993805,
"learning_rate": 0.00013806474982983602,
"loss": 0.5575,
"step": 424
},
{
"epoch": 0.9545199326221224,
"grad_norm": 0.28375929594039917,
"learning_rate": 0.0001379205987229859,
"loss": 0.5522,
"step": 425
},
{
"epoch": 0.956765861875351,
"grad_norm": 0.32451605796813965,
"learning_rate": 0.00013777605129564649,
"loss": 0.5531,
"step": 426
},
{
"epoch": 0.9590117911285795,
"grad_norm": 0.2671431005001068,
"learning_rate": 0.00013763110853688053,
"loss": 0.5597,
"step": 427
},
{
"epoch": 0.961257720381808,
"grad_norm": 0.31597959995269775,
"learning_rate": 0.0001374857714384558,
"loss": 0.5668,
"step": 428
},
{
"epoch": 0.9635036496350365,
"grad_norm": 0.3307490050792694,
"learning_rate": 0.00013734004099483842,
"loss": 0.5412,
"step": 429
},
{
"epoch": 0.9657495788882651,
"grad_norm": 0.2594424784183502,
"learning_rate": 0.00013719391820318585,
"loss": 0.534,
"step": 430
},
{
"epoch": 0.9679955081414935,
"grad_norm": 0.24068522453308105,
"learning_rate": 0.00013704740406334027,
"loss": 0.567,
"step": 431
},
{
"epoch": 0.970241437394722,
"grad_norm": 0.27227339148521423,
"learning_rate": 0.00013690049957782162,
"loss": 0.55,
"step": 432
},
{
"epoch": 0.9724873666479505,
"grad_norm": 0.2331283837556839,
"learning_rate": 0.0001367532057518208,
"loss": 0.5296,
"step": 433
},
{
"epoch": 0.9747332959011791,
"grad_norm": 0.25519710779190063,
"learning_rate": 0.00013660552359319274,
"loss": 0.5759,
"step": 434
},
{
"epoch": 0.9769792251544076,
"grad_norm": 0.25783583521842957,
"learning_rate": 0.0001364574541124495,
"loss": 0.5642,
"step": 435
},
{
"epoch": 0.9792251544076361,
"grad_norm": 0.2404668927192688,
"learning_rate": 0.00013630899832275348,
"loss": 0.5566,
"step": 436
},
{
"epoch": 0.9814710836608647,
"grad_norm": 0.29142558574676514,
"learning_rate": 0.00013616015723991027,
"loss": 0.5666,
"step": 437
},
{
"epoch": 0.9837170129140932,
"grad_norm": 0.2782052755355835,
"learning_rate": 0.00013601093188236188,
"loss": 0.5507,
"step": 438
},
{
"epoch": 0.9859629421673217,
"grad_norm": 0.21326802670955658,
"learning_rate": 0.00013586132327117974,
"loss": 0.5685,
"step": 439
},
{
"epoch": 0.9882088714205502,
"grad_norm": 0.25155940651893616,
"learning_rate": 0.00013571133243005763,
"loss": 0.5803,
"step": 440
},
{
"epoch": 0.9904548006737788,
"grad_norm": 0.2218320518732071,
"learning_rate": 0.00013556096038530474,
"loss": 0.5488,
"step": 441
},
{
"epoch": 0.9927007299270073,
"grad_norm": 0.27737680077552795,
"learning_rate": 0.00013541020816583869,
"loss": 0.5651,
"step": 442
},
{
"epoch": 0.9949466591802358,
"grad_norm": 0.2509002387523651,
"learning_rate": 0.00013525907680317836,
"loss": 0.5525,
"step": 443
},
{
"epoch": 0.9971925884334644,
"grad_norm": 0.25884488224983215,
"learning_rate": 0.000135107567331437,
"loss": 0.567,
"step": 444
},
{
"epoch": 0.9994385176866929,
"grad_norm": 0.2978728711605072,
"learning_rate": 0.00013495568078731495,
"loss": 0.5405,
"step": 445
},
{
"epoch": 1.0016844469399213,
"grad_norm": 0.31027480959892273,
"learning_rate": 0.00013480341821009277,
"loss": 0.5251,
"step": 446
},
{
"epoch": 1.00393037619315,
"grad_norm": 0.3249771296977997,
"learning_rate": 0.00013465078064162393,
"loss": 0.5197,
"step": 447
},
{
"epoch": 1.0061763054463784,
"grad_norm": 0.330244243144989,
"learning_rate": 0.00013449776912632784,
"loss": 0.5177,
"step": 448
},
{
"epoch": 1.008422234699607,
"grad_norm": 0.31560900807380676,
"learning_rate": 0.00013434438471118262,
"loss": 0.5108,
"step": 449
},
{
"epoch": 1.0106681639528354,
"grad_norm": 0.32275712490081787,
"learning_rate": 0.00013419062844571784,
"loss": 0.498,
"step": 450
},
{
"epoch": 1.012914093206064,
"grad_norm": 0.30424079298973083,
"learning_rate": 0.0001340365013820077,
"loss": 0.5394,
"step": 451
},
{
"epoch": 1.0151600224592925,
"grad_norm": 0.26794448494911194,
"learning_rate": 0.00013388200457466326,
"loss": 0.4944,
"step": 452
},
{
"epoch": 1.0174059517125211,
"grad_norm": 0.31360936164855957,
"learning_rate": 0.00013372713908082578,
"loss": 0.5062,
"step": 453
},
{
"epoch": 1.0196518809657495,
"grad_norm": 0.33009976148605347,
"learning_rate": 0.00013357190596015919,
"loss": 0.5105,
"step": 454
},
{
"epoch": 1.0218978102189782,
"grad_norm": 0.2470821887254715,
"learning_rate": 0.00013341630627484286,
"loss": 0.5185,
"step": 455
},
{
"epoch": 1.0241437394722066,
"grad_norm": 0.304426908493042,
"learning_rate": 0.00013326034108956437,
"loss": 0.5292,
"step": 456
},
{
"epoch": 1.0263896687254352,
"grad_norm": 0.3242713510990143,
"learning_rate": 0.0001331040114715123,
"loss": 0.5214,
"step": 457
},
{
"epoch": 1.0286355979786637,
"grad_norm": 0.31412094831466675,
"learning_rate": 0.00013294731849036875,
"loss": 0.5106,
"step": 458
},
{
"epoch": 1.0308815272318923,
"grad_norm": 0.27217480540275574,
"learning_rate": 0.0001327902632183022,
"loss": 0.5344,
"step": 459
},
{
"epoch": 1.0331274564851207,
"grad_norm": 0.2789839208126068,
"learning_rate": 0.00013263284672996009,
"loss": 0.521,
"step": 460
},
{
"epoch": 1.0353733857383491,
"grad_norm": 0.27859795093536377,
"learning_rate": 0.00013247507010246144,
"loss": 0.5316,
"step": 461
},
{
"epoch": 1.0376193149915778,
"grad_norm": 0.30018481612205505,
"learning_rate": 0.00013231693441538952,
"loss": 0.5083,
"step": 462
},
{
"epoch": 1.0398652442448062,
"grad_norm": 0.2683006525039673,
"learning_rate": 0.0001321584407507845,
"loss": 0.5378,
"step": 463
},
{
"epoch": 1.0421111734980348,
"grad_norm": 0.27185767889022827,
"learning_rate": 0.000131999590193136,
"loss": 0.5117,
"step": 464
},
{
"epoch": 1.0443571027512633,
"grad_norm": 0.2839741110801697,
"learning_rate": 0.0001318403838293756,
"loss": 0.5282,
"step": 465
},
{
"epoch": 1.046603032004492,
"grad_norm": 0.2537892460823059,
"learning_rate": 0.00013168082274886953,
"loss": 0.5096,
"step": 466
},
{
"epoch": 1.0488489612577203,
"grad_norm": 0.2625972032546997,
"learning_rate": 0.00013152090804341118,
"loss": 0.5188,
"step": 467
},
{
"epoch": 1.051094890510949,
"grad_norm": 0.3052925169467926,
"learning_rate": 0.00013136064080721354,
"loss": 0.5409,
"step": 468
},
{
"epoch": 1.0533408197641774,
"grad_norm": 0.2866557538509369,
"learning_rate": 0.00013120002213690192,
"loss": 0.5101,
"step": 469
},
{
"epoch": 1.055586749017406,
"grad_norm": 0.26804205775260925,
"learning_rate": 0.00013103905313150617,
"loss": 0.5221,
"step": 470
},
{
"epoch": 1.0578326782706344,
"grad_norm": 0.2677738070487976,
"learning_rate": 0.00013087773489245334,
"loss": 0.5203,
"step": 471
},
{
"epoch": 1.060078607523863,
"grad_norm": 0.273448646068573,
"learning_rate": 0.00013071606852356013,
"loss": 0.5349,
"step": 472
},
{
"epoch": 1.0623245367770915,
"grad_norm": 0.27046024799346924,
"learning_rate": 0.00013055405513102533,
"loss": 0.5132,
"step": 473
},
{
"epoch": 1.0645704660303201,
"grad_norm": 0.25829020142555237,
"learning_rate": 0.00013039169582342215,
"loss": 0.4968,
"step": 474
},
{
"epoch": 1.0668163952835485,
"grad_norm": 0.27012374997138977,
"learning_rate": 0.0001302289917116908,
"loss": 0.5166,
"step": 475
},
{
"epoch": 1.0690623245367772,
"grad_norm": 0.2819938063621521,
"learning_rate": 0.00013006594390913077,
"loss": 0.5238,
"step": 476
},
{
"epoch": 1.0713082537900056,
"grad_norm": 0.24958448112010956,
"learning_rate": 0.00012990255353139324,
"loss": 0.5031,
"step": 477
},
{
"epoch": 1.073554183043234,
"grad_norm": 0.23778881132602692,
"learning_rate": 0.0001297388216964735,
"loss": 0.5297,
"step": 478
},
{
"epoch": 1.0758001122964627,
"grad_norm": 0.25948163866996765,
"learning_rate": 0.00012957474952470313,
"loss": 0.5146,
"step": 479
},
{
"epoch": 1.078046041549691,
"grad_norm": 0.22898133099079132,
"learning_rate": 0.00012941033813874264,
"loss": 0.5137,
"step": 480
},
{
"epoch": 1.0802919708029197,
"grad_norm": 0.2507185637950897,
"learning_rate": 0.00012924558866357343,
"loss": 0.5241,
"step": 481
},
{
"epoch": 1.0825379000561481,
"grad_norm": 0.2403927892446518,
"learning_rate": 0.00012908050222649043,
"loss": 0.5036,
"step": 482
},
{
"epoch": 1.0847838293093768,
"grad_norm": 0.23922879993915558,
"learning_rate": 0.00012891507995709412,
"loss": 0.528,
"step": 483
},
{
"epoch": 1.0870297585626052,
"grad_norm": 0.2286342829465866,
"learning_rate": 0.00012874932298728286,
"loss": 0.5202,
"step": 484
},
{
"epoch": 1.0892756878158338,
"grad_norm": 0.258478045463562,
"learning_rate": 0.00012858323245124538,
"loss": 0.5041,
"step": 485
},
{
"epoch": 1.0915216170690623,
"grad_norm": 0.27987441420555115,
"learning_rate": 0.0001284168094854526,
"loss": 0.5021,
"step": 486
},
{
"epoch": 1.093767546322291,
"grad_norm": 0.22872576117515564,
"learning_rate": 0.00012825005522865027,
"loss": 0.5243,
"step": 487
},
{
"epoch": 1.0960134755755193,
"grad_norm": 0.22990728914737701,
"learning_rate": 0.00012808297082185087,
"loss": 0.5186,
"step": 488
},
{
"epoch": 1.098259404828748,
"grad_norm": 0.21057239174842834,
"learning_rate": 0.000127915557408326,
"loss": 0.5074,
"step": 489
},
{
"epoch": 1.1005053340819764,
"grad_norm": 0.2562633752822876,
"learning_rate": 0.00012774781613359841,
"loss": 0.5205,
"step": 490
},
{
"epoch": 1.102751263335205,
"grad_norm": 0.23108799755573273,
"learning_rate": 0.0001275797481454343,
"loss": 0.5289,
"step": 491
},
{
"epoch": 1.1049971925884334,
"grad_norm": 0.2631300389766693,
"learning_rate": 0.00012741135459383543,
"loss": 0.5198,
"step": 492
},
{
"epoch": 1.107243121841662,
"grad_norm": 0.2443421483039856,
"learning_rate": 0.00012724263663103108,
"loss": 0.535,
"step": 493
},
{
"epoch": 1.1094890510948905,
"grad_norm": 0.22926633059978485,
"learning_rate": 0.00012707359541147043,
"loss": 0.4935,
"step": 494
},
{
"epoch": 1.1117349803481191,
"grad_norm": 0.25909942388534546,
"learning_rate": 0.00012690423209181452,
"loss": 0.4998,
"step": 495
},
{
"epoch": 1.1139809096013475,
"grad_norm": 0.24831925332546234,
"learning_rate": 0.0001267345478309283,
"loss": 0.5246,
"step": 496
},
{
"epoch": 1.1162268388545762,
"grad_norm": 0.26700034737586975,
"learning_rate": 0.00012656454378987282,
"loss": 0.5276,
"step": 497
},
{
"epoch": 1.1184727681078046,
"grad_norm": 0.24582357704639435,
"learning_rate": 0.00012639422113189713,
"loss": 0.5274,
"step": 498
},
{
"epoch": 1.120718697361033,
"grad_norm": 0.2464480996131897,
"learning_rate": 0.00012622358102243054,
"loss": 0.514,
"step": 499
},
{
"epoch": 1.1229646266142617,
"grad_norm": 0.28942957520484924,
"learning_rate": 0.0001260526246290744,
"loss": 0.5216,
"step": 500
},
{
"epoch": 1.12521055586749,
"grad_norm": 0.29417484998703003,
"learning_rate": 0.00012588135312159427,
"loss": 0.5214,
"step": 501
},
{
"epoch": 1.1274564851207187,
"grad_norm": 0.27026209235191345,
"learning_rate": 0.00012570976767191188,
"loss": 0.5206,
"step": 502
},
{
"epoch": 1.1297024143739471,
"grad_norm": 0.2554686963558197,
"learning_rate": 0.0001255378694540971,
"loss": 0.5285,
"step": 503
},
{
"epoch": 1.1319483436271758,
"grad_norm": 0.28773826360702515,
"learning_rate": 0.00012536565964435986,
"loss": 0.4933,
"step": 504
},
{
"epoch": 1.1341942728804042,
"grad_norm": 0.28885528445243835,
"learning_rate": 0.00012519313942104224,
"loss": 0.5392,
"step": 505
},
{
"epoch": 1.1364402021336328,
"grad_norm": 0.31166213750839233,
"learning_rate": 0.00012502030996461023,
"loss": 0.5333,
"step": 506
},
{
"epoch": 1.1386861313868613,
"grad_norm": 0.3064601719379425,
"learning_rate": 0.00012484717245764585,
"loss": 0.5261,
"step": 507
},
{
"epoch": 1.14093206064009,
"grad_norm": 0.3036741018295288,
"learning_rate": 0.00012467372808483882,
"loss": 0.5309,
"step": 508
},
{
"epoch": 1.1431779898933183,
"grad_norm": 0.2402871996164322,
"learning_rate": 0.00012449997803297866,
"loss": 0.4906,
"step": 509
},
{
"epoch": 1.145423919146547,
"grad_norm": 0.26572084426879883,
"learning_rate": 0.0001243259234909465,
"loss": 0.5152,
"step": 510
},
{
"epoch": 1.1476698483997754,
"grad_norm": 0.26166555285453796,
"learning_rate": 0.00012415156564970687,
"loss": 0.5266,
"step": 511
},
{
"epoch": 1.149915777653004,
"grad_norm": 0.26020121574401855,
"learning_rate": 0.0001239769057022997,
"loss": 0.5063,
"step": 512
},
{
"epoch": 1.1521617069062324,
"grad_norm": 0.2840318977832794,
"learning_rate": 0.00012380194484383201,
"loss": 0.5301,
"step": 513
},
{
"epoch": 1.154407636159461,
"grad_norm": 0.2320166826248169,
"learning_rate": 0.00012362668427146986,
"loss": 0.5074,
"step": 514
},
{
"epoch": 1.1566535654126895,
"grad_norm": 0.26712101697921753,
"learning_rate": 0.00012345112518443008,
"loss": 0.5247,
"step": 515
},
{
"epoch": 1.158899494665918,
"grad_norm": 0.2772868871688843,
"learning_rate": 0.000123275268783972,
"loss": 0.5113,
"step": 516
},
{
"epoch": 1.1611454239191465,
"grad_norm": 0.23757833242416382,
"learning_rate": 0.00012309911627338943,
"loss": 0.5383,
"step": 517
},
{
"epoch": 1.1633913531723752,
"grad_norm": 0.24388740956783295,
"learning_rate": 0.00012292266885800221,
"loss": 0.5404,
"step": 518
},
{
"epoch": 1.1656372824256036,
"grad_norm": 0.32931777834892273,
"learning_rate": 0.00012274592774514812,
"loss": 0.5304,
"step": 519
},
{
"epoch": 1.167883211678832,
"grad_norm": 0.2616422176361084,
"learning_rate": 0.00012256889414417456,
"loss": 0.5111,
"step": 520
},
{
"epoch": 1.1701291409320607,
"grad_norm": 0.20813870429992676,
"learning_rate": 0.0001223915692664302,
"loss": 0.4817,
"step": 521
},
{
"epoch": 1.172375070185289,
"grad_norm": 0.2631247639656067,
"learning_rate": 0.00012221395432525687,
"loss": 0.5119,
"step": 522
},
{
"epoch": 1.1746209994385177,
"grad_norm": 0.22986264526844025,
"learning_rate": 0.0001220360505359811,
"loss": 0.5136,
"step": 523
},
{
"epoch": 1.1768669286917461,
"grad_norm": 0.23806849122047424,
"learning_rate": 0.00012185785911590583,
"loss": 0.5247,
"step": 524
},
{
"epoch": 1.1791128579449748,
"grad_norm": 0.2917364537715912,
"learning_rate": 0.00012167938128430216,
"loss": 0.5286,
"step": 525
},
{
"epoch": 1.1813587871982032,
"grad_norm": 0.24546997249126434,
"learning_rate": 0.00012150061826240091,
"loss": 0.5197,
"step": 526
},
{
"epoch": 1.1836047164514318,
"grad_norm": 0.22644369304180145,
"learning_rate": 0.00012132157127338435,
"loss": 0.5369,
"step": 527
},
{
"epoch": 1.1858506457046603,
"grad_norm": 0.2547290623188019,
"learning_rate": 0.00012114224154237777,
"loss": 0.5108,
"step": 528
},
{
"epoch": 1.188096574957889,
"grad_norm": 0.2384437471628189,
"learning_rate": 0.00012096263029644112,
"loss": 0.528,
"step": 529
},
{
"epoch": 1.1903425042111173,
"grad_norm": 0.2654406726360321,
"learning_rate": 0.0001207827387645606,
"loss": 0.5179,
"step": 530
},
{
"epoch": 1.192588433464346,
"grad_norm": 0.19757139682769775,
"learning_rate": 0.00012060256817764025,
"loss": 0.5126,
"step": 531
},
{
"epoch": 1.1948343627175744,
"grad_norm": 0.21663667261600494,
"learning_rate": 0.00012042211976849356,
"loss": 0.5136,
"step": 532
},
{
"epoch": 1.197080291970803,
"grad_norm": 0.21993404626846313,
"learning_rate": 0.00012024139477183504,
"loss": 0.5185,
"step": 533
},
{
"epoch": 1.1993262212240314,
"grad_norm": 0.2317759096622467,
"learning_rate": 0.00012006039442427167,
"loss": 0.5139,
"step": 534
},
{
"epoch": 1.20157215047726,
"grad_norm": 0.21483832597732544,
"learning_rate": 0.0001198791199642946,
"loss": 0.5231,
"step": 535
},
{
"epoch": 1.2038180797304885,
"grad_norm": 0.2653373181819916,
"learning_rate": 0.0001196975726322705,
"loss": 0.5177,
"step": 536
},
{
"epoch": 1.206064008983717,
"grad_norm": 0.19980397820472717,
"learning_rate": 0.00011951575367043321,
"loss": 0.5081,
"step": 537
},
{
"epoch": 1.2083099382369455,
"grad_norm": 0.2335788607597351,
"learning_rate": 0.00011933366432287522,
"loss": 0.5283,
"step": 538
},
{
"epoch": 1.210555867490174,
"grad_norm": 0.20896217226982117,
"learning_rate": 0.00011915130583553906,
"loss": 0.5009,
"step": 539
},
{
"epoch": 1.2128017967434026,
"grad_norm": 0.2064492404460907,
"learning_rate": 0.00011896867945620891,
"loss": 0.5072,
"step": 540
},
{
"epoch": 1.215047725996631,
"grad_norm": 0.22994771599769592,
"learning_rate": 0.00011878578643450191,
"loss": 0.506,
"step": 541
},
{
"epoch": 1.2172936552498597,
"grad_norm": 0.21593116223812103,
"learning_rate": 0.00011860262802185982,
"loss": 0.5304,
"step": 542
},
{
"epoch": 1.219539584503088,
"grad_norm": 0.21689918637275696,
"learning_rate": 0.0001184192054715402,
"loss": 0.5163,
"step": 543
},
{
"epoch": 1.2217855137563167,
"grad_norm": 0.20837046205997467,
"learning_rate": 0.00011823552003860805,
"loss": 0.5247,
"step": 544
},
{
"epoch": 1.2240314430095451,
"grad_norm": 0.2125036120414734,
"learning_rate": 0.00011805157297992715,
"loss": 0.5118,
"step": 545
},
{
"epoch": 1.2262773722627738,
"grad_norm": 0.21233297884464264,
"learning_rate": 0.00011786736555415134,
"loss": 0.5091,
"step": 546
},
{
"epoch": 1.2285233015160022,
"grad_norm": 0.2236490547657013,
"learning_rate": 0.00011768289902171612,
"loss": 0.5168,
"step": 547
},
{
"epoch": 1.2307692307692308,
"grad_norm": 0.2149861603975296,
"learning_rate": 0.00011749817464482995,
"loss": 0.5221,
"step": 548
},
{
"epoch": 1.2330151600224593,
"grad_norm": 0.23652967810630798,
"learning_rate": 0.00011731319368746545,
"loss": 0.5132,
"step": 549
},
{
"epoch": 1.235261089275688,
"grad_norm": 0.2397671788930893,
"learning_rate": 0.00011712795741535098,
"loss": 0.5085,
"step": 550
},
{
"epoch": 1.2375070185289163,
"grad_norm": 0.1940278857946396,
"learning_rate": 0.00011694246709596195,
"loss": 0.5429,
"step": 551
},
{
"epoch": 1.239752947782145,
"grad_norm": 0.24372558295726776,
"learning_rate": 0.00011675672399851188,
"loss": 0.5091,
"step": 552
},
{
"epoch": 1.2419988770353734,
"grad_norm": 0.21898634731769562,
"learning_rate": 0.00011657072939394413,
"loss": 0.5164,
"step": 553
},
{
"epoch": 1.2442448062886018,
"grad_norm": 0.2210114300251007,
"learning_rate": 0.00011638448455492287,
"loss": 0.5133,
"step": 554
},
{
"epoch": 1.2464907355418304,
"grad_norm": 0.2156367301940918,
"learning_rate": 0.00011619799075582452,
"loss": 0.5109,
"step": 555
},
{
"epoch": 1.248736664795059,
"grad_norm": 0.1969204545021057,
"learning_rate": 0.00011601124927272906,
"loss": 0.5143,
"step": 556
},
{
"epoch": 1.2509825940482875,
"grad_norm": 0.19980621337890625,
"learning_rate": 0.00011582426138341111,
"loss": 0.5087,
"step": 557
},
{
"epoch": 1.253228523301516,
"grad_norm": 0.2064458578824997,
"learning_rate": 0.00011563702836733152,
"loss": 0.505,
"step": 558
},
{
"epoch": 1.2554744525547445,
"grad_norm": 0.24166250228881836,
"learning_rate": 0.00011544955150562819,
"loss": 0.5204,
"step": 559
},
{
"epoch": 1.2577203818079732,
"grad_norm": 0.251028448343277,
"learning_rate": 0.0001152618320811077,
"loss": 0.5071,
"step": 560
},
{
"epoch": 1.2599663110612016,
"grad_norm": 0.1982237845659256,
"learning_rate": 0.0001150738713782363,
"loss": 0.5059,
"step": 561
},
{
"epoch": 1.26221224031443,
"grad_norm": 0.26162639260292053,
"learning_rate": 0.00011488567068313114,
"loss": 0.5172,
"step": 562
},
{
"epoch": 1.2644581695676587,
"grad_norm": 0.2098427712917328,
"learning_rate": 0.0001146972312835516,
"loss": 0.5135,
"step": 563
},
{
"epoch": 1.266704098820887,
"grad_norm": 0.2430814802646637,
"learning_rate": 0.00011450855446889031,
"loss": 0.5125,
"step": 564
},
{
"epoch": 1.2689500280741157,
"grad_norm": 0.21262916922569275,
"learning_rate": 0.00011431964153016444,
"loss": 0.5114,
"step": 565
},
{
"epoch": 1.2711959573273441,
"grad_norm": 0.20545636117458344,
"learning_rate": 0.00011413049376000686,
"loss": 0.5095,
"step": 566
},
{
"epoch": 1.2734418865805728,
"grad_norm": 0.23621973395347595,
"learning_rate": 0.00011394111245265724,
"loss": 0.5231,
"step": 567
},
{
"epoch": 1.2756878158338012,
"grad_norm": 0.21574462950229645,
"learning_rate": 0.00011375149890395321,
"loss": 0.5292,
"step": 568
},
{
"epoch": 1.2779337450870298,
"grad_norm": 0.22070422768592834,
"learning_rate": 0.00011356165441132152,
"loss": 0.5157,
"step": 569
},
{
"epoch": 1.2801796743402583,
"grad_norm": 0.19420836865901947,
"learning_rate": 0.00011337158027376918,
"loss": 0.5179,
"step": 570
},
{
"epoch": 1.2824256035934867,
"grad_norm": 0.26924458146095276,
"learning_rate": 0.0001131812777918745,
"loss": 0.5408,
"step": 571
},
{
"epoch": 1.2846715328467153,
"grad_norm": 0.22928448021411896,
"learning_rate": 0.00011299074826777824,
"loss": 0.5146,
"step": 572
},
{
"epoch": 1.286917462099944,
"grad_norm": 0.24480290710926056,
"learning_rate": 0.00011279999300517471,
"loss": 0.5151,
"step": 573
},
{
"epoch": 1.2891633913531724,
"grad_norm": 0.2365870326757431,
"learning_rate": 0.0001126090133093028,
"loss": 0.5088,
"step": 574
},
{
"epoch": 1.2914093206064008,
"grad_norm": 0.2634016275405884,
"learning_rate": 0.0001124178104869371,
"loss": 0.519,
"step": 575
},
{
"epoch": 1.2936552498596294,
"grad_norm": 0.275654673576355,
"learning_rate": 0.00011222638584637897,
"loss": 0.5276,
"step": 576
},
{
"epoch": 1.295901179112858,
"grad_norm": 0.2414851039648056,
"learning_rate": 0.00011203474069744747,
"loss": 0.4996,
"step": 577
},
{
"epoch": 1.2981471083660865,
"grad_norm": 0.23619700968265533,
"learning_rate": 0.00011184287635147058,
"loss": 0.5116,
"step": 578
},
{
"epoch": 1.300393037619315,
"grad_norm": 0.25254112482070923,
"learning_rate": 0.00011165079412127607,
"loss": 0.5133,
"step": 579
},
{
"epoch": 1.3026389668725435,
"grad_norm": 0.21320711076259613,
"learning_rate": 0.00011145849532118258,
"loss": 0.5049,
"step": 580
},
{
"epoch": 1.304884896125772,
"grad_norm": 0.24191851913928986,
"learning_rate": 0.00011126598126699068,
"loss": 0.5226,
"step": 581
},
{
"epoch": 1.3071308253790006,
"grad_norm": 0.20547953248023987,
"learning_rate": 0.00011107325327597372,
"loss": 0.5196,
"step": 582
},
{
"epoch": 1.309376754632229,
"grad_norm": 0.2211044281721115,
"learning_rate": 0.00011088031266686902,
"loss": 0.5135,
"step": 583
},
{
"epoch": 1.3116226838854577,
"grad_norm": 0.226315438747406,
"learning_rate": 0.00011068716075986863,
"loss": 0.5155,
"step": 584
},
{
"epoch": 1.313868613138686,
"grad_norm": 0.1992364525794983,
"learning_rate": 0.00011049379887661044,
"loss": 0.5135,
"step": 585
},
{
"epoch": 1.3161145423919147,
"grad_norm": 0.20736606419086456,
"learning_rate": 0.00011030022834016916,
"loss": 0.5107,
"step": 586
},
{
"epoch": 1.3183604716451431,
"grad_norm": 0.20780953764915466,
"learning_rate": 0.00011010645047504712,
"loss": 0.5072,
"step": 587
},
{
"epoch": 1.3206064008983718,
"grad_norm": 0.20156902074813843,
"learning_rate": 0.0001099124666071653,
"loss": 0.5037,
"step": 588
},
{
"epoch": 1.3228523301516002,
"grad_norm": 0.18280163407325745,
"learning_rate": 0.00010971827806385431,
"loss": 0.5308,
"step": 589
},
{
"epoch": 1.3250982594048288,
"grad_norm": 0.20286300778388977,
"learning_rate": 0.00010952388617384519,
"loss": 0.5239,
"step": 590
},
{
"epoch": 1.3273441886580573,
"grad_norm": 0.20476078987121582,
"learning_rate": 0.00010932929226726041,
"loss": 0.5339,
"step": 591
},
{
"epoch": 1.3295901179112857,
"grad_norm": 0.19983462989330292,
"learning_rate": 0.00010913449767560468,
"loss": 0.5166,
"step": 592
},
{
"epoch": 1.3318360471645143,
"grad_norm": 0.22195865213871002,
"learning_rate": 0.00010893950373175597,
"loss": 0.514,
"step": 593
},
{
"epoch": 1.334081976417743,
"grad_norm": 0.20715545117855072,
"learning_rate": 0.00010874431176995627,
"loss": 0.5296,
"step": 594
},
{
"epoch": 1.3363279056709714,
"grad_norm": 0.21173766255378723,
"learning_rate": 0.00010854892312580249,
"loss": 0.4918,
"step": 595
},
{
"epoch": 1.3385738349241998,
"grad_norm": 0.2034001350402832,
"learning_rate": 0.0001083533391362374,
"loss": 0.5176,
"step": 596
},
{
"epoch": 1.3408197641774284,
"grad_norm": 0.23540934920310974,
"learning_rate": 0.00010815756113954031,
"loss": 0.5145,
"step": 597
},
{
"epoch": 1.343065693430657,
"grad_norm": 0.19440345466136932,
"learning_rate": 0.00010796159047531811,
"loss": 0.5167,
"step": 598
},
{
"epoch": 1.3453116226838855,
"grad_norm": 0.2172805666923523,
"learning_rate": 0.00010776542848449602,
"loss": 0.5235,
"step": 599
},
{
"epoch": 1.347557551937114,
"grad_norm": 0.19153092801570892,
"learning_rate": 0.00010756907650930831,
"loss": 0.4961,
"step": 600
},
{
"epoch": 1.3498034811903425,
"grad_norm": 0.2150796353816986,
"learning_rate": 0.00010737253589328933,
"loss": 0.5154,
"step": 601
},
{
"epoch": 1.352049410443571,
"grad_norm": 0.21939396858215332,
"learning_rate": 0.0001071758079812641,
"loss": 0.5387,
"step": 602
},
{
"epoch": 1.3542953396967996,
"grad_norm": 0.20470492541790009,
"learning_rate": 0.00010697889411933928,
"loss": 0.4978,
"step": 603
},
{
"epoch": 1.356541268950028,
"grad_norm": 0.21058504283428192,
"learning_rate": 0.00010678179565489388,
"loss": 0.5096,
"step": 604
},
{
"epoch": 1.3587871982032567,
"grad_norm": 0.1950283795595169,
"learning_rate": 0.00010658451393656999,
"loss": 0.5089,
"step": 605
},
{
"epoch": 1.361033127456485,
"grad_norm": 0.21830430626869202,
"learning_rate": 0.00010638705031426371,
"loss": 0.4892,
"step": 606
},
{
"epoch": 1.3632790567097137,
"grad_norm": 0.19007915258407593,
"learning_rate": 0.00010618940613911576,
"loss": 0.5309,
"step": 607
},
{
"epoch": 1.3655249859629421,
"grad_norm": 0.20983009040355682,
"learning_rate": 0.0001059915827635022,
"loss": 0.5171,
"step": 608
},
{
"epoch": 1.3677709152161706,
"grad_norm": 0.20747217535972595,
"learning_rate": 0.00010579358154102548,
"loss": 0.4915,
"step": 609
},
{
"epoch": 1.3700168444693992,
"grad_norm": 0.20381350815296173,
"learning_rate": 0.00010559540382650474,
"loss": 0.503,
"step": 610
},
{
"epoch": 1.3722627737226278,
"grad_norm": 0.2014596313238144,
"learning_rate": 0.00010539705097596689,
"loss": 0.5124,
"step": 611
},
{
"epoch": 1.3745087029758563,
"grad_norm": 0.2117050141096115,
"learning_rate": 0.00010519852434663721,
"loss": 0.4996,
"step": 612
},
{
"epoch": 1.3767546322290847,
"grad_norm": 0.21098558604717255,
"learning_rate": 0.00010499982529692996,
"loss": 0.492,
"step": 613
},
{
"epoch": 1.3790005614823133,
"grad_norm": 0.22107858955860138,
"learning_rate": 0.00010480095518643929,
"loss": 0.5165,
"step": 614
},
{
"epoch": 1.381246490735542,
"grad_norm": 0.22238287329673767,
"learning_rate": 0.00010460191537592977,
"loss": 0.5095,
"step": 615
},
{
"epoch": 1.3834924199887704,
"grad_norm": 0.20342691242694855,
"learning_rate": 0.00010440270722732714,
"loss": 0.5141,
"step": 616
},
{
"epoch": 1.3857383492419988,
"grad_norm": 0.22299018502235413,
"learning_rate": 0.00010420333210370903,
"loss": 0.5133,
"step": 617
},
{
"epoch": 1.3879842784952274,
"grad_norm": 0.20717273652553558,
"learning_rate": 0.00010400379136929557,
"loss": 0.5143,
"step": 618
},
{
"epoch": 1.3902302077484558,
"grad_norm": 0.20377473533153534,
"learning_rate": 0.00010380408638944007,
"loss": 0.4835,
"step": 619
},
{
"epoch": 1.3924761370016845,
"grad_norm": 0.22891288995742798,
"learning_rate": 0.00010360421853061966,
"loss": 0.5122,
"step": 620
},
{
"epoch": 1.394722066254913,
"grad_norm": 0.19375132024288177,
"learning_rate": 0.00010340418916042603,
"loss": 0.5052,
"step": 621
},
{
"epoch": 1.3969679955081415,
"grad_norm": 0.191814586520195,
"learning_rate": 0.00010320399964755596,
"loss": 0.4988,
"step": 622
},
{
"epoch": 1.39921392476137,
"grad_norm": 0.1985396444797516,
"learning_rate": 0.00010300365136180201,
"loss": 0.5049,
"step": 623
},
{
"epoch": 1.4014598540145986,
"grad_norm": 0.18780378997325897,
"learning_rate": 0.0001028031456740432,
"loss": 0.5002,
"step": 624
},
{
"epoch": 1.403705783267827,
"grad_norm": 0.21660645306110382,
"learning_rate": 0.00010260248395623548,
"loss": 0.5184,
"step": 625
},
{
"epoch": 1.4059517125210557,
"grad_norm": 0.19068920612335205,
"learning_rate": 0.00010240166758140245,
"loss": 0.5032,
"step": 626
},
{
"epoch": 1.408197641774284,
"grad_norm": 0.2113179713487625,
"learning_rate": 0.00010220069792362601,
"loss": 0.5152,
"step": 627
},
{
"epoch": 1.4104435710275127,
"grad_norm": 0.18784399330615997,
"learning_rate": 0.00010199957635803684,
"loss": 0.5261,
"step": 628
},
{
"epoch": 1.4126895002807411,
"grad_norm": 0.1969737708568573,
"learning_rate": 0.00010179830426080504,
"loss": 0.5152,
"step": 629
},
{
"epoch": 1.4149354295339696,
"grad_norm": 0.18799488246440887,
"learning_rate": 0.00010159688300913076,
"loss": 0.5111,
"step": 630
},
{
"epoch": 1.4171813587871982,
"grad_norm": 0.18792767822742462,
"learning_rate": 0.0001013953139812347,
"loss": 0.5092,
"step": 631
},
{
"epoch": 1.4194272880404268,
"grad_norm": 0.21675904095172882,
"learning_rate": 0.00010119359855634876,
"loss": 0.5076,
"step": 632
},
{
"epoch": 1.4216732172936553,
"grad_norm": 0.19109146296977997,
"learning_rate": 0.00010099173811470652,
"loss": 0.507,
"step": 633
},
{
"epoch": 1.4239191465468837,
"grad_norm": 0.1930873841047287,
"learning_rate": 0.00010078973403753383,
"loss": 0.5195,
"step": 634
},
{
"epoch": 1.4261650758001123,
"grad_norm": 0.18737006187438965,
"learning_rate": 0.00010058758770703938,
"loss": 0.5233,
"step": 635
},
{
"epoch": 1.428411005053341,
"grad_norm": 0.1958773285150528,
"learning_rate": 0.00010038530050640522,
"loss": 0.5031,
"step": 636
},
{
"epoch": 1.4306569343065694,
"grad_norm": 0.18015055358409882,
"learning_rate": 0.00010018287381977732,
"loss": 0.5138,
"step": 637
},
{
"epoch": 1.4329028635597978,
"grad_norm": 0.18713940680027008,
"learning_rate": 9.998030903225603e-05,
"loss": 0.5084,
"step": 638
},
{
"epoch": 1.4351487928130264,
"grad_norm": 0.20459598302841187,
"learning_rate": 9.977760752988671e-05,
"loss": 0.5409,
"step": 639
},
{
"epoch": 1.4373947220662548,
"grad_norm": 0.17716822028160095,
"learning_rate": 9.957477069965018e-05,
"loss": 0.509,
"step": 640
},
{
"epoch": 1.4396406513194835,
"grad_norm": 0.1981070339679718,
"learning_rate": 9.93717999294532e-05,
"loss": 0.4953,
"step": 641
},
{
"epoch": 1.441886580572712,
"grad_norm": 0.19121180474758148,
"learning_rate": 9.916869660811906e-05,
"loss": 0.5109,
"step": 642
},
{
"epoch": 1.4441325098259405,
"grad_norm": 0.20929452776908875,
"learning_rate": 9.896546212537793e-05,
"loss": 0.517,
"step": 643
},
{
"epoch": 1.446378439079169,
"grad_norm": 0.19593368470668793,
"learning_rate": 9.87620978718576e-05,
"loss": 0.5071,
"step": 644
},
{
"epoch": 1.4486243683323976,
"grad_norm": 0.21035808324813843,
"learning_rate": 9.855860523907372e-05,
"loss": 0.5198,
"step": 645
},
{
"epoch": 1.450870297585626,
"grad_norm": 0.19853971898555756,
"learning_rate": 9.835498561942036e-05,
"loss": 0.5437,
"step": 646
},
{
"epoch": 1.4531162268388544,
"grad_norm": 0.1949443370103836,
"learning_rate": 9.815124040616056e-05,
"loss": 0.5076,
"step": 647
},
{
"epoch": 1.455362156092083,
"grad_norm": 0.20280544459819794,
"learning_rate": 9.794737099341664e-05,
"loss": 0.5093,
"step": 648
},
{
"epoch": 1.4576080853453117,
"grad_norm": 0.21078361570835114,
"learning_rate": 9.774337877616083e-05,
"loss": 0.5081,
"step": 649
},
{
"epoch": 1.4598540145985401,
"grad_norm": 0.1961338371038437,
"learning_rate": 9.753926515020567e-05,
"loss": 0.5096,
"step": 650
},
{
"epoch": 1.4620999438517686,
"grad_norm": 0.19009891152381897,
"learning_rate": 9.733503151219433e-05,
"loss": 0.4999,
"step": 651
},
{
"epoch": 1.4643458731049972,
"grad_norm": 0.18627040088176727,
"learning_rate": 9.713067925959126e-05,
"loss": 0.5056,
"step": 652
},
{
"epoch": 1.4665918023582258,
"grad_norm": 0.1938895285129547,
"learning_rate": 9.692620979067245e-05,
"loss": 0.5137,
"step": 653
},
{
"epoch": 1.4688377316114543,
"grad_norm": 0.2050761729478836,
"learning_rate": 9.672162450451602e-05,
"loss": 0.5051,
"step": 654
},
{
"epoch": 1.4710836608646827,
"grad_norm": 0.19880592823028564,
"learning_rate": 9.651692480099251e-05,
"loss": 0.5055,
"step": 655
},
{
"epoch": 1.4733295901179113,
"grad_norm": 0.18447960913181305,
"learning_rate": 9.631211208075534e-05,
"loss": 0.5296,
"step": 656
},
{
"epoch": 1.4755755193711397,
"grad_norm": 0.19004195928573608,
"learning_rate": 9.610718774523137e-05,
"loss": 0.5258,
"step": 657
},
{
"epoch": 1.4778214486243684,
"grad_norm": 0.19954320788383484,
"learning_rate": 9.590215319661097e-05,
"loss": 0.5011,
"step": 658
},
{
"epoch": 1.4800673778775968,
"grad_norm": 0.17005719244480133,
"learning_rate": 9.569700983783885e-05,
"loss": 0.5062,
"step": 659
},
{
"epoch": 1.4823133071308254,
"grad_norm": 0.21068550646305084,
"learning_rate": 9.549175907260415e-05,
"loss": 0.5044,
"step": 660
},
{
"epoch": 1.4845592363840538,
"grad_norm": 0.18736523389816284,
"learning_rate": 9.528640230533093e-05,
"loss": 0.521,
"step": 661
},
{
"epoch": 1.4868051656372825,
"grad_norm": 0.19477304816246033,
"learning_rate": 9.508094094116863e-05,
"loss": 0.5065,
"step": 662
},
{
"epoch": 1.489051094890511,
"grad_norm": 0.20427975058555603,
"learning_rate": 9.48753763859823e-05,
"loss": 0.5208,
"step": 663
},
{
"epoch": 1.4912970241437395,
"grad_norm": 0.20408067107200623,
"learning_rate": 9.466971004634316e-05,
"loss": 0.4917,
"step": 664
},
{
"epoch": 1.493542953396968,
"grad_norm": 0.22063596546649933,
"learning_rate": 9.446394332951885e-05,
"loss": 0.5097,
"step": 665
},
{
"epoch": 1.4957888826501966,
"grad_norm": 0.20878678560256958,
"learning_rate": 9.425807764346383e-05,
"loss": 0.505,
"step": 666
},
{
"epoch": 1.498034811903425,
"grad_norm": 0.21228721737861633,
"learning_rate": 9.405211439680975e-05,
"loss": 0.5249,
"step": 667
},
{
"epoch": 1.5002807411566534,
"grad_norm": 0.21478019654750824,
"learning_rate": 9.384605499885586e-05,
"loss": 0.516,
"step": 668
},
{
"epoch": 1.502526670409882,
"grad_norm": 0.23727190494537354,
"learning_rate": 9.363990085955929e-05,
"loss": 0.5128,
"step": 669
},
{
"epoch": 1.5047725996631107,
"grad_norm": 0.211452454328537,
"learning_rate": 9.343365338952544e-05,
"loss": 0.5141,
"step": 670
},
{
"epoch": 1.5070185289163391,
"grad_norm": 0.24813149869441986,
"learning_rate": 9.322731399999829e-05,
"loss": 0.5286,
"step": 671
},
{
"epoch": 1.5092644581695676,
"grad_norm": 0.19929581880569458,
"learning_rate": 9.302088410285084e-05,
"loss": 0.5065,
"step": 672
},
{
"epoch": 1.5115103874227962,
"grad_norm": 0.23539748787879944,
"learning_rate": 9.281436511057538e-05,
"loss": 0.5045,
"step": 673
},
{
"epoch": 1.5137563166760248,
"grad_norm": 0.18617475032806396,
"learning_rate": 9.260775843627378e-05,
"loss": 0.4943,
"step": 674
},
{
"epoch": 1.5160022459292533,
"grad_norm": 0.22366289794445038,
"learning_rate": 9.24010654936479e-05,
"loss": 0.5136,
"step": 675
},
{
"epoch": 1.5182481751824817,
"grad_norm": 0.21610277891159058,
"learning_rate": 9.219428769698991e-05,
"loss": 0.4968,
"step": 676
},
{
"epoch": 1.5204941044357103,
"grad_norm": 0.19368857145309448,
"learning_rate": 9.198742646117254e-05,
"loss": 0.5129,
"step": 677
},
{
"epoch": 1.522740033688939,
"grad_norm": 0.20865383744239807,
"learning_rate": 9.178048320163954e-05,
"loss": 0.5136,
"step": 678
},
{
"epoch": 1.5249859629421674,
"grad_norm": 0.18743731081485748,
"learning_rate": 9.15734593343958e-05,
"loss": 0.5149,
"step": 679
},
{
"epoch": 1.5272318921953958,
"grad_norm": 0.22473086416721344,
"learning_rate": 9.136635627599783e-05,
"loss": 0.5155,
"step": 680
},
{
"epoch": 1.5294778214486242,
"grad_norm": 0.1838371306657791,
"learning_rate": 9.115917544354398e-05,
"loss": 0.5102,
"step": 681
},
{
"epoch": 1.5317237507018528,
"grad_norm": 0.19203968346118927,
"learning_rate": 9.095191825466481e-05,
"loss": 0.5225,
"step": 682
},
{
"epoch": 1.5339696799550815,
"grad_norm": 0.21374920010566711,
"learning_rate": 9.074458612751329e-05,
"loss": 0.5165,
"step": 683
},
{
"epoch": 1.53621560920831,
"grad_norm": 0.19073887169361115,
"learning_rate": 9.053718048075516e-05,
"loss": 0.5082,
"step": 684
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.21084338426589966,
"learning_rate": 9.032970273355926e-05,
"loss": 0.4975,
"step": 685
},
{
"epoch": 1.540707467714767,
"grad_norm": 0.20061564445495605,
"learning_rate": 9.012215430558776e-05,
"loss": 0.5048,
"step": 686
},
{
"epoch": 1.5429533969679956,
"grad_norm": 0.17530708014965057,
"learning_rate": 8.991453661698641e-05,
"loss": 0.51,
"step": 687
},
{
"epoch": 1.545199326221224,
"grad_norm": 0.2152005285024643,
"learning_rate": 8.970685108837497e-05,
"loss": 0.5224,
"step": 688
},
{
"epoch": 1.5474452554744524,
"grad_norm": 0.1882491558790207,
"learning_rate": 8.949909914083732e-05,
"loss": 0.5271,
"step": 689
},
{
"epoch": 1.549691184727681,
"grad_norm": 0.21567484736442566,
"learning_rate": 8.92912821959118e-05,
"loss": 0.5156,
"step": 690
},
{
"epoch": 1.5519371139809097,
"grad_norm": 0.19783969223499298,
"learning_rate": 8.908340167558154e-05,
"loss": 0.4966,
"step": 691
},
{
"epoch": 1.5541830432341381,
"grad_norm": 0.20946729183197021,
"learning_rate": 8.88754590022647e-05,
"loss": 0.4923,
"step": 692
},
{
"epoch": 1.5564289724873666,
"grad_norm": 0.19118967652320862,
"learning_rate": 8.866745559880464e-05,
"loss": 0.5136,
"step": 693
},
{
"epoch": 1.5586749017405952,
"grad_norm": 0.2122071534395218,
"learning_rate": 8.845939288846032e-05,
"loss": 0.5155,
"step": 694
},
{
"epoch": 1.5609208309938238,
"grad_norm": 0.1733548641204834,
"learning_rate": 8.825127229489653e-05,
"loss": 0.4971,
"step": 695
},
{
"epoch": 1.5631667602470523,
"grad_norm": 0.2194015234708786,
"learning_rate": 8.804309524217408e-05,
"loss": 0.4942,
"step": 696
},
{
"epoch": 1.5654126895002807,
"grad_norm": 0.1795753836631775,
"learning_rate": 8.783486315474008e-05,
"loss": 0.5032,
"step": 697
},
{
"epoch": 1.5676586187535093,
"grad_norm": 0.21514686942100525,
"learning_rate": 8.762657745741831e-05,
"loss": 0.5036,
"step": 698
},
{
"epoch": 1.5699045480067377,
"grad_norm": 0.20286062359809875,
"learning_rate": 8.741823957539926e-05,
"loss": 0.5097,
"step": 699
},
{
"epoch": 1.5721504772599664,
"grad_norm": 0.19607621431350708,
"learning_rate": 8.720985093423053e-05,
"loss": 0.498,
"step": 700
},
{
"epoch": 1.5743964065131948,
"grad_norm": 0.23368516564369202,
"learning_rate": 8.700141295980711e-05,
"loss": 0.529,
"step": 701
},
{
"epoch": 1.5766423357664232,
"grad_norm": 0.21203581988811493,
"learning_rate": 8.679292707836149e-05,
"loss": 0.4959,
"step": 702
},
{
"epoch": 1.5788882650196518,
"grad_norm": 0.26587924361228943,
"learning_rate": 8.658439471645391e-05,
"loss": 0.5201,
"step": 703
},
{
"epoch": 1.5811341942728805,
"grad_norm": 0.1834084540605545,
"learning_rate": 8.637581730096275e-05,
"loss": 0.504,
"step": 704
},
{
"epoch": 1.583380123526109,
"grad_norm": 0.24840541183948517,
"learning_rate": 8.616719625907463e-05,
"loss": 0.5149,
"step": 705
},
{
"epoch": 1.5856260527793373,
"grad_norm": 0.18650217354297638,
"learning_rate": 8.595853301827469e-05,
"loss": 0.4866,
"step": 706
},
{
"epoch": 1.587871982032566,
"grad_norm": 0.21472761034965515,
"learning_rate": 8.574982900633676e-05,
"loss": 0.513,
"step": 707
},
{
"epoch": 1.5901179112857946,
"grad_norm": 0.20243674516677856,
"learning_rate": 8.554108565131373e-05,
"loss": 0.5073,
"step": 708
},
{
"epoch": 1.592363840539023,
"grad_norm": 0.18156473338603973,
"learning_rate": 8.533230438152765e-05,
"loss": 0.5117,
"step": 709
},
{
"epoch": 1.5946097697922514,
"grad_norm": 0.18785932660102844,
"learning_rate": 8.512348662555996e-05,
"loss": 0.5184,
"step": 710
},
{
"epoch": 1.59685569904548,
"grad_norm": 0.19026771187782288,
"learning_rate": 8.49146338122419e-05,
"loss": 0.493,
"step": 711
},
{
"epoch": 1.5991016282987087,
"grad_norm": 0.1765296906232834,
"learning_rate": 8.47057473706444e-05,
"loss": 0.4921,
"step": 712
},
{
"epoch": 1.6013475575519371,
"grad_norm": 0.18513350188732147,
"learning_rate": 8.449682873006862e-05,
"loss": 0.5043,
"step": 713
},
{
"epoch": 1.6035934868051656,
"grad_norm": 0.1919069141149521,
"learning_rate": 8.4287879320036e-05,
"loss": 0.4893,
"step": 714
},
{
"epoch": 1.6058394160583942,
"grad_norm": 0.18348896503448486,
"learning_rate": 8.40789005702785e-05,
"loss": 0.5287,
"step": 715
},
{
"epoch": 1.6080853453116228,
"grad_norm": 0.19792461395263672,
"learning_rate": 8.386989391072892e-05,
"loss": 0.518,
"step": 716
},
{
"epoch": 1.6103312745648513,
"grad_norm": 0.2027343064546585,
"learning_rate": 8.366086077151091e-05,
"loss": 0.5109,
"step": 717
},
{
"epoch": 1.6125772038180797,
"grad_norm": 0.2016996443271637,
"learning_rate": 8.34518025829294e-05,
"loss": 0.5169,
"step": 718
},
{
"epoch": 1.614823133071308,
"grad_norm": 0.20013925433158875,
"learning_rate": 8.324272077546064e-05,
"loss": 0.4997,
"step": 719
},
{
"epoch": 1.6170690623245367,
"grad_norm": 0.18940746784210205,
"learning_rate": 8.30336167797426e-05,
"loss": 0.4962,
"step": 720
},
{
"epoch": 1.6193149915777654,
"grad_norm": 0.20259737968444824,
"learning_rate": 8.282449202656496e-05,
"loss": 0.524,
"step": 721
},
{
"epoch": 1.6215609208309938,
"grad_norm": 0.22202381491661072,
"learning_rate": 8.261534794685952e-05,
"loss": 0.4966,
"step": 722
},
{
"epoch": 1.6238068500842222,
"grad_norm": 0.19881968200206757,
"learning_rate": 8.240618597169029e-05,
"loss": 0.5065,
"step": 723
},
{
"epoch": 1.6260527793374508,
"grad_norm": 0.1963961273431778,
"learning_rate": 8.219700753224371e-05,
"loss": 0.5027,
"step": 724
},
{
"epoch": 1.6282987085906795,
"grad_norm": 0.20289023220539093,
"learning_rate": 8.198781405981888e-05,
"loss": 0.5123,
"step": 725
},
{
"epoch": 1.630544637843908,
"grad_norm": 0.20166555047035217,
"learning_rate": 8.177860698581778e-05,
"loss": 0.4844,
"step": 726
},
{
"epoch": 1.6327905670971363,
"grad_norm": 0.21527273952960968,
"learning_rate": 8.156938774173548e-05,
"loss": 0.4884,
"step": 727
},
{
"epoch": 1.635036496350365,
"grad_norm": 0.19657008349895477,
"learning_rate": 8.136015775915025e-05,
"loss": 0.5046,
"step": 728
},
{
"epoch": 1.6372824256035936,
"grad_norm": 0.1984531283378601,
"learning_rate": 8.11509184697139e-05,
"loss": 0.5075,
"step": 729
},
{
"epoch": 1.639528354856822,
"grad_norm": 0.18290367722511292,
"learning_rate": 8.094167130514195e-05,
"loss": 0.5094,
"step": 730
},
{
"epoch": 1.6417742841100504,
"grad_norm": 0.18201418220996857,
"learning_rate": 8.073241769720371e-05,
"loss": 0.4916,
"step": 731
},
{
"epoch": 1.644020213363279,
"grad_norm": 0.17987395823001862,
"learning_rate": 8.052315907771262e-05,
"loss": 0.5107,
"step": 732
},
{
"epoch": 1.6462661426165077,
"grad_norm": 0.17415151000022888,
"learning_rate": 8.031389687851647e-05,
"loss": 0.4787,
"step": 733
},
{
"epoch": 1.6485120718697361,
"grad_norm": 0.18529638648033142,
"learning_rate": 8.010463253148746e-05,
"loss": 0.4942,
"step": 734
},
{
"epoch": 1.6507580011229646,
"grad_norm": 0.18021097779273987,
"learning_rate": 7.989536746851255e-05,
"loss": 0.5244,
"step": 735
},
{
"epoch": 1.6530039303761932,
"grad_norm": 0.18884895741939545,
"learning_rate": 7.968610312148354e-05,
"loss": 0.5067,
"step": 736
},
{
"epoch": 1.6552498596294218,
"grad_norm": 0.17446008324623108,
"learning_rate": 7.94768409222874e-05,
"loss": 0.4919,
"step": 737
},
{
"epoch": 1.6574957888826503,
"grad_norm": 0.16754934191703796,
"learning_rate": 7.926758230279634e-05,
"loss": 0.504,
"step": 738
},
{
"epoch": 1.6597417181358787,
"grad_norm": 0.17202447354793549,
"learning_rate": 7.905832869485808e-05,
"loss": 0.5118,
"step": 739
},
{
"epoch": 1.661987647389107,
"grad_norm": 0.17612679302692413,
"learning_rate": 7.88490815302861e-05,
"loss": 0.4997,
"step": 740
},
{
"epoch": 1.6642335766423357,
"grad_norm": 0.1580231636762619,
"learning_rate": 7.863984224084977e-05,
"loss": 0.477,
"step": 741
},
{
"epoch": 1.6664795058955644,
"grad_norm": 0.1829080730676651,
"learning_rate": 7.843061225826455e-05,
"loss": 0.5091,
"step": 742
},
{
"epoch": 1.6687254351487928,
"grad_norm": 0.17909185588359833,
"learning_rate": 7.822139301418226e-05,
"loss": 0.5197,
"step": 743
},
{
"epoch": 1.6709713644020212,
"grad_norm": 0.18631631135940552,
"learning_rate": 7.801218594018115e-05,
"loss": 0.5069,
"step": 744
},
{
"epoch": 1.6732172936552498,
"grad_norm": 0.17326535284519196,
"learning_rate": 7.78029924677563e-05,
"loss": 0.5088,
"step": 745
},
{
"epoch": 1.6754632229084785,
"grad_norm": 0.20143157243728638,
"learning_rate": 7.759381402830973e-05,
"loss": 0.528,
"step": 746
},
{
"epoch": 1.677709152161707,
"grad_norm": 0.1783144623041153,
"learning_rate": 7.738465205314048e-05,
"loss": 0.4956,
"step": 747
},
{
"epoch": 1.6799550814149353,
"grad_norm": 0.19444549083709717,
"learning_rate": 7.717550797343506e-05,
"loss": 0.4859,
"step": 748
},
{
"epoch": 1.682201010668164,
"grad_norm": 0.18391017615795135,
"learning_rate": 7.696638322025744e-05,
"loss": 0.5036,
"step": 749
},
{
"epoch": 1.6844469399213926,
"grad_norm": 0.2030087262392044,
"learning_rate": 7.675727922453939e-05,
"loss": 0.5032,
"step": 750
},
{
"epoch": 1.686692869174621,
"grad_norm": 0.17419691383838654,
"learning_rate": 7.654819741707065e-05,
"loss": 0.5055,
"step": 751
},
{
"epoch": 1.6889387984278494,
"grad_norm": 0.1854201853275299,
"learning_rate": 7.633913922848912e-05,
"loss": 0.5,
"step": 752
},
{
"epoch": 1.691184727681078,
"grad_norm": 0.19161422550678253,
"learning_rate": 7.613010608927113e-05,
"loss": 0.4888,
"step": 753
},
{
"epoch": 1.6934306569343067,
"grad_norm": 0.1729954481124878,
"learning_rate": 7.592109942972152e-05,
"loss": 0.5028,
"step": 754
},
{
"epoch": 1.6956765861875351,
"grad_norm": 0.19286830723285675,
"learning_rate": 7.571212067996402e-05,
"loss": 0.5133,
"step": 755
},
{
"epoch": 1.6979225154407636,
"grad_norm": 0.17671585083007812,
"learning_rate": 7.550317126993141e-05,
"loss": 0.5035,
"step": 756
},
{
"epoch": 1.700168444693992,
"grad_norm": 0.1909675896167755,
"learning_rate": 7.529425262935561e-05,
"loss": 0.5147,
"step": 757
},
{
"epoch": 1.7024143739472206,
"grad_norm": 0.1676298975944519,
"learning_rate": 7.508536618775814e-05,
"loss": 0.488,
"step": 758
},
{
"epoch": 1.7046603032004493,
"grad_norm": 0.1871660202741623,
"learning_rate": 7.487651337444005e-05,
"loss": 0.4986,
"step": 759
},
{
"epoch": 1.7069062324536777,
"grad_norm": 0.17889705300331116,
"learning_rate": 7.466769561847239e-05,
"loss": 0.5103,
"step": 760
},
{
"epoch": 1.709152161706906,
"grad_norm": 0.18187767267227173,
"learning_rate": 7.445891434868628e-05,
"loss": 0.477,
"step": 761
},
{
"epoch": 1.7113980909601347,
"grad_norm": 0.17818237841129303,
"learning_rate": 7.425017099366326e-05,
"loss": 0.5143,
"step": 762
},
{
"epoch": 1.7136440202133634,
"grad_norm": 0.1854383796453476,
"learning_rate": 7.404146698172536e-05,
"loss": 0.5286,
"step": 763
},
{
"epoch": 1.7158899494665918,
"grad_norm": 0.1802191585302353,
"learning_rate": 7.383280374092538e-05,
"loss": 0.493,
"step": 764
},
{
"epoch": 1.7181358787198202,
"grad_norm": 0.17232070863246918,
"learning_rate": 7.362418269903728e-05,
"loss": 0.5124,
"step": 765
},
{
"epoch": 1.7203818079730488,
"grad_norm": 0.2103428691625595,
"learning_rate": 7.34156052835461e-05,
"loss": 0.5372,
"step": 766
},
{
"epoch": 1.7226277372262775,
"grad_norm": 0.1758391559123993,
"learning_rate": 7.320707292163853e-05,
"loss": 0.5019,
"step": 767
},
{
"epoch": 1.724873666479506,
"grad_norm": 0.19223737716674805,
"learning_rate": 7.299858704019291e-05,
"loss": 0.4956,
"step": 768
},
{
"epoch": 1.7271195957327343,
"grad_norm": 0.17237992584705353,
"learning_rate": 7.279014906576949e-05,
"loss": 0.4991,
"step": 769
},
{
"epoch": 1.729365524985963,
"grad_norm": 0.17996814846992493,
"learning_rate": 7.258176042460077e-05,
"loss": 0.4882,
"step": 770
},
{
"epoch": 1.7316114542391916,
"grad_norm": 0.17651812732219696,
"learning_rate": 7.237342254258173e-05,
"loss": 0.5167,
"step": 771
},
{
"epoch": 1.73385738349242,
"grad_norm": 0.19715122878551483,
"learning_rate": 7.216513684525992e-05,
"loss": 0.516,
"step": 772
},
{
"epoch": 1.7361033127456484,
"grad_norm": 0.16534049808979034,
"learning_rate": 7.195690475782596e-05,
"loss": 0.5241,
"step": 773
},
{
"epoch": 1.738349241998877,
"grad_norm": 0.20934666693210602,
"learning_rate": 7.174872770510348e-05,
"loss": 0.4848,
"step": 774
},
{
"epoch": 1.7405951712521057,
"grad_norm": 0.17493613064289093,
"learning_rate": 7.15406071115397e-05,
"loss": 0.509,
"step": 775
},
{
"epoch": 1.7428411005053341,
"grad_norm": 0.19224363565444946,
"learning_rate": 7.133254440119538e-05,
"loss": 0.5166,
"step": 776
},
{
"epoch": 1.7450870297585626,
"grad_norm": 0.17673024535179138,
"learning_rate": 7.11245409977353e-05,
"loss": 0.4919,
"step": 777
},
{
"epoch": 1.747332959011791,
"grad_norm": 0.17207755148410797,
"learning_rate": 7.091659832441848e-05,
"loss": 0.5325,
"step": 778
},
{
"epoch": 1.7495788882650196,
"grad_norm": 0.17099009454250336,
"learning_rate": 7.070871780408824e-05,
"loss": 0.4918,
"step": 779
},
{
"epoch": 1.7518248175182483,
"grad_norm": 0.16904598474502563,
"learning_rate": 7.05009008591627e-05,
"loss": 0.4883,
"step": 780
},
{
"epoch": 1.7540707467714767,
"grad_norm": 0.17518097162246704,
"learning_rate": 7.029314891162504e-05,
"loss": 0.5112,
"step": 781
},
{
"epoch": 1.756316676024705,
"grad_norm": 0.1848541796207428,
"learning_rate": 7.008546338301358e-05,
"loss": 0.522,
"step": 782
},
{
"epoch": 1.7585626052779337,
"grad_norm": 0.18024159967899323,
"learning_rate": 6.987784569441228e-05,
"loss": 0.5163,
"step": 783
},
{
"epoch": 1.7608085345311624,
"grad_norm": 0.16730569303035736,
"learning_rate": 6.967029726644075e-05,
"loss": 0.4693,
"step": 784
},
{
"epoch": 1.7630544637843908,
"grad_norm": 0.18763582408428192,
"learning_rate": 6.946281951924487e-05,
"loss": 0.5143,
"step": 785
},
{
"epoch": 1.7653003930376192,
"grad_norm": 0.16916576027870178,
"learning_rate": 6.925541387248674e-05,
"loss": 0.5188,
"step": 786
},
{
"epoch": 1.7675463222908478,
"grad_norm": 0.19620057940483093,
"learning_rate": 6.904808174533521e-05,
"loss": 0.5024,
"step": 787
},
{
"epoch": 1.7697922515440765,
"grad_norm": 0.16816137731075287,
"learning_rate": 6.884082455645606e-05,
"loss": 0.4878,
"step": 788
},
{
"epoch": 1.772038180797305,
"grad_norm": 0.1925499141216278,
"learning_rate": 6.863364372400221e-05,
"loss": 0.4922,
"step": 789
},
{
"epoch": 1.7742841100505333,
"grad_norm": 0.15602745115756989,
"learning_rate": 6.842654066560422e-05,
"loss": 0.4888,
"step": 790
},
{
"epoch": 1.776530039303762,
"grad_norm": 0.17124199867248535,
"learning_rate": 6.821951679836049e-05,
"loss": 0.4795,
"step": 791
},
{
"epoch": 1.7787759685569906,
"grad_norm": 0.17022277414798737,
"learning_rate": 6.801257353882746e-05,
"loss": 0.4966,
"step": 792
},
{
"epoch": 1.781021897810219,
"grad_norm": 0.15725384652614594,
"learning_rate": 6.78057123030101e-05,
"loss": 0.4905,
"step": 793
},
{
"epoch": 1.7832678270634474,
"grad_norm": 0.17000839114189148,
"learning_rate": 6.759893450635213e-05,
"loss": 0.498,
"step": 794
},
{
"epoch": 1.7855137563166759,
"grad_norm": 0.15647220611572266,
"learning_rate": 6.739224156372625e-05,
"loss": 0.4948,
"step": 795
},
{
"epoch": 1.7877596855699045,
"grad_norm": 0.17224030196666718,
"learning_rate": 6.718563488942463e-05,
"loss": 0.4995,
"step": 796
},
{
"epoch": 1.7900056148231331,
"grad_norm": 0.17135286331176758,
"learning_rate": 6.697911589714917e-05,
"loss": 0.5028,
"step": 797
},
{
"epoch": 1.7922515440763616,
"grad_norm": 0.1629776656627655,
"learning_rate": 6.677268600000172e-05,
"loss": 0.5004,
"step": 798
},
{
"epoch": 1.79449747332959,
"grad_norm": 0.19575197994709015,
"learning_rate": 6.656634661047461e-05,
"loss": 0.5112,
"step": 799
},
{
"epoch": 1.7967434025828186,
"grad_norm": 0.15462997555732727,
"learning_rate": 6.636009914044074e-05,
"loss": 0.5036,
"step": 800
},
{
"epoch": 1.7989893318360473,
"grad_norm": 0.19468659162521362,
"learning_rate": 6.615394500114417e-05,
"loss": 0.5062,
"step": 801
},
{
"epoch": 1.8012352610892757,
"grad_norm": 0.15850648283958435,
"learning_rate": 6.594788560319025e-05,
"loss": 0.5103,
"step": 802
},
{
"epoch": 1.803481190342504,
"grad_norm": 0.16901031136512756,
"learning_rate": 6.574192235653619e-05,
"loss": 0.4964,
"step": 803
},
{
"epoch": 1.8057271195957327,
"grad_norm": 0.16941389441490173,
"learning_rate": 6.553605667048119e-05,
"loss": 0.4956,
"step": 804
},
{
"epoch": 1.8079730488489614,
"grad_norm": 0.1633678376674652,
"learning_rate": 6.533028995365687e-05,
"loss": 0.4844,
"step": 805
},
{
"epoch": 1.8102189781021898,
"grad_norm": 0.16450218856334686,
"learning_rate": 6.51246236140177e-05,
"loss": 0.5039,
"step": 806
},
{
"epoch": 1.8124649073554182,
"grad_norm": 0.1649266928434372,
"learning_rate": 6.49190590588314e-05,
"loss": 0.5237,
"step": 807
},
{
"epoch": 1.8147108366086468,
"grad_norm": 0.17138883471488953,
"learning_rate": 6.471359769466907e-05,
"loss": 0.5086,
"step": 808
},
{
"epoch": 1.8169567658618755,
"grad_norm": 0.17378132045269012,
"learning_rate": 6.450824092739589e-05,
"loss": 0.5091,
"step": 809
},
{
"epoch": 1.819202695115104,
"grad_norm": 0.17285092175006866,
"learning_rate": 6.430299016216119e-05,
"loss": 0.5055,
"step": 810
},
{
"epoch": 1.8214486243683323,
"grad_norm": 0.1718919575214386,
"learning_rate": 6.409784680338905e-05,
"loss": 0.4842,
"step": 811
},
{
"epoch": 1.823694553621561,
"grad_norm": 0.16790670156478882,
"learning_rate": 6.389281225476867e-05,
"loss": 0.5004,
"step": 812
},
{
"epoch": 1.8259404828747896,
"grad_norm": 0.1849760264158249,
"learning_rate": 6.368788791924467e-05,
"loss": 0.4939,
"step": 813
},
{
"epoch": 1.828186412128018,
"grad_norm": 0.16113969683647156,
"learning_rate": 6.348307519900753e-05,
"loss": 0.5024,
"step": 814
},
{
"epoch": 1.8304323413812464,
"grad_norm": 0.1709127277135849,
"learning_rate": 6.3278375495484e-05,
"loss": 0.4977,
"step": 815
},
{
"epoch": 1.8326782706344749,
"grad_norm": 0.1758309006690979,
"learning_rate": 6.307379020932758e-05,
"loss": 0.4689,
"step": 816
},
{
"epoch": 1.8349241998877035,
"grad_norm": 0.16264449059963226,
"learning_rate": 6.286932074040876e-05,
"loss": 0.4974,
"step": 817
},
{
"epoch": 1.8371701291409321,
"grad_norm": 0.17811472713947296,
"learning_rate": 6.266496848780567e-05,
"loss": 0.4987,
"step": 818
},
{
"epoch": 1.8394160583941606,
"grad_norm": 0.17399878799915314,
"learning_rate": 6.246073484979436e-05,
"loss": 0.4867,
"step": 819
},
{
"epoch": 1.841661987647389,
"grad_norm": 0.17421691119670868,
"learning_rate": 6.225662122383918e-05,
"loss": 0.5162,
"step": 820
},
{
"epoch": 1.8439079169006176,
"grad_norm": 0.17920304834842682,
"learning_rate": 6.205262900658339e-05,
"loss": 0.5058,
"step": 821
},
{
"epoch": 1.8461538461538463,
"grad_norm": 0.16607870161533356,
"learning_rate": 6.184875959383947e-05,
"loss": 0.5063,
"step": 822
},
{
"epoch": 1.8483997754070747,
"grad_norm": 0.19281108677387238,
"learning_rate": 6.164501438057965e-05,
"loss": 0.4936,
"step": 823
},
{
"epoch": 1.850645704660303,
"grad_norm": 0.16037489473819733,
"learning_rate": 6.144139476092631e-05,
"loss": 0.4949,
"step": 824
},
{
"epoch": 1.8528916339135317,
"grad_norm": 0.19559049606323242,
"learning_rate": 6.123790212814241e-05,
"loss": 0.4981,
"step": 825
},
{
"epoch": 1.8551375631667604,
"grad_norm": 0.15469707548618317,
"learning_rate": 6.1034537874622085e-05,
"loss": 0.5021,
"step": 826
},
{
"epoch": 1.8573834924199888,
"grad_norm": 0.18738782405853271,
"learning_rate": 6.0831303391880975e-05,
"loss": 0.4846,
"step": 827
},
{
"epoch": 1.8596294216732172,
"grad_norm": 0.16658605635166168,
"learning_rate": 6.0628200070546796e-05,
"loss": 0.4945,
"step": 828
},
{
"epoch": 1.8618753509264458,
"grad_norm": 0.16776609420776367,
"learning_rate": 6.042522930034984e-05,
"loss": 0.4992,
"step": 829
},
{
"epoch": 1.8641212801796745,
"grad_norm": 0.17124858498573303,
"learning_rate": 6.022239247011331e-05,
"loss": 0.4915,
"step": 830
},
{
"epoch": 1.866367209432903,
"grad_norm": 0.15521185100078583,
"learning_rate": 6.001969096774399e-05,
"loss": 0.5134,
"step": 831
},
{
"epoch": 1.8686131386861313,
"grad_norm": 0.1691064089536667,
"learning_rate": 5.981712618022272e-05,
"loss": 0.5018,
"step": 832
},
{
"epoch": 1.87085906793936,
"grad_norm": 0.15585891902446747,
"learning_rate": 5.96146994935948e-05,
"loss": 0.5071,
"step": 833
},
{
"epoch": 1.8731049971925884,
"grad_norm": 0.1678674966096878,
"learning_rate": 5.9412412292960656e-05,
"loss": 0.5123,
"step": 834
},
{
"epoch": 1.875350926445817,
"grad_norm": 0.15515373647212982,
"learning_rate": 5.92102659624662e-05,
"loss": 0.495,
"step": 835
},
{
"epoch": 1.8775968556990454,
"grad_norm": 0.17066361010074615,
"learning_rate": 5.900826188529351e-05,
"loss": 0.4982,
"step": 836
},
{
"epoch": 1.8798427849522739,
"grad_norm": 0.14784766733646393,
"learning_rate": 5.880640144365124e-05,
"loss": 0.492,
"step": 837
},
{
"epoch": 1.8820887142055025,
"grad_norm": 0.1624741405248642,
"learning_rate": 5.86046860187653e-05,
"loss": 0.4985,
"step": 838
},
{
"epoch": 1.8843346434587311,
"grad_norm": 0.14903901517391205,
"learning_rate": 5.840311699086928e-05,
"loss": 0.4887,
"step": 839
},
{
"epoch": 1.8865805727119596,
"grad_norm": 0.16569632291793823,
"learning_rate": 5.820169573919499e-05,
"loss": 0.5031,
"step": 840
},
{
"epoch": 1.888826501965188,
"grad_norm": 0.15516996383666992,
"learning_rate": 5.800042364196319e-05,
"loss": 0.4974,
"step": 841
},
{
"epoch": 1.8910724312184166,
"grad_norm": 0.1705656498670578,
"learning_rate": 5.779930207637401e-05,
"loss": 0.5064,
"step": 842
},
{
"epoch": 1.8933183604716453,
"grad_norm": 0.16612909734249115,
"learning_rate": 5.759833241859755e-05,
"loss": 0.4928,
"step": 843
},
{
"epoch": 1.8955642897248737,
"grad_norm": 0.15691480040550232,
"learning_rate": 5.7397516043764564e-05,
"loss": 0.4992,
"step": 844
},
{
"epoch": 1.897810218978102,
"grad_norm": 0.15925776958465576,
"learning_rate": 5.719685432595681e-05,
"loss": 0.503,
"step": 845
},
{
"epoch": 1.9000561482313307,
"grad_norm": 0.1793777197599411,
"learning_rate": 5.6996348638198e-05,
"loss": 0.5015,
"step": 846
},
{
"epoch": 1.9023020774845594,
"grad_norm": 0.15224167704582214,
"learning_rate": 5.6796000352444056e-05,
"loss": 0.4791,
"step": 847
},
{
"epoch": 1.9045480067377878,
"grad_norm": 0.17081177234649658,
"learning_rate": 5.6595810839574e-05,
"loss": 0.4925,
"step": 848
},
{
"epoch": 1.9067939359910162,
"grad_norm": 0.1512937992811203,
"learning_rate": 5.6395781469380354e-05,
"loss": 0.4901,
"step": 849
},
{
"epoch": 1.9090398652442448,
"grad_norm": 0.15645167231559753,
"learning_rate": 5.619591361055998e-05,
"loss": 0.5001,
"step": 850
},
{
"epoch": 1.9112857944974735,
"grad_norm": 0.17164252698421478,
"learning_rate": 5.5996208630704445e-05,
"loss": 0.4956,
"step": 851
},
{
"epoch": 1.913531723750702,
"grad_norm": 0.15667004883289337,
"learning_rate": 5.579666789629098e-05,
"loss": 0.4906,
"step": 852
},
{
"epoch": 1.9157776530039303,
"grad_norm": 0.16768649220466614,
"learning_rate": 5.559729277267286e-05,
"loss": 0.5099,
"step": 853
},
{
"epoch": 1.9180235822571587,
"grad_norm": 0.16727498173713684,
"learning_rate": 5.539808462407026e-05,
"loss": 0.503,
"step": 854
},
{
"epoch": 1.9202695115103874,
"grad_norm": 0.16755451261997223,
"learning_rate": 5.519904481356076e-05,
"loss": 0.5099,
"step": 855
},
{
"epoch": 1.922515440763616,
"grad_norm": 0.16387148201465607,
"learning_rate": 5.500017470307007e-05,
"loss": 0.4957,
"step": 856
},
{
"epoch": 1.9247613700168444,
"grad_norm": 0.15775729715824127,
"learning_rate": 5.480147565336282e-05,
"loss": 0.4976,
"step": 857
},
{
"epoch": 1.9270072992700729,
"grad_norm": 0.1581815481185913,
"learning_rate": 5.4602949024033116e-05,
"loss": 0.4949,
"step": 858
},
{
"epoch": 1.9292532285233015,
"grad_norm": 0.15002784132957458,
"learning_rate": 5.4404596173495265e-05,
"loss": 0.5099,
"step": 859
},
{
"epoch": 1.9314991577765301,
"grad_norm": 0.15235161781311035,
"learning_rate": 5.420641845897455e-05,
"loss": 0.4809,
"step": 860
},
{
"epoch": 1.9337450870297586,
"grad_norm": 0.16005192697048187,
"learning_rate": 5.4008417236497815e-05,
"loss": 0.493,
"step": 861
},
{
"epoch": 1.935991016282987,
"grad_norm": 0.15347884595394135,
"learning_rate": 5.381059386088428e-05,
"loss": 0.5071,
"step": 862
},
{
"epoch": 1.9382369455362156,
"grad_norm": 0.15472036600112915,
"learning_rate": 5.361294968573629e-05,
"loss": 0.4924,
"step": 863
},
{
"epoch": 1.9404828747894443,
"grad_norm": 0.17055299878120422,
"learning_rate": 5.341548606343001e-05,
"loss": 0.5057,
"step": 864
},
{
"epoch": 1.9427288040426727,
"grad_norm": 0.15424910187721252,
"learning_rate": 5.321820434510617e-05,
"loss": 0.5041,
"step": 865
},
{
"epoch": 1.944974733295901,
"grad_norm": 0.15976421535015106,
"learning_rate": 5.302110588066075e-05,
"loss": 0.4742,
"step": 866
},
{
"epoch": 1.9472206625491297,
"grad_norm": 0.15673977136611938,
"learning_rate": 5.282419201873593e-05,
"loss": 0.49,
"step": 867
},
{
"epoch": 1.9494665918023584,
"grad_norm": 0.14829935133457184,
"learning_rate": 5.262746410671071e-05,
"loss": 0.5017,
"step": 868
},
{
"epoch": 1.9517125210555868,
"grad_norm": 0.14897191524505615,
"learning_rate": 5.243092349069169e-05,
"loss": 0.4803,
"step": 869
},
{
"epoch": 1.9539584503088152,
"grad_norm": 0.15609802305698395,
"learning_rate": 5.223457151550402e-05,
"loss": 0.4961,
"step": 870
},
{
"epoch": 1.9562043795620438,
"grad_norm": 0.15764057636260986,
"learning_rate": 5.203840952468191e-05,
"loss": 0.5003,
"step": 871
},
{
"epoch": 1.9584503088152723,
"grad_norm": 0.16121333837509155,
"learning_rate": 5.184243886045971e-05,
"loss": 0.5054,
"step": 872
},
{
"epoch": 1.960696238068501,
"grad_norm": 0.15507447719573975,
"learning_rate": 5.164666086376262e-05,
"loss": 0.4954,
"step": 873
},
{
"epoch": 1.9629421673217293,
"grad_norm": 0.16584189236164093,
"learning_rate": 5.145107687419751e-05,
"loss": 0.4924,
"step": 874
},
{
"epoch": 1.9651880965749577,
"grad_norm": 0.15702944993972778,
"learning_rate": 5.1255688230043766e-05,
"loss": 0.5004,
"step": 875
},
{
"epoch": 1.9674340258281864,
"grad_norm": 0.17031584680080414,
"learning_rate": 5.106049626824405e-05,
"loss": 0.5139,
"step": 876
},
{
"epoch": 1.969679955081415,
"grad_norm": 0.16193878650665283,
"learning_rate": 5.0865502324395345e-05,
"loss": 0.4849,
"step": 877
},
{
"epoch": 1.9719258843346434,
"grad_norm": 0.16150209307670593,
"learning_rate": 5.067070773273962e-05,
"loss": 0.4719,
"step": 878
},
{
"epoch": 1.9741718135878719,
"grad_norm": 0.1520845890045166,
"learning_rate": 5.047611382615481e-05,
"loss": 0.4995,
"step": 879
},
{
"epoch": 1.9764177428411005,
"grad_norm": 0.16827571392059326,
"learning_rate": 5.0281721936145713e-05,
"loss": 0.4908,
"step": 880
},
{
"epoch": 1.9786636720943291,
"grad_norm": 0.15770889818668365,
"learning_rate": 5.008753339283471e-05,
"loss": 0.5116,
"step": 881
},
{
"epoch": 1.9809096013475576,
"grad_norm": 0.1623336225748062,
"learning_rate": 4.98935495249529e-05,
"loss": 0.492,
"step": 882
},
{
"epoch": 1.983155530600786,
"grad_norm": 0.16279038786888123,
"learning_rate": 4.9699771659830855e-05,
"loss": 0.5021,
"step": 883
},
{
"epoch": 1.9854014598540146,
"grad_norm": 0.16874343156814575,
"learning_rate": 4.950620112338955e-05,
"loss": 0.4876,
"step": 884
},
{
"epoch": 1.9876473891072433,
"grad_norm": 0.15390436351299286,
"learning_rate": 4.931283924013141e-05,
"loss": 0.4879,
"step": 885
},
{
"epoch": 1.9898933183604717,
"grad_norm": 0.17372553050518036,
"learning_rate": 4.911968733313101e-05,
"loss": 0.4876,
"step": 886
},
{
"epoch": 1.9921392476137,
"grad_norm": 0.15854312479496002,
"learning_rate": 4.892674672402631e-05,
"loss": 0.5128,
"step": 887
},
{
"epoch": 1.9943851768669287,
"grad_norm": 0.1635546237230301,
"learning_rate": 4.873401873300934e-05,
"loss": 0.4946,
"step": 888
},
{
"epoch": 1.9966311061201574,
"grad_norm": 0.15970109403133392,
"learning_rate": 4.8541504678817435e-05,
"loss": 0.501,
"step": 889
},
{
"epoch": 1.9988770353733858,
"grad_norm": 0.1637182980775833,
"learning_rate": 4.834920587872397e-05,
"loss": 0.4807,
"step": 890
},
{
"epoch": 2.001122964626614,
"grad_norm": 0.17495502531528473,
"learning_rate": 4.815712364852945e-05,
"loss": 0.4725,
"step": 891
},
{
"epoch": 2.0033688938798426,
"grad_norm": 0.20412583649158478,
"learning_rate": 4.7965259302552546e-05,
"loss": 0.4545,
"step": 892
},
{
"epoch": 2.0056148231330715,
"grad_norm": 0.1694943606853485,
"learning_rate": 4.777361415362106e-05,
"loss": 0.4561,
"step": 893
},
{
"epoch": 2.0078607523863,
"grad_norm": 0.20532694458961487,
"learning_rate": 4.75821895130629e-05,
"loss": 0.4585,
"step": 894
},
{
"epoch": 2.0101066816395283,
"grad_norm": 0.21771076321601868,
"learning_rate": 4.739098669069723e-05,
"loss": 0.4609,
"step": 895
},
{
"epoch": 2.0123526108927567,
"grad_norm": 0.19157661497592926,
"learning_rate": 4.7200006994825314e-05,
"loss": 0.4533,
"step": 896
},
{
"epoch": 2.0145985401459856,
"grad_norm": 0.1829356700181961,
"learning_rate": 4.700925173222178e-05,
"loss": 0.4401,
"step": 897
},
{
"epoch": 2.016844469399214,
"grad_norm": 0.1815447062253952,
"learning_rate": 4.681872220812551e-05,
"loss": 0.4497,
"step": 898
},
{
"epoch": 2.0190903986524424,
"grad_norm": 0.16822156310081482,
"learning_rate": 4.662841972623084e-05,
"loss": 0.4573,
"step": 899
},
{
"epoch": 2.021336327905671,
"grad_norm": 0.18054281175136566,
"learning_rate": 4.643834558867852e-05,
"loss": 0.4589,
"step": 900
},
{
"epoch": 2.0235822571588993,
"grad_norm": 0.18319673836231232,
"learning_rate": 4.6248501096046827e-05,
"loss": 0.4376,
"step": 901
},
{
"epoch": 2.025828186412128,
"grad_norm": 0.1645708829164505,
"learning_rate": 4.605888754734278e-05,
"loss": 0.4304,
"step": 902
},
{
"epoch": 2.0280741156653566,
"grad_norm": 0.17893430590629578,
"learning_rate": 4.586950623999314e-05,
"loss": 0.4526,
"step": 903
},
{
"epoch": 2.030320044918585,
"grad_norm": 0.17927826941013336,
"learning_rate": 4.568035846983558e-05,
"loss": 0.4616,
"step": 904
},
{
"epoch": 2.0325659741718134,
"grad_norm": 0.1680602878332138,
"learning_rate": 4.549144553110974e-05,
"loss": 0.4611,
"step": 905
},
{
"epoch": 2.0348119034250423,
"grad_norm": 0.1612085998058319,
"learning_rate": 4.5302768716448434e-05,
"loss": 0.4567,
"step": 906
},
{
"epoch": 2.0370578326782707,
"grad_norm": 0.1724167913198471,
"learning_rate": 4.5114329316868875e-05,
"loss": 0.4666,
"step": 907
},
{
"epoch": 2.039303761931499,
"grad_norm": 0.15838028490543365,
"learning_rate": 4.492612862176371e-05,
"loss": 0.4529,
"step": 908
},
{
"epoch": 2.0415496911847275,
"grad_norm": 0.15649183094501495,
"learning_rate": 4.473816791889228e-05,
"loss": 0.4462,
"step": 909
},
{
"epoch": 2.0437956204379564,
"grad_norm": 0.16123028099536896,
"learning_rate": 4.455044849437182e-05,
"loss": 0.4345,
"step": 910
},
{
"epoch": 2.046041549691185,
"grad_norm": 0.16162772476673126,
"learning_rate": 4.436297163266853e-05,
"loss": 0.4585,
"step": 911
},
{
"epoch": 2.048287478944413,
"grad_norm": 0.1522200107574463,
"learning_rate": 4.4175738616588894e-05,
"loss": 0.4614,
"step": 912
},
{
"epoch": 2.0505334081976416,
"grad_norm": 0.16501305997371674,
"learning_rate": 4.398875072727097e-05,
"loss": 0.4486,
"step": 913
},
{
"epoch": 2.0527793374508705,
"grad_norm": 0.15927040576934814,
"learning_rate": 4.380200924417548e-05,
"loss": 0.4574,
"step": 914
},
{
"epoch": 2.055025266704099,
"grad_norm": 0.1553938090801239,
"learning_rate": 4.361551544507713e-05,
"loss": 0.4446,
"step": 915
},
{
"epoch": 2.0572711959573273,
"grad_norm": 0.16552136838436127,
"learning_rate": 4.3429270606055895e-05,
"loss": 0.4583,
"step": 916
},
{
"epoch": 2.0595171252105557,
"grad_norm": 0.1564835011959076,
"learning_rate": 4.3243276001488156e-05,
"loss": 0.4476,
"step": 917
},
{
"epoch": 2.0617630544637846,
"grad_norm": 0.1577308475971222,
"learning_rate": 4.305753290403809e-05,
"loss": 0.4632,
"step": 918
},
{
"epoch": 2.064008983717013,
"grad_norm": 0.15984061360359192,
"learning_rate": 4.2872042584649015e-05,
"loss": 0.4624,
"step": 919
},
{
"epoch": 2.0662549129702414,
"grad_norm": 0.16448809206485748,
"learning_rate": 4.268680631253455e-05,
"loss": 0.4436,
"step": 920
},
{
"epoch": 2.06850084222347,
"grad_norm": 0.16196516156196594,
"learning_rate": 4.250182535517008e-05,
"loss": 0.4375,
"step": 921
},
{
"epoch": 2.0707467714766983,
"grad_norm": 0.15193282067775726,
"learning_rate": 4.231710097828388e-05,
"loss": 0.4287,
"step": 922
},
{
"epoch": 2.072992700729927,
"grad_norm": 0.16018415987491608,
"learning_rate": 4.2132634445848704e-05,
"loss": 0.4543,
"step": 923
},
{
"epoch": 2.0752386299831556,
"grad_norm": 0.16128796339035034,
"learning_rate": 4.194842702007289e-05,
"loss": 0.4621,
"step": 924
},
{
"epoch": 2.077484559236384,
"grad_norm": 0.15342706441879272,
"learning_rate": 4.176447996139196e-05,
"loss": 0.4355,
"step": 925
},
{
"epoch": 2.0797304884896124,
"grad_norm": 0.1577060967683792,
"learning_rate": 4.1580794528459834e-05,
"loss": 0.4521,
"step": 926
},
{
"epoch": 2.0819764177428413,
"grad_norm": 0.16120131313800812,
"learning_rate": 4.13973719781402e-05,
"loss": 0.4501,
"step": 927
},
{
"epoch": 2.0842223469960697,
"grad_norm": 0.16163085401058197,
"learning_rate": 4.1214213565498086e-05,
"loss": 0.4518,
"step": 928
},
{
"epoch": 2.086468276249298,
"grad_norm": 0.1605272889137268,
"learning_rate": 4.10313205437911e-05,
"loss": 0.4334,
"step": 929
},
{
"epoch": 2.0887142055025265,
"grad_norm": 0.16757291555404663,
"learning_rate": 4.084869416446095e-05,
"loss": 0.4579,
"step": 930
},
{
"epoch": 2.0909601347557554,
"grad_norm": 0.1572689265012741,
"learning_rate": 4.0666335677124816e-05,
"loss": 0.4462,
"step": 931
},
{
"epoch": 2.093206064008984,
"grad_norm": 0.1841953992843628,
"learning_rate": 4.048424632956681e-05,
"loss": 0.4241,
"step": 932
},
{
"epoch": 2.095451993262212,
"grad_norm": 0.1640552282333374,
"learning_rate": 4.030242736772952e-05,
"loss": 0.4495,
"step": 933
},
{
"epoch": 2.0976979225154406,
"grad_norm": 0.15904076397418976,
"learning_rate": 4.0120880035705416e-05,
"loss": 0.4513,
"step": 934
},
{
"epoch": 2.0999438517686695,
"grad_norm": 0.17605750262737274,
"learning_rate": 3.9939605575728315e-05,
"loss": 0.4444,
"step": 935
},
{
"epoch": 2.102189781021898,
"grad_norm": 0.15149734914302826,
"learning_rate": 3.975860522816497e-05,
"loss": 0.4423,
"step": 936
},
{
"epoch": 2.1044357102751263,
"grad_norm": 0.15931731462478638,
"learning_rate": 3.957788023150647e-05,
"loss": 0.4558,
"step": 937
},
{
"epoch": 2.1066816395283547,
"grad_norm": 0.1513037383556366,
"learning_rate": 3.939743182235978e-05,
"loss": 0.4451,
"step": 938
},
{
"epoch": 2.108927568781583,
"grad_norm": 0.1563446819782257,
"learning_rate": 3.921726123543942e-05,
"loss": 0.4438,
"step": 939
},
{
"epoch": 2.111173498034812,
"grad_norm": 0.14871710538864136,
"learning_rate": 3.9037369703558876e-05,
"loss": 0.449,
"step": 940
},
{
"epoch": 2.1134194272880404,
"grad_norm": 0.14909642934799194,
"learning_rate": 3.8857758457622246e-05,
"loss": 0.4643,
"step": 941
},
{
"epoch": 2.115665356541269,
"grad_norm": 0.15018634498119354,
"learning_rate": 3.867842872661565e-05,
"loss": 0.4483,
"step": 942
},
{
"epoch": 2.1179112857944973,
"grad_norm": 0.16879071295261383,
"learning_rate": 3.8499381737599124e-05,
"loss": 0.4726,
"step": 943
},
{
"epoch": 2.120157215047726,
"grad_norm": 0.1683071106672287,
"learning_rate": 3.832061871569787e-05,
"loss": 0.4499,
"step": 944
},
{
"epoch": 2.1224031443009546,
"grad_norm": 0.15678516030311584,
"learning_rate": 3.814214088409419e-05,
"loss": 0.4484,
"step": 945
},
{
"epoch": 2.124649073554183,
"grad_norm": 0.1773703545331955,
"learning_rate": 3.7963949464018945e-05,
"loss": 0.4605,
"step": 946
},
{
"epoch": 2.1268950028074114,
"grad_norm": 0.1767614483833313,
"learning_rate": 3.778604567474314e-05,
"loss": 0.4574,
"step": 947
},
{
"epoch": 2.1291409320606403,
"grad_norm": 0.15908308327198029,
"learning_rate": 3.760843073356981e-05,
"loss": 0.4357,
"step": 948
},
{
"epoch": 2.1313868613138687,
"grad_norm": 0.1637633740901947,
"learning_rate": 3.743110585582549e-05,
"loss": 0.4566,
"step": 949
},
{
"epoch": 2.133632790567097,
"grad_norm": 0.1657618135213852,
"learning_rate": 3.725407225485191e-05,
"loss": 0.4497,
"step": 950
},
{
"epoch": 2.1358787198203255,
"grad_norm": 0.15281249582767487,
"learning_rate": 3.707733114199783e-05,
"loss": 0.4494,
"step": 951
},
{
"epoch": 2.1381246490735544,
"grad_norm": 0.16828225553035736,
"learning_rate": 3.690088372661061e-05,
"loss": 0.4412,
"step": 952
},
{
"epoch": 2.140370578326783,
"grad_norm": 0.16215671598911285,
"learning_rate": 3.672473121602801e-05,
"loss": 0.449,
"step": 953
},
{
"epoch": 2.142616507580011,
"grad_norm": 0.14198768138885498,
"learning_rate": 3.654887481556993e-05,
"loss": 0.4556,
"step": 954
},
{
"epoch": 2.1448624368332396,
"grad_norm": 0.1703426092863083,
"learning_rate": 3.6373315728530145e-05,
"loss": 0.4456,
"step": 955
},
{
"epoch": 2.147108366086468,
"grad_norm": 0.15878015756607056,
"learning_rate": 3.6198055156168025e-05,
"loss": 0.4593,
"step": 956
},
{
"epoch": 2.149354295339697,
"grad_norm": 0.15779636800289154,
"learning_rate": 3.602309429770034e-05,
"loss": 0.4543,
"step": 957
},
{
"epoch": 2.1516002245929253,
"grad_norm": 0.15963739156723022,
"learning_rate": 3.584843435029316e-05,
"loss": 0.4363,
"step": 958
},
{
"epoch": 2.1538461538461537,
"grad_norm": 0.15662063658237457,
"learning_rate": 3.567407650905353e-05,
"loss": 0.458,
"step": 959
},
{
"epoch": 2.156092083099382,
"grad_norm": 0.14531637728214264,
"learning_rate": 3.5500021967021344e-05,
"loss": 0.4474,
"step": 960
},
{
"epoch": 2.158338012352611,
"grad_norm": 0.15317556262016296,
"learning_rate": 3.5326271915161205e-05,
"loss": 0.439,
"step": 961
},
{
"epoch": 2.1605839416058394,
"grad_norm": 0.15082910656929016,
"learning_rate": 3.515282754235419e-05,
"loss": 0.4497,
"step": 962
},
{
"epoch": 2.162829870859068,
"grad_norm": 0.14299066364765167,
"learning_rate": 3.4979690035389774e-05,
"loss": 0.4468,
"step": 963
},
{
"epoch": 2.1650758001122963,
"grad_norm": 0.1458815485239029,
"learning_rate": 3.480686057895778e-05,
"loss": 0.453,
"step": 964
},
{
"epoch": 2.167321729365525,
"grad_norm": 0.1518121361732483,
"learning_rate": 3.4634340355640136e-05,
"loss": 0.4393,
"step": 965
},
{
"epoch": 2.1695676586187536,
"grad_norm": 0.14630930125713348,
"learning_rate": 3.446213054590291e-05,
"loss": 0.4617,
"step": 966
},
{
"epoch": 2.171813587871982,
"grad_norm": 0.15554536879062653,
"learning_rate": 3.4290232328088136e-05,
"loss": 0.4555,
"step": 967
},
{
"epoch": 2.1740595171252104,
"grad_norm": 0.1420973539352417,
"learning_rate": 3.4118646878405755e-05,
"loss": 0.4575,
"step": 968
},
{
"epoch": 2.1763054463784393,
"grad_norm": 0.15307992696762085,
"learning_rate": 3.394737537092562e-05,
"loss": 0.466,
"step": 969
},
{
"epoch": 2.1785513756316677,
"grad_norm": 0.14762836694717407,
"learning_rate": 3.377641897756947e-05,
"loss": 0.4653,
"step": 970
},
{
"epoch": 2.180797304884896,
"grad_norm": 0.14197176694869995,
"learning_rate": 3.360577886810286e-05,
"loss": 0.4534,
"step": 971
},
{
"epoch": 2.1830432341381245,
"grad_norm": 0.14574755728244781,
"learning_rate": 3.343545621012721e-05,
"loss": 0.4436,
"step": 972
},
{
"epoch": 2.1852891633913534,
"grad_norm": 0.1501995027065277,
"learning_rate": 3.326545216907171e-05,
"loss": 0.4551,
"step": 973
},
{
"epoch": 2.187535092644582,
"grad_norm": 0.15226097404956818,
"learning_rate": 3.309576790818551e-05,
"loss": 0.4458,
"step": 974
},
{
"epoch": 2.18978102189781,
"grad_norm": 0.14684434235095978,
"learning_rate": 3.292640458852958e-05,
"loss": 0.4494,
"step": 975
},
{
"epoch": 2.1920269511510386,
"grad_norm": 0.14523442089557648,
"learning_rate": 3.275736336896893e-05,
"loss": 0.4445,
"step": 976
},
{
"epoch": 2.1942728804042675,
"grad_norm": 0.1518959403038025,
"learning_rate": 3.25886454061646e-05,
"loss": 0.4649,
"step": 977
},
{
"epoch": 2.196518809657496,
"grad_norm": 0.1398971676826477,
"learning_rate": 3.2420251854565704e-05,
"loss": 0.4563,
"step": 978
},
{
"epoch": 2.1987647389107243,
"grad_norm": 0.13926076889038086,
"learning_rate": 3.22521838664016e-05,
"loss": 0.4479,
"step": 979
},
{
"epoch": 2.2010106681639527,
"grad_norm": 0.14644260704517365,
"learning_rate": 3.2084442591674024e-05,
"loss": 0.4349,
"step": 980
},
{
"epoch": 2.203256597417181,
"grad_norm": 0.14670224487781525,
"learning_rate": 3.191702917814916e-05,
"loss": 0.4532,
"step": 981
},
{
"epoch": 2.20550252667041,
"grad_norm": 0.13720498979091644,
"learning_rate": 3.174994477134978e-05,
"loss": 0.431,
"step": 982
},
{
"epoch": 2.2077484559236384,
"grad_norm": 0.13734634220600128,
"learning_rate": 3.158319051454743e-05,
"loss": 0.437,
"step": 983
},
{
"epoch": 2.209994385176867,
"grad_norm": 0.14033032953739166,
"learning_rate": 3.141676754875465e-05,
"loss": 0.4487,
"step": 984
},
{
"epoch": 2.2122403144300953,
"grad_norm": 0.1471083164215088,
"learning_rate": 3.1250677012717135e-05,
"loss": 0.4544,
"step": 985
},
{
"epoch": 2.214486243683324,
"grad_norm": 0.13971002399921417,
"learning_rate": 3.10849200429059e-05,
"loss": 0.4535,
"step": 986
},
{
"epoch": 2.2167321729365526,
"grad_norm": 0.1465609073638916,
"learning_rate": 3.091949777350958e-05,
"loss": 0.4482,
"step": 987
},
{
"epoch": 2.218978102189781,
"grad_norm": 0.14760175347328186,
"learning_rate": 3.075441133642659e-05,
"loss": 0.4461,
"step": 988
},
{
"epoch": 2.2212240314430094,
"grad_norm": 0.1456819474697113,
"learning_rate": 3.05896618612574e-05,
"loss": 0.4468,
"step": 989
},
{
"epoch": 2.2234699606962383,
"grad_norm": 0.14734943211078644,
"learning_rate": 3.0425250475296883e-05,
"loss": 0.433,
"step": 990
},
{
"epoch": 2.2257158899494667,
"grad_norm": 0.13213606178760529,
"learning_rate": 3.0261178303526536e-05,
"loss": 0.4395,
"step": 991
},
{
"epoch": 2.227961819202695,
"grad_norm": 0.14420166611671448,
"learning_rate": 3.0097446468606785e-05,
"loss": 0.4391,
"step": 992
},
{
"epoch": 2.2302077484559235,
"grad_norm": 0.14115062355995178,
"learning_rate": 2.9934056090869242e-05,
"loss": 0.4371,
"step": 993
},
{
"epoch": 2.2324536777091524,
"grad_norm": 0.14169073104858398,
"learning_rate": 2.9771008288309224e-05,
"loss": 0.4334,
"step": 994
},
{
"epoch": 2.234699606962381,
"grad_norm": 0.14184604585170746,
"learning_rate": 2.9608304176577872e-05,
"loss": 0.4442,
"step": 995
},
{
"epoch": 2.236945536215609,
"grad_norm": 0.14200329780578613,
"learning_rate": 2.9445944868974688e-05,
"loss": 0.465,
"step": 996
},
{
"epoch": 2.2391914654688376,
"grad_norm": 0.14416737854480743,
"learning_rate": 2.9283931476439886e-05,
"loss": 0.4423,
"step": 997
},
{
"epoch": 2.241437394722066,
"grad_norm": 0.14188611507415771,
"learning_rate": 2.9122265107546677e-05,
"loss": 0.4647,
"step": 998
},
{
"epoch": 2.243683323975295,
"grad_norm": 0.14122439920902252,
"learning_rate": 2.8960946868493843e-05,
"loss": 0.4317,
"step": 999
},
{
"epoch": 2.2459292532285233,
"grad_norm": 0.14019352197647095,
"learning_rate": 2.87999778630981e-05,
"loss": 0.4415,
"step": 1000
},
{
"epoch": 2.2481751824817517,
"grad_norm": 0.1378793567419052,
"learning_rate": 2.863935919278645e-05,
"loss": 0.4537,
"step": 1001
},
{
"epoch": 2.25042111173498,
"grad_norm": 0.14002038538455963,
"learning_rate": 2.847909195658886e-05,
"loss": 0.4427,
"step": 1002
},
{
"epoch": 2.252667040988209,
"grad_norm": 0.1482112854719162,
"learning_rate": 2.8319177251130495e-05,
"loss": 0.4465,
"step": 1003
},
{
"epoch": 2.2549129702414374,
"grad_norm": 0.1393243372440338,
"learning_rate": 2.815961617062442e-05,
"loss": 0.4405,
"step": 1004
},
{
"epoch": 2.257158899494666,
"grad_norm": 0.14361439645290375,
"learning_rate": 2.8000409806864007e-05,
"loss": 0.4672,
"step": 1005
},
{
"epoch": 2.2594048287478943,
"grad_norm": 0.13548092544078827,
"learning_rate": 2.7841559249215503e-05,
"loss": 0.4557,
"step": 1006
},
{
"epoch": 2.261650758001123,
"grad_norm": 0.13999567925930023,
"learning_rate": 2.768306558461051e-05,
"loss": 0.4577,
"step": 1007
},
{
"epoch": 2.2638966872543516,
"grad_norm": 0.14704839885234833,
"learning_rate": 2.75249298975386e-05,
"loss": 0.4556,
"step": 1008
},
{
"epoch": 2.26614261650758,
"grad_norm": 0.1454869657754898,
"learning_rate": 2.7367153270039934e-05,
"loss": 0.4656,
"step": 1009
},
{
"epoch": 2.2683885457608084,
"grad_norm": 0.14805535972118378,
"learning_rate": 2.720973678169781e-05,
"loss": 0.4463,
"step": 1010
},
{
"epoch": 2.2706344750140373,
"grad_norm": 0.14422546327114105,
"learning_rate": 2.705268150963125e-05,
"loss": 0.4463,
"step": 1011
},
{
"epoch": 2.2728804042672657,
"grad_norm": 0.14471085369586945,
"learning_rate": 2.6895988528487724e-05,
"loss": 0.4499,
"step": 1012
},
{
"epoch": 2.275126333520494,
"grad_norm": 0.14727704226970673,
"learning_rate": 2.6739658910435663e-05,
"loss": 0.4498,
"step": 1013
},
{
"epoch": 2.2773722627737225,
"grad_norm": 0.13678747415542603,
"learning_rate": 2.6583693725157176e-05,
"loss": 0.4396,
"step": 1014
},
{
"epoch": 2.279618192026951,
"grad_norm": 0.14493557810783386,
"learning_rate": 2.6428094039840827e-05,
"loss": 0.4493,
"step": 1015
},
{
"epoch": 2.28186412128018,
"grad_norm": 0.14464671909809113,
"learning_rate": 2.6272860919174223e-05,
"loss": 0.4586,
"step": 1016
},
{
"epoch": 2.284110050533408,
"grad_norm": 0.13754825294017792,
"learning_rate": 2.6117995425336774e-05,
"loss": 0.4587,
"step": 1017
},
{
"epoch": 2.2863559797866366,
"grad_norm": 0.14128117263317108,
"learning_rate": 2.596349861799235e-05,
"loss": 0.4578,
"step": 1018
},
{
"epoch": 2.2886019090398655,
"grad_norm": 0.14357365667819977,
"learning_rate": 2.5809371554282177e-05,
"loss": 0.4492,
"step": 1019
},
{
"epoch": 2.290847838293094,
"grad_norm": 0.1328091323375702,
"learning_rate": 2.565561528881744e-05,
"loss": 0.4526,
"step": 1020
},
{
"epoch": 2.2930937675463223,
"grad_norm": 0.13385091722011566,
"learning_rate": 2.5502230873672177e-05,
"loss": 0.4692,
"step": 1021
},
{
"epoch": 2.2953396967995507,
"grad_norm": 0.13780003786087036,
"learning_rate": 2.5349219358376082e-05,
"loss": 0.4652,
"step": 1022
},
{
"epoch": 2.297585626052779,
"grad_norm": 0.1325894445180893,
"learning_rate": 2.519658178990727e-05,
"loss": 0.4384,
"step": 1023
},
{
"epoch": 2.299831555306008,
"grad_norm": 0.13235574960708618,
"learning_rate": 2.5044319212685066e-05,
"loss": 0.454,
"step": 1024
},
{
"epoch": 2.3020774845592364,
"grad_norm": 0.13442382216453552,
"learning_rate": 2.4892432668563017e-05,
"loss": 0.4449,
"step": 1025
},
{
"epoch": 2.304323413812465,
"grad_norm": 0.1442955881357193,
"learning_rate": 2.4740923196821653e-05,
"loss": 0.4764,
"step": 1026
},
{
"epoch": 2.3065693430656933,
"grad_norm": 0.13242414593696594,
"learning_rate": 2.4589791834161324e-05,
"loss": 0.44,
"step": 1027
},
{
"epoch": 2.308815272318922,
"grad_norm": 0.1390787959098816,
"learning_rate": 2.443903961469528e-05,
"loss": 0.4671,
"step": 1028
},
{
"epoch": 2.3110612015721506,
"grad_norm": 0.14238110184669495,
"learning_rate": 2.4288667569942402e-05,
"loss": 0.4375,
"step": 1029
},
{
"epoch": 2.313307130825379,
"grad_norm": 0.14821192622184753,
"learning_rate": 2.4138676728820274e-05,
"loss": 0.4575,
"step": 1030
},
{
"epoch": 2.3155530600786074,
"grad_norm": 0.1424325704574585,
"learning_rate": 2.3989068117638114e-05,
"loss": 0.4418,
"step": 1031
},
{
"epoch": 2.317798989331836,
"grad_norm": 0.1394152194261551,
"learning_rate": 2.383984276008975e-05,
"loss": 0.4298,
"step": 1032
},
{
"epoch": 2.3200449185850647,
"grad_norm": 0.1432042270898819,
"learning_rate": 2.3691001677246552e-05,
"loss": 0.4409,
"step": 1033
},
{
"epoch": 2.322290847838293,
"grad_norm": 0.14173389971256256,
"learning_rate": 2.354254588755051e-05,
"loss": 0.4557,
"step": 1034
},
{
"epoch": 2.3245367770915215,
"grad_norm": 0.1387631595134735,
"learning_rate": 2.339447640680728e-05,
"loss": 0.4562,
"step": 1035
},
{
"epoch": 2.3267827063447504,
"grad_norm": 0.14601486921310425,
"learning_rate": 2.3246794248179203e-05,
"loss": 0.4496,
"step": 1036
},
{
"epoch": 2.329028635597979,
"grad_norm": 0.13562379777431488,
"learning_rate": 2.309950042217838e-05,
"loss": 0.4385,
"step": 1037
},
{
"epoch": 2.331274564851207,
"grad_norm": 0.14119566977024078,
"learning_rate": 2.2952595936659757e-05,
"loss": 0.4468,
"step": 1038
},
{
"epoch": 2.3335204941044356,
"grad_norm": 0.13435381650924683,
"learning_rate": 2.2806081796814193e-05,
"loss": 0.4479,
"step": 1039
},
{
"epoch": 2.335766423357664,
"grad_norm": 0.14311861991882324,
"learning_rate": 2.2659959005161617e-05,
"loss": 0.4466,
"step": 1040
},
{
"epoch": 2.338012352610893,
"grad_norm": 0.13565625250339508,
"learning_rate": 2.25142285615442e-05,
"loss": 0.4656,
"step": 1041
},
{
"epoch": 2.3402582818641213,
"grad_norm": 0.1413930356502533,
"learning_rate": 2.2368891463119473e-05,
"loss": 0.4426,
"step": 1042
},
{
"epoch": 2.3425042111173497,
"grad_norm": 0.14812184870243073,
"learning_rate": 2.222394870435352e-05,
"loss": 0.4617,
"step": 1043
},
{
"epoch": 2.344750140370578,
"grad_norm": 0.1381373107433319,
"learning_rate": 2.2079401277014102e-05,
"loss": 0.4506,
"step": 1044
},
{
"epoch": 2.346996069623807,
"grad_norm": 0.1399037092924118,
"learning_rate": 2.193525017016402e-05,
"loss": 0.4427,
"step": 1045
},
{
"epoch": 2.3492419988770354,
"grad_norm": 0.14365847408771515,
"learning_rate": 2.1791496370154173e-05,
"loss": 0.4575,
"step": 1046
},
{
"epoch": 2.351487928130264,
"grad_norm": 0.13773076236248016,
"learning_rate": 2.1648140860616974e-05,
"loss": 0.4501,
"step": 1047
},
{
"epoch": 2.3537338573834923,
"grad_norm": 0.13768814504146576,
"learning_rate": 2.1505184622459517e-05,
"loss": 0.4754,
"step": 1048
},
{
"epoch": 2.3559797866367207,
"grad_norm": 0.13707469403743744,
"learning_rate": 2.1362628633856836e-05,
"loss": 0.4243,
"step": 1049
},
{
"epoch": 2.3582257158899496,
"grad_norm": 0.1411537230014801,
"learning_rate": 2.1220473870245347e-05,
"loss": 0.463,
"step": 1050
},
{
"epoch": 2.360471645143178,
"grad_norm": 0.1276266723871231,
"learning_rate": 2.1078721304316064e-05,
"loss": 0.4492,
"step": 1051
},
{
"epoch": 2.3627175743964064,
"grad_norm": 0.13482601940631866,
"learning_rate": 2.093737190600793e-05,
"loss": 0.451,
"step": 1052
},
{
"epoch": 2.3649635036496353,
"grad_norm": 0.13639169931411743,
"learning_rate": 2.0796426642501305e-05,
"loss": 0.4458,
"step": 1053
},
{
"epoch": 2.3672094329028637,
"grad_norm": 0.128794863820076,
"learning_rate": 2.065588647821116e-05,
"loss": 0.452,
"step": 1054
},
{
"epoch": 2.369455362156092,
"grad_norm": 0.13202716410160065,
"learning_rate": 2.0515752374780664e-05,
"loss": 0.4405,
"step": 1055
},
{
"epoch": 2.3717012914093205,
"grad_norm": 0.15147733688354492,
"learning_rate": 2.03760252910745e-05,
"loss": 0.451,
"step": 1056
},
{
"epoch": 2.373947220662549,
"grad_norm": 0.13587650656700134,
"learning_rate": 2.023670618317235e-05,
"loss": 0.4373,
"step": 1057
},
{
"epoch": 2.376193149915778,
"grad_norm": 0.1358175277709961,
"learning_rate": 2.009779600436228e-05,
"loss": 0.4628,
"step": 1058
},
{
"epoch": 2.378439079169006,
"grad_norm": 0.13308054208755493,
"learning_rate": 1.995929570513427e-05,
"loss": 0.4517,
"step": 1059
},
{
"epoch": 2.3806850084222346,
"grad_norm": 0.14447179436683655,
"learning_rate": 1.9821206233173756e-05,
"loss": 0.464,
"step": 1060
},
{
"epoch": 2.382930937675463,
"grad_norm": 0.1535249650478363,
"learning_rate": 1.9683528533355077e-05,
"loss": 0.4783,
"step": 1061
},
{
"epoch": 2.385176866928692,
"grad_norm": 0.13172586262226105,
"learning_rate": 1.9546263547735006e-05,
"loss": 0.4451,
"step": 1062
},
{
"epoch": 2.3874227961819203,
"grad_norm": 0.13454264402389526,
"learning_rate": 1.9409412215546385e-05,
"loss": 0.4326,
"step": 1063
},
{
"epoch": 2.3896687254351487,
"grad_norm": 0.13548077642917633,
"learning_rate": 1.9272975473191566e-05,
"loss": 0.4725,
"step": 1064
},
{
"epoch": 2.391914654688377,
"grad_norm": 0.1396332085132599,
"learning_rate": 1.91369542542361e-05,
"loss": 0.4433,
"step": 1065
},
{
"epoch": 2.394160583941606,
"grad_norm": 0.13676691055297852,
"learning_rate": 1.9001349489402374e-05,
"loss": 0.4533,
"step": 1066
},
{
"epoch": 2.3964065131948344,
"grad_norm": 0.138559028506279,
"learning_rate": 1.886616210656314e-05,
"loss": 0.4546,
"step": 1067
},
{
"epoch": 2.398652442448063,
"grad_norm": 0.14537115395069122,
"learning_rate": 1.873139303073529e-05,
"loss": 0.4505,
"step": 1068
},
{
"epoch": 2.4008983717012913,
"grad_norm": 0.14567793905735016,
"learning_rate": 1.859704318407336e-05,
"loss": 0.4494,
"step": 1069
},
{
"epoch": 2.40314430095452,
"grad_norm": 0.16292881965637207,
"learning_rate": 1.8463113485863423e-05,
"loss": 0.4493,
"step": 1070
},
{
"epoch": 2.4053902302077486,
"grad_norm": 0.1402868777513504,
"learning_rate": 1.832960485251661e-05,
"loss": 0.4546,
"step": 1071
},
{
"epoch": 2.407636159460977,
"grad_norm": 0.13375958800315857,
"learning_rate": 1.819651819756297e-05,
"loss": 0.4469,
"step": 1072
},
{
"epoch": 2.4098820887142054,
"grad_norm": 0.14132662117481232,
"learning_rate": 1.80638544316452e-05,
"loss": 0.4505,
"step": 1073
},
{
"epoch": 2.412128017967434,
"grad_norm": 0.13755889236927032,
"learning_rate": 1.7931614462512293e-05,
"loss": 0.4704,
"step": 1074
},
{
"epoch": 2.4143739472206627,
"grad_norm": 0.13184499740600586,
"learning_rate": 1.7799799195013526e-05,
"loss": 0.4369,
"step": 1075
},
{
"epoch": 2.416619876473891,
"grad_norm": 0.13104869425296783,
"learning_rate": 1.7668409531092097e-05,
"loss": 0.4521,
"step": 1076
},
{
"epoch": 2.4188658057271195,
"grad_norm": 0.135769784450531,
"learning_rate": 1.7537446369779072e-05,
"loss": 0.4674,
"step": 1077
},
{
"epoch": 2.421111734980348,
"grad_norm": 0.13897131383419037,
"learning_rate": 1.740691060718712e-05,
"loss": 0.4401,
"step": 1078
},
{
"epoch": 2.423357664233577,
"grad_norm": 0.12773634493350983,
"learning_rate": 1.72768031365045e-05,
"loss": 0.4339,
"step": 1079
},
{
"epoch": 2.425603593486805,
"grad_norm": 0.13083034753799438,
"learning_rate": 1.7147124847988834e-05,
"loss": 0.451,
"step": 1080
},
{
"epoch": 2.4278495227400336,
"grad_norm": 0.13339859247207642,
"learning_rate": 1.7017876628961126e-05,
"loss": 0.4495,
"step": 1081
},
{
"epoch": 2.430095451993262,
"grad_norm": 0.13018065690994263,
"learning_rate": 1.6889059363799623e-05,
"loss": 0.4483,
"step": 1082
},
{
"epoch": 2.432341381246491,
"grad_norm": 0.13034923374652863,
"learning_rate": 1.67606739339338e-05,
"loss": 0.4381,
"step": 1083
},
{
"epoch": 2.4345873104997193,
"grad_norm": 0.1323402225971222,
"learning_rate": 1.6632721217838258e-05,
"loss": 0.4414,
"step": 1084
},
{
"epoch": 2.4368332397529477,
"grad_norm": 0.13824905455112457,
"learning_rate": 1.650520209102677e-05,
"loss": 0.4469,
"step": 1085
},
{
"epoch": 2.439079169006176,
"grad_norm": 0.12723715603351593,
"learning_rate": 1.6378117426046332e-05,
"loss": 0.4551,
"step": 1086
},
{
"epoch": 2.441325098259405,
"grad_norm": 0.12957409024238586,
"learning_rate": 1.6251468092471093e-05,
"loss": 0.4435,
"step": 1087
},
{
"epoch": 2.4435710275126334,
"grad_norm": 0.13387183845043182,
"learning_rate": 1.612525495689651e-05,
"loss": 0.4321,
"step": 1088
},
{
"epoch": 2.445816956765862,
"grad_norm": 0.13002759218215942,
"learning_rate": 1.5999478882933325e-05,
"loss": 0.4461,
"step": 1089
},
{
"epoch": 2.4480628860190903,
"grad_norm": 0.13771192729473114,
"learning_rate": 1.5874140731201694e-05,
"loss": 0.4337,
"step": 1090
},
{
"epoch": 2.4503088152723187,
"grad_norm": 0.13762550055980682,
"learning_rate": 1.574924135932529e-05,
"loss": 0.4435,
"step": 1091
},
{
"epoch": 2.4525547445255476,
"grad_norm": 0.13518671691417694,
"learning_rate": 1.5624781621925462e-05,
"loss": 0.4457,
"step": 1092
},
{
"epoch": 2.454800673778776,
"grad_norm": 0.13244876265525818,
"learning_rate": 1.5500762370615392e-05,
"loss": 0.4466,
"step": 1093
},
{
"epoch": 2.4570466030320044,
"grad_norm": 0.1363506317138672,
"learning_rate": 1.5377184453994232e-05,
"loss": 0.4397,
"step": 1094
},
{
"epoch": 2.4592925322852333,
"grad_norm": 0.13642770051956177,
"learning_rate": 1.5254048717641268e-05,
"loss": 0.4525,
"step": 1095
},
{
"epoch": 2.4615384615384617,
"grad_norm": 0.129640594124794,
"learning_rate": 1.5131356004110234e-05,
"loss": 0.4743,
"step": 1096
},
{
"epoch": 2.46378439079169,
"grad_norm": 0.12901091575622559,
"learning_rate": 1.500910715292343e-05,
"loss": 0.4579,
"step": 1097
},
{
"epoch": 2.4660303200449185,
"grad_norm": 0.1358920782804489,
"learning_rate": 1.4887303000566103e-05,
"loss": 0.4218,
"step": 1098
},
{
"epoch": 2.468276249298147,
"grad_norm": 0.13251328468322754,
"learning_rate": 1.4765944380480633e-05,
"loss": 0.454,
"step": 1099
},
{
"epoch": 2.470522178551376,
"grad_norm": 0.13257341086864471,
"learning_rate": 1.464503212306081e-05,
"loss": 0.4534,
"step": 1100
},
{
"epoch": 2.472768107804604,
"grad_norm": 0.135364830493927,
"learning_rate": 1.4524567055646261e-05,
"loss": 0.4535,
"step": 1101
},
{
"epoch": 2.4750140370578326,
"grad_norm": 0.13053563237190247,
"learning_rate": 1.4404550002516709e-05,
"loss": 0.469,
"step": 1102
},
{
"epoch": 2.477259966311061,
"grad_norm": 0.12724533677101135,
"learning_rate": 1.4284981784886314e-05,
"loss": 0.4409,
"step": 1103
},
{
"epoch": 2.47950589556429,
"grad_norm": 0.13512974977493286,
"learning_rate": 1.4165863220898132e-05,
"loss": 0.4644,
"step": 1104
},
{
"epoch": 2.4817518248175183,
"grad_norm": 0.1417611837387085,
"learning_rate": 1.404719512561843e-05,
"loss": 0.4507,
"step": 1105
},
{
"epoch": 2.4839977540707467,
"grad_norm": 0.13797731697559357,
"learning_rate": 1.3928978311031194e-05,
"loss": 0.4427,
"step": 1106
},
{
"epoch": 2.486243683323975,
"grad_norm": 0.13513045012950897,
"learning_rate": 1.3811213586032506e-05,
"loss": 0.4495,
"step": 1107
},
{
"epoch": 2.4884896125772036,
"grad_norm": 0.13863462209701538,
"learning_rate": 1.369390175642507e-05,
"loss": 0.4447,
"step": 1108
},
{
"epoch": 2.4907355418304324,
"grad_norm": 0.13138817250728607,
"learning_rate": 1.3577043624912602e-05,
"loss": 0.4433,
"step": 1109
},
{
"epoch": 2.492981471083661,
"grad_norm": 0.13766634464263916,
"learning_rate": 1.3460639991094423e-05,
"loss": 0.4569,
"step": 1110
},
{
"epoch": 2.4952274003368893,
"grad_norm": 0.13439221680164337,
"learning_rate": 1.3344691651459987e-05,
"loss": 0.4527,
"step": 1111
},
{
"epoch": 2.497473329590118,
"grad_norm": 0.12861117720603943,
"learning_rate": 1.3229199399383395e-05,
"loss": 0.4226,
"step": 1112
},
{
"epoch": 2.4997192588433466,
"grad_norm": 0.135506734251976,
"learning_rate": 1.3114164025117968e-05,
"loss": 0.4355,
"step": 1113
},
{
"epoch": 2.501965188096575,
"grad_norm": 0.13466140627861023,
"learning_rate": 1.299958631579088e-05,
"loss": 0.4613,
"step": 1114
},
{
"epoch": 2.5042111173498034,
"grad_norm": 0.131247416138649,
"learning_rate": 1.2885467055397691e-05,
"loss": 0.4421,
"step": 1115
},
{
"epoch": 2.506457046603032,
"grad_norm": 0.13447698950767517,
"learning_rate": 1.2771807024797052e-05,
"loss": 0.438,
"step": 1116
},
{
"epoch": 2.5087029758562607,
"grad_norm": 0.14003418385982513,
"learning_rate": 1.2658607001705359e-05,
"loss": 0.4327,
"step": 1117
},
{
"epoch": 2.510948905109489,
"grad_norm": 0.13097427785396576,
"learning_rate": 1.254586776069143e-05,
"loss": 0.4427,
"step": 1118
},
{
"epoch": 2.5131948343627175,
"grad_norm": 0.1318497210741043,
"learning_rate": 1.2433590073171175e-05,
"loss": 0.4516,
"step": 1119
},
{
"epoch": 2.5154407636159464,
"grad_norm": 0.12292584031820297,
"learning_rate": 1.23217747074023e-05,
"loss": 0.4355,
"step": 1120
},
{
"epoch": 2.517686692869175,
"grad_norm": 0.12714707851409912,
"learning_rate": 1.2210422428479122e-05,
"loss": 0.4457,
"step": 1121
},
{
"epoch": 2.519932622122403,
"grad_norm": 0.13449381291866302,
"learning_rate": 1.2099533998327328e-05,
"loss": 0.443,
"step": 1122
},
{
"epoch": 2.5221785513756316,
"grad_norm": 0.1288016140460968,
"learning_rate": 1.1989110175698629e-05,
"loss": 0.4488,
"step": 1123
},
{
"epoch": 2.52442448062886,
"grad_norm": 0.12953847646713257,
"learning_rate": 1.1879151716165782e-05,
"loss": 0.4327,
"step": 1124
},
{
"epoch": 2.5266704098820885,
"grad_norm": 0.1303713619709015,
"learning_rate": 1.1769659372117208e-05,
"loss": 0.4452,
"step": 1125
},
{
"epoch": 2.5289163391353173,
"grad_norm": 0.12560470402240753,
"learning_rate": 1.1660633892752018e-05,
"loss": 0.453,
"step": 1126
},
{
"epoch": 2.5311622683885457,
"grad_norm": 0.1277565062046051,
"learning_rate": 1.1552076024074767e-05,
"loss": 0.4342,
"step": 1127
},
{
"epoch": 2.533408197641774,
"grad_norm": 0.13078947365283966,
"learning_rate": 1.1443986508890438e-05,
"loss": 0.4529,
"step": 1128
},
{
"epoch": 2.535654126895003,
"grad_norm": 0.13425932824611664,
"learning_rate": 1.1336366086799262e-05,
"loss": 0.4608,
"step": 1129
},
{
"epoch": 2.5379000561482314,
"grad_norm": 0.12628474831581116,
"learning_rate": 1.1229215494191724e-05,
"loss": 0.4679,
"step": 1130
},
{
"epoch": 2.54014598540146,
"grad_norm": 0.12629267573356628,
"learning_rate": 1.112253546424352e-05,
"loss": 0.4525,
"step": 1131
},
{
"epoch": 2.5423919146546883,
"grad_norm": 0.1339423656463623,
"learning_rate": 1.1016326726910554e-05,
"loss": 0.4601,
"step": 1132
},
{
"epoch": 2.5446378439079167,
"grad_norm": 0.12335141748189926,
"learning_rate": 1.0910590008923871e-05,
"loss": 0.444,
"step": 1133
},
{
"epoch": 2.5468837731611456,
"grad_norm": 0.12865717709064484,
"learning_rate": 1.0805326033784804e-05,
"loss": 0.4384,
"step": 1134
},
{
"epoch": 2.549129702414374,
"grad_norm": 0.13087087869644165,
"learning_rate": 1.0700535521759874e-05,
"loss": 0.4367,
"step": 1135
},
{
"epoch": 2.5513756316676024,
"grad_norm": 0.12297067791223526,
"learning_rate": 1.0596219189875963e-05,
"loss": 0.4431,
"step": 1136
},
{
"epoch": 2.5536215609208313,
"grad_norm": 0.13361698389053345,
"learning_rate": 1.049237775191542e-05,
"loss": 0.4345,
"step": 1137
},
{
"epoch": 2.5558674901740597,
"grad_norm": 0.1307375282049179,
"learning_rate": 1.0389011918411103e-05,
"loss": 0.469,
"step": 1138
},
{
"epoch": 2.558113419427288,
"grad_norm": 0.13051824271678925,
"learning_rate": 1.0286122396641587e-05,
"loss": 0.464,
"step": 1139
},
{
"epoch": 2.5603593486805165,
"grad_norm": 0.13012929260730743,
"learning_rate": 1.0183709890626301e-05,
"loss": 0.4517,
"step": 1140
},
{
"epoch": 2.562605277933745,
"grad_norm": 0.13006287813186646,
"learning_rate": 1.0081775101120645e-05,
"loss": 0.4565,
"step": 1141
},
{
"epoch": 2.5648512071869733,
"grad_norm": 0.12601535022258759,
"learning_rate": 9.980318725611294e-06,
"loss": 0.4355,
"step": 1142
},
{
"epoch": 2.567097136440202,
"grad_norm": 0.13367784023284912,
"learning_rate": 9.879341458311394e-06,
"loss": 0.459,
"step": 1143
},
{
"epoch": 2.5693430656934306,
"grad_norm": 0.13120903074741364,
"learning_rate": 9.778843990155784e-06,
"loss": 0.4516,
"step": 1144
},
{
"epoch": 2.571588994946659,
"grad_norm": 0.12156583368778229,
"learning_rate": 9.67882700879632e-06,
"loss": 0.4366,
"step": 1145
},
{
"epoch": 2.573834924199888,
"grad_norm": 0.12496156245470047,
"learning_rate": 9.57929119859708e-06,
"loss": 0.4503,
"step": 1146
},
{
"epoch": 2.5760808534531163,
"grad_norm": 0.1285991668701172,
"learning_rate": 9.480237240629794e-06,
"loss": 0.4546,
"step": 1147
},
{
"epoch": 2.5783267827063447,
"grad_norm": 0.12715794146060944,
"learning_rate": 9.381665812669074e-06,
"loss": 0.4353,
"step": 1148
},
{
"epoch": 2.580572711959573,
"grad_norm": 0.12791746854782104,
"learning_rate": 9.283577589187884e-06,
"loss": 0.4783,
"step": 1149
},
{
"epoch": 2.5828186412128016,
"grad_norm": 0.12204549461603165,
"learning_rate": 9.185973241352859e-06,
"loss": 0.4475,
"step": 1150
},
{
"epoch": 2.5850645704660304,
"grad_norm": 0.12769286334514618,
"learning_rate": 9.088853437019688e-06,
"loss": 0.44,
"step": 1151
},
{
"epoch": 2.587310499719259,
"grad_norm": 0.12649452686309814,
"learning_rate": 8.99221884072862e-06,
"loss": 0.44,
"step": 1152
},
{
"epoch": 2.5895564289724873,
"grad_norm": 0.12873002886772156,
"learning_rate": 8.896070113699874e-06,
"loss": 0.4356,
"step": 1153
},
{
"epoch": 2.591802358225716,
"grad_norm": 0.12493567168712616,
"learning_rate": 8.800407913829088e-06,
"loss": 0.456,
"step": 1154
},
{
"epoch": 2.5940482874789446,
"grad_norm": 0.12773042917251587,
"learning_rate": 8.705232895682906e-06,
"loss": 0.4502,
"step": 1155
},
{
"epoch": 2.596294216732173,
"grad_norm": 0.1301664263010025,
"learning_rate": 8.610545710494356e-06,
"loss": 0.441,
"step": 1156
},
{
"epoch": 2.5985401459854014,
"grad_norm": 0.136691614985466,
"learning_rate": 8.516347006158567e-06,
"loss": 0.4451,
"step": 1157
},
{
"epoch": 2.60078607523863,
"grad_norm": 0.12582361698150635,
"learning_rate": 8.422637427228193e-06,
"loss": 0.4477,
"step": 1158
},
{
"epoch": 2.6030320044918582,
"grad_norm": 0.12166401743888855,
"learning_rate": 8.329417614909094e-06,
"loss": 0.4402,
"step": 1159
},
{
"epoch": 2.605277933745087,
"grad_norm": 0.12802627682685852,
"learning_rate": 8.236688207055885e-06,
"loss": 0.4545,
"step": 1160
},
{
"epoch": 2.6075238629983155,
"grad_norm": 0.1304531693458557,
"learning_rate": 8.144449838167579e-06,
"loss": 0.4655,
"step": 1161
},
{
"epoch": 2.609769792251544,
"grad_norm": 0.12477454543113708,
"learning_rate": 8.052703139383315e-06,
"loss": 0.4568,
"step": 1162
},
{
"epoch": 2.612015721504773,
"grad_norm": 0.12605507671833038,
"learning_rate": 7.96144873847796e-06,
"loss": 0.4558,
"step": 1163
},
{
"epoch": 2.614261650758001,
"grad_norm": 0.12706461548805237,
"learning_rate": 7.870687259857858e-06,
"loss": 0.4343,
"step": 1164
},
{
"epoch": 2.6165075800112296,
"grad_norm": 0.12751144170761108,
"learning_rate": 7.78041932455655e-06,
"loss": 0.4554,
"step": 1165
},
{
"epoch": 2.618753509264458,
"grad_norm": 0.12677204608917236,
"learning_rate": 7.690645550230482e-06,
"loss": 0.4587,
"step": 1166
},
{
"epoch": 2.6209994385176865,
"grad_norm": 0.12588229775428772,
"learning_rate": 7.6013665511548114e-06,
"loss": 0.4358,
"step": 1167
},
{
"epoch": 2.6232453677709153,
"grad_norm": 0.12063749879598618,
"learning_rate": 7.512582938219259e-06,
"loss": 0.4384,
"step": 1168
},
{
"epoch": 2.6254912970241437,
"grad_norm": 0.12080162763595581,
"learning_rate": 7.424295318923831e-06,
"loss": 0.4542,
"step": 1169
},
{
"epoch": 2.627737226277372,
"grad_norm": 0.12560433149337769,
"learning_rate": 7.336504297374749e-06,
"loss": 0.4493,
"step": 1170
},
{
"epoch": 2.629983155530601,
"grad_norm": 9.130139350891113,
"learning_rate": 7.249210474280208e-06,
"loss": 0.4636,
"step": 1171
},
{
"epoch": 2.6322290847838294,
"grad_norm": 0.12350396066904068,
"learning_rate": 7.162414446946395e-06,
"loss": 0.4543,
"step": 1172
},
{
"epoch": 2.634475014037058,
"grad_norm": 0.12666672468185425,
"learning_rate": 7.076116809273323e-06,
"loss": 0.4633,
"step": 1173
},
{
"epoch": 2.6367209432902863,
"grad_norm": 0.12505994737148285,
"learning_rate": 6.990318151750757e-06,
"loss": 0.4401,
"step": 1174
},
{
"epoch": 2.6389668725435147,
"grad_norm": 0.1194506362080574,
"learning_rate": 6.9050190614542565e-06,
"loss": 0.4625,
"step": 1175
},
{
"epoch": 2.6412128017967436,
"grad_norm": 0.12401262670755386,
"learning_rate": 6.8202201220410255e-06,
"loss": 0.4357,
"step": 1176
},
{
"epoch": 2.643458731049972,
"grad_norm": 0.12455414235591888,
"learning_rate": 6.73592191374607e-06,
"loss": 0.4494,
"step": 1177
},
{
"epoch": 2.6457046603032004,
"grad_norm": 0.12066637724637985,
"learning_rate": 6.652125013378108e-06,
"loss": 0.4565,
"step": 1178
},
{
"epoch": 2.647950589556429,
"grad_norm": 0.12697719037532806,
"learning_rate": 6.5688299943157e-06,
"loss": 0.4434,
"step": 1179
},
{
"epoch": 2.6501965188096577,
"grad_norm": 0.12216756492853165,
"learning_rate": 6.486037426503276e-06,
"loss": 0.4461,
"step": 1180
},
{
"epoch": 2.652442448062886,
"grad_norm": 0.12145403027534485,
"learning_rate": 6.403747876447232e-06,
"loss": 0.4506,
"step": 1181
},
{
"epoch": 2.6546883773161145,
"grad_norm": 0.11756281554698944,
"learning_rate": 6.321961907212109e-06,
"loss": 0.463,
"step": 1182
},
{
"epoch": 2.656934306569343,
"grad_norm": 0.12291593104600906,
"learning_rate": 6.240680078416699e-06,
"loss": 0.4538,
"step": 1183
},
{
"epoch": 2.6591802358225713,
"grad_norm": 0.12477383762598038,
"learning_rate": 6.15990294623023e-06,
"loss": 0.456,
"step": 1184
},
{
"epoch": 2.6614261650758,
"grad_norm": 0.12275049090385437,
"learning_rate": 6.079631063368547e-06,
"loss": 0.4443,
"step": 1185
},
{
"epoch": 2.6636720943290286,
"grad_norm": 0.12498319894075394,
"learning_rate": 5.999864979090326e-06,
"loss": 0.4487,
"step": 1186
},
{
"epoch": 2.665918023582257,
"grad_norm": 0.11939443647861481,
"learning_rate": 5.92060523919332e-06,
"loss": 0.4285,
"step": 1187
},
{
"epoch": 2.668163952835486,
"grad_norm": 0.12449135631322861,
"learning_rate": 5.8418523860106665e-06,
"loss": 0.4609,
"step": 1188
},
{
"epoch": 2.6704098820887143,
"grad_norm": 0.12374921143054962,
"learning_rate": 5.763606958407116e-06,
"loss": 0.4441,
"step": 1189
},
{
"epoch": 2.6726558113419427,
"grad_norm": 0.11954803764820099,
"learning_rate": 5.6858694917754e-06,
"loss": 0.4566,
"step": 1190
},
{
"epoch": 2.674901740595171,
"grad_norm": 0.12245208770036697,
"learning_rate": 5.6086405180324665e-06,
"loss": 0.4519,
"step": 1191
},
{
"epoch": 2.6771476698483996,
"grad_norm": 0.1250237375497818,
"learning_rate": 5.531920565616e-06,
"loss": 0.4364,
"step": 1192
},
{
"epoch": 2.6793935991016284,
"grad_norm": 0.12335599958896637,
"learning_rate": 5.455710159480649e-06,
"loss": 0.4513,
"step": 1193
},
{
"epoch": 2.681639528354857,
"grad_norm": 0.12619943916797638,
"learning_rate": 5.380009821094536e-06,
"loss": 0.4531,
"step": 1194
},
{
"epoch": 2.6838854576080853,
"grad_norm": 0.1240544244647026,
"learning_rate": 5.30482006843565e-06,
"loss": 0.4396,
"step": 1195
},
{
"epoch": 2.686131386861314,
"grad_norm": 0.12158697843551636,
"learning_rate": 5.230141415988312e-06,
"loss": 0.4426,
"step": 1196
},
{
"epoch": 2.6883773161145426,
"grad_norm": 0.12433162331581116,
"learning_rate": 5.155974374739634e-06,
"loss": 0.447,
"step": 1197
},
{
"epoch": 2.690623245367771,
"grad_norm": 0.12310656160116196,
"learning_rate": 5.082319452176068e-06,
"loss": 0.4359,
"step": 1198
},
{
"epoch": 2.6928691746209994,
"grad_norm": 0.11813896149396896,
"learning_rate": 5.009177152279865e-06,
"loss": 0.4538,
"step": 1199
},
{
"epoch": 2.695115103874228,
"grad_norm": 0.12028888612985611,
"learning_rate": 4.936547975525692e-06,
"loss": 0.4334,
"step": 1200
},
{
"epoch": 2.6973610331274562,
"grad_norm": 0.1224963515996933,
"learning_rate": 4.864432418877192e-06,
"loss": 0.4454,
"step": 1201
},
{
"epoch": 2.699606962380685,
"grad_norm": 0.12296409159898758,
"learning_rate": 4.792830975783531e-06,
"loss": 0.4439,
"step": 1202
},
{
"epoch": 2.7018528916339135,
"grad_norm": 0.11706443876028061,
"learning_rate": 4.721744136176103e-06,
"loss": 0.4288,
"step": 1203
},
{
"epoch": 2.704098820887142,
"grad_norm": 0.12277070432901382,
"learning_rate": 4.651172386465152e-06,
"loss": 0.454,
"step": 1204
},
{
"epoch": 2.706344750140371,
"grad_norm": 0.12013454735279083,
"learning_rate": 4.581116209536358e-06,
"loss": 0.4405,
"step": 1205
},
{
"epoch": 2.708590679393599,
"grad_norm": 0.12198374420404434,
"learning_rate": 4.511576084747696e-06,
"loss": 0.4646,
"step": 1206
},
{
"epoch": 2.7108366086468276,
"grad_norm": 0.11776817589998245,
"learning_rate": 4.442552487925982e-06,
"loss": 0.4494,
"step": 1207
},
{
"epoch": 2.713082537900056,
"grad_norm": 0.12356902658939362,
"learning_rate": 4.3740458913637605e-06,
"loss": 0.4578,
"step": 1208
},
{
"epoch": 2.7153284671532845,
"grad_norm": 0.11953306198120117,
"learning_rate": 4.3060567638159775e-06,
"loss": 0.4379,
"step": 1209
},
{
"epoch": 2.7175743964065133,
"grad_norm": 0.12432871758937836,
"learning_rate": 4.238585570496847e-06,
"loss": 0.4441,
"step": 1210
},
{
"epoch": 2.7198203256597417,
"grad_norm": 0.11917420476675034,
"learning_rate": 4.171632773076581e-06,
"loss": 0.4477,
"step": 1211
},
{
"epoch": 2.72206625491297,
"grad_norm": 0.11728362739086151,
"learning_rate": 4.105198829678285e-06,
"loss": 0.4638,
"step": 1212
},
{
"epoch": 2.724312184166199,
"grad_norm": 0.1192561611533165,
"learning_rate": 4.039284194874862e-06,
"loss": 0.427,
"step": 1213
},
{
"epoch": 2.7265581134194274,
"grad_norm": 0.11842131614685059,
"learning_rate": 3.973889319685809e-06,
"loss": 0.4321,
"step": 1214
},
{
"epoch": 2.728804042672656,
"grad_norm": 0.11767691373825073,
"learning_rate": 3.909014651574197e-06,
"loss": 0.4407,
"step": 1215
},
{
"epoch": 2.7310499719258843,
"grad_norm": 0.1151251420378685,
"learning_rate": 3.844660634443616e-06,
"loss": 0.4472,
"step": 1216
},
{
"epoch": 2.7332959011791127,
"grad_norm": 0.1207621842622757,
"learning_rate": 3.7808277086350464e-06,
"loss": 0.4326,
"step": 1217
},
{
"epoch": 2.735541830432341,
"grad_norm": 0.11696569621562958,
"learning_rate": 3.7175163109239855e-06,
"loss": 0.4421,
"step": 1218
},
{
"epoch": 2.73778775968557,
"grad_norm": 0.11953862756490707,
"learning_rate": 3.6547268745173247e-06,
"loss": 0.4382,
"step": 1219
},
{
"epoch": 2.7400336889387984,
"grad_norm": 0.12477323412895203,
"learning_rate": 3.5924598290504855e-06,
"loss": 0.4477,
"step": 1220
},
{
"epoch": 2.742279618192027,
"grad_norm": 0.11988485604524612,
"learning_rate": 3.530715600584449e-06,
"loss": 0.4432,
"step": 1221
},
{
"epoch": 2.7445255474452557,
"grad_norm": 0.12258612364530563,
"learning_rate": 3.469494611602775e-06,
"loss": 0.4575,
"step": 1222
},
{
"epoch": 2.746771476698484,
"grad_norm": 0.125362828373909,
"learning_rate": 3.4087972810088287e-06,
"loss": 0.4358,
"step": 1223
},
{
"epoch": 2.7490174059517125,
"grad_norm": 0.11876025050878525,
"learning_rate": 3.348624024122824e-06,
"loss": 0.4377,
"step": 1224
},
{
"epoch": 2.751263335204941,
"grad_norm": 0.1166381686925888,
"learning_rate": 3.2889752526790165e-06,
"loss": 0.4348,
"step": 1225
},
{
"epoch": 2.7535092644581693,
"grad_norm": 0.1194562315940857,
"learning_rate": 3.2298513748228787e-06,
"loss": 0.4443,
"step": 1226
},
{
"epoch": 2.755755193711398,
"grad_norm": 0.11869972944259644,
"learning_rate": 3.1712527951083126e-06,
"loss": 0.4479,
"step": 1227
},
{
"epoch": 2.7580011229646266,
"grad_norm": 0.11969739198684692,
"learning_rate": 3.1131799144948683e-06,
"loss": 0.454,
"step": 1228
},
{
"epoch": 2.760247052217855,
"grad_norm": 0.12087547779083252,
"learning_rate": 3.0556331303450437e-06,
"loss": 0.4341,
"step": 1229
},
{
"epoch": 2.762492981471084,
"grad_norm": 0.12332521378993988,
"learning_rate": 2.998612836421506e-06,
"loss": 0.4513,
"step": 1230
},
{
"epoch": 2.7647389107243123,
"grad_norm": 0.12205971032381058,
"learning_rate": 2.9421194228844084e-06,
"loss": 0.4501,
"step": 1231
},
{
"epoch": 2.7669848399775407,
"grad_norm": 0.12263938784599304,
"learning_rate": 2.88615327628877e-06,
"loss": 0.4504,
"step": 1232
},
{
"epoch": 2.769230769230769,
"grad_norm": 0.12588439881801605,
"learning_rate": 2.830714779581776e-06,
"loss": 0.4397,
"step": 1233
},
{
"epoch": 2.7714766984839976,
"grad_norm": 0.12059302628040314,
"learning_rate": 2.7758043121001834e-06,
"loss": 0.4354,
"step": 1234
},
{
"epoch": 2.7737226277372264,
"grad_norm": 0.11515524238348007,
"learning_rate": 2.721422249567729e-06,
"loss": 0.4495,
"step": 1235
},
{
"epoch": 2.775968556990455,
"grad_norm": 0.11858617514371872,
"learning_rate": 2.667568964092544e-06,
"loss": 0.4572,
"step": 1236
},
{
"epoch": 2.7782144862436833,
"grad_norm": 0.1132800504565239,
"learning_rate": 2.6142448241646046e-06,
"loss": 0.4492,
"step": 1237
},
{
"epoch": 2.7804604154969117,
"grad_norm": 0.12041954696178436,
"learning_rate": 2.561450194653219e-06,
"loss": 0.444,
"step": 1238
},
{
"epoch": 2.7827063447501406,
"grad_norm": 0.1182764321565628,
"learning_rate": 2.509185436804549e-06,
"loss": 0.4338,
"step": 1239
},
{
"epoch": 2.784952274003369,
"grad_norm": 0.13408203423023224,
"learning_rate": 2.457450908239114e-06,
"loss": 0.4487,
"step": 1240
},
{
"epoch": 2.7871982032565974,
"grad_norm": 0.12381100654602051,
"learning_rate": 2.4062469629493412e-06,
"loss": 0.4364,
"step": 1241
},
{
"epoch": 2.789444132509826,
"grad_norm": 0.12140806019306183,
"learning_rate": 2.3555739512971565e-06,
"loss": 0.4296,
"step": 1242
},
{
"epoch": 2.7916900617630542,
"grad_norm": 0.12192130833864212,
"learning_rate": 2.3054322200115963e-06,
"loss": 0.4537,
"step": 1243
},
{
"epoch": 2.793935991016283,
"grad_norm": 0.11975108832120895,
"learning_rate": 2.255822112186401e-06,
"loss": 0.4735,
"step": 1244
},
{
"epoch": 2.7961819202695115,
"grad_norm": 0.12235341221094131,
"learning_rate": 2.2067439672777047e-06,
"loss": 0.451,
"step": 1245
},
{
"epoch": 2.79842784952274,
"grad_norm": 0.12043313682079315,
"learning_rate": 2.158198121101691e-06,
"loss": 0.4381,
"step": 1246
},
{
"epoch": 2.800673778775969,
"grad_norm": 0.12066707760095596,
"learning_rate": 2.1101849058322932e-06,
"loss": 0.4435,
"step": 1247
},
{
"epoch": 2.802919708029197,
"grad_norm": 0.1135956272482872,
"learning_rate": 2.062704649998937e-06,
"loss": 0.4406,
"step": 1248
},
{
"epoch": 2.8051656372824256,
"grad_norm": 0.12277340143918991,
"learning_rate": 2.0157576784843024e-06,
"loss": 0.4661,
"step": 1249
},
{
"epoch": 2.807411566535654,
"grad_norm": 0.11728162318468094,
"learning_rate": 1.9693443125220346e-06,
"loss": 0.4431,
"step": 1250
},
{
"epoch": 2.8096574957888825,
"grad_norm": 0.11668264865875244,
"learning_rate": 1.9234648696946354e-06,
"loss": 0.4388,
"step": 1251
},
{
"epoch": 2.8119034250421113,
"grad_norm": 0.11695986986160278,
"learning_rate": 1.878119663931246e-06,
"loss": 0.4494,
"step": 1252
},
{
"epoch": 2.8141493542953397,
"grad_norm": 0.1159198209643364,
"learning_rate": 1.833309005505477e-06,
"loss": 0.4311,
"step": 1253
},
{
"epoch": 2.816395283548568,
"grad_norm": 0.1172918975353241,
"learning_rate": 1.7890332010333233e-06,
"loss": 0.4621,
"step": 1254
},
{
"epoch": 2.8186412128017966,
"grad_norm": 0.12139487266540527,
"learning_rate": 1.7452925534710763e-06,
"loss": 0.4393,
"step": 1255
},
{
"epoch": 2.8208871420550254,
"grad_norm": 0.11837179213762283,
"learning_rate": 1.7020873621131738e-06,
"loss": 0.447,
"step": 1256
},
{
"epoch": 2.823133071308254,
"grad_norm": 0.12008003145456314,
"learning_rate": 1.6594179225902652e-06,
"loss": 0.4516,
"step": 1257
},
{
"epoch": 2.8253790005614823,
"grad_norm": 0.11927176266908646,
"learning_rate": 1.617284526867078e-06,
"loss": 0.4404,
"step": 1258
},
{
"epoch": 2.8276249298147107,
"grad_norm": 0.11693605035543442,
"learning_rate": 1.5756874632405095e-06,
"loss": 0.4438,
"step": 1259
},
{
"epoch": 2.829870859067939,
"grad_norm": 0.11941110342741013,
"learning_rate": 1.534627016337593e-06,
"loss": 0.4426,
"step": 1260
},
{
"epoch": 2.832116788321168,
"grad_norm": 0.11750718951225281,
"learning_rate": 1.494103467113588e-06,
"loss": 0.4322,
"step": 1261
},
{
"epoch": 2.8343627175743964,
"grad_norm": 0.116007000207901,
"learning_rate": 1.4541170928500248e-06,
"loss": 0.4621,
"step": 1262
},
{
"epoch": 2.836608646827625,
"grad_norm": 0.11818964034318924,
"learning_rate": 1.4146681671528418e-06,
"loss": 0.4638,
"step": 1263
},
{
"epoch": 2.8388545760808537,
"grad_norm": 0.11916031688451767,
"learning_rate": 1.3757569599504917e-06,
"loss": 0.4425,
"step": 1264
},
{
"epoch": 2.841100505334082,
"grad_norm": 0.11497969180345535,
"learning_rate": 1.3373837374920862e-06,
"loss": 0.4425,
"step": 1265
},
{
"epoch": 2.8433464345873105,
"grad_norm": 0.11767168343067169,
"learning_rate": 1.2995487623456194e-06,
"loss": 0.4532,
"step": 1266
},
{
"epoch": 2.845592363840539,
"grad_norm": 0.115963876247406,
"learning_rate": 1.2622522933961112e-06,
"loss": 0.4344,
"step": 1267
},
{
"epoch": 2.8478382930937673,
"grad_norm": 0.11714527010917664,
"learning_rate": 1.225494585843876e-06,
"loss": 0.4678,
"step": 1268
},
{
"epoch": 2.850084222346996,
"grad_norm": 0.11749914288520813,
"learning_rate": 1.1892758912027546e-06,
"loss": 0.4445,
"step": 1269
},
{
"epoch": 2.8523301516002246,
"grad_norm": 0.11820235848426819,
"learning_rate": 1.1535964572984093e-06,
"loss": 0.4659,
"step": 1270
},
{
"epoch": 2.854576080853453,
"grad_norm": 0.11582965403795242,
"learning_rate": 1.118456528266636e-06,
"loss": 0.4441,
"step": 1271
},
{
"epoch": 2.856822010106682,
"grad_norm": 0.11765659600496292,
"learning_rate": 1.0838563445516503e-06,
"loss": 0.4441,
"step": 1272
},
{
"epoch": 2.8590679393599103,
"grad_norm": 0.11495634913444519,
"learning_rate": 1.0497961429044979e-06,
"loss": 0.4397,
"step": 1273
},
{
"epoch": 2.8613138686131387,
"grad_norm": 0.13958555459976196,
"learning_rate": 1.0162761563813927e-06,
"loss": 0.4332,
"step": 1274
},
{
"epoch": 2.863559797866367,
"grad_norm": 0.11957214772701263,
"learning_rate": 9.832966143421551e-07,
"loss": 0.4476,
"step": 1275
},
{
"epoch": 2.8658057271195956,
"grad_norm": 0.12185267359018326,
"learning_rate": 9.508577424486031e-07,
"loss": 0.4571,
"step": 1276
},
{
"epoch": 2.868051656372824,
"grad_norm": 0.1512320339679718,
"learning_rate": 9.18959762663043e-07,
"loss": 0.4322,
"step": 1277
},
{
"epoch": 2.870297585626053,
"grad_norm": 0.11999038606882095,
"learning_rate": 8.876028932467417e-07,
"loss": 0.4399,
"step": 1278
},
{
"epoch": 2.8725435148792813,
"grad_norm": 0.11745017766952515,
"learning_rate": 8.567873487584077e-07,
"loss": 0.444,
"step": 1279
},
{
"epoch": 2.8747894441325097,
"grad_norm": 0.12399045377969742,
"learning_rate": 8.265133400527881e-07,
"loss": 0.4421,
"step": 1280
},
{
"epoch": 2.8770353733857386,
"grad_norm": 0.11552898585796356,
"learning_rate": 7.967810742791404e-07,
"loss": 0.4369,
"step": 1281
},
{
"epoch": 2.879281302638967,
"grad_norm": 0.1216784194111824,
"learning_rate": 7.675907548798744e-07,
"loss": 0.4597,
"step": 1282
},
{
"epoch": 2.8815272318921954,
"grad_norm": 0.12029793858528137,
"learning_rate": 7.389425815891394e-07,
"loss": 0.4455,
"step": 1283
},
{
"epoch": 2.883773161145424,
"grad_norm": 0.11763288825750351,
"learning_rate": 7.108367504314651e-07,
"loss": 0.4422,
"step": 1284
},
{
"epoch": 2.8860190903986522,
"grad_norm": 0.11679881066083908,
"learning_rate": 6.832734537204299e-07,
"loss": 0.4525,
"step": 1285
},
{
"epoch": 2.888265019651881,
"grad_norm": 0.11851628869771957,
"learning_rate": 6.562528800572931e-07,
"loss": 0.4435,
"step": 1286
},
{
"epoch": 2.8905109489051095,
"grad_norm": 0.11480539292097092,
"learning_rate": 6.297752143297864e-07,
"loss": 0.4484,
"step": 1287
},
{
"epoch": 2.892756878158338,
"grad_norm": 0.12258218973875046,
"learning_rate": 6.03840637710782e-07,
"loss": 0.4504,
"step": 1288
},
{
"epoch": 2.895002807411567,
"grad_norm": 0.12160119414329529,
"learning_rate": 5.784493276570669e-07,
"loss": 0.4401,
"step": 1289
},
{
"epoch": 2.897248736664795,
"grad_norm": 0.1183420866727829,
"learning_rate": 5.536014579081617e-07,
"loss": 0.4523,
"step": 1290
},
{
"epoch": 2.8994946659180236,
"grad_norm": 0.1185230165719986,
"learning_rate": 5.292971984850948e-07,
"loss": 0.4497,
"step": 1291
},
{
"epoch": 2.901740595171252,
"grad_norm": 0.11411769688129425,
"learning_rate": 5.055367156892654e-07,
"loss": 0.4436,
"step": 1292
},
{
"epoch": 2.9039865244244805,
"grad_norm": 0.11810418963432312,
"learning_rate": 4.823201721012538e-07,
"loss": 0.4435,
"step": 1293
},
{
"epoch": 2.906232453677709,
"grad_norm": 0.11871050298213959,
"learning_rate": 4.5964772657980827e-07,
"loss": 0.4512,
"step": 1294
},
{
"epoch": 2.9084783829309377,
"grad_norm": 0.12631046772003174,
"learning_rate": 4.375195342606464e-07,
"loss": 0.4352,
"step": 1295
},
{
"epoch": 2.910724312184166,
"grad_norm": 0.11332812160253525,
"learning_rate": 4.159357465554603e-07,
"loss": 0.4344,
"step": 1296
},
{
"epoch": 2.9129702414373946,
"grad_norm": 0.11570383608341217,
"learning_rate": 3.9489651115087734e-07,
"loss": 0.4491,
"step": 1297
},
{
"epoch": 2.9152161706906234,
"grad_norm": 0.1170554980635643,
"learning_rate": 3.7440197200741214e-07,
"loss": 0.4314,
"step": 1298
},
{
"epoch": 2.917462099943852,
"grad_norm": 0.11701026558876038,
"learning_rate": 3.544522693585428e-07,
"loss": 0.449,
"step": 1299
},
{
"epoch": 2.9197080291970803,
"grad_norm": 0.11610274761915207,
"learning_rate": 3.3504753970968083e-07,
"loss": 0.4493,
"step": 1300
},
{
"epoch": 2.9219539584503087,
"grad_norm": 0.1187182143330574,
"learning_rate": 3.1618791583729157e-07,
"loss": 0.4714,
"step": 1301
},
{
"epoch": 2.924199887703537,
"grad_norm": 0.11808615922927856,
"learning_rate": 2.97873526787944e-07,
"loss": 0.4494,
"step": 1302
},
{
"epoch": 2.926445816956766,
"grad_norm": 0.11943615227937698,
"learning_rate": 2.801044978774758e-07,
"loss": 0.444,
"step": 1303
},
{
"epoch": 2.9286917462099944,
"grad_norm": 0.11159630864858627,
"learning_rate": 2.6288095069009647e-07,
"loss": 0.4365,
"step": 1304
},
{
"epoch": 2.930937675463223,
"grad_norm": 0.11759793758392334,
"learning_rate": 2.4620300307756975e-07,
"loss": 0.4449,
"step": 1305
},
{
"epoch": 2.9331836047164517,
"grad_norm": 0.11761987954378128,
"learning_rate": 2.30070769158397e-07,
"loss": 0.4392,
"step": 1306
},
{
"epoch": 2.93542953396968,
"grad_norm": 0.11657937616109848,
"learning_rate": 2.1448435931705315e-07,
"loss": 0.4361,
"step": 1307
},
{
"epoch": 2.9376754632229085,
"grad_norm": 0.11725448071956635,
"learning_rate": 1.994438802032228e-07,
"loss": 0.4267,
"step": 1308
},
{
"epoch": 2.939921392476137,
"grad_norm": 0.12075719982385635,
"learning_rate": 1.8494943473108095e-07,
"loss": 0.4495,
"step": 1309
},
{
"epoch": 2.9421673217293653,
"grad_norm": 0.11142679303884506,
"learning_rate": 1.710011220785557e-07,
"loss": 0.4275,
"step": 1310
},
{
"epoch": 2.944413250982594,
"grad_norm": 0.1148485466837883,
"learning_rate": 1.575990376866976e-07,
"loss": 0.4362,
"step": 1311
},
{
"epoch": 2.9466591802358226,
"grad_norm": 0.11960410326719284,
"learning_rate": 1.4474327325897818e-07,
"loss": 0.4507,
"step": 1312
},
{
"epoch": 2.948905109489051,
"grad_norm": 0.11774080991744995,
"learning_rate": 1.324339167607036e-07,
"loss": 0.4571,
"step": 1313
},
{
"epoch": 2.9511510387422795,
"grad_norm": 0.11617586016654968,
"learning_rate": 1.2067105241839294e-07,
"loss": 0.4501,
"step": 1314
},
{
"epoch": 2.9533969679955083,
"grad_norm": 0.11817507445812225,
"learning_rate": 1.0945476071918316e-07,
"loss": 0.4471,
"step": 1315
},
{
"epoch": 2.9556428972487367,
"grad_norm": 0.12056715786457062,
"learning_rate": 9.878511841034056e-08,
"loss": 0.4382,
"step": 1316
},
{
"epoch": 2.957888826501965,
"grad_norm": 0.11966580897569656,
"learning_rate": 8.866219849864799e-08,
"loss": 0.4471,
"step": 1317
},
{
"epoch": 2.9601347557551936,
"grad_norm": 0.11672661453485489,
"learning_rate": 7.908607024999626e-08,
"loss": 0.4636,
"step": 1318
},
{
"epoch": 2.962380685008422,
"grad_norm": 0.12275572121143341,
"learning_rate": 7.005679918882457e-08,
"loss": 0.4388,
"step": 1319
},
{
"epoch": 2.964626614261651,
"grad_norm": 0.12186376005411148,
"learning_rate": 6.157444709773863e-08,
"loss": 0.4538,
"step": 1320
},
{
"epoch": 2.9668725435148793,
"grad_norm": 0.11584927141666412,
"learning_rate": 5.3639072017057647e-08,
"loss": 0.4464,
"step": 1321
},
{
"epoch": 2.9691184727681077,
"grad_norm": 0.11662715673446655,
"learning_rate": 4.625072824441468e-08,
"loss": 0.439,
"step": 1322
},
{
"epoch": 2.9713644020213366,
"grad_norm": 0.11534745246171951,
"learning_rate": 3.940946633440135e-08,
"loss": 0.4496,
"step": 1323
},
{
"epoch": 2.973610331274565,
"grad_norm": 0.11853344738483429,
"learning_rate": 3.3115333098212576e-08,
"loss": 0.4498,
"step": 1324
},
{
"epoch": 2.9758562605277934,
"grad_norm": 0.11632394790649414,
"learning_rate": 2.7368371603326838e-08,
"loss": 0.4311,
"step": 1325
},
{
"epoch": 2.978102189781022,
"grad_norm": 0.11567545682191849,
"learning_rate": 2.216862117319529e-08,
"loss": 0.4427,
"step": 1326
},
{
"epoch": 2.9803481190342502,
"grad_norm": 0.11598379909992218,
"learning_rate": 1.7516117387010866e-08,
"loss": 0.4452,
"step": 1327
},
{
"epoch": 2.982594048287479,
"grad_norm": 0.12003415077924728,
"learning_rate": 1.3410892079432914e-08,
"loss": 0.4408,
"step": 1328
},
{
"epoch": 2.9848399775407075,
"grad_norm": 0.11351985484361649,
"learning_rate": 9.85297334037405e-09,
"loss": 0.4529,
"step": 1329
},
{
"epoch": 2.987085906793936,
"grad_norm": 0.11411769688129425,
"learning_rate": 6.842385514831407e-09,
"loss": 0.434,
"step": 1330
},
{
"epoch": 2.9893318360471643,
"grad_norm": 0.12261338531970978,
"learning_rate": 4.3791492026734604e-09,
"loss": 0.4534,
"step": 1331
},
{
"epoch": 2.991577765300393,
"grad_norm": 0.11999525874853134,
"learning_rate": 2.463281258560102e-09,
"loss": 0.448,
"step": 1332
},
{
"epoch": 2.9938236945536216,
"grad_norm": 0.11168470978736877,
"learning_rate": 1.094794791764997e-09,
"loss": 0.4408,
"step": 1333
},
{
"epoch": 2.99606962380685,
"grad_norm": 0.11737479269504547,
"learning_rate": 2.736991661400623e-10,
"loss": 0.4357,
"step": 1334
},
{
"epoch": 2.9983155530600785,
"grad_norm": 0.11935008317232132,
"learning_rate": 0.0,
"loss": 0.4366,
"step": 1335
},
{
"epoch": 2.9983155530600785,
"step": 1335,
"total_flos": 4.209303851158733e+19,
"train_loss": 0.5228641195690141,
"train_runtime": 86845.4923,
"train_samples_per_second": 3.937,
"train_steps_per_second": 0.015
}
],
"logging_steps": 1,
"max_steps": 1335,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.209303851158733e+19,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}