{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.997830802603037,
"eval_steps": 500,
"global_step": 691,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0028922631959508315,
"grad_norm": 48.71332931518555,
"learning_rate": 0.0,
"loss": 3.3684,
"step": 1
},
{
"epoch": 0.005784526391901663,
"grad_norm": 45.838565826416016,
"learning_rate": 1.4285714285714286e-06,
"loss": 3.2845,
"step": 2
},
{
"epoch": 0.008676789587852495,
"grad_norm": 56.195335388183594,
"learning_rate": 2.8571428571428573e-06,
"loss": 3.5006,
"step": 3
},
{
"epoch": 0.011569052783803326,
"grad_norm": 21.180103302001953,
"learning_rate": 4.285714285714286e-06,
"loss": 3.0654,
"step": 4
},
{
"epoch": 0.014461315979754157,
"grad_norm": 21.839435577392578,
"learning_rate": 5.7142857142857145e-06,
"loss": 2.9674,
"step": 5
},
{
"epoch": 0.01735357917570499,
"grad_norm": 9.650607109069824,
"learning_rate": 7.142857142857143e-06,
"loss": 2.9594,
"step": 6
},
{
"epoch": 0.02024584237165582,
"grad_norm": 6.312131881713867,
"learning_rate": 8.571428571428573e-06,
"loss": 2.8589,
"step": 7
},
{
"epoch": 0.023138105567606652,
"grad_norm": 6.745247840881348,
"learning_rate": 1e-05,
"loss": 2.6967,
"step": 8
},
{
"epoch": 0.026030368763557483,
"grad_norm": 5.504076957702637,
"learning_rate": 1.1428571428571429e-05,
"loss": 2.446,
"step": 9
},
{
"epoch": 0.028922631959508314,
"grad_norm": 4.069777011871338,
"learning_rate": 1.2857142857142857e-05,
"loss": 2.4143,
"step": 10
},
{
"epoch": 0.03181489515545915,
"grad_norm": 3.7189438343048096,
"learning_rate": 1.4285714285714285e-05,
"loss": 2.1749,
"step": 11
},
{
"epoch": 0.03470715835140998,
"grad_norm": 4.501105308532715,
"learning_rate": 1.5714285714285715e-05,
"loss": 2.1738,
"step": 12
},
{
"epoch": 0.03759942154736081,
"grad_norm": 5.211763858795166,
"learning_rate": 1.7142857142857145e-05,
"loss": 2.0335,
"step": 13
},
{
"epoch": 0.04049168474331164,
"grad_norm": 2.67036509513855,
"learning_rate": 1.8571428571428572e-05,
"loss": 1.9522,
"step": 14
},
{
"epoch": 0.04338394793926247,
"grad_norm": 2.7940988540649414,
"learning_rate": 2e-05,
"loss": 1.9142,
"step": 15
},
{
"epoch": 0.046276211135213303,
"grad_norm": 3.4890847206115723,
"learning_rate": 2.1428571428571428e-05,
"loss": 1.8412,
"step": 16
},
{
"epoch": 0.049168474331164135,
"grad_norm": 5.002918720245361,
"learning_rate": 2.2857142857142858e-05,
"loss": 1.8031,
"step": 17
},
{
"epoch": 0.052060737527114966,
"grad_norm": 4.0725226402282715,
"learning_rate": 2.4285714285714288e-05,
"loss": 1.8587,
"step": 18
},
{
"epoch": 0.0549530007230658,
"grad_norm": 2.988891124725342,
"learning_rate": 2.5714285714285714e-05,
"loss": 1.6742,
"step": 19
},
{
"epoch": 0.05784526391901663,
"grad_norm": 2.679062843322754,
"learning_rate": 2.714285714285714e-05,
"loss": 1.563,
"step": 20
},
{
"epoch": 0.06073752711496746,
"grad_norm": 1.9652676582336426,
"learning_rate": 2.857142857142857e-05,
"loss": 1.6146,
"step": 21
},
{
"epoch": 0.0636297903109183,
"grad_norm": 3.5487523078918457,
"learning_rate": 3e-05,
"loss": 1.6091,
"step": 22
},
{
"epoch": 0.06652205350686913,
"grad_norm": 3.5734827518463135,
"learning_rate": 3.142857142857143e-05,
"loss": 1.5872,
"step": 23
},
{
"epoch": 0.06941431670281996,
"grad_norm": 2.6711552143096924,
"learning_rate": 3.285714285714286e-05,
"loss": 1.5964,
"step": 24
},
{
"epoch": 0.07230657989877079,
"grad_norm": 2.6824355125427246,
"learning_rate": 3.428571428571429e-05,
"loss": 1.6661,
"step": 25
},
{
"epoch": 0.07519884309472162,
"grad_norm": 2.8385238647460938,
"learning_rate": 3.571428571428572e-05,
"loss": 1.6069,
"step": 26
},
{
"epoch": 0.07809110629067245,
"grad_norm": 2.863154172897339,
"learning_rate": 3.7142857142857143e-05,
"loss": 1.6074,
"step": 27
},
{
"epoch": 0.08098336948662328,
"grad_norm": 2.5264947414398193,
"learning_rate": 3.857142857142858e-05,
"loss": 1.5442,
"step": 28
},
{
"epoch": 0.08387563268257411,
"grad_norm": 2.4073829650878906,
"learning_rate": 4e-05,
"loss": 1.4696,
"step": 29
},
{
"epoch": 0.08676789587852494,
"grad_norm": 1.2896760702133179,
"learning_rate": 4.1428571428571437e-05,
"loss": 1.4876,
"step": 30
},
{
"epoch": 0.08966015907447578,
"grad_norm": 1.3128914833068848,
"learning_rate": 4.2857142857142856e-05,
"loss": 1.5028,
"step": 31
},
{
"epoch": 0.09255242227042661,
"grad_norm": 1.6972280740737915,
"learning_rate": 4.428571428571428e-05,
"loss": 1.5156,
"step": 32
},
{
"epoch": 0.09544468546637744,
"grad_norm": 1.735119104385376,
"learning_rate": 4.5714285714285716e-05,
"loss": 1.3899,
"step": 33
},
{
"epoch": 0.09833694866232827,
"grad_norm": 1.6684017181396484,
"learning_rate": 4.714285714285714e-05,
"loss": 1.4309,
"step": 34
},
{
"epoch": 0.1012292118582791,
"grad_norm": 1.60593581199646,
"learning_rate": 4.8571428571428576e-05,
"loss": 1.4821,
"step": 35
},
{
"epoch": 0.10412147505422993,
"grad_norm": 1.369852066040039,
"learning_rate": 5e-05,
"loss": 1.4606,
"step": 36
},
{
"epoch": 0.10701373825018076,
"grad_norm": 1.0815776586532593,
"learning_rate": 4.992378048780488e-05,
"loss": 1.3166,
"step": 37
},
{
"epoch": 0.1099060014461316,
"grad_norm": 1.1393859386444092,
"learning_rate": 4.984756097560976e-05,
"loss": 1.3596,
"step": 38
},
{
"epoch": 0.11279826464208242,
"grad_norm": 1.293410062789917,
"learning_rate": 4.977134146341464e-05,
"loss": 1.4997,
"step": 39
},
{
"epoch": 0.11569052783803326,
"grad_norm": 1.5961534976959229,
"learning_rate": 4.969512195121951e-05,
"loss": 1.3908,
"step": 40
},
{
"epoch": 0.11858279103398409,
"grad_norm": 1.6708180904388428,
"learning_rate": 4.961890243902439e-05,
"loss": 1.4434,
"step": 41
},
{
"epoch": 0.12147505422993492,
"grad_norm": 1.3898653984069824,
"learning_rate": 4.954268292682927e-05,
"loss": 1.3746,
"step": 42
},
{
"epoch": 0.12436731742588576,
"grad_norm": 1.1497617959976196,
"learning_rate": 4.946646341463415e-05,
"loss": 1.3683,
"step": 43
},
{
"epoch": 0.1272595806218366,
"grad_norm": 0.9966000318527222,
"learning_rate": 4.9390243902439024e-05,
"loss": 1.4588,
"step": 44
},
{
"epoch": 0.1301518438177874,
"grad_norm": 1.0601434707641602,
"learning_rate": 4.931402439024391e-05,
"loss": 1.3757,
"step": 45
},
{
"epoch": 0.13304410701373826,
"grad_norm": 1.2142244577407837,
"learning_rate": 4.923780487804878e-05,
"loss": 1.419,
"step": 46
},
{
"epoch": 0.13593637020968907,
"grad_norm": 1.2789775133132935,
"learning_rate": 4.916158536585366e-05,
"loss": 1.3221,
"step": 47
},
{
"epoch": 0.13882863340563992,
"grad_norm": 1.2200745344161987,
"learning_rate": 4.908536585365854e-05,
"loss": 1.4087,
"step": 48
},
{
"epoch": 0.14172089660159073,
"grad_norm": 1.0769251585006714,
"learning_rate": 4.900914634146342e-05,
"loss": 1.3794,
"step": 49
},
{
"epoch": 0.14461315979754158,
"grad_norm": 0.9566358923912048,
"learning_rate": 4.893292682926829e-05,
"loss": 1.3159,
"step": 50
},
{
"epoch": 0.1475054229934924,
"grad_norm": 1.0282989740371704,
"learning_rate": 4.885670731707317e-05,
"loss": 1.3803,
"step": 51
},
{
"epoch": 0.15039768618944324,
"grad_norm": 1.0863200426101685,
"learning_rate": 4.878048780487805e-05,
"loss": 1.3548,
"step": 52
},
{
"epoch": 0.15328994938539406,
"grad_norm": 1.0302592515945435,
"learning_rate": 4.870426829268293e-05,
"loss": 1.4204,
"step": 53
},
{
"epoch": 0.1561822125813449,
"grad_norm": 1.0147430896759033,
"learning_rate": 4.86280487804878e-05,
"loss": 1.4,
"step": 54
},
{
"epoch": 0.15907447577729572,
"grad_norm": 0.9125880599021912,
"learning_rate": 4.855182926829269e-05,
"loss": 1.3568,
"step": 55
},
{
"epoch": 0.16196673897324657,
"grad_norm": 0.8917691707611084,
"learning_rate": 4.847560975609756e-05,
"loss": 1.3547,
"step": 56
},
{
"epoch": 0.1648590021691974,
"grad_norm": 1.03391432762146,
"learning_rate": 4.839939024390244e-05,
"loss": 1.2731,
"step": 57
},
{
"epoch": 0.16775126536514823,
"grad_norm": 1.0735812187194824,
"learning_rate": 4.832317073170732e-05,
"loss": 1.2944,
"step": 58
},
{
"epoch": 0.17064352856109907,
"grad_norm": 1.028361439704895,
"learning_rate": 4.82469512195122e-05,
"loss": 1.2617,
"step": 59
},
{
"epoch": 0.1735357917570499,
"grad_norm": 0.9899557828903198,
"learning_rate": 4.817073170731707e-05,
"loss": 1.3891,
"step": 60
},
{
"epoch": 0.17642805495300073,
"grad_norm": 6.040285110473633,
"learning_rate": 4.809451219512195e-05,
"loss": 1.3886,
"step": 61
},
{
"epoch": 0.17932031814895155,
"grad_norm": 1.1661717891693115,
"learning_rate": 4.801829268292683e-05,
"loss": 1.2352,
"step": 62
},
{
"epoch": 0.1822125813449024,
"grad_norm": 3.124387502670288,
"learning_rate": 4.794207317073171e-05,
"loss": 1.2965,
"step": 63
},
{
"epoch": 0.18510484454085321,
"grad_norm": 1.1827131509780884,
"learning_rate": 4.786585365853658e-05,
"loss": 1.349,
"step": 64
},
{
"epoch": 0.18799710773680406,
"grad_norm": 1.027674674987793,
"learning_rate": 4.778963414634147e-05,
"loss": 1.2165,
"step": 65
},
{
"epoch": 0.19088937093275488,
"grad_norm": 0.9438247084617615,
"learning_rate": 4.771341463414634e-05,
"loss": 1.2538,
"step": 66
},
{
"epoch": 0.19378163412870572,
"grad_norm": 0.9163101315498352,
"learning_rate": 4.763719512195122e-05,
"loss": 1.2914,
"step": 67
},
{
"epoch": 0.19667389732465654,
"grad_norm": 0.9787700176239014,
"learning_rate": 4.75609756097561e-05,
"loss": 1.2013,
"step": 68
},
{
"epoch": 0.19956616052060738,
"grad_norm": 0.9685674905776978,
"learning_rate": 4.748475609756098e-05,
"loss": 1.2933,
"step": 69
},
{
"epoch": 0.2024584237165582,
"grad_norm": 0.8412639498710632,
"learning_rate": 4.740853658536585e-05,
"loss": 1.262,
"step": 70
},
{
"epoch": 0.20535068691250905,
"grad_norm": 0.9766181707382202,
"learning_rate": 4.733231707317073e-05,
"loss": 1.225,
"step": 71
},
{
"epoch": 0.20824295010845986,
"grad_norm": 0.990614116191864,
"learning_rate": 4.725609756097561e-05,
"loss": 1.192,
"step": 72
},
{
"epoch": 0.2111352133044107,
"grad_norm": 0.8069394826889038,
"learning_rate": 4.717987804878049e-05,
"loss": 1.2127,
"step": 73
},
{
"epoch": 0.21402747650036152,
"grad_norm": 1.022425889968872,
"learning_rate": 4.710365853658536e-05,
"loss": 1.1593,
"step": 74
},
{
"epoch": 0.21691973969631237,
"grad_norm": 0.9153020977973938,
"learning_rate": 4.702743902439025e-05,
"loss": 1.1794,
"step": 75
},
{
"epoch": 0.2198120028922632,
"grad_norm": 0.7978305816650391,
"learning_rate": 4.695121951219512e-05,
"loss": 1.2791,
"step": 76
},
{
"epoch": 0.22270426608821403,
"grad_norm": 0.8948712348937988,
"learning_rate": 4.6875e-05,
"loss": 1.2227,
"step": 77
},
{
"epoch": 0.22559652928416485,
"grad_norm": 0.9704264998435974,
"learning_rate": 4.679878048780488e-05,
"loss": 1.186,
"step": 78
},
{
"epoch": 0.2284887924801157,
"grad_norm": 0.8205945491790771,
"learning_rate": 4.672256097560976e-05,
"loss": 1.1719,
"step": 79
},
{
"epoch": 0.2313810556760665,
"grad_norm": 0.9167234897613525,
"learning_rate": 4.664634146341464e-05,
"loss": 1.2479,
"step": 80
},
{
"epoch": 0.23427331887201736,
"grad_norm": 0.8766996264457703,
"learning_rate": 4.657012195121951e-05,
"loss": 1.2073,
"step": 81
},
{
"epoch": 0.23716558206796817,
"grad_norm": 0.8327258229255676,
"learning_rate": 4.64939024390244e-05,
"loss": 1.2963,
"step": 82
},
{
"epoch": 0.24005784526391902,
"grad_norm": 0.9994452595710754,
"learning_rate": 4.641768292682927e-05,
"loss": 1.1831,
"step": 83
},
{
"epoch": 0.24295010845986983,
"grad_norm": 0.7853651642799377,
"learning_rate": 4.634146341463415e-05,
"loss": 1.2727,
"step": 84
},
{
"epoch": 0.24584237165582068,
"grad_norm": 0.783089816570282,
"learning_rate": 4.626524390243903e-05,
"loss": 1.2149,
"step": 85
},
{
"epoch": 0.24873463485177152,
"grad_norm": 0.9224200248718262,
"learning_rate": 4.618902439024391e-05,
"loss": 1.1902,
"step": 86
},
{
"epoch": 0.25162689804772237,
"grad_norm": 0.7504012584686279,
"learning_rate": 4.611280487804878e-05,
"loss": 1.2568,
"step": 87
},
{
"epoch": 0.2545191612436732,
"grad_norm": 0.8345561027526855,
"learning_rate": 4.603658536585366e-05,
"loss": 1.1642,
"step": 88
},
{
"epoch": 0.257411424439624,
"grad_norm": 0.8287318348884583,
"learning_rate": 4.596036585365854e-05,
"loss": 1.2115,
"step": 89
},
{
"epoch": 0.2603036876355748,
"grad_norm": 0.7950981259346008,
"learning_rate": 4.588414634146342e-05,
"loss": 1.1439,
"step": 90
},
{
"epoch": 0.2631959508315257,
"grad_norm": 0.8269981741905212,
"learning_rate": 4.580792682926829e-05,
"loss": 1.226,
"step": 91
},
{
"epoch": 0.2660882140274765,
"grad_norm": 0.7990830540657043,
"learning_rate": 4.573170731707318e-05,
"loss": 1.2059,
"step": 92
},
{
"epoch": 0.26898047722342733,
"grad_norm": 0.746702253818512,
"learning_rate": 4.565548780487805e-05,
"loss": 1.272,
"step": 93
},
{
"epoch": 0.27187274041937814,
"grad_norm": 0.7808762192726135,
"learning_rate": 4.557926829268293e-05,
"loss": 1.2135,
"step": 94
},
{
"epoch": 0.274765003615329,
"grad_norm": 0.8141624331474304,
"learning_rate": 4.550304878048781e-05,
"loss": 1.1549,
"step": 95
},
{
"epoch": 0.27765726681127983,
"grad_norm": 0.7702810168266296,
"learning_rate": 4.542682926829269e-05,
"loss": 1.1471,
"step": 96
},
{
"epoch": 0.28054953000723065,
"grad_norm": 0.7874007821083069,
"learning_rate": 4.535060975609756e-05,
"loss": 1.2672,
"step": 97
},
{
"epoch": 0.28344179320318147,
"grad_norm": 0.7983161211013794,
"learning_rate": 4.527439024390244e-05,
"loss": 1.099,
"step": 98
},
{
"epoch": 0.28633405639913234,
"grad_norm": 0.8033881783485413,
"learning_rate": 4.519817073170732e-05,
"loss": 1.1549,
"step": 99
},
{
"epoch": 0.28922631959508316,
"grad_norm": 0.8222156167030334,
"learning_rate": 4.51219512195122e-05,
"loss": 1.1527,
"step": 100
},
{
"epoch": 0.292118582791034,
"grad_norm": 0.7592807412147522,
"learning_rate": 4.504573170731707e-05,
"loss": 1.2142,
"step": 101
},
{
"epoch": 0.2950108459869848,
"grad_norm": 0.7466637492179871,
"learning_rate": 4.496951219512196e-05,
"loss": 1.2232,
"step": 102
},
{
"epoch": 0.29790310918293567,
"grad_norm": 0.7532088756561279,
"learning_rate": 4.489329268292683e-05,
"loss": 1.1717,
"step": 103
},
{
"epoch": 0.3007953723788865,
"grad_norm": 0.766828715801239,
"learning_rate": 4.481707317073171e-05,
"loss": 1.2218,
"step": 104
},
{
"epoch": 0.3036876355748373,
"grad_norm": 0.6948519349098206,
"learning_rate": 4.474085365853659e-05,
"loss": 1.1116,
"step": 105
},
{
"epoch": 0.3065798987707881,
"grad_norm": 0.7532397508621216,
"learning_rate": 4.466463414634147e-05,
"loss": 1.1451,
"step": 106
},
{
"epoch": 0.309472161966739,
"grad_norm": 0.7384987473487854,
"learning_rate": 4.458841463414634e-05,
"loss": 1.2043,
"step": 107
},
{
"epoch": 0.3123644251626898,
"grad_norm": 0.7876350283622742,
"learning_rate": 4.451219512195122e-05,
"loss": 1.3315,
"step": 108
},
{
"epoch": 0.3152566883586406,
"grad_norm": 0.7799772024154663,
"learning_rate": 4.44359756097561e-05,
"loss": 1.2367,
"step": 109
},
{
"epoch": 0.31814895155459144,
"grad_norm": 0.802836537361145,
"learning_rate": 4.435975609756098e-05,
"loss": 1.1859,
"step": 110
},
{
"epoch": 0.3210412147505423,
"grad_norm": 0.7658648490905762,
"learning_rate": 4.428353658536585e-05,
"loss": 1.1554,
"step": 111
},
{
"epoch": 0.32393347794649313,
"grad_norm": 0.7552660703659058,
"learning_rate": 4.420731707317074e-05,
"loss": 1.1773,
"step": 112
},
{
"epoch": 0.32682574114244395,
"grad_norm": 0.7944100499153137,
"learning_rate": 4.413109756097561e-05,
"loss": 1.1369,
"step": 113
},
{
"epoch": 0.3297180043383948,
"grad_norm": 0.79727703332901,
"learning_rate": 4.405487804878049e-05,
"loss": 1.1515,
"step": 114
},
{
"epoch": 0.33261026753434564,
"grad_norm": 0.7767285704612732,
"learning_rate": 4.397865853658537e-05,
"loss": 1.2823,
"step": 115
},
{
"epoch": 0.33550253073029646,
"grad_norm": 0.8018892407417297,
"learning_rate": 4.390243902439025e-05,
"loss": 1.1792,
"step": 116
},
{
"epoch": 0.3383947939262473,
"grad_norm": 0.7893505692481995,
"learning_rate": 4.382621951219512e-05,
"loss": 1.2078,
"step": 117
},
{
"epoch": 0.34128705712219815,
"grad_norm": 0.7643678784370422,
"learning_rate": 4.375e-05,
"loss": 1.1172,
"step": 118
},
{
"epoch": 0.34417932031814896,
"grad_norm": 0.7227766513824463,
"learning_rate": 4.3673780487804886e-05,
"loss": 1.2424,
"step": 119
},
{
"epoch": 0.3470715835140998,
"grad_norm": 0.7557047009468079,
"learning_rate": 4.359756097560976e-05,
"loss": 1.1996,
"step": 120
},
{
"epoch": 0.3499638467100506,
"grad_norm": 0.75395667552948,
"learning_rate": 4.352134146341464e-05,
"loss": 1.2023,
"step": 121
},
{
"epoch": 0.35285610990600147,
"grad_norm": 0.7078515291213989,
"learning_rate": 4.344512195121952e-05,
"loss": 1.2086,
"step": 122
},
{
"epoch": 0.3557483731019523,
"grad_norm": 0.7395102381706238,
"learning_rate": 4.3368902439024396e-05,
"loss": 1.1106,
"step": 123
},
{
"epoch": 0.3586406362979031,
"grad_norm": 0.819173276424408,
"learning_rate": 4.329268292682927e-05,
"loss": 1.1037,
"step": 124
},
{
"epoch": 0.3615328994938539,
"grad_norm": 0.7435188889503479,
"learning_rate": 4.321646341463415e-05,
"loss": 1.1914,
"step": 125
},
{
"epoch": 0.3644251626898048,
"grad_norm": 0.8237520456314087,
"learning_rate": 4.314024390243903e-05,
"loss": 1.1724,
"step": 126
},
{
"epoch": 0.3673174258857556,
"grad_norm": 0.7931056022644043,
"learning_rate": 4.306402439024391e-05,
"loss": 1.1706,
"step": 127
},
{
"epoch": 0.37020968908170643,
"grad_norm": 0.7253796458244324,
"learning_rate": 4.298780487804878e-05,
"loss": 1.1297,
"step": 128
},
{
"epoch": 0.37310195227765725,
"grad_norm": 0.7788090705871582,
"learning_rate": 4.2911585365853665e-05,
"loss": 1.1685,
"step": 129
},
{
"epoch": 0.3759942154736081,
"grad_norm": 0.7236787676811218,
"learning_rate": 4.283536585365854e-05,
"loss": 1.2329,
"step": 130
},
{
"epoch": 0.37888647866955893,
"grad_norm": 0.7436123490333557,
"learning_rate": 4.275914634146342e-05,
"loss": 1.0825,
"step": 131
},
{
"epoch": 0.38177874186550975,
"grad_norm": 0.7631476521492004,
"learning_rate": 4.26829268292683e-05,
"loss": 1.1648,
"step": 132
},
{
"epoch": 0.38467100506146057,
"grad_norm": 0.7813283801078796,
"learning_rate": 4.2606707317073176e-05,
"loss": 1.1475,
"step": 133
},
{
"epoch": 0.38756326825741144,
"grad_norm": 0.7633726000785828,
"learning_rate": 4.253048780487805e-05,
"loss": 1.1771,
"step": 134
},
{
"epoch": 0.39045553145336226,
"grad_norm": 0.7443217039108276,
"learning_rate": 4.245426829268293e-05,
"loss": 1.0879,
"step": 135
},
{
"epoch": 0.3933477946493131,
"grad_norm": 0.7620945572853088,
"learning_rate": 4.237804878048781e-05,
"loss": 1.1515,
"step": 136
},
{
"epoch": 0.3962400578452639,
"grad_norm": 0.7569906711578369,
"learning_rate": 4.230182926829269e-05,
"loss": 1.1857,
"step": 137
},
{
"epoch": 0.39913232104121477,
"grad_norm": 0.754265546798706,
"learning_rate": 4.222560975609756e-05,
"loss": 1.2235,
"step": 138
},
{
"epoch": 0.4020245842371656,
"grad_norm": 0.8115909695625305,
"learning_rate": 4.2149390243902445e-05,
"loss": 1.1533,
"step": 139
},
{
"epoch": 0.4049168474331164,
"grad_norm": 0.7119144201278687,
"learning_rate": 4.207317073170732e-05,
"loss": 1.0566,
"step": 140
},
{
"epoch": 0.4078091106290672,
"grad_norm": 0.745745062828064,
"learning_rate": 4.19969512195122e-05,
"loss": 1.1801,
"step": 141
},
{
"epoch": 0.4107013738250181,
"grad_norm": 0.7318696975708008,
"learning_rate": 4.1920731707317077e-05,
"loss": 1.0448,
"step": 142
},
{
"epoch": 0.4135936370209689,
"grad_norm": 0.691558837890625,
"learning_rate": 4.1844512195121956e-05,
"loss": 1.118,
"step": 143
},
{
"epoch": 0.4164859002169197,
"grad_norm": 0.7404938340187073,
"learning_rate": 4.176829268292683e-05,
"loss": 1.0795,
"step": 144
},
{
"epoch": 0.4193781634128706,
"grad_norm": 0.7128071188926697,
"learning_rate": 4.169207317073171e-05,
"loss": 1.1663,
"step": 145
},
{
"epoch": 0.4222704266088214,
"grad_norm": 0.8010504245758057,
"learning_rate": 4.161585365853659e-05,
"loss": 1.2375,
"step": 146
},
{
"epoch": 0.42516268980477223,
"grad_norm": 0.7428746819496155,
"learning_rate": 4.1539634146341466e-05,
"loss": 1.0991,
"step": 147
},
{
"epoch": 0.42805495300072305,
"grad_norm": 0.7510153651237488,
"learning_rate": 4.146341463414634e-05,
"loss": 1.1386,
"step": 148
},
{
"epoch": 0.4309472161966739,
"grad_norm": 0.7697402834892273,
"learning_rate": 4.1387195121951225e-05,
"loss": 1.06,
"step": 149
},
{
"epoch": 0.43383947939262474,
"grad_norm": 0.7100762128829956,
"learning_rate": 4.13109756097561e-05,
"loss": 1.1578,
"step": 150
},
{
"epoch": 0.43673174258857556,
"grad_norm": 0.7327350974082947,
"learning_rate": 4.123475609756098e-05,
"loss": 1.1994,
"step": 151
},
{
"epoch": 0.4396240057845264,
"grad_norm": 0.7481423020362854,
"learning_rate": 4.1158536585365856e-05,
"loss": 1.1554,
"step": 152
},
{
"epoch": 0.44251626898047725,
"grad_norm": 0.7060924768447876,
"learning_rate": 4.1082317073170736e-05,
"loss": 1.1712,
"step": 153
},
{
"epoch": 0.44540853217642806,
"grad_norm": 0.7289426326751709,
"learning_rate": 4.100609756097561e-05,
"loss": 1.0854,
"step": 154
},
{
"epoch": 0.4483007953723789,
"grad_norm": 0.7729988694190979,
"learning_rate": 4.092987804878049e-05,
"loss": 1.1535,
"step": 155
},
{
"epoch": 0.4511930585683297,
"grad_norm": 0.7460820078849792,
"learning_rate": 4.085365853658537e-05,
"loss": 1.1083,
"step": 156
},
{
"epoch": 0.45408532176428057,
"grad_norm": 0.7617100477218628,
"learning_rate": 4.0777439024390246e-05,
"loss": 1.0703,
"step": 157
},
{
"epoch": 0.4569775849602314,
"grad_norm": 0.7420201897621155,
"learning_rate": 4.070121951219512e-05,
"loss": 1.1499,
"step": 158
},
{
"epoch": 0.4598698481561822,
"grad_norm": 0.7645936608314514,
"learning_rate": 4.0625000000000005e-05,
"loss": 1.1024,
"step": 159
},
{
"epoch": 0.462762111352133,
"grad_norm": 0.7603924870491028,
"learning_rate": 4.0548780487804884e-05,
"loss": 1.0113,
"step": 160
},
{
"epoch": 0.4656543745480839,
"grad_norm": 0.7942943572998047,
"learning_rate": 4.047256097560976e-05,
"loss": 1.1814,
"step": 161
},
{
"epoch": 0.4685466377440347,
"grad_norm": 0.7691872715950012,
"learning_rate": 4.0396341463414636e-05,
"loss": 1.1274,
"step": 162
},
{
"epoch": 0.47143890093998553,
"grad_norm": 0.7765952348709106,
"learning_rate": 4.0320121951219515e-05,
"loss": 1.1215,
"step": 163
},
{
"epoch": 0.47433116413593635,
"grad_norm": 0.7291862368583679,
"learning_rate": 4.0243902439024395e-05,
"loss": 1.0973,
"step": 164
},
{
"epoch": 0.4772234273318872,
"grad_norm": 0.7589432597160339,
"learning_rate": 4.016768292682927e-05,
"loss": 1.1347,
"step": 165
},
{
"epoch": 0.48011569052783803,
"grad_norm": 0.7447579503059387,
"learning_rate": 4.0091463414634153e-05,
"loss": 1.2361,
"step": 166
},
{
"epoch": 0.48300795372378885,
"grad_norm": 0.7255765199661255,
"learning_rate": 4.0015243902439026e-05,
"loss": 1.1495,
"step": 167
},
{
"epoch": 0.48590021691973967,
"grad_norm": 0.7621276378631592,
"learning_rate": 3.9939024390243905e-05,
"loss": 1.1568,
"step": 168
},
{
"epoch": 0.48879248011569054,
"grad_norm": 0.7537471055984497,
"learning_rate": 3.9862804878048785e-05,
"loss": 1.1004,
"step": 169
},
{
"epoch": 0.49168474331164136,
"grad_norm": 0.7859211564064026,
"learning_rate": 3.9786585365853664e-05,
"loss": 1.1401,
"step": 170
},
{
"epoch": 0.4945770065075922,
"grad_norm": 0.7351391911506653,
"learning_rate": 3.971036585365854e-05,
"loss": 1.1076,
"step": 171
},
{
"epoch": 0.49746926970354305,
"grad_norm": 0.7664011716842651,
"learning_rate": 3.9634146341463416e-05,
"loss": 1.0421,
"step": 172
},
{
"epoch": 0.5003615328994938,
"grad_norm": 0.7682709693908691,
"learning_rate": 3.9557926829268295e-05,
"loss": 1.1002,
"step": 173
},
{
"epoch": 0.5032537960954447,
"grad_norm": 0.7599637508392334,
"learning_rate": 3.9481707317073175e-05,
"loss": 1.1453,
"step": 174
},
{
"epoch": 0.5061460592913956,
"grad_norm": 0.8105545043945312,
"learning_rate": 3.940548780487805e-05,
"loss": 1.1733,
"step": 175
},
{
"epoch": 0.5090383224873464,
"grad_norm": 0.7692773938179016,
"learning_rate": 3.932926829268293e-05,
"loss": 1.1658,
"step": 176
},
{
"epoch": 0.5119305856832972,
"grad_norm": 0.7400121092796326,
"learning_rate": 3.9253048780487806e-05,
"loss": 1.1037,
"step": 177
},
{
"epoch": 0.514822848879248,
"grad_norm": 0.7246294021606445,
"learning_rate": 3.9176829268292685e-05,
"loss": 1.1829,
"step": 178
},
{
"epoch": 0.5177151120751988,
"grad_norm": 0.7318651676177979,
"learning_rate": 3.9100609756097565e-05,
"loss": 0.9872,
"step": 179
},
{
"epoch": 0.5206073752711496,
"grad_norm": 0.7589302659034729,
"learning_rate": 3.9024390243902444e-05,
"loss": 1.1624,
"step": 180
},
{
"epoch": 0.5234996384671005,
"grad_norm": 0.7625978589057922,
"learning_rate": 3.8948170731707316e-05,
"loss": 1.1147,
"step": 181
},
{
"epoch": 0.5263919016630514,
"grad_norm": 0.7786478400230408,
"learning_rate": 3.8871951219512196e-05,
"loss": 1.0524,
"step": 182
},
{
"epoch": 0.5292841648590022,
"grad_norm": 0.7591277956962585,
"learning_rate": 3.8795731707317075e-05,
"loss": 1.0672,
"step": 183
},
{
"epoch": 0.532176428054953,
"grad_norm": 0.806042492389679,
"learning_rate": 3.8719512195121954e-05,
"loss": 1.0742,
"step": 184
},
{
"epoch": 0.5350686912509038,
"grad_norm": 0.7718027830123901,
"learning_rate": 3.864329268292683e-05,
"loss": 1.1326,
"step": 185
},
{
"epoch": 0.5379609544468547,
"grad_norm": 0.7538328766822815,
"learning_rate": 3.856707317073171e-05,
"loss": 1.15,
"step": 186
},
{
"epoch": 0.5408532176428055,
"grad_norm": 0.7316940426826477,
"learning_rate": 3.8490853658536586e-05,
"loss": 1.0463,
"step": 187
},
{
"epoch": 0.5437454808387563,
"grad_norm": 0.7699999809265137,
"learning_rate": 3.8414634146341465e-05,
"loss": 1.183,
"step": 188
},
{
"epoch": 0.5466377440347071,
"grad_norm": 0.7050356268882751,
"learning_rate": 3.8338414634146344e-05,
"loss": 1.1208,
"step": 189
},
{
"epoch": 0.549530007230658,
"grad_norm": 0.7819121479988098,
"learning_rate": 3.8262195121951224e-05,
"loss": 1.1622,
"step": 190
},
{
"epoch": 0.5524222704266089,
"grad_norm": 0.700554370880127,
"learning_rate": 3.8185975609756096e-05,
"loss": 1.1104,
"step": 191
},
{
"epoch": 0.5553145336225597,
"grad_norm": 0.7335946559906006,
"learning_rate": 3.8109756097560976e-05,
"loss": 1.0856,
"step": 192
},
{
"epoch": 0.5582067968185105,
"grad_norm": 0.7291987538337708,
"learning_rate": 3.8033536585365855e-05,
"loss": 1.1158,
"step": 193
},
{
"epoch": 0.5610990600144613,
"grad_norm": 0.7313510775566101,
"learning_rate": 3.7957317073170734e-05,
"loss": 1.1907,
"step": 194
},
{
"epoch": 0.5639913232104121,
"grad_norm": 0.7727324366569519,
"learning_rate": 3.788109756097561e-05,
"loss": 1.1681,
"step": 195
},
{
"epoch": 0.5668835864063629,
"grad_norm": 0.7505455613136292,
"learning_rate": 3.780487804878049e-05,
"loss": 1.0712,
"step": 196
},
{
"epoch": 0.5697758496023138,
"grad_norm": 0.7288169860839844,
"learning_rate": 3.7728658536585365e-05,
"loss": 1.113,
"step": 197
},
{
"epoch": 0.5726681127982647,
"grad_norm": 0.8041896820068359,
"learning_rate": 3.7652439024390245e-05,
"loss": 1.056,
"step": 198
},
{
"epoch": 0.5755603759942155,
"grad_norm": 0.7612701058387756,
"learning_rate": 3.7576219512195124e-05,
"loss": 1.0862,
"step": 199
},
{
"epoch": 0.5784526391901663,
"grad_norm": 0.7867717742919922,
"learning_rate": 3.7500000000000003e-05,
"loss": 1.1025,
"step": 200
},
{
"epoch": 0.5813449023861171,
"grad_norm": 0.7869510054588318,
"learning_rate": 3.742378048780488e-05,
"loss": 1.1034,
"step": 201
},
{
"epoch": 0.584237165582068,
"grad_norm": 0.7608320713043213,
"learning_rate": 3.7347560975609755e-05,
"loss": 1.1001,
"step": 202
},
{
"epoch": 0.5871294287780188,
"grad_norm": 0.7015742063522339,
"learning_rate": 3.727134146341464e-05,
"loss": 1.1404,
"step": 203
},
{
"epoch": 0.5900216919739696,
"grad_norm": 0.7741048336029053,
"learning_rate": 3.7195121951219514e-05,
"loss": 1.1069,
"step": 204
},
{
"epoch": 0.5929139551699205,
"grad_norm": 0.7461472749710083,
"learning_rate": 3.7118902439024393e-05,
"loss": 1.1364,
"step": 205
},
{
"epoch": 0.5958062183658713,
"grad_norm": 0.7453926205635071,
"learning_rate": 3.704268292682927e-05,
"loss": 1.1352,
"step": 206
},
{
"epoch": 0.5986984815618221,
"grad_norm": 0.7385006546974182,
"learning_rate": 3.696646341463415e-05,
"loss": 1.1277,
"step": 207
},
{
"epoch": 0.601590744757773,
"grad_norm": 0.755822479724884,
"learning_rate": 3.6890243902439025e-05,
"loss": 1.2321,
"step": 208
},
{
"epoch": 0.6044830079537238,
"grad_norm": 0.7634955048561096,
"learning_rate": 3.6814024390243904e-05,
"loss": 1.0884,
"step": 209
},
{
"epoch": 0.6073752711496746,
"grad_norm": 0.739771842956543,
"learning_rate": 3.673780487804878e-05,
"loss": 1.0753,
"step": 210
},
{
"epoch": 0.6102675343456254,
"grad_norm": 0.7629417777061462,
"learning_rate": 3.666158536585366e-05,
"loss": 1.1352,
"step": 211
},
{
"epoch": 0.6131597975415762,
"grad_norm": 0.7024405002593994,
"learning_rate": 3.6585365853658535e-05,
"loss": 1.0872,
"step": 212
},
{
"epoch": 0.6160520607375272,
"grad_norm": 0.778109610080719,
"learning_rate": 3.650914634146342e-05,
"loss": 1.1214,
"step": 213
},
{
"epoch": 0.618944323933478,
"grad_norm": 0.8042979836463928,
"learning_rate": 3.6432926829268294e-05,
"loss": 1.0782,
"step": 214
},
{
"epoch": 0.6218365871294288,
"grad_norm": 0.7753491997718811,
"learning_rate": 3.635670731707317e-05,
"loss": 1.1299,
"step": 215
},
{
"epoch": 0.6247288503253796,
"grad_norm": 0.7622751593589783,
"learning_rate": 3.628048780487805e-05,
"loss": 1.1382,
"step": 216
},
{
"epoch": 0.6276211135213304,
"grad_norm": 0.7174673080444336,
"learning_rate": 3.620426829268293e-05,
"loss": 1.0331,
"step": 217
},
{
"epoch": 0.6305133767172812,
"grad_norm": 0.7472246885299683,
"learning_rate": 3.6128048780487804e-05,
"loss": 1.1305,
"step": 218
},
{
"epoch": 0.6334056399132321,
"grad_norm": 0.7711846232414246,
"learning_rate": 3.6051829268292684e-05,
"loss": 1.0612,
"step": 219
},
{
"epoch": 0.6362979031091829,
"grad_norm": 0.6961559653282166,
"learning_rate": 3.597560975609756e-05,
"loss": 1.1416,
"step": 220
},
{
"epoch": 0.6391901663051338,
"grad_norm": 0.7391098141670227,
"learning_rate": 3.589939024390244e-05,
"loss": 1.1794,
"step": 221
},
{
"epoch": 0.6420824295010846,
"grad_norm": 0.7802613973617554,
"learning_rate": 3.5823170731707315e-05,
"loss": 1.1799,
"step": 222
},
{
"epoch": 0.6449746926970354,
"grad_norm": 0.7498157620429993,
"learning_rate": 3.57469512195122e-05,
"loss": 1.0472,
"step": 223
},
{
"epoch": 0.6478669558929863,
"grad_norm": 0.7516718506813049,
"learning_rate": 3.5670731707317074e-05,
"loss": 1.1124,
"step": 224
},
{
"epoch": 0.6507592190889371,
"grad_norm": 0.7159478068351746,
"learning_rate": 3.559451219512195e-05,
"loss": 1.1552,
"step": 225
},
{
"epoch": 0.6536514822848879,
"grad_norm": 0.7362671494483948,
"learning_rate": 3.551829268292683e-05,
"loss": 1.0227,
"step": 226
},
{
"epoch": 0.6565437454808387,
"grad_norm": 0.7803052663803101,
"learning_rate": 3.544207317073171e-05,
"loss": 1.0371,
"step": 227
},
{
"epoch": 0.6594360086767896,
"grad_norm": 0.7725502252578735,
"learning_rate": 3.5365853658536584e-05,
"loss": 1.0399,
"step": 228
},
{
"epoch": 0.6623282718727405,
"grad_norm": 0.7670521140098572,
"learning_rate": 3.5289634146341464e-05,
"loss": 1.1018,
"step": 229
},
{
"epoch": 0.6652205350686913,
"grad_norm": 0.8205684423446655,
"learning_rate": 3.521341463414634e-05,
"loss": 1.0543,
"step": 230
},
{
"epoch": 0.6681127982646421,
"grad_norm": 0.7324901223182678,
"learning_rate": 3.513719512195122e-05,
"loss": 1.036,
"step": 231
},
{
"epoch": 0.6710050614605929,
"grad_norm": 0.7967138886451721,
"learning_rate": 3.5060975609756095e-05,
"loss": 1.1261,
"step": 232
},
{
"epoch": 0.6738973246565437,
"grad_norm": 0.7588431239128113,
"learning_rate": 3.498475609756098e-05,
"loss": 1.14,
"step": 233
},
{
"epoch": 0.6767895878524945,
"grad_norm": 0.778581440448761,
"learning_rate": 3.4908536585365853e-05,
"loss": 1.1164,
"step": 234
},
{
"epoch": 0.6796818510484454,
"grad_norm": 0.7511733174324036,
"learning_rate": 3.483231707317073e-05,
"loss": 1.0948,
"step": 235
},
{
"epoch": 0.6825741142443963,
"grad_norm": 0.7637711763381958,
"learning_rate": 3.475609756097561e-05,
"loss": 1.0335,
"step": 236
},
{
"epoch": 0.6854663774403471,
"grad_norm": 0.7194728851318359,
"learning_rate": 3.467987804878049e-05,
"loss": 1.1091,
"step": 237
},
{
"epoch": 0.6883586406362979,
"grad_norm": 1.0010840892791748,
"learning_rate": 3.4603658536585364e-05,
"loss": 1.1292,
"step": 238
},
{
"epoch": 0.6912509038322487,
"grad_norm": 0.767515242099762,
"learning_rate": 3.4527439024390243e-05,
"loss": 1.0865,
"step": 239
},
{
"epoch": 0.6941431670281996,
"grad_norm": 0.7898090481758118,
"learning_rate": 3.445121951219512e-05,
"loss": 1.0899,
"step": 240
},
{
"epoch": 0.6970354302241504,
"grad_norm": 0.7335265874862671,
"learning_rate": 3.4375e-05,
"loss": 1.033,
"step": 241
},
{
"epoch": 0.6999276934201012,
"grad_norm": 0.8223006129264832,
"learning_rate": 3.429878048780488e-05,
"loss": 1.195,
"step": 242
},
{
"epoch": 0.702819956616052,
"grad_norm": 0.8035436868667603,
"learning_rate": 3.422256097560976e-05,
"loss": 1.0006,
"step": 243
},
{
"epoch": 0.7057122198120029,
"grad_norm": 0.7428767681121826,
"learning_rate": 3.414634146341464e-05,
"loss": 1.092,
"step": 244
},
{
"epoch": 0.7086044830079538,
"grad_norm": 0.7584668397903442,
"learning_rate": 3.407012195121951e-05,
"loss": 1.1246,
"step": 245
},
{
"epoch": 0.7114967462039046,
"grad_norm": 0.7379582524299622,
"learning_rate": 3.399390243902439e-05,
"loss": 1.1107,
"step": 246
},
{
"epoch": 0.7143890093998554,
"grad_norm": 0.7565631866455078,
"learning_rate": 3.391768292682927e-05,
"loss": 1.1014,
"step": 247
},
{
"epoch": 0.7172812725958062,
"grad_norm": 0.78312748670578,
"learning_rate": 3.384146341463415e-05,
"loss": 1.0298,
"step": 248
},
{
"epoch": 0.720173535791757,
"grad_norm": 0.7658934593200684,
"learning_rate": 3.376524390243902e-05,
"loss": 1.0314,
"step": 249
},
{
"epoch": 0.7230657989877078,
"grad_norm": 0.7525564432144165,
"learning_rate": 3.368902439024391e-05,
"loss": 1.0405,
"step": 250
},
{
"epoch": 0.7259580621836587,
"grad_norm": 0.7480136752128601,
"learning_rate": 3.361280487804878e-05,
"loss": 1.0912,
"step": 251
},
{
"epoch": 0.7288503253796096,
"grad_norm": 0.7331277132034302,
"learning_rate": 3.353658536585366e-05,
"loss": 1.102,
"step": 252
},
{
"epoch": 0.7317425885755604,
"grad_norm": 0.7302331924438477,
"learning_rate": 3.346036585365854e-05,
"loss": 1.0178,
"step": 253
},
{
"epoch": 0.7346348517715112,
"grad_norm": 0.7028475999832153,
"learning_rate": 3.338414634146342e-05,
"loss": 0.997,
"step": 254
},
{
"epoch": 0.737527114967462,
"grad_norm": 0.7154017090797424,
"learning_rate": 3.330792682926829e-05,
"loss": 1.0404,
"step": 255
},
{
"epoch": 0.7404193781634129,
"grad_norm": 0.7640696167945862,
"learning_rate": 3.323170731707317e-05,
"loss": 1.1071,
"step": 256
},
{
"epoch": 0.7433116413593637,
"grad_norm": 0.7853246331214905,
"learning_rate": 3.315548780487805e-05,
"loss": 1.0511,
"step": 257
},
{
"epoch": 0.7462039045553145,
"grad_norm": 0.7854739427566528,
"learning_rate": 3.307926829268293e-05,
"loss": 1.0859,
"step": 258
},
{
"epoch": 0.7490961677512654,
"grad_norm": 0.7378141283988953,
"learning_rate": 3.30030487804878e-05,
"loss": 1.0932,
"step": 259
},
{
"epoch": 0.7519884309472162,
"grad_norm": 0.7881212830543518,
"learning_rate": 3.292682926829269e-05,
"loss": 1.0923,
"step": 260
},
{
"epoch": 0.754880694143167,
"grad_norm": 0.7434545755386353,
"learning_rate": 3.285060975609756e-05,
"loss": 1.0612,
"step": 261
},
{
"epoch": 0.7577729573391179,
"grad_norm": 0.7590733766555786,
"learning_rate": 3.277439024390244e-05,
"loss": 1.099,
"step": 262
},
{
"epoch": 0.7606652205350687,
"grad_norm": 0.809688925743103,
"learning_rate": 3.269817073170732e-05,
"loss": 1.1674,
"step": 263
},
{
"epoch": 0.7635574837310195,
"grad_norm": 0.7180957198143005,
"learning_rate": 3.26219512195122e-05,
"loss": 1.1196,
"step": 264
},
{
"epoch": 0.7664497469269703,
"grad_norm": 0.7526130676269531,
"learning_rate": 3.254573170731707e-05,
"loss": 0.9961,
"step": 265
},
{
"epoch": 0.7693420101229211,
"grad_norm": 0.8099539279937744,
"learning_rate": 3.246951219512195e-05,
"loss": 1.0742,
"step": 266
},
{
"epoch": 0.7722342733188721,
"grad_norm": 0.7374089360237122,
"learning_rate": 3.239329268292683e-05,
"loss": 1.0845,
"step": 267
},
{
"epoch": 0.7751265365148229,
"grad_norm": 0.6704961061477661,
"learning_rate": 3.231707317073171e-05,
"loss": 0.9631,
"step": 268
},
{
"epoch": 0.7780187997107737,
"grad_norm": 0.7654604315757751,
"learning_rate": 3.224085365853658e-05,
"loss": 1.0663,
"step": 269
},
{
"epoch": 0.7809110629067245,
"grad_norm": 0.7672616243362427,
"learning_rate": 3.216463414634147e-05,
"loss": 1.0802,
"step": 270
},
{
"epoch": 0.7838033261026753,
"grad_norm": 0.7247093915939331,
"learning_rate": 3.208841463414634e-05,
"loss": 1.0921,
"step": 271
},
{
"epoch": 0.7866955892986262,
"grad_norm": 0.75218266248703,
"learning_rate": 3.201219512195122e-05,
"loss": 1.1062,
"step": 272
},
{
"epoch": 0.789587852494577,
"grad_norm": 0.7745797038078308,
"learning_rate": 3.19359756097561e-05,
"loss": 1.1105,
"step": 273
},
{
"epoch": 0.7924801156905278,
"grad_norm": 0.7872446179389954,
"learning_rate": 3.185975609756098e-05,
"loss": 1.0644,
"step": 274
},
{
"epoch": 0.7953723788864787,
"grad_norm": 0.8333762884140015,
"learning_rate": 3.178353658536585e-05,
"loss": 1.065,
"step": 275
},
{
"epoch": 0.7982646420824295,
"grad_norm": 0.7147220969200134,
"learning_rate": 3.170731707317073e-05,
"loss": 1.1217,
"step": 276
},
{
"epoch": 0.8011569052783803,
"grad_norm": 0.7681723237037659,
"learning_rate": 3.163109756097561e-05,
"loss": 1.0033,
"step": 277
},
{
"epoch": 0.8040491684743312,
"grad_norm": 0.7502139210700989,
"learning_rate": 3.155487804878049e-05,
"loss": 1.0245,
"step": 278
},
{
"epoch": 0.806941431670282,
"grad_norm": 0.7371497750282288,
"learning_rate": 3.147865853658536e-05,
"loss": 0.9599,
"step": 279
},
{
"epoch": 0.8098336948662328,
"grad_norm": 0.7861061692237854,
"learning_rate": 3.140243902439025e-05,
"loss": 1.0698,
"step": 280
},
{
"epoch": 0.8127259580621836,
"grad_norm": 0.7982838749885559,
"learning_rate": 3.132621951219512e-05,
"loss": 1.093,
"step": 281
},
{
"epoch": 0.8156182212581344,
"grad_norm": 0.7698132991790771,
"learning_rate": 3.125e-05,
"loss": 0.9996,
"step": 282
},
{
"epoch": 0.8185104844540854,
"grad_norm": 0.7293528914451599,
"learning_rate": 3.117378048780488e-05,
"loss": 1.0981,
"step": 283
},
{
"epoch": 0.8214027476500362,
"grad_norm": 0.7758128643035889,
"learning_rate": 3.109756097560976e-05,
"loss": 1.1089,
"step": 284
},
{
"epoch": 0.824295010845987,
"grad_norm": 0.7410516738891602,
"learning_rate": 3.102134146341464e-05,
"loss": 1.0829,
"step": 285
},
{
"epoch": 0.8271872740419378,
"grad_norm": 0.7614254355430603,
"learning_rate": 3.094512195121951e-05,
"loss": 1.0397,
"step": 286
},
{
"epoch": 0.8300795372378886,
"grad_norm": 0.7554497718811035,
"learning_rate": 3.08689024390244e-05,
"loss": 1.0749,
"step": 287
},
{
"epoch": 0.8329718004338394,
"grad_norm": 0.7554106116294861,
"learning_rate": 3.079268292682927e-05,
"loss": 1.0298,
"step": 288
},
{
"epoch": 0.8358640636297903,
"grad_norm": 0.7850284576416016,
"learning_rate": 3.071646341463415e-05,
"loss": 1.0809,
"step": 289
},
{
"epoch": 0.8387563268257412,
"grad_norm": 0.7142320275306702,
"learning_rate": 3.064024390243903e-05,
"loss": 1.0664,
"step": 290
},
{
"epoch": 0.841648590021692,
"grad_norm": 0.7595747113227844,
"learning_rate": 3.056402439024391e-05,
"loss": 1.0581,
"step": 291
},
{
"epoch": 0.8445408532176428,
"grad_norm": 0.8003636598587036,
"learning_rate": 3.048780487804878e-05,
"loss": 1.0977,
"step": 292
},
{
"epoch": 0.8474331164135936,
"grad_norm": 0.7981911301612854,
"learning_rate": 3.0411585365853663e-05,
"loss": 1.0425,
"step": 293
},
{
"epoch": 0.8503253796095445,
"grad_norm": 0.7293020486831665,
"learning_rate": 3.0335365853658536e-05,
"loss": 1.086,
"step": 294
},
{
"epoch": 0.8532176428054953,
"grad_norm": 0.7135725617408752,
"learning_rate": 3.025914634146342e-05,
"loss": 1.1027,
"step": 295
},
{
"epoch": 0.8561099060014461,
"grad_norm": 0.7151292562484741,
"learning_rate": 3.0182926829268294e-05,
"loss": 1.1234,
"step": 296
},
{
"epoch": 0.8590021691973969,
"grad_norm": 0.7805321216583252,
"learning_rate": 3.0106707317073174e-05,
"loss": 1.0833,
"step": 297
},
{
"epoch": 0.8618944323933478,
"grad_norm": 0.7318261861801147,
"learning_rate": 3.003048780487805e-05,
"loss": 1.0742,
"step": 298
},
{
"epoch": 0.8647866955892987,
"grad_norm": 0.7618130445480347,
"learning_rate": 2.995426829268293e-05,
"loss": 1.0974,
"step": 299
},
{
"epoch": 0.8676789587852495,
"grad_norm": 0.7759801745414734,
"learning_rate": 2.9878048780487805e-05,
"loss": 1.0236,
"step": 300
},
{
"epoch": 0.8705712219812003,
"grad_norm": 0.7935881614685059,
"learning_rate": 2.9801829268292684e-05,
"loss": 1.0511,
"step": 301
},
{
"epoch": 0.8734634851771511,
"grad_norm": 0.7859032154083252,
"learning_rate": 2.972560975609756e-05,
"loss": 1.0469,
"step": 302
},
{
"epoch": 0.8763557483731019,
"grad_norm": 0.7812406420707703,
"learning_rate": 2.9649390243902443e-05,
"loss": 1.0289,
"step": 303
},
{
"epoch": 0.8792480115690527,
"grad_norm": 0.7637215256690979,
"learning_rate": 2.9573170731707316e-05,
"loss": 0.9902,
"step": 304
},
{
"epoch": 0.8821402747650036,
"grad_norm": 0.7497740983963013,
"learning_rate": 2.9496951219512198e-05,
"loss": 1.0487,
"step": 305
},
{
"epoch": 0.8850325379609545,
"grad_norm": 0.7327484488487244,
"learning_rate": 2.9420731707317074e-05,
"loss": 1.1966,
"step": 306
},
{
"epoch": 0.8879248011569053,
"grad_norm": 0.7829355597496033,
"learning_rate": 2.9344512195121954e-05,
"loss": 1.0982,
"step": 307
},
{
"epoch": 0.8908170643528561,
"grad_norm": 0.7765836119651794,
"learning_rate": 2.926829268292683e-05,
"loss": 0.9476,
"step": 308
},
{
"epoch": 0.8937093275488069,
"grad_norm": 0.7646698951721191,
"learning_rate": 2.919207317073171e-05,
"loss": 1.1214,
"step": 309
},
{
"epoch": 0.8966015907447578,
"grad_norm": 0.7531141638755798,
"learning_rate": 2.9115853658536585e-05,
"loss": 1.0438,
"step": 310
},
{
"epoch": 0.8994938539407086,
"grad_norm": 0.7788392305374146,
"learning_rate": 2.9039634146341464e-05,
"loss": 1.0591,
"step": 311
},
{
"epoch": 0.9023861171366594,
"grad_norm": 0.7006287574768066,
"learning_rate": 2.896341463414634e-05,
"loss": 1.0351,
"step": 312
},
{
"epoch": 0.9052783803326103,
"grad_norm": 0.8054205775260925,
"learning_rate": 2.8887195121951223e-05,
"loss": 1.1357,
"step": 313
},
{
"epoch": 0.9081706435285611,
"grad_norm": 0.7643339037895203,
"learning_rate": 2.8810975609756095e-05,
"loss": 1.073,
"step": 314
},
{
"epoch": 0.911062906724512,
"grad_norm": 0.7552357316017151,
"learning_rate": 2.8734756097560978e-05,
"loss": 1.0199,
"step": 315
},
{
"epoch": 0.9139551699204628,
"grad_norm": 0.7398456931114197,
"learning_rate": 2.8658536585365854e-05,
"loss": 1.0532,
"step": 316
},
{
"epoch": 0.9168474331164136,
"grad_norm": 0.7522266507148743,
"learning_rate": 2.8582317073170733e-05,
"loss": 1.0824,
"step": 317
},
{
"epoch": 0.9197396963123644,
"grad_norm": 0.7729273438453674,
"learning_rate": 2.850609756097561e-05,
"loss": 1.0282,
"step": 318
},
{
"epoch": 0.9226319595083152,
"grad_norm": 0.7700569033622742,
"learning_rate": 2.842987804878049e-05,
"loss": 1.0654,
"step": 319
},
{
"epoch": 0.925524222704266,
"grad_norm": 0.7540171146392822,
"learning_rate": 2.8353658536585365e-05,
"loss": 1.0615,
"step": 320
},
{
"epoch": 0.928416485900217,
"grad_norm": 0.7484927773475647,
"learning_rate": 2.8277439024390244e-05,
"loss": 1.0276,
"step": 321
},
{
"epoch": 0.9313087490961678,
"grad_norm": 0.793731153011322,
"learning_rate": 2.820121951219512e-05,
"loss": 1.0536,
"step": 322
},
{
"epoch": 0.9342010122921186,
"grad_norm": 0.7182806134223938,
"learning_rate": 2.8125000000000003e-05,
"loss": 1.0135,
"step": 323
},
{
"epoch": 0.9370932754880694,
"grad_norm": 0.7177212834358215,
"learning_rate": 2.8048780487804882e-05,
"loss": 1.02,
"step": 324
},
{
"epoch": 0.9399855386840202,
"grad_norm": 0.7477127909660339,
"learning_rate": 2.7972560975609758e-05,
"loss": 1.09,
"step": 325
},
{
"epoch": 0.9428778018799711,
"grad_norm": 0.7824453115463257,
"learning_rate": 2.7896341463414637e-05,
"loss": 0.9806,
"step": 326
},
{
"epoch": 0.9457700650759219,
"grad_norm": 0.7952285408973694,
"learning_rate": 2.7820121951219513e-05,
"loss": 1.0417,
"step": 327
},
{
"epoch": 0.9486623282718727,
"grad_norm": 0.8422231674194336,
"learning_rate": 2.7743902439024393e-05,
"loss": 1.0552,
"step": 328
},
{
"epoch": 0.9515545914678236,
"grad_norm": 0.8023759722709656,
"learning_rate": 2.766768292682927e-05,
"loss": 1.0695,
"step": 329
},
{
"epoch": 0.9544468546637744,
"grad_norm": 0.7767244577407837,
"learning_rate": 2.759146341463415e-05,
"loss": 1.1414,
"step": 330
},
{
"epoch": 0.9573391178597253,
"grad_norm": 0.7687296271324158,
"learning_rate": 2.7515243902439024e-05,
"loss": 1.0518,
"step": 331
},
{
"epoch": 0.9602313810556761,
"grad_norm": 0.76921147108078,
"learning_rate": 2.7439024390243906e-05,
"loss": 1.0616,
"step": 332
},
{
"epoch": 0.9631236442516269,
"grad_norm": 0.7176332473754883,
"learning_rate": 2.7362804878048782e-05,
"loss": 1.0969,
"step": 333
},
{
"epoch": 0.9660159074475777,
"grad_norm": 0.7853028774261475,
"learning_rate": 2.7286585365853662e-05,
"loss": 1.0375,
"step": 334
},
{
"epoch": 0.9689081706435285,
"grad_norm": 0.7683706879615784,
"learning_rate": 2.7210365853658538e-05,
"loss": 0.9734,
"step": 335
},
{
"epoch": 0.9718004338394793,
"grad_norm": 0.8103812336921692,
"learning_rate": 2.7134146341463417e-05,
"loss": 1.0579,
"step": 336
},
{
"epoch": 0.9746926970354303,
"grad_norm": 0.7865802049636841,
"learning_rate": 2.7057926829268293e-05,
"loss": 1.019,
"step": 337
},
{
"epoch": 0.9775849602313811,
"grad_norm": 0.7285350561141968,
"learning_rate": 2.6981707317073172e-05,
"loss": 1.0886,
"step": 338
},
{
"epoch": 0.9804772234273319,
"grad_norm": 0.7790278196334839,
"learning_rate": 2.6905487804878048e-05,
"loss": 1.03,
"step": 339
},
{
"epoch": 0.9833694866232827,
"grad_norm": 0.8020289540290833,
"learning_rate": 2.682926829268293e-05,
"loss": 1.0997,
"step": 340
},
{
"epoch": 0.9862617498192335,
"grad_norm": 0.7671722173690796,
"learning_rate": 2.6753048780487804e-05,
"loss": 1.1381,
"step": 341
},
{
"epoch": 0.9891540130151844,
"grad_norm": 0.8592469096183777,
"learning_rate": 2.6676829268292686e-05,
"loss": 1.0825,
"step": 342
},
{
"epoch": 0.9920462762111352,
"grad_norm": 0.7508606910705566,
"learning_rate": 2.6600609756097562e-05,
"loss": 1.0808,
"step": 343
},
{
"epoch": 0.9949385394070861,
"grad_norm": 0.7976868152618408,
"learning_rate": 2.652439024390244e-05,
"loss": 1.0345,
"step": 344
},
{
"epoch": 0.9978308026030369,
"grad_norm": 0.7527894973754883,
"learning_rate": 2.6448170731707318e-05,
"loss": 0.9788,
"step": 345
},
{
"epoch": 1.0,
"grad_norm": 0.7162013053894043,
"learning_rate": 2.6371951219512197e-05,
"loss": 0.6875,
"step": 346
},
{
"epoch": 1.002892263195951,
"grad_norm": 0.7367959022521973,
"learning_rate": 2.6295731707317073e-05,
"loss": 0.8764,
"step": 347
},
{
"epoch": 1.0057845263919016,
"grad_norm": 0.7669069170951843,
"learning_rate": 2.6219512195121952e-05,
"loss": 0.8188,
"step": 348
},
{
"epoch": 1.0086767895878526,
"grad_norm": 0.7791001200675964,
"learning_rate": 2.6143292682926828e-05,
"loss": 0.9138,
"step": 349
},
{
"epoch": 1.0115690527838033,
"grad_norm": 0.7576078772544861,
"learning_rate": 2.606707317073171e-05,
"loss": 0.7908,
"step": 350
},
{
"epoch": 1.0144613159797542,
"grad_norm": 0.7850218415260315,
"learning_rate": 2.5990853658536583e-05,
"loss": 0.9152,
"step": 351
},
{
"epoch": 1.017353579175705,
"grad_norm": 0.9033083319664001,
"learning_rate": 2.5914634146341466e-05,
"loss": 0.8214,
"step": 352
},
{
"epoch": 1.0202458423716558,
"grad_norm": 0.91056889295578,
"learning_rate": 2.5838414634146342e-05,
"loss": 0.873,
"step": 353
},
{
"epoch": 1.0231381055676068,
"grad_norm": 0.9178743958473206,
"learning_rate": 2.576219512195122e-05,
"loss": 0.8357,
"step": 354
},
{
"epoch": 1.0260303687635575,
"grad_norm": 0.9112760424613953,
"learning_rate": 2.5685975609756097e-05,
"loss": 0.8381,
"step": 355
},
{
"epoch": 1.0289226319595084,
"grad_norm": 0.874699056148529,
"learning_rate": 2.5609756097560977e-05,
"loss": 0.8443,
"step": 356
},
{
"epoch": 1.031814895155459,
"grad_norm": 0.866185188293457,
"learning_rate": 2.5533536585365853e-05,
"loss": 0.8651,
"step": 357
},
{
"epoch": 1.03470715835141,
"grad_norm": 0.8335126042366028,
"learning_rate": 2.5457317073170732e-05,
"loss": 0.7468,
"step": 358
},
{
"epoch": 1.0375994215473607,
"grad_norm": 0.8364746570587158,
"learning_rate": 2.5381097560975608e-05,
"loss": 0.8368,
"step": 359
},
{
"epoch": 1.0404916847433117,
"grad_norm": 0.887727677822113,
"learning_rate": 2.530487804878049e-05,
"loss": 0.8161,
"step": 360
},
{
"epoch": 1.0433839479392624,
"grad_norm": 0.8570895791053772,
"learning_rate": 2.5228658536585363e-05,
"loss": 0.7743,
"step": 361
},
{
"epoch": 1.0462762111352133,
"grad_norm": 0.8758525252342224,
"learning_rate": 2.5152439024390246e-05,
"loss": 0.7668,
"step": 362
},
{
"epoch": 1.0491684743311642,
"grad_norm": 0.9433422088623047,
"learning_rate": 2.5076219512195122e-05,
"loss": 0.8556,
"step": 363
},
{
"epoch": 1.052060737527115,
"grad_norm": 0.957084596157074,
"learning_rate": 2.5e-05,
"loss": 0.859,
"step": 364
},
{
"epoch": 1.0549530007230659,
"grad_norm": 0.9015299677848816,
"learning_rate": 2.492378048780488e-05,
"loss": 0.7513,
"step": 365
},
{
"epoch": 1.0578452639190166,
"grad_norm": 0.8645225763320923,
"learning_rate": 2.4847560975609756e-05,
"loss": 0.7758,
"step": 366
},
{
"epoch": 1.0607375271149675,
"grad_norm": 0.8781758546829224,
"learning_rate": 2.4771341463414636e-05,
"loss": 0.7608,
"step": 367
},
{
"epoch": 1.0636297903109182,
"grad_norm": 0.9088943600654602,
"learning_rate": 2.4695121951219512e-05,
"loss": 0.8187,
"step": 368
},
{
"epoch": 1.0665220535068691,
"grad_norm": 0.8699431419372559,
"learning_rate": 2.461890243902439e-05,
"loss": 0.885,
"step": 369
},
{
"epoch": 1.06941431670282,
"grad_norm": 0.8766498565673828,
"learning_rate": 2.454268292682927e-05,
"loss": 0.8439,
"step": 370
},
{
"epoch": 1.0723065798987708,
"grad_norm": 0.9093021154403687,
"learning_rate": 2.4466463414634146e-05,
"loss": 0.8731,
"step": 371
},
{
"epoch": 1.0751988430947217,
"grad_norm": 0.9020785689353943,
"learning_rate": 2.4390243902439026e-05,
"loss": 0.8291,
"step": 372
},
{
"epoch": 1.0780911062906724,
"grad_norm": 0.8650471568107605,
"learning_rate": 2.43140243902439e-05,
"loss": 0.8439,
"step": 373
},
{
"epoch": 1.0809833694866233,
"grad_norm": 0.9382796883583069,
"learning_rate": 2.423780487804878e-05,
"loss": 0.8312,
"step": 374
},
{
"epoch": 1.083875632682574,
"grad_norm": 0.8890308737754822,
"learning_rate": 2.416158536585366e-05,
"loss": 0.8552,
"step": 375
},
{
"epoch": 1.086767895878525,
"grad_norm": 0.9097614884376526,
"learning_rate": 2.4085365853658536e-05,
"loss": 0.8513,
"step": 376
},
{
"epoch": 1.0896601590744757,
"grad_norm": 0.9238763451576233,
"learning_rate": 2.4009146341463416e-05,
"loss": 0.7782,
"step": 377
},
{
"epoch": 1.0925524222704266,
"grad_norm": 0.917517364025116,
"learning_rate": 2.393292682926829e-05,
"loss": 0.7853,
"step": 378
},
{
"epoch": 1.0954446854663775,
"grad_norm": 0.954457700252533,
"learning_rate": 2.385670731707317e-05,
"loss": 0.8102,
"step": 379
},
{
"epoch": 1.0983369486623282,
"grad_norm": 0.9540069699287415,
"learning_rate": 2.378048780487805e-05,
"loss": 0.8117,
"step": 380
},
{
"epoch": 1.1012292118582792,
"grad_norm": 0.8629953265190125,
"learning_rate": 2.3704268292682926e-05,
"loss": 0.8483,
"step": 381
},
{
"epoch": 1.1041214750542299,
"grad_norm": 0.9152767658233643,
"learning_rate": 2.3628048780487806e-05,
"loss": 0.7391,
"step": 382
},
{
"epoch": 1.1070137382501808,
"grad_norm": 0.9119929671287537,
"learning_rate": 2.355182926829268e-05,
"loss": 0.8084,
"step": 383
},
{
"epoch": 1.1099060014461315,
"grad_norm": 0.9688836932182312,
"learning_rate": 2.347560975609756e-05,
"loss": 0.8794,
"step": 384
},
{
"epoch": 1.1127982646420824,
"grad_norm": 0.8734216094017029,
"learning_rate": 2.339939024390244e-05,
"loss": 0.771,
"step": 385
},
{
"epoch": 1.1156905278380334,
"grad_norm": 0.936385452747345,
"learning_rate": 2.332317073170732e-05,
"loss": 0.843,
"step": 386
},
{
"epoch": 1.118582791033984,
"grad_norm": 0.8708637356758118,
"learning_rate": 2.32469512195122e-05,
"loss": 0.8005,
"step": 387
},
{
"epoch": 1.121475054229935,
"grad_norm": 0.9174913167953491,
"learning_rate": 2.3170731707317075e-05,
"loss": 0.7858,
"step": 388
},
{
"epoch": 1.1243673174258857,
"grad_norm": 0.8793891668319702,
"learning_rate": 2.3094512195121954e-05,
"loss": 0.7827,
"step": 389
},
{
"epoch": 1.1272595806218366,
"grad_norm": 0.9375653266906738,
"learning_rate": 2.301829268292683e-05,
"loss": 0.8587,
"step": 390
},
{
"epoch": 1.1301518438177873,
"grad_norm": 0.9476063251495361,
"learning_rate": 2.294207317073171e-05,
"loss": 0.8222,
"step": 391
},
{
"epoch": 1.1330441070137383,
"grad_norm": 0.8776272535324097,
"learning_rate": 2.286585365853659e-05,
"loss": 0.8089,
"step": 392
},
{
"epoch": 1.1359363702096892,
"grad_norm": 0.8908610343933105,
"learning_rate": 2.2789634146341465e-05,
"loss": 0.8531,
"step": 393
},
{
"epoch": 1.13882863340564,
"grad_norm": 0.9270078539848328,
"learning_rate": 2.2713414634146344e-05,
"loss": 0.8842,
"step": 394
},
{
"epoch": 1.1417208966015908,
"grad_norm": 0.9019871354103088,
"learning_rate": 2.263719512195122e-05,
"loss": 0.7006,
"step": 395
},
{
"epoch": 1.1446131597975415,
"grad_norm": 0.9170034527778625,
"learning_rate": 2.25609756097561e-05,
"loss": 0.8055,
"step": 396
},
{
"epoch": 1.1475054229934925,
"grad_norm": 0.9285536408424377,
"learning_rate": 2.248475609756098e-05,
"loss": 0.8192,
"step": 397
},
{
"epoch": 1.1503976861894432,
"grad_norm": 0.9291247725486755,
"learning_rate": 2.2408536585365855e-05,
"loss": 0.7733,
"step": 398
},
{
"epoch": 1.153289949385394,
"grad_norm": 0.893548846244812,
"learning_rate": 2.2332317073170734e-05,
"loss": 0.8112,
"step": 399
},
{
"epoch": 1.1561822125813448,
"grad_norm": 0.933894693851471,
"learning_rate": 2.225609756097561e-05,
"loss": 0.8244,
"step": 400
},
{
"epoch": 1.1590744757772957,
"grad_norm": 0.8933086395263672,
"learning_rate": 2.217987804878049e-05,
"loss": 0.799,
"step": 401
},
{
"epoch": 1.1619667389732466,
"grad_norm": 0.8862596750259399,
"learning_rate": 2.210365853658537e-05,
"loss": 0.7522,
"step": 402
},
{
"epoch": 1.1648590021691974,
"grad_norm": 0.9892849922180176,
"learning_rate": 2.2027439024390244e-05,
"loss": 0.8144,
"step": 403
},
{
"epoch": 1.1677512653651483,
"grad_norm": 0.8950841426849365,
"learning_rate": 2.1951219512195124e-05,
"loss": 0.8498,
"step": 404
},
{
"epoch": 1.170643528561099,
"grad_norm": 0.9264621734619141,
"learning_rate": 2.1875e-05,
"loss": 0.8619,
"step": 405
},
{
"epoch": 1.17353579175705,
"grad_norm": 0.9350318908691406,
"learning_rate": 2.179878048780488e-05,
"loss": 0.901,
"step": 406
},
{
"epoch": 1.1764280549530008,
"grad_norm": 0.8909422755241394,
"learning_rate": 2.172256097560976e-05,
"loss": 0.7969,
"step": 407
},
{
"epoch": 1.1793203181489516,
"grad_norm": 0.9076801538467407,
"learning_rate": 2.1646341463414634e-05,
"loss": 0.8102,
"step": 408
},
{
"epoch": 1.1822125813449025,
"grad_norm": 0.9365906715393066,
"learning_rate": 2.1570121951219514e-05,
"loss": 0.8216,
"step": 409
},
{
"epoch": 1.1851048445408532,
"grad_norm": 0.9423839449882507,
"learning_rate": 2.149390243902439e-05,
"loss": 0.8007,
"step": 410
},
{
"epoch": 1.1879971077368041,
"grad_norm": 0.9760177135467529,
"learning_rate": 2.141768292682927e-05,
"loss": 0.7394,
"step": 411
},
{
"epoch": 1.1908893709327548,
"grad_norm": 0.9895643591880798,
"learning_rate": 2.134146341463415e-05,
"loss": 0.8613,
"step": 412
},
{
"epoch": 1.1937816341287057,
"grad_norm": 0.9074323177337646,
"learning_rate": 2.1265243902439024e-05,
"loss": 0.7996,
"step": 413
},
{
"epoch": 1.1966738973246565,
"grad_norm": 0.9774613380432129,
"learning_rate": 2.1189024390243904e-05,
"loss": 0.7982,
"step": 414
},
{
"epoch": 1.1995661605206074,
"grad_norm": 0.9536191821098328,
"learning_rate": 2.111280487804878e-05,
"loss": 0.8498,
"step": 415
},
{
"epoch": 1.2024584237165583,
"grad_norm": 0.9640031456947327,
"learning_rate": 2.103658536585366e-05,
"loss": 0.7995,
"step": 416
},
{
"epoch": 1.205350686912509,
"grad_norm": 0.9486613869667053,
"learning_rate": 2.0960365853658538e-05,
"loss": 0.8277,
"step": 417
},
{
"epoch": 1.20824295010846,
"grad_norm": 0.9539316296577454,
"learning_rate": 2.0884146341463414e-05,
"loss": 0.8163,
"step": 418
},
{
"epoch": 1.2111352133044107,
"grad_norm": 0.9421859383583069,
"learning_rate": 2.0807926829268294e-05,
"loss": 0.8645,
"step": 419
},
{
"epoch": 1.2140274765003616,
"grad_norm": 0.9420467615127563,
"learning_rate": 2.073170731707317e-05,
"loss": 0.7646,
"step": 420
},
{
"epoch": 1.2169197396963123,
"grad_norm": 0.8715965151786804,
"learning_rate": 2.065548780487805e-05,
"loss": 0.819,
"step": 421
},
{
"epoch": 1.2198120028922632,
"grad_norm": 0.8634954690933228,
"learning_rate": 2.0579268292682928e-05,
"loss": 0.8478,
"step": 422
},
{
"epoch": 1.222704266088214,
"grad_norm": 0.9214886426925659,
"learning_rate": 2.0503048780487804e-05,
"loss": 0.8249,
"step": 423
},
{
"epoch": 1.2255965292841648,
"grad_norm": 0.9319393634796143,
"learning_rate": 2.0426829268292683e-05,
"loss": 0.8251,
"step": 424
},
{
"epoch": 1.2284887924801158,
"grad_norm": 0.9580456018447876,
"learning_rate": 2.035060975609756e-05,
"loss": 0.8139,
"step": 425
},
{
"epoch": 1.2313810556760665,
"grad_norm": 0.9004295468330383,
"learning_rate": 2.0274390243902442e-05,
"loss": 0.7768,
"step": 426
},
{
"epoch": 1.2342733188720174,
"grad_norm": 0.9250595569610596,
"learning_rate": 2.0198170731707318e-05,
"loss": 0.7709,
"step": 427
},
{
"epoch": 1.2371655820679681,
"grad_norm": 0.9740453362464905,
"learning_rate": 2.0121951219512197e-05,
"loss": 0.8407,
"step": 428
},
{
"epoch": 1.240057845263919,
"grad_norm": 0.9681423306465149,
"learning_rate": 2.0045731707317077e-05,
"loss": 0.7929,
"step": 429
},
{
"epoch": 1.2429501084598698,
"grad_norm": 0.9964022040367126,
"learning_rate": 1.9969512195121953e-05,
"loss": 0.7823,
"step": 430
},
{
"epoch": 1.2458423716558207,
"grad_norm": 1.0318474769592285,
"learning_rate": 1.9893292682926832e-05,
"loss": 0.8579,
"step": 431
},
{
"epoch": 1.2487346348517716,
"grad_norm": 0.9292550086975098,
"learning_rate": 1.9817073170731708e-05,
"loss": 0.815,
"step": 432
},
{
"epoch": 1.2516268980477223,
"grad_norm": 0.9619131088256836,
"learning_rate": 1.9740853658536587e-05,
"loss": 0.8136,
"step": 433
},
{
"epoch": 1.2545191612436732,
"grad_norm": 0.9113368391990662,
"learning_rate": 1.9664634146341467e-05,
"loss": 0.7857,
"step": 434
},
{
"epoch": 1.257411424439624,
"grad_norm": 0.9458669424057007,
"learning_rate": 1.9588414634146343e-05,
"loss": 0.8051,
"step": 435
},
{
"epoch": 1.2603036876355749,
"grad_norm": 0.9174255132675171,
"learning_rate": 1.9512195121951222e-05,
"loss": 0.8014,
"step": 436
},
{
"epoch": 1.2631959508315256,
"grad_norm": 0.961124837398529,
"learning_rate": 1.9435975609756098e-05,
"loss": 0.8441,
"step": 437
},
{
"epoch": 1.2660882140274765,
"grad_norm": 1.0305391550064087,
"learning_rate": 1.9359756097560977e-05,
"loss": 0.8183,
"step": 438
},
{
"epoch": 1.2689804772234274,
"grad_norm": 0.939954936504364,
"learning_rate": 1.9283536585365857e-05,
"loss": 0.7894,
"step": 439
},
{
"epoch": 1.2718727404193781,
"grad_norm": 0.921103835105896,
"learning_rate": 1.9207317073170733e-05,
"loss": 0.7405,
"step": 440
},
{
"epoch": 1.274765003615329,
"grad_norm": 0.926176130771637,
"learning_rate": 1.9131097560975612e-05,
"loss": 0.7853,
"step": 441
},
{
"epoch": 1.2776572668112798,
"grad_norm": 0.9235204458236694,
"learning_rate": 1.9054878048780488e-05,
"loss": 0.8532,
"step": 442
},
{
"epoch": 1.2805495300072307,
"grad_norm": 0.9539816975593567,
"learning_rate": 1.8978658536585367e-05,
"loss": 0.7904,
"step": 443
},
{
"epoch": 1.2834417932031814,
"grad_norm": 0.9811721444129944,
"learning_rate": 1.8902439024390246e-05,
"loss": 0.824,
"step": 444
},
{
"epoch": 1.2863340563991323,
"grad_norm": 0.900104284286499,
"learning_rate": 1.8826219512195122e-05,
"loss": 0.762,
"step": 445
},
{
"epoch": 1.289226319595083,
"grad_norm": 0.9972739815711975,
"learning_rate": 1.8750000000000002e-05,
"loss": 0.8043,
"step": 446
},
{
"epoch": 1.292118582791034,
"grad_norm": 0.9787886738777161,
"learning_rate": 1.8673780487804878e-05,
"loss": 0.8379,
"step": 447
},
{
"epoch": 1.295010845986985,
"grad_norm": 1.0129365921020508,
"learning_rate": 1.8597560975609757e-05,
"loss": 0.8211,
"step": 448
},
{
"epoch": 1.2979031091829356,
"grad_norm": 0.9614445567131042,
"learning_rate": 1.8521341463414636e-05,
"loss": 0.811,
"step": 449
},
{
"epoch": 1.3007953723788865,
"grad_norm": 0.9432827830314636,
"learning_rate": 1.8445121951219512e-05,
"loss": 0.8049,
"step": 450
},
{
"epoch": 1.3036876355748372,
"grad_norm": 0.9323035478591919,
"learning_rate": 1.836890243902439e-05,
"loss": 0.8285,
"step": 451
},
{
"epoch": 1.3065798987707882,
"grad_norm": 0.979387640953064,
"learning_rate": 1.8292682926829268e-05,
"loss": 0.833,
"step": 452
},
{
"epoch": 1.309472161966739,
"grad_norm": 0.9406694173812866,
"learning_rate": 1.8216463414634147e-05,
"loss": 0.823,
"step": 453
},
{
"epoch": 1.3123644251626898,
"grad_norm": 0.9428540468215942,
"learning_rate": 1.8140243902439026e-05,
"loss": 0.7691,
"step": 454
},
{
"epoch": 1.3152566883586405,
"grad_norm": 0.9734871983528137,
"learning_rate": 1.8064024390243902e-05,
"loss": 0.7952,
"step": 455
},
{
"epoch": 1.3181489515545914,
"grad_norm": 0.9358460307121277,
"learning_rate": 1.798780487804878e-05,
"loss": 0.7799,
"step": 456
},
{
"epoch": 1.3210412147505424,
"grad_norm": 0.9847381711006165,
"learning_rate": 1.7911585365853658e-05,
"loss": 0.8272,
"step": 457
},
{
"epoch": 1.323933477946493,
"grad_norm": 1.0185282230377197,
"learning_rate": 1.7835365853658537e-05,
"loss": 0.7397,
"step": 458
},
{
"epoch": 1.326825741142444,
"grad_norm": 1.019514560699463,
"learning_rate": 1.7759146341463416e-05,
"loss": 0.8922,
"step": 459
},
{
"epoch": 1.3297180043383947,
"grad_norm": 1.0088555812835693,
"learning_rate": 1.7682926829268292e-05,
"loss": 0.8657,
"step": 460
},
{
"epoch": 1.3326102675343456,
"grad_norm": 0.9719268679618835,
"learning_rate": 1.760670731707317e-05,
"loss": 0.8074,
"step": 461
},
{
"epoch": 1.3355025307302966,
"grad_norm": 0.9707063436508179,
"learning_rate": 1.7530487804878047e-05,
"loss": 0.7983,
"step": 462
},
{
"epoch": 1.3383947939262473,
"grad_norm": 1.0087740421295166,
"learning_rate": 1.7454268292682927e-05,
"loss": 0.8205,
"step": 463
},
{
"epoch": 1.3412870571221982,
"grad_norm": 0.957075297832489,
"learning_rate": 1.7378048780487806e-05,
"loss": 0.8248,
"step": 464
},
{
"epoch": 1.344179320318149,
"grad_norm": 0.9987917542457581,
"learning_rate": 1.7301829268292682e-05,
"loss": 0.8194,
"step": 465
},
{
"epoch": 1.3470715835140998,
"grad_norm": 0.959826648235321,
"learning_rate": 1.722560975609756e-05,
"loss": 0.754,
"step": 466
},
{
"epoch": 1.3499638467100505,
"grad_norm": 0.9746386408805847,
"learning_rate": 1.714939024390244e-05,
"loss": 0.7998,
"step": 467
},
{
"epoch": 1.3528561099060015,
"grad_norm": 0.9507508873939514,
"learning_rate": 1.707317073170732e-05,
"loss": 0.7447,
"step": 468
},
{
"epoch": 1.3557483731019522,
"grad_norm": 1.0092105865478516,
"learning_rate": 1.6996951219512196e-05,
"loss": 0.8063,
"step": 469
},
{
"epoch": 1.358640636297903,
"grad_norm": 0.973320484161377,
"learning_rate": 1.6920731707317075e-05,
"loss": 0.7818,
"step": 470
},
{
"epoch": 1.361532899493854,
"grad_norm": 0.9913963675498962,
"learning_rate": 1.6844512195121955e-05,
"loss": 0.8006,
"step": 471
},
{
"epoch": 1.3644251626898047,
"grad_norm": 1.0580593347549438,
"learning_rate": 1.676829268292683e-05,
"loss": 0.8488,
"step": 472
},
{
"epoch": 1.3673174258857557,
"grad_norm": 0.9785270094871521,
"learning_rate": 1.669207317073171e-05,
"loss": 0.8249,
"step": 473
},
{
"epoch": 1.3702096890817064,
"grad_norm": 0.981171727180481,
"learning_rate": 1.6615853658536586e-05,
"loss": 0.7762,
"step": 474
},
{
"epoch": 1.3731019522776573,
"grad_norm": 1.0523923635482788,
"learning_rate": 1.6539634146341465e-05,
"loss": 0.7582,
"step": 475
},
{
"epoch": 1.3759942154736082,
"grad_norm": 1.0290507078170776,
"learning_rate": 1.6463414634146345e-05,
"loss": 0.7927,
"step": 476
},
{
"epoch": 1.378886478669559,
"grad_norm": 0.9900729060173035,
"learning_rate": 1.638719512195122e-05,
"loss": 0.7436,
"step": 477
},
{
"epoch": 1.3817787418655096,
"grad_norm": 0.9794175028800964,
"learning_rate": 1.63109756097561e-05,
"loss": 0.7744,
"step": 478
},
{
"epoch": 1.3846710050614606,
"grad_norm": 1.0114864110946655,
"learning_rate": 1.6234756097560976e-05,
"loss": 0.8683,
"step": 479
},
{
"epoch": 1.3875632682574115,
"grad_norm": 1.026435375213623,
"learning_rate": 1.6158536585365855e-05,
"loss": 0.8049,
"step": 480
},
{
"epoch": 1.3904555314533622,
"grad_norm": 1.0069879293441772,
"learning_rate": 1.6082317073170734e-05,
"loss": 0.9052,
"step": 481
},
{
"epoch": 1.3933477946493131,
"grad_norm": 0.9856945276260376,
"learning_rate": 1.600609756097561e-05,
"loss": 0.8129,
"step": 482
},
{
"epoch": 1.3962400578452638,
"grad_norm": 0.9632019400596619,
"learning_rate": 1.592987804878049e-05,
"loss": 0.7651,
"step": 483
},
{
"epoch": 1.3991323210412148,
"grad_norm": 0.9180967807769775,
"learning_rate": 1.5853658536585366e-05,
"loss": 0.7798,
"step": 484
},
{
"epoch": 1.4020245842371657,
"grad_norm": 0.9854956269264221,
"learning_rate": 1.5777439024390245e-05,
"loss": 0.7869,
"step": 485
},
{
"epoch": 1.4049168474331164,
"grad_norm": 0.9699094891548157,
"learning_rate": 1.5701219512195124e-05,
"loss": 0.7424,
"step": 486
},
{
"epoch": 1.407809110629067,
"grad_norm": 1.0167737007141113,
"learning_rate": 1.5625e-05,
"loss": 0.8369,
"step": 487
},
{
"epoch": 1.410701373825018,
"grad_norm": 0.9676855802536011,
"learning_rate": 1.554878048780488e-05,
"loss": 0.8397,
"step": 488
},
{
"epoch": 1.413593637020969,
"grad_norm": 0.974721372127533,
"learning_rate": 1.5472560975609756e-05,
"loss": 0.7772,
"step": 489
},
{
"epoch": 1.4164859002169197,
"grad_norm": 0.981971800327301,
"learning_rate": 1.5396341463414635e-05,
"loss": 0.8626,
"step": 490
},
{
"epoch": 1.4193781634128706,
"grad_norm": 1.004634976387024,
"learning_rate": 1.5320121951219514e-05,
"loss": 0.7482,
"step": 491
},
{
"epoch": 1.4222704266088213,
"grad_norm": 0.995227575302124,
"learning_rate": 1.524390243902439e-05,
"loss": 0.7898,
"step": 492
},
{
"epoch": 1.4251626898047722,
"grad_norm": 0.9808421730995178,
"learning_rate": 1.5167682926829268e-05,
"loss": 0.7677,
"step": 493
},
{
"epoch": 1.4280549530007232,
"grad_norm": 0.9480452537536621,
"learning_rate": 1.5091463414634147e-05,
"loss": 0.7852,
"step": 494
},
{
"epoch": 1.4309472161966739,
"grad_norm": 0.9107538461685181,
"learning_rate": 1.5015243902439025e-05,
"loss": 0.8798,
"step": 495
},
{
"epoch": 1.4338394793926248,
"grad_norm": 0.9696621894836426,
"learning_rate": 1.4939024390243902e-05,
"loss": 0.8056,
"step": 496
},
{
"epoch": 1.4367317425885755,
"grad_norm": 1.025511384010315,
"learning_rate": 1.486280487804878e-05,
"loss": 0.8319,
"step": 497
},
{
"epoch": 1.4396240057845264,
"grad_norm": 0.9872826337814331,
"learning_rate": 1.4786585365853658e-05,
"loss": 0.7518,
"step": 498
},
{
"epoch": 1.4425162689804774,
"grad_norm": 0.9867232441902161,
"learning_rate": 1.4710365853658537e-05,
"loss": 0.7372,
"step": 499
},
{
"epoch": 1.445408532176428,
"grad_norm": 1.0221909284591675,
"learning_rate": 1.4634146341463415e-05,
"loss": 0.764,
"step": 500
},
{
"epoch": 1.4483007953723788,
"grad_norm": 0.9744577407836914,
"learning_rate": 1.4557926829268292e-05,
"loss": 0.7816,
"step": 501
},
{
"epoch": 1.4511930585683297,
"grad_norm": 0.9650794267654419,
"learning_rate": 1.448170731707317e-05,
"loss": 0.7687,
"step": 502
},
{
"epoch": 1.4540853217642806,
"grad_norm": 1.067771077156067,
"learning_rate": 1.4405487804878048e-05,
"loss": 0.7803,
"step": 503
},
{
"epoch": 1.4569775849602313,
"grad_norm": 1.0217148065567017,
"learning_rate": 1.4329268292682927e-05,
"loss": 0.8766,
"step": 504
},
{
"epoch": 1.4598698481561823,
"grad_norm": 0.9869562983512878,
"learning_rate": 1.4253048780487805e-05,
"loss": 0.7447,
"step": 505
},
{
"epoch": 1.462762111352133,
"grad_norm": 1.004603385925293,
"learning_rate": 1.4176829268292682e-05,
"loss": 0.7725,
"step": 506
},
{
"epoch": 1.465654374548084,
"grad_norm": 1.0009071826934814,
"learning_rate": 1.410060975609756e-05,
"loss": 0.8871,
"step": 507
},
{
"epoch": 1.4685466377440348,
"grad_norm": 1.0561660528182983,
"learning_rate": 1.4024390243902441e-05,
"loss": 0.7484,
"step": 508
},
{
"epoch": 1.4714389009399855,
"grad_norm": 0.9575408101081848,
"learning_rate": 1.3948170731707319e-05,
"loss": 0.7578,
"step": 509
},
{
"epoch": 1.4743311641359362,
"grad_norm": 1.0391199588775635,
"learning_rate": 1.3871951219512196e-05,
"loss": 0.8065,
"step": 510
},
{
"epoch": 1.4772234273318872,
"grad_norm": 1.00908625125885,
"learning_rate": 1.3795731707317076e-05,
"loss": 0.7456,
"step": 511
},
{
"epoch": 1.480115690527838,
"grad_norm": 0.9751247763633728,
"learning_rate": 1.3719512195121953e-05,
"loss": 0.6815,
"step": 512
},
{
"epoch": 1.4830079537237888,
"grad_norm": 1.007405161857605,
"learning_rate": 1.3643292682926831e-05,
"loss": 0.7728,
"step": 513
},
{
"epoch": 1.4859002169197397,
"grad_norm": 0.9923568964004517,
"learning_rate": 1.3567073170731709e-05,
"loss": 0.7887,
"step": 514
},
{
"epoch": 1.4887924801156904,
"grad_norm": 0.9783514142036438,
"learning_rate": 1.3490853658536586e-05,
"loss": 0.8677,
"step": 515
},
{
"epoch": 1.4916847433116414,
"grad_norm": 0.9877396821975708,
"learning_rate": 1.3414634146341466e-05,
"loss": 0.8264,
"step": 516
},
{
"epoch": 1.4945770065075923,
"grad_norm": 0.973827600479126,
"learning_rate": 1.3338414634146343e-05,
"loss": 0.8344,
"step": 517
},
{
"epoch": 1.497469269703543,
"grad_norm": 0.9245984554290771,
"learning_rate": 1.326219512195122e-05,
"loss": 0.7671,
"step": 518
},
{
"epoch": 1.5003615328994937,
"grad_norm": 1.0020720958709717,
"learning_rate": 1.3185975609756098e-05,
"loss": 0.794,
"step": 519
},
{
"epoch": 1.5032537960954446,
"grad_norm": 0.9446883797645569,
"learning_rate": 1.3109756097560976e-05,
"loss": 0.7783,
"step": 520
},
{
"epoch": 1.5061460592913956,
"grad_norm": 0.9875244498252869,
"learning_rate": 1.3033536585365855e-05,
"loss": 0.8469,
"step": 521
},
{
"epoch": 1.5090383224873465,
"grad_norm": 1.0033190250396729,
"learning_rate": 1.2957317073170733e-05,
"loss": 0.8749,
"step": 522
},
{
"epoch": 1.5119305856832972,
"grad_norm": 0.9534813165664673,
"learning_rate": 1.288109756097561e-05,
"loss": 0.8684,
"step": 523
},
{
"epoch": 1.514822848879248,
"grad_norm": 0.9435486793518066,
"learning_rate": 1.2804878048780488e-05,
"loss": 0.8012,
"step": 524
},
{
"epoch": 1.5177151120751988,
"grad_norm": 1.0029319524765015,
"learning_rate": 1.2728658536585366e-05,
"loss": 0.762,
"step": 525
},
{
"epoch": 1.5206073752711498,
"grad_norm": 1.0000132322311401,
"learning_rate": 1.2652439024390245e-05,
"loss": 0.7812,
"step": 526
},
{
"epoch": 1.5234996384671005,
"grad_norm": 0.9410236477851868,
"learning_rate": 1.2576219512195123e-05,
"loss": 0.775,
"step": 527
},
{
"epoch": 1.5263919016630514,
"grad_norm": 0.9614347815513611,
"learning_rate": 1.25e-05,
"loss": 0.7783,
"step": 528
},
{
"epoch": 1.529284164859002,
"grad_norm": 0.9015387296676636,
"learning_rate": 1.2423780487804878e-05,
"loss": 0.7767,
"step": 529
},
{
"epoch": 1.532176428054953,
"grad_norm": 0.9506531357765198,
"learning_rate": 1.2347560975609756e-05,
"loss": 0.7928,
"step": 530
},
{
"epoch": 1.535068691250904,
"grad_norm": 1.0034101009368896,
"learning_rate": 1.2271341463414635e-05,
"loss": 0.794,
"step": 531
},
{
"epoch": 1.5379609544468547,
"grad_norm": 1.0089356899261475,
"learning_rate": 1.2195121951219513e-05,
"loss": 0.7306,
"step": 532
},
{
"epoch": 1.5408532176428054,
"grad_norm": 1.0234556198120117,
"learning_rate": 1.211890243902439e-05,
"loss": 0.7613,
"step": 533
},
{
"epoch": 1.5437454808387563,
"grad_norm": 0.9771298170089722,
"learning_rate": 1.2042682926829268e-05,
"loss": 0.7869,
"step": 534
},
{
"epoch": 1.5466377440347072,
"grad_norm": 1.019014835357666,
"learning_rate": 1.1966463414634146e-05,
"loss": 0.8096,
"step": 535
},
{
"epoch": 1.5495300072306581,
"grad_norm": 0.95261150598526,
"learning_rate": 1.1890243902439025e-05,
"loss": 0.843,
"step": 536
},
{
"epoch": 1.5524222704266089,
"grad_norm": 0.9801099300384521,
"learning_rate": 1.1814024390243903e-05,
"loss": 0.7219,
"step": 537
},
{
"epoch": 1.5553145336225596,
"grad_norm": 1.0174713134765625,
"learning_rate": 1.173780487804878e-05,
"loss": 0.787,
"step": 538
},
{
"epoch": 1.5582067968185105,
"grad_norm": 1.119850754737854,
"learning_rate": 1.166158536585366e-05,
"loss": 0.8341,
"step": 539
},
{
"epoch": 1.5610990600144614,
"grad_norm": 0.996792733669281,
"learning_rate": 1.1585365853658537e-05,
"loss": 0.8291,
"step": 540
},
{
"epoch": 1.5639913232104121,
"grad_norm": 1.0276952981948853,
"learning_rate": 1.1509146341463415e-05,
"loss": 0.7911,
"step": 541
},
{
"epoch": 1.5668835864063628,
"grad_norm": 0.9893227815628052,
"learning_rate": 1.1432926829268294e-05,
"loss": 0.8017,
"step": 542
},
{
"epoch": 1.5697758496023138,
"grad_norm": 1.0083463191986084,
"learning_rate": 1.1356707317073172e-05,
"loss": 0.8681,
"step": 543
},
{
"epoch": 1.5726681127982647,
"grad_norm": 1.0352839231491089,
"learning_rate": 1.128048780487805e-05,
"loss": 0.7451,
"step": 544
},
{
"epoch": 1.5755603759942156,
"grad_norm": 1.0231815576553345,
"learning_rate": 1.1204268292682927e-05,
"loss": 0.7971,
"step": 545
},
{
"epoch": 1.5784526391901663,
"grad_norm": 0.9740004539489746,
"learning_rate": 1.1128048780487805e-05,
"loss": 0.7174,
"step": 546
},
{
"epoch": 1.581344902386117,
"grad_norm": 0.9921448826789856,
"learning_rate": 1.1051829268292684e-05,
"loss": 0.7669,
"step": 547
},
{
"epoch": 1.584237165582068,
"grad_norm": 0.9635536670684814,
"learning_rate": 1.0975609756097562e-05,
"loss": 0.7851,
"step": 548
},
{
"epoch": 1.5871294287780189,
"grad_norm": 0.9930370450019836,
"learning_rate": 1.089939024390244e-05,
"loss": 0.749,
"step": 549
},
{
"epoch": 1.5900216919739696,
"grad_norm": 1.0188409090042114,
"learning_rate": 1.0823170731707317e-05,
"loss": 0.8287,
"step": 550
},
{
"epoch": 1.5929139551699205,
"grad_norm": 0.9855648875236511,
"learning_rate": 1.0746951219512195e-05,
"loss": 0.7985,
"step": 551
},
{
"epoch": 1.5958062183658712,
"grad_norm": 1.0312644243240356,
"learning_rate": 1.0670731707317074e-05,
"loss": 0.824,
"step": 552
},
{
"epoch": 1.5986984815618221,
"grad_norm": 0.9914786219596863,
"learning_rate": 1.0594512195121952e-05,
"loss": 0.8491,
"step": 553
},
{
"epoch": 1.601590744757773,
"grad_norm": 1.0038225650787354,
"learning_rate": 1.051829268292683e-05,
"loss": 0.8882,
"step": 554
},
{
"epoch": 1.6044830079537238,
"grad_norm": 1.0336111783981323,
"learning_rate": 1.0442073170731707e-05,
"loss": 0.7973,
"step": 555
},
{
"epoch": 1.6073752711496745,
"grad_norm": 0.9833325743675232,
"learning_rate": 1.0365853658536585e-05,
"loss": 0.7918,
"step": 556
},
{
"epoch": 1.6102675343456254,
"grad_norm": 1.0113708972930908,
"learning_rate": 1.0289634146341464e-05,
"loss": 0.803,
"step": 557
},
{
"epoch": 1.6131597975415763,
"grad_norm": 1.0248537063598633,
"learning_rate": 1.0213414634146342e-05,
"loss": 0.8015,
"step": 558
},
{
"epoch": 1.6160520607375273,
"grad_norm": 0.9835037589073181,
"learning_rate": 1.0137195121951221e-05,
"loss": 0.7493,
"step": 559
},
{
"epoch": 1.618944323933478,
"grad_norm": 0.9587700963020325,
"learning_rate": 1.0060975609756099e-05,
"loss": 0.7041,
"step": 560
},
{
"epoch": 1.6218365871294287,
"grad_norm": 1.0020424127578735,
"learning_rate": 9.984756097560976e-06,
"loss": 0.7743,
"step": 561
},
{
"epoch": 1.6247288503253796,
"grad_norm": 1.0215778350830078,
"learning_rate": 9.908536585365854e-06,
"loss": 0.9143,
"step": 562
},
{
"epoch": 1.6276211135213305,
"grad_norm": 1.05181086063385,
"learning_rate": 9.832317073170733e-06,
"loss": 0.7612,
"step": 563
},
{
"epoch": 1.6305133767172812,
"grad_norm": 0.9703447222709656,
"learning_rate": 9.756097560975611e-06,
"loss": 0.7819,
"step": 564
},
{
"epoch": 1.633405639913232,
"grad_norm": 1.0287517309188843,
"learning_rate": 9.679878048780489e-06,
"loss": 0.8443,
"step": 565
},
{
"epoch": 1.6362979031091829,
"grad_norm": 1.0159296989440918,
"learning_rate": 9.603658536585366e-06,
"loss": 0.7781,
"step": 566
},
{
"epoch": 1.6391901663051338,
"grad_norm": 1.0067027807235718,
"learning_rate": 9.527439024390244e-06,
"loss": 0.7417,
"step": 567
},
{
"epoch": 1.6420824295010847,
"grad_norm": 1.067325472831726,
"learning_rate": 9.451219512195123e-06,
"loss": 0.856,
"step": 568
},
{
"epoch": 1.6449746926970354,
"grad_norm": 1.0160930156707764,
"learning_rate": 9.375000000000001e-06,
"loss": 0.856,
"step": 569
},
{
"epoch": 1.6478669558929862,
"grad_norm": 0.9937707781791687,
"learning_rate": 9.298780487804879e-06,
"loss": 0.7341,
"step": 570
},
{
"epoch": 1.650759219088937,
"grad_norm": 1.0597978830337524,
"learning_rate": 9.222560975609756e-06,
"loss": 0.7363,
"step": 571
},
{
"epoch": 1.653651482284888,
"grad_norm": 1.0080229043960571,
"learning_rate": 9.146341463414634e-06,
"loss": 0.7734,
"step": 572
},
{
"epoch": 1.6565437454808387,
"grad_norm": 1.0394561290740967,
"learning_rate": 9.070121951219513e-06,
"loss": 0.8179,
"step": 573
},
{
"epoch": 1.6594360086767896,
"grad_norm": 1.0613329410552979,
"learning_rate": 8.99390243902439e-06,
"loss": 0.8376,
"step": 574
},
{
"epoch": 1.6623282718727403,
"grad_norm": 1.0188164710998535,
"learning_rate": 8.917682926829268e-06,
"loss": 0.7931,
"step": 575
},
{
"epoch": 1.6652205350686913,
"grad_norm": 0.9689257740974426,
"learning_rate": 8.841463414634146e-06,
"loss": 0.8066,
"step": 576
},
{
"epoch": 1.6681127982646422,
"grad_norm": 0.9878205060958862,
"learning_rate": 8.765243902439024e-06,
"loss": 0.7386,
"step": 577
},
{
"epoch": 1.671005061460593,
"grad_norm": 0.9607040286064148,
"learning_rate": 8.689024390243903e-06,
"loss": 0.7762,
"step": 578
},
{
"epoch": 1.6738973246565436,
"grad_norm": 0.934492290019989,
"learning_rate": 8.61280487804878e-06,
"loss": 0.8317,
"step": 579
},
{
"epoch": 1.6767895878524945,
"grad_norm": 1.0009124279022217,
"learning_rate": 8.53658536585366e-06,
"loss": 0.7755,
"step": 580
},
{
"epoch": 1.6796818510484455,
"grad_norm": 0.9868451952934265,
"learning_rate": 8.460365853658538e-06,
"loss": 0.7688,
"step": 581
},
{
"epoch": 1.6825741142443964,
"grad_norm": 1.0356996059417725,
"learning_rate": 8.384146341463415e-06,
"loss": 0.7601,
"step": 582
},
{
"epoch": 1.685466377440347,
"grad_norm": 1.0577391386032104,
"learning_rate": 8.307926829268293e-06,
"loss": 0.7847,
"step": 583
},
{
"epoch": 1.6883586406362978,
"grad_norm": 1.0306715965270996,
"learning_rate": 8.231707317073172e-06,
"loss": 0.8193,
"step": 584
},
{
"epoch": 1.6912509038322487,
"grad_norm": 1.04917311668396,
"learning_rate": 8.15548780487805e-06,
"loss": 0.7714,
"step": 585
},
{
"epoch": 1.6941431670281997,
"grad_norm": 0.9596878290176392,
"learning_rate": 8.079268292682928e-06,
"loss": 0.8267,
"step": 586
},
{
"epoch": 1.6970354302241504,
"grad_norm": 1.041686773300171,
"learning_rate": 8.003048780487805e-06,
"loss": 0.7706,
"step": 587
},
{
"epoch": 1.699927693420101,
"grad_norm": 1.0023382902145386,
"learning_rate": 7.926829268292683e-06,
"loss": 0.8456,
"step": 588
},
{
"epoch": 1.702819956616052,
"grad_norm": 1.009926438331604,
"learning_rate": 7.850609756097562e-06,
"loss": 0.7796,
"step": 589
},
{
"epoch": 1.705712219812003,
"grad_norm": 1.0054479837417603,
"learning_rate": 7.77439024390244e-06,
"loss": 0.7221,
"step": 590
},
{
"epoch": 1.7086044830079539,
"grad_norm": 0.9531407952308655,
"learning_rate": 7.698170731707317e-06,
"loss": 0.7801,
"step": 591
},
{
"epoch": 1.7114967462039046,
"grad_norm": 1.0707489252090454,
"learning_rate": 7.621951219512195e-06,
"loss": 0.8474,
"step": 592
},
{
"epoch": 1.7143890093998553,
"grad_norm": 1.0391806364059448,
"learning_rate": 7.545731707317074e-06,
"loss": 0.8122,
"step": 593
},
{
"epoch": 1.7172812725958062,
"grad_norm": 0.9896015524864197,
"learning_rate": 7.469512195121951e-06,
"loss": 0.8505,
"step": 594
},
{
"epoch": 1.7201735357917571,
"grad_norm": 1.122521162033081,
"learning_rate": 7.393292682926829e-06,
"loss": 0.878,
"step": 595
},
{
"epoch": 1.7230657989877078,
"grad_norm": 1.0091516971588135,
"learning_rate": 7.317073170731707e-06,
"loss": 0.7846,
"step": 596
},
{
"epoch": 1.7259580621836585,
"grad_norm": 0.9725529551506042,
"learning_rate": 7.240853658536585e-06,
"loss": 0.8274,
"step": 597
},
{
"epoch": 1.7288503253796095,
"grad_norm": 1.0169364213943481,
"learning_rate": 7.1646341463414635e-06,
"loss": 0.9092,
"step": 598
},
{
"epoch": 1.7317425885755604,
"grad_norm": 0.9752337336540222,
"learning_rate": 7.088414634146341e-06,
"loss": 0.7489,
"step": 599
},
{
"epoch": 1.7346348517715113,
"grad_norm": 1.0482772588729858,
"learning_rate": 7.0121951219512205e-06,
"loss": 0.7379,
"step": 600
},
{
"epoch": 1.737527114967462,
"grad_norm": 0.9847067594528198,
"learning_rate": 6.935975609756098e-06,
"loss": 0.7102,
"step": 601
},
{
"epoch": 1.7404193781634127,
"grad_norm": 0.9766717553138733,
"learning_rate": 6.859756097560977e-06,
"loss": 0.8012,
"step": 602
},
{
"epoch": 1.7433116413593637,
"grad_norm": 0.9498171806335449,
"learning_rate": 6.783536585365854e-06,
"loss": 0.7409,
"step": 603
},
{
"epoch": 1.7462039045553146,
"grad_norm": 1.0003339052200317,
"learning_rate": 6.707317073170733e-06,
"loss": 0.7585,
"step": 604
},
{
"epoch": 1.7490961677512655,
"grad_norm": 1.0416187047958374,
"learning_rate": 6.63109756097561e-06,
"loss": 0.7591,
"step": 605
},
{
"epoch": 1.7519884309472162,
"grad_norm": 0.9981351494789124,
"learning_rate": 6.554878048780488e-06,
"loss": 0.741,
"step": 606
},
{
"epoch": 1.754880694143167,
"grad_norm": 0.998756468296051,
"learning_rate": 6.4786585365853665e-06,
"loss": 0.8408,
"step": 607
},
{
"epoch": 1.7577729573391179,
"grad_norm": 1.0053471326828003,
"learning_rate": 6.402439024390244e-06,
"loss": 0.7636,
"step": 608
},
{
"epoch": 1.7606652205350688,
"grad_norm": 1.0228371620178223,
"learning_rate": 6.326219512195123e-06,
"loss": 0.7811,
"step": 609
},
{
"epoch": 1.7635574837310195,
"grad_norm": 1.0302461385726929,
"learning_rate": 6.25e-06,
"loss": 0.7339,
"step": 610
},
{
"epoch": 1.7664497469269702,
"grad_norm": 1.0541510581970215,
"learning_rate": 6.173780487804878e-06,
"loss": 0.7718,
"step": 611
},
{
"epoch": 1.7693420101229211,
"grad_norm": 0.9746615290641785,
"learning_rate": 6.0975609756097564e-06,
"loss": 0.849,
"step": 612
},
{
"epoch": 1.772234273318872,
"grad_norm": 0.9652546048164368,
"learning_rate": 6.021341463414634e-06,
"loss": 0.8287,
"step": 613
},
{
"epoch": 1.775126536514823,
"grad_norm": 1.0296525955200195,
"learning_rate": 5.9451219512195126e-06,
"loss": 0.7493,
"step": 614
},
{
"epoch": 1.7780187997107737,
"grad_norm": 1.045018196105957,
"learning_rate": 5.86890243902439e-06,
"loss": 0.7284,
"step": 615
},
{
"epoch": 1.7809110629067244,
"grad_norm": 1.0308400392532349,
"learning_rate": 5.792682926829269e-06,
"loss": 0.8641,
"step": 616
},
{
"epoch": 1.7838033261026753,
"grad_norm": 1.0580596923828125,
"learning_rate": 5.716463414634147e-06,
"loss": 0.8282,
"step": 617
},
{
"epoch": 1.7866955892986263,
"grad_norm": 1.0240721702575684,
"learning_rate": 5.640243902439025e-06,
"loss": 0.765,
"step": 618
},
{
"epoch": 1.789587852494577,
"grad_norm": 1.0127959251403809,
"learning_rate": 5.5640243902439025e-06,
"loss": 0.7923,
"step": 619
},
{
"epoch": 1.7924801156905277,
"grad_norm": 1.1011825799942017,
"learning_rate": 5.487804878048781e-06,
"loss": 0.7251,
"step": 620
},
{
"epoch": 1.7953723788864786,
"grad_norm": 1.0520384311676025,
"learning_rate": 5.411585365853659e-06,
"loss": 0.7217,
"step": 621
},
{
"epoch": 1.7982646420824295,
"grad_norm": 1.0805737972259521,
"learning_rate": 5.335365853658537e-06,
"loss": 0.8411,
"step": 622
},
{
"epoch": 1.8011569052783805,
"grad_norm": 1.0442290306091309,
"learning_rate": 5.259146341463415e-06,
"loss": 0.7386,
"step": 623
},
{
"epoch": 1.8040491684743312,
"grad_norm": 1.0919840335845947,
"learning_rate": 5.182926829268292e-06,
"loss": 0.7858,
"step": 624
},
{
"epoch": 1.8069414316702819,
"grad_norm": 0.9759023785591125,
"learning_rate": 5.106707317073171e-06,
"loss": 0.697,
"step": 625
},
{
"epoch": 1.8098336948662328,
"grad_norm": 1.017999291419983,
"learning_rate": 5.030487804878049e-06,
"loss": 0.8095,
"step": 626
},
{
"epoch": 1.8127259580621837,
"grad_norm": 1.0746080875396729,
"learning_rate": 4.954268292682927e-06,
"loss": 0.7828,
"step": 627
},
{
"epoch": 1.8156182212581344,
"grad_norm": 1.0229034423828125,
"learning_rate": 4.8780487804878055e-06,
"loss": 0.8028,
"step": 628
},
{
"epoch": 1.8185104844540854,
"grad_norm": 1.0520620346069336,
"learning_rate": 4.801829268292683e-06,
"loss": 0.7629,
"step": 629
},
{
"epoch": 1.821402747650036,
"grad_norm": 1.0495305061340332,
"learning_rate": 4.725609756097562e-06,
"loss": 0.7609,
"step": 630
},
{
"epoch": 1.824295010845987,
"grad_norm": 0.9548224806785583,
"learning_rate": 4.649390243902439e-06,
"loss": 0.752,
"step": 631
},
{
"epoch": 1.827187274041938,
"grad_norm": 1.0313746929168701,
"learning_rate": 4.573170731707317e-06,
"loss": 0.8528,
"step": 632
},
{
"epoch": 1.8300795372378886,
"grad_norm": 1.0014350414276123,
"learning_rate": 4.496951219512195e-06,
"loss": 0.7587,
"step": 633
},
{
"epoch": 1.8329718004338393,
"grad_norm": 1.069353461265564,
"learning_rate": 4.420731707317073e-06,
"loss": 0.8193,
"step": 634
},
{
"epoch": 1.8358640636297903,
"grad_norm": 1.085693120956421,
"learning_rate": 4.3445121951219515e-06,
"loss": 0.799,
"step": 635
},
{
"epoch": 1.8387563268257412,
"grad_norm": 0.97664475440979,
"learning_rate": 4.26829268292683e-06,
"loss": 0.7018,
"step": 636
},
{
"epoch": 1.8416485900216921,
"grad_norm": 1.0830881595611572,
"learning_rate": 4.192073170731708e-06,
"loss": 0.7851,
"step": 637
},
{
"epoch": 1.8445408532176428,
"grad_norm": 0.9672832489013672,
"learning_rate": 4.115853658536586e-06,
"loss": 0.7542,
"step": 638
},
{
"epoch": 1.8474331164135935,
"grad_norm": 1.0837608575820923,
"learning_rate": 4.039634146341464e-06,
"loss": 0.8329,
"step": 639
},
{
"epoch": 1.8503253796095445,
"grad_norm": 1.0772196054458618,
"learning_rate": 3.9634146341463414e-06,
"loss": 0.7884,
"step": 640
},
{
"epoch": 1.8532176428054954,
"grad_norm": 1.1313399076461792,
"learning_rate": 3.88719512195122e-06,
"loss": 0.7771,
"step": 641
},
{
"epoch": 1.856109906001446,
"grad_norm": 1.0799105167388916,
"learning_rate": 3.8109756097560976e-06,
"loss": 0.8173,
"step": 642
},
{
"epoch": 1.8590021691973968,
"grad_norm": 1.035786509513855,
"learning_rate": 3.7347560975609756e-06,
"loss": 0.7445,
"step": 643
},
{
"epoch": 1.8618944323933477,
"grad_norm": 1.0022109746932983,
"learning_rate": 3.6585365853658537e-06,
"loss": 0.7441,
"step": 644
},
{
"epoch": 1.8647866955892987,
"grad_norm": 1.0012871026992798,
"learning_rate": 3.5823170731707318e-06,
"loss": 0.7731,
"step": 645
},
{
"epoch": 1.8676789587852496,
"grad_norm": 1.0303922891616821,
"learning_rate": 3.5060975609756102e-06,
"loss": 0.7432,
"step": 646
},
{
"epoch": 1.8705712219812003,
"grad_norm": 0.9990852475166321,
"learning_rate": 3.4298780487804883e-06,
"loss": 0.7346,
"step": 647
},
{
"epoch": 1.873463485177151,
"grad_norm": 1.0499917268753052,
"learning_rate": 3.3536585365853664e-06,
"loss": 0.8286,
"step": 648
},
{
"epoch": 1.876355748373102,
"grad_norm": 0.9858948588371277,
"learning_rate": 3.277439024390244e-06,
"loss": 0.7513,
"step": 649
},
{
"epoch": 1.8792480115690529,
"grad_norm": 1.020816445350647,
"learning_rate": 3.201219512195122e-06,
"loss": 0.7283,
"step": 650
},
{
"epoch": 1.8821402747650036,
"grad_norm": 1.0142725706100464,
"learning_rate": 3.125e-06,
"loss": 0.8384,
"step": 651
},
{
"epoch": 1.8850325379609545,
"grad_norm": 1.0734213590621948,
"learning_rate": 3.0487804878048782e-06,
"loss": 0.7657,
"step": 652
},
{
"epoch": 1.8879248011569052,
"grad_norm": 0.9841848611831665,
"learning_rate": 2.9725609756097563e-06,
"loss": 0.7097,
"step": 653
},
{
"epoch": 1.8908170643528561,
"grad_norm": 1.4696120023727417,
"learning_rate": 2.8963414634146343e-06,
"loss": 0.6966,
"step": 654
},
{
"epoch": 1.893709327548807,
"grad_norm": 1.0753856897354126,
"learning_rate": 2.8201219512195124e-06,
"loss": 0.7836,
"step": 655
},
{
"epoch": 1.8966015907447578,
"grad_norm": 1.058305025100708,
"learning_rate": 2.7439024390243905e-06,
"loss": 0.7982,
"step": 656
},
{
"epoch": 1.8994938539407085,
"grad_norm": 1.0660943984985352,
"learning_rate": 2.6676829268292685e-06,
"loss": 0.7404,
"step": 657
},
{
"epoch": 1.9023861171366594,
"grad_norm": 1.0167231559753418,
"learning_rate": 2.591463414634146e-06,
"loss": 0.6959,
"step": 658
},
{
"epoch": 1.9052783803326103,
"grad_norm": 0.9782930016517639,
"learning_rate": 2.5152439024390247e-06,
"loss": 0.7038,
"step": 659
},
{
"epoch": 1.9081706435285612,
"grad_norm": 1.0442514419555664,
"learning_rate": 2.4390243902439027e-06,
"loss": 0.8573,
"step": 660
},
{
"epoch": 1.911062906724512,
"grad_norm": 1.0171256065368652,
"learning_rate": 2.362804878048781e-06,
"loss": 0.7684,
"step": 661
},
{
"epoch": 1.9139551699204627,
"grad_norm": 1.020768165588379,
"learning_rate": 2.2865853658536584e-06,
"loss": 0.8061,
"step": 662
},
{
"epoch": 1.9168474331164136,
"grad_norm": 0.9942306876182556,
"learning_rate": 2.2103658536585365e-06,
"loss": 0.7691,
"step": 663
},
{
"epoch": 1.9197396963123645,
"grad_norm": 0.9986061453819275,
"learning_rate": 2.134146341463415e-06,
"loss": 0.7012,
"step": 664
},
{
"epoch": 1.9226319595083152,
"grad_norm": 1.0474562644958496,
"learning_rate": 2.057926829268293e-06,
"loss": 0.728,
"step": 665
},
{
"epoch": 1.925524222704266,
"grad_norm": 1.0567129850387573,
"learning_rate": 1.9817073170731707e-06,
"loss": 0.7762,
"step": 666
},
{
"epoch": 1.9284164859002169,
"grad_norm": 1.0257785320281982,
"learning_rate": 1.9054878048780488e-06,
"loss": 0.7986,
"step": 667
},
{
"epoch": 1.9313087490961678,
"grad_norm": 0.9999968409538269,
"learning_rate": 1.8292682926829268e-06,
"loss": 0.7539,
"step": 668
},
{
"epoch": 1.9342010122921187,
"grad_norm": 1.082047462463379,
"learning_rate": 1.7530487804878051e-06,
"loss": 0.7971,
"step": 669
},
{
"epoch": 1.9370932754880694,
"grad_norm": 0.994654655456543,
"learning_rate": 1.6768292682926832e-06,
"loss": 0.7363,
"step": 670
},
{
"epoch": 1.9399855386840201,
"grad_norm": 1.0056068897247314,
"learning_rate": 1.600609756097561e-06,
"loss": 0.7643,
"step": 671
},
{
"epoch": 1.942877801879971,
"grad_norm": 1.015271782875061,
"learning_rate": 1.5243902439024391e-06,
"loss": 0.7108,
"step": 672
},
{
"epoch": 1.945770065075922,
"grad_norm": 0.9946292042732239,
"learning_rate": 1.4481707317073172e-06,
"loss": 0.8213,
"step": 673
},
{
"epoch": 1.9486623282718727,
"grad_norm": 0.9914453625679016,
"learning_rate": 1.3719512195121952e-06,
"loss": 0.7917,
"step": 674
},
{
"epoch": 1.9515545914678236,
"grad_norm": 1.062779426574707,
"learning_rate": 1.295731707317073e-06,
"loss": 0.725,
"step": 675
},
{
"epoch": 1.9544468546637743,
"grad_norm": 1.0502513647079468,
"learning_rate": 1.2195121951219514e-06,
"loss": 0.7978,
"step": 676
},
{
"epoch": 1.9573391178597253,
"grad_norm": 1.0494405031204224,
"learning_rate": 1.1432926829268292e-06,
"loss": 0.7927,
"step": 677
},
{
"epoch": 1.9602313810556762,
"grad_norm": 1.054677128791809,
"learning_rate": 1.0670731707317075e-06,
"loss": 0.7595,
"step": 678
},
{
"epoch": 1.9631236442516269,
"grad_norm": 1.0292917490005493,
"learning_rate": 9.908536585365854e-07,
"loss": 0.8302,
"step": 679
},
{
"epoch": 1.9660159074475776,
"grad_norm": 1.1083894968032837,
"learning_rate": 9.146341463414634e-07,
"loss": 0.8153,
"step": 680
},
{
"epoch": 1.9689081706435285,
"grad_norm": 1.086378574371338,
"learning_rate": 8.384146341463416e-07,
"loss": 0.7676,
"step": 681
},
{
"epoch": 1.9718004338394794,
"grad_norm": 1.0098559856414795,
"learning_rate": 7.621951219512196e-07,
"loss": 0.7764,
"step": 682
},
{
"epoch": 1.9746926970354304,
"grad_norm": 1.0091646909713745,
"learning_rate": 6.859756097560976e-07,
"loss": 0.8242,
"step": 683
},
{
"epoch": 1.977584960231381,
"grad_norm": 1.0496336221694946,
"learning_rate": 6.097560975609757e-07,
"loss": 0.7758,
"step": 684
},
{
"epoch": 1.9804772234273318,
"grad_norm": 1.0282728672027588,
"learning_rate": 5.335365853658538e-07,
"loss": 0.7421,
"step": 685
},
{
"epoch": 1.9833694866232827,
"grad_norm": 1.0808695554733276,
"learning_rate": 4.573170731707317e-07,
"loss": 0.7813,
"step": 686
},
{
"epoch": 1.9862617498192336,
"grad_norm": 1.0309821367263794,
"learning_rate": 3.810975609756098e-07,
"loss": 0.7839,
"step": 687
},
{
"epoch": 1.9891540130151844,
"grad_norm": 1.0294197797775269,
"learning_rate": 3.0487804878048784e-07,
"loss": 0.697,
"step": 688
},
{
"epoch": 1.992046276211135,
"grad_norm": 1.0775706768035889,
"learning_rate": 2.2865853658536586e-07,
"loss": 0.7508,
"step": 689
},
{
"epoch": 1.994938539407086,
"grad_norm": 1.0518558025360107,
"learning_rate": 1.5243902439024392e-07,
"loss": 0.7384,
"step": 690
},
{
"epoch": 1.997830802603037,
"grad_norm": 1.0389012098312378,
"learning_rate": 7.621951219512196e-08,
"loss": 0.7942,
"step": 691
}
],
"logging_steps": 1,
"max_steps": 691,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.687294393884475e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}