sft-625-zero / trainer_state.json
iamPi's picture
Add files using upload-large-folder tool
ec45a55 verified
Raw
History Blame Contribute Delete
107 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 625,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0016,
"grad_norm": 36.86253356933594,
"learning_rate": 0.0,
"loss": 6.970664024353027,
"step": 1
},
{
"epoch": 0.0032,
"grad_norm": 46.62815475463867,
"learning_rate": 1e-05,
"loss": 7.119298934936523,
"step": 2
},
{
"epoch": 0.0048,
"grad_norm": 38.75471878051758,
"learning_rate": 2e-05,
"loss": 6.804569721221924,
"step": 3
},
{
"epoch": 0.0064,
"grad_norm": 23.79888343811035,
"learning_rate": 3e-05,
"loss": 6.857824325561523,
"step": 4
},
{
"epoch": 0.008,
"grad_norm": 21.74065589904785,
"learning_rate": 4e-05,
"loss": 6.550006866455078,
"step": 5
},
{
"epoch": 0.0096,
"grad_norm": 18.586503982543945,
"learning_rate": 5e-05,
"loss": 6.686573028564453,
"step": 6
},
{
"epoch": 0.0112,
"grad_norm": 15.070769309997559,
"learning_rate": 6e-05,
"loss": 6.578455924987793,
"step": 7
},
{
"epoch": 0.0128,
"grad_norm": 7.648688316345215,
"learning_rate": 7.000000000000001e-05,
"loss": 6.312182426452637,
"step": 8
},
{
"epoch": 0.0144,
"grad_norm": 11.109657287597656,
"learning_rate": 8e-05,
"loss": 6.330634593963623,
"step": 9
},
{
"epoch": 0.016,
"grad_norm": 11.878482818603516,
"learning_rate": 8.999999999999999e-05,
"loss": 6.246118545532227,
"step": 10
},
{
"epoch": 0.0176,
"grad_norm": 15.020891189575195,
"learning_rate": 0.0001,
"loss": 6.080811500549316,
"step": 11
},
{
"epoch": 0.0192,
"grad_norm": 10.975037574768066,
"learning_rate": 0.00011,
"loss": 6.293004989624023,
"step": 12
},
{
"epoch": 0.0208,
"grad_norm": 5.8413214683532715,
"learning_rate": 0.00012,
"loss": 6.052936553955078,
"step": 13
},
{
"epoch": 0.0224,
"grad_norm": 11.264659881591797,
"learning_rate": 0.00013000000000000002,
"loss": 6.178928852081299,
"step": 14
},
{
"epoch": 0.024,
"grad_norm": 5.662702560424805,
"learning_rate": 0.00014000000000000001,
"loss": 6.111515998840332,
"step": 15
},
{
"epoch": 0.0256,
"grad_norm": 7.999163627624512,
"learning_rate": 0.00015,
"loss": 6.4416985511779785,
"step": 16
},
{
"epoch": 0.0272,
"grad_norm": 4.368480205535889,
"learning_rate": 0.00016,
"loss": 6.070431709289551,
"step": 17
},
{
"epoch": 0.0288,
"grad_norm": 6.195078372955322,
"learning_rate": 0.00017,
"loss": 6.400940895080566,
"step": 18
},
{
"epoch": 0.0304,
"grad_norm": 4.218802452087402,
"learning_rate": 0.00017999999999999998,
"loss": 5.938872337341309,
"step": 19
},
{
"epoch": 0.032,
"grad_norm": 8.09906005859375,
"learning_rate": 0.00019,
"loss": 6.1384053230285645,
"step": 20
},
{
"epoch": 0.0336,
"grad_norm": 5.7899065017700195,
"learning_rate": 0.0002,
"loss": 6.211584568023682,
"step": 21
},
{
"epoch": 0.0352,
"grad_norm": 3.5106873512268066,
"learning_rate": 0.00021,
"loss": 6.081808567047119,
"step": 22
},
{
"epoch": 0.0368,
"grad_norm": 5.990793228149414,
"learning_rate": 0.00022,
"loss": 6.311020374298096,
"step": 23
},
{
"epoch": 0.0384,
"grad_norm": 4.144802570343018,
"learning_rate": 0.00023,
"loss": 6.124863147735596,
"step": 24
},
{
"epoch": 0.04,
"grad_norm": 4.716309547424316,
"learning_rate": 0.00024,
"loss": 6.189701557159424,
"step": 25
},
{
"epoch": 0.0416,
"grad_norm": 3.5594468116760254,
"learning_rate": 0.00025,
"loss": 5.711904048919678,
"step": 26
},
{
"epoch": 0.0432,
"grad_norm": 7.885351657867432,
"learning_rate": 0.00026000000000000003,
"loss": 6.188915729522705,
"step": 27
},
{
"epoch": 0.0448,
"grad_norm": 4.330770492553711,
"learning_rate": 0.00027,
"loss": 6.156501293182373,
"step": 28
},
{
"epoch": 0.0464,
"grad_norm": 6.669336318969727,
"learning_rate": 0.00028000000000000003,
"loss": 6.337223052978516,
"step": 29
},
{
"epoch": 0.048,
"grad_norm": 4.431726932525635,
"learning_rate": 0.00029,
"loss": 5.854226112365723,
"step": 30
},
{
"epoch": 0.0496,
"grad_norm": 5.652588367462158,
"learning_rate": 0.0003,
"loss": 6.1465911865234375,
"step": 31
},
{
"epoch": 0.0512,
"grad_norm": 4.4275360107421875,
"learning_rate": 0.00031,
"loss": 6.568665504455566,
"step": 32
},
{
"epoch": 0.0528,
"grad_norm": 5.78800106048584,
"learning_rate": 0.00032,
"loss": 5.84707498550415,
"step": 33
},
{
"epoch": 0.0544,
"grad_norm": 5.778809547424316,
"learning_rate": 0.00033,
"loss": 6.26806640625,
"step": 34
},
{
"epoch": 0.056,
"grad_norm": 3.150599718093872,
"learning_rate": 0.00034,
"loss": 5.942642688751221,
"step": 35
},
{
"epoch": 0.0576,
"grad_norm": 5.755363464355469,
"learning_rate": 0.00035,
"loss": 6.048552989959717,
"step": 36
},
{
"epoch": 0.0592,
"grad_norm": 4.171663284301758,
"learning_rate": 0.00035999999999999997,
"loss": 6.371613025665283,
"step": 37
},
{
"epoch": 0.0608,
"grad_norm": 4.288946628570557,
"learning_rate": 0.00037,
"loss": 6.1250200271606445,
"step": 38
},
{
"epoch": 0.0624,
"grad_norm": 4.6287713050842285,
"learning_rate": 0.00038,
"loss": 6.222686767578125,
"step": 39
},
{
"epoch": 0.064,
"grad_norm": 5.058150291442871,
"learning_rate": 0.00039000000000000005,
"loss": 6.543748378753662,
"step": 40
},
{
"epoch": 0.0656,
"grad_norm": 4.104369640350342,
"learning_rate": 0.0004,
"loss": 6.065921783447266,
"step": 41
},
{
"epoch": 0.0672,
"grad_norm": 6.011862754821777,
"learning_rate": 0.00041,
"loss": 5.975309371948242,
"step": 42
},
{
"epoch": 0.0688,
"grad_norm": 3.899702787399292,
"learning_rate": 0.00042,
"loss": 6.357814788818359,
"step": 43
},
{
"epoch": 0.0704,
"grad_norm": 4.300708770751953,
"learning_rate": 0.00043,
"loss": 5.761978626251221,
"step": 44
},
{
"epoch": 0.072,
"grad_norm": 5.165529727935791,
"learning_rate": 0.00044,
"loss": 6.23648738861084,
"step": 45
},
{
"epoch": 0.0736,
"grad_norm": 3.270381212234497,
"learning_rate": 0.00045000000000000004,
"loss": 6.216146469116211,
"step": 46
},
{
"epoch": 0.0752,
"grad_norm": 3.381625175476074,
"learning_rate": 0.00046,
"loss": 5.920130729675293,
"step": 47
},
{
"epoch": 0.0768,
"grad_norm": 2.7397425174713135,
"learning_rate": 0.00047,
"loss": 5.948547840118408,
"step": 48
},
{
"epoch": 0.0784,
"grad_norm": 4.689820289611816,
"learning_rate": 0.00048,
"loss": 6.4204936027526855,
"step": 49
},
{
"epoch": 0.08,
"grad_norm": 4.353704929351807,
"learning_rate": 0.00049,
"loss": 5.919530391693115,
"step": 50
},
{
"epoch": 0.0816,
"grad_norm": 3.5159783363342285,
"learning_rate": 0.0005,
"loss": 6.303610324859619,
"step": 51
},
{
"epoch": 0.0832,
"grad_norm": 3.121208429336548,
"learning_rate": 0.000499996268589849,
"loss": 5.74945068359375,
"step": 52
},
{
"epoch": 0.0848,
"grad_norm": 3.5212831497192383,
"learning_rate": 0.0004999850744707835,
"loss": 6.07124662399292,
"step": 53
},
{
"epoch": 0.0864,
"grad_norm": 2.848412275314331,
"learning_rate": 0.0004999664179769621,
"loss": 6.209238052368164,
"step": 54
},
{
"epoch": 0.088,
"grad_norm": 2.6709463596343994,
"learning_rate": 0.0004999402996653051,
"loss": 5.881043910980225,
"step": 55
},
{
"epoch": 0.0896,
"grad_norm": 2.7929718494415283,
"learning_rate": 0.0004999067203154777,
"loss": 6.170549392700195,
"step": 56
},
{
"epoch": 0.0912,
"grad_norm": 2.7909319400787354,
"learning_rate": 0.0004998656809298664,
"loss": 5.91437292098999,
"step": 57
},
{
"epoch": 0.0928,
"grad_norm": 3.028071880340576,
"learning_rate": 0.0004998171827335494,
"loss": 5.768723964691162,
"step": 58
},
{
"epoch": 0.0944,
"grad_norm": 3.5717194080352783,
"learning_rate": 0.0004997612271742601,
"loss": 6.126382827758789,
"step": 59
},
{
"epoch": 0.096,
"grad_norm": 2.5707123279571533,
"learning_rate": 0.0004996978159223436,
"loss": 6.031285285949707,
"step": 60
},
{
"epoch": 0.0976,
"grad_norm": 2.886106252670288,
"learning_rate": 0.000499626950870707,
"loss": 5.81216287612915,
"step": 61
},
{
"epoch": 0.0992,
"grad_norm": 3.2320756912231445,
"learning_rate": 0.000499548634134763,
"loss": 6.256302833557129,
"step": 62
},
{
"epoch": 0.1008,
"grad_norm": 2.3101658821105957,
"learning_rate": 0.0004994628680523662,
"loss": 6.089540481567383,
"step": 63
},
{
"epoch": 0.1024,
"grad_norm": 2.2067813873291016,
"learning_rate": 0.0004993696551837443,
"loss": 6.167810440063477,
"step": 64
},
{
"epoch": 0.104,
"grad_norm": 2.936598300933838,
"learning_rate": 0.0004992689983114208,
"loss": 6.019635200500488,
"step": 65
},
{
"epoch": 0.1056,
"grad_norm": 3.3017938137054443,
"learning_rate": 0.0004991609004401324,
"loss": 5.883628845214844,
"step": 66
},
{
"epoch": 0.1072,
"grad_norm": 3.359445333480835,
"learning_rate": 0.0004990453647967389,
"loss": 5.827721118927002,
"step": 67
},
{
"epoch": 0.1088,
"grad_norm": 3.057800769805908,
"learning_rate": 0.0004989223948301272,
"loss": 5.853091239929199,
"step": 68
},
{
"epoch": 0.1104,
"grad_norm": 3.4538474082946777,
"learning_rate": 0.0004987919942111087,
"loss": 6.159923553466797,
"step": 69
},
{
"epoch": 0.112,
"grad_norm": 2.778003692626953,
"learning_rate": 0.0004986541668323086,
"loss": 5.855865478515625,
"step": 70
},
{
"epoch": 0.1136,
"grad_norm": 2.497781753540039,
"learning_rate": 0.0004985089168080509,
"loss": 6.018093109130859,
"step": 71
},
{
"epoch": 0.1152,
"grad_norm": 2.0816121101379395,
"learning_rate": 0.0004983562484742349,
"loss": 6.006240367889404,
"step": 72
},
{
"epoch": 0.1168,
"grad_norm": 2.8136582374572754,
"learning_rate": 0.000498196166388206,
"loss": 5.550631999969482,
"step": 73
},
{
"epoch": 0.1184,
"grad_norm": 2.223203420639038,
"learning_rate": 0.0004980286753286195,
"loss": 5.823319911956787,
"step": 74
},
{
"epoch": 0.12,
"grad_norm": 2.3398818969726562,
"learning_rate": 0.0004978537802952981,
"loss": 5.757394790649414,
"step": 75
},
{
"epoch": 0.1216,
"grad_norm": 3.7000091075897217,
"learning_rate": 0.0004976714865090827,
"loss": 6.139785289764404,
"step": 76
},
{
"epoch": 0.1232,
"grad_norm": 2.992990255355835,
"learning_rate": 0.0004974817994116764,
"loss": 5.841603755950928,
"step": 77
},
{
"epoch": 0.1248,
"grad_norm": 4.935225963592529,
"learning_rate": 0.0004972847246654819,
"loss": 5.688216209411621,
"step": 78
},
{
"epoch": 0.1264,
"grad_norm": 2.531768798828125,
"learning_rate": 0.0004970802681534331,
"loss": 6.026415824890137,
"step": 79
},
{
"epoch": 0.128,
"grad_norm": 3.366121530532837,
"learning_rate": 0.0004968684359788187,
"loss": 6.1217217445373535,
"step": 80
},
{
"epoch": 0.1296,
"grad_norm": 2.439563035964966,
"learning_rate": 0.0004966492344651005,
"loss": 5.786462783813477,
"step": 81
},
{
"epoch": 0.1312,
"grad_norm": 2.759390115737915,
"learning_rate": 0.0004964226701557246,
"loss": 6.397160053253174,
"step": 82
},
{
"epoch": 0.1328,
"grad_norm": 2.6187775135040283,
"learning_rate": 0.000496188749813926,
"loss": 5.781584739685059,
"step": 83
},
{
"epoch": 0.1344,
"grad_norm": 2.3311808109283447,
"learning_rate": 0.0004959474804225263,
"loss": 5.623251914978027,
"step": 84
},
{
"epoch": 0.136,
"grad_norm": 1.8278515338897705,
"learning_rate": 0.0004956988691837262,
"loss": 5.646507263183594,
"step": 85
},
{
"epoch": 0.1376,
"grad_norm": 1.940083622932434,
"learning_rate": 0.0004954429235188896,
"loss": 5.845520496368408,
"step": 86
},
{
"epoch": 0.1392,
"grad_norm": 1.715268611907959,
"learning_rate": 0.0004951796510683226,
"loss": 5.86661434173584,
"step": 87
},
{
"epoch": 0.1408,
"grad_norm": 2.3065476417541504,
"learning_rate": 0.0004949090596910452,
"loss": 6.391292572021484,
"step": 88
},
{
"epoch": 0.1424,
"grad_norm": 2.54691481590271,
"learning_rate": 0.0004946311574645565,
"loss": 5.941152572631836,
"step": 89
},
{
"epoch": 0.144,
"grad_norm": 2.3436925411224365,
"learning_rate": 0.0004943459526845942,
"loss": 5.867047309875488,
"step": 90
},
{
"epoch": 0.1456,
"grad_norm": 2.8488574028015137,
"learning_rate": 0.0004940534538648862,
"loss": 6.295483112335205,
"step": 91
},
{
"epoch": 0.1472,
"grad_norm": 2.0991811752319336,
"learning_rate": 0.0004937536697368971,
"loss": 6.155615329742432,
"step": 92
},
{
"epoch": 0.1488,
"grad_norm": 2.874187707901001,
"learning_rate": 0.0004934466092495673,
"loss": 6.002193450927734,
"step": 93
},
{
"epoch": 0.1504,
"grad_norm": 2.6309406757354736,
"learning_rate": 0.0004931322815690456,
"loss": 6.190125942230225,
"step": 94
},
{
"epoch": 0.152,
"grad_norm": 2.5140063762664795,
"learning_rate": 0.0004928106960784163,
"loss": 5.832353591918945,
"step": 95
},
{
"epoch": 0.1536,
"grad_norm": 2.2540531158447266,
"learning_rate": 0.0004924818623774179,
"loss": 5.870430946350098,
"step": 96
},
{
"epoch": 0.1552,
"grad_norm": 2.5736892223358154,
"learning_rate": 0.0004921457902821578,
"loss": 5.9354658126831055,
"step": 97
},
{
"epoch": 0.1568,
"grad_norm": 2.8597569465637207,
"learning_rate": 0.0004918024898248188,
"loss": 5.980432987213135,
"step": 98
},
{
"epoch": 0.1584,
"grad_norm": 2.679422616958618,
"learning_rate": 0.0004914519712533592,
"loss": 5.808017253875732,
"step": 99
},
{
"epoch": 0.16,
"grad_norm": 2.6200029850006104,
"learning_rate": 0.0004910942450312075,
"loss": 6.042236804962158,
"step": 100
},
{
"epoch": 0.1616,
"grad_norm": 2.3748672008514404,
"learning_rate": 0.0004907293218369499,
"loss": 5.913302421569824,
"step": 101
},
{
"epoch": 0.1632,
"grad_norm": 2.0950937271118164,
"learning_rate": 0.000490357212564011,
"loss": 5.478336334228516,
"step": 102
},
{
"epoch": 0.1648,
"grad_norm": 2.222339391708374,
"learning_rate": 0.0004899779283203296,
"loss": 5.753122329711914,
"step": 103
},
{
"epoch": 0.1664,
"grad_norm": 1.8135013580322266,
"learning_rate": 0.0004895914804280262,
"loss": 5.8378705978393555,
"step": 104
},
{
"epoch": 0.168,
"grad_norm": 1.834136962890625,
"learning_rate": 0.0004891978804230655,
"loss": 5.386728286743164,
"step": 105
},
{
"epoch": 0.1696,
"grad_norm": 2.7069461345672607,
"learning_rate": 0.000488797140054912,
"loss": 5.91385555267334,
"step": 106
},
{
"epoch": 0.1712,
"grad_norm": 2.961819648742676,
"learning_rate": 0.0004883892712861791,
"loss": 5.622028350830078,
"step": 107
},
{
"epoch": 0.1728,
"grad_norm": 2.6172969341278076,
"learning_rate": 0.0004879742862922721,
"loss": 5.701954364776611,
"step": 108
},
{
"epoch": 0.1744,
"grad_norm": 2.4764273166656494,
"learning_rate": 0.0004875521974610247,
"loss": 5.922611236572266,
"step": 109
},
{
"epoch": 0.176,
"grad_norm": 2.321749448776245,
"learning_rate": 0.00048712301739232933,
"loss": 5.958606719970703,
"step": 110
},
{
"epoch": 0.1776,
"grad_norm": 2.569371461868286,
"learning_rate": 0.00048668675889776094,
"loss": 5.966418266296387,
"step": 111
},
{
"epoch": 0.1792,
"grad_norm": 2.0367257595062256,
"learning_rate": 0.00048624343500019453,
"loss": 5.828032970428467,
"step": 112
},
{
"epoch": 0.1808,
"grad_norm": 2.0033013820648193,
"learning_rate": 0.0004857930589334164,
"loss": 5.9207658767700195,
"step": 113
},
{
"epoch": 0.1824,
"grad_norm": 2.4433813095092773,
"learning_rate": 0.00048533564414172915,
"loss": 5.987303256988525,
"step": 114
},
{
"epoch": 0.184,
"grad_norm": 1.6759791374206543,
"learning_rate": 0.00048487120427955047,
"loss": 5.758200168609619,
"step": 115
},
{
"epoch": 0.1856,
"grad_norm": 1.9562362432479858,
"learning_rate": 0.0004843997532110051,
"loss": 6.076003074645996,
"step": 116
},
{
"epoch": 0.1872,
"grad_norm": 1.9110207557678223,
"learning_rate": 0.0004839213050095116,
"loss": 5.927783966064453,
"step": 117
},
{
"epoch": 0.1888,
"grad_norm": 1.9068591594696045,
"learning_rate": 0.00048343587395736177,
"loss": 5.609103202819824,
"step": 118
},
{
"epoch": 0.1904,
"grad_norm": 1.801079273223877,
"learning_rate": 0.0004829434745452944,
"loss": 6.146678924560547,
"step": 119
},
{
"epoch": 0.192,
"grad_norm": 2.160980224609375,
"learning_rate": 0.00048244412147206283,
"loss": 5.927748203277588,
"step": 120
},
{
"epoch": 0.1936,
"grad_norm": 2.2416179180145264,
"learning_rate": 0.0004819378296439961,
"loss": 5.8890509605407715,
"step": 121
},
{
"epoch": 0.1952,
"grad_norm": 2.4966790676116943,
"learning_rate": 0.000481424614174554,
"loss": 5.660029411315918,
"step": 122
},
{
"epoch": 0.1968,
"grad_norm": 2.2897145748138428,
"learning_rate": 0.00048090449038387564,
"loss": 5.889649391174316,
"step": 123
},
{
"epoch": 0.1984,
"grad_norm": 2.1526010036468506,
"learning_rate": 0.00048037747379832266,
"loss": 5.937025547027588,
"step": 124
},
{
"epoch": 0.2,
"grad_norm": 2.3477089405059814,
"learning_rate": 0.0004798435801500154,
"loss": 5.83440637588501,
"step": 125
},
{
"epoch": 0.2016,
"grad_norm": 1.9011043310165405,
"learning_rate": 0.00047930282537636326,
"loss": 6.049851417541504,
"step": 126
},
{
"epoch": 0.2032,
"grad_norm": 2.7886276245117188,
"learning_rate": 0.00047875522561958907,
"loss": 6.053065299987793,
"step": 127
},
{
"epoch": 0.2048,
"grad_norm": 1.8351131677627563,
"learning_rate": 0.0004782007972262471,
"loss": 5.606479644775391,
"step": 128
},
{
"epoch": 0.2064,
"grad_norm": 1.759033441543579,
"learning_rate": 0.0004776395567467353,
"loss": 5.892756462097168,
"step": 129
},
{
"epoch": 0.208,
"grad_norm": 1.9948967695236206,
"learning_rate": 0.00047707152093480097,
"loss": 5.802677631378174,
"step": 130
},
{
"epoch": 0.2096,
"grad_norm": 1.7873433828353882,
"learning_rate": 0.0004764967067470409,
"loss": 5.694087505340576,
"step": 131
},
{
"epoch": 0.2112,
"grad_norm": 2.129274606704712,
"learning_rate": 0.00047591513134239506,
"loss": 6.053646087646484,
"step": 132
},
{
"epoch": 0.2128,
"grad_norm": 1.815743327140808,
"learning_rate": 0.0004753268120816344,
"loss": 5.840423107147217,
"step": 133
},
{
"epoch": 0.2144,
"grad_norm": 1.6211766004562378,
"learning_rate": 0.0004747317665268427,
"loss": 5.866158962249756,
"step": 134
},
{
"epoch": 0.216,
"grad_norm": 1.5764577388763428,
"learning_rate": 0.000474130012440892,
"loss": 5.642172813415527,
"step": 135
},
{
"epoch": 0.2176,
"grad_norm": 1.6282553672790527,
"learning_rate": 0.0004735215677869128,
"loss": 5.813696384429932,
"step": 136
},
{
"epoch": 0.2192,
"grad_norm": 1.587697148323059,
"learning_rate": 0.0004729064507277576,
"loss": 5.456190586090088,
"step": 137
},
{
"epoch": 0.2208,
"grad_norm": 2.2339489459991455,
"learning_rate": 0.0004722846796254586,
"loss": 5.826436996459961,
"step": 138
},
{
"epoch": 0.2224,
"grad_norm": 1.6775805950164795,
"learning_rate": 0.00047165627304068,
"loss": 5.307504653930664,
"step": 139
},
{
"epoch": 0.224,
"grad_norm": 1.7358742952346802,
"learning_rate": 0.0004710212497321633,
"loss": 5.858373641967773,
"step": 140
},
{
"epoch": 0.2256,
"grad_norm": 1.7377792596817017,
"learning_rate": 0.0004703796286561679,
"loss": 5.746421813964844,
"step": 141
},
{
"epoch": 0.2272,
"grad_norm": 1.7279226779937744,
"learning_rate": 0.00046973142896590504,
"loss": 5.818030834197998,
"step": 142
},
{
"epoch": 0.2288,
"grad_norm": 1.896462321281433,
"learning_rate": 0.0004690766700109659,
"loss": 5.706021308898926,
"step": 143
},
{
"epoch": 0.2304,
"grad_norm": 1.599483609199524,
"learning_rate": 0.00046841537133674414,
"loss": 5.414737701416016,
"step": 144
},
{
"epoch": 0.232,
"grad_norm": 2.0782713890075684,
"learning_rate": 0.00046774755268385253,
"loss": 6.040131092071533,
"step": 145
},
{
"epoch": 0.2336,
"grad_norm": 1.5299904346466064,
"learning_rate": 0.00046707323398753343,
"loss": 5.940986633300781,
"step": 146
},
{
"epoch": 0.2352,
"grad_norm": 1.7263022661209106,
"learning_rate": 0.00046639243537706387,
"loss": 5.658965587615967,
"step": 147
},
{
"epoch": 0.2368,
"grad_norm": 1.9568145275115967,
"learning_rate": 0.0004657051771751546,
"loss": 5.630545139312744,
"step": 148
},
{
"epoch": 0.2384,
"grad_norm": 1.7731075286865234,
"learning_rate": 0.0004650114798973434,
"loss": 5.288701057434082,
"step": 149
},
{
"epoch": 0.24,
"grad_norm": 1.5925266742706299,
"learning_rate": 0.000464311364251383,
"loss": 5.936962127685547,
"step": 150
},
{
"epoch": 0.2416,
"grad_norm": 1.6020593643188477,
"learning_rate": 0.0004636048511366222,
"loss": 5.519335746765137,
"step": 151
},
{
"epoch": 0.2432,
"grad_norm": 1.5809364318847656,
"learning_rate": 0.0004628919616433827,
"loss": 5.557144641876221,
"step": 152
},
{
"epoch": 0.2448,
"grad_norm": 1.8422110080718994,
"learning_rate": 0.0004621727170523293,
"loss": 5.852574348449707,
"step": 153
},
{
"epoch": 0.2464,
"grad_norm": 1.6175079345703125,
"learning_rate": 0.0004614471388338346,
"loss": 5.70945405960083,
"step": 154
},
{
"epoch": 0.248,
"grad_norm": 1.7624582052230835,
"learning_rate": 0.00046071524864733796,
"loss": 5.58186149597168,
"step": 155
},
{
"epoch": 0.2496,
"grad_norm": 1.5593520402908325,
"learning_rate": 0.0004599770683406991,
"loss": 5.716488361358643,
"step": 156
},
{
"epoch": 0.2512,
"grad_norm": 1.9119805097579956,
"learning_rate": 0.0004592326199495461,
"loss": 5.6072845458984375,
"step": 157
},
{
"epoch": 0.2528,
"grad_norm": 1.7177708148956299,
"learning_rate": 0.0004584819256966171,
"loss": 5.845829010009766,
"step": 158
},
{
"epoch": 0.2544,
"grad_norm": 2.197434663772583,
"learning_rate": 0.0004577250079910973,
"loss": 5.7057013511657715,
"step": 159
},
{
"epoch": 0.256,
"grad_norm": 2.089193344116211,
"learning_rate": 0.00045696188942795005,
"loss": 5.745038986206055,
"step": 160
},
{
"epoch": 0.2576,
"grad_norm": 2.2623579502105713,
"learning_rate": 0.0004561925927872421,
"loss": 5.437371253967285,
"step": 161
},
{
"epoch": 0.2592,
"grad_norm": 1.5014855861663818,
"learning_rate": 0.000455417141033464,
"loss": 5.617335796356201,
"step": 162
},
{
"epoch": 0.2608,
"grad_norm": 1.6091152429580688,
"learning_rate": 0.00045463555731484396,
"loss": 5.750364303588867,
"step": 163
},
{
"epoch": 0.2624,
"grad_norm": 1.7927204370498657,
"learning_rate": 0.0004538478649626574,
"loss": 6.134846210479736,
"step": 164
},
{
"epoch": 0.264,
"grad_norm": 1.5488578081130981,
"learning_rate": 0.00045305408749053016,
"loss": 5.881228923797607,
"step": 165
},
{
"epoch": 0.2656,
"grad_norm": 1.6964894533157349,
"learning_rate": 0.0004522542485937369,
"loss": 5.726894855499268,
"step": 166
},
{
"epoch": 0.2672,
"grad_norm": 1.640055775642395,
"learning_rate": 0.0004514483721484933,
"loss": 5.594513893127441,
"step": 167
},
{
"epoch": 0.2688,
"grad_norm": 1.622751235961914,
"learning_rate": 0.0004506364822112439,
"loss": 5.518566131591797,
"step": 168
},
{
"epoch": 0.2704,
"grad_norm": 1.5396101474761963,
"learning_rate": 0.00044981860301794335,
"loss": 5.589843273162842,
"step": 169
},
{
"epoch": 0.272,
"grad_norm": 1.4792349338531494,
"learning_rate": 0.0004489947589833336,
"loss": 5.4407501220703125,
"step": 170
},
{
"epoch": 0.2736,
"grad_norm": 1.678307056427002,
"learning_rate": 0.00044816497470021456,
"loss": 5.557910919189453,
"step": 171
},
{
"epoch": 0.2752,
"grad_norm": 1.7133512496948242,
"learning_rate": 0.0004473292749387102,
"loss": 5.618350982666016,
"step": 172
},
{
"epoch": 0.2768,
"grad_norm": 1.4833654165267944,
"learning_rate": 0.00044648768464552904,
"loss": 5.650544166564941,
"step": 173
},
{
"epoch": 0.2784,
"grad_norm": 1.787833571434021,
"learning_rate": 0.00044564022894321966,
"loss": 5.516573429107666,
"step": 174
},
{
"epoch": 0.28,
"grad_norm": 2.016937255859375,
"learning_rate": 0.00044478693312942054,
"loss": 5.867213249206543,
"step": 175
},
{
"epoch": 0.2816,
"grad_norm": 1.6533347368240356,
"learning_rate": 0.00044392782267610497,
"loss": 5.728193283081055,
"step": 176
},
{
"epoch": 0.2832,
"grad_norm": 1.545316457748413,
"learning_rate": 0.00044306292322882063,
"loss": 5.591842174530029,
"step": 177
},
{
"epoch": 0.2848,
"grad_norm": 1.8199504613876343,
"learning_rate": 0.00044219226060592415,
"loss": 5.673701763153076,
"step": 178
},
{
"epoch": 0.2864,
"grad_norm": 1.597760558128357,
"learning_rate": 0.0004413158607978104,
"loss": 5.541760444641113,
"step": 179
},
{
"epoch": 0.288,
"grad_norm": 1.8495144844055176,
"learning_rate": 0.0004404337499661364,
"loss": 5.602829456329346,
"step": 180
},
{
"epoch": 0.2896,
"grad_norm": 1.8567280769348145,
"learning_rate": 0.00043954595444304067,
"loss": 5.71918249130249,
"step": 181
},
{
"epoch": 0.2912,
"grad_norm": 1.8808255195617676,
"learning_rate": 0.0004386525007303571,
"loss": 5.545975208282471,
"step": 182
},
{
"epoch": 0.2928,
"grad_norm": 1.7914137840270996,
"learning_rate": 0.00043775341549882364,
"loss": 5.760030269622803,
"step": 183
},
{
"epoch": 0.2944,
"grad_norm": 1.5386247634887695,
"learning_rate": 0.00043684872558728637,
"loss": 5.41167688369751,
"step": 184
},
{
"epoch": 0.296,
"grad_norm": 1.7406638860702515,
"learning_rate": 0.00043593845800189826,
"loss": 5.6405463218688965,
"step": 185
},
{
"epoch": 0.2976,
"grad_norm": 1.7136033773422241,
"learning_rate": 0.000435022639915313,
"loss": 5.921665191650391,
"step": 186
},
{
"epoch": 0.2992,
"grad_norm": 1.6137181520462036,
"learning_rate": 0.00043410129866587377,
"loss": 5.523682117462158,
"step": 187
},
{
"epoch": 0.3008,
"grad_norm": 1.4593943357467651,
"learning_rate": 0.00043317446175679733,
"loss": 5.579282283782959,
"step": 188
},
{
"epoch": 0.3024,
"grad_norm": 1.498769760131836,
"learning_rate": 0.00043224215685535287,
"loss": 5.65568733215332,
"step": 189
},
{
"epoch": 0.304,
"grad_norm": 1.4099656343460083,
"learning_rate": 0.00043130441179203626,
"loss": 5.450364589691162,
"step": 190
},
{
"epoch": 0.3056,
"grad_norm": 1.762242317199707,
"learning_rate": 0.00043036125455973894,
"loss": 5.701364517211914,
"step": 191
},
{
"epoch": 0.3072,
"grad_norm": 1.9644355773925781,
"learning_rate": 0.00042941271331291275,
"loss": 5.515183448791504,
"step": 192
},
{
"epoch": 0.3088,
"grad_norm": 1.9126542806625366,
"learning_rate": 0.0004284588163667292,
"loss": 5.794773578643799,
"step": 193
},
{
"epoch": 0.3104,
"grad_norm": 1.8638148307800293,
"learning_rate": 0.0004274995921962343,
"loss": 5.806097030639648,
"step": 194
},
{
"epoch": 0.312,
"grad_norm": 1.701051115989685,
"learning_rate": 0.00042653506943549844,
"loss": 5.101565361022949,
"step": 195
},
{
"epoch": 0.3136,
"grad_norm": 2.270686626434326,
"learning_rate": 0.00042556527687676184,
"loss": 5.6310319900512695,
"step": 196
},
{
"epoch": 0.3152,
"grad_norm": 1.8609226942062378,
"learning_rate": 0.00042459024346957477,
"loss": 5.535915851593018,
"step": 197
},
{
"epoch": 0.3168,
"grad_norm": 2.0503954887390137,
"learning_rate": 0.0004236099983199338,
"loss": 5.734372138977051,
"step": 198
},
{
"epoch": 0.3184,
"grad_norm": 1.6068768501281738,
"learning_rate": 0.00042262457068941247,
"loss": 5.578657150268555,
"step": 199
},
{
"epoch": 0.32,
"grad_norm": 1.602341651916504,
"learning_rate": 0.000421633989994288,
"loss": 5.451129913330078,
"step": 200
},
{
"epoch": 0.3216,
"grad_norm": 1.4740185737609863,
"learning_rate": 0.00042063828580466355,
"loss": 5.597467422485352,
"step": 201
},
{
"epoch": 0.3232,
"grad_norm": 1.6884571313858032,
"learning_rate": 0.0004196374878435846,
"loss": 5.773179054260254,
"step": 202
},
{
"epoch": 0.3248,
"grad_norm": 3.2064454555511475,
"learning_rate": 0.00041863162598615265,
"loss": 5.903354167938232,
"step": 203
},
{
"epoch": 0.3264,
"grad_norm": 2.3717195987701416,
"learning_rate": 0.0004176207302586329,
"loss": 5.43741512298584,
"step": 204
},
{
"epoch": 0.328,
"grad_norm": 1.7029227018356323,
"learning_rate": 0.0004166048308375578,
"loss": 5.542079925537109,
"step": 205
},
{
"epoch": 0.3296,
"grad_norm": 1.4132956266403198,
"learning_rate": 0.0004155839580488269,
"loss": 5.548293590545654,
"step": 206
},
{
"epoch": 0.3312,
"grad_norm": 1.7507219314575195,
"learning_rate": 0.0004145581423668008,
"loss": 5.625497817993164,
"step": 207
},
{
"epoch": 0.3328,
"grad_norm": 1.7790549993515015,
"learning_rate": 0.00041352741441339175,
"loss": 5.523196220397949,
"step": 208
},
{
"epoch": 0.3344,
"grad_norm": 1.6135910749435425,
"learning_rate": 0.0004124918049571499,
"loss": 5.497952461242676,
"step": 209
},
{
"epoch": 0.336,
"grad_norm": 1.700406789779663,
"learning_rate": 0.00041145134491234425,
"loss": 5.513679027557373,
"step": 210
},
{
"epoch": 0.3376,
"grad_norm": 1.5768215656280518,
"learning_rate": 0.00041040606533804025,
"loss": 5.65580940246582,
"step": 211
},
{
"epoch": 0.3392,
"grad_norm": 1.5992205142974854,
"learning_rate": 0.00040935599743717243,
"loss": 5.415986061096191,
"step": 212
},
{
"epoch": 0.3408,
"grad_norm": 2.1629347801208496,
"learning_rate": 0.00040830117255561294,
"loss": 5.394900321960449,
"step": 213
},
{
"epoch": 0.3424,
"grad_norm": 1.5803372859954834,
"learning_rate": 0.000407241622181236,
"loss": 5.085600852966309,
"step": 214
},
{
"epoch": 0.344,
"grad_norm": 1.4815354347229004,
"learning_rate": 0.0004061773779429776,
"loss": 5.647576332092285,
"step": 215
},
{
"epoch": 0.3456,
"grad_norm": 1.5663725137710571,
"learning_rate": 0.00040510847160989203,
"loss": 5.418036460876465,
"step": 216
},
{
"epoch": 0.3472,
"grad_norm": 1.7371917963027954,
"learning_rate": 0.00040403493509020275,
"loss": 5.280213356018066,
"step": 217
},
{
"epoch": 0.3488,
"grad_norm": 1.4984663724899292,
"learning_rate": 0.0004029568004303501,
"loss": 5.509110927581787,
"step": 218
},
{
"epoch": 0.3504,
"grad_norm": 1.5602787733078003,
"learning_rate": 0.0004018740998140352,
"loss": 5.608109951019287,
"step": 219
},
{
"epoch": 0.352,
"grad_norm": 1.6253869533538818,
"learning_rate": 0.0004007868655612586,
"loss": 5.198980331420898,
"step": 220
},
{
"epoch": 0.3536,
"grad_norm": 2.013225555419922,
"learning_rate": 0.00039969513012735566,
"loss": 5.129229545593262,
"step": 221
},
{
"epoch": 0.3552,
"grad_norm": 1.4294469356536865,
"learning_rate": 0.00039859892610202786,
"loss": 5.616961479187012,
"step": 222
},
{
"epoch": 0.3568,
"grad_norm": 1.7147184610366821,
"learning_rate": 0.0003974982862083697,
"loss": 5.369600772857666,
"step": 223
},
{
"epoch": 0.3584,
"grad_norm": 1.6554255485534668,
"learning_rate": 0.00039639324330189234,
"loss": 5.445437431335449,
"step": 224
},
{
"epoch": 0.36,
"grad_norm": 2.799031972885132,
"learning_rate": 0.00039528383036954224,
"loss": 5.5256500244140625,
"step": 225
},
{
"epoch": 0.3616,
"grad_norm": 1.364023208618164,
"learning_rate": 0.00039417008052871684,
"loss": 5.256645202636719,
"step": 226
},
{
"epoch": 0.3632,
"grad_norm": 1.6340276002883911,
"learning_rate": 0.0003930520270262757,
"loss": 5.542902946472168,
"step": 227
},
{
"epoch": 0.3648,
"grad_norm": 1.289225459098816,
"learning_rate": 0.0003919297032375485,
"loss": 5.363834381103516,
"step": 228
},
{
"epoch": 0.3664,
"grad_norm": 1.7022228240966797,
"learning_rate": 0.00039080314266533826,
"loss": 5.533950328826904,
"step": 229
},
{
"epoch": 0.368,
"grad_norm": 1.5650995969772339,
"learning_rate": 0.00038967237893892134,
"loss": 5.173304557800293,
"step": 230
},
{
"epoch": 0.3696,
"grad_norm": 1.7082035541534424,
"learning_rate": 0.00038853744581304376,
"loss": 5.347742080688477,
"step": 231
},
{
"epoch": 0.3712,
"grad_norm": 1.5300484895706177,
"learning_rate": 0.00038739837716691327,
"loss": 5.307585716247559,
"step": 232
},
{
"epoch": 0.3728,
"grad_norm": 1.4221162796020508,
"learning_rate": 0.0003862552070031886,
"loss": 5.390194892883301,
"step": 233
},
{
"epoch": 0.3744,
"grad_norm": 1.5934863090515137,
"learning_rate": 0.00038510796944696355,
"loss": 5.698745250701904,
"step": 234
},
{
"epoch": 0.376,
"grad_norm": 1.574376106262207,
"learning_rate": 0.00038395669874474915,
"loss": 5.695178508758545,
"step": 235
},
{
"epoch": 0.3776,
"grad_norm": 1.4545917510986328,
"learning_rate": 0.00038280142926345084,
"loss": 5.21755313873291,
"step": 236
},
{
"epoch": 0.3792,
"grad_norm": 1.6824661493301392,
"learning_rate": 0.0003816421954893428,
"loss": 5.816608428955078,
"step": 237
},
{
"epoch": 0.3808,
"grad_norm": 1.943800449371338,
"learning_rate": 0.0003804790320270384,
"loss": 5.530592441558838,
"step": 238
},
{
"epoch": 0.3824,
"grad_norm": 1.4291504621505737,
"learning_rate": 0.00037931197359845713,
"loss": 5.4604811668396,
"step": 239
},
{
"epoch": 0.384,
"grad_norm": 1.450872778892517,
"learning_rate": 0.00037814105504178853,
"loss": 5.420169353485107,
"step": 240
},
{
"epoch": 0.3856,
"grad_norm": 1.431982159614563,
"learning_rate": 0.00037696631131045155,
"loss": 5.437797546386719,
"step": 241
},
{
"epoch": 0.3872,
"grad_norm": 1.5654010772705078,
"learning_rate": 0.00037578777747205173,
"loss": 5.542431354522705,
"step": 242
},
{
"epoch": 0.3888,
"grad_norm": 1.4680758714675903,
"learning_rate": 0.000374605488707334,
"loss": 5.8609299659729,
"step": 243
},
{
"epoch": 0.3904,
"grad_norm": 1.484171748161316,
"learning_rate": 0.0003734194803091329,
"loss": 5.2261762619018555,
"step": 244
},
{
"epoch": 0.392,
"grad_norm": 1.378163456916809,
"learning_rate": 0.00037222978768131857,
"loss": 5.523834228515625,
"step": 245
},
{
"epoch": 0.3936,
"grad_norm": 1.8471333980560303,
"learning_rate": 0.00037103644633774014,
"loss": 5.406384468078613,
"step": 246
},
{
"epoch": 0.3952,
"grad_norm": 1.4139055013656616,
"learning_rate": 0.00036983949190116575,
"loss": 5.400781631469727,
"step": 247
},
{
"epoch": 0.3968,
"grad_norm": 1.2311971187591553,
"learning_rate": 0.0003686389601022188,
"loss": 5.407512664794922,
"step": 248
},
{
"epoch": 0.3984,
"grad_norm": 1.7283658981323242,
"learning_rate": 0.0003674348867783115,
"loss": 5.575046062469482,
"step": 249
},
{
"epoch": 0.4,
"grad_norm": 1.3995170593261719,
"learning_rate": 0.0003662273078725754,
"loss": 5.523738384246826,
"step": 250
},
{
"epoch": 0.4016,
"grad_norm": 1.3066350221633911,
"learning_rate": 0.00036501625943278804,
"loss": 5.64078426361084,
"step": 251
},
{
"epoch": 0.4032,
"grad_norm": 1.3789863586425781,
"learning_rate": 0.0003638017776102968,
"loss": 5.428204536437988,
"step": 252
},
{
"epoch": 0.4048,
"grad_norm": 1.721011757850647,
"learning_rate": 0.00036258389865894027,
"loss": 5.646852016448975,
"step": 253
},
{
"epoch": 0.4064,
"grad_norm": 1.7198848724365234,
"learning_rate": 0.0003613626589339652,
"loss": 5.864961624145508,
"step": 254
},
{
"epoch": 0.408,
"grad_norm": 1.8125197887420654,
"learning_rate": 0.00036013809489094246,
"loss": 5.502827167510986,
"step": 255
},
{
"epoch": 0.4096,
"grad_norm": 1.5398613214492798,
"learning_rate": 0.00035891024308467727,
"loss": 5.422593116760254,
"step": 256
},
{
"epoch": 0.4112,
"grad_norm": 1.2854444980621338,
"learning_rate": 0.0003576791401681194,
"loss": 5.769440650939941,
"step": 257
},
{
"epoch": 0.4128,
"grad_norm": 1.302415370941162,
"learning_rate": 0.0003564448228912682,
"loss": 5.568209171295166,
"step": 258
},
{
"epoch": 0.4144,
"grad_norm": 1.4718657732009888,
"learning_rate": 0.00035520732810007566,
"loss": 5.543675422668457,
"step": 259
},
{
"epoch": 0.416,
"grad_norm": 1.6336448192596436,
"learning_rate": 0.0003539666927353469,
"loss": 5.599291801452637,
"step": 260
},
{
"epoch": 0.4176,
"grad_norm": 1.7621365785598755,
"learning_rate": 0.00035272295383163713,
"loss": 5.4962263107299805,
"step": 261
},
{
"epoch": 0.4192,
"grad_norm": 1.6452198028564453,
"learning_rate": 0.00035147614851614587,
"loss": 5.347473621368408,
"step": 262
},
{
"epoch": 0.4208,
"grad_norm": 1.3223097324371338,
"learning_rate": 0.00035022631400760944,
"loss": 5.4395928382873535,
"step": 263
},
{
"epoch": 0.4224,
"grad_norm": 1.178402304649353,
"learning_rate": 0.0003489734876151891,
"loss": 5.452559471130371,
"step": 264
},
{
"epoch": 0.424,
"grad_norm": 1.493491530418396,
"learning_rate": 0.0003477177067373579,
"loss": 5.549748420715332,
"step": 265
},
{
"epoch": 0.4256,
"grad_norm": 1.2983075380325317,
"learning_rate": 0.0003464590088607839,
"loss": 5.593997478485107,
"step": 266
},
{
"epoch": 0.4272,
"grad_norm": 1.4325454235076904,
"learning_rate": 0.00034519743155921127,
"loss": 5.567399978637695,
"step": 267
},
{
"epoch": 0.4288,
"grad_norm": 1.3392157554626465,
"learning_rate": 0.00034393301249233897,
"loss": 5.392118453979492,
"step": 268
},
{
"epoch": 0.4304,
"grad_norm": 1.543241262435913,
"learning_rate": 0.000342665789404696,
"loss": 5.2302565574646,
"step": 269
},
{
"epoch": 0.432,
"grad_norm": 1.5115416049957275,
"learning_rate": 0.00034139580012451523,
"loss": 5.704424858093262,
"step": 270
},
{
"epoch": 0.4336,
"grad_norm": 1.3637906312942505,
"learning_rate": 0.0003401230825626037,
"loss": 5.522019863128662,
"step": 271
},
{
"epoch": 0.4352,
"grad_norm": 1.5312447547912598,
"learning_rate": 0.00033884767471121125,
"loss": 5.600247859954834,
"step": 272
},
{
"epoch": 0.4368,
"grad_norm": 1.467431664466858,
"learning_rate": 0.00033756961464289633,
"loss": 5.204289436340332,
"step": 273
},
{
"epoch": 0.4384,
"grad_norm": 1.352095603942871,
"learning_rate": 0.0003362889405093894,
"loss": 5.327722549438477,
"step": 274
},
{
"epoch": 0.44,
"grad_norm": 1.3652808666229248,
"learning_rate": 0.0003350056905404543,
"loss": 5.118766784667969,
"step": 275
},
{
"epoch": 0.4416,
"grad_norm": 1.6171950101852417,
"learning_rate": 0.00033371990304274655,
"loss": 5.259974479675293,
"step": 276
},
{
"epoch": 0.4432,
"grad_norm": 1.6351940631866455,
"learning_rate": 0.0003324316163986704,
"loss": 5.432730197906494,
"step": 277
},
{
"epoch": 0.4448,
"grad_norm": 1.6966768503189087,
"learning_rate": 0.00033114086906523265,
"loss": 5.381967544555664,
"step": 278
},
{
"epoch": 0.4464,
"grad_norm": 1.3781499862670898,
"learning_rate": 0.00032984769957289503,
"loss": 5.303073883056641,
"step": 279
},
{
"epoch": 0.448,
"grad_norm": 1.5721884965896606,
"learning_rate": 0.0003285521465244237,
"loss": 5.291014671325684,
"step": 280
},
{
"epoch": 0.4496,
"grad_norm": 1.1372907161712646,
"learning_rate": 0.00032725424859373687,
"loss": 5.211060523986816,
"step": 281
},
{
"epoch": 0.4512,
"grad_norm": 1.293617844581604,
"learning_rate": 0.00032595404452475085,
"loss": 5.443847179412842,
"step": 282
},
{
"epoch": 0.4528,
"grad_norm": 2.1258699893951416,
"learning_rate": 0.0003246515731302228,
"loss": 5.064897537231445,
"step": 283
},
{
"epoch": 0.4544,
"grad_norm": 1.418958067893982,
"learning_rate": 0.00032334687329059264,
"loss": 5.420772552490234,
"step": 284
},
{
"epoch": 0.456,
"grad_norm": 1.2100834846496582,
"learning_rate": 0.0003220399839528222,
"loss": 5.425792217254639,
"step": 285
},
{
"epoch": 0.4576,
"grad_norm": 1.2931607961654663,
"learning_rate": 0.0003207309441292325,
"loss": 5.330716609954834,
"step": 286
},
{
"epoch": 0.4592,
"grad_norm": 1.4552083015441895,
"learning_rate": 0.0003194197928963396,
"loss": 5.734864234924316,
"step": 287
},
{
"epoch": 0.4608,
"grad_norm": 1.377821683883667,
"learning_rate": 0.00031810656939368744,
"loss": 5.4975361824035645,
"step": 288
},
{
"epoch": 0.4624,
"grad_norm": 1.3547130823135376,
"learning_rate": 0.0003167913128226803,
"loss": 5.421193599700928,
"step": 289
},
{
"epoch": 0.464,
"grad_norm": 1.4445191621780396,
"learning_rate": 0.0003154740624454118,
"loss": 5.138959884643555,
"step": 290
},
{
"epoch": 0.4656,
"grad_norm": 1.3757892847061157,
"learning_rate": 0.00031415485758349345,
"loss": 5.1781840324401855,
"step": 291
},
{
"epoch": 0.4672,
"grad_norm": 1.2458899021148682,
"learning_rate": 0.0003128337376168805,
"loss": 4.89755916595459,
"step": 292
},
{
"epoch": 0.4688,
"grad_norm": 1.581918478012085,
"learning_rate": 0.00031151074198269656,
"loss": 5.327348709106445,
"step": 293
},
{
"epoch": 0.4704,
"grad_norm": 1.5751845836639404,
"learning_rate": 0.00031018591017405644,
"loss": 5.386034965515137,
"step": 294
},
{
"epoch": 0.472,
"grad_norm": 1.6921762228012085,
"learning_rate": 0.0003088592817388869,
"loss": 5.158099174499512,
"step": 295
},
{
"epoch": 0.4736,
"grad_norm": 1.62604820728302,
"learning_rate": 0.0003075308962787466,
"loss": 5.450359344482422,
"step": 296
},
{
"epoch": 0.4752,
"grad_norm": 1.2735328674316406,
"learning_rate": 0.00030620079344764327,
"loss": 5.264720439910889,
"step": 297
},
{
"epoch": 0.4768,
"grad_norm": 1.6045722961425781,
"learning_rate": 0.00030486901295085066,
"loss": 5.421563625335693,
"step": 298
},
{
"epoch": 0.4784,
"grad_norm": 1.4631224870681763,
"learning_rate": 0.0003035355945437228,
"loss": 5.549293041229248,
"step": 299
},
{
"epoch": 0.48,
"grad_norm": 1.34758460521698,
"learning_rate": 0.00030220057803050765,
"loss": 5.213095664978027,
"step": 300
},
{
"epoch": 0.4816,
"grad_norm": 1.659041404724121,
"learning_rate": 0.0003008640032631585,
"loss": 5.40679931640625,
"step": 301
},
{
"epoch": 0.4832,
"grad_norm": 1.3234513998031616,
"learning_rate": 0.00029952591014014454,
"loss": 5.249087333679199,
"step": 302
},
{
"epoch": 0.4848,
"grad_norm": 1.2783095836639404,
"learning_rate": 0.0002981863386052599,
"loss": 5.571717262268066,
"step": 303
},
{
"epoch": 0.4864,
"grad_norm": 1.2698612213134766,
"learning_rate": 0.0002968453286464312,
"loss": 5.460443019866943,
"step": 304
},
{
"epoch": 0.488,
"grad_norm": 1.411340594291687,
"learning_rate": 0.00029550292029452375,
"loss": 5.521218776702881,
"step": 305
},
{
"epoch": 0.4896,
"grad_norm": 1.2482413053512573,
"learning_rate": 0.0002941591536221469,
"loss": 5.2962646484375,
"step": 306
},
{
"epoch": 0.4912,
"grad_norm": 1.3746726512908936,
"learning_rate": 0.0002928140687424573,
"loss": 5.614439964294434,
"step": 307
},
{
"epoch": 0.4928,
"grad_norm": 1.5684117078781128,
"learning_rate": 0.00029146770580796205,
"loss": 5.34489107131958,
"step": 308
},
{
"epoch": 0.4944,
"grad_norm": 1.8253686428070068,
"learning_rate": 0.00029012010500931965,
"loss": 5.56744384765625,
"step": 309
},
{
"epoch": 0.496,
"grad_norm": 1.4048644304275513,
"learning_rate": 0.00028877130657414054,
"loss": 5.361034393310547,
"step": 310
},
{
"epoch": 0.4976,
"grad_norm": 1.3948677778244019,
"learning_rate": 0.0002874213507657861,
"loss": 5.47017240524292,
"step": 311
},
{
"epoch": 0.4992,
"grad_norm": 1.4963343143463135,
"learning_rate": 0.00028607027788216674,
"loss": 5.397054672241211,
"step": 312
},
{
"epoch": 0.5008,
"grad_norm": 1.3787459135055542,
"learning_rate": 0.00028471812825453914,
"loss": 5.223832607269287,
"step": 313
},
{
"epoch": 0.5024,
"grad_norm": 1.6353243589401245,
"learning_rate": 0.0002833649422463019,
"loss": 5.2796525955200195,
"step": 314
},
{
"epoch": 0.504,
"grad_norm": 1.3684626817703247,
"learning_rate": 0.0002820107602517913,
"loss": 5.421512126922607,
"step": 315
},
{
"epoch": 0.5056,
"grad_norm": 1.2275117635726929,
"learning_rate": 0.0002806556226950746,
"loss": 5.282046318054199,
"step": 316
},
{
"epoch": 0.5072,
"grad_norm": 1.5556248426437378,
"learning_rate": 0.00027929957002874436,
"loss": 5.28046178817749,
"step": 317
},
{
"epoch": 0.5088,
"grad_norm": 1.5862129926681519,
"learning_rate": 0.00027794264273270987,
"loss": 5.368446350097656,
"step": 318
},
{
"epoch": 0.5104,
"grad_norm": 1.4724379777908325,
"learning_rate": 0.00027658488131298946,
"loss": 5.535717010498047,
"step": 319
},
{
"epoch": 0.512,
"grad_norm": 1.257763147354126,
"learning_rate": 0.00027522632630050116,
"loss": 5.145805835723877,
"step": 320
},
{
"epoch": 0.5136,
"grad_norm": 1.2067614793777466,
"learning_rate": 0.00027386701824985254,
"loss": 5.230715274810791,
"step": 321
},
{
"epoch": 0.5152,
"grad_norm": 1.3885655403137207,
"learning_rate": 0.00027250699773813066,
"loss": 5.397106170654297,
"step": 322
},
{
"epoch": 0.5168,
"grad_norm": 1.3261369466781616,
"learning_rate": 0.00027114630536369,
"loss": 5.118717193603516,
"step": 323
},
{
"epoch": 0.5184,
"grad_norm": 1.3677432537078857,
"learning_rate": 0.0002697849817449415,
"loss": 5.1717400550842285,
"step": 324
},
{
"epoch": 0.52,
"grad_norm": 1.476125955581665,
"learning_rate": 0.00026842306751913926,
"loss": 5.247461318969727,
"step": 325
},
{
"epoch": 0.5216,
"grad_norm": 1.4229127168655396,
"learning_rate": 0.0002670606033411678,
"loss": 5.157002925872803,
"step": 326
},
{
"epoch": 0.5232,
"grad_norm": 1.4474886655807495,
"learning_rate": 0.0002656976298823284,
"loss": 5.441634178161621,
"step": 327
},
{
"epoch": 0.5248,
"grad_norm": 1.4530051946640015,
"learning_rate": 0.00026433418782912505,
"loss": 5.526297569274902,
"step": 328
},
{
"epoch": 0.5264,
"grad_norm": 1.1628731489181519,
"learning_rate": 0.00026297031788205,
"loss": 5.242552280426025,
"step": 329
},
{
"epoch": 0.528,
"grad_norm": 1.245635747909546,
"learning_rate": 0.00026160606075436844,
"loss": 5.074901103973389,
"step": 330
},
{
"epoch": 0.5296,
"grad_norm": 1.2995966672897339,
"learning_rate": 0.0002602414571709036,
"loss": 5.35468864440918,
"step": 331
},
{
"epoch": 0.5312,
"grad_norm": 1.1434332132339478,
"learning_rate": 0.00025887654786682076,
"loss": 5.233968257904053,
"step": 332
},
{
"epoch": 0.5328,
"grad_norm": 1.8108292818069458,
"learning_rate": 0.0002575113735864114,
"loss": 5.389377593994141,
"step": 333
},
{
"epoch": 0.5344,
"grad_norm": 1.7074164152145386,
"learning_rate": 0.0002561459750818769,
"loss": 5.581827163696289,
"step": 334
},
{
"epoch": 0.536,
"grad_norm": 1.5946106910705566,
"learning_rate": 0.0002547803931121119,
"loss": 5.279594898223877,
"step": 335
},
{
"epoch": 0.5376,
"grad_norm": 1.6184440851211548,
"learning_rate": 0.00025341466844148775,
"loss": 5.198509693145752,
"step": 336
},
{
"epoch": 0.5392,
"grad_norm": 1.2537761926651,
"learning_rate": 0.0002520488418386358,
"loss": 5.231502056121826,
"step": 337
},
{
"epoch": 0.5408,
"grad_norm": 1.4233760833740234,
"learning_rate": 0.00025068295407523,
"loss": 5.152407646179199,
"step": 338
},
{
"epoch": 0.5424,
"grad_norm": 1.2714813947677612,
"learning_rate": 0.00024931704592477,
"loss": 5.5605878829956055,
"step": 339
},
{
"epoch": 0.544,
"grad_norm": 1.2656306028366089,
"learning_rate": 0.0002479511581613642,
"loss": 5.457594394683838,
"step": 340
},
{
"epoch": 0.5456,
"grad_norm": 1.1355462074279785,
"learning_rate": 0.00024658533155851227,
"loss": 5.645468711853027,
"step": 341
},
{
"epoch": 0.5472,
"grad_norm": 1.564833641052246,
"learning_rate": 0.0002452196068878881,
"loss": 5.560579299926758,
"step": 342
},
{
"epoch": 0.5488,
"grad_norm": 1.2560124397277832,
"learning_rate": 0.00024385402491812317,
"loss": 5.102597236633301,
"step": 343
},
{
"epoch": 0.5504,
"grad_norm": 1.47645103931427,
"learning_rate": 0.00024248862641358866,
"loss": 5.347832679748535,
"step": 344
},
{
"epoch": 0.552,
"grad_norm": 1.634925127029419,
"learning_rate": 0.00024112345213317933,
"loss": 5.229283332824707,
"step": 345
},
{
"epoch": 0.5536,
"grad_norm": 1.3205620050430298,
"learning_rate": 0.00023975854282909641,
"loss": 5.406874179840088,
"step": 346
},
{
"epoch": 0.5552,
"grad_norm": 1.5099257230758667,
"learning_rate": 0.00023839393924563162,
"loss": 5.050958156585693,
"step": 347
},
{
"epoch": 0.5568,
"grad_norm": 1.2842683792114258,
"learning_rate": 0.0002370296821179501,
"loss": 5.189534664154053,
"step": 348
},
{
"epoch": 0.5584,
"grad_norm": 1.1710087060928345,
"learning_rate": 0.00023566581217087493,
"loss": 5.227584362030029,
"step": 349
},
{
"epoch": 0.56,
"grad_norm": 1.3577237129211426,
"learning_rate": 0.00023430237011767165,
"loss": 5.079989433288574,
"step": 350
},
{
"epoch": 0.5616,
"grad_norm": 1.2834707498550415,
"learning_rate": 0.00023293939665883229,
"loss": 5.309730052947998,
"step": 351
},
{
"epoch": 0.5632,
"grad_norm": 1.4233572483062744,
"learning_rate": 0.0002315769324808608,
"loss": 5.27959680557251,
"step": 352
},
{
"epoch": 0.5648,
"grad_norm": 1.7741755247116089,
"learning_rate": 0.00023021501825505847,
"loss": 5.245169162750244,
"step": 353
},
{
"epoch": 0.5664,
"grad_norm": 1.743356466293335,
"learning_rate": 0.00022885369463631,
"loss": 5.384469985961914,
"step": 354
},
{
"epoch": 0.568,
"grad_norm": 1.3255281448364258,
"learning_rate": 0.00022749300226186948,
"loss": 5.170154094696045,
"step": 355
},
{
"epoch": 0.5696,
"grad_norm": 1.3135267496109009,
"learning_rate": 0.0002261329817501475,
"loss": 5.177214622497559,
"step": 356
},
{
"epoch": 0.5712,
"grad_norm": 1.334771990776062,
"learning_rate": 0.00022477367369949885,
"loss": 5.129632472991943,
"step": 357
},
{
"epoch": 0.5728,
"grad_norm": 1.5144758224487305,
"learning_rate": 0.00022341511868701055,
"loss": 4.999809741973877,
"step": 358
},
{
"epoch": 0.5744,
"grad_norm": 1.2315837144851685,
"learning_rate": 0.0002220573572672902,
"loss": 5.348094940185547,
"step": 359
},
{
"epoch": 0.576,
"grad_norm": 1.4335271120071411,
"learning_rate": 0.00022070042997125567,
"loss": 5.095552444458008,
"step": 360
},
{
"epoch": 0.5776,
"grad_norm": 1.6949377059936523,
"learning_rate": 0.00021934437730492543,
"loss": 5.0214924812316895,
"step": 361
},
{
"epoch": 0.5792,
"grad_norm": 1.3676100969314575,
"learning_rate": 0.00021798923974820884,
"loss": 5.584174633026123,
"step": 362
},
{
"epoch": 0.5808,
"grad_norm": 1.3218090534210205,
"learning_rate": 0.0002166350577536981,
"loss": 5.239519119262695,
"step": 363
},
{
"epoch": 0.5824,
"grad_norm": 1.3069649934768677,
"learning_rate": 0.00021528187174546092,
"loss": 5.372768402099609,
"step": 364
},
{
"epoch": 0.584,
"grad_norm": 1.3426185846328735,
"learning_rate": 0.00021392972211783332,
"loss": 5.219846248626709,
"step": 365
},
{
"epoch": 0.5856,
"grad_norm": 1.3411294221878052,
"learning_rate": 0.00021257864923421402,
"loss": 4.874852180480957,
"step": 366
},
{
"epoch": 0.5872,
"grad_norm": 1.4060436487197876,
"learning_rate": 0.00021122869342585948,
"loss": 5.2531046867370605,
"step": 367
},
{
"epoch": 0.5888,
"grad_norm": 1.192141056060791,
"learning_rate": 0.00020987989499068042,
"loss": 5.342706203460693,
"step": 368
},
{
"epoch": 0.5904,
"grad_norm": 1.3001792430877686,
"learning_rate": 0.00020853229419203807,
"loss": 5.323460578918457,
"step": 369
},
{
"epoch": 0.592,
"grad_norm": 1.4926820993423462,
"learning_rate": 0.0002071859312575427,
"loss": 5.296498775482178,
"step": 370
},
{
"epoch": 0.5936,
"grad_norm": 1.434384822845459,
"learning_rate": 0.00020584084637785316,
"loss": 5.084543228149414,
"step": 371
},
{
"epoch": 0.5952,
"grad_norm": 2.288747787475586,
"learning_rate": 0.00020449707970547629,
"loss": 5.0905585289001465,
"step": 372
},
{
"epoch": 0.5968,
"grad_norm": 1.4251408576965332,
"learning_rate": 0.0002031546713535688,
"loss": 5.365981101989746,
"step": 373
},
{
"epoch": 0.5984,
"grad_norm": 1.317584753036499,
"learning_rate": 0.00020181366139474012,
"loss": 5.608163356781006,
"step": 374
},
{
"epoch": 0.6,
"grad_norm": 1.187654733657837,
"learning_rate": 0.00020047408985985552,
"loss": 4.876247406005859,
"step": 375
},
{
"epoch": 0.6016,
"grad_norm": 2.2563083171844482,
"learning_rate": 0.0001991359967368416,
"loss": 5.187510013580322,
"step": 376
},
{
"epoch": 0.6032,
"grad_norm": 1.282902479171753,
"learning_rate": 0.00019779942196949238,
"loss": 5.240813255310059,
"step": 377
},
{
"epoch": 0.6048,
"grad_norm": 1.345765471458435,
"learning_rate": 0.00019646440545627723,
"loss": 5.2197957038879395,
"step": 378
},
{
"epoch": 0.6064,
"grad_norm": 1.2917436361312866,
"learning_rate": 0.0001951309870491494,
"loss": 5.324549674987793,
"step": 379
},
{
"epoch": 0.608,
"grad_norm": 1.2900662422180176,
"learning_rate": 0.0001937992065523567,
"loss": 5.294788360595703,
"step": 380
},
{
"epoch": 0.6096,
"grad_norm": 1.5021880865097046,
"learning_rate": 0.00019246910372125342,
"loss": 5.409048080444336,
"step": 381
},
{
"epoch": 0.6112,
"grad_norm": 1.6828486919403076,
"learning_rate": 0.0001911407182611131,
"loss": 5.392390251159668,
"step": 382
},
{
"epoch": 0.6128,
"grad_norm": 1.5416451692581177,
"learning_rate": 0.00018981408982594365,
"loss": 5.151852130889893,
"step": 383
},
{
"epoch": 0.6144,
"grad_norm": 1.2479101419448853,
"learning_rate": 0.00018848925801730342,
"loss": 5.193958759307861,
"step": 384
},
{
"epoch": 0.616,
"grad_norm": 1.471063256263733,
"learning_rate": 0.00018716626238311958,
"loss": 4.912611961364746,
"step": 385
},
{
"epoch": 0.6176,
"grad_norm": 1.285828948020935,
"learning_rate": 0.00018584514241650667,
"loss": 4.898399829864502,
"step": 386
},
{
"epoch": 0.6192,
"grad_norm": 1.8733534812927246,
"learning_rate": 0.0001845259375545882,
"loss": 4.954188346862793,
"step": 387
},
{
"epoch": 0.6208,
"grad_norm": 1.55088472366333,
"learning_rate": 0.00018320868717731977,
"loss": 5.39755916595459,
"step": 388
},
{
"epoch": 0.6224,
"grad_norm": 1.876209020614624,
"learning_rate": 0.00018189343060631257,
"loss": 5.461378574371338,
"step": 389
},
{
"epoch": 0.624,
"grad_norm": 1.192241907119751,
"learning_rate": 0.0001805802071036605,
"loss": 4.95612096786499,
"step": 390
},
{
"epoch": 0.6256,
"grad_norm": 1.3008099794387817,
"learning_rate": 0.00017926905587076748,
"loss": 5.485091209411621,
"step": 391
},
{
"epoch": 0.6272,
"grad_norm": 1.7544057369232178,
"learning_rate": 0.00017796001604717787,
"loss": 4.80226993560791,
"step": 392
},
{
"epoch": 0.6288,
"grad_norm": 1.2537293434143066,
"learning_rate": 0.00017665312670940743,
"loss": 5.096302509307861,
"step": 393
},
{
"epoch": 0.6304,
"grad_norm": 1.1589773893356323,
"learning_rate": 0.0001753484268697772,
"loss": 5.296406269073486,
"step": 394
},
{
"epoch": 0.632,
"grad_norm": 1.2438563108444214,
"learning_rate": 0.0001740459554752492,
"loss": 5.258586406707764,
"step": 395
},
{
"epoch": 0.6336,
"grad_norm": 1.2174347639083862,
"learning_rate": 0.00017274575140626317,
"loss": 5.269428253173828,
"step": 396
},
{
"epoch": 0.6352,
"grad_norm": 1.4118070602416992,
"learning_rate": 0.00017144785347557643,
"loss": 4.895862579345703,
"step": 397
},
{
"epoch": 0.6368,
"grad_norm": 1.2514833211898804,
"learning_rate": 0.000170152300427105,
"loss": 5.026675701141357,
"step": 398
},
{
"epoch": 0.6384,
"grad_norm": 1.2788375616073608,
"learning_rate": 0.0001688591309347674,
"loss": 5.225519180297852,
"step": 399
},
{
"epoch": 0.64,
"grad_norm": 2.105532646179199,
"learning_rate": 0.00016756838360132968,
"loss": 4.846694469451904,
"step": 400
},
{
"epoch": 0.6416,
"grad_norm": 1.5078997611999512,
"learning_rate": 0.00016628009695725346,
"loss": 5.365673065185547,
"step": 401
},
{
"epoch": 0.6432,
"grad_norm": 1.2744578123092651,
"learning_rate": 0.00016499430945954576,
"loss": 5.406460285186768,
"step": 402
},
{
"epoch": 0.6448,
"grad_norm": 1.494751214981079,
"learning_rate": 0.0001637110594906106,
"loss": 5.130960464477539,
"step": 403
},
{
"epoch": 0.6464,
"grad_norm": 1.4280880689620972,
"learning_rate": 0.00016243038535710365,
"loss": 5.194888114929199,
"step": 404
},
{
"epoch": 0.648,
"grad_norm": 1.2241549491882324,
"learning_rate": 0.00016115232528878876,
"loss": 4.969592571258545,
"step": 405
},
{
"epoch": 0.6496,
"grad_norm": 1.2147563695907593,
"learning_rate": 0.00015987691743739636,
"loss": 5.176176071166992,
"step": 406
},
{
"epoch": 0.6512,
"grad_norm": 1.1825661659240723,
"learning_rate": 0.00015860419987548486,
"loss": 5.012125015258789,
"step": 407
},
{
"epoch": 0.6528,
"grad_norm": 1.3765822649002075,
"learning_rate": 0.00015733421059530397,
"loss": 5.192166328430176,
"step": 408
},
{
"epoch": 0.6544,
"grad_norm": 1.455336332321167,
"learning_rate": 0.00015606698750766107,
"loss": 5.153839111328125,
"step": 409
},
{
"epoch": 0.656,
"grad_norm": 1.2426291704177856,
"learning_rate": 0.00015480256844078877,
"loss": 5.300335884094238,
"step": 410
},
{
"epoch": 0.6576,
"grad_norm": 1.2273467779159546,
"learning_rate": 0.00015354099113921613,
"loss": 5.370866775512695,
"step": 411
},
{
"epoch": 0.6592,
"grad_norm": 1.3272308111190796,
"learning_rate": 0.0001522822932626421,
"loss": 5.237664699554443,
"step": 412
},
{
"epoch": 0.6608,
"grad_norm": 1.486881136894226,
"learning_rate": 0.00015102651238481092,
"loss": 5.199460029602051,
"step": 413
},
{
"epoch": 0.6624,
"grad_norm": 1.225791096687317,
"learning_rate": 0.0001497736859923906,
"loss": 5.001354217529297,
"step": 414
},
{
"epoch": 0.664,
"grad_norm": 1.1577017307281494,
"learning_rate": 0.00014852385148385412,
"loss": 4.978085517883301,
"step": 415
},
{
"epoch": 0.6656,
"grad_norm": 1.1296128034591675,
"learning_rate": 0.00014727704616836296,
"loss": 5.08205509185791,
"step": 416
},
{
"epoch": 0.6672,
"grad_norm": 1.450363278388977,
"learning_rate": 0.00014603330726465315,
"loss": 5.209231853485107,
"step": 417
},
{
"epoch": 0.6688,
"grad_norm": 1.1756222248077393,
"learning_rate": 0.00014479267189992435,
"loss": 5.059493064880371,
"step": 418
},
{
"epoch": 0.6704,
"grad_norm": 1.3998825550079346,
"learning_rate": 0.00014355517710873183,
"loss": 4.99937629699707,
"step": 419
},
{
"epoch": 0.672,
"grad_norm": 1.3438893556594849,
"learning_rate": 0.00014232085983188064,
"loss": 5.317448616027832,
"step": 420
},
{
"epoch": 0.6736,
"grad_norm": 1.080320119857788,
"learning_rate": 0.00014108975691532271,
"loss": 5.1715264320373535,
"step": 421
},
{
"epoch": 0.6752,
"grad_norm": 1.2611881494522095,
"learning_rate": 0.00013986190510905758,
"loss": 4.58638858795166,
"step": 422
},
{
"epoch": 0.6768,
"grad_norm": 1.2457435131072998,
"learning_rate": 0.0001386373410660347,
"loss": 4.950125217437744,
"step": 423
},
{
"epoch": 0.6784,
"grad_norm": 1.7552827596664429,
"learning_rate": 0.00013741610134105983,
"loss": 5.444072723388672,
"step": 424
},
{
"epoch": 0.68,
"grad_norm": 1.21152925491333,
"learning_rate": 0.0001361982223897032,
"loss": 5.073456287384033,
"step": 425
},
{
"epoch": 0.6816,
"grad_norm": 1.5059016942977905,
"learning_rate": 0.00013498374056721197,
"loss": 5.584665298461914,
"step": 426
},
{
"epoch": 0.6832,
"grad_norm": 1.4177290201187134,
"learning_rate": 0.00013377269212742457,
"loss": 5.289451599121094,
"step": 427
},
{
"epoch": 0.6848,
"grad_norm": 1.4181674718856812,
"learning_rate": 0.0001325651132216886,
"loss": 4.7561540603637695,
"step": 428
},
{
"epoch": 0.6864,
"grad_norm": 1.1193443536758423,
"learning_rate": 0.00013136103989778137,
"loss": 5.055768013000488,
"step": 429
},
{
"epoch": 0.688,
"grad_norm": 1.1662368774414062,
"learning_rate": 0.00013016050809883434,
"loss": 4.925864219665527,
"step": 430
},
{
"epoch": 0.6896,
"grad_norm": 1.188244104385376,
"learning_rate": 0.00012896355366225998,
"loss": 4.825364589691162,
"step": 431
},
{
"epoch": 0.6912,
"grad_norm": 1.4330700635910034,
"learning_rate": 0.00012777021231868144,
"loss": 5.1424055099487305,
"step": 432
},
{
"epoch": 0.6928,
"grad_norm": 1.5289138555526733,
"learning_rate": 0.00012658051969086713,
"loss": 5.1443772315979,
"step": 433
},
{
"epoch": 0.6944,
"grad_norm": 1.455989122390747,
"learning_rate": 0.00012539451129266603,
"loss": 4.967620849609375,
"step": 434
},
{
"epoch": 0.696,
"grad_norm": 1.36936354637146,
"learning_rate": 0.00012421222252794833,
"loss": 5.1624908447265625,
"step": 435
},
{
"epoch": 0.6976,
"grad_norm": 1.3274517059326172,
"learning_rate": 0.0001230336886895485,
"loss": 5.160506725311279,
"step": 436
},
{
"epoch": 0.6992,
"grad_norm": 1.3301618099212646,
"learning_rate": 0.0001218589449582116,
"loss": 4.8344645500183105,
"step": 437
},
{
"epoch": 0.7008,
"grad_norm": 1.4845178127288818,
"learning_rate": 0.00012068802640154292,
"loss": 4.987344264984131,
"step": 438
},
{
"epoch": 0.7024,
"grad_norm": 1.2381513118743896,
"learning_rate": 0.00011952096797296167,
"loss": 4.904998779296875,
"step": 439
},
{
"epoch": 0.704,
"grad_norm": 1.395328402519226,
"learning_rate": 0.00011835780451065722,
"loss": 4.8166656494140625,
"step": 440
},
{
"epoch": 0.7056,
"grad_norm": 1.9488160610198975,
"learning_rate": 0.00011719857073654922,
"loss": 5.329633712768555,
"step": 441
},
{
"epoch": 0.7072,
"grad_norm": 1.4535843133926392,
"learning_rate": 0.00011604330125525078,
"loss": 4.918258190155029,
"step": 442
},
{
"epoch": 0.7088,
"grad_norm": 1.4393301010131836,
"learning_rate": 0.00011489203055303646,
"loss": 5.293149471282959,
"step": 443
},
{
"epoch": 0.7104,
"grad_norm": 1.5147560834884644,
"learning_rate": 0.00011374479299681142,
"loss": 5.193087100982666,
"step": 444
},
{
"epoch": 0.712,
"grad_norm": 1.9008417129516602,
"learning_rate": 0.00011260162283308678,
"loss": 5.060847282409668,
"step": 445
},
{
"epoch": 0.7136,
"grad_norm": 1.42693030834198,
"learning_rate": 0.00011146255418695633,
"loss": 5.017470836639404,
"step": 446
},
{
"epoch": 0.7152,
"grad_norm": 1.3155730962753296,
"learning_rate": 0.00011032762106107872,
"loss": 5.276302337646484,
"step": 447
},
{
"epoch": 0.7168,
"grad_norm": 1.414832592010498,
"learning_rate": 0.00010919685733466175,
"loss": 5.105321884155273,
"step": 448
},
{
"epoch": 0.7184,
"grad_norm": 1.4331352710723877,
"learning_rate": 0.00010807029676245145,
"loss": 5.178823471069336,
"step": 449
},
{
"epoch": 0.72,
"grad_norm": 2.958193302154541,
"learning_rate": 0.00010694797297372433,
"loss": 5.053134918212891,
"step": 450
},
{
"epoch": 0.7216,
"grad_norm": 1.4789056777954102,
"learning_rate": 0.00010582991947128323,
"loss": 5.253017425537109,
"step": 451
},
{
"epoch": 0.7232,
"grad_norm": 1.4438488483428955,
"learning_rate": 0.00010471616963045788,
"loss": 4.795893669128418,
"step": 452
},
{
"epoch": 0.7248,
"grad_norm": 1.0840559005737305,
"learning_rate": 0.00010360675669810765,
"loss": 4.984047889709473,
"step": 453
},
{
"epoch": 0.7264,
"grad_norm": 1.0405324697494507,
"learning_rate": 0.00010250171379163034,
"loss": 5.2449116706848145,
"step": 454
},
{
"epoch": 0.728,
"grad_norm": 1.5884569883346558,
"learning_rate": 0.00010140107389797223,
"loss": 4.744875907897949,
"step": 455
},
{
"epoch": 0.7296,
"grad_norm": 1.3832892179489136,
"learning_rate": 0.00010030486987264437,
"loss": 5.204304218292236,
"step": 456
},
{
"epoch": 0.7312,
"grad_norm": 1.5350919961929321,
"learning_rate": 9.921313443874142e-05,
"loss": 4.8627400398254395,
"step": 457
},
{
"epoch": 0.7328,
"grad_norm": 1.3951729536056519,
"learning_rate": 9.812590018596485e-05,
"loss": 4.816617488861084,
"step": 458
},
{
"epoch": 0.7344,
"grad_norm": 1.4187312126159668,
"learning_rate": 9.704319956964996e-05,
"loss": 5.244232654571533,
"step": 459
},
{
"epoch": 0.736,
"grad_norm": 1.9965143203735352,
"learning_rate": 9.596506490979737e-05,
"loss": 5.668506145477295,
"step": 460
},
{
"epoch": 0.7376,
"grad_norm": 1.6400834321975708,
"learning_rate": 9.489152839010798e-05,
"loss": 5.365629196166992,
"step": 461
},
{
"epoch": 0.7392,
"grad_norm": 1.442253828048706,
"learning_rate": 9.382262205702247e-05,
"loss": 5.322830677032471,
"step": 462
},
{
"epoch": 0.7408,
"grad_norm": 1.1397078037261963,
"learning_rate": 9.275837781876404e-05,
"loss": 5.002555847167969,
"step": 463
},
{
"epoch": 0.7424,
"grad_norm": 1.4520896673202515,
"learning_rate": 9.16988274443871e-05,
"loss": 5.138970375061035,
"step": 464
},
{
"epoch": 0.744,
"grad_norm": 1.3373026847839355,
"learning_rate": 9.064400256282756e-05,
"loss": 5.060115814208984,
"step": 465
},
{
"epoch": 0.7456,
"grad_norm": 1.3698216676712036,
"learning_rate": 8.959393466195972e-05,
"loss": 5.160407066345215,
"step": 466
},
{
"epoch": 0.7472,
"grad_norm": 1.45284104347229,
"learning_rate": 8.854865508765577e-05,
"loss": 4.794371604919434,
"step": 467
},
{
"epoch": 0.7488,
"grad_norm": 1.2445486783981323,
"learning_rate": 8.750819504285015e-05,
"loss": 4.926098823547363,
"step": 468
},
{
"epoch": 0.7504,
"grad_norm": 1.5558010339736938,
"learning_rate": 8.647258558660828e-05,
"loss": 5.0971245765686035,
"step": 469
},
{
"epoch": 0.752,
"grad_norm": 1.5887895822525024,
"learning_rate": 8.544185763319925e-05,
"loss": 5.4126152992248535,
"step": 470
},
{
"epoch": 0.7536,
"grad_norm": 1.1927727460861206,
"learning_rate": 8.441604195117314e-05,
"loss": 4.76765251159668,
"step": 471
},
{
"epoch": 0.7552,
"grad_norm": 1.1783281564712524,
"learning_rate": 8.339516916244216e-05,
"loss": 5.2575907707214355,
"step": 472
},
{
"epoch": 0.7568,
"grad_norm": 1.4256731271743774,
"learning_rate": 8.237926974136715e-05,
"loss": 4.811319351196289,
"step": 473
},
{
"epoch": 0.7584,
"grad_norm": 1.1950210332870483,
"learning_rate": 8.136837401384733e-05,
"loss": 5.229648590087891,
"step": 474
},
{
"epoch": 0.76,
"grad_norm": 1.409590721130371,
"learning_rate": 8.036251215641546e-05,
"loss": 5.007275104522705,
"step": 475
},
{
"epoch": 0.7616,
"grad_norm": 1.3664684295654297,
"learning_rate": 7.936171419533653e-05,
"loss": 5.1865339279174805,
"step": 476
},
{
"epoch": 0.7632,
"grad_norm": 1.272782564163208,
"learning_rate": 7.836601000571197e-05,
"loss": 5.0746636390686035,
"step": 477
},
{
"epoch": 0.7648,
"grad_norm": 1.430291771888733,
"learning_rate": 7.737542931058755e-05,
"loss": 5.309817790985107,
"step": 478
},
{
"epoch": 0.7664,
"grad_norm": 1.391274094581604,
"learning_rate": 7.63900016800663e-05,
"loss": 4.913700103759766,
"step": 479
},
{
"epoch": 0.768,
"grad_norm": 1.8367639780044556,
"learning_rate": 7.54097565304252e-05,
"loss": 4.870950222015381,
"step": 480
},
{
"epoch": 0.7696,
"grad_norm": 1.5375534296035767,
"learning_rate": 7.443472312323824e-05,
"loss": 5.078888893127441,
"step": 481
},
{
"epoch": 0.7712,
"grad_norm": 1.3212310075759888,
"learning_rate": 7.346493056450157e-05,
"loss": 4.916213512420654,
"step": 482
},
{
"epoch": 0.7728,
"grad_norm": 1.4506617784500122,
"learning_rate": 7.250040780376577e-05,
"loss": 4.79956579208374,
"step": 483
},
{
"epoch": 0.7744,
"grad_norm": 1.269956350326538,
"learning_rate": 7.154118363327075e-05,
"loss": 5.207999229431152,
"step": 484
},
{
"epoch": 0.776,
"grad_norm": 1.386398196220398,
"learning_rate": 7.058728668708727e-05,
"loss": 4.866647720336914,
"step": 485
},
{
"epoch": 0.7776,
"grad_norm": 1.2891589403152466,
"learning_rate": 6.963874544026109e-05,
"loss": 5.038686752319336,
"step": 486
},
{
"epoch": 0.7792,
"grad_norm": 1.2647722959518433,
"learning_rate": 6.869558820796376e-05,
"loss": 5.102810859680176,
"step": 487
},
{
"epoch": 0.7808,
"grad_norm": 1.2693649530410767,
"learning_rate": 6.775784314464717e-05,
"loss": 4.887539863586426,
"step": 488
},
{
"epoch": 0.7824,
"grad_norm": 1.6362860202789307,
"learning_rate": 6.68255382432027e-05,
"loss": 4.774933338165283,
"step": 489
},
{
"epoch": 0.784,
"grad_norm": 1.5527857542037964,
"learning_rate": 6.589870133412626e-05,
"loss": 5.0828680992126465,
"step": 490
},
{
"epoch": 0.7856,
"grad_norm": 1.6107929944992065,
"learning_rate": 6.497736008468701e-05,
"loss": 4.6461639404296875,
"step": 491
},
{
"epoch": 0.7872,
"grad_norm": 1.12363862991333,
"learning_rate": 6.406154199810179e-05,
"loss": 5.033900260925293,
"step": 492
},
{
"epoch": 0.7888,
"grad_norm": 1.1499987840652466,
"learning_rate": 6.315127441271368e-05,
"loss": 4.9476094245910645,
"step": 493
},
{
"epoch": 0.7904,
"grad_norm": 1.5613439083099365,
"learning_rate": 6.224658450117637e-05,
"loss": 5.146108150482178,
"step": 494
},
{
"epoch": 0.792,
"grad_norm": 1.2324504852294922,
"learning_rate": 6.134749926964289e-05,
"loss": 4.819706916809082,
"step": 495
},
{
"epoch": 0.7936,
"grad_norm": 1.1125681400299072,
"learning_rate": 6.0454045556959356e-05,
"loss": 4.930054664611816,
"step": 496
},
{
"epoch": 0.7952,
"grad_norm": 1.6992604732513428,
"learning_rate": 5.9566250033863567e-05,
"loss": 5.198884963989258,
"step": 497
},
{
"epoch": 0.7968,
"grad_norm": 1.920567512512207,
"learning_rate": 5.8684139202189654e-05,
"loss": 5.21380615234375,
"step": 498
},
{
"epoch": 0.7984,
"grad_norm": 1.3954874277114868,
"learning_rate": 5.780773939407585e-05,
"loss": 4.928266525268555,
"step": 499
},
{
"epoch": 0.8,
"grad_norm": 1.4884490966796875,
"learning_rate": 5.693707677117943e-05,
"loss": 5.14831018447876,
"step": 500
},
{
"epoch": 0.8016,
"grad_norm": 1.7664364576339722,
"learning_rate": 5.607217732389502e-05,
"loss": 5.231863975524902,
"step": 501
},
{
"epoch": 0.8032,
"grad_norm": 1.1272830963134766,
"learning_rate": 5.5213066870579476e-05,
"loss": 5.004734039306641,
"step": 502
},
{
"epoch": 0.8048,
"grad_norm": 1.2964353561401367,
"learning_rate": 5.4359771056780333e-05,
"loss": 4.362703323364258,
"step": 503
},
{
"epoch": 0.8064,
"grad_norm": 1.3352986574172974,
"learning_rate": 5.3512315354470956e-05,
"loss": 4.99576473236084,
"step": 504
},
{
"epoch": 0.808,
"grad_norm": 1.4980597496032715,
"learning_rate": 5.267072506128981e-05,
"loss": 5.139542579650879,
"step": 505
},
{
"epoch": 0.8096,
"grad_norm": 1.1959021091461182,
"learning_rate": 5.183502529978548e-05,
"loss": 5.123270034790039,
"step": 506
},
{
"epoch": 0.8112,
"grad_norm": 1.3239198923110962,
"learning_rate": 5.10052410166664e-05,
"loss": 5.379024028778076,
"step": 507
},
{
"epoch": 0.8128,
"grad_norm": 1.204946756362915,
"learning_rate": 5.018139698205665e-05,
"loss": 5.012156963348389,
"step": 508
},
{
"epoch": 0.8144,
"grad_norm": 1.5109254121780396,
"learning_rate": 4.9363517788756195e-05,
"loss": 4.902032852172852,
"step": 509
},
{
"epoch": 0.816,
"grad_norm": 1.1028631925582886,
"learning_rate": 4.855162785150674e-05,
"loss": 5.165895938873291,
"step": 510
},
{
"epoch": 0.8176,
"grad_norm": 1.042698860168457,
"learning_rate": 4.7745751406263163e-05,
"loss": 4.897646427154541,
"step": 511
},
{
"epoch": 0.8192,
"grad_norm": 1.2713276147842407,
"learning_rate": 4.694591250946983e-05,
"loss": 4.820833206176758,
"step": 512
},
{
"epoch": 0.8208,
"grad_norm": 1.1189286708831787,
"learning_rate": 4.615213503734267e-05,
"loss": 4.981866836547852,
"step": 513
},
{
"epoch": 0.8224,
"grad_norm": 1.3545044660568237,
"learning_rate": 4.536444268515608e-05,
"loss": 4.901456832885742,
"step": 514
},
{
"epoch": 0.824,
"grad_norm": 1.3025493621826172,
"learning_rate": 4.458285896653602e-05,
"loss": 5.010705947875977,
"step": 515
},
{
"epoch": 0.8256,
"grad_norm": 1.5655075311660767,
"learning_rate": 4.380740721275786e-05,
"loss": 5.438045501708984,
"step": 516
},
{
"epoch": 0.8272,
"grad_norm": 1.4804078340530396,
"learning_rate": 4.303811057205007e-05,
"loss": 4.864298343658447,
"step": 517
},
{
"epoch": 0.8288,
"grad_norm": 1.3067195415496826,
"learning_rate": 4.227499200890275e-05,
"loss": 5.399082183837891,
"step": 518
},
{
"epoch": 0.8304,
"grad_norm": 1.3728652000427246,
"learning_rate": 4.1518074303383004e-05,
"loss": 4.861635684967041,
"step": 519
},
{
"epoch": 0.832,
"grad_norm": 1.0616425275802612,
"learning_rate": 4.076738005045394e-05,
"loss": 5.093954563140869,
"step": 520
},
{
"epoch": 0.8336,
"grad_norm": 1.2632859945297241,
"learning_rate": 4.002293165930088e-05,
"loss": 5.069172382354736,
"step": 521
},
{
"epoch": 0.8352,
"grad_norm": 1.54668390750885,
"learning_rate": 3.9284751352662045e-05,
"loss": 5.132449150085449,
"step": 522
},
{
"epoch": 0.8368,
"grad_norm": 1.4716906547546387,
"learning_rate": 3.855286116616541e-05,
"loss": 4.952608585357666,
"step": 523
},
{
"epoch": 0.8384,
"grad_norm": 1.315252423286438,
"learning_rate": 3.782728294767068e-05,
"loss": 4.983213424682617,
"step": 524
},
{
"epoch": 0.84,
"grad_norm": 1.4445892572402954,
"learning_rate": 3.7108038356617305e-05,
"loss": 5.154409885406494,
"step": 525
},
{
"epoch": 0.8416,
"grad_norm": 1.3014910221099854,
"learning_rate": 3.6395148863377855e-05,
"loss": 4.867927551269531,
"step": 526
},
{
"epoch": 0.8432,
"grad_norm": 1.1832693815231323,
"learning_rate": 3.568863574861708e-05,
"loss": 4.7219462394714355,
"step": 527
},
{
"epoch": 0.8448,
"grad_norm": 1.38213312625885,
"learning_rate": 3.49885201026566e-05,
"loss": 4.771894931793213,
"step": 528
},
{
"epoch": 0.8464,
"grad_norm": 1.2693217992782593,
"learning_rate": 3.4294822824845444e-05,
"loss": 4.964877128601074,
"step": 529
},
{
"epoch": 0.848,
"grad_norm": 1.170465350151062,
"learning_rate": 3.3607564622936207e-05,
"loss": 4.916166305541992,
"step": 530
},
{
"epoch": 0.8496,
"grad_norm": 1.267838716506958,
"learning_rate": 3.292676601246661e-05,
"loss": 5.243579387664795,
"step": 531
},
{
"epoch": 0.8512,
"grad_norm": 1.3622010946273804,
"learning_rate": 3.2252447316147456e-05,
"loss": 4.598936080932617,
"step": 532
},
{
"epoch": 0.8528,
"grad_norm": 1.5820192098617554,
"learning_rate": 3.1584628663255847e-05,
"loss": 5.2594170570373535,
"step": 533
},
{
"epoch": 0.8544,
"grad_norm": 1.5312021970748901,
"learning_rate": 3.092332998903416e-05,
"loss": 5.157290935516357,
"step": 534
},
{
"epoch": 0.856,
"grad_norm": 1.4027749300003052,
"learning_rate": 3.0268571034094944e-05,
"loss": 5.125532150268555,
"step": 535
},
{
"epoch": 0.8576,
"grad_norm": 1.1611146926879883,
"learning_rate": 2.962037134383211e-05,
"loss": 5.000718593597412,
"step": 536
},
{
"epoch": 0.8592,
"grad_norm": 1.3523814678192139,
"learning_rate": 2.8978750267836752e-05,
"loss": 4.671696662902832,
"step": 537
},
{
"epoch": 0.8608,
"grad_norm": 1.2509510517120361,
"learning_rate": 2.8343726959320082e-05,
"loss": 5.075153350830078,
"step": 538
},
{
"epoch": 0.8624,
"grad_norm": 1.3108588457107544,
"learning_rate": 2.7715320374541357e-05,
"loss": 4.994152545928955,
"step": 539
},
{
"epoch": 0.864,
"grad_norm": 1.1837953329086304,
"learning_rate": 2.7093549272242445e-05,
"loss": 5.121654510498047,
"step": 540
},
{
"epoch": 0.8656,
"grad_norm": 1.5410609245300293,
"learning_rate": 2.6478432213087213e-05,
"loss": 4.955600738525391,
"step": 541
},
{
"epoch": 0.8672,
"grad_norm": 1.0305265188217163,
"learning_rate": 2.5869987559107992e-05,
"loss": 5.132237911224365,
"step": 542
},
{
"epoch": 0.8688,
"grad_norm": 1.219406247138977,
"learning_rate": 2.5268233473157294e-05,
"loss": 4.905612468719482,
"step": 543
},
{
"epoch": 0.8704,
"grad_norm": 1.5246868133544922,
"learning_rate": 2.467318791836559e-05,
"loss": 5.272589206695557,
"step": 544
},
{
"epoch": 0.872,
"grad_norm": 1.2425482273101807,
"learning_rate": 2.408486865760495e-05,
"loss": 5.108579158782959,
"step": 545
},
{
"epoch": 0.8736,
"grad_norm": 1.1925750970840454,
"learning_rate": 2.3503293252959136e-05,
"loss": 5.024507522583008,
"step": 546
},
{
"epoch": 0.8752,
"grad_norm": 1.2723841667175293,
"learning_rate": 2.2928479065199072e-05,
"loss": 5.255931377410889,
"step": 547
},
{
"epoch": 0.8768,
"grad_norm": 1.620451807975769,
"learning_rate": 2.2360443253264777e-05,
"loss": 5.196926593780518,
"step": 548
},
{
"epoch": 0.8784,
"grad_norm": 1.1335077285766602,
"learning_rate": 2.179920277375294e-05,
"loss": 4.717995643615723,
"step": 549
},
{
"epoch": 0.88,
"grad_norm": 1.1418888568878174,
"learning_rate": 2.1244774380410976e-05,
"loss": 5.335053443908691,
"step": 550
},
{
"epoch": 0.8816,
"grad_norm": 1.3852171897888184,
"learning_rate": 2.0697174623636794e-05,
"loss": 5.047591209411621,
"step": 551
},
{
"epoch": 0.8832,
"grad_norm": 1.2350728511810303,
"learning_rate": 2.015641984998459e-05,
"loss": 4.715671062469482,
"step": 552
},
{
"epoch": 0.8848,
"grad_norm": 1.115648865699768,
"learning_rate": 1.9622526201677344e-05,
"loss": 5.0985612869262695,
"step": 553
},
{
"epoch": 0.8864,
"grad_norm": 1.7186869382858276,
"learning_rate": 1.9095509616124385e-05,
"loss": 4.931835651397705,
"step": 554
},
{
"epoch": 0.888,
"grad_norm": 1.2360730171203613,
"learning_rate": 1.85753858254461e-05,
"loss": 4.8929924964904785,
"step": 555
},
{
"epoch": 0.8896,
"grad_norm": 1.146570086479187,
"learning_rate": 1.8062170356003854e-05,
"loss": 5.117987632751465,
"step": 556
},
{
"epoch": 0.8912,
"grad_norm": 1.1873035430908203,
"learning_rate": 1.7555878527937163e-05,
"loss": 4.8101091384887695,
"step": 557
},
{
"epoch": 0.8928,
"grad_norm": 1.1898494958877563,
"learning_rate": 1.7056525454705623e-05,
"loss": 5.127380847930908,
"step": 558
},
{
"epoch": 0.8944,
"grad_norm": 1.431149959564209,
"learning_rate": 1.656412604263824e-05,
"loss": 5.338906764984131,
"step": 559
},
{
"epoch": 0.896,
"grad_norm": 1.1228066682815552,
"learning_rate": 1.607869499048839e-05,
"loss": 4.9782185554504395,
"step": 560
},
{
"epoch": 0.8976,
"grad_norm": 1.3961535692214966,
"learning_rate": 1.5600246788994937e-05,
"loss": 4.974421501159668,
"step": 561
},
{
"epoch": 0.8992,
"grad_norm": 1.281671166419983,
"learning_rate": 1.5128795720449617e-05,
"loss": 4.919782638549805,
"step": 562
},
{
"epoch": 0.9008,
"grad_norm": 1.220367670059204,
"learning_rate": 1.4664355858270862e-05,
"loss": 4.936645030975342,
"step": 563
},
{
"epoch": 0.9024,
"grad_norm": 1.0977709293365479,
"learning_rate": 1.4206941066583629e-05,
"loss": 4.759374618530273,
"step": 564
},
{
"epoch": 0.904,
"grad_norm": 1.2086211442947388,
"learning_rate": 1.3756564999805515e-05,
"loss": 5.17381477355957,
"step": 565
},
{
"epoch": 0.9056,
"grad_norm": 1.172023892402649,
"learning_rate": 1.3313241102239054e-05,
"loss": 4.950685977935791,
"step": 566
},
{
"epoch": 0.9072,
"grad_norm": 1.210207462310791,
"learning_rate": 1.2876982607670674e-05,
"loss": 5.04666805267334,
"step": 567
},
{
"epoch": 0.9088,
"grad_norm": 1.0206573009490967,
"learning_rate": 1.2447802538975345e-05,
"loss": 5.030869483947754,
"step": 568
},
{
"epoch": 0.9104,
"grad_norm": 1.2772059440612793,
"learning_rate": 1.2025713707727953e-05,
"loss": 5.230049133300781,
"step": 569
},
{
"epoch": 0.912,
"grad_norm": 1.1435636281967163,
"learning_rate": 1.1610728713820906e-05,
"loss": 5.214902400970459,
"step": 570
},
{
"epoch": 0.9136,
"grad_norm": 1.430433988571167,
"learning_rate": 1.120285994508799e-05,
"loss": 4.8903584480285645,
"step": 571
},
{
"epoch": 0.9152,
"grad_norm": 1.2580111026763916,
"learning_rate": 1.08021195769345e-05,
"loss": 5.1730055809021,
"step": 572
},
{
"epoch": 0.9168,
"grad_norm": 1.3038173913955688,
"learning_rate": 1.0408519571973806e-05,
"loss": 5.069331169128418,
"step": 573
},
{
"epoch": 0.9184,
"grad_norm": 1.4082874059677124,
"learning_rate": 1.0022071679670425e-05,
"loss": 5.165510177612305,
"step": 574
},
{
"epoch": 0.92,
"grad_norm": 1.3335379362106323,
"learning_rate": 9.642787435989008e-06,
"loss": 4.859002113342285,
"step": 575
},
{
"epoch": 0.9216,
"grad_norm": 1.1995774507522583,
"learning_rate": 9.270678163050216e-06,
"loss": 5.164345741271973,
"step": 576
},
{
"epoch": 0.9232,
"grad_norm": 1.0635286569595337,
"learning_rate": 8.90575496879248e-06,
"loss": 4.728398323059082,
"step": 577
},
{
"epoch": 0.9248,
"grad_norm": 1.1882269382476807,
"learning_rate": 8.548028746640846e-06,
"loss": 4.7602972984313965,
"step": 578
},
{
"epoch": 0.9264,
"grad_norm": 1.389762282371521,
"learning_rate": 8.197510175181277e-06,
"loss": 5.069275856018066,
"step": 579
},
{
"epoch": 0.928,
"grad_norm": 1.1334697008132935,
"learning_rate": 7.854209717842232e-06,
"loss": 5.110383033752441,
"step": 580
},
{
"epoch": 0.9296,
"grad_norm": 1.244832992553711,
"learning_rate": 7.518137622582188e-06,
"loss": 5.184660911560059,
"step": 581
},
{
"epoch": 0.9312,
"grad_norm": 1.1092815399169922,
"learning_rate": 7.1893039215838175e-06,
"loss": 4.963058948516846,
"step": 582
},
{
"epoch": 0.9328,
"grad_norm": 1.6420494318008423,
"learning_rate": 6.867718430954351e-06,
"loss": 4.9267964363098145,
"step": 583
},
{
"epoch": 0.9344,
"grad_norm": 1.49501371383667,
"learning_rate": 6.553390750432708e-06,
"loss": 4.730033874511719,
"step": 584
},
{
"epoch": 0.936,
"grad_norm": 1.2878178358078003,
"learning_rate": 6.246330263102895e-06,
"loss": 5.060173034667969,
"step": 585
},
{
"epoch": 0.9376,
"grad_norm": 1.2040040493011475,
"learning_rate": 5.9465461351138615e-06,
"loss": 5.053962707519531,
"step": 586
},
{
"epoch": 0.9392,
"grad_norm": 1.1503539085388184,
"learning_rate": 5.654047315405892e-06,
"loss": 4.980835437774658,
"step": 587
},
{
"epoch": 0.9408,
"grad_norm": 0.9667116403579712,
"learning_rate": 5.368842535443508e-06,
"loss": 5.023655414581299,
"step": 588
},
{
"epoch": 0.9424,
"grad_norm": 1.2056710720062256,
"learning_rate": 5.09094030895485e-06,
"loss": 4.959043979644775,
"step": 589
},
{
"epoch": 0.944,
"grad_norm": 1.0608792304992676,
"learning_rate": 4.8203489316773485e-06,
"loss": 5.312167644500732,
"step": 590
},
{
"epoch": 0.9456,
"grad_norm": 1.4500396251678467,
"learning_rate": 4.557076481110367e-06,
"loss": 4.965682029724121,
"step": 591
},
{
"epoch": 0.9472,
"grad_norm": 1.118233561515808,
"learning_rate": 4.301130816273813e-06,
"loss": 4.988546848297119,
"step": 592
},
{
"epoch": 0.9488,
"grad_norm": 1.2060961723327637,
"learning_rate": 4.05251957747374e-06,
"loss": 5.0205888748168945,
"step": 593
},
{
"epoch": 0.9504,
"grad_norm": 1.270868182182312,
"learning_rate": 3.811250186074089e-06,
"loss": 5.278676509857178,
"step": 594
},
{
"epoch": 0.952,
"grad_norm": 1.4645127058029175,
"learning_rate": 3.5773298442753898e-06,
"loss": 4.93894100189209,
"step": 595
},
{
"epoch": 0.9536,
"grad_norm": 1.21164870262146,
"learning_rate": 3.3507655348995192e-06,
"loss": 5.321264266967773,
"step": 596
},
{
"epoch": 0.9552,
"grad_norm": 1.2144756317138672,
"learning_rate": 3.131564021181338e-06,
"loss": 4.879669666290283,
"step": 597
},
{
"epoch": 0.9568,
"grad_norm": 1.7862255573272705,
"learning_rate": 2.9197318465669364e-06,
"loss": 5.113965034484863,
"step": 598
},
{
"epoch": 0.9584,
"grad_norm": 1.427722692489624,
"learning_rate": 2.7152753345181247e-06,
"loss": 4.928999423980713,
"step": 599
},
{
"epoch": 0.96,
"grad_norm": 1.3544409275054932,
"learning_rate": 2.518200588323666e-06,
"loss": 5.407461166381836,
"step": 600
},
{
"epoch": 0.9616,
"grad_norm": 1.8953897953033447,
"learning_rate": 2.328513490917311e-06,
"loss": 4.892749309539795,
"step": 601
},
{
"epoch": 0.9632,
"grad_norm": 1.3621735572814941,
"learning_rate": 2.1462197047019127e-06,
"loss": 5.107844352722168,
"step": 602
},
{
"epoch": 0.9648,
"grad_norm": 1.19562566280365,
"learning_rate": 1.9713246713805587e-06,
"loss": 5.338631629943848,
"step": 603
},
{
"epoch": 0.9664,
"grad_norm": 1.0211833715438843,
"learning_rate": 1.803833611794037e-06,
"loss": 4.848773002624512,
"step": 604
},
{
"epoch": 0.968,
"grad_norm": 1.4424593448638916,
"learning_rate": 1.643751525765097e-06,
"loss": 5.272921562194824,
"step": 605
},
{
"epoch": 0.9696,
"grad_norm": 1.2189918756484985,
"learning_rate": 1.4910831919490997e-06,
"loss": 4.7630157470703125,
"step": 606
},
{
"epoch": 0.9712,
"grad_norm": 1.1489924192428589,
"learning_rate": 1.345833167691407e-06,
"loss": 5.053176403045654,
"step": 607
},
{
"epoch": 0.9728,
"grad_norm": 1.102137804031372,
"learning_rate": 1.2080057888913253e-06,
"loss": 5.1648359298706055,
"step": 608
},
{
"epoch": 0.9744,
"grad_norm": 1.193506121635437,
"learning_rate": 1.0776051698727362e-06,
"loss": 4.978764533996582,
"step": 609
},
{
"epoch": 0.976,
"grad_norm": 1.3150538206100464,
"learning_rate": 9.546352032611395e-07,
"loss": 5.2356038093566895,
"step": 610
},
{
"epoch": 0.9776,
"grad_norm": 1.2881925106048584,
"learning_rate": 8.390995598676066e-07,
"loss": 5.024952411651611,
"step": 611
},
{
"epoch": 0.9792,
"grad_norm": 1.2736302614212036,
"learning_rate": 7.310016885791471e-07,
"loss": 5.065498352050781,
"step": 612
},
{
"epoch": 0.9808,
"grad_norm": 1.3327683210372925,
"learning_rate": 6.303448162556791e-07,
"loss": 5.073752403259277,
"step": 613
},
{
"epoch": 0.9824,
"grad_norm": 1.3384580612182617,
"learning_rate": 5.371319476338288e-07,
"loss": 5.055788993835449,
"step": 614
},
{
"epoch": 0.984,
"grad_norm": 1.3576717376708984,
"learning_rate": 4.513658652371133e-07,
"loss": 5.128819465637207,
"step": 615
},
{
"epoch": 0.9856,
"grad_norm": 1.2477798461914062,
"learning_rate": 3.7304912929300716e-07,
"loss": 4.873608112335205,
"step": 616
},
{
"epoch": 0.9872,
"grad_norm": 1.2135578393936157,
"learning_rate": 3.0218407765642e-07,
"loss": 5.116058349609375,
"step": 617
},
{
"epoch": 0.9888,
"grad_norm": 1.3859200477600098,
"learning_rate": 2.387728257399191e-07,
"loss": 4.957461357116699,
"step": 618
},
{
"epoch": 0.9904,
"grad_norm": 1.2815113067626953,
"learning_rate": 1.8281726645061335e-07,
"loss": 4.715893745422363,
"step": 619
},
{
"epoch": 0.992,
"grad_norm": 1.275434136390686,
"learning_rate": 1.343190701336705e-07,
"loss": 4.888550281524658,
"step": 620
},
{
"epoch": 0.9936,
"grad_norm": 1.0546596050262451,
"learning_rate": 9.327968452232938e-08,
"loss": 4.995277404785156,
"step": 621
},
{
"epoch": 0.9952,
"grad_norm": 1.248382568359375,
"learning_rate": 5.970033469490655e-08,
"loss": 5.184177398681641,
"step": 622
},
{
"epoch": 0.9968,
"grad_norm": 1.3588132858276367,
"learning_rate": 3.3582023037964645e-08,
"loss": 4.7490763664245605,
"step": 623
},
{
"epoch": 0.9984,
"grad_norm": 1.121005654335022,
"learning_rate": 1.492552921655843e-08,
"loss": 4.857783317565918,
"step": 624
},
{
"epoch": 1.0,
"grad_norm": 1.4019795656204224,
"learning_rate": 3.731410150975556e-09,
"loss": 5.065018177032471,
"step": 625
}
],
"logging_steps": 1,
"max_steps": 625,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.933484093429535e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}