WeatherSynRFT / trainer_state.json
compasszzn's picture
Upload 14 files
4e97e9e verified
Raw
History Blame Contribute Delete
56 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 319,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003134796238244514,
"grad_norm": 43.949100494384766,
"learning_rate": 5.000000000000001e-07,
"loss": 1.8127,
"step": 1
},
{
"epoch": 0.006269592476489028,
"grad_norm": 46.380184173583984,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.782,
"step": 2
},
{
"epoch": 0.009404388714733543,
"grad_norm": 42.30755615234375,
"learning_rate": 1.5e-06,
"loss": 1.7996,
"step": 3
},
{
"epoch": 0.012539184952978056,
"grad_norm": 42.93296813964844,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.789,
"step": 4
},
{
"epoch": 0.01567398119122257,
"grad_norm": 38.31450653076172,
"learning_rate": 2.5e-06,
"loss": 1.6763,
"step": 5
},
{
"epoch": 0.018808777429467086,
"grad_norm": 30.79650115966797,
"learning_rate": 3e-06,
"loss": 1.4802,
"step": 6
},
{
"epoch": 0.0219435736677116,
"grad_norm": 37.57395935058594,
"learning_rate": 3.5e-06,
"loss": 1.4183,
"step": 7
},
{
"epoch": 0.025078369905956112,
"grad_norm": 19.238168716430664,
"learning_rate": 4.000000000000001e-06,
"loss": 1.1746,
"step": 8
},
{
"epoch": 0.02821316614420063,
"grad_norm": 20.116256713867188,
"learning_rate": 4.5e-06,
"loss": 1.0266,
"step": 9
},
{
"epoch": 0.03134796238244514,
"grad_norm": 8.39165210723877,
"learning_rate": 5e-06,
"loss": 0.9115,
"step": 10
},
{
"epoch": 0.034482758620689655,
"grad_norm": 9.131606101989746,
"learning_rate": 4.9998707921028104e-06,
"loss": 0.8921,
"step": 11
},
{
"epoch": 0.03761755485893417,
"grad_norm": 6.591880798339844,
"learning_rate": 4.999483181766986e-06,
"loss": 0.8571,
"step": 12
},
{
"epoch": 0.04075235109717868,
"grad_norm": 6.148767948150635,
"learning_rate": 4.998837209058379e-06,
"loss": 0.8377,
"step": 13
},
{
"epoch": 0.0438871473354232,
"grad_norm": 5.670365333557129,
"learning_rate": 4.997932940748811e-06,
"loss": 0.8791,
"step": 14
},
{
"epoch": 0.047021943573667714,
"grad_norm": 4.403618335723877,
"learning_rate": 4.996770470309167e-06,
"loss": 0.892,
"step": 15
},
{
"epoch": 0.050156739811912224,
"grad_norm": 3.835578441619873,
"learning_rate": 4.995349917899735e-06,
"loss": 0.7415,
"step": 16
},
{
"epoch": 0.05329153605015674,
"grad_norm": 4.489407062530518,
"learning_rate": 4.993671430357788e-06,
"loss": 0.7822,
"step": 17
},
{
"epoch": 0.05642633228840126,
"grad_norm": 5.612992763519287,
"learning_rate": 4.991735181182401e-06,
"loss": 0.7184,
"step": 18
},
{
"epoch": 0.05956112852664577,
"grad_norm": 13.66352367401123,
"learning_rate": 4.989541370516523e-06,
"loss": 0.7222,
"step": 19
},
{
"epoch": 0.06269592476489028,
"grad_norm": 3.666353940963745,
"learning_rate": 4.987090225126285e-06,
"loss": 0.7231,
"step": 20
},
{
"epoch": 0.06583072100313479,
"grad_norm": 4.0020294189453125,
"learning_rate": 4.9843819983775575e-06,
"loss": 0.7674,
"step": 21
},
{
"epoch": 0.06896551724137931,
"grad_norm": 3.6200690269470215,
"learning_rate": 4.98141697020977e-06,
"loss": 0.7478,
"step": 22
},
{
"epoch": 0.07210031347962383,
"grad_norm": 3.7657089233398438,
"learning_rate": 4.978195447106965e-06,
"loss": 0.7388,
"step": 23
},
{
"epoch": 0.07523510971786834,
"grad_norm": 3.262274742126465,
"learning_rate": 4.974717762066123e-06,
"loss": 0.7176,
"step": 24
},
{
"epoch": 0.07836990595611286,
"grad_norm": 3.8943138122558594,
"learning_rate": 4.970984274562741e-06,
"loss": 0.6745,
"step": 25
},
{
"epoch": 0.08150470219435736,
"grad_norm": 3.2295124530792236,
"learning_rate": 4.966995370513675e-06,
"loss": 0.7079,
"step": 26
},
{
"epoch": 0.08463949843260188,
"grad_norm": 4.9130730628967285,
"learning_rate": 4.962751462237248e-06,
"loss": 0.7824,
"step": 27
},
{
"epoch": 0.0877742946708464,
"grad_norm": 3.4682295322418213,
"learning_rate": 4.958252988410631e-06,
"loss": 0.7122,
"step": 28
},
{
"epoch": 0.09090909090909091,
"grad_norm": 3.1807892322540283,
"learning_rate": 4.9535004140245005e-06,
"loss": 0.752,
"step": 29
},
{
"epoch": 0.09404388714733543,
"grad_norm": 6.412112712860107,
"learning_rate": 4.94849423033497e-06,
"loss": 0.7503,
"step": 30
},
{
"epoch": 0.09717868338557993,
"grad_norm": 3.3517048358917236,
"learning_rate": 4.943234954812812e-06,
"loss": 0.6783,
"step": 31
},
{
"epoch": 0.10031347962382445,
"grad_norm": 3.2728397846221924,
"learning_rate": 4.937723131089974e-06,
"loss": 0.713,
"step": 32
},
{
"epoch": 0.10344827586206896,
"grad_norm": 3.6628053188323975,
"learning_rate": 4.931959328903376e-06,
"loss": 0.6842,
"step": 33
},
{
"epoch": 0.10658307210031348,
"grad_norm": 4.155019283294678,
"learning_rate": 4.925944144036027e-06,
"loss": 0.6627,
"step": 34
},
{
"epoch": 0.109717868338558,
"grad_norm": 2.7333738803863525,
"learning_rate": 4.919678198255438e-06,
"loss": 0.6807,
"step": 35
},
{
"epoch": 0.11285266457680251,
"grad_norm": 3.560575246810913,
"learning_rate": 4.91316213924935e-06,
"loss": 0.6802,
"step": 36
},
{
"epoch": 0.11598746081504702,
"grad_norm": 2.562624454498291,
"learning_rate": 4.90639664055879e-06,
"loss": 0.6264,
"step": 37
},
{
"epoch": 0.11912225705329153,
"grad_norm": 5.172059059143066,
"learning_rate": 4.899382401508446e-06,
"loss": 0.6783,
"step": 38
},
{
"epoch": 0.12225705329153605,
"grad_norm": 3.2025134563446045,
"learning_rate": 4.892120147134378e-06,
"loss": 0.7013,
"step": 39
},
{
"epoch": 0.12539184952978055,
"grad_norm": 2.37896466255188,
"learning_rate": 4.884610628109082e-06,
"loss": 0.6241,
"step": 40
},
{
"epoch": 0.12852664576802508,
"grad_norm": 2.381072759628296,
"learning_rate": 4.876854620663887e-06,
"loss": 0.6328,
"step": 41
},
{
"epoch": 0.13166144200626959,
"grad_norm": 2.695284605026245,
"learning_rate": 4.868852926508721e-06,
"loss": 0.6626,
"step": 42
},
{
"epoch": 0.13479623824451412,
"grad_norm": 6.515556335449219,
"learning_rate": 4.860606372749247e-06,
"loss": 0.7056,
"step": 43
},
{
"epoch": 0.13793103448275862,
"grad_norm": 3.1236801147460938,
"learning_rate": 4.8521158118013605e-06,
"loss": 0.678,
"step": 44
},
{
"epoch": 0.14106583072100312,
"grad_norm": 3.5594000816345215,
"learning_rate": 4.843382121303082e-06,
"loss": 0.6649,
"step": 45
},
{
"epoch": 0.14420062695924765,
"grad_norm": 2.737841844558716,
"learning_rate": 4.83440620402384e-06,
"loss": 0.5953,
"step": 46
},
{
"epoch": 0.14733542319749215,
"grad_norm": 2.5807104110717773,
"learning_rate": 4.825188987771149e-06,
"loss": 0.6678,
"step": 47
},
{
"epoch": 0.15047021943573669,
"grad_norm": 2.5918169021606445,
"learning_rate": 4.815731425294716e-06,
"loss": 0.7223,
"step": 48
},
{
"epoch": 0.1536050156739812,
"grad_norm": 2.677417278289795,
"learning_rate": 4.806034494187949e-06,
"loss": 0.6688,
"step": 49
},
{
"epoch": 0.15673981191222572,
"grad_norm": 2.6603641510009766,
"learning_rate": 4.796099196786908e-06,
"loss": 0.6652,
"step": 50
},
{
"epoch": 0.15987460815047022,
"grad_norm": 2.502426862716675,
"learning_rate": 4.785926560066703e-06,
"loss": 0.6092,
"step": 51
},
{
"epoch": 0.16300940438871472,
"grad_norm": 6.598116397857666,
"learning_rate": 4.775517635535332e-06,
"loss": 0.6496,
"step": 52
},
{
"epoch": 0.16614420062695925,
"grad_norm": 3.122972249984741,
"learning_rate": 4.764873499124997e-06,
"loss": 0.6687,
"step": 53
},
{
"epoch": 0.16927899686520376,
"grad_norm": 3.360384225845337,
"learning_rate": 4.753995251080884e-06,
"loss": 0.6911,
"step": 54
},
{
"epoch": 0.1724137931034483,
"grad_norm": 2.609579563140869,
"learning_rate": 4.742884015847436e-06,
"loss": 0.6397,
"step": 55
},
{
"epoch": 0.1755485893416928,
"grad_norm": 2.5354106426239014,
"learning_rate": 4.731540941952126e-06,
"loss": 0.6009,
"step": 56
},
{
"epoch": 0.1786833855799373,
"grad_norm": 2.5063207149505615,
"learning_rate": 4.719967201886734e-06,
"loss": 0.5991,
"step": 57
},
{
"epoch": 0.18181818181818182,
"grad_norm": 2.7579185962677,
"learning_rate": 4.708163991986152e-06,
"loss": 0.6064,
"step": 58
},
{
"epoch": 0.18495297805642633,
"grad_norm": 2.891486883163452,
"learning_rate": 4.696132532304727e-06,
"loss": 0.6391,
"step": 59
},
{
"epoch": 0.18808777429467086,
"grad_norm": 2.916727304458618,
"learning_rate": 4.683874066490143e-06,
"loss": 0.6727,
"step": 60
},
{
"epoch": 0.19122257053291536,
"grad_norm": 2.708406686782837,
"learning_rate": 4.671389861654873e-06,
"loss": 0.6499,
"step": 61
},
{
"epoch": 0.19435736677115986,
"grad_norm": 4.942819118499756,
"learning_rate": 4.658681208245198e-06,
"loss": 0.6356,
"step": 62
},
{
"epoch": 0.1974921630094044,
"grad_norm": 3.262690544128418,
"learning_rate": 4.645749419907829e-06,
"loss": 0.6732,
"step": 63
},
{
"epoch": 0.2006269592476489,
"grad_norm": 2.6621508598327637,
"learning_rate": 4.632595833354105e-06,
"loss": 0.6615,
"step": 64
},
{
"epoch": 0.20376175548589343,
"grad_norm": 2.9821105003356934,
"learning_rate": 4.619221808221833e-06,
"loss": 0.6667,
"step": 65
},
{
"epoch": 0.20689655172413793,
"grad_norm": 5.076481342315674,
"learning_rate": 4.605628726934747e-06,
"loss": 0.6505,
"step": 66
},
{
"epoch": 0.21003134796238246,
"grad_norm": 4.245087623596191,
"learning_rate": 4.5918179945596055e-06,
"loss": 0.6129,
"step": 67
},
{
"epoch": 0.21316614420062696,
"grad_norm": 3.060851812362671,
"learning_rate": 4.577791038660959e-06,
"loss": 0.6436,
"step": 68
},
{
"epoch": 0.21630094043887146,
"grad_norm": 3.0752642154693604,
"learning_rate": 4.563549309153589e-06,
"loss": 0.6957,
"step": 69
},
{
"epoch": 0.219435736677116,
"grad_norm": 2.996229887008667,
"learning_rate": 4.549094278152631e-06,
"loss": 0.6877,
"step": 70
},
{
"epoch": 0.2225705329153605,
"grad_norm": 4.217442035675049,
"learning_rate": 4.534427439821416e-06,
"loss": 0.6231,
"step": 71
},
{
"epoch": 0.22570532915360503,
"grad_norm": 3.009230136871338,
"learning_rate": 4.519550310217013e-06,
"loss": 0.6574,
"step": 72
},
{
"epoch": 0.22884012539184953,
"grad_norm": 3.037616491317749,
"learning_rate": 4.504464427133527e-06,
"loss": 0.6238,
"step": 73
},
{
"epoch": 0.23197492163009403,
"grad_norm": 3.02176833152771,
"learning_rate": 4.489171349943144e-06,
"loss": 0.6985,
"step": 74
},
{
"epoch": 0.23510971786833856,
"grad_norm": 3.0625197887420654,
"learning_rate": 4.473672659434941e-06,
"loss": 0.7531,
"step": 75
},
{
"epoch": 0.23824451410658307,
"grad_norm": 3.099853277206421,
"learning_rate": 4.457969957651485e-06,
"loss": 0.6188,
"step": 76
},
{
"epoch": 0.2413793103448276,
"grad_norm": 3.041682004928589,
"learning_rate": 4.442064867723236e-06,
"loss": 0.6754,
"step": 77
},
{
"epoch": 0.2445141065830721,
"grad_norm": 2.7343809604644775,
"learning_rate": 4.425959033700776e-06,
"loss": 0.615,
"step": 78
},
{
"epoch": 0.2476489028213166,
"grad_norm": 2.6281471252441406,
"learning_rate": 4.409654120384863e-06,
"loss": 0.6247,
"step": 79
},
{
"epoch": 0.2507836990595611,
"grad_norm": 2.4146194458007812,
"learning_rate": 4.393151813154345e-06,
"loss": 0.6237,
"step": 80
},
{
"epoch": 0.25391849529780564,
"grad_norm": 2.641150712966919,
"learning_rate": 4.3764538177919555e-06,
"loss": 0.5891,
"step": 81
},
{
"epoch": 0.25705329153605017,
"grad_norm": 2.4911434650421143,
"learning_rate": 4.35956186030799e-06,
"loss": 0.6456,
"step": 82
},
{
"epoch": 0.2601880877742947,
"grad_norm": 3.6674325466156006,
"learning_rate": 4.3424776867618935e-06,
"loss": 0.5946,
"step": 83
},
{
"epoch": 0.26332288401253917,
"grad_norm": 8.596332550048828,
"learning_rate": 4.325203063081776e-06,
"loss": 0.6133,
"step": 84
},
{
"epoch": 0.2664576802507837,
"grad_norm": 2.6561825275421143,
"learning_rate": 4.307739774881878e-06,
"loss": 0.6654,
"step": 85
},
{
"epoch": 0.26959247648902823,
"grad_norm": 3.8426294326782227,
"learning_rate": 4.290089627277998e-06,
"loss": 0.6124,
"step": 86
},
{
"epoch": 0.2727272727272727,
"grad_norm": 3.1375346183776855,
"learning_rate": 4.2722544447008995e-06,
"loss": 0.6183,
"step": 87
},
{
"epoch": 0.27586206896551724,
"grad_norm": 2.7622690200805664,
"learning_rate": 4.254236070707734e-06,
"loss": 0.6555,
"step": 88
},
{
"epoch": 0.27899686520376177,
"grad_norm": 2.8743462562561035,
"learning_rate": 4.236036367791471e-06,
"loss": 0.5867,
"step": 89
},
{
"epoch": 0.28213166144200624,
"grad_norm": 2.7058961391448975,
"learning_rate": 4.2176572171883865e-06,
"loss": 0.6346,
"step": 90
},
{
"epoch": 0.2852664576802508,
"grad_norm": 2.520024538040161,
"learning_rate": 4.199100518683601e-06,
"loss": 0.646,
"step": 91
},
{
"epoch": 0.2884012539184953,
"grad_norm": 2.6446547508239746,
"learning_rate": 4.18036819041471e-06,
"loss": 0.6502,
"step": 92
},
{
"epoch": 0.29153605015673983,
"grad_norm": 2.559896945953369,
"learning_rate": 4.161462168673508e-06,
"loss": 0.6166,
"step": 93
},
{
"epoch": 0.2946708463949843,
"grad_norm": 2.6093804836273193,
"learning_rate": 4.142384407705846e-06,
"loss": 0.607,
"step": 94
},
{
"epoch": 0.29780564263322884,
"grad_norm": 2.8598270416259766,
"learning_rate": 4.123136879509626e-06,
"loss": 0.6094,
"step": 95
},
{
"epoch": 0.30094043887147337,
"grad_norm": 2.6045126914978027,
"learning_rate": 4.103721573630965e-06,
"loss": 0.6639,
"step": 96
},
{
"epoch": 0.30407523510971785,
"grad_norm": 2.7823448181152344,
"learning_rate": 4.084140496958539e-06,
"loss": 0.6341,
"step": 97
},
{
"epoch": 0.3072100313479624,
"grad_norm": 2.320493459701538,
"learning_rate": 4.06439567351614e-06,
"loss": 0.5939,
"step": 98
},
{
"epoch": 0.3103448275862069,
"grad_norm": 2.2848961353302,
"learning_rate": 4.0444891442534615e-06,
"loss": 0.5916,
"step": 99
},
{
"epoch": 0.31347962382445144,
"grad_norm": 2.4216668605804443,
"learning_rate": 4.024422966835137e-06,
"loss": 0.6116,
"step": 100
},
{
"epoch": 0.3166144200626959,
"grad_norm": 2.654069185256958,
"learning_rate": 4.004199215428032e-06,
"loss": 0.654,
"step": 101
},
{
"epoch": 0.31974921630094044,
"grad_norm": 2.4687626361846924,
"learning_rate": 3.9838199804868635e-06,
"loss": 0.6136,
"step": 102
},
{
"epoch": 0.322884012539185,
"grad_norm": 3.22712779045105,
"learning_rate": 3.963287368538105e-06,
"loss": 0.6208,
"step": 103
},
{
"epoch": 0.32601880877742945,
"grad_norm": 2.4066531658172607,
"learning_rate": 3.942603501962249e-06,
"loss": 0.6191,
"step": 104
},
{
"epoch": 0.329153605015674,
"grad_norm": 2.9688427448272705,
"learning_rate": 3.92177051877442e-06,
"loss": 0.6132,
"step": 105
},
{
"epoch": 0.3322884012539185,
"grad_norm": 2.6026861667633057,
"learning_rate": 3.900790572403376e-06,
"loss": 0.5957,
"step": 106
},
{
"epoch": 0.335423197492163,
"grad_norm": 3.1237998008728027,
"learning_rate": 3.8796658314689205e-06,
"loss": 0.6267,
"step": 107
},
{
"epoch": 0.3385579937304075,
"grad_norm": 2.7337708473205566,
"learning_rate": 3.858398479557739e-06,
"loss": 0.6635,
"step": 108
},
{
"epoch": 0.34169278996865204,
"grad_norm": 2.707108497619629,
"learning_rate": 3.836990714997686e-06,
"loss": 0.6217,
"step": 109
},
{
"epoch": 0.3448275862068966,
"grad_norm": 2.4689037799835205,
"learning_rate": 3.815444750630555e-06,
"loss": 0.6364,
"step": 110
},
{
"epoch": 0.34796238244514105,
"grad_norm": 2.9303574562072754,
"learning_rate": 3.7937628135833453e-06,
"loss": 0.6117,
"step": 111
},
{
"epoch": 0.3510971786833856,
"grad_norm": 2.5677459239959717,
"learning_rate": 3.7719471450380518e-06,
"loss": 0.6007,
"step": 112
},
{
"epoch": 0.3542319749216301,
"grad_norm": 2.688203811645508,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.5778,
"step": 113
},
{
"epoch": 0.3573667711598746,
"grad_norm": 2.5018489360809326,
"learning_rate": 3.7279236470647593e-06,
"loss": 0.5826,
"step": 114
},
{
"epoch": 0.3605015673981191,
"grad_norm": 2.8333818912506104,
"learning_rate": 3.7057203681836407e-06,
"loss": 0.59,
"step": 115
},
{
"epoch": 0.36363636363636365,
"grad_norm": 2.9619221687316895,
"learning_rate": 3.683392458427825e-06,
"loss": 0.6616,
"step": 116
},
{
"epoch": 0.3667711598746082,
"grad_norm": 2.860719919204712,
"learning_rate": 3.660942225751126e-06,
"loss": 0.5618,
"step": 117
},
{
"epoch": 0.36990595611285265,
"grad_norm": 2.507951021194458,
"learning_rate": 3.638371990751428e-06,
"loss": 0.6305,
"step": 118
},
{
"epoch": 0.3730407523510972,
"grad_norm": 2.5088613033294678,
"learning_rate": 3.615684086430815e-06,
"loss": 0.5831,
"step": 119
},
{
"epoch": 0.3761755485893417,
"grad_norm": 2.534837245941162,
"learning_rate": 3.592880857954413e-06,
"loss": 0.5818,
"step": 120
},
{
"epoch": 0.3793103448275862,
"grad_norm": 2.4578607082366943,
"learning_rate": 3.5699646624079824e-06,
"loss": 0.6267,
"step": 121
},
{
"epoch": 0.3824451410658307,
"grad_norm": 2.6599507331848145,
"learning_rate": 3.5469378685542742e-06,
"loss": 0.5954,
"step": 122
},
{
"epoch": 0.38557993730407525,
"grad_norm": 3.10658597946167,
"learning_rate": 3.52380285658818e-06,
"loss": 0.5963,
"step": 123
},
{
"epoch": 0.3887147335423197,
"grad_norm": 2.4190332889556885,
"learning_rate": 3.500562017890695e-06,
"loss": 0.6224,
"step": 124
},
{
"epoch": 0.39184952978056425,
"grad_norm": 3.2278661727905273,
"learning_rate": 3.4772177547817387e-06,
"loss": 0.6182,
"step": 125
},
{
"epoch": 0.3949843260188088,
"grad_norm": 2.7765655517578125,
"learning_rate": 3.4537724802718294e-06,
"loss": 0.6399,
"step": 126
},
{
"epoch": 0.3981191222570533,
"grad_norm": 2.6910951137542725,
"learning_rate": 3.430228617812661e-06,
"loss": 0.5898,
"step": 127
},
{
"epoch": 0.4012539184952978,
"grad_norm": 4.030726909637451,
"learning_rate": 3.4065886010466014e-06,
"loss": 0.6093,
"step": 128
},
{
"epoch": 0.4043887147335423,
"grad_norm": 2.7327334880828857,
"learning_rate": 3.382854873555137e-06,
"loss": 0.5574,
"step": 129
},
{
"epoch": 0.40752351097178685,
"grad_norm": 3.4407501220703125,
"learning_rate": 3.3590298886062833e-06,
"loss": 0.6339,
"step": 130
},
{
"epoch": 0.4106583072100313,
"grad_norm": 2.798560380935669,
"learning_rate": 3.3351161089010055e-06,
"loss": 0.6152,
"step": 131
},
{
"epoch": 0.41379310344827586,
"grad_norm": 2.4109294414520264,
"learning_rate": 3.3111160063186553e-06,
"loss": 0.5964,
"step": 132
},
{
"epoch": 0.4169278996865204,
"grad_norm": 6.183897018432617,
"learning_rate": 3.2870320616614626e-06,
"loss": 0.5906,
"step": 133
},
{
"epoch": 0.4200626959247649,
"grad_norm": 2.960341691970825,
"learning_rate": 3.2628667643981036e-06,
"loss": 0.663,
"step": 134
},
{
"epoch": 0.4231974921630094,
"grad_norm": 2.809236764907837,
"learning_rate": 3.238622612406373e-06,
"loss": 0.6198,
"step": 135
},
{
"epoch": 0.4263322884012539,
"grad_norm": 3.4371628761291504,
"learning_rate": 3.21430211171499e-06,
"loss": 0.6275,
"step": 136
},
{
"epoch": 0.42946708463949845,
"grad_norm": 3.0913808345794678,
"learning_rate": 3.189907776244556e-06,
"loss": 0.6232,
"step": 137
},
{
"epoch": 0.43260188087774293,
"grad_norm": 2.930027961730957,
"learning_rate": 3.1654421275477045e-06,
"loss": 0.5638,
"step": 138
},
{
"epoch": 0.43573667711598746,
"grad_norm": 2.9050416946411133,
"learning_rate": 3.1409076945484513e-06,
"loss": 0.621,
"step": 139
},
{
"epoch": 0.438871473354232,
"grad_norm": 2.39300274848938,
"learning_rate": 3.116307013280793e-06,
"loss": 0.5852,
"step": 140
},
{
"epoch": 0.44200626959247646,
"grad_norm": 2.8745641708374023,
"learning_rate": 3.0916426266265676e-06,
"loss": 0.6119,
"step": 141
},
{
"epoch": 0.445141065830721,
"grad_norm": 3.0072708129882812,
"learning_rate": 3.066917084052603e-06,
"loss": 0.5851,
"step": 142
},
{
"epoch": 0.4482758620689655,
"grad_norm": 2.8062074184417725,
"learning_rate": 3.042132941347189e-06,
"loss": 0.586,
"step": 143
},
{
"epoch": 0.45141065830721006,
"grad_norm": 3.202028274536133,
"learning_rate": 3.017292760355896e-06,
"loss": 0.6312,
"step": 144
},
{
"epoch": 0.45454545454545453,
"grad_norm": 2.7520525455474854,
"learning_rate": 2.9923991087167657e-06,
"loss": 0.5769,
"step": 145
},
{
"epoch": 0.45768025078369906,
"grad_norm": 2.7981884479522705,
"learning_rate": 2.967454559594903e-06,
"loss": 0.6349,
"step": 146
},
{
"epoch": 0.4608150470219436,
"grad_norm": 2.9364802837371826,
"learning_rate": 2.9424616914164982e-06,
"loss": 0.5936,
"step": 147
},
{
"epoch": 0.46394984326018807,
"grad_norm": 2.985931873321533,
"learning_rate": 2.917423087602306e-06,
"loss": 0.5731,
"step": 148
},
{
"epoch": 0.4670846394984326,
"grad_norm": 2.4261667728424072,
"learning_rate": 2.8923413363006038e-06,
"loss": 0.602,
"step": 149
},
{
"epoch": 0.4702194357366771,
"grad_norm": 2.6361424922943115,
"learning_rate": 2.8672190301196655e-06,
"loss": 0.5851,
"step": 150
},
{
"epoch": 0.47335423197492166,
"grad_norm": 2.6938276290893555,
"learning_rate": 2.842058765859776e-06,
"loss": 0.6026,
"step": 151
},
{
"epoch": 0.47648902821316613,
"grad_norm": 2.6547839641571045,
"learning_rate": 2.8168631442448046e-06,
"loss": 0.5863,
"step": 152
},
{
"epoch": 0.47962382445141066,
"grad_norm": 2.8255505561828613,
"learning_rate": 2.791634769653381e-06,
"loss": 0.6096,
"step": 153
},
{
"epoch": 0.4827586206896552,
"grad_norm": 2.461580514907837,
"learning_rate": 2.7663762498496905e-06,
"loss": 0.5744,
"step": 154
},
{
"epoch": 0.48589341692789967,
"grad_norm": 2.9616644382476807,
"learning_rate": 2.741090195713917e-06,
"loss": 0.5849,
"step": 155
},
{
"epoch": 0.4890282131661442,
"grad_norm": 2.7509751319885254,
"learning_rate": 2.7157792209723654e-06,
"loss": 0.5711,
"step": 156
},
{
"epoch": 0.49216300940438873,
"grad_norm": 3.163322687149048,
"learning_rate": 2.6904459419272955e-06,
"loss": 0.6491,
"step": 157
},
{
"epoch": 0.4952978056426332,
"grad_norm": 2.5019166469573975,
"learning_rate": 2.6650929771864776e-06,
"loss": 0.5608,
"step": 158
},
{
"epoch": 0.49843260188087773,
"grad_norm": 2.8161232471466064,
"learning_rate": 2.639722947392521e-06,
"loss": 0.6116,
"step": 159
},
{
"epoch": 0.5015673981191222,
"grad_norm": 2.9896793365478516,
"learning_rate": 2.614338474951987e-06,
"loss": 0.5859,
"step": 160
},
{
"epoch": 0.5047021943573667,
"grad_norm": 2.9633073806762695,
"learning_rate": 2.5889421837643186e-06,
"loss": 0.5757,
"step": 161
},
{
"epoch": 0.5078369905956113,
"grad_norm": 7.553648471832275,
"learning_rate": 2.563536698950624e-06,
"loss": 0.5985,
"step": 162
},
{
"epoch": 0.5109717868338558,
"grad_norm": 2.4814646244049072,
"learning_rate": 2.538124646582315e-06,
"loss": 0.5918,
"step": 163
},
{
"epoch": 0.5141065830721003,
"grad_norm": 2.601158380508423,
"learning_rate": 2.512708653409674e-06,
"loss": 0.5768,
"step": 164
},
{
"epoch": 0.5172413793103449,
"grad_norm": 2.3953468799591064,
"learning_rate": 2.487291346590326e-06,
"loss": 0.5801,
"step": 165
},
{
"epoch": 0.5203761755485894,
"grad_norm": 2.782278060913086,
"learning_rate": 2.4618753534176854e-06,
"loss": 0.5953,
"step": 166
},
{
"epoch": 0.5235109717868338,
"grad_norm": 2.6170544624328613,
"learning_rate": 2.436463301049378e-06,
"loss": 0.584,
"step": 167
},
{
"epoch": 0.5266457680250783,
"grad_norm": 2.2067277431488037,
"learning_rate": 2.4110578162356814e-06,
"loss": 0.5709,
"step": 168
},
{
"epoch": 0.5297805642633229,
"grad_norm": 2.650946617126465,
"learning_rate": 2.385661525048014e-06,
"loss": 0.6157,
"step": 169
},
{
"epoch": 0.5329153605015674,
"grad_norm": 2.4111557006835938,
"learning_rate": 2.3602770526074804e-06,
"loss": 0.587,
"step": 170
},
{
"epoch": 0.5360501567398119,
"grad_norm": 2.574047088623047,
"learning_rate": 2.334907022813523e-06,
"loss": 0.5945,
"step": 171
},
{
"epoch": 0.5391849529780565,
"grad_norm": 5.312939167022705,
"learning_rate": 2.3095540580727054e-06,
"loss": 0.5584,
"step": 172
},
{
"epoch": 0.542319749216301,
"grad_norm": 2.999335289001465,
"learning_rate": 2.2842207790276355e-06,
"loss": 0.5588,
"step": 173
},
{
"epoch": 0.5454545454545454,
"grad_norm": 2.8886678218841553,
"learning_rate": 2.2589098042860838e-06,
"loss": 0.5834,
"step": 174
},
{
"epoch": 0.54858934169279,
"grad_norm": 2.7539076805114746,
"learning_rate": 2.2336237501503103e-06,
"loss": 0.529,
"step": 175
},
{
"epoch": 0.5517241379310345,
"grad_norm": 2.600677967071533,
"learning_rate": 2.2083652303466196e-06,
"loss": 0.5694,
"step": 176
},
{
"epoch": 0.554858934169279,
"grad_norm": 2.550015449523926,
"learning_rate": 2.1831368557551962e-06,
"loss": 0.5734,
"step": 177
},
{
"epoch": 0.5579937304075235,
"grad_norm": 2.4202213287353516,
"learning_rate": 2.157941234140225e-06,
"loss": 0.5664,
"step": 178
},
{
"epoch": 0.5611285266457681,
"grad_norm": 2.6022562980651855,
"learning_rate": 2.1327809698803354e-06,
"loss": 0.5516,
"step": 179
},
{
"epoch": 0.5642633228840125,
"grad_norm": 2.3936119079589844,
"learning_rate": 2.1076586636993975e-06,
"loss": 0.5697,
"step": 180
},
{
"epoch": 0.567398119122257,
"grad_norm": 2.6119375228881836,
"learning_rate": 2.0825769123976954e-06,
"loss": 0.5524,
"step": 181
},
{
"epoch": 0.5705329153605015,
"grad_norm": 2.9607300758361816,
"learning_rate": 2.057538308583502e-06,
"loss": 0.5539,
"step": 182
},
{
"epoch": 0.5736677115987461,
"grad_norm": 2.5394551753997803,
"learning_rate": 2.0325454404050983e-06,
"loss": 0.5902,
"step": 183
},
{
"epoch": 0.5768025078369906,
"grad_norm": 2.248412609100342,
"learning_rate": 2.0076008912832355e-06,
"loss": 0.5684,
"step": 184
},
{
"epoch": 0.5799373040752351,
"grad_norm": 2.369102954864502,
"learning_rate": 1.9827072396441044e-06,
"loss": 0.5473,
"step": 185
},
{
"epoch": 0.5830721003134797,
"grad_norm": 2.8046441078186035,
"learning_rate": 1.957867058652812e-06,
"loss": 0.5125,
"step": 186
},
{
"epoch": 0.5862068965517241,
"grad_norm": 2.5482161045074463,
"learning_rate": 1.933082915947398e-06,
"loss": 0.5586,
"step": 187
},
{
"epoch": 0.5893416927899686,
"grad_norm": 2.53220534324646,
"learning_rate": 1.9083573733734328e-06,
"loss": 0.598,
"step": 188
},
{
"epoch": 0.5924764890282131,
"grad_norm": 2.742966413497925,
"learning_rate": 1.8836929867192077e-06,
"loss": 0.5432,
"step": 189
},
{
"epoch": 0.5956112852664577,
"grad_norm": 4.084339141845703,
"learning_rate": 1.8590923054515504e-06,
"loss": 0.5232,
"step": 190
},
{
"epoch": 0.5987460815047022,
"grad_norm": 2.8514344692230225,
"learning_rate": 1.8345578724522957e-06,
"loss": 0.5748,
"step": 191
},
{
"epoch": 0.6018808777429467,
"grad_norm": 2.736140012741089,
"learning_rate": 1.8100922237554442e-06,
"loss": 0.5315,
"step": 192
},
{
"epoch": 0.6050156739811913,
"grad_norm": 2.4470133781433105,
"learning_rate": 1.7856978882850112e-06,
"loss": 0.5722,
"step": 193
},
{
"epoch": 0.6081504702194357,
"grad_norm": 2.587562084197998,
"learning_rate": 1.7613773875936274e-06,
"loss": 0.5697,
"step": 194
},
{
"epoch": 0.6112852664576802,
"grad_norm": 2.340610980987549,
"learning_rate": 1.7371332356018972e-06,
"loss": 0.5292,
"step": 195
},
{
"epoch": 0.6144200626959248,
"grad_norm": 4.569915771484375,
"learning_rate": 1.7129679383385384e-06,
"loss": 0.582,
"step": 196
},
{
"epoch": 0.6175548589341693,
"grad_norm": 2.7119388580322266,
"learning_rate": 1.688883993681345e-06,
"loss": 0.6219,
"step": 197
},
{
"epoch": 0.6206896551724138,
"grad_norm": 2.8180112838745117,
"learning_rate": 1.6648838910989955e-06,
"loss": 0.5649,
"step": 198
},
{
"epoch": 0.6238244514106583,
"grad_norm": 3.1212546825408936,
"learning_rate": 1.6409701113937182e-06,
"loss": 0.5269,
"step": 199
},
{
"epoch": 0.6269592476489029,
"grad_norm": 3.6080574989318848,
"learning_rate": 1.617145126444864e-06,
"loss": 0.5903,
"step": 200
},
{
"epoch": 0.6300940438871473,
"grad_norm": 2.5957369804382324,
"learning_rate": 1.5934113989533992e-06,
"loss": 0.6123,
"step": 201
},
{
"epoch": 0.6332288401253918,
"grad_norm": 2.664415121078491,
"learning_rate": 1.5697713821873401e-06,
"loss": 0.6159,
"step": 202
},
{
"epoch": 0.6363636363636364,
"grad_norm": 2.5966672897338867,
"learning_rate": 1.5462275197281717e-06,
"loss": 0.5255,
"step": 203
},
{
"epoch": 0.6394984326018809,
"grad_norm": 2.552795886993408,
"learning_rate": 1.5227822452182617e-06,
"loss": 0.5485,
"step": 204
},
{
"epoch": 0.6426332288401254,
"grad_norm": 2.581660032272339,
"learning_rate": 1.499437982109305e-06,
"loss": 0.5727,
"step": 205
},
{
"epoch": 0.64576802507837,
"grad_norm": 2.6740126609802246,
"learning_rate": 1.4761971434118207e-06,
"loss": 0.568,
"step": 206
},
{
"epoch": 0.6489028213166145,
"grad_norm": 2.314016342163086,
"learning_rate": 1.4530621314457255e-06,
"loss": 0.5335,
"step": 207
},
{
"epoch": 0.6520376175548589,
"grad_norm": 2.5612449645996094,
"learning_rate": 1.430035337592018e-06,
"loss": 0.5422,
"step": 208
},
{
"epoch": 0.6551724137931034,
"grad_norm": 6.558284759521484,
"learning_rate": 1.4071191420455873e-06,
"loss": 0.5938,
"step": 209
},
{
"epoch": 0.658307210031348,
"grad_norm": 2.5115890502929688,
"learning_rate": 1.3843159135691859e-06,
"loss": 0.5194,
"step": 210
},
{
"epoch": 0.6614420062695925,
"grad_norm": 3.0726919174194336,
"learning_rate": 1.3616280092485719e-06,
"loss": 0.554,
"step": 211
},
{
"epoch": 0.664576802507837,
"grad_norm": 2.4502389430999756,
"learning_rate": 1.3390577742488747e-06,
"loss": 0.6057,
"step": 212
},
{
"epoch": 0.6677115987460815,
"grad_norm": 2.853550434112549,
"learning_rate": 1.3166075415721762e-06,
"loss": 0.5049,
"step": 213
},
{
"epoch": 0.670846394984326,
"grad_norm": 2.496123790740967,
"learning_rate": 1.2942796318163595e-06,
"loss": 0.5625,
"step": 214
},
{
"epoch": 0.6739811912225705,
"grad_norm": 2.3185224533081055,
"learning_rate": 1.2720763529352415e-06,
"loss": 0.5336,
"step": 215
},
{
"epoch": 0.677115987460815,
"grad_norm": 2.621919631958008,
"learning_rate": 1.2500000000000007e-06,
"loss": 0.539,
"step": 216
},
{
"epoch": 0.6802507836990596,
"grad_norm": 2.744100570678711,
"learning_rate": 1.2280528549619487e-06,
"loss": 0.5213,
"step": 217
},
{
"epoch": 0.6833855799373041,
"grad_norm": 2.494028329849243,
"learning_rate": 1.2062371864166553e-06,
"loss": 0.5419,
"step": 218
},
{
"epoch": 0.6865203761755486,
"grad_norm": 2.599900245666504,
"learning_rate": 1.1845552493694462e-06,
"loss": 0.5456,
"step": 219
},
{
"epoch": 0.6896551724137931,
"grad_norm": 2.5224337577819824,
"learning_rate": 1.1630092850023148e-06,
"loss": 0.566,
"step": 220
},
{
"epoch": 0.6927899686520376,
"grad_norm": 2.492403030395508,
"learning_rate": 1.141601520442262e-06,
"loss": 0.5415,
"step": 221
},
{
"epoch": 0.6959247648902821,
"grad_norm": 2.5345394611358643,
"learning_rate": 1.120334168531081e-06,
"loss": 0.5301,
"step": 222
},
{
"epoch": 0.6990595611285266,
"grad_norm": 2.418922185897827,
"learning_rate": 1.0992094275966256e-06,
"loss": 0.5764,
"step": 223
},
{
"epoch": 0.7021943573667712,
"grad_norm": 3.3536760807037354,
"learning_rate": 1.078229481225582e-06,
"loss": 0.5596,
"step": 224
},
{
"epoch": 0.7053291536050157,
"grad_norm": 2.531526803970337,
"learning_rate": 1.0573964980377517e-06,
"loss": 0.549,
"step": 225
},
{
"epoch": 0.7084639498432602,
"grad_norm": 2.7442548274993896,
"learning_rate": 1.0367126314618946e-06,
"loss": 0.5025,
"step": 226
},
{
"epoch": 0.7115987460815048,
"grad_norm": 2.368351459503174,
"learning_rate": 1.0161800195131372e-06,
"loss": 0.5311,
"step": 227
},
{
"epoch": 0.7147335423197492,
"grad_norm": 2.8416459560394287,
"learning_rate": 9.95800784571969e-07,
"loss": 0.5243,
"step": 228
},
{
"epoch": 0.7178683385579937,
"grad_norm": 2.772183656692505,
"learning_rate": 9.755770331648642e-07,
"loss": 0.5677,
"step": 229
},
{
"epoch": 0.7210031347962382,
"grad_norm": 2.4133315086364746,
"learning_rate": 9.555108557465383e-07,
"loss": 0.5507,
"step": 230
},
{
"epoch": 0.7241379310344828,
"grad_norm": 2.677746295928955,
"learning_rate": 9.356043264838607e-07,
"loss": 0.5553,
"step": 231
},
{
"epoch": 0.7272727272727273,
"grad_norm": 2.8451437950134277,
"learning_rate": 9.158595030414621e-07,
"loss": 0.5135,
"step": 232
},
{
"epoch": 0.7304075235109718,
"grad_norm": 2.838019609451294,
"learning_rate": 8.962784263690358e-07,
"loss": 0.59,
"step": 233
},
{
"epoch": 0.7335423197492164,
"grad_norm": 2.865750312805176,
"learning_rate": 8.768631204903738e-07,
"loss": 0.5164,
"step": 234
},
{
"epoch": 0.7366771159874608,
"grad_norm": 2.9796836376190186,
"learning_rate": 8.576155922941548e-07,
"loss": 0.5242,
"step": 235
},
{
"epoch": 0.7398119122257053,
"grad_norm": 3.789559841156006,
"learning_rate": 8.385378313264933e-07,
"loss": 0.5419,
"step": 236
},
{
"epoch": 0.7429467084639498,
"grad_norm": 2.552150249481201,
"learning_rate": 8.196318095852909e-07,
"loss": 0.5426,
"step": 237
},
{
"epoch": 0.7460815047021944,
"grad_norm": 3.243431568145752,
"learning_rate": 8.008994813163995e-07,
"loss": 0.5121,
"step": 238
},
{
"epoch": 0.7492163009404389,
"grad_norm": 3.1874783039093018,
"learning_rate": 7.823427828116148e-07,
"loss": 0.5512,
"step": 239
},
{
"epoch": 0.7523510971786834,
"grad_norm": 2.545905828475952,
"learning_rate": 7.6396363220853e-07,
"loss": 0.5483,
"step": 240
},
{
"epoch": 0.7554858934169278,
"grad_norm": 2.719782829284668,
"learning_rate": 7.457639292922675e-07,
"loss": 0.5683,
"step": 241
},
{
"epoch": 0.7586206896551724,
"grad_norm": 2.9073195457458496,
"learning_rate": 7.277455552991011e-07,
"loss": 0.5711,
"step": 242
},
{
"epoch": 0.7617554858934169,
"grad_norm": 2.301893949508667,
"learning_rate": 7.099103727220024e-07,
"loss": 0.533,
"step": 243
},
{
"epoch": 0.7648902821316614,
"grad_norm": 2.9436652660369873,
"learning_rate": 6.922602251181221e-07,
"loss": 0.5447,
"step": 244
},
{
"epoch": 0.768025078369906,
"grad_norm": 3.2471468448638916,
"learning_rate": 6.747969369182248e-07,
"loss": 0.5551,
"step": 245
},
{
"epoch": 0.7711598746081505,
"grad_norm": 2.480755567550659,
"learning_rate": 6.575223132381067e-07,
"loss": 0.5143,
"step": 246
},
{
"epoch": 0.774294670846395,
"grad_norm": 2.8075945377349854,
"learning_rate": 6.4043813969201e-07,
"loss": 0.5099,
"step": 247
},
{
"epoch": 0.7774294670846394,
"grad_norm": 3.024644136428833,
"learning_rate": 6.235461822080449e-07,
"loss": 0.5393,
"step": 248
},
{
"epoch": 0.780564263322884,
"grad_norm": 2.6839873790740967,
"learning_rate": 6.068481868456558e-07,
"loss": 0.5509,
"step": 249
},
{
"epoch": 0.7836990595611285,
"grad_norm": 3.0200679302215576,
"learning_rate": 5.903458796151382e-07,
"loss": 0.5647,
"step": 250
},
{
"epoch": 0.786833855799373,
"grad_norm": 2.65813946723938,
"learning_rate": 5.740409662992244e-07,
"loss": 0.5202,
"step": 251
},
{
"epoch": 0.7899686520376176,
"grad_norm": 2.6419460773468018,
"learning_rate": 5.579351322767643e-07,
"loss": 0.5412,
"step": 252
},
{
"epoch": 0.7931034482758621,
"grad_norm": 2.5918684005737305,
"learning_rate": 5.420300423485167e-07,
"loss": 0.5671,
"step": 253
},
{
"epoch": 0.7962382445141066,
"grad_norm": 2.6645092964172363,
"learning_rate": 5.263273405650601e-07,
"loss": 0.5971,
"step": 254
},
{
"epoch": 0.799373040752351,
"grad_norm": 2.6975386142730713,
"learning_rate": 5.108286500568562e-07,
"loss": 0.5569,
"step": 255
},
{
"epoch": 0.8025078369905956,
"grad_norm": 2.585435628890991,
"learning_rate": 4.95535572866474e-07,
"loss": 0.5394,
"step": 256
},
{
"epoch": 0.8056426332288401,
"grad_norm": 2.4776594638824463,
"learning_rate": 4.804496897829883e-07,
"loss": 0.5231,
"step": 257
},
{
"epoch": 0.8087774294670846,
"grad_norm": 2.783409833908081,
"learning_rate": 4.6557256017858485e-07,
"loss": 0.5114,
"step": 258
},
{
"epoch": 0.8119122257053292,
"grad_norm": 2.355269193649292,
"learning_rate": 4.5090572184736863e-07,
"loss": 0.5202,
"step": 259
},
{
"epoch": 0.8150470219435737,
"grad_norm": 2.541964292526245,
"learning_rate": 4.3645069084641195e-07,
"loss": 0.5414,
"step": 260
},
{
"epoch": 0.8181818181818182,
"grad_norm": 2.6445441246032715,
"learning_rate": 4.222089613390412e-07,
"loss": 0.5289,
"step": 261
},
{
"epoch": 0.8213166144200627,
"grad_norm": 2.8741798400878906,
"learning_rate": 4.0818200544039484e-07,
"loss": 0.5541,
"step": 262
},
{
"epoch": 0.8244514106583072,
"grad_norm": 3.0294582843780518,
"learning_rate": 3.9437127306525295e-07,
"loss": 0.5234,
"step": 263
},
{
"epoch": 0.8275862068965517,
"grad_norm": 2.6354918479919434,
"learning_rate": 3.8077819177816695e-07,
"loss": 0.5061,
"step": 264
},
{
"epoch": 0.8307210031347962,
"grad_norm": 2.5358927249908447,
"learning_rate": 3.6740416664589634e-07,
"loss": 0.5108,
"step": 265
},
{
"epoch": 0.8338557993730408,
"grad_norm": 3.637833595275879,
"learning_rate": 3.5425058009217193e-07,
"loss": 0.5398,
"step": 266
},
{
"epoch": 0.8369905956112853,
"grad_norm": 2.77534556388855,
"learning_rate": 3.413187917548019e-07,
"loss": 0.5727,
"step": 267
},
{
"epoch": 0.8401253918495298,
"grad_norm": 2.57776141166687,
"learning_rate": 3.2861013834512844e-07,
"loss": 0.5309,
"step": 268
},
{
"epoch": 0.8432601880877743,
"grad_norm": 2.4252471923828125,
"learning_rate": 3.161259335098571e-07,
"loss": 0.4912,
"step": 269
},
{
"epoch": 0.8463949843260188,
"grad_norm": 4.025521278381348,
"learning_rate": 3.0386746769527323e-07,
"loss": 0.5448,
"step": 270
},
{
"epoch": 0.8495297805642633,
"grad_norm": 2.6746034622192383,
"learning_rate": 2.9183600801384853e-07,
"loss": 0.5454,
"step": 271
},
{
"epoch": 0.8526645768025078,
"grad_norm": 2.7830092906951904,
"learning_rate": 2.8003279811326724e-07,
"loss": 0.539,
"step": 272
},
{
"epoch": 0.8557993730407524,
"grad_norm": 2.5394887924194336,
"learning_rate": 2.684590580478749e-07,
"loss": 0.5234,
"step": 273
},
{
"epoch": 0.8589341692789969,
"grad_norm": 2.765644073486328,
"learning_rate": 2.57115984152565e-07,
"loss": 0.5105,
"step": 274
},
{
"epoch": 0.8620689655172413,
"grad_norm": 2.5733540058135986,
"learning_rate": 2.4600474891911696e-07,
"loss": 0.5381,
"step": 275
},
{
"epoch": 0.8652037617554859,
"grad_norm": 2.2739200592041016,
"learning_rate": 2.3512650087500338e-07,
"loss": 0.5344,
"step": 276
},
{
"epoch": 0.8683385579937304,
"grad_norm": 2.446244478225708,
"learning_rate": 2.2448236446466847e-07,
"loss": 0.5271,
"step": 277
},
{
"epoch": 0.8714733542319749,
"grad_norm": 2.833040237426758,
"learning_rate": 2.140734399332975e-07,
"loss": 0.5841,
"step": 278
},
{
"epoch": 0.8746081504702194,
"grad_norm": 2.487649440765381,
"learning_rate": 2.0390080321309236e-07,
"loss": 0.5353,
"step": 279
},
{
"epoch": 0.877742946708464,
"grad_norm": 3.27183198928833,
"learning_rate": 1.9396550581205208e-07,
"loss": 0.5181,
"step": 280
},
{
"epoch": 0.8808777429467085,
"grad_norm": 2.5979673862457275,
"learning_rate": 1.8426857470528414e-07,
"loss": 0.521,
"step": 281
},
{
"epoch": 0.8840125391849529,
"grad_norm": 2.452927589416504,
"learning_rate": 1.7481101222885126e-07,
"loss": 0.5394,
"step": 282
},
{
"epoch": 0.8871473354231975,
"grad_norm": 3.278170585632324,
"learning_rate": 1.6559379597616136e-07,
"loss": 0.5098,
"step": 283
},
{
"epoch": 0.890282131661442,
"grad_norm": 2.9773340225219727,
"learning_rate": 1.5661787869691858e-07,
"loss": 0.5046,
"step": 284
},
{
"epoch": 0.8934169278996865,
"grad_norm": 2.6932992935180664,
"learning_rate": 1.4788418819864037e-07,
"loss": 0.5517,
"step": 285
},
{
"epoch": 0.896551724137931,
"grad_norm": 2.7016708850860596,
"learning_rate": 1.3939362725075344e-07,
"loss": 0.5386,
"step": 286
},
{
"epoch": 0.8996865203761756,
"grad_norm": 3.2767891883850098,
"learning_rate": 1.3114707349127954e-07,
"loss": 0.53,
"step": 287
},
{
"epoch": 0.9028213166144201,
"grad_norm": 2.865053653717041,
"learning_rate": 1.2314537933611425e-07,
"loss": 0.5306,
"step": 288
},
{
"epoch": 0.9059561128526645,
"grad_norm": 2.703111410140991,
"learning_rate": 1.1538937189091825e-07,
"loss": 0.5677,
"step": 289
},
{
"epoch": 0.9090909090909091,
"grad_norm": 2.6944828033447266,
"learning_rate": 1.0787985286562219e-07,
"loss": 0.5488,
"step": 290
},
{
"epoch": 0.9122257053291536,
"grad_norm": 2.602788209915161,
"learning_rate": 1.00617598491555e-07,
"loss": 0.5406,
"step": 291
},
{
"epoch": 0.9153605015673981,
"grad_norm": 2.350580930709839,
"learning_rate": 9.360335944121029e-08,
"loss": 0.5027,
"step": 292
},
{
"epoch": 0.9184952978056427,
"grad_norm": 2.7530674934387207,
"learning_rate": 8.683786075065065e-08,
"loss": 0.5458,
"step": 293
},
{
"epoch": 0.9216300940438872,
"grad_norm": 2.564846992492676,
"learning_rate": 8.032180174456283e-08,
"loss": 0.5267,
"step": 294
},
{
"epoch": 0.9247648902821317,
"grad_norm": 3.4500110149383545,
"learning_rate": 7.405585596397314e-08,
"loss": 0.5129,
"step": 295
},
{
"epoch": 0.9278996865203761,
"grad_norm": 2.9540138244628906,
"learning_rate": 6.804067109662443e-08,
"loss": 0.5221,
"step": 296
},
{
"epoch": 0.9310344827586207,
"grad_norm": 2.5607857704162598,
"learning_rate": 6.227686891002671e-08,
"loss": 0.4751,
"step": 297
},
{
"epoch": 0.9341692789968652,
"grad_norm": 2.6755456924438477,
"learning_rate": 5.6765045187187614e-08,
"loss": 0.5014,
"step": 298
},
{
"epoch": 0.9373040752351097,
"grad_norm": 3.1413800716400146,
"learning_rate": 5.150576966503063e-08,
"loss": 0.5191,
"step": 299
},
{
"epoch": 0.9404388714733543,
"grad_norm": 2.7286934852600098,
"learning_rate": 4.649958597549964e-08,
"loss": 0.5229,
"step": 300
},
{
"epoch": 0.9435736677115988,
"grad_norm": 4.638148784637451,
"learning_rate": 4.174701158936895e-08,
"loss": 0.5181,
"step": 301
},
{
"epoch": 0.9467084639498433,
"grad_norm": 2.8824093341827393,
"learning_rate": 3.7248537762752666e-08,
"loss": 0.5899,
"step": 302
},
{
"epoch": 0.9498432601880877,
"grad_norm": 3.2580602169036865,
"learning_rate": 3.300462948632593e-08,
"loss": 0.5234,
"step": 303
},
{
"epoch": 0.9529780564263323,
"grad_norm": 2.773378610610962,
"learning_rate": 2.9015725437259724e-08,
"loss": 0.5406,
"step": 304
},
{
"epoch": 0.9561128526645768,
"grad_norm": 2.5910532474517822,
"learning_rate": 2.5282237933877962e-08,
"loss": 0.5322,
"step": 305
},
{
"epoch": 0.9592476489028213,
"grad_norm": 4.50349760055542,
"learning_rate": 2.180455289303579e-08,
"loss": 0.6053,
"step": 306
},
{
"epoch": 0.9623824451410659,
"grad_norm": 2.9559998512268066,
"learning_rate": 1.8583029790230356e-08,
"loss": 0.4884,
"step": 307
},
{
"epoch": 0.9655172413793104,
"grad_norm": 3.511509418487549,
"learning_rate": 1.561800162244248e-08,
"loss": 0.5252,
"step": 308
},
{
"epoch": 0.9686520376175548,
"grad_norm": 2.7472832202911377,
"learning_rate": 1.2909774873715585e-08,
"loss": 0.5427,
"step": 309
},
{
"epoch": 0.9717868338557993,
"grad_norm": 3.0372958183288574,
"learning_rate": 1.0458629483476868e-08,
"loss": 0.5447,
"step": 310
},
{
"epoch": 0.9749216300940439,
"grad_norm": 2.645599126815796,
"learning_rate": 8.264818817599052e-09,
"loss": 0.5911,
"step": 311
},
{
"epoch": 0.9780564263322884,
"grad_norm": 3.0988388061523438,
"learning_rate": 6.328569642212734e-09,
"loss": 0.522,
"step": 312
},
{
"epoch": 0.9811912225705329,
"grad_norm": 4.376644134521484,
"learning_rate": 4.6500821002654075e-09,
"loss": 0.551,
"step": 313
},
{
"epoch": 0.9843260188087775,
"grad_norm": 2.846813917160034,
"learning_rate": 3.2295296908338437e-09,
"loss": 0.5204,
"step": 314
},
{
"epoch": 0.987460815047022,
"grad_norm": 4.044336318969727,
"learning_rate": 2.067059251189274e-09,
"loss": 0.5254,
"step": 315
},
{
"epoch": 0.9905956112852664,
"grad_norm": 2.489978790283203,
"learning_rate": 1.1627909416211947e-09,
"loss": 0.5397,
"step": 316
},
{
"epoch": 0.9937304075235109,
"grad_norm": 2.610431671142578,
"learning_rate": 5.168182330145266e-10,
"loss": 0.555,
"step": 317
},
{
"epoch": 0.9968652037617555,
"grad_norm": 3.202317953109741,
"learning_rate": 1.292078971898425e-10,
"loss": 0.5094,
"step": 318
},
{
"epoch": 1.0,
"grad_norm": 2.7502589225769043,
"learning_rate": 0.0,
"loss": 0.5585,
"step": 319
},
{
"epoch": 1.0,
"step": 319,
"total_flos": 834160017014784.0,
"train_loss": 0.6235808798325099,
"train_runtime": 6415.1151,
"train_samples_per_second": 3.182,
"train_steps_per_second": 0.05
}
],
"logging_steps": 1.0,
"max_steps": 319,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 400,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 834160017014784.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}