{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 296,
"global_step": 592,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0016891891891891893,
"grad_norm": 14.480994151598582,
"learning_rate": 0.0,
"loss": 0.8543,
"step": 1
},
{
"epoch": 0.0016891891891891893,
"eval_loss": 0.9275368452072144,
"eval_runtime": 972.3995,
"eval_samples_per_second": 5.605,
"eval_steps_per_second": 0.351,
"step": 1
},
{
"epoch": 0.0033783783783783786,
"grad_norm": 12.018325552585269,
"learning_rate": 1e-07,
"loss": 0.8892,
"step": 2
},
{
"epoch": 0.005067567567567568,
"grad_norm": 12.365747505763508,
"learning_rate": 2e-07,
"loss": 0.8855,
"step": 3
},
{
"epoch": 0.006756756756756757,
"grad_norm": 12.800829498404358,
"learning_rate": 3e-07,
"loss": 0.9267,
"step": 4
},
{
"epoch": 0.008445945945945946,
"grad_norm": 11.737599190531979,
"learning_rate": 4e-07,
"loss": 0.9281,
"step": 5
},
{
"epoch": 0.010135135135135136,
"grad_norm": 8.637373821708078,
"learning_rate": 5e-07,
"loss": 0.8875,
"step": 6
},
{
"epoch": 0.011824324324324325,
"grad_norm": 7.636499275338768,
"learning_rate": 6e-07,
"loss": 0.872,
"step": 7
},
{
"epoch": 0.013513513513513514,
"grad_norm": 8.385216478501201,
"learning_rate": 7e-07,
"loss": 0.9226,
"step": 8
},
{
"epoch": 0.015202702702702704,
"grad_norm": 6.885449636338203,
"learning_rate": 8e-07,
"loss": 0.8754,
"step": 9
},
{
"epoch": 0.016891891891891893,
"grad_norm": 5.298697529194119,
"learning_rate": 9e-07,
"loss": 0.8494,
"step": 10
},
{
"epoch": 0.018581081081081082,
"grad_norm": 3.472286230584162,
"learning_rate": 1e-06,
"loss": 0.9455,
"step": 11
},
{
"epoch": 0.02027027027027027,
"grad_norm": 3.0364570491026885,
"learning_rate": 1.1e-06,
"loss": 0.9047,
"step": 12
},
{
"epoch": 0.02195945945945946,
"grad_norm": 2.6279136763742565,
"learning_rate": 1.2e-06,
"loss": 0.8626,
"step": 13
},
{
"epoch": 0.02364864864864865,
"grad_norm": 3.9893451802262754,
"learning_rate": 1.3e-06,
"loss": 0.8449,
"step": 14
},
{
"epoch": 0.02533783783783784,
"grad_norm": 2.4501659804647864,
"learning_rate": 1.4e-06,
"loss": 0.8304,
"step": 15
},
{
"epoch": 0.02702702702702703,
"grad_norm": 3.1609047731381277,
"learning_rate": 1.5e-06,
"loss": 0.8189,
"step": 16
},
{
"epoch": 0.028716216216216218,
"grad_norm": 2.708859155242058,
"learning_rate": 1.6e-06,
"loss": 0.8888,
"step": 17
},
{
"epoch": 0.030405405405405407,
"grad_norm": 2.5829432245407586,
"learning_rate": 1.6999999999999998e-06,
"loss": 0.8441,
"step": 18
},
{
"epoch": 0.03209459459459459,
"grad_norm": 2.2191942002061102,
"learning_rate": 1.8e-06,
"loss": 0.8493,
"step": 19
},
{
"epoch": 0.033783783783783786,
"grad_norm": 2.1116693433922187,
"learning_rate": 1.8999999999999998e-06,
"loss": 0.8055,
"step": 20
},
{
"epoch": 0.03547297297297297,
"grad_norm": 1.8584288181443849,
"learning_rate": 2e-06,
"loss": 0.7823,
"step": 21
},
{
"epoch": 0.037162162162162164,
"grad_norm": 2.0456344199742884,
"learning_rate": 2.1e-06,
"loss": 0.8027,
"step": 22
},
{
"epoch": 0.03885135135135135,
"grad_norm": 2.9705048998517145,
"learning_rate": 2.2e-06,
"loss": 0.8329,
"step": 23
},
{
"epoch": 0.04054054054054054,
"grad_norm": 1.7403034865771894,
"learning_rate": 2.2999999999999996e-06,
"loss": 0.7923,
"step": 24
},
{
"epoch": 0.04222972972972973,
"grad_norm": 1.694602617591211,
"learning_rate": 2.4e-06,
"loss": 0.7855,
"step": 25
},
{
"epoch": 0.04391891891891892,
"grad_norm": 1.9634724614647339,
"learning_rate": 2.4999999999999998e-06,
"loss": 0.8557,
"step": 26
},
{
"epoch": 0.04560810810810811,
"grad_norm": 1.7393403646085595,
"learning_rate": 2.6e-06,
"loss": 0.7526,
"step": 27
},
{
"epoch": 0.0472972972972973,
"grad_norm": 1.6361547258883284,
"learning_rate": 2.7e-06,
"loss": 0.7496,
"step": 28
},
{
"epoch": 0.048986486486486486,
"grad_norm": 1.6311908238581783,
"learning_rate": 2.8e-06,
"loss": 0.7876,
"step": 29
},
{
"epoch": 0.05067567567567568,
"grad_norm": 1.6978843460505462,
"learning_rate": 2.8999999999999998e-06,
"loss": 0.7926,
"step": 30
},
{
"epoch": 0.052364864864864864,
"grad_norm": 1.700306641243955,
"learning_rate": 3e-06,
"loss": 0.8115,
"step": 31
},
{
"epoch": 0.05405405405405406,
"grad_norm": 1.6289463279253593,
"learning_rate": 3.1e-06,
"loss": 0.7209,
"step": 32
},
{
"epoch": 0.05574324324324324,
"grad_norm": 1.5306464291392823,
"learning_rate": 3.2e-06,
"loss": 0.7284,
"step": 33
},
{
"epoch": 0.057432432432432436,
"grad_norm": 1.6255878142473301,
"learning_rate": 3.2999999999999997e-06,
"loss": 0.8205,
"step": 34
},
{
"epoch": 0.05912162162162162,
"grad_norm": 1.5806183262012883,
"learning_rate": 3.3999999999999996e-06,
"loss": 0.7471,
"step": 35
},
{
"epoch": 0.060810810810810814,
"grad_norm": 1.4925108083317444,
"learning_rate": 3.5e-06,
"loss": 0.7333,
"step": 36
},
{
"epoch": 0.0625,
"grad_norm": 1.47721649504872,
"learning_rate": 3.6e-06,
"loss": 0.785,
"step": 37
},
{
"epoch": 0.06418918918918919,
"grad_norm": 1.44983881509458,
"learning_rate": 3.7e-06,
"loss": 0.7666,
"step": 38
},
{
"epoch": 0.06587837837837837,
"grad_norm": 1.5847671132911032,
"learning_rate": 3.7999999999999996e-06,
"loss": 0.7811,
"step": 39
},
{
"epoch": 0.06756756756756757,
"grad_norm": 1.498427406292611,
"learning_rate": 3.9e-06,
"loss": 0.7249,
"step": 40
},
{
"epoch": 0.06925675675675676,
"grad_norm": 1.5689523363961653,
"learning_rate": 4e-06,
"loss": 0.7381,
"step": 41
},
{
"epoch": 0.07094594594594594,
"grad_norm": 1.7469630894889612,
"learning_rate": 3.999992458679062e-06,
"loss": 0.734,
"step": 42
},
{
"epoch": 0.07263513513513513,
"grad_norm": 1.4229571463310593,
"learning_rate": 3.999969834773121e-06,
"loss": 0.7057,
"step": 43
},
{
"epoch": 0.07432432432432433,
"grad_norm": 1.4908625411485357,
"learning_rate": 3.99993212845279e-06,
"loss": 0.7758,
"step": 44
},
{
"epoch": 0.07601351351351351,
"grad_norm": 1.650679164773279,
"learning_rate": 3.9998793400024255e-06,
"loss": 0.7301,
"step": 45
},
{
"epoch": 0.0777027027027027,
"grad_norm": 1.7430443156485362,
"learning_rate": 3.99981146982012e-06,
"loss": 0.8024,
"step": 46
},
{
"epoch": 0.07939189189189189,
"grad_norm": 1.4650446297524533,
"learning_rate": 3.999728518417708e-06,
"loss": 0.7601,
"step": 47
},
{
"epoch": 0.08108108108108109,
"grad_norm": 1.4330737453276827,
"learning_rate": 3.99963048642075e-06,
"loss": 0.6946,
"step": 48
},
{
"epoch": 0.08277027027027027,
"grad_norm": 1.6240982322183675,
"learning_rate": 3.999517374568536e-06,
"loss": 0.7218,
"step": 49
},
{
"epoch": 0.08445945945945946,
"grad_norm": 1.5139893643970004,
"learning_rate": 3.9993891837140806e-06,
"loss": 0.7464,
"step": 50
},
{
"epoch": 0.08614864864864864,
"grad_norm": 1.3839941970510703,
"learning_rate": 3.999245914824112e-06,
"loss": 0.783,
"step": 51
},
{
"epoch": 0.08783783783783784,
"grad_norm": 1.4041597855533474,
"learning_rate": 3.999087568979067e-06,
"loss": 0.7357,
"step": 52
},
{
"epoch": 0.08952702702702703,
"grad_norm": 1.4228074713312242,
"learning_rate": 3.9989141473730804e-06,
"loss": 0.7528,
"step": 53
},
{
"epoch": 0.09121621621621621,
"grad_norm": 1.45544094128086,
"learning_rate": 3.998725651313984e-06,
"loss": 0.7121,
"step": 54
},
{
"epoch": 0.0929054054054054,
"grad_norm": 2.4199878467039975,
"learning_rate": 3.998522082223282e-06,
"loss": 0.7945,
"step": 55
},
{
"epoch": 0.0945945945945946,
"grad_norm": 1.5961731492055695,
"learning_rate": 3.9983034416361594e-06,
"loss": 0.7397,
"step": 56
},
{
"epoch": 0.09628378378378379,
"grad_norm": 1.4512814325426553,
"learning_rate": 3.998069731201452e-06,
"loss": 0.7034,
"step": 57
},
{
"epoch": 0.09797297297297297,
"grad_norm": 1.5843175046581277,
"learning_rate": 3.997820952681645e-06,
"loss": 0.7441,
"step": 58
},
{
"epoch": 0.09966216216216216,
"grad_norm": 1.3953565784623518,
"learning_rate": 3.9975571079528596e-06,
"loss": 0.7193,
"step": 59
},
{
"epoch": 0.10135135135135136,
"grad_norm": 1.3805266599685402,
"learning_rate": 3.997278199004831e-06,
"loss": 0.7262,
"step": 60
},
{
"epoch": 0.10304054054054054,
"grad_norm": 1.4612943703890746,
"learning_rate": 3.996984227940902e-06,
"loss": 0.7983,
"step": 61
},
{
"epoch": 0.10472972972972973,
"grad_norm": 1.810133635495203,
"learning_rate": 3.9966751969780025e-06,
"loss": 0.7769,
"step": 62
},
{
"epoch": 0.10641891891891891,
"grad_norm": 1.4598985705910221,
"learning_rate": 3.996351108446635e-06,
"loss": 0.7429,
"step": 63
},
{
"epoch": 0.10810810810810811,
"grad_norm": 1.443083533415171,
"learning_rate": 3.9960119647908545e-06,
"loss": 0.732,
"step": 64
},
{
"epoch": 0.1097972972972973,
"grad_norm": 1.3883608013574098,
"learning_rate": 3.995657768568251e-06,
"loss": 0.716,
"step": 65
},
{
"epoch": 0.11148648648648649,
"grad_norm": 1.498198395419281,
"learning_rate": 3.995288522449935e-06,
"loss": 0.6985,
"step": 66
},
{
"epoch": 0.11317567567567567,
"grad_norm": 1.4554676659901469,
"learning_rate": 3.994904229220507e-06,
"loss": 0.7372,
"step": 67
},
{
"epoch": 0.11486486486486487,
"grad_norm": 1.5008357924858833,
"learning_rate": 3.994504891778047e-06,
"loss": 0.7126,
"step": 68
},
{
"epoch": 0.11655405405405406,
"grad_norm": 1.419811370376343,
"learning_rate": 3.994090513134086e-06,
"loss": 0.7243,
"step": 69
},
{
"epoch": 0.11824324324324324,
"grad_norm": 1.442331674686139,
"learning_rate": 3.9936610964135874e-06,
"loss": 0.7305,
"step": 70
},
{
"epoch": 0.11993243243243243,
"grad_norm": 1.5182212943849125,
"learning_rate": 3.99321664485492e-06,
"loss": 0.6941,
"step": 71
},
{
"epoch": 0.12162162162162163,
"grad_norm": 1.3401060100127358,
"learning_rate": 3.992757161809835e-06,
"loss": 0.7111,
"step": 72
},
{
"epoch": 0.12331081081081081,
"grad_norm": 1.4117702617033703,
"learning_rate": 3.992282650743443e-06,
"loss": 0.7177,
"step": 73
},
{
"epoch": 0.125,
"grad_norm": 1.6738584983948046,
"learning_rate": 3.991793115234182e-06,
"loss": 0.7896,
"step": 74
},
{
"epoch": 0.1266891891891892,
"grad_norm": 1.8574841037381855,
"learning_rate": 3.991288558973798e-06,
"loss": 0.7902,
"step": 75
},
{
"epoch": 0.12837837837837837,
"grad_norm": 1.497828677704926,
"learning_rate": 3.990768985767312e-06,
"loss": 0.7076,
"step": 76
},
{
"epoch": 0.13006756756756757,
"grad_norm": 1.52038921491966,
"learning_rate": 3.9902343995329916e-06,
"loss": 0.8006,
"step": 77
},
{
"epoch": 0.13175675675675674,
"grad_norm": 1.4458874014195138,
"learning_rate": 3.989684804302323e-06,
"loss": 0.7043,
"step": 78
},
{
"epoch": 0.13344594594594594,
"grad_norm": 2.0596220547582176,
"learning_rate": 3.98912020421998e-06,
"loss": 0.7644,
"step": 79
},
{
"epoch": 0.13513513513513514,
"grad_norm": 2.8723719852090985,
"learning_rate": 3.988540603543794e-06,
"loss": 0.766,
"step": 80
},
{
"epoch": 0.13682432432432431,
"grad_norm": 1.3880889948705695,
"learning_rate": 3.98794600664472e-06,
"loss": 0.7704,
"step": 81
},
{
"epoch": 0.13851351351351351,
"grad_norm": 1.3581068937102048,
"learning_rate": 3.987336418006802e-06,
"loss": 0.713,
"step": 82
},
{
"epoch": 0.14020270270270271,
"grad_norm": 1.435765423914678,
"learning_rate": 3.986711842227146e-06,
"loss": 0.7187,
"step": 83
},
{
"epoch": 0.14189189189189189,
"grad_norm": 1.4288300487189134,
"learning_rate": 3.9860722840158765e-06,
"loss": 0.6821,
"step": 84
},
{
"epoch": 0.14358108108108109,
"grad_norm": 1.441392279654618,
"learning_rate": 3.985417748196107e-06,
"loss": 0.7272,
"step": 85
},
{
"epoch": 0.14527027027027026,
"grad_norm": 2.182568121286923,
"learning_rate": 3.984748239703905e-06,
"loss": 0.7365,
"step": 86
},
{
"epoch": 0.14695945945945946,
"grad_norm": 1.4323037713767275,
"learning_rate": 3.984063763588246e-06,
"loss": 0.7054,
"step": 87
},
{
"epoch": 0.14864864864864866,
"grad_norm": 1.4352524812290801,
"learning_rate": 3.983364325010986e-06,
"loss": 0.7827,
"step": 88
},
{
"epoch": 0.15033783783783783,
"grad_norm": 1.3585268883879378,
"learning_rate": 3.9826499292468135e-06,
"loss": 0.7121,
"step": 89
},
{
"epoch": 0.15202702702702703,
"grad_norm": 1.468711558201286,
"learning_rate": 3.981920581683218e-06,
"loss": 0.7408,
"step": 90
},
{
"epoch": 0.15371621621621623,
"grad_norm": 1.5155258611932023,
"learning_rate": 3.981176287820444e-06,
"loss": 0.6812,
"step": 91
},
{
"epoch": 0.1554054054054054,
"grad_norm": 1.4211773705458912,
"learning_rate": 3.9804170532714495e-06,
"loss": 0.7134,
"step": 92
},
{
"epoch": 0.1570945945945946,
"grad_norm": 1.5526244069056863,
"learning_rate": 3.979642883761865e-06,
"loss": 0.773,
"step": 93
},
{
"epoch": 0.15878378378378377,
"grad_norm": 1.4535911061477287,
"learning_rate": 3.978853785129953e-06,
"loss": 0.6815,
"step": 94
},
{
"epoch": 0.16047297297297297,
"grad_norm": 1.377349590888399,
"learning_rate": 3.978049763326558e-06,
"loss": 0.6711,
"step": 95
},
{
"epoch": 0.16216216216216217,
"grad_norm": 1.4269071997724252,
"learning_rate": 3.977230824415068e-06,
"loss": 0.725,
"step": 96
},
{
"epoch": 0.16385135135135134,
"grad_norm": 1.8258026487599963,
"learning_rate": 3.9763969745713635e-06,
"loss": 0.742,
"step": 97
},
{
"epoch": 0.16554054054054054,
"grad_norm": 1.5637199081600301,
"learning_rate": 3.975548220083773e-06,
"loss": 0.7176,
"step": 98
},
{
"epoch": 0.16722972972972974,
"grad_norm": 1.4418683327068436,
"learning_rate": 3.974684567353027e-06,
"loss": 0.7704,
"step": 99
},
{
"epoch": 0.16891891891891891,
"grad_norm": 1.500650299775909,
"learning_rate": 3.973806022892209e-06,
"loss": 0.7777,
"step": 100
},
{
"epoch": 0.17060810810810811,
"grad_norm": 1.4229499052679826,
"learning_rate": 3.972912593326703e-06,
"loss": 0.6773,
"step": 101
},
{
"epoch": 0.17229729729729729,
"grad_norm": 1.4069619452197821,
"learning_rate": 3.9720042853941494e-06,
"loss": 0.7166,
"step": 102
},
{
"epoch": 0.17398648648648649,
"grad_norm": 1.4641029705511208,
"learning_rate": 3.971081105944389e-06,
"loss": 0.6622,
"step": 103
},
{
"epoch": 0.17567567567567569,
"grad_norm": 1.3822866897164043,
"learning_rate": 3.970143061939414e-06,
"loss": 0.6678,
"step": 104
},
{
"epoch": 0.17736486486486486,
"grad_norm": 1.3486512924905725,
"learning_rate": 3.969190160453317e-06,
"loss": 0.7085,
"step": 105
},
{
"epoch": 0.17905405405405406,
"grad_norm": 1.4080642954468132,
"learning_rate": 3.968222408672232e-06,
"loss": 0.6805,
"step": 106
},
{
"epoch": 0.18074324324324326,
"grad_norm": 1.3245403639949178,
"learning_rate": 3.9672398138942874e-06,
"loss": 0.6723,
"step": 107
},
{
"epoch": 0.18243243243243243,
"grad_norm": 1.522527598129808,
"learning_rate": 3.966242383529544e-06,
"loss": 0.7348,
"step": 108
},
{
"epoch": 0.18412162162162163,
"grad_norm": 1.4593718177402826,
"learning_rate": 3.965230125099946e-06,
"loss": 0.6859,
"step": 109
},
{
"epoch": 0.1858108108108108,
"grad_norm": 1.6113096358693728,
"learning_rate": 3.964203046239258e-06,
"loss": 0.7133,
"step": 110
},
{
"epoch": 0.1875,
"grad_norm": 2.1901731817473227,
"learning_rate": 3.963161154693013e-06,
"loss": 0.6989,
"step": 111
},
{
"epoch": 0.1891891891891892,
"grad_norm": 1.4041902139290952,
"learning_rate": 3.962104458318446e-06,
"loss": 0.683,
"step": 112
},
{
"epoch": 0.19087837837837837,
"grad_norm": 1.3355403920639282,
"learning_rate": 3.961032965084447e-06,
"loss": 0.7055,
"step": 113
},
{
"epoch": 0.19256756756756757,
"grad_norm": 1.4378012677241268,
"learning_rate": 3.959946683071489e-06,
"loss": 0.7721,
"step": 114
},
{
"epoch": 0.19425675675675674,
"grad_norm": 1.5234306742219028,
"learning_rate": 3.958845620471573e-06,
"loss": 0.7021,
"step": 115
},
{
"epoch": 0.19594594594594594,
"grad_norm": 1.329738948124563,
"learning_rate": 3.957729785588166e-06,
"loss": 0.6521,
"step": 116
},
{
"epoch": 0.19763513513513514,
"grad_norm": 1.5992344878568325,
"learning_rate": 3.956599186836137e-06,
"loss": 0.7275,
"step": 117
},
{
"epoch": 0.19932432432432431,
"grad_norm": 1.4020110254102955,
"learning_rate": 3.955453832741693e-06,
"loss": 0.6748,
"step": 118
},
{
"epoch": 0.20101351351351351,
"grad_norm": 1.3573035489579244,
"learning_rate": 3.954293731942319e-06,
"loss": 0.7025,
"step": 119
},
{
"epoch": 0.20270270270270271,
"grad_norm": 1.335897627914034,
"learning_rate": 3.953118893186705e-06,
"loss": 0.6984,
"step": 120
},
{
"epoch": 0.20439189189189189,
"grad_norm": 1.4918423194384791,
"learning_rate": 3.951929325334689e-06,
"loss": 0.6686,
"step": 121
},
{
"epoch": 0.20608108108108109,
"grad_norm": 1.6979742909503677,
"learning_rate": 3.950725037357182e-06,
"loss": 0.7658,
"step": 122
},
{
"epoch": 0.20777027027027026,
"grad_norm": 1.6274287291465286,
"learning_rate": 3.949506038336108e-06,
"loss": 0.7144,
"step": 123
},
{
"epoch": 0.20945945945945946,
"grad_norm": 1.3302624319860148,
"learning_rate": 3.94827233746433e-06,
"loss": 0.6349,
"step": 124
},
{
"epoch": 0.21114864864864866,
"grad_norm": 1.4826441921680285,
"learning_rate": 3.94702394404558e-06,
"loss": 0.7219,
"step": 125
},
{
"epoch": 0.21283783783783783,
"grad_norm": 1.4005282602108076,
"learning_rate": 3.9457608674943945e-06,
"loss": 0.7359,
"step": 126
},
{
"epoch": 0.21452702702702703,
"grad_norm": 1.5877469483748996,
"learning_rate": 3.9444831173360406e-06,
"loss": 0.693,
"step": 127
},
{
"epoch": 0.21621621621621623,
"grad_norm": 1.5442843013623662,
"learning_rate": 3.94319070320644e-06,
"loss": 0.7417,
"step": 128
},
{
"epoch": 0.2179054054054054,
"grad_norm": 1.4333541902470606,
"learning_rate": 3.941883634852104e-06,
"loss": 0.6699,
"step": 129
},
{
"epoch": 0.2195945945945946,
"grad_norm": 1.5080751183762988,
"learning_rate": 3.940561922130054e-06,
"loss": 0.6839,
"step": 130
},
{
"epoch": 0.22128378378378377,
"grad_norm": 1.3898529898507852,
"learning_rate": 3.93922557500775e-06,
"loss": 0.663,
"step": 131
},
{
"epoch": 0.22297297297297297,
"grad_norm": 1.3772254909736483,
"learning_rate": 3.937874603563015e-06,
"loss": 0.6593,
"step": 132
},
{
"epoch": 0.22466216216216217,
"grad_norm": 1.760208679944535,
"learning_rate": 3.936509017983956e-06,
"loss": 0.7082,
"step": 133
},
{
"epoch": 0.22635135135135134,
"grad_norm": 1.8196575759309455,
"learning_rate": 3.935128828568896e-06,
"loss": 0.6945,
"step": 134
},
{
"epoch": 0.22804054054054054,
"grad_norm": 1.385346989951205,
"learning_rate": 3.933734045726283e-06,
"loss": 0.6863,
"step": 135
},
{
"epoch": 0.22972972972972974,
"grad_norm": 1.3685547174486146,
"learning_rate": 3.932324679974623e-06,
"loss": 0.7477,
"step": 136
},
{
"epoch": 0.23141891891891891,
"grad_norm": 1.4295145309120525,
"learning_rate": 3.930900741942396e-06,
"loss": 0.6747,
"step": 137
},
{
"epoch": 0.23310810810810811,
"grad_norm": 1.3906500390926828,
"learning_rate": 3.929462242367975e-06,
"loss": 0.764,
"step": 138
},
{
"epoch": 0.23479729729729729,
"grad_norm": 1.384756160443465,
"learning_rate": 3.928009192099548e-06,
"loss": 0.6678,
"step": 139
},
{
"epoch": 0.23648648648648649,
"grad_norm": 1.7220512704516102,
"learning_rate": 3.926541602095032e-06,
"loss": 0.7969,
"step": 140
},
{
"epoch": 0.23817567567567569,
"grad_norm": 1.477284673606475,
"learning_rate": 3.925059483421996e-06,
"loss": 0.6866,
"step": 141
},
{
"epoch": 0.23986486486486486,
"grad_norm": 1.431239746187604,
"learning_rate": 3.9235628472575705e-06,
"loss": 0.6979,
"step": 142
},
{
"epoch": 0.24155405405405406,
"grad_norm": 1.604028789928828,
"learning_rate": 3.92205170488837e-06,
"loss": 0.6599,
"step": 143
},
{
"epoch": 0.24324324324324326,
"grad_norm": 1.3489137098771946,
"learning_rate": 3.9205260677104055e-06,
"loss": 0.7128,
"step": 144
},
{
"epoch": 0.24493243243243243,
"grad_norm": 1.3940741727669859,
"learning_rate": 3.9189859472289945e-06,
"loss": 0.6735,
"step": 145
},
{
"epoch": 0.24662162162162163,
"grad_norm": 1.3893021240993075,
"learning_rate": 3.917431355058681e-06,
"loss": 0.6915,
"step": 146
},
{
"epoch": 0.2483108108108108,
"grad_norm": 1.5080140713249557,
"learning_rate": 3.915862302923143e-06,
"loss": 0.7439,
"step": 147
},
{
"epoch": 0.25,
"grad_norm": 1.3355739307749024,
"learning_rate": 3.914278802655106e-06,
"loss": 0.7065,
"step": 148
},
{
"epoch": 0.2516891891891892,
"grad_norm": 1.6241902843973968,
"learning_rate": 3.912680866196255e-06,
"loss": 0.7081,
"step": 149
},
{
"epoch": 0.2533783783783784,
"grad_norm": 1.5261637427264052,
"learning_rate": 3.9110685055971406e-06,
"loss": 0.6994,
"step": 150
},
{
"epoch": 0.25506756756756754,
"grad_norm": 1.479630710856107,
"learning_rate": 3.909441733017091e-06,
"loss": 0.689,
"step": 151
},
{
"epoch": 0.25675675675675674,
"grad_norm": 1.4933663608483565,
"learning_rate": 3.907800560724121e-06,
"loss": 0.6942,
"step": 152
},
{
"epoch": 0.25844594594594594,
"grad_norm": 1.5302529362714137,
"learning_rate": 3.906145001094839e-06,
"loss": 0.6868,
"step": 153
},
{
"epoch": 0.26013513513513514,
"grad_norm": 1.4040014862243637,
"learning_rate": 3.904475066614349e-06,
"loss": 0.6988,
"step": 154
},
{
"epoch": 0.26182432432432434,
"grad_norm": 1.4226489729992533,
"learning_rate": 3.902790769876164e-06,
"loss": 0.7645,
"step": 155
},
{
"epoch": 0.2635135135135135,
"grad_norm": 1.5321366676822086,
"learning_rate": 3.901092123582107e-06,
"loss": 0.7381,
"step": 156
},
{
"epoch": 0.2652027027027027,
"grad_norm": 1.430585178455411,
"learning_rate": 3.899379140542213e-06,
"loss": 0.7184,
"step": 157
},
{
"epoch": 0.2668918918918919,
"grad_norm": 1.4320550163328156,
"learning_rate": 3.897651833674639e-06,
"loss": 0.6898,
"step": 158
},
{
"epoch": 0.2685810810810811,
"grad_norm": 1.4007180357015416,
"learning_rate": 3.895910216005559e-06,
"loss": 0.7204,
"step": 159
},
{
"epoch": 0.2702702702702703,
"grad_norm": 1.4760164587014226,
"learning_rate": 3.894154300669071e-06,
"loss": 0.6809,
"step": 160
},
{
"epoch": 0.2719594594594595,
"grad_norm": 1.359805865137023,
"learning_rate": 3.892384100907097e-06,
"loss": 0.6664,
"step": 161
},
{
"epoch": 0.27364864864864863,
"grad_norm": 1.494271130133251,
"learning_rate": 3.89059963006928e-06,
"loss": 0.7812,
"step": 162
},
{
"epoch": 0.27533783783783783,
"grad_norm": 1.4693114375854741,
"learning_rate": 3.888800901612889e-06,
"loss": 0.6843,
"step": 163
},
{
"epoch": 0.27702702702702703,
"grad_norm": 1.3756308293515527,
"learning_rate": 3.886987929102711e-06,
"loss": 0.6712,
"step": 164
},
{
"epoch": 0.27871621621621623,
"grad_norm": 1.4414208695652082,
"learning_rate": 3.885160726210954e-06,
"loss": 0.6686,
"step": 165
},
{
"epoch": 0.28040540540540543,
"grad_norm": 1.6068914930388214,
"learning_rate": 3.883319306717143e-06,
"loss": 0.6479,
"step": 166
},
{
"epoch": 0.28209459459459457,
"grad_norm": 1.5676401171578263,
"learning_rate": 3.881463684508011e-06,
"loss": 0.7549,
"step": 167
},
{
"epoch": 0.28378378378378377,
"grad_norm": 1.370828841111984,
"learning_rate": 3.879593873577402e-06,
"loss": 0.7173,
"step": 168
},
{
"epoch": 0.28547297297297297,
"grad_norm": 1.3973893255241456,
"learning_rate": 3.877709888026159e-06,
"loss": 0.7046,
"step": 169
},
{
"epoch": 0.28716216216216217,
"grad_norm": 1.3059515131618362,
"learning_rate": 3.875811742062024e-06,
"loss": 0.6521,
"step": 170
},
{
"epoch": 0.28885135135135137,
"grad_norm": 1.3428841635145867,
"learning_rate": 3.873899449999524e-06,
"loss": 0.6404,
"step": 171
},
{
"epoch": 0.2905405405405405,
"grad_norm": 1.3845007444348196,
"learning_rate": 3.871973026259865e-06,
"loss": 0.6715,
"step": 172
},
{
"epoch": 0.2922297297297297,
"grad_norm": 1.4166123181291566,
"learning_rate": 3.8700324853708295e-06,
"loss": 0.7466,
"step": 173
},
{
"epoch": 0.2939189189189189,
"grad_norm": 1.3710082585316177,
"learning_rate": 3.8680778419666576e-06,
"loss": 0.6271,
"step": 174
},
{
"epoch": 0.2956081081081081,
"grad_norm": 1.3982194102229981,
"learning_rate": 3.8661091107879434e-06,
"loss": 0.7215,
"step": 175
},
{
"epoch": 0.2972972972972973,
"grad_norm": 1.7210038720236769,
"learning_rate": 3.8641263066815205e-06,
"loss": 0.721,
"step": 176
},
{
"epoch": 0.2989864864864865,
"grad_norm": 1.4097607569308712,
"learning_rate": 3.862129444600349e-06,
"loss": 0.6755,
"step": 177
},
{
"epoch": 0.30067567567567566,
"grad_norm": 1.4419897609366905,
"learning_rate": 3.86011853960341e-06,
"loss": 0.7348,
"step": 178
},
{
"epoch": 0.30236486486486486,
"grad_norm": 1.507066037469795,
"learning_rate": 3.8580936068555815e-06,
"loss": 0.7768,
"step": 179
},
{
"epoch": 0.30405405405405406,
"grad_norm": 1.4044353506211649,
"learning_rate": 3.856054661627532e-06,
"loss": 0.6697,
"step": 180
},
{
"epoch": 0.30574324324324326,
"grad_norm": 1.3011596028963899,
"learning_rate": 3.854001719295601e-06,
"loss": 0.6593,
"step": 181
},
{
"epoch": 0.30743243243243246,
"grad_norm": 1.4882426091073055,
"learning_rate": 3.851934795341686e-06,
"loss": 0.725,
"step": 182
},
{
"epoch": 0.3091216216216216,
"grad_norm": 1.412044516159466,
"learning_rate": 3.849853905353123e-06,
"loss": 0.7254,
"step": 183
},
{
"epoch": 0.3108108108108108,
"grad_norm": 1.3552048527285785,
"learning_rate": 3.847759065022573e-06,
"loss": 0.6612,
"step": 184
},
{
"epoch": 0.3125,
"grad_norm": 1.4893729275292906,
"learning_rate": 3.845650290147898e-06,
"loss": 0.7761,
"step": 185
},
{
"epoch": 0.3141891891891892,
"grad_norm": 1.5227378731808672,
"learning_rate": 3.843527596632047e-06,
"loss": 0.7112,
"step": 186
},
{
"epoch": 0.3158783783783784,
"grad_norm": 1.3652595903876206,
"learning_rate": 3.841391000482931e-06,
"loss": 0.7052,
"step": 187
},
{
"epoch": 0.31756756756756754,
"grad_norm": 1.3170940458876272,
"learning_rate": 3.839240517813311e-06,
"loss": 0.621,
"step": 188
},
{
"epoch": 0.31925675675675674,
"grad_norm": 1.4119825821508496,
"learning_rate": 3.837076164840663e-06,
"loss": 0.6619,
"step": 189
},
{
"epoch": 0.32094594594594594,
"grad_norm": 1.7966706386985845,
"learning_rate": 3.834897957887069e-06,
"loss": 0.7411,
"step": 190
},
{
"epoch": 0.32263513513513514,
"grad_norm": 1.7260868732827916,
"learning_rate": 3.832705913379087e-06,
"loss": 0.6961,
"step": 191
},
{
"epoch": 0.32432432432432434,
"grad_norm": 1.417051027809352,
"learning_rate": 3.830500047847628e-06,
"loss": 0.681,
"step": 192
},
{
"epoch": 0.3260135135135135,
"grad_norm": 1.479899217335837,
"learning_rate": 3.828280377927833e-06,
"loss": 0.701,
"step": 193
},
{
"epoch": 0.3277027027027027,
"grad_norm": 1.4191602227151927,
"learning_rate": 3.826046920358943e-06,
"loss": 0.6454,
"step": 194
},
{
"epoch": 0.3293918918918919,
"grad_norm": 1.3454083625003137,
"learning_rate": 3.82379969198418e-06,
"loss": 0.6771,
"step": 195
},
{
"epoch": 0.3310810810810811,
"grad_norm": 1.9842557988763356,
"learning_rate": 3.821538709750614e-06,
"loss": 0.7267,
"step": 196
},
{
"epoch": 0.3327702702702703,
"grad_norm": 1.4358235281752614,
"learning_rate": 3.819263990709037e-06,
"loss": 0.7401,
"step": 197
},
{
"epoch": 0.3344594594594595,
"grad_norm": 1.3515480343034862,
"learning_rate": 3.816975552013836e-06,
"loss": 0.672,
"step": 198
},
{
"epoch": 0.33614864864864863,
"grad_norm": 1.3859740753777527,
"learning_rate": 3.814673410922861e-06,
"loss": 0.6601,
"step": 199
},
{
"epoch": 0.33783783783783783,
"grad_norm": 1.4541566973376086,
"learning_rate": 3.8123575847972977e-06,
"loss": 0.7193,
"step": 200
},
{
"epoch": 0.33952702702702703,
"grad_norm": 1.480164342260533,
"learning_rate": 3.8100280911015333e-06,
"loss": 0.7088,
"step": 201
},
{
"epoch": 0.34121621621621623,
"grad_norm": 1.4289314706161513,
"learning_rate": 3.8076849474030286e-06,
"loss": 0.6853,
"step": 202
},
{
"epoch": 0.34290540540540543,
"grad_norm": 1.4130120631059673,
"learning_rate": 3.8053281713721804e-06,
"loss": 0.6854,
"step": 203
},
{
"epoch": 0.34459459459459457,
"grad_norm": 1.3964253471976507,
"learning_rate": 3.802957780782195e-06,
"loss": 0.6842,
"step": 204
},
{
"epoch": 0.34628378378378377,
"grad_norm": 1.2981821719803015,
"learning_rate": 3.800573793508948e-06,
"loss": 0.7186,
"step": 205
},
{
"epoch": 0.34797297297297297,
"grad_norm": 1.5058610876069536,
"learning_rate": 3.7981762275308514e-06,
"loss": 0.6836,
"step": 206
},
{
"epoch": 0.34966216216216217,
"grad_norm": 1.3543583050604886,
"learning_rate": 3.7957651009287214e-06,
"loss": 0.6814,
"step": 207
},
{
"epoch": 0.35135135135135137,
"grad_norm": 1.4591142445601828,
"learning_rate": 3.7933404318856365e-06,
"loss": 0.6472,
"step": 208
},
{
"epoch": 0.3530405405405405,
"grad_norm": 1.5357516266772646,
"learning_rate": 3.7909022386868042e-06,
"loss": 0.7252,
"step": 209
},
{
"epoch": 0.3547297297297297,
"grad_norm": 1.3504853847903922,
"learning_rate": 3.7884505397194224e-06,
"loss": 0.6627,
"step": 210
},
{
"epoch": 0.3564189189189189,
"grad_norm": 1.4204456318259766,
"learning_rate": 3.7859853534725393e-06,
"loss": 0.7244,
"step": 211
},
{
"epoch": 0.3581081081081081,
"grad_norm": 1.3315700281844445,
"learning_rate": 3.783506698536916e-06,
"loss": 0.6536,
"step": 212
},
{
"epoch": 0.3597972972972973,
"grad_norm": 1.552425593407634,
"learning_rate": 3.7810145936048846e-06,
"loss": 0.6733,
"step": 213
},
{
"epoch": 0.3614864864864865,
"grad_norm": 1.407167273299544,
"learning_rate": 3.778509057470208e-06,
"loss": 0.6622,
"step": 214
},
{
"epoch": 0.36317567567567566,
"grad_norm": 1.4261346346811004,
"learning_rate": 3.7759901090279385e-06,
"loss": 0.7443,
"step": 215
},
{
"epoch": 0.36486486486486486,
"grad_norm": 1.370518370669021,
"learning_rate": 3.7734577672742754e-06,
"loss": 0.7047,
"step": 216
},
{
"epoch": 0.36655405405405406,
"grad_norm": 1.3021833652741044,
"learning_rate": 3.7709120513064196e-06,
"loss": 0.691,
"step": 217
},
{
"epoch": 0.36824324324324326,
"grad_norm": 1.3463537418495264,
"learning_rate": 3.768352980322433e-06,
"loss": 0.686,
"step": 218
},
{
"epoch": 0.36993243243243246,
"grad_norm": 1.5841455095635015,
"learning_rate": 3.7657805736210905e-06,
"loss": 0.7098,
"step": 219
},
{
"epoch": 0.3716216216216216,
"grad_norm": 1.2932754452322628,
"learning_rate": 3.763194850601737e-06,
"loss": 0.6325,
"step": 220
},
{
"epoch": 0.3733108108108108,
"grad_norm": 1.3674162336593572,
"learning_rate": 3.7605958307641393e-06,
"loss": 0.7169,
"step": 221
},
{
"epoch": 0.375,
"grad_norm": 1.4335929535708738,
"learning_rate": 3.7579835337083408e-06,
"loss": 0.6958,
"step": 222
},
{
"epoch": 0.3766891891891892,
"grad_norm": 1.3707580832113364,
"learning_rate": 3.755357979134511e-06,
"loss": 0.6482,
"step": 223
},
{
"epoch": 0.3783783783783784,
"grad_norm": 1.4338771915089066,
"learning_rate": 3.7527191868428003e-06,
"loss": 0.7001,
"step": 224
},
{
"epoch": 0.38006756756756754,
"grad_norm": 1.4843289475059827,
"learning_rate": 3.750067176733189e-06,
"loss": 0.6732,
"step": 225
},
{
"epoch": 0.38175675675675674,
"grad_norm": 1.5775222455201545,
"learning_rate": 3.7474019688053346e-06,
"loss": 0.7174,
"step": 226
},
{
"epoch": 0.38344594594594594,
"grad_norm": 1.4415117197399947,
"learning_rate": 3.744723583158427e-06,
"loss": 0.7392,
"step": 227
},
{
"epoch": 0.38513513513513514,
"grad_norm": 1.343529072793593,
"learning_rate": 3.742032039991031e-06,
"loss": 0.6349,
"step": 228
},
{
"epoch": 0.38682432432432434,
"grad_norm": 2.10680046167969,
"learning_rate": 3.739327359600938e-06,
"loss": 0.6924,
"step": 229
},
{
"epoch": 0.3885135135135135,
"grad_norm": 1.332769490719328,
"learning_rate": 3.736609562385011e-06,
"loss": 0.6563,
"step": 230
},
{
"epoch": 0.3902027027027027,
"grad_norm": 1.3133901303899669,
"learning_rate": 3.73387866883903e-06,
"loss": 0.6815,
"step": 231
},
{
"epoch": 0.3918918918918919,
"grad_norm": 1.358025933731894,
"learning_rate": 3.731134699557541e-06,
"loss": 0.7082,
"step": 232
},
{
"epoch": 0.3935810810810811,
"grad_norm": 1.3424987866213658,
"learning_rate": 3.7283776752336966e-06,
"loss": 0.6801,
"step": 233
},
{
"epoch": 0.3952702702702703,
"grad_norm": 1.4157554026587067,
"learning_rate": 3.725607616659101e-06,
"loss": 0.709,
"step": 234
},
{
"epoch": 0.3969594594594595,
"grad_norm": 1.3201455751629254,
"learning_rate": 3.7228245447236565e-06,
"loss": 0.7228,
"step": 235
},
{
"epoch": 0.39864864864864863,
"grad_norm": 1.327402943647592,
"learning_rate": 3.7200284804154006e-06,
"loss": 0.6788,
"step": 236
},
{
"epoch": 0.40033783783783783,
"grad_norm": 1.2643583008290484,
"learning_rate": 3.717219444820353e-06,
"loss": 0.7641,
"step": 237
},
{
"epoch": 0.40202702702702703,
"grad_norm": 1.409052594991303,
"learning_rate": 3.7143974591223507e-06,
"loss": 0.6654,
"step": 238
},
{
"epoch": 0.40371621621621623,
"grad_norm": 1.3890965869384935,
"learning_rate": 3.711562544602895e-06,
"loss": 0.7743,
"step": 239
},
{
"epoch": 0.40540540540540543,
"grad_norm": 1.2046354984122658,
"learning_rate": 3.7087147226409854e-06,
"loss": 0.6307,
"step": 240
},
{
"epoch": 0.40709459459459457,
"grad_norm": 1.3627485443451046,
"learning_rate": 3.705854014712962e-06,
"loss": 0.6762,
"step": 241
},
{
"epoch": 0.40878378378378377,
"grad_norm": 1.443105456149895,
"learning_rate": 3.7029804423923405e-06,
"loss": 0.6653,
"step": 242
},
{
"epoch": 0.41047297297297297,
"grad_norm": 1.3802124106502922,
"learning_rate": 3.7000940273496526e-06,
"loss": 0.6859,
"step": 243
},
{
"epoch": 0.41216216216216217,
"grad_norm": 1.3786708758892288,
"learning_rate": 3.69719479135228e-06,
"loss": 0.704,
"step": 244
},
{
"epoch": 0.41385135135135137,
"grad_norm": 1.6258577516719144,
"learning_rate": 3.694282756264293e-06,
"loss": 0.6779,
"step": 245
},
{
"epoch": 0.4155405405405405,
"grad_norm": 1.440767640944531,
"learning_rate": 3.6913579440462813e-06,
"loss": 0.6907,
"step": 246
},
{
"epoch": 0.4172297297297297,
"grad_norm": 1.5767111896131487,
"learning_rate": 3.6884203767551933e-06,
"loss": 0.7245,
"step": 247
},
{
"epoch": 0.4189189189189189,
"grad_norm": 1.4496934104421961,
"learning_rate": 3.685470076544167e-06,
"loss": 0.7094,
"step": 248
},
{
"epoch": 0.4206081081081081,
"grad_norm": 1.90004392560482,
"learning_rate": 3.6825070656623624e-06,
"loss": 0.6901,
"step": 249
},
{
"epoch": 0.4222972972972973,
"grad_norm": 1.4383292682665025,
"learning_rate": 3.679531366454796e-06,
"loss": 0.6375,
"step": 250
},
{
"epoch": 0.4239864864864865,
"grad_norm": 1.2362701389465913,
"learning_rate": 3.67654300136217e-06,
"loss": 0.665,
"step": 251
},
{
"epoch": 0.42567567567567566,
"grad_norm": 1.46687594364853,
"learning_rate": 3.6735419929207053e-06,
"loss": 0.7318,
"step": 252
},
{
"epoch": 0.42736486486486486,
"grad_norm": 1.4321445965555986,
"learning_rate": 3.670528363761969e-06,
"loss": 0.7179,
"step": 253
},
{
"epoch": 0.42905405405405406,
"grad_norm": 1.8152115334660424,
"learning_rate": 3.6675021366127065e-06,
"loss": 0.7054,
"step": 254
},
{
"epoch": 0.43074324324324326,
"grad_norm": 1.429230095995432,
"learning_rate": 3.6644633342946684e-06,
"loss": 0.6142,
"step": 255
},
{
"epoch": 0.43243243243243246,
"grad_norm": 1.4522752487614075,
"learning_rate": 3.6614119797244365e-06,
"loss": 0.698,
"step": 256
},
{
"epoch": 0.4341216216216216,
"grad_norm": 1.5568752097630847,
"learning_rate": 3.6583480959132564e-06,
"loss": 0.7271,
"step": 257
},
{
"epoch": 0.4358108108108108,
"grad_norm": 1.3610557476395386,
"learning_rate": 3.655271705966859e-06,
"loss": 0.7206,
"step": 258
},
{
"epoch": 0.4375,
"grad_norm": 1.4012888384359043,
"learning_rate": 3.6521828330852876e-06,
"loss": 0.7165,
"step": 259
},
{
"epoch": 0.4391891891891892,
"grad_norm": 1.4325647232787657,
"learning_rate": 3.6490815005627244e-06,
"loss": 0.7167,
"step": 260
},
{
"epoch": 0.4408783783783784,
"grad_norm": 1.3071946737965967,
"learning_rate": 3.6459677317873127e-06,
"loss": 0.7068,
"step": 261
},
{
"epoch": 0.44256756756756754,
"grad_norm": 1.3337930545508176,
"learning_rate": 3.6428415502409832e-06,
"loss": 0.6375,
"step": 262
},
{
"epoch": 0.44425675675675674,
"grad_norm": 1.374084685518983,
"learning_rate": 3.6397029794992734e-06,
"loss": 0.6604,
"step": 263
},
{
"epoch": 0.44594594594594594,
"grad_norm": 1.5129308344285353,
"learning_rate": 3.6365520432311526e-06,
"loss": 0.7708,
"step": 264
},
{
"epoch": 0.44763513513513514,
"grad_norm": 1.3480810073566278,
"learning_rate": 3.633388765198843e-06,
"loss": 0.672,
"step": 265
},
{
"epoch": 0.44932432432432434,
"grad_norm": 1.4142271399614021,
"learning_rate": 3.6302131692576397e-06,
"loss": 0.6493,
"step": 266
},
{
"epoch": 0.4510135135135135,
"grad_norm": 1.3926074050818393,
"learning_rate": 3.62702527935573e-06,
"loss": 0.7032,
"step": 267
},
{
"epoch": 0.4527027027027027,
"grad_norm": 1.3416220663606933,
"learning_rate": 3.6238251195340146e-06,
"loss": 0.6994,
"step": 268
},
{
"epoch": 0.4543918918918919,
"grad_norm": 1.4002752256004076,
"learning_rate": 3.6206127139259264e-06,
"loss": 0.713,
"step": 269
},
{
"epoch": 0.4560810810810811,
"grad_norm": 1.4228643045362366,
"learning_rate": 3.6173880867572475e-06,
"loss": 0.6981,
"step": 270
},
{
"epoch": 0.4577702702702703,
"grad_norm": 1.3277514022496684,
"learning_rate": 3.614151262345925e-06,
"loss": 0.6278,
"step": 271
},
{
"epoch": 0.4594594594594595,
"grad_norm": 1.5614757390183474,
"learning_rate": 3.610902265101892e-06,
"loss": 0.6662,
"step": 272
},
{
"epoch": 0.46114864864864863,
"grad_norm": 1.377452591839154,
"learning_rate": 3.607641119526878e-06,
"loss": 0.7131,
"step": 273
},
{
"epoch": 0.46283783783783783,
"grad_norm": 1.3254757212732167,
"learning_rate": 3.6043678502142293e-06,
"loss": 0.6995,
"step": 274
},
{
"epoch": 0.46452702702702703,
"grad_norm": 1.361651841573283,
"learning_rate": 3.6010824818487207e-06,
"loss": 0.7173,
"step": 275
},
{
"epoch": 0.46621621621621623,
"grad_norm": 1.4283922402308293,
"learning_rate": 3.5977850392063687e-06,
"loss": 0.6889,
"step": 276
},
{
"epoch": 0.46790540540540543,
"grad_norm": 1.3050372631128504,
"learning_rate": 3.5944755471542464e-06,
"loss": 0.6299,
"step": 277
},
{
"epoch": 0.46959459459459457,
"grad_norm": 1.3310272595986719,
"learning_rate": 3.591154030650296e-06,
"loss": 0.7197,
"step": 278
},
{
"epoch": 0.47128378378378377,
"grad_norm": 1.462114533432708,
"learning_rate": 3.587820514743139e-06,
"loss": 0.6827,
"step": 279
},
{
"epoch": 0.47297297297297297,
"grad_norm": 1.360632157690388,
"learning_rate": 3.5844750245718897e-06,
"loss": 0.7245,
"step": 280
},
{
"epoch": 0.47466216216216217,
"grad_norm": 1.4456255398928297,
"learning_rate": 3.5811175853659623e-06,
"loss": 0.7162,
"step": 281
},
{
"epoch": 0.47635135135135137,
"grad_norm": 1.3510909857414901,
"learning_rate": 3.5777482224448836e-06,
"loss": 0.6797,
"step": 282
},
{
"epoch": 0.4780405405405405,
"grad_norm": 1.3321362166986153,
"learning_rate": 3.5743669612181e-06,
"loss": 0.6759,
"step": 283
},
{
"epoch": 0.4797297297297297,
"grad_norm": 1.379461432799908,
"learning_rate": 3.570973827184789e-06,
"loss": 0.7133,
"step": 284
},
{
"epoch": 0.4814189189189189,
"grad_norm": 1.4068419824919371,
"learning_rate": 3.5675688459336623e-06,
"loss": 0.6568,
"step": 285
},
{
"epoch": 0.4831081081081081,
"grad_norm": 1.3786876200293305,
"learning_rate": 3.5641520431427766e-06,
"loss": 0.6935,
"step": 286
},
{
"epoch": 0.4847972972972973,
"grad_norm": 1.3495616584593104,
"learning_rate": 3.5607234445793387e-06,
"loss": 0.6745,
"step": 287
},
{
"epoch": 0.4864864864864865,
"grad_norm": 1.3576777119745298,
"learning_rate": 3.55728307609951e-06,
"loss": 0.6784,
"step": 288
},
{
"epoch": 0.48817567567567566,
"grad_norm": 1.307035703069375,
"learning_rate": 3.553830963648214e-06,
"loss": 0.6857,
"step": 289
},
{
"epoch": 0.48986486486486486,
"grad_norm": 1.4002775832288137,
"learning_rate": 3.5503671332589384e-06,
"loss": 0.7473,
"step": 290
},
{
"epoch": 0.49155405405405406,
"grad_norm": 1.373483342786104,
"learning_rate": 3.5468916110535397e-06,
"loss": 0.7624,
"step": 291
},
{
"epoch": 0.49324324324324326,
"grad_norm": 1.301856349289806,
"learning_rate": 3.5434044232420463e-06,
"loss": 0.6667,
"step": 292
},
{
"epoch": 0.49493243243243246,
"grad_norm": 1.3278884354841156,
"learning_rate": 3.539905596122461e-06,
"loss": 0.6685,
"step": 293
},
{
"epoch": 0.4966216216216216,
"grad_norm": 1.3342318682934637,
"learning_rate": 3.536395156080561e-06,
"loss": 0.6051,
"step": 294
},
{
"epoch": 0.4983108108108108,
"grad_norm": 1.34236526276834,
"learning_rate": 3.532873129589702e-06,
"loss": 0.6658,
"step": 295
},
{
"epoch": 0.5,
"grad_norm": 1.3367750769250621,
"learning_rate": 3.529339543210617e-06,
"loss": 0.6505,
"step": 296
},
{
"epoch": 0.5,
"eval_loss": 0.5758998990058899,
"eval_runtime": 949.2326,
"eval_samples_per_second": 5.741,
"eval_steps_per_second": 0.359,
"step": 296
},
{
"epoch": 0.5016891891891891,
"grad_norm": 1.5001207756046795,
"learning_rate": 3.5257944235912133e-06,
"loss": 0.7564,
"step": 297
},
{
"epoch": 0.5033783783783784,
"grad_norm": 1.3616930930034892,
"learning_rate": 3.522237797466377e-06,
"loss": 0.7024,
"step": 298
},
{
"epoch": 0.5050675675675675,
"grad_norm": 1.324985823959547,
"learning_rate": 3.5186696916577665e-06,
"loss": 0.6364,
"step": 299
},
{
"epoch": 0.5067567567567568,
"grad_norm": 1.3179657688973248,
"learning_rate": 3.5150901330736132e-06,
"loss": 0.6321,
"step": 300
},
{
"epoch": 0.5084459459459459,
"grad_norm": 1.2671693135409419,
"learning_rate": 3.5114991487085164e-06,
"loss": 0.6639,
"step": 301
},
{
"epoch": 0.5101351351351351,
"grad_norm": 1.3146371625412312,
"learning_rate": 3.5078967656432427e-06,
"loss": 0.6528,
"step": 302
},
{
"epoch": 0.5118243243243243,
"grad_norm": 1.4881216764035425,
"learning_rate": 3.5042830110445183e-06,
"loss": 0.7018,
"step": 303
},
{
"epoch": 0.5135135135135135,
"grad_norm": 1.4149915220983738,
"learning_rate": 3.5006579121648267e-06,
"loss": 0.6319,
"step": 304
},
{
"epoch": 0.5152027027027027,
"grad_norm": 1.2678817742325494,
"learning_rate": 3.497021496342202e-06,
"loss": 0.6819,
"step": 305
},
{
"epoch": 0.5168918918918919,
"grad_norm": 1.4242737673502084,
"learning_rate": 3.4933737910000226e-06,
"loss": 0.6619,
"step": 306
},
{
"epoch": 0.518581081081081,
"grad_norm": 1.3987855120700334,
"learning_rate": 3.489714823646806e-06,
"loss": 0.6767,
"step": 307
},
{
"epoch": 0.5202702702702703,
"grad_norm": 1.3015873879741848,
"learning_rate": 3.4860446218759982e-06,
"loss": 0.6568,
"step": 308
},
{
"epoch": 0.5219594594594594,
"grad_norm": 1.2995415158579773,
"learning_rate": 3.4823632133657698e-06,
"loss": 0.6928,
"step": 309
},
{
"epoch": 0.5236486486486487,
"grad_norm": 1.3973146351155765,
"learning_rate": 3.478670625878803e-06,
"loss": 0.7464,
"step": 310
},
{
"epoch": 0.5253378378378378,
"grad_norm": 1.3419676372680656,
"learning_rate": 3.474966887262085e-06,
"loss": 0.627,
"step": 311
},
{
"epoch": 0.527027027027027,
"grad_norm": 1.2957464216055319,
"learning_rate": 3.4712520254466985e-06,
"loss": 0.6509,
"step": 312
},
{
"epoch": 0.5287162162162162,
"grad_norm": 1.4041048903525055,
"learning_rate": 3.4675260684476077e-06,
"loss": 0.7146,
"step": 313
},
{
"epoch": 0.5304054054054054,
"grad_norm": 1.3505461190296533,
"learning_rate": 3.4637890443634507e-06,
"loss": 0.6383,
"step": 314
},
{
"epoch": 0.5320945945945946,
"grad_norm": 1.5396055781468738,
"learning_rate": 3.460040981376325e-06,
"loss": 0.6604,
"step": 315
},
{
"epoch": 0.5337837837837838,
"grad_norm": 1.448008568089465,
"learning_rate": 3.4562819077515765e-06,
"loss": 0.7216,
"step": 316
},
{
"epoch": 0.535472972972973,
"grad_norm": 1.286441416505065,
"learning_rate": 3.4525118518375863e-06,
"loss": 0.6402,
"step": 317
},
{
"epoch": 0.5371621621621622,
"grad_norm": 1.5964613265568737,
"learning_rate": 3.4487308420655557e-06,
"loss": 0.7228,
"step": 318
},
{
"epoch": 0.5388513513513513,
"grad_norm": 1.4345479080610504,
"learning_rate": 3.444938906949293e-06,
"loss": 0.7091,
"step": 319
},
{
"epoch": 0.5405405405405406,
"grad_norm": 1.3311234454544891,
"learning_rate": 3.4411360750849973e-06,
"loss": 0.7218,
"step": 320
},
{
"epoch": 0.5422297297297297,
"grad_norm": 1.5087568673234424,
"learning_rate": 3.437322375151045e-06,
"loss": 0.7483,
"step": 321
},
{
"epoch": 0.543918918918919,
"grad_norm": 1.4737933458588186,
"learning_rate": 3.433497835907771e-06,
"loss": 0.7068,
"step": 322
},
{
"epoch": 0.5456081081081081,
"grad_norm": 1.397907808243913,
"learning_rate": 3.4296624861972524e-06,
"loss": 0.6457,
"step": 323
},
{
"epoch": 0.5472972972972973,
"grad_norm": 1.4048567528248315,
"learning_rate": 3.425816354943094e-06,
"loss": 0.6434,
"step": 324
},
{
"epoch": 0.5489864864864865,
"grad_norm": 1.4428709908043869,
"learning_rate": 3.421959471150203e-06,
"loss": 0.6821,
"step": 325
},
{
"epoch": 0.5506756756756757,
"grad_norm": 1.446561332479648,
"learning_rate": 3.418091863904582e-06,
"loss": 0.6819,
"step": 326
},
{
"epoch": 0.5523648648648649,
"grad_norm": 1.4357382541799415,
"learning_rate": 3.414213562373095e-06,
"loss": 0.6778,
"step": 327
},
{
"epoch": 0.5540540540540541,
"grad_norm": 1.325570913409981,
"learning_rate": 3.41032459580326e-06,
"loss": 0.7177,
"step": 328
},
{
"epoch": 0.5557432432432432,
"grad_norm": 1.4430530715112349,
"learning_rate": 3.4064249935230217e-06,
"loss": 0.6964,
"step": 329
},
{
"epoch": 0.5574324324324325,
"grad_norm": 1.4498604144957874,
"learning_rate": 3.4025147849405334e-06,
"loss": 0.7229,
"step": 330
},
{
"epoch": 0.5591216216216216,
"grad_norm": 1.4203134795507701,
"learning_rate": 3.3985939995439314e-06,
"loss": 0.718,
"step": 331
},
{
"epoch": 0.5608108108108109,
"grad_norm": 1.3911765358785673,
"learning_rate": 3.3946626669011175e-06,
"loss": 0.5948,
"step": 332
},
{
"epoch": 0.5625,
"grad_norm": 1.3184457107748184,
"learning_rate": 3.3907208166595326e-06,
"loss": 0.7103,
"step": 333
},
{
"epoch": 0.5641891891891891,
"grad_norm": 1.301270165189858,
"learning_rate": 3.3867684785459353e-06,
"loss": 0.6371,
"step": 334
},
{
"epoch": 0.5658783783783784,
"grad_norm": 1.454573205764328,
"learning_rate": 3.3828056823661754e-06,
"loss": 0.7071,
"step": 335
},
{
"epoch": 0.5675675675675675,
"grad_norm": 1.264644666852447,
"learning_rate": 3.378832458004969e-06,
"loss": 0.6851,
"step": 336
},
{
"epoch": 0.5692567567567568,
"grad_norm": 1.459099029368749,
"learning_rate": 3.3748488354256786e-06,
"loss": 0.6663,
"step": 337
},
{
"epoch": 0.5709459459459459,
"grad_norm": 1.5212632276053133,
"learning_rate": 3.370854844670079e-06,
"loss": 0.7275,
"step": 338
},
{
"epoch": 0.5726351351351351,
"grad_norm": 1.2987587957080753,
"learning_rate": 3.3668505158581376e-06,
"loss": 0.6521,
"step": 339
},
{
"epoch": 0.5743243243243243,
"grad_norm": 1.4304350175753886,
"learning_rate": 3.3628358791877826e-06,
"loss": 0.6774,
"step": 340
},
{
"epoch": 0.5760135135135135,
"grad_norm": 1.3183042000590088,
"learning_rate": 3.358810964934676e-06,
"loss": 0.6695,
"step": 341
},
{
"epoch": 0.5777027027027027,
"grad_norm": 1.3865225075934426,
"learning_rate": 3.3547758034519904e-06,
"loss": 0.6528,
"step": 342
},
{
"epoch": 0.5793918918918919,
"grad_norm": 1.312113624994429,
"learning_rate": 3.3507304251701724e-06,
"loss": 0.6622,
"step": 343
},
{
"epoch": 0.581081081081081,
"grad_norm": 1.9551062964719512,
"learning_rate": 3.3466748605967173e-06,
"loss": 0.7228,
"step": 344
},
{
"epoch": 0.5827702702702703,
"grad_norm": 1.488075428878002,
"learning_rate": 3.3426091403159404e-06,
"loss": 0.7062,
"step": 345
},
{
"epoch": 0.5844594594594594,
"grad_norm": 1.2927191218758096,
"learning_rate": 3.3385332949887426e-06,
"loss": 0.6142,
"step": 346
},
{
"epoch": 0.5861486486486487,
"grad_norm": 1.418534591843739,
"learning_rate": 3.334447355352381e-06,
"loss": 0.6474,
"step": 347
},
{
"epoch": 0.5878378378378378,
"grad_norm": 1.3562653998164598,
"learning_rate": 3.3303513522202396e-06,
"loss": 0.7144,
"step": 348
},
{
"epoch": 0.589527027027027,
"grad_norm": 1.4149402373315325,
"learning_rate": 3.3262453164815904e-06,
"loss": 0.629,
"step": 349
},
{
"epoch": 0.5912162162162162,
"grad_norm": 1.4220368804725194,
"learning_rate": 3.322129279101368e-06,
"loss": 0.6083,
"step": 350
},
{
"epoch": 0.5929054054054054,
"grad_norm": 1.4608723242530153,
"learning_rate": 3.3180032711199305e-06,
"loss": 0.7124,
"step": 351
},
{
"epoch": 0.5945945945945946,
"grad_norm": 1.3227326094937233,
"learning_rate": 3.3138673236528285e-06,
"loss": 0.7078,
"step": 352
},
{
"epoch": 0.5962837837837838,
"grad_norm": 1.379499039240384,
"learning_rate": 3.3097214678905703e-06,
"loss": 0.6387,
"step": 353
},
{
"epoch": 0.597972972972973,
"grad_norm": 1.2469548286035632,
"learning_rate": 3.305565735098383e-06,
"loss": 0.7052,
"step": 354
},
{
"epoch": 0.5996621621621622,
"grad_norm": 1.2848214472322932,
"learning_rate": 3.3014001566159823e-06,
"loss": 0.6886,
"step": 355
},
{
"epoch": 0.6013513513513513,
"grad_norm": 1.5153345614568845,
"learning_rate": 3.2972247638573326e-06,
"loss": 0.6655,
"step": 356
},
{
"epoch": 0.6030405405405406,
"grad_norm": 1.325640865438285,
"learning_rate": 3.2930395883104106e-06,
"loss": 0.7274,
"step": 357
},
{
"epoch": 0.6047297297297297,
"grad_norm": 1.3993390256389666,
"learning_rate": 3.2888446615369684e-06,
"loss": 0.6558,
"step": 358
},
{
"epoch": 0.606418918918919,
"grad_norm": 1.4018696537559865,
"learning_rate": 3.284640015172294e-06,
"loss": 0.71,
"step": 359
},
{
"epoch": 0.6081081081081081,
"grad_norm": 1.415708082259971,
"learning_rate": 3.280425680924976e-06,
"loss": 0.7008,
"step": 360
},
{
"epoch": 0.6097972972972973,
"grad_norm": 1.3763409840970744,
"learning_rate": 3.2762016905766614e-06,
"loss": 0.7022,
"step": 361
},
{
"epoch": 0.6114864864864865,
"grad_norm": 1.325496237946089,
"learning_rate": 3.271968075981817e-06,
"loss": 0.6589,
"step": 362
},
{
"epoch": 0.6131756756756757,
"grad_norm": 1.2826988616253867,
"learning_rate": 3.2677248690674903e-06,
"loss": 0.6855,
"step": 363
},
{
"epoch": 0.6148648648648649,
"grad_norm": 1.2591029878970108,
"learning_rate": 3.2634721018330638e-06,
"loss": 0.6219,
"step": 364
},
{
"epoch": 0.6165540540540541,
"grad_norm": 1.3387803144672588,
"learning_rate": 3.2592098063500222e-06,
"loss": 0.7039,
"step": 365
},
{
"epoch": 0.6182432432432432,
"grad_norm": 1.309308982930036,
"learning_rate": 3.2549380147617037e-06,
"loss": 0.6709,
"step": 366
},
{
"epoch": 0.6199324324324325,
"grad_norm": 1.3431163959736079,
"learning_rate": 3.2506567592830585e-06,
"loss": 0.6324,
"step": 367
},
{
"epoch": 0.6216216216216216,
"grad_norm": 1.4979970516420926,
"learning_rate": 3.246366072200409e-06,
"loss": 0.6709,
"step": 368
},
{
"epoch": 0.6233108108108109,
"grad_norm": 1.306386734821211,
"learning_rate": 3.2420659858712035e-06,
"loss": 0.6706,
"step": 369
},
{
"epoch": 0.625,
"grad_norm": 1.8009179522234993,
"learning_rate": 3.2377565327237727e-06,
"loss": 0.7755,
"step": 370
},
{
"epoch": 0.6266891891891891,
"grad_norm": 1.495289761525164,
"learning_rate": 3.2334377452570866e-06,
"loss": 0.6674,
"step": 371
},
{
"epoch": 0.6283783783783784,
"grad_norm": 1.348145783274342,
"learning_rate": 3.2291096560405055e-06,
"loss": 0.6731,
"step": 372
},
{
"epoch": 0.6300675675675675,
"grad_norm": 1.3341921892745692,
"learning_rate": 3.2247722977135416e-06,
"loss": 0.6764,
"step": 373
},
{
"epoch": 0.6317567567567568,
"grad_norm": 1.2841278378116288,
"learning_rate": 3.2204257029856054e-06,
"loss": 0.632,
"step": 374
},
{
"epoch": 0.6334459459459459,
"grad_norm": 1.720730722690385,
"learning_rate": 3.216069904635762e-06,
"loss": 0.7334,
"step": 375
},
{
"epoch": 0.6351351351351351,
"grad_norm": 1.3542485100514752,
"learning_rate": 3.2117049355124853e-06,
"loss": 0.7303,
"step": 376
},
{
"epoch": 0.6368243243243243,
"grad_norm": 1.2936690883966144,
"learning_rate": 3.207330828533408e-06,
"loss": 0.643,
"step": 377
},
{
"epoch": 0.6385135135135135,
"grad_norm": 1.3864749698122996,
"learning_rate": 3.2029476166850754e-06,
"loss": 0.705,
"step": 378
},
{
"epoch": 0.6402027027027027,
"grad_norm": 1.3256233676801643,
"learning_rate": 3.1985553330226935e-06,
"loss": 0.6869,
"step": 379
},
{
"epoch": 0.6418918918918919,
"grad_norm": 1.2944610218294335,
"learning_rate": 3.1941540106698846e-06,
"loss": 0.6514,
"step": 380
},
{
"epoch": 0.643581081081081,
"grad_norm": 1.501294959024127,
"learning_rate": 3.189743682818432e-06,
"loss": 0.6621,
"step": 381
},
{
"epoch": 0.6452702702702703,
"grad_norm": 1.3461382821528753,
"learning_rate": 3.1853243827280337e-06,
"loss": 0.6812,
"step": 382
},
{
"epoch": 0.6469594594594594,
"grad_norm": 1.3019577565523754,
"learning_rate": 3.1808961437260504e-06,
"loss": 0.7056,
"step": 383
},
{
"epoch": 0.6486486486486487,
"grad_norm": 1.3684415920358612,
"learning_rate": 3.176458999207252e-06,
"loss": 0.6613,
"step": 384
},
{
"epoch": 0.6503378378378378,
"grad_norm": 1.2594356862234934,
"learning_rate": 3.1720129826335723e-06,
"loss": 0.6383,
"step": 385
},
{
"epoch": 0.652027027027027,
"grad_norm": 1.3224364937940278,
"learning_rate": 3.167558127533847e-06,
"loss": 0.662,
"step": 386
},
{
"epoch": 0.6537162162162162,
"grad_norm": 1.361590839149971,
"learning_rate": 3.163094467503568e-06,
"loss": 0.6702,
"step": 387
},
{
"epoch": 0.6554054054054054,
"grad_norm": 1.3971962151286201,
"learning_rate": 3.1586220362046296e-06,
"loss": 0.6106,
"step": 388
},
{
"epoch": 0.6570945945945946,
"grad_norm": 1.3370080857838247,
"learning_rate": 3.15414086736507e-06,
"loss": 0.6868,
"step": 389
},
{
"epoch": 0.6587837837837838,
"grad_norm": 1.2272857766272367,
"learning_rate": 3.1496509947788235e-06,
"loss": 0.6709,
"step": 390
},
{
"epoch": 0.660472972972973,
"grad_norm": 1.3086690016897455,
"learning_rate": 3.145152452305458e-06,
"loss": 0.6632,
"step": 391
},
{
"epoch": 0.6621621621621622,
"grad_norm": 1.3851181280606764,
"learning_rate": 3.140645273869928e-06,
"loss": 0.6983,
"step": 392
},
{
"epoch": 0.6638513513513513,
"grad_norm": 1.5878523469615937,
"learning_rate": 3.136129493462311e-06,
"loss": 0.6712,
"step": 393
},
{
"epoch": 0.6655405405405406,
"grad_norm": 1.371150403456645,
"learning_rate": 3.1316051451375583e-06,
"loss": 0.684,
"step": 394
},
{
"epoch": 0.6672297297297297,
"grad_norm": 1.3555611551082325,
"learning_rate": 3.127072263015231e-06,
"loss": 0.7178,
"step": 395
},
{
"epoch": 0.668918918918919,
"grad_norm": 1.325289229316507,
"learning_rate": 3.122530881279248e-06,
"loss": 0.6923,
"step": 396
},
{
"epoch": 0.6706081081081081,
"grad_norm": 1.5525432364230671,
"learning_rate": 3.1179810341776267e-06,
"loss": 0.674,
"step": 397
},
{
"epoch": 0.6722972972972973,
"grad_norm": 1.3171395938666564,
"learning_rate": 3.113422756022225e-06,
"loss": 0.642,
"step": 398
},
{
"epoch": 0.6739864864864865,
"grad_norm": 1.3270242075423406,
"learning_rate": 3.108856081188481e-06,
"loss": 0.6707,
"step": 399
},
{
"epoch": 0.6756756756756757,
"grad_norm": 1.3484108066463232,
"learning_rate": 3.1042810441151553e-06,
"loss": 0.6379,
"step": 400
},
{
"epoch": 0.6773648648648649,
"grad_norm": 1.3309851729704076,
"learning_rate": 3.0996976793040695e-06,
"loss": 0.6682,
"step": 401
},
{
"epoch": 0.6790540540540541,
"grad_norm": 1.2652506659655327,
"learning_rate": 3.095106021319851e-06,
"loss": 0.6847,
"step": 402
},
{
"epoch": 0.6807432432432432,
"grad_norm": 1.3081964276113105,
"learning_rate": 3.0905061047896643e-06,
"loss": 0.6441,
"step": 403
},
{
"epoch": 0.6824324324324325,
"grad_norm": 1.35647381776027,
"learning_rate": 3.0858979644029575e-06,
"loss": 0.7234,
"step": 404
},
{
"epoch": 0.6841216216216216,
"grad_norm": 1.372565244390501,
"learning_rate": 3.0812816349111954e-06,
"loss": 0.6534,
"step": 405
},
{
"epoch": 0.6858108108108109,
"grad_norm": 1.33328522768366,
"learning_rate": 3.0766571511276002e-06,
"loss": 0.6732,
"step": 406
},
{
"epoch": 0.6875,
"grad_norm": 1.320063333364407,
"learning_rate": 3.0720245479268884e-06,
"loss": 0.6463,
"step": 407
},
{
"epoch": 0.6891891891891891,
"grad_norm": 1.281578342443152,
"learning_rate": 3.0673838602450085e-06,
"loss": 0.6341,
"step": 408
},
{
"epoch": 0.6908783783783784,
"grad_norm": 1.81611714342352,
"learning_rate": 3.0627351230788744e-06,
"loss": 0.6715,
"step": 409
},
{
"epoch": 0.6925675675675675,
"grad_norm": 1.4517692849833734,
"learning_rate": 3.0580783714861054e-06,
"loss": 0.6293,
"step": 410
},
{
"epoch": 0.6942567567567568,
"grad_norm": 1.2955581967933887,
"learning_rate": 3.05341364058476e-06,
"loss": 0.6706,
"step": 411
},
{
"epoch": 0.6959459459459459,
"grad_norm": 1.2989161044167714,
"learning_rate": 3.0487409655530706e-06,
"loss": 0.6485,
"step": 412
},
{
"epoch": 0.6976351351351351,
"grad_norm": 1.3016968351593294,
"learning_rate": 3.0440603816291807e-06,
"loss": 0.6685,
"step": 413
},
{
"epoch": 0.6993243243243243,
"grad_norm": 1.3145081529605454,
"learning_rate": 3.0393719241108735e-06,
"loss": 0.6469,
"step": 414
},
{
"epoch": 0.7010135135135135,
"grad_norm": 1.3639479980615052,
"learning_rate": 3.0346756283553134e-06,
"loss": 0.653,
"step": 415
},
{
"epoch": 0.7027027027027027,
"grad_norm": 1.318013354050016,
"learning_rate": 3.0299715297787737e-06,
"loss": 0.7239,
"step": 416
},
{
"epoch": 0.7043918918918919,
"grad_norm": 1.341591059459244,
"learning_rate": 3.025259663856371e-06,
"loss": 0.6943,
"step": 417
},
{
"epoch": 0.706081081081081,
"grad_norm": 1.3721630646268186,
"learning_rate": 3.0205400661217995e-06,
"loss": 0.6693,
"step": 418
},
{
"epoch": 0.7077702702702703,
"grad_norm": 1.3952679062107705,
"learning_rate": 3.0158127721670584e-06,
"loss": 0.7067,
"step": 419
},
{
"epoch": 0.7094594594594594,
"grad_norm": 1.262343902848384,
"learning_rate": 3.0110778176421913e-06,
"loss": 0.6788,
"step": 420
},
{
"epoch": 0.7111486486486487,
"grad_norm": 1.3420851582874196,
"learning_rate": 3.0063352382550074e-06,
"loss": 0.6689,
"step": 421
},
{
"epoch": 0.7128378378378378,
"grad_norm": 1.3791855105828084,
"learning_rate": 3.0015850697708217e-06,
"loss": 0.6805,
"step": 422
},
{
"epoch": 0.714527027027027,
"grad_norm": 1.5875407253674527,
"learning_rate": 2.996827348012178e-06,
"loss": 0.687,
"step": 423
},
{
"epoch": 0.7162162162162162,
"grad_norm": 1.3650745130884876,
"learning_rate": 2.992062108858584e-06,
"loss": 0.6995,
"step": 424
},
{
"epoch": 0.7179054054054054,
"grad_norm": 1.4782912422272074,
"learning_rate": 2.987289388246237e-06,
"loss": 0.7213,
"step": 425
},
{
"epoch": 0.7195945945945946,
"grad_norm": 1.3549493700401416,
"learning_rate": 2.9825092221677545e-06,
"loss": 0.6851,
"step": 426
},
{
"epoch": 0.7212837837837838,
"grad_norm": 1.3065450320634688,
"learning_rate": 2.9777216466719036e-06,
"loss": 0.6733,
"step": 427
},
{
"epoch": 0.722972972972973,
"grad_norm": 1.3874606374901184,
"learning_rate": 2.972926697863328e-06,
"loss": 0.6551,
"step": 428
},
{
"epoch": 0.7246621621621622,
"grad_norm": 1.2251403325861798,
"learning_rate": 2.968124411902275e-06,
"loss": 0.6603,
"step": 429
},
{
"epoch": 0.7263513513513513,
"grad_norm": 1.4125671966195497,
"learning_rate": 2.9633148250043236e-06,
"loss": 0.6989,
"step": 430
},
{
"epoch": 0.7280405405405406,
"grad_norm": 1.3582792827647256,
"learning_rate": 2.9584979734401135e-06,
"loss": 0.7313,
"step": 431
},
{
"epoch": 0.7297297297297297,
"grad_norm": 1.4040248422818056,
"learning_rate": 2.953673893535067e-06,
"loss": 0.6655,
"step": 432
},
{
"epoch": 0.731418918918919,
"grad_norm": 1.3542440874977937,
"learning_rate": 2.9488426216691204e-06,
"loss": 0.7208,
"step": 433
},
{
"epoch": 0.7331081081081081,
"grad_norm": 1.6421535799103113,
"learning_rate": 2.9440041942764443e-06,
"loss": 0.6847,
"step": 434
},
{
"epoch": 0.7347972972972973,
"grad_norm": 1.3015301627339435,
"learning_rate": 2.9391586478451726e-06,
"loss": 0.6793,
"step": 435
},
{
"epoch": 0.7364864864864865,
"grad_norm": 1.3756341557406138,
"learning_rate": 2.934306018917126e-06,
"loss": 0.6719,
"step": 436
},
{
"epoch": 0.7381756756756757,
"grad_norm": 1.3684325540715159,
"learning_rate": 2.929446344087537e-06,
"loss": 0.6926,
"step": 437
},
{
"epoch": 0.7398648648648649,
"grad_norm": 1.348921387693639,
"learning_rate": 2.924579660004773e-06,
"loss": 0.6538,
"step": 438
},
{
"epoch": 0.7415540540540541,
"grad_norm": 1.4451251249573025,
"learning_rate": 2.9197060033700603e-06,
"loss": 0.6952,
"step": 439
},
{
"epoch": 0.7432432432432432,
"grad_norm": 1.435659952122345,
"learning_rate": 2.914825410937208e-06,
"loss": 0.7609,
"step": 440
},
{
"epoch": 0.7449324324324325,
"grad_norm": 1.463004294888883,
"learning_rate": 2.90993791951233e-06,
"loss": 0.6777,
"step": 441
},
{
"epoch": 0.7466216216216216,
"grad_norm": 1.2911426285111054,
"learning_rate": 2.9050435659535678e-06,
"loss": 0.6805,
"step": 442
},
{
"epoch": 0.7483108108108109,
"grad_norm": 1.380380264635941,
"learning_rate": 2.900142387170812e-06,
"loss": 0.7359,
"step": 443
},
{
"epoch": 0.75,
"grad_norm": 1.4287814635062361,
"learning_rate": 2.895234420125425e-06,
"loss": 0.6564,
"step": 444
},
{
"epoch": 0.7516891891891891,
"grad_norm": 1.364150218756707,
"learning_rate": 2.8903197018299613e-06,
"loss": 0.6779,
"step": 445
},
{
"epoch": 0.7533783783783784,
"grad_norm": 1.6302546987919042,
"learning_rate": 2.8853982693478895e-06,
"loss": 0.6686,
"step": 446
},
{
"epoch": 0.7550675675675675,
"grad_norm": 1.4176851311608176,
"learning_rate": 2.8804701597933108e-06,
"loss": 0.7193,
"step": 447
},
{
"epoch": 0.7567567567567568,
"grad_norm": 1.3980473477530322,
"learning_rate": 2.8755354103306806e-06,
"loss": 0.6763,
"step": 448
},
{
"epoch": 0.7584459459459459,
"grad_norm": 1.306040542269856,
"learning_rate": 2.87059405817453e-06,
"loss": 0.6565,
"step": 449
},
{
"epoch": 0.7601351351351351,
"grad_norm": 1.421190776938713,
"learning_rate": 2.8656461405891794e-06,
"loss": 0.6544,
"step": 450
},
{
"epoch": 0.7618243243243243,
"grad_norm": 1.3558925930101302,
"learning_rate": 2.8606916948884644e-06,
"loss": 0.6992,
"step": 451
},
{
"epoch": 0.7635135135135135,
"grad_norm": 1.3597956638790247,
"learning_rate": 2.85573075843545e-06,
"loss": 0.7236,
"step": 452
},
{
"epoch": 0.7652027027027027,
"grad_norm": 1.2642452651402398,
"learning_rate": 2.8507633686421496e-06,
"loss": 0.665,
"step": 453
},
{
"epoch": 0.7668918918918919,
"grad_norm": 1.318361028472153,
"learning_rate": 2.845789562969245e-06,
"loss": 0.701,
"step": 454
},
{
"epoch": 0.768581081081081,
"grad_norm": 1.3360445051985224,
"learning_rate": 2.8408093789258e-06,
"loss": 0.6707,
"step": 455
},
{
"epoch": 0.7702702702702703,
"grad_norm": 1.2691241582564996,
"learning_rate": 2.8358228540689812e-06,
"loss": 0.6077,
"step": 456
},
{
"epoch": 0.7719594594594594,
"grad_norm": 1.323540568143679,
"learning_rate": 2.830830026003773e-06,
"loss": 0.7048,
"step": 457
},
{
"epoch": 0.7736486486486487,
"grad_norm": 1.2731559364058054,
"learning_rate": 2.825830932382694e-06,
"loss": 0.6104,
"step": 458
},
{
"epoch": 0.7753378378378378,
"grad_norm": 1.3118198520290825,
"learning_rate": 2.820825610905514e-06,
"loss": 0.6426,
"step": 459
},
{
"epoch": 0.777027027027027,
"grad_norm": 1.371723875417946,
"learning_rate": 2.815814099318968e-06,
"loss": 0.7205,
"step": 460
},
{
"epoch": 0.7787162162162162,
"grad_norm": 1.2988247931459596,
"learning_rate": 2.810796435416473e-06,
"loss": 0.6985,
"step": 461
},
{
"epoch": 0.7804054054054054,
"grad_norm": 1.3027800359441715,
"learning_rate": 2.8057726570378447e-06,
"loss": 0.7064,
"step": 462
},
{
"epoch": 0.7820945945945946,
"grad_norm": 1.299894099802323,
"learning_rate": 2.800742802069006e-06,
"loss": 0.628,
"step": 463
},
{
"epoch": 0.7837837837837838,
"grad_norm": 1.4642656243160261,
"learning_rate": 2.7957069084417093e-06,
"loss": 0.6943,
"step": 464
},
{
"epoch": 0.785472972972973,
"grad_norm": 1.3308107990437001,
"learning_rate": 2.7906650141332427e-06,
"loss": 0.7128,
"step": 465
},
{
"epoch": 0.7871621621621622,
"grad_norm": 1.3874905549157548,
"learning_rate": 2.7856171571661514e-06,
"loss": 0.6766,
"step": 466
},
{
"epoch": 0.7888513513513513,
"grad_norm": 1.3259468751863204,
"learning_rate": 2.7805633756079426e-06,
"loss": 0.6743,
"step": 467
},
{
"epoch": 0.7905405405405406,
"grad_norm": 1.3215238136524157,
"learning_rate": 2.775503707570808e-06,
"loss": 0.6575,
"step": 468
},
{
"epoch": 0.7922297297297297,
"grad_norm": 1.2709398810755754,
"learning_rate": 2.7704381912113245e-06,
"loss": 0.6739,
"step": 469
},
{
"epoch": 0.793918918918919,
"grad_norm": 1.3037615494242192,
"learning_rate": 2.7653668647301796e-06,
"loss": 0.7025,
"step": 470
},
{
"epoch": 0.7956081081081081,
"grad_norm": 1.2917843369183553,
"learning_rate": 2.7602897663718725e-06,
"loss": 0.6327,
"step": 471
},
{
"epoch": 0.7972972972972973,
"grad_norm": 1.309216686164246,
"learning_rate": 2.755206934424431e-06,
"loss": 0.6277,
"step": 472
},
{
"epoch": 0.7989864864864865,
"grad_norm": 1.371910975646893,
"learning_rate": 2.7501184072191237e-06,
"loss": 0.6407,
"step": 473
},
{
"epoch": 0.8006756756756757,
"grad_norm": 1.3727580581951706,
"learning_rate": 2.7450242231301655e-06,
"loss": 0.6536,
"step": 474
},
{
"epoch": 0.8023648648648649,
"grad_norm": 1.4020595724687033,
"learning_rate": 2.7399244205744347e-06,
"loss": 0.6807,
"step": 475
},
{
"epoch": 0.8040540540540541,
"grad_norm": 1.4148426490517583,
"learning_rate": 2.734819038011179e-06,
"loss": 0.7045,
"step": 476
},
{
"epoch": 0.8057432432432432,
"grad_norm": 1.3409500564912207,
"learning_rate": 2.729708113941727e-06,
"loss": 0.6991,
"step": 477
},
{
"epoch": 0.8074324324324325,
"grad_norm": 1.3823977849949307,
"learning_rate": 2.724591686909196e-06,
"loss": 0.6594,
"step": 478
},
{
"epoch": 0.8091216216216216,
"grad_norm": 1.2394815303889972,
"learning_rate": 2.719469795498206e-06,
"loss": 0.6467,
"step": 479
},
{
"epoch": 0.8108108108108109,
"grad_norm": 1.281104807999005,
"learning_rate": 2.714342478334583e-06,
"loss": 0.6547,
"step": 480
},
{
"epoch": 0.8125,
"grad_norm": 1.3287359372486716,
"learning_rate": 2.709209774085071e-06,
"loss": 0.6273,
"step": 481
},
{
"epoch": 0.8141891891891891,
"grad_norm": 1.3379278170149733,
"learning_rate": 2.7040717214570415e-06,
"loss": 0.646,
"step": 482
},
{
"epoch": 0.8158783783783784,
"grad_norm": 1.2926999526730814,
"learning_rate": 2.698928359198197e-06,
"loss": 0.6677,
"step": 483
},
{
"epoch": 0.8175675675675675,
"grad_norm": 1.3127200141418005,
"learning_rate": 2.693779726096283e-06,
"loss": 0.6618,
"step": 484
},
{
"epoch": 0.8192567567567568,
"grad_norm": 1.3197962107489793,
"learning_rate": 2.6886258609787946e-06,
"loss": 0.6282,
"step": 485
},
{
"epoch": 0.8209459459459459,
"grad_norm": 1.2848795428712034,
"learning_rate": 2.683466802712683e-06,
"loss": 0.6269,
"step": 486
},
{
"epoch": 0.8226351351351351,
"grad_norm": 1.3093990553658448,
"learning_rate": 2.678302590204062e-06,
"loss": 0.6661,
"step": 487
},
{
"epoch": 0.8243243243243243,
"grad_norm": 1.3439659038952452,
"learning_rate": 2.6731332623979154e-06,
"loss": 0.6191,
"step": 488
},
{
"epoch": 0.8260135135135135,
"grad_norm": 1.3151017233480091,
"learning_rate": 2.6679588582778024e-06,
"loss": 0.692,
"step": 489
},
{
"epoch": 0.8277027027027027,
"grad_norm": 1.367520065635975,
"learning_rate": 2.662779416865567e-06,
"loss": 0.6235,
"step": 490
},
{
"epoch": 0.8293918918918919,
"grad_norm": 1.2805691498538,
"learning_rate": 2.6575949772210376e-06,
"loss": 0.6885,
"step": 491
},
{
"epoch": 0.831081081081081,
"grad_norm": 1.3093657104644763,
"learning_rate": 2.6524055784417386e-06,
"loss": 0.6832,
"step": 492
},
{
"epoch": 0.8327702702702703,
"grad_norm": 1.2719434085445112,
"learning_rate": 2.6472112596625912e-06,
"loss": 0.6358,
"step": 493
},
{
"epoch": 0.8344594594594594,
"grad_norm": 1.3289440001002792,
"learning_rate": 2.642012060055619e-06,
"loss": 0.6951,
"step": 494
},
{
"epoch": 0.8361486486486487,
"grad_norm": 1.4234708370546796,
"learning_rate": 2.6368080188296577e-06,
"loss": 0.6366,
"step": 495
},
{
"epoch": 0.8378378378378378,
"grad_norm": 1.3379252399438568,
"learning_rate": 2.63159917523005e-06,
"loss": 0.6828,
"step": 496
},
{
"epoch": 0.839527027027027,
"grad_norm": 1.2971666902550303,
"learning_rate": 2.626385568538358e-06,
"loss": 0.6548,
"step": 497
},
{
"epoch": 0.8412162162162162,
"grad_norm": 1.319788315724304,
"learning_rate": 2.6211672380720625e-06,
"loss": 0.675,
"step": 498
},
{
"epoch": 0.8429054054054054,
"grad_norm": 1.3693649054808243,
"learning_rate": 2.6159442231842693e-06,
"loss": 0.6685,
"step": 499
},
{
"epoch": 0.8445945945945946,
"grad_norm": 1.2826721891793782,
"learning_rate": 2.6107165632634098e-06,
"loss": 0.6231,
"step": 500
},
{
"epoch": 0.8462837837837838,
"grad_norm": 1.3927785373092278,
"learning_rate": 2.605484297732944e-06,
"loss": 0.696,
"step": 501
},
{
"epoch": 0.847972972972973,
"grad_norm": 1.2238179436862426,
"learning_rate": 2.6002474660510665e-06,
"loss": 0.599,
"step": 502
},
{
"epoch": 0.8496621621621622,
"grad_norm": 1.3395786217070758,
"learning_rate": 2.595006107710406e-06,
"loss": 0.6785,
"step": 503
},
{
"epoch": 0.8513513513513513,
"grad_norm": 1.3197500690461326,
"learning_rate": 2.5897602622377272e-06,
"loss": 0.6828,
"step": 504
},
{
"epoch": 0.8530405405405406,
"grad_norm": 1.2526702269554713,
"learning_rate": 2.5845099691936343e-06,
"loss": 0.6678,
"step": 505
},
{
"epoch": 0.8547297297297297,
"grad_norm": 1.284205313573379,
"learning_rate": 2.579255268172273e-06,
"loss": 0.6653,
"step": 506
},
{
"epoch": 0.856418918918919,
"grad_norm": 1.2411561149505208,
"learning_rate": 2.573996198801029e-06,
"loss": 0.6274,
"step": 507
},
{
"epoch": 0.8581081081081081,
"grad_norm": 1.2770163226003766,
"learning_rate": 2.568732800740233e-06,
"loss": 0.6527,
"step": 508
},
{
"epoch": 0.8597972972972973,
"grad_norm": 1.3037995073619448,
"learning_rate": 2.5634651136828594e-06,
"loss": 0.6832,
"step": 509
},
{
"epoch": 0.8614864864864865,
"grad_norm": 1.3305574812358596,
"learning_rate": 2.5581931773542263e-06,
"loss": 0.6716,
"step": 510
},
{
"epoch": 0.8631756756756757,
"grad_norm": 1.2699825016633381,
"learning_rate": 2.552917031511697e-06,
"loss": 0.6807,
"step": 511
},
{
"epoch": 0.8648648648648649,
"grad_norm": 1.384721350284805,
"learning_rate": 2.547636715944382e-06,
"loss": 0.7128,
"step": 512
},
{
"epoch": 0.8665540540540541,
"grad_norm": 1.2962160985630566,
"learning_rate": 2.542352270472834e-06,
"loss": 0.7097,
"step": 513
},
{
"epoch": 0.8682432432432432,
"grad_norm": 1.3315884714121142,
"learning_rate": 2.5370637349487537e-06,
"loss": 0.7239,
"step": 514
},
{
"epoch": 0.8699324324324325,
"grad_norm": 1.29687671033792,
"learning_rate": 2.5317711492546836e-06,
"loss": 0.6688,
"step": 515
},
{
"epoch": 0.8716216216216216,
"grad_norm": 1.2711205250804978,
"learning_rate": 2.5264745533037123e-06,
"loss": 0.6742,
"step": 516
},
{
"epoch": 0.8733108108108109,
"grad_norm": 1.2996812621542553,
"learning_rate": 2.521173987039169e-06,
"loss": 0.6811,
"step": 517
},
{
"epoch": 0.875,
"grad_norm": 1.6950421844669081,
"learning_rate": 2.5158694904343246e-06,
"loss": 0.6618,
"step": 518
},
{
"epoch": 0.8766891891891891,
"grad_norm": 1.3193723681629466,
"learning_rate": 2.510561103492091e-06,
"loss": 0.6402,
"step": 519
},
{
"epoch": 0.8783783783783784,
"grad_norm": 1.2695628123088936,
"learning_rate": 2.505248866244718e-06,
"loss": 0.6831,
"step": 520
},
{
"epoch": 0.8800675675675675,
"grad_norm": 1.2787241205061413,
"learning_rate": 2.4999328187534915e-06,
"loss": 0.6802,
"step": 521
},
{
"epoch": 0.8817567567567568,
"grad_norm": 1.300609736482222,
"learning_rate": 2.4946130011084306e-06,
"loss": 0.6546,
"step": 522
},
{
"epoch": 0.8834459459459459,
"grad_norm": 1.3468074007186794,
"learning_rate": 2.489289453427989e-06,
"loss": 0.6704,
"step": 523
},
{
"epoch": 0.8851351351351351,
"grad_norm": 1.3306700582716369,
"learning_rate": 2.483962215858748e-06,
"loss": 0.6587,
"step": 524
},
{
"epoch": 0.8868243243243243,
"grad_norm": 1.339892354934288,
"learning_rate": 2.4786313285751155e-06,
"loss": 0.6632,
"step": 525
},
{
"epoch": 0.8885135135135135,
"grad_norm": 1.4002140225966508,
"learning_rate": 2.473296831779023e-06,
"loss": 0.6419,
"step": 526
},
{
"epoch": 0.8902027027027027,
"grad_norm": 1.288664747488742,
"learning_rate": 2.4679587656996235e-06,
"loss": 0.6371,
"step": 527
},
{
"epoch": 0.8918918918918919,
"grad_norm": 1.3734784253236045,
"learning_rate": 2.462617170592987e-06,
"loss": 0.6561,
"step": 528
},
{
"epoch": 0.893581081081081,
"grad_norm": 1.2050784201474516,
"learning_rate": 2.4572720867417945e-06,
"loss": 0.632,
"step": 529
},
{
"epoch": 0.8952702702702703,
"grad_norm": 1.3132258094685767,
"learning_rate": 2.4519235544550412e-06,
"loss": 0.6538,
"step": 530
},
{
"epoch": 0.8969594594594594,
"grad_norm": 1.337760432140806,
"learning_rate": 2.4465716140677234e-06,
"loss": 0.7156,
"step": 531
},
{
"epoch": 0.8986486486486487,
"grad_norm": 1.2666914241716667,
"learning_rate": 2.4412163059405435e-06,
"loss": 0.6577,
"step": 532
},
{
"epoch": 0.9003378378378378,
"grad_norm": 1.2823601807164624,
"learning_rate": 2.4358576704595965e-06,
"loss": 0.697,
"step": 533
},
{
"epoch": 0.902027027027027,
"grad_norm": 1.3367451920880444,
"learning_rate": 2.4304957480360744e-06,
"loss": 0.6527,
"step": 534
},
{
"epoch": 0.9037162162162162,
"grad_norm": 1.3739268155417224,
"learning_rate": 2.425130579105953e-06,
"loss": 0.7149,
"step": 535
},
{
"epoch": 0.9054054054054054,
"grad_norm": 1.329350014321296,
"learning_rate": 2.419762204129695e-06,
"loss": 0.7081,
"step": 536
},
{
"epoch": 0.9070945945945946,
"grad_norm": 1.2638498827293603,
"learning_rate": 2.414390663591938e-06,
"loss": 0.6665,
"step": 537
},
{
"epoch": 0.9087837837837838,
"grad_norm": 1.2762258266716298,
"learning_rate": 2.4090159980011934e-06,
"loss": 0.6079,
"step": 538
},
{
"epoch": 0.910472972972973,
"grad_norm": 1.3002809021905946,
"learning_rate": 2.4036382478895393e-06,
"loss": 0.655,
"step": 539
},
{
"epoch": 0.9121621621621622,
"grad_norm": 1.3593466428722907,
"learning_rate": 2.398257453812315e-06,
"loss": 0.6471,
"step": 540
},
{
"epoch": 0.9138513513513513,
"grad_norm": 1.3217770368871993,
"learning_rate": 2.392873656347815e-06,
"loss": 0.6376,
"step": 541
},
{
"epoch": 0.9155405405405406,
"grad_norm": 1.357348425007889,
"learning_rate": 2.387486896096986e-06,
"loss": 0.7056,
"step": 542
},
{
"epoch": 0.9172297297297297,
"grad_norm": 1.3138607334781762,
"learning_rate": 2.382097213683114e-06,
"loss": 0.7244,
"step": 543
},
{
"epoch": 0.918918918918919,
"grad_norm": 1.2943145130114848,
"learning_rate": 2.3767046497515235e-06,
"loss": 0.6455,
"step": 544
},
{
"epoch": 0.9206081081081081,
"grad_norm": 1.3329448183257002,
"learning_rate": 2.3713092449692705e-06,
"loss": 0.6655,
"step": 545
},
{
"epoch": 0.9222972972972973,
"grad_norm": 1.3827889113122052,
"learning_rate": 2.365911040024835e-06,
"loss": 0.6085,
"step": 546
},
{
"epoch": 0.9239864864864865,
"grad_norm": 1.2873550101593443,
"learning_rate": 2.3605100756278114e-06,
"loss": 0.6496,
"step": 547
},
{
"epoch": 0.9256756756756757,
"grad_norm": 1.4635505647607276,
"learning_rate": 2.355106392508607e-06,
"loss": 0.7255,
"step": 548
},
{
"epoch": 0.9273648648648649,
"grad_norm": 1.274562634214393,
"learning_rate": 2.349700031418129e-06,
"loss": 0.622,
"step": 549
},
{
"epoch": 0.9290540540540541,
"grad_norm": 1.2916834992250223,
"learning_rate": 2.344291033127482e-06,
"loss": 0.6483,
"step": 550
},
{
"epoch": 0.9307432432432432,
"grad_norm": 1.2974562634280429,
"learning_rate": 2.338879438427659e-06,
"loss": 0.6368,
"step": 551
},
{
"epoch": 0.9324324324324325,
"grad_norm": 1.3244349894986713,
"learning_rate": 2.333465288129231e-06,
"loss": 0.6852,
"step": 552
},
{
"epoch": 0.9341216216216216,
"grad_norm": 1.3151764611776746,
"learning_rate": 2.3280486230620433e-06,
"loss": 0.6805,
"step": 553
},
{
"epoch": 0.9358108108108109,
"grad_norm": 1.2843621068360367,
"learning_rate": 2.322629484074907e-06,
"loss": 0.6746,
"step": 554
},
{
"epoch": 0.9375,
"grad_norm": 1.3014490689776925,
"learning_rate": 2.3172079120352865e-06,
"loss": 0.6901,
"step": 555
},
{
"epoch": 0.9391891891891891,
"grad_norm": 1.6358853991358127,
"learning_rate": 2.3117839478289983e-06,
"loss": 0.6609,
"step": 556
},
{
"epoch": 0.9408783783783784,
"grad_norm": 1.4871449836737003,
"learning_rate": 2.3063576323598955e-06,
"loss": 0.663,
"step": 557
},
{
"epoch": 0.9425675675675675,
"grad_norm": 1.2851645298579233,
"learning_rate": 2.3009290065495662e-06,
"loss": 0.6648,
"step": 558
},
{
"epoch": 0.9442567567567568,
"grad_norm": 1.3989687257878538,
"learning_rate": 2.2954981113370182e-06,
"loss": 0.6431,
"step": 559
},
{
"epoch": 0.9459459459459459,
"grad_norm": 1.4550664360042096,
"learning_rate": 2.290064987678377e-06,
"loss": 0.6861,
"step": 560
},
{
"epoch": 0.9476351351351351,
"grad_norm": 1.3725831272335538,
"learning_rate": 2.2846296765465706e-06,
"loss": 0.6985,
"step": 561
},
{
"epoch": 0.9493243243243243,
"grad_norm": 1.3575443413890222,
"learning_rate": 2.2791922189310244e-06,
"loss": 0.6626,
"step": 562
},
{
"epoch": 0.9510135135135135,
"grad_norm": 1.256005428809596,
"learning_rate": 2.2737526558373527e-06,
"loss": 0.654,
"step": 563
},
{
"epoch": 0.9527027027027027,
"grad_norm": 1.4238550908429117,
"learning_rate": 2.268311028287045e-06,
"loss": 0.6715,
"step": 564
},
{
"epoch": 0.9543918918918919,
"grad_norm": 1.3317043187800373,
"learning_rate": 2.262867377317163e-06,
"loss": 0.65,
"step": 565
},
{
"epoch": 0.956081081081081,
"grad_norm": 1.3154947523106852,
"learning_rate": 2.257421743980024e-06,
"loss": 0.6567,
"step": 566
},
{
"epoch": 0.9577702702702703,
"grad_norm": 1.2964749823164652,
"learning_rate": 2.2519741693428976e-06,
"loss": 0.6065,
"step": 567
},
{
"epoch": 0.9594594594594594,
"grad_norm": 1.2614425014750057,
"learning_rate": 2.246524694487692e-06,
"loss": 0.6928,
"step": 568
},
{
"epoch": 0.9611486486486487,
"grad_norm": 1.3147013825054574,
"learning_rate": 2.2410733605106456e-06,
"loss": 0.6271,
"step": 569
},
{
"epoch": 0.9628378378378378,
"grad_norm": 1.2464850380254473,
"learning_rate": 2.235620208522019e-06,
"loss": 0.6187,
"step": 570
},
{
"epoch": 0.964527027027027,
"grad_norm": 1.3118794490711199,
"learning_rate": 2.2301652796457807e-06,
"loss": 0.6434,
"step": 571
},
{
"epoch": 0.9662162162162162,
"grad_norm": 1.3652152462118168,
"learning_rate": 2.2247086150192997e-06,
"loss": 0.6581,
"step": 572
},
{
"epoch": 0.9679054054054054,
"grad_norm": 1.4431845392319815,
"learning_rate": 2.2192502557930343e-06,
"loss": 0.6747,
"step": 573
},
{
"epoch": 0.9695945945945946,
"grad_norm": 1.333305348582229,
"learning_rate": 2.213790243130226e-06,
"loss": 0.6562,
"step": 574
},
{
"epoch": 0.9712837837837838,
"grad_norm": 1.3059868201157803,
"learning_rate": 2.20832861820658e-06,
"loss": 0.6073,
"step": 575
},
{
"epoch": 0.972972972972973,
"grad_norm": 1.255203354957442,
"learning_rate": 2.202865422209963e-06,
"loss": 0.6399,
"step": 576
},
{
"epoch": 0.9746621621621622,
"grad_norm": 1.2966274532372035,
"learning_rate": 2.197400696340091e-06,
"loss": 0.6904,
"step": 577
},
{
"epoch": 0.9763513513513513,
"grad_norm": 1.342746601472998,
"learning_rate": 2.1919344818082144e-06,
"loss": 0.7327,
"step": 578
},
{
"epoch": 0.9780405405405406,
"grad_norm": 1.271167602819476,
"learning_rate": 2.1864668198368116e-06,
"loss": 0.6235,
"step": 579
},
{
"epoch": 0.9797297297297297,
"grad_norm": 1.2704298842023867,
"learning_rate": 2.1809977516592758e-06,
"loss": 0.6397,
"step": 580
},
{
"epoch": 0.981418918918919,
"grad_norm": 1.4659602830194913,
"learning_rate": 2.175527318519606e-06,
"loss": 0.6375,
"step": 581
},
{
"epoch": 0.9831081081081081,
"grad_norm": 1.2837108523299343,
"learning_rate": 2.1700555616720934e-06,
"loss": 0.5946,
"step": 582
},
{
"epoch": 0.9847972972972973,
"grad_norm": 1.3258335291339656,
"learning_rate": 2.1645825223810135e-06,
"loss": 0.646,
"step": 583
},
{
"epoch": 0.9864864864864865,
"grad_norm": 1.381249010402763,
"learning_rate": 2.159108241920312e-06,
"loss": 0.6993,
"step": 584
},
{
"epoch": 0.9881756756756757,
"grad_norm": 1.3321089783532771,
"learning_rate": 2.1536327615732937e-06,
"loss": 0.637,
"step": 585
},
{
"epoch": 0.9898648648648649,
"grad_norm": 1.3022198415919726,
"learning_rate": 2.148156122632314e-06,
"loss": 0.6623,
"step": 586
},
{
"epoch": 0.9915540540540541,
"grad_norm": 1.2277665889364764,
"learning_rate": 2.1426783663984645e-06,
"loss": 0.6595,
"step": 587
},
{
"epoch": 0.9932432432432432,
"grad_norm": 1.3098439041046739,
"learning_rate": 2.1371995341812636e-06,
"loss": 0.6805,
"step": 588
},
{
"epoch": 0.9949324324324325,
"grad_norm": 1.3300371685472199,
"learning_rate": 2.1317196672983425e-06,
"loss": 0.6783,
"step": 589
},
{
"epoch": 0.9966216216216216,
"grad_norm": 1.3496135502588562,
"learning_rate": 2.126238807075137e-06,
"loss": 0.6528,
"step": 590
},
{
"epoch": 0.9983108108108109,
"grad_norm": 1.361261277820641,
"learning_rate": 2.120756994844572e-06,
"loss": 0.6694,
"step": 591
},
{
"epoch": 1.0,
"grad_norm": 1.3306654265805096,
"learning_rate": 2.115274271946754e-06,
"loss": 0.7216,
"step": 592
},
{
"epoch": 1.0,
"eval_loss": 0.5579404234886169,
"eval_runtime": 948.1228,
"eval_samples_per_second": 5.748,
"eval_steps_per_second": 0.36,
"step": 592
}
],
"logging_steps": 1,
"max_steps": 1184,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 296,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.25646117018665e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}