{
"best_global_step": 375,
"best_metric": 2.8448235988616943,
"best_model_checkpoint": "outputs/checkpoint-375",
"epoch": 14.970873786407767,
"eval_steps": 500,
"global_step": 375,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.038834951456310676,
"grad_norm": 21.30242156982422,
"learning_rate": 0.0,
"loss": 6.5474,
"step": 1
},
{
"epoch": 0.07766990291262135,
"grad_norm": 20.775470733642578,
"learning_rate": 2.0000000000000002e-07,
"loss": 6.5613,
"step": 2
},
{
"epoch": 0.11650485436893204,
"grad_norm": 20.96541976928711,
"learning_rate": 4.0000000000000003e-07,
"loss": 6.5127,
"step": 3
},
{
"epoch": 0.1553398058252427,
"grad_norm": 20.376543045043945,
"learning_rate": 6.000000000000001e-07,
"loss": 6.4569,
"step": 4
},
{
"epoch": 0.1941747572815534,
"grad_norm": 19.54267692565918,
"learning_rate": 8.000000000000001e-07,
"loss": 6.3743,
"step": 5
},
{
"epoch": 0.23300970873786409,
"grad_norm": 19.233882904052734,
"learning_rate": 1.0000000000000002e-06,
"loss": 6.3899,
"step": 6
},
{
"epoch": 0.27184466019417475,
"grad_norm": 20.25909423828125,
"learning_rate": 1.2000000000000002e-06,
"loss": 6.4415,
"step": 7
},
{
"epoch": 0.3106796116504854,
"grad_norm": 19.33000373840332,
"learning_rate": 1.4000000000000001e-06,
"loss": 6.3191,
"step": 8
},
{
"epoch": 0.34951456310679613,
"grad_norm": 18.305322647094727,
"learning_rate": 1.6000000000000001e-06,
"loss": 6.2681,
"step": 9
},
{
"epoch": 0.3883495145631068,
"grad_norm": 17.74665069580078,
"learning_rate": 1.8e-06,
"loss": 6.4206,
"step": 10
},
{
"epoch": 0.42718446601941745,
"grad_norm": 14.93736457824707,
"learning_rate": 2.0000000000000003e-06,
"loss": 6.2288,
"step": 11
},
{
"epoch": 0.46601941747572817,
"grad_norm": 14.914277076721191,
"learning_rate": 2.2e-06,
"loss": 6.3961,
"step": 12
},
{
"epoch": 0.5048543689320388,
"grad_norm": 13.266161918640137,
"learning_rate": 2.4000000000000003e-06,
"loss": 6.0076,
"step": 13
},
{
"epoch": 0.5436893203883495,
"grad_norm": 12.377790451049805,
"learning_rate": 2.6e-06,
"loss": 6.2259,
"step": 14
},
{
"epoch": 0.5825242718446602,
"grad_norm": 11.322343826293945,
"learning_rate": 2.8000000000000003e-06,
"loss": 6.1832,
"step": 15
},
{
"epoch": 0.6213592233009708,
"grad_norm": 10.584484100341797,
"learning_rate": 3e-06,
"loss": 6.0051,
"step": 16
},
{
"epoch": 0.6601941747572816,
"grad_norm": 10.82979965209961,
"learning_rate": 3.2000000000000003e-06,
"loss": 5.9786,
"step": 17
},
{
"epoch": 0.6990291262135923,
"grad_norm": 10.112428665161133,
"learning_rate": 3.4000000000000005e-06,
"loss": 5.9778,
"step": 18
},
{
"epoch": 0.7378640776699029,
"grad_norm": 9.44952392578125,
"learning_rate": 3.6e-06,
"loss": 5.9459,
"step": 19
},
{
"epoch": 0.7766990291262136,
"grad_norm": 9.057659149169922,
"learning_rate": 3.8e-06,
"loss": 6.0317,
"step": 20
},
{
"epoch": 0.8155339805825242,
"grad_norm": 9.000926971435547,
"learning_rate": 4.000000000000001e-06,
"loss": 5.9749,
"step": 21
},
{
"epoch": 0.8543689320388349,
"grad_norm": 7.747213840484619,
"learning_rate": 4.2000000000000004e-06,
"loss": 5.8036,
"step": 22
},
{
"epoch": 0.8932038834951457,
"grad_norm": 6.968072891235352,
"learning_rate": 4.4e-06,
"loss": 5.7705,
"step": 23
},
{
"epoch": 0.9320388349514563,
"grad_norm": 7.167684555053711,
"learning_rate": 4.6e-06,
"loss": 5.7804,
"step": 24
},
{
"epoch": 0.970873786407767,
"grad_norm": 6.384294033050537,
"learning_rate": 4.800000000000001e-06,
"loss": 5.6137,
"step": 25
},
{
"epoch": 0.970873786407767,
"eval_loss": 5.6718363761901855,
"eval_runtime": 2.435,
"eval_samples_per_second": 9.035,
"eval_steps_per_second": 2.464,
"step": 25
},
{
"epoch": 1.0388349514563107,
"grad_norm": 12.656047821044922,
"learning_rate": 5e-06,
"loss": 11.4741,
"step": 26
},
{
"epoch": 1.0776699029126213,
"grad_norm": 6.408062934875488,
"learning_rate": 5.2e-06,
"loss": 5.7835,
"step": 27
},
{
"epoch": 1.116504854368932,
"grad_norm": 6.457642078399658,
"learning_rate": 5.4e-06,
"loss": 5.7659,
"step": 28
},
{
"epoch": 1.1553398058252426,
"grad_norm": 6.716769218444824,
"learning_rate": 5.600000000000001e-06,
"loss": 5.6133,
"step": 29
},
{
"epoch": 1.1941747572815533,
"grad_norm": 5.562079906463623,
"learning_rate": 5.8e-06,
"loss": 5.6588,
"step": 30
},
{
"epoch": 1.233009708737864,
"grad_norm": 5.209117412567139,
"learning_rate": 6e-06,
"loss": 5.6118,
"step": 31
},
{
"epoch": 1.2718446601941746,
"grad_norm": 5.505391597747803,
"learning_rate": 6.2e-06,
"loss": 5.6468,
"step": 32
},
{
"epoch": 1.3106796116504853,
"grad_norm": 4.989831924438477,
"learning_rate": 6.4000000000000006e-06,
"loss": 5.5483,
"step": 33
},
{
"epoch": 1.3495145631067962,
"grad_norm": 5.000854015350342,
"learning_rate": 6.6e-06,
"loss": 5.4522,
"step": 34
},
{
"epoch": 1.3883495145631068,
"grad_norm": 4.343570232391357,
"learning_rate": 6.800000000000001e-06,
"loss": 5.3562,
"step": 35
},
{
"epoch": 1.4271844660194175,
"grad_norm": 4.40326452255249,
"learning_rate": 7.000000000000001e-06,
"loss": 5.4561,
"step": 36
},
{
"epoch": 1.4660194174757282,
"grad_norm": 4.1591901779174805,
"learning_rate": 7.2e-06,
"loss": 5.3806,
"step": 37
},
{
"epoch": 1.5048543689320388,
"grad_norm": 4.1347246170043945,
"learning_rate": 7.4e-06,
"loss": 5.419,
"step": 38
},
{
"epoch": 1.5436893203883495,
"grad_norm": 4.123111248016357,
"learning_rate": 7.6e-06,
"loss": 5.2831,
"step": 39
},
{
"epoch": 1.5825242718446602,
"grad_norm": 4.009028911590576,
"learning_rate": 7.8e-06,
"loss": 5.332,
"step": 40
},
{
"epoch": 1.6213592233009708,
"grad_norm": 4.013438701629639,
"learning_rate": 8.000000000000001e-06,
"loss": 5.2177,
"step": 41
},
{
"epoch": 1.6601941747572817,
"grad_norm": 3.698003053665161,
"learning_rate": 8.200000000000001e-06,
"loss": 5.3019,
"step": 42
},
{
"epoch": 1.6990291262135924,
"grad_norm": 3.66217041015625,
"learning_rate": 8.400000000000001e-06,
"loss": 5.1967,
"step": 43
},
{
"epoch": 1.737864077669903,
"grad_norm": 3.455019235610962,
"learning_rate": 8.599999999999999e-06,
"loss": 5.1573,
"step": 44
},
{
"epoch": 1.7766990291262137,
"grad_norm": 3.5593278408050537,
"learning_rate": 8.8e-06,
"loss": 5.1463,
"step": 45
},
{
"epoch": 1.8155339805825244,
"grad_norm": 3.332477331161499,
"learning_rate": 9e-06,
"loss": 5.1732,
"step": 46
},
{
"epoch": 1.854368932038835,
"grad_norm": 3.2428054809570312,
"learning_rate": 9.2e-06,
"loss": 5.0962,
"step": 47
},
{
"epoch": 1.8932038834951457,
"grad_norm": 3.339063882827759,
"learning_rate": 9.4e-06,
"loss": 5.0253,
"step": 48
},
{
"epoch": 1.9320388349514563,
"grad_norm": 3.4746124744415283,
"learning_rate": 9.600000000000001e-06,
"loss": 5.1363,
"step": 49
},
{
"epoch": 1.970873786407767,
"grad_norm": 3.371466875076294,
"learning_rate": 9.800000000000001e-06,
"loss": 5.1445,
"step": 50
},
{
"epoch": 1.970873786407767,
"eval_loss": 5.052736282348633,
"eval_runtime": 0.9737,
"eval_samples_per_second": 22.593,
"eval_steps_per_second": 6.162,
"step": 50
},
{
"epoch": 2.0388349514563107,
"grad_norm": 5.6498637199401855,
"learning_rate": 1e-05,
"loss": 10.112,
"step": 51
},
{
"epoch": 2.0776699029126213,
"grad_norm": 3.1301138401031494,
"learning_rate": 1.02e-05,
"loss": 5.1063,
"step": 52
},
{
"epoch": 2.116504854368932,
"grad_norm": 3.452958345413208,
"learning_rate": 1.04e-05,
"loss": 5.0082,
"step": 53
},
{
"epoch": 2.1553398058252426,
"grad_norm": 3.1977169513702393,
"learning_rate": 1.06e-05,
"loss": 4.9698,
"step": 54
},
{
"epoch": 2.1941747572815533,
"grad_norm": 2.6776535511016846,
"learning_rate": 1.08e-05,
"loss": 4.9449,
"step": 55
},
{
"epoch": 2.233009708737864,
"grad_norm": 3.5574913024902344,
"learning_rate": 1.1000000000000001e-05,
"loss": 5.0442,
"step": 56
},
{
"epoch": 2.2718446601941746,
"grad_norm": 2.867915391921997,
"learning_rate": 1.1200000000000001e-05,
"loss": 4.8769,
"step": 57
},
{
"epoch": 2.3106796116504853,
"grad_norm": 2.764223098754883,
"learning_rate": 1.1400000000000001e-05,
"loss": 4.9286,
"step": 58
},
{
"epoch": 2.349514563106796,
"grad_norm": 3.816723585128784,
"learning_rate": 1.16e-05,
"loss": 4.8921,
"step": 59
},
{
"epoch": 2.3883495145631066,
"grad_norm": 3.161980152130127,
"learning_rate": 1.18e-05,
"loss": 4.916,
"step": 60
},
{
"epoch": 2.4271844660194173,
"grad_norm": 2.8373942375183105,
"learning_rate": 1.2e-05,
"loss": 4.8942,
"step": 61
},
{
"epoch": 2.466019417475728,
"grad_norm": 2.8898000717163086,
"learning_rate": 1.22e-05,
"loss": 4.8206,
"step": 62
},
{
"epoch": 2.5048543689320386,
"grad_norm": 2.726362943649292,
"learning_rate": 1.24e-05,
"loss": 4.846,
"step": 63
},
{
"epoch": 2.5436893203883493,
"grad_norm": 2.73665714263916,
"learning_rate": 1.2600000000000001e-05,
"loss": 4.8375,
"step": 64
},
{
"epoch": 2.58252427184466,
"grad_norm": 3.1228106021881104,
"learning_rate": 1.2800000000000001e-05,
"loss": 4.7526,
"step": 65
},
{
"epoch": 2.6213592233009706,
"grad_norm": 2.9702351093292236,
"learning_rate": 1.3000000000000001e-05,
"loss": 4.8024,
"step": 66
},
{
"epoch": 2.6601941747572817,
"grad_norm": 3.0533952713012695,
"learning_rate": 1.32e-05,
"loss": 4.7883,
"step": 67
},
{
"epoch": 2.6990291262135924,
"grad_norm": 3.1949095726013184,
"learning_rate": 1.3400000000000002e-05,
"loss": 4.8197,
"step": 68
},
{
"epoch": 2.737864077669903,
"grad_norm": 3.399998426437378,
"learning_rate": 1.3600000000000002e-05,
"loss": 4.6677,
"step": 69
},
{
"epoch": 2.7766990291262137,
"grad_norm": 2.80118465423584,
"learning_rate": 1.3800000000000002e-05,
"loss": 4.6291,
"step": 70
},
{
"epoch": 2.8155339805825244,
"grad_norm": 2.8477330207824707,
"learning_rate": 1.4000000000000001e-05,
"loss": 4.7767,
"step": 71
},
{
"epoch": 2.854368932038835,
"grad_norm": 2.6895911693573,
"learning_rate": 1.42e-05,
"loss": 4.7057,
"step": 72
},
{
"epoch": 2.8932038834951457,
"grad_norm": 2.914586067199707,
"learning_rate": 1.44e-05,
"loss": 4.6386,
"step": 73
},
{
"epoch": 2.9320388349514563,
"grad_norm": 2.6184370517730713,
"learning_rate": 1.4599999999999999e-05,
"loss": 4.6679,
"step": 74
},
{
"epoch": 2.970873786407767,
"grad_norm": 3.00891375541687,
"learning_rate": 1.48e-05,
"loss": 4.6319,
"step": 75
},
{
"epoch": 2.970873786407767,
"eval_loss": 4.614713668823242,
"eval_runtime": 0.9702,
"eval_samples_per_second": 22.675,
"eval_steps_per_second": 6.184,
"step": 75
},
{
"epoch": 3.0388349514563107,
"grad_norm": 5.222214221954346,
"learning_rate": 1.5e-05,
"loss": 9.2212,
"step": 76
},
{
"epoch": 3.0776699029126213,
"grad_norm": 2.716062307357788,
"learning_rate": 1.52e-05,
"loss": 4.6294,
"step": 77
},
{
"epoch": 3.116504854368932,
"grad_norm": 2.503143548965454,
"learning_rate": 1.54e-05,
"loss": 4.5572,
"step": 78
},
{
"epoch": 3.1553398058252426,
"grad_norm": 2.9183573722839355,
"learning_rate": 1.56e-05,
"loss": 4.453,
"step": 79
},
{
"epoch": 3.1941747572815533,
"grad_norm": 2.7854349613189697,
"learning_rate": 1.58e-05,
"loss": 4.5746,
"step": 80
},
{
"epoch": 3.233009708737864,
"grad_norm": 2.8391106128692627,
"learning_rate": 1.6000000000000003e-05,
"loss": 4.5228,
"step": 81
},
{
"epoch": 3.2718446601941746,
"grad_norm": 2.5229265689849854,
"learning_rate": 1.62e-05,
"loss": 4.4692,
"step": 82
},
{
"epoch": 3.3106796116504853,
"grad_norm": 2.643170118331909,
"learning_rate": 1.6400000000000002e-05,
"loss": 4.498,
"step": 83
},
{
"epoch": 3.349514563106796,
"grad_norm": 2.542393922805786,
"learning_rate": 1.66e-05,
"loss": 4.4816,
"step": 84
},
{
"epoch": 3.3883495145631066,
"grad_norm": 2.563282012939453,
"learning_rate": 1.6800000000000002e-05,
"loss": 4.4824,
"step": 85
},
{
"epoch": 3.4271844660194173,
"grad_norm": 2.698516368865967,
"learning_rate": 1.7000000000000003e-05,
"loss": 4.4717,
"step": 86
},
{
"epoch": 3.466019417475728,
"grad_norm": 2.936776638031006,
"learning_rate": 1.7199999999999998e-05,
"loss": 4.346,
"step": 87
},
{
"epoch": 3.5048543689320386,
"grad_norm": 2.9594175815582275,
"learning_rate": 1.74e-05,
"loss": 4.3689,
"step": 88
},
{
"epoch": 3.5436893203883493,
"grad_norm": 3.02431583404541,
"learning_rate": 1.76e-05,
"loss": 4.3922,
"step": 89
},
{
"epoch": 3.58252427184466,
"grad_norm": 3.238933563232422,
"learning_rate": 1.78e-05,
"loss": 4.4046,
"step": 90
},
{
"epoch": 3.6213592233009706,
"grad_norm": 3.368084192276001,
"learning_rate": 1.8e-05,
"loss": 4.3768,
"step": 91
},
{
"epoch": 3.6601941747572817,
"grad_norm": 3.8072586059570312,
"learning_rate": 1.8200000000000002e-05,
"loss": 4.3188,
"step": 92
},
{
"epoch": 3.6990291262135924,
"grad_norm": 3.2370452880859375,
"learning_rate": 1.84e-05,
"loss": 4.3368,
"step": 93
},
{
"epoch": 3.737864077669903,
"grad_norm": 3.302961826324463,
"learning_rate": 1.86e-05,
"loss": 4.3339,
"step": 94
},
{
"epoch": 3.7766990291262137,
"grad_norm": 3.5947256088256836,
"learning_rate": 1.88e-05,
"loss": 4.2763,
"step": 95
},
{
"epoch": 3.8155339805825244,
"grad_norm": 2.955308437347412,
"learning_rate": 1.9e-05,
"loss": 4.3941,
"step": 96
},
{
"epoch": 3.854368932038835,
"grad_norm": 3.303628444671631,
"learning_rate": 1.9200000000000003e-05,
"loss": 4.2748,
"step": 97
},
{
"epoch": 3.8932038834951457,
"grad_norm": 2.7507269382476807,
"learning_rate": 1.94e-05,
"loss": 4.2881,
"step": 98
},
{
"epoch": 3.9320388349514563,
"grad_norm": 2.6451849937438965,
"learning_rate": 1.9600000000000002e-05,
"loss": 4.3818,
"step": 99
},
{
"epoch": 3.970873786407767,
"grad_norm": 4.112302780151367,
"learning_rate": 1.9800000000000004e-05,
"loss": 4.2882,
"step": 100
},
{
"epoch": 3.970873786407767,
"eval_loss": 4.266085624694824,
"eval_runtime": 1.036,
"eval_samples_per_second": 21.235,
"eval_steps_per_second": 5.791,
"step": 100
},
{
"epoch": 4.038834951456311,
"grad_norm": 5.2990498542785645,
"learning_rate": 2e-05,
"loss": 8.631,
"step": 101
},
{
"epoch": 4.077669902912621,
"grad_norm": 3.757814407348633,
"learning_rate": 2.0200000000000003e-05,
"loss": 4.2183,
"step": 102
},
{
"epoch": 4.116504854368932,
"grad_norm": 2.905704975128174,
"learning_rate": 2.04e-05,
"loss": 4.1782,
"step": 103
},
{
"epoch": 4.155339805825243,
"grad_norm": 3.7264492511749268,
"learning_rate": 2.06e-05,
"loss": 4.2959,
"step": 104
},
{
"epoch": 4.194174757281553,
"grad_norm": 3.9989054203033447,
"learning_rate": 2.08e-05,
"loss": 4.1876,
"step": 105
},
{
"epoch": 4.233009708737864,
"grad_norm": 2.978239059448242,
"learning_rate": 2.1e-05,
"loss": 4.1484,
"step": 106
},
{
"epoch": 4.271844660194175,
"grad_norm": 3.223487138748169,
"learning_rate": 2.12e-05,
"loss": 4.1501,
"step": 107
},
{
"epoch": 4.310679611650485,
"grad_norm": 3.035008668899536,
"learning_rate": 2.1400000000000002e-05,
"loss": 4.1316,
"step": 108
},
{
"epoch": 4.349514563106796,
"grad_norm": 2.878307819366455,
"learning_rate": 2.16e-05,
"loss": 4.1824,
"step": 109
},
{
"epoch": 4.388349514563107,
"grad_norm": 3.095815420150757,
"learning_rate": 2.18e-05,
"loss": 4.1726,
"step": 110
},
{
"epoch": 4.427184466019417,
"grad_norm": 3.0754470825195312,
"learning_rate": 2.2000000000000003e-05,
"loss": 3.9618,
"step": 111
},
{
"epoch": 4.466019417475728,
"grad_norm": 3.4234559535980225,
"learning_rate": 2.22e-05,
"loss": 4.0646,
"step": 112
},
{
"epoch": 4.504854368932039,
"grad_norm": 3.2128183841705322,
"learning_rate": 2.2400000000000002e-05,
"loss": 4.0639,
"step": 113
},
{
"epoch": 4.543689320388349,
"grad_norm": 2.9789934158325195,
"learning_rate": 2.26e-05,
"loss": 4.1373,
"step": 114
},
{
"epoch": 4.58252427184466,
"grad_norm": 2.5928032398223877,
"learning_rate": 2.2800000000000002e-05,
"loss": 3.9855,
"step": 115
},
{
"epoch": 4.621359223300971,
"grad_norm": 3.082489252090454,
"learning_rate": 2.3000000000000003e-05,
"loss": 4.1163,
"step": 116
},
{
"epoch": 4.660194174757281,
"grad_norm": 3.028413772583008,
"learning_rate": 2.32e-05,
"loss": 4.0571,
"step": 117
},
{
"epoch": 4.699029126213592,
"grad_norm": 2.8744428157806396,
"learning_rate": 2.3400000000000003e-05,
"loss": 4.027,
"step": 118
},
{
"epoch": 4.737864077669903,
"grad_norm": 2.866056442260742,
"learning_rate": 2.36e-05,
"loss": 4.0299,
"step": 119
},
{
"epoch": 4.776699029126213,
"grad_norm": 2.75072979927063,
"learning_rate": 2.38e-05,
"loss": 3.993,
"step": 120
},
{
"epoch": 4.815533980582524,
"grad_norm": 2.8751604557037354,
"learning_rate": 2.4e-05,
"loss": 3.9961,
"step": 121
},
{
"epoch": 4.854368932038835,
"grad_norm": 2.5905075073242188,
"learning_rate": 2.4200000000000002e-05,
"loss": 3.9582,
"step": 122
},
{
"epoch": 4.893203883495145,
"grad_norm": 3.143044948577881,
"learning_rate": 2.44e-05,
"loss": 3.9464,
"step": 123
},
{
"epoch": 4.932038834951456,
"grad_norm": 2.6397016048431396,
"learning_rate": 2.46e-05,
"loss": 4.0075,
"step": 124
},
{
"epoch": 4.970873786407767,
"grad_norm": 3.2383229732513428,
"learning_rate": 2.48e-05,
"loss": 3.9822,
"step": 125
},
{
"epoch": 4.970873786407767,
"eval_loss": 3.980665445327759,
"eval_runtime": 1.0248,
"eval_samples_per_second": 21.467,
"eval_steps_per_second": 5.855,
"step": 125
},
{
"epoch": 5.038834951456311,
"grad_norm": 5.962584495544434,
"learning_rate": 2.5e-05,
"loss": 7.7604,
"step": 126
},
{
"epoch": 5.077669902912621,
"grad_norm": 3.243708610534668,
"learning_rate": 2.5200000000000003e-05,
"loss": 3.9438,
"step": 127
},
{
"epoch": 5.116504854368932,
"grad_norm": 2.763148307800293,
"learning_rate": 2.54e-05,
"loss": 3.8661,
"step": 128
},
{
"epoch": 5.155339805825243,
"grad_norm": 2.6233339309692383,
"learning_rate": 2.5600000000000002e-05,
"loss": 3.9006,
"step": 129
},
{
"epoch": 5.194174757281553,
"grad_norm": 3.1037437915802,
"learning_rate": 2.58e-05,
"loss": 3.9066,
"step": 130
},
{
"epoch": 5.233009708737864,
"grad_norm": 3.3434383869171143,
"learning_rate": 2.6000000000000002e-05,
"loss": 3.8425,
"step": 131
},
{
"epoch": 5.271844660194175,
"grad_norm": 3.0016958713531494,
"learning_rate": 2.6200000000000003e-05,
"loss": 3.8723,
"step": 132
},
{
"epoch": 5.310679611650485,
"grad_norm": 3.2040951251983643,
"learning_rate": 2.64e-05,
"loss": 3.8326,
"step": 133
},
{
"epoch": 5.349514563106796,
"grad_norm": 3.892890453338623,
"learning_rate": 2.6600000000000003e-05,
"loss": 3.9277,
"step": 134
},
{
"epoch": 5.388349514563107,
"grad_norm": 3.3505635261535645,
"learning_rate": 2.6800000000000004e-05,
"loss": 3.7381,
"step": 135
},
{
"epoch": 5.427184466019417,
"grad_norm": 3.60493803024292,
"learning_rate": 2.7000000000000002e-05,
"loss": 3.9003,
"step": 136
},
{
"epoch": 5.466019417475728,
"grad_norm": 3.3468196392059326,
"learning_rate": 2.7200000000000004e-05,
"loss": 3.829,
"step": 137
},
{
"epoch": 5.504854368932039,
"grad_norm": 2.7208919525146484,
"learning_rate": 2.7400000000000002e-05,
"loss": 3.7987,
"step": 138
},
{
"epoch": 5.543689320388349,
"grad_norm": 4.0348920822143555,
"learning_rate": 2.7600000000000003e-05,
"loss": 3.8318,
"step": 139
},
{
"epoch": 5.58252427184466,
"grad_norm": 3.560403347015381,
"learning_rate": 2.7800000000000005e-05,
"loss": 3.763,
"step": 140
},
{
"epoch": 5.621359223300971,
"grad_norm": 3.262423515319824,
"learning_rate": 2.8000000000000003e-05,
"loss": 3.7441,
"step": 141
},
{
"epoch": 5.660194174757281,
"grad_norm": 2.7930023670196533,
"learning_rate": 2.8199999999999998e-05,
"loss": 3.7323,
"step": 142
},
{
"epoch": 5.699029126213592,
"grad_norm": 2.5322391986846924,
"learning_rate": 2.84e-05,
"loss": 3.6681,
"step": 143
},
{
"epoch": 5.737864077669903,
"grad_norm": 4.258012294769287,
"learning_rate": 2.86e-05,
"loss": 3.7049,
"step": 144
},
{
"epoch": 5.776699029126213,
"grad_norm": 3.0756101608276367,
"learning_rate": 2.88e-05,
"loss": 3.7184,
"step": 145
},
{
"epoch": 5.815533980582524,
"grad_norm": 3.0040361881256104,
"learning_rate": 2.9e-05,
"loss": 3.6077,
"step": 146
},
{
"epoch": 5.854368932038835,
"grad_norm": 4.292761325836182,
"learning_rate": 2.9199999999999998e-05,
"loss": 3.7214,
"step": 147
},
{
"epoch": 5.893203883495145,
"grad_norm": 2.876159906387329,
"learning_rate": 2.94e-05,
"loss": 3.6643,
"step": 148
},
{
"epoch": 5.932038834951456,
"grad_norm": 3.1686434745788574,
"learning_rate": 2.96e-05,
"loss": 3.68,
"step": 149
},
{
"epoch": 5.970873786407767,
"grad_norm": 3.1515626907348633,
"learning_rate": 2.98e-05,
"loss": 3.6581,
"step": 150
},
{
"epoch": 5.970873786407767,
"eval_loss": 3.7385447025299072,
"eval_runtime": 1.0393,
"eval_samples_per_second": 21.169,
"eval_steps_per_second": 5.773,
"step": 150
},
{
"epoch": 6.038834951456311,
"grad_norm": 6.013641834259033,
"learning_rate": 3e-05,
"loss": 7.2601,
"step": 151
},
{
"epoch": 6.077669902912621,
"grad_norm": 3.0433292388916016,
"learning_rate": 3.02e-05,
"loss": 3.626,
"step": 152
},
{
"epoch": 6.116504854368932,
"grad_norm": 2.9623515605926514,
"learning_rate": 3.04e-05,
"loss": 3.5856,
"step": 153
},
{
"epoch": 6.155339805825243,
"grad_norm": 3.333615779876709,
"learning_rate": 3.06e-05,
"loss": 3.6268,
"step": 154
},
{
"epoch": 6.194174757281553,
"grad_norm": 3.0843307971954346,
"learning_rate": 3.08e-05,
"loss": 3.5651,
"step": 155
},
{
"epoch": 6.233009708737864,
"grad_norm": 2.859063148498535,
"learning_rate": 3.1e-05,
"loss": 3.5464,
"step": 156
},
{
"epoch": 6.271844660194175,
"grad_norm": 2.92948842048645,
"learning_rate": 3.12e-05,
"loss": 3.6385,
"step": 157
},
{
"epoch": 6.310679611650485,
"grad_norm": 3.552112340927124,
"learning_rate": 3.1400000000000004e-05,
"loss": 3.5479,
"step": 158
},
{
"epoch": 6.349514563106796,
"grad_norm": 2.9934771060943604,
"learning_rate": 3.16e-05,
"loss": 3.5697,
"step": 159
},
{
"epoch": 6.388349514563107,
"grad_norm": 2.595054864883423,
"learning_rate": 3.18e-05,
"loss": 3.4817,
"step": 160
},
{
"epoch": 6.427184466019417,
"grad_norm": 3.077573537826538,
"learning_rate": 3.2000000000000005e-05,
"loss": 3.5286,
"step": 161
},
{
"epoch": 6.466019417475728,
"grad_norm": 2.5149052143096924,
"learning_rate": 3.2200000000000003e-05,
"loss": 3.6065,
"step": 162
},
{
"epoch": 6.504854368932039,
"grad_norm": 2.6401753425598145,
"learning_rate": 3.24e-05,
"loss": 3.4707,
"step": 163
},
{
"epoch": 6.543689320388349,
"grad_norm": 2.725781202316284,
"learning_rate": 3.26e-05,
"loss": 3.5645,
"step": 164
},
{
"epoch": 6.58252427184466,
"grad_norm": 2.7084786891937256,
"learning_rate": 3.2800000000000004e-05,
"loss": 3.5482,
"step": 165
},
{
"epoch": 6.621359223300971,
"grad_norm": 2.6076486110687256,
"learning_rate": 3.3e-05,
"loss": 3.4842,
"step": 166
},
{
"epoch": 6.660194174757281,
"grad_norm": 3.0237390995025635,
"learning_rate": 3.32e-05,
"loss": 3.5313,
"step": 167
},
{
"epoch": 6.699029126213592,
"grad_norm": 2.807459831237793,
"learning_rate": 3.3400000000000005e-05,
"loss": 3.5354,
"step": 168
},
{
"epoch": 6.737864077669903,
"grad_norm": 3.13301420211792,
"learning_rate": 3.3600000000000004e-05,
"loss": 3.4923,
"step": 169
},
{
"epoch": 6.776699029126213,
"grad_norm": 2.5862674713134766,
"learning_rate": 3.38e-05,
"loss": 3.5315,
"step": 170
},
{
"epoch": 6.815533980582524,
"grad_norm": 3.192603588104248,
"learning_rate": 3.4000000000000007e-05,
"loss": 3.4937,
"step": 171
},
{
"epoch": 6.854368932038835,
"grad_norm": 2.440667152404785,
"learning_rate": 3.4200000000000005e-05,
"loss": 3.4632,
"step": 172
},
{
"epoch": 6.893203883495145,
"grad_norm": 3.0425989627838135,
"learning_rate": 3.4399999999999996e-05,
"loss": 3.4456,
"step": 173
},
{
"epoch": 6.932038834951456,
"grad_norm": 3.369929313659668,
"learning_rate": 3.46e-05,
"loss": 3.4061,
"step": 174
},
{
"epoch": 6.970873786407767,
"grad_norm": 2.883514165878296,
"learning_rate": 3.48e-05,
"loss": 3.4312,
"step": 175
},
{
"epoch": 6.970873786407767,
"eval_loss": 3.5276877880096436,
"eval_runtime": 0.9695,
"eval_samples_per_second": 22.692,
"eval_steps_per_second": 6.189,
"step": 175
},
{
"epoch": 7.038834951456311,
"grad_norm": 5.757262706756592,
"learning_rate": 3.5e-05,
"loss": 6.8588,
"step": 176
},
{
"epoch": 7.077669902912621,
"grad_norm": 2.7623355388641357,
"learning_rate": 3.52e-05,
"loss": 3.389,
"step": 177
},
{
"epoch": 7.116504854368932,
"grad_norm": 3.601408004760742,
"learning_rate": 3.54e-05,
"loss": 3.4136,
"step": 178
},
{
"epoch": 7.155339805825243,
"grad_norm": 2.4193849563598633,
"learning_rate": 3.56e-05,
"loss": 3.3387,
"step": 179
},
{
"epoch": 7.194174757281553,
"grad_norm": 3.1988773345947266,
"learning_rate": 3.58e-05,
"loss": 3.3565,
"step": 180
},
{
"epoch": 7.233009708737864,
"grad_norm": 3.6124112606048584,
"learning_rate": 3.6e-05,
"loss": 3.3662,
"step": 181
},
{
"epoch": 7.271844660194175,
"grad_norm": 2.836766242980957,
"learning_rate": 3.62e-05,
"loss": 3.2874,
"step": 182
},
{
"epoch": 7.310679611650485,
"grad_norm": 3.2610206604003906,
"learning_rate": 3.6400000000000004e-05,
"loss": 3.2984,
"step": 183
},
{
"epoch": 7.349514563106796,
"grad_norm": 2.8655757904052734,
"learning_rate": 3.66e-05,
"loss": 3.2584,
"step": 184
},
{
"epoch": 7.388349514563107,
"grad_norm": 3.4718809127807617,
"learning_rate": 3.68e-05,
"loss": 3.2489,
"step": 185
},
{
"epoch": 7.427184466019417,
"grad_norm": 3.2131571769714355,
"learning_rate": 3.7e-05,
"loss": 3.3321,
"step": 186
},
{
"epoch": 7.466019417475728,
"grad_norm": 3.1714115142822266,
"learning_rate": 3.72e-05,
"loss": 3.3489,
"step": 187
},
{
"epoch": 7.504854368932039,
"grad_norm": 2.877065658569336,
"learning_rate": 3.74e-05,
"loss": 3.245,
"step": 188
},
{
"epoch": 7.543689320388349,
"grad_norm": 3.1105806827545166,
"learning_rate": 3.76e-05,
"loss": 3.272,
"step": 189
},
{
"epoch": 7.58252427184466,
"grad_norm": 3.5332155227661133,
"learning_rate": 3.7800000000000004e-05,
"loss": 3.3132,
"step": 190
},
{
"epoch": 7.621359223300971,
"grad_norm": 2.8226609230041504,
"learning_rate": 3.8e-05,
"loss": 3.2721,
"step": 191
},
{
"epoch": 7.660194174757281,
"grad_norm": 2.5367422103881836,
"learning_rate": 3.82e-05,
"loss": 3.3234,
"step": 192
},
{
"epoch": 7.699029126213592,
"grad_norm": 2.9826626777648926,
"learning_rate": 3.8400000000000005e-05,
"loss": 3.2643,
"step": 193
},
{
"epoch": 7.737864077669903,
"grad_norm": 3.456496477127075,
"learning_rate": 3.86e-05,
"loss": 3.2105,
"step": 194
},
{
"epoch": 7.776699029126213,
"grad_norm": 3.286680221557617,
"learning_rate": 3.88e-05,
"loss": 3.2156,
"step": 195
},
{
"epoch": 7.815533980582524,
"grad_norm": 2.996983528137207,
"learning_rate": 3.9000000000000006e-05,
"loss": 3.3637,
"step": 196
},
{
"epoch": 7.854368932038835,
"grad_norm": 3.129873037338257,
"learning_rate": 3.9200000000000004e-05,
"loss": 3.2444,
"step": 197
},
{
"epoch": 7.893203883495145,
"grad_norm": 2.591716766357422,
"learning_rate": 3.94e-05,
"loss": 3.2831,
"step": 198
},
{
"epoch": 7.932038834951456,
"grad_norm": 2.664017677307129,
"learning_rate": 3.960000000000001e-05,
"loss": 3.1692,
"step": 199
},
{
"epoch": 7.970873786407767,
"grad_norm": 2.8941309452056885,
"learning_rate": 3.9800000000000005e-05,
"loss": 3.2986,
"step": 200
},
{
"epoch": 7.970873786407767,
"eval_loss": 3.3523428440093994,
"eval_runtime": 0.9896,
"eval_samples_per_second": 22.23,
"eval_steps_per_second": 6.063,
"step": 200
},
{
"epoch": 8.03883495145631,
"grad_norm": 6.677456378936768,
"learning_rate": 4e-05,
"loss": 6.5052,
"step": 201
},
{
"epoch": 8.077669902912621,
"grad_norm": 3.38222599029541,
"learning_rate": 4.02e-05,
"loss": 3.1682,
"step": 202
},
{
"epoch": 8.116504854368932,
"grad_norm": 2.9144835472106934,
"learning_rate": 4.0400000000000006e-05,
"loss": 3.1505,
"step": 203
},
{
"epoch": 8.155339805825243,
"grad_norm": 2.837830066680908,
"learning_rate": 4.0600000000000004e-05,
"loss": 3.1576,
"step": 204
},
{
"epoch": 8.194174757281553,
"grad_norm": 3.3070290088653564,
"learning_rate": 4.08e-05,
"loss": 3.1545,
"step": 205
},
{
"epoch": 8.233009708737864,
"grad_norm": 2.6031386852264404,
"learning_rate": 4.1e-05,
"loss": 3.1598,
"step": 206
},
{
"epoch": 8.271844660194175,
"grad_norm": 2.879425525665283,
"learning_rate": 4.12e-05,
"loss": 3.1222,
"step": 207
},
{
"epoch": 8.310679611650485,
"grad_norm": 3.14932918548584,
"learning_rate": 4.14e-05,
"loss": 3.0743,
"step": 208
},
{
"epoch": 8.349514563106796,
"grad_norm": 3.3993191719055176,
"learning_rate": 4.16e-05,
"loss": 3.1589,
"step": 209
},
{
"epoch": 8.388349514563107,
"grad_norm": 3.2141942977905273,
"learning_rate": 4.18e-05,
"loss": 3.0428,
"step": 210
},
{
"epoch": 8.427184466019417,
"grad_norm": 2.791717290878296,
"learning_rate": 4.2e-05,
"loss": 3.1158,
"step": 211
},
{
"epoch": 8.466019417475728,
"grad_norm": 3.1668970584869385,
"learning_rate": 4.22e-05,
"loss": 3.0463,
"step": 212
},
{
"epoch": 8.504854368932039,
"grad_norm": 2.4356696605682373,
"learning_rate": 4.24e-05,
"loss": 3.1434,
"step": 213
},
{
"epoch": 8.54368932038835,
"grad_norm": 2.9241132736206055,
"learning_rate": 4.26e-05,
"loss": 3.0292,
"step": 214
},
{
"epoch": 8.58252427184466,
"grad_norm": 2.4170773029327393,
"learning_rate": 4.2800000000000004e-05,
"loss": 3.0923,
"step": 215
},
{
"epoch": 8.62135922330097,
"grad_norm": 2.4428963661193848,
"learning_rate": 4.3e-05,
"loss": 3.0588,
"step": 216
},
{
"epoch": 8.660194174757281,
"grad_norm": 3.0066943168640137,
"learning_rate": 4.32e-05,
"loss": 3.0815,
"step": 217
},
{
"epoch": 8.699029126213592,
"grad_norm": 3.0532405376434326,
"learning_rate": 4.3400000000000005e-05,
"loss": 3.1377,
"step": 218
},
{
"epoch": 8.737864077669903,
"grad_norm": 2.9405910968780518,
"learning_rate": 4.36e-05,
"loss": 3.081,
"step": 219
},
{
"epoch": 8.776699029126213,
"grad_norm": 2.82438325881958,
"learning_rate": 4.38e-05,
"loss": 3.0538,
"step": 220
},
{
"epoch": 8.815533980582524,
"grad_norm": 2.899946928024292,
"learning_rate": 4.4000000000000006e-05,
"loss": 3.0664,
"step": 221
},
{
"epoch": 8.854368932038835,
"grad_norm": 2.4132299423217773,
"learning_rate": 4.4200000000000004e-05,
"loss": 3.0723,
"step": 222
},
{
"epoch": 8.893203883495145,
"grad_norm": 3.2833642959594727,
"learning_rate": 4.44e-05,
"loss": 3.0445,
"step": 223
},
{
"epoch": 8.932038834951456,
"grad_norm": 2.60457706451416,
"learning_rate": 4.46e-05,
"loss": 3.0134,
"step": 224
},
{
"epoch": 8.970873786407767,
"grad_norm": 2.7552649974823,
"learning_rate": 4.4800000000000005e-05,
"loss": 3.0772,
"step": 225
},
{
"epoch": 8.970873786407767,
"eval_loss": 3.2081830501556396,
"eval_runtime": 0.971,
"eval_samples_per_second": 22.657,
"eval_steps_per_second": 6.179,
"step": 225
},
{
"epoch": 9.03883495145631,
"grad_norm": 6.834669589996338,
"learning_rate": 4.5e-05,
"loss": 6.0,
"step": 226
},
{
"epoch": 9.077669902912621,
"grad_norm": 2.7431795597076416,
"learning_rate": 4.52e-05,
"loss": 2.8938,
"step": 227
},
{
"epoch": 9.116504854368932,
"grad_norm": 3.5260982513427734,
"learning_rate": 4.5400000000000006e-05,
"loss": 2.955,
"step": 228
},
{
"epoch": 9.155339805825243,
"grad_norm": 2.705111026763916,
"learning_rate": 4.5600000000000004e-05,
"loss": 2.9586,
"step": 229
},
{
"epoch": 9.194174757281553,
"grad_norm": 3.2462103366851807,
"learning_rate": 4.58e-05,
"loss": 2.8973,
"step": 230
},
{
"epoch": 9.233009708737864,
"grad_norm": 2.788363218307495,
"learning_rate": 4.600000000000001e-05,
"loss": 2.9328,
"step": 231
},
{
"epoch": 9.271844660194175,
"grad_norm": 2.4145243167877197,
"learning_rate": 4.6200000000000005e-05,
"loss": 2.9653,
"step": 232
},
{
"epoch": 9.310679611650485,
"grad_norm": 3.2271153926849365,
"learning_rate": 4.64e-05,
"loss": 2.922,
"step": 233
},
{
"epoch": 9.349514563106796,
"grad_norm": 2.3625218868255615,
"learning_rate": 4.660000000000001e-05,
"loss": 3.0413,
"step": 234
},
{
"epoch": 9.388349514563107,
"grad_norm": 3.17262864112854,
"learning_rate": 4.6800000000000006e-05,
"loss": 2.9962,
"step": 235
},
{
"epoch": 9.427184466019417,
"grad_norm": 2.906003475189209,
"learning_rate": 4.7e-05,
"loss": 2.9422,
"step": 236
},
{
"epoch": 9.466019417475728,
"grad_norm": 2.1498398780822754,
"learning_rate": 4.72e-05,
"loss": 2.9061,
"step": 237
},
{
"epoch": 9.504854368932039,
"grad_norm": 2.9519286155700684,
"learning_rate": 4.74e-05,
"loss": 2.967,
"step": 238
},
{
"epoch": 9.54368932038835,
"grad_norm": 2.561063528060913,
"learning_rate": 4.76e-05,
"loss": 2.9191,
"step": 239
},
{
"epoch": 9.58252427184466,
"grad_norm": 3.8291261196136475,
"learning_rate": 4.78e-05,
"loss": 2.9071,
"step": 240
},
{
"epoch": 9.62135922330097,
"grad_norm": 3.4280309677124023,
"learning_rate": 4.8e-05,
"loss": 2.9384,
"step": 241
},
{
"epoch": 9.660194174757281,
"grad_norm": 3.460054397583008,
"learning_rate": 4.82e-05,
"loss": 2.9387,
"step": 242
},
{
"epoch": 9.699029126213592,
"grad_norm": 3.3750805854797363,
"learning_rate": 4.8400000000000004e-05,
"loss": 2.9552,
"step": 243
},
{
"epoch": 9.737864077669903,
"grad_norm": 2.6689562797546387,
"learning_rate": 4.86e-05,
"loss": 2.8809,
"step": 244
},
{
"epoch": 9.776699029126213,
"grad_norm": 2.9314560890197754,
"learning_rate": 4.88e-05,
"loss": 2.7902,
"step": 245
},
{
"epoch": 9.815533980582524,
"grad_norm": 2.630530595779419,
"learning_rate": 4.9e-05,
"loss": 2.8857,
"step": 246
},
{
"epoch": 9.854368932038835,
"grad_norm": 2.546659231185913,
"learning_rate": 4.92e-05,
"loss": 2.8896,
"step": 247
},
{
"epoch": 9.893203883495145,
"grad_norm": 2.795778751373291,
"learning_rate": 4.94e-05,
"loss": 2.9516,
"step": 248
},
{
"epoch": 9.932038834951456,
"grad_norm": 3.0504794120788574,
"learning_rate": 4.96e-05,
"loss": 2.9132,
"step": 249
},
{
"epoch": 9.970873786407767,
"grad_norm": 3.444287061691284,
"learning_rate": 4.9800000000000004e-05,
"loss": 2.973,
"step": 250
},
{
"epoch": 9.970873786407767,
"eval_loss": 3.089428186416626,
"eval_runtime": 0.9754,
"eval_samples_per_second": 22.555,
"eval_steps_per_second": 6.151,
"step": 250
},
{
"epoch": 10.03883495145631,
"grad_norm": 5.480017185211182,
"learning_rate": 5e-05,
"loss": 5.6196,
"step": 251
},
{
"epoch": 10.077669902912621,
"grad_norm": 3.3957669734954834,
"learning_rate": 4.999997563061038e-05,
"loss": 2.8152,
"step": 252
},
{
"epoch": 10.116504854368932,
"grad_norm": 2.6747496128082275,
"learning_rate": 4.9999902522489015e-05,
"loss": 2.8624,
"step": 253
},
{
"epoch": 10.155339805825243,
"grad_norm": 3.2186131477355957,
"learning_rate": 4.999978067577844e-05,
"loss": 2.7587,
"step": 254
},
{
"epoch": 10.194174757281553,
"grad_norm": 3.7385358810424805,
"learning_rate": 4.999961009071621e-05,
"loss": 2.8117,
"step": 255
},
{
"epoch": 10.233009708737864,
"grad_norm": 2.586005926132202,
"learning_rate": 4.999939076763487e-05,
"loss": 2.7617,
"step": 256
},
{
"epoch": 10.271844660194175,
"grad_norm": 2.7468533515930176,
"learning_rate": 4.999912270696202e-05,
"loss": 2.802,
"step": 257
},
{
"epoch": 10.310679611650485,
"grad_norm": 2.7268691062927246,
"learning_rate": 4.999880590922025e-05,
"loss": 2.7928,
"step": 258
},
{
"epoch": 10.349514563106796,
"grad_norm": 2.6305949687957764,
"learning_rate": 4.9998440375027166e-05,
"loss": 2.8245,
"step": 259
},
{
"epoch": 10.388349514563107,
"grad_norm": 2.8977084159851074,
"learning_rate": 4.9998026105095405e-05,
"loss": 2.7525,
"step": 260
},
{
"epoch": 10.427184466019417,
"grad_norm": 2.394578218460083,
"learning_rate": 4.999756310023261e-05,
"loss": 2.731,
"step": 261
},
{
"epoch": 10.466019417475728,
"grad_norm": 3.0859174728393555,
"learning_rate": 4.9997051361341425e-05,
"loss": 2.7902,
"step": 262
},
{
"epoch": 10.504854368932039,
"grad_norm": 2.929978370666504,
"learning_rate": 4.9996490889419514e-05,
"loss": 2.7723,
"step": 263
},
{
"epoch": 10.54368932038835,
"grad_norm": 2.6215100288391113,
"learning_rate": 4.999588168555954e-05,
"loss": 2.7892,
"step": 264
},
{
"epoch": 10.58252427184466,
"grad_norm": 2.744954824447632,
"learning_rate": 4.999522375094919e-05,
"loss": 2.8024,
"step": 265
},
{
"epoch": 10.62135922330097,
"grad_norm": 2.775912046432495,
"learning_rate": 4.999451708687114e-05,
"loss": 2.642,
"step": 266
},
{
"epoch": 10.660194174757281,
"grad_norm": 2.5821340084075928,
"learning_rate": 4.999376169470306e-05,
"loss": 2.7808,
"step": 267
},
{
"epoch": 10.699029126213592,
"grad_norm": 2.4101083278656006,
"learning_rate": 4.999295757591762e-05,
"loss": 2.7318,
"step": 268
},
{
"epoch": 10.737864077669903,
"grad_norm": 2.4816181659698486,
"learning_rate": 4.99921047320825e-05,
"loss": 2.7707,
"step": 269
},
{
"epoch": 10.776699029126213,
"grad_norm": 2.366009473800659,
"learning_rate": 4.9991203164860365e-05,
"loss": 2.7481,
"step": 270
},
{
"epoch": 10.815533980582524,
"grad_norm": 2.9792630672454834,
"learning_rate": 4.999025287600886e-05,
"loss": 2.7204,
"step": 271
},
{
"epoch": 10.854368932038835,
"grad_norm": 3.0781967639923096,
"learning_rate": 4.998925386738063e-05,
"loss": 2.7248,
"step": 272
},
{
"epoch": 10.893203883495145,
"grad_norm": 2.6866307258605957,
"learning_rate": 4.998820614092328e-05,
"loss": 2.7456,
"step": 273
},
{
"epoch": 10.932038834951456,
"grad_norm": 2.789808988571167,
"learning_rate": 4.998710969867942e-05,
"loss": 2.7224,
"step": 274
},
{
"epoch": 10.970873786407767,
"grad_norm": 2.4948067665100098,
"learning_rate": 4.9985964542786614e-05,
"loss": 2.6724,
"step": 275
},
{
"epoch": 10.970873786407767,
"eval_loss": 2.9974570274353027,
"eval_runtime": 0.9771,
"eval_samples_per_second": 22.516,
"eval_steps_per_second": 6.141,
"step": 275
},
{
"epoch": 11.03883495145631,
"grad_norm": 4.32741117477417,
"learning_rate": 4.99847706754774e-05,
"loss": 5.4423,
"step": 276
},
{
"epoch": 11.077669902912621,
"grad_norm": 2.3572208881378174,
"learning_rate": 4.998352809907928e-05,
"loss": 2.6372,
"step": 277
},
{
"epoch": 11.116504854368932,
"grad_norm": 2.4045934677124023,
"learning_rate": 4.998223681601473e-05,
"loss": 2.6205,
"step": 278
},
{
"epoch": 11.155339805825243,
"grad_norm": 2.5755131244659424,
"learning_rate": 4.998089682880117e-05,
"loss": 2.5939,
"step": 279
},
{
"epoch": 11.194174757281553,
"grad_norm": 2.5768463611602783,
"learning_rate": 4.997950814005098e-05,
"loss": 2.6925,
"step": 280
},
{
"epoch": 11.233009708737864,
"grad_norm": 2.5549166202545166,
"learning_rate": 4.997807075247146e-05,
"loss": 2.6172,
"step": 281
},
{
"epoch": 11.271844660194175,
"grad_norm": 2.761068344116211,
"learning_rate": 4.997658466886489e-05,
"loss": 2.6572,
"step": 282
},
{
"epoch": 11.310679611650485,
"grad_norm": 2.5051231384277344,
"learning_rate": 4.9975049892128455e-05,
"loss": 2.6549,
"step": 283
},
{
"epoch": 11.349514563106796,
"grad_norm": 2.7434117794036865,
"learning_rate": 4.9973466425254286e-05,
"loss": 2.5632,
"step": 284
},
{
"epoch": 11.388349514563107,
"grad_norm": 2.328563928604126,
"learning_rate": 4.997183427132943e-05,
"loss": 2.5751,
"step": 285
},
{
"epoch": 11.427184466019417,
"grad_norm": 2.7668466567993164,
"learning_rate": 4.997015343353585e-05,
"loss": 2.6609,
"step": 286
},
{
"epoch": 11.466019417475728,
"grad_norm": 2.0831525325775146,
"learning_rate": 4.996842391515044e-05,
"loss": 2.6428,
"step": 287
},
{
"epoch": 11.504854368932039,
"grad_norm": 2.4443278312683105,
"learning_rate": 4.996664571954497e-05,
"loss": 2.6012,
"step": 288
},
{
"epoch": 11.54368932038835,
"grad_norm": 2.4806153774261475,
"learning_rate": 4.9964818850186135e-05,
"loss": 2.6649,
"step": 289
},
{
"epoch": 11.58252427184466,
"grad_norm": 2.539933919906616,
"learning_rate": 4.99629433106355e-05,
"loss": 2.6253,
"step": 290
},
{
"epoch": 11.62135922330097,
"grad_norm": 2.7404544353485107,
"learning_rate": 4.996101910454953e-05,
"loss": 2.6224,
"step": 291
},
{
"epoch": 11.660194174757281,
"grad_norm": 2.5377357006073,
"learning_rate": 4.9959046235679565e-05,
"loss": 2.6249,
"step": 292
},
{
"epoch": 11.699029126213592,
"grad_norm": 2.8488271236419678,
"learning_rate": 4.9957024707871806e-05,
"loss": 2.6232,
"step": 293
},
{
"epoch": 11.737864077669903,
"grad_norm": 2.4895827770233154,
"learning_rate": 4.9954954525067334e-05,
"loss": 2.5983,
"step": 294
},
{
"epoch": 11.776699029126213,
"grad_norm": 3.038975954055786,
"learning_rate": 4.995283569130207e-05,
"loss": 2.5715,
"step": 295
},
{
"epoch": 11.815533980582524,
"grad_norm": 2.674245595932007,
"learning_rate": 4.995066821070679e-05,
"loss": 2.6201,
"step": 296
},
{
"epoch": 11.854368932038835,
"grad_norm": 3.5277645587921143,
"learning_rate": 4.9948452087507116e-05,
"loss": 2.6376,
"step": 297
},
{
"epoch": 11.893203883495145,
"grad_norm": 3.0974984169006348,
"learning_rate": 4.994618732602349e-05,
"loss": 2.6268,
"step": 298
},
{
"epoch": 11.932038834951456,
"grad_norm": 2.309119462966919,
"learning_rate": 4.994387393067117e-05,
"loss": 2.5594,
"step": 299
},
{
"epoch": 11.970873786407767,
"grad_norm": 2.540464162826538,
"learning_rate": 4.994151190596025e-05,
"loss": 2.5765,
"step": 300
},
{
"epoch": 11.970873786407767,
"eval_loss": 2.9208481311798096,
"eval_runtime": 1.0115,
"eval_samples_per_second": 21.749,
"eval_steps_per_second": 5.932,
"step": 300
},
{
"epoch": 12.03883495145631,
"grad_norm": 5.542501449584961,
"learning_rate": 4.993910125649561e-05,
"loss": 5.1943,
"step": 301
},
{
"epoch": 12.077669902912621,
"grad_norm": 2.2998414039611816,
"learning_rate": 4.993664198697694e-05,
"loss": 2.5311,
"step": 302
},
{
"epoch": 12.116504854368932,
"grad_norm": 3.0827107429504395,
"learning_rate": 4.993413410219871e-05,
"loss": 2.5587,
"step": 303
},
{
"epoch": 12.155339805825243,
"grad_norm": 2.7742204666137695,
"learning_rate": 4.9931577607050175e-05,
"loss": 2.4549,
"step": 304
},
{
"epoch": 12.194174757281553,
"grad_norm": 2.5605695247650146,
"learning_rate": 4.992897250651535e-05,
"loss": 2.5602,
"step": 305
},
{
"epoch": 12.233009708737864,
"grad_norm": 2.8852667808532715,
"learning_rate": 4.992631880567301e-05,
"loss": 2.5069,
"step": 306
},
{
"epoch": 12.271844660194175,
"grad_norm": 3.006777048110962,
"learning_rate": 4.9923616509696683e-05,
"loss": 2.5326,
"step": 307
},
{
"epoch": 12.310679611650485,
"grad_norm": 2.1645665168762207,
"learning_rate": 4.9920865623854615e-05,
"loss": 2.4739,
"step": 308
},
{
"epoch": 12.349514563106796,
"grad_norm": 2.941042423248291,
"learning_rate": 4.9918066153509834e-05,
"loss": 2.5149,
"step": 309
},
{
"epoch": 12.388349514563107,
"grad_norm": 2.598097562789917,
"learning_rate": 4.991521810412002e-05,
"loss": 2.5214,
"step": 310
},
{
"epoch": 12.427184466019417,
"grad_norm": 2.408721446990967,
"learning_rate": 4.991232148123761e-05,
"loss": 2.4747,
"step": 311
},
{
"epoch": 12.466019417475728,
"grad_norm": 2.39508318901062,
"learning_rate": 4.990937629050971e-05,
"loss": 2.5304,
"step": 312
},
{
"epoch": 12.504854368932039,
"grad_norm": 2.9436190128326416,
"learning_rate": 4.990638253767812e-05,
"loss": 2.5046,
"step": 313
},
{
"epoch": 12.54368932038835,
"grad_norm": 2.6037611961364746,
"learning_rate": 4.990334022857932e-05,
"loss": 2.4537,
"step": 314
},
{
"epoch": 12.58252427184466,
"grad_norm": 2.892789602279663,
"learning_rate": 4.9900249369144434e-05,
"loss": 2.4817,
"step": 315
},
{
"epoch": 12.62135922330097,
"grad_norm": 2.6804611682891846,
"learning_rate": 4.989710996539926e-05,
"loss": 2.5012,
"step": 316
},
{
"epoch": 12.660194174757281,
"grad_norm": 2.458824396133423,
"learning_rate": 4.9893922023464236e-05,
"loss": 2.4661,
"step": 317
},
{
"epoch": 12.699029126213592,
"grad_norm": 2.6641952991485596,
"learning_rate": 4.989068554955439e-05,
"loss": 2.4971,
"step": 318
},
{
"epoch": 12.737864077669903,
"grad_norm": 2.421142101287842,
"learning_rate": 4.988740054997943e-05,
"loss": 2.4014,
"step": 319
},
{
"epoch": 12.776699029126213,
"grad_norm": 2.4107542037963867,
"learning_rate": 4.98840670311436e-05,
"loss": 2.4636,
"step": 320
},
{
"epoch": 12.815533980582524,
"grad_norm": 2.5701303482055664,
"learning_rate": 4.988068499954578e-05,
"loss": 2.4564,
"step": 321
},
{
"epoch": 12.854368932038835,
"grad_norm": 2.3998067378997803,
"learning_rate": 4.987725446177941e-05,
"loss": 2.4561,
"step": 322
},
{
"epoch": 12.893203883495145,
"grad_norm": 2.6888773441314697,
"learning_rate": 4.987377542453251e-05,
"loss": 2.4392,
"step": 323
},
{
"epoch": 12.932038834951456,
"grad_norm": 2.313508987426758,
"learning_rate": 4.987024789458762e-05,
"loss": 2.4438,
"step": 324
},
{
"epoch": 12.970873786407767,
"grad_norm": 2.5614566802978516,
"learning_rate": 4.986667187882186e-05,
"loss": 2.557,
"step": 325
},
{
"epoch": 12.970873786407767,
"eval_loss": 2.8833444118499756,
"eval_runtime": 1.0479,
"eval_samples_per_second": 20.995,
"eval_steps_per_second": 5.726,
"step": 325
},
{
"epoch": 13.03883495145631,
"grad_norm": 5.110360145568848,
"learning_rate": 4.9863047384206835e-05,
"loss": 4.9144,
"step": 326
},
{
"epoch": 13.077669902912621,
"grad_norm": 2.7373085021972656,
"learning_rate": 4.98593744178087e-05,
"loss": 2.3994,
"step": 327
},
{
"epoch": 13.116504854368932,
"grad_norm": 2.542954206466675,
"learning_rate": 4.985565298678809e-05,
"loss": 2.3535,
"step": 328
},
{
"epoch": 13.155339805825243,
"grad_norm": 2.6374223232269287,
"learning_rate": 4.985188309840012e-05,
"loss": 2.3894,
"step": 329
},
{
"epoch": 13.194174757281553,
"grad_norm": 2.541004180908203,
"learning_rate": 4.984806475999437e-05,
"loss": 2.391,
"step": 330
},
{
"epoch": 13.233009708737864,
"grad_norm": 2.6150271892547607,
"learning_rate": 4.984419797901491e-05,
"loss": 2.3927,
"step": 331
},
{
"epoch": 13.271844660194175,
"grad_norm": 2.47719144821167,
"learning_rate": 4.984028276300021e-05,
"loss": 2.3751,
"step": 332
},
{
"epoch": 13.310679611650485,
"grad_norm": 2.679882764816284,
"learning_rate": 4.983631911958319e-05,
"loss": 2.374,
"step": 333
},
{
"epoch": 13.349514563106796,
"grad_norm": 2.784619092941284,
"learning_rate": 4.983230705649118e-05,
"loss": 2.3539,
"step": 334
},
{
"epoch": 13.388349514563107,
"grad_norm": 2.188197135925293,
"learning_rate": 4.982824658154589e-05,
"loss": 2.3553,
"step": 335
},
{
"epoch": 13.427184466019417,
"grad_norm": 2.232978582382202,
"learning_rate": 4.982413770266342e-05,
"loss": 2.3389,
"step": 336
},
{
"epoch": 13.466019417475728,
"grad_norm": 2.563889980316162,
"learning_rate": 4.981998042785427e-05,
"loss": 2.3623,
"step": 337
},
{
"epoch": 13.504854368932039,
"grad_norm": 2.9053828716278076,
"learning_rate": 4.9815774765223226e-05,
"loss": 2.3705,
"step": 338
},
{
"epoch": 13.54368932038835,
"grad_norm": 2.5447866916656494,
"learning_rate": 4.9811520722969465e-05,
"loss": 2.3216,
"step": 339
},
{
"epoch": 13.58252427184466,
"grad_norm": 3.22255277633667,
"learning_rate": 4.9807218309386444e-05,
"loss": 2.3418,
"step": 340
},
{
"epoch": 13.62135922330097,
"grad_norm": 3.154477119445801,
"learning_rate": 4.980286753286195e-05,
"loss": 2.3843,
"step": 341
},
{
"epoch": 13.660194174757281,
"grad_norm": 3.3448827266693115,
"learning_rate": 4.979846840187804e-05,
"loss": 2.419,
"step": 342
},
{
"epoch": 13.699029126213592,
"grad_norm": 3.275527238845825,
"learning_rate": 4.9794020925011044e-05,
"loss": 2.3756,
"step": 343
},
{
"epoch": 13.737864077669903,
"grad_norm": 2.3320887088775635,
"learning_rate": 4.9789525110931545e-05,
"loss": 2.3201,
"step": 344
},
{
"epoch": 13.776699029126213,
"grad_norm": 2.804107427597046,
"learning_rate": 4.978498096840436e-05,
"loss": 2.3461,
"step": 345
},
{
"epoch": 13.815533980582524,
"grad_norm": 2.809633255004883,
"learning_rate": 4.978038850628854e-05,
"loss": 2.3418,
"step": 346
},
{
"epoch": 13.854368932038835,
"grad_norm": 2.9983737468719482,
"learning_rate": 4.977574773353732e-05,
"loss": 2.4238,
"step": 347
},
{
"epoch": 13.893203883495145,
"grad_norm": 2.892005443572998,
"learning_rate": 4.977105865919812e-05,
"loss": 2.4266,
"step": 348
},
{
"epoch": 13.932038834951456,
"grad_norm": 2.766019821166992,
"learning_rate": 4.976632129241252e-05,
"loss": 2.3937,
"step": 349
},
{
"epoch": 13.970873786407767,
"grad_norm": 2.5251376628875732,
"learning_rate": 4.976153564241628e-05,
"loss": 2.3557,
"step": 350
},
{
"epoch": 13.970873786407767,
"eval_loss": 2.855170965194702,
"eval_runtime": 1.0307,
"eval_samples_per_second": 21.345,
"eval_steps_per_second": 5.821,
"step": 350
},
{
"epoch": 14.03883495145631,
"grad_norm": 5.263445854187012,
"learning_rate": 4.975670171853926e-05,
"loss": 4.6103,
"step": 351
},
{
"epoch": 14.077669902912621,
"grad_norm": 2.6694159507751465,
"learning_rate": 4.975181953020544e-05,
"loss": 2.2714,
"step": 352
},
{
"epoch": 14.116504854368932,
"grad_norm": 3.4369680881500244,
"learning_rate": 4.9746889086932895e-05,
"loss": 2.2303,
"step": 353
},
{
"epoch": 14.155339805825243,
"grad_norm": 3.053704023361206,
"learning_rate": 4.974191039833378e-05,
"loss": 2.2659,
"step": 354
},
{
"epoch": 14.194174757281553,
"grad_norm": 2.9966983795166016,
"learning_rate": 4.973688347411431e-05,
"loss": 2.3092,
"step": 355
},
{
"epoch": 14.233009708737864,
"grad_norm": 2.965481758117676,
"learning_rate": 4.9731808324074717e-05,
"loss": 2.2537,
"step": 356
},
{
"epoch": 14.271844660194175,
"grad_norm": 2.9761455059051514,
"learning_rate": 4.9726684958109266e-05,
"loss": 2.2865,
"step": 357
},
{
"epoch": 14.310679611650485,
"grad_norm": 2.936624050140381,
"learning_rate": 4.972151338620623e-05,
"loss": 2.2589,
"step": 358
},
{
"epoch": 14.349514563106796,
"grad_norm": 3.4442408084869385,
"learning_rate": 4.971629361844785e-05,
"loss": 2.2636,
"step": 359
},
{
"epoch": 14.388349514563107,
"grad_norm": 3.0097110271453857,
"learning_rate": 4.971102566501034e-05,
"loss": 2.204,
"step": 360
},
{
"epoch": 14.427184466019417,
"grad_norm": 3.7276322841644287,
"learning_rate": 4.9705709536163824e-05,
"loss": 2.2811,
"step": 361
},
{
"epoch": 14.466019417475728,
"grad_norm": 2.8004868030548096,
"learning_rate": 4.970034524227238e-05,
"loss": 2.1964,
"step": 362
},
{
"epoch": 14.504854368932039,
"grad_norm": 3.1439263820648193,
"learning_rate": 4.969493279379398e-05,
"loss": 2.294,
"step": 363
},
{
"epoch": 14.54368932038835,
"grad_norm": 2.971735954284668,
"learning_rate": 4.968947220128045e-05,
"loss": 2.2882,
"step": 364
},
{
"epoch": 14.58252427184466,
"grad_norm": 2.860797166824341,
"learning_rate": 4.968396347537751e-05,
"loss": 2.1807,
"step": 365
},
{
"epoch": 14.62135922330097,
"grad_norm": 2.8869500160217285,
"learning_rate": 4.96784066268247e-05,
"loss": 2.267,
"step": 366
},
{
"epoch": 14.660194174757281,
"grad_norm": 3.185670852661133,
"learning_rate": 4.967280166645538e-05,
"loss": 2.2956,
"step": 367
},
{
"epoch": 14.699029126213592,
"grad_norm": 2.750898838043213,
"learning_rate": 4.96671486051967e-05,
"loss": 2.2429,
"step": 368
},
{
"epoch": 14.737864077669903,
"grad_norm": 2.690889596939087,
"learning_rate": 4.966144745406961e-05,
"loss": 2.2645,
"step": 369
},
{
"epoch": 14.776699029126213,
"grad_norm": 2.5257797241210938,
"learning_rate": 4.965569822418877e-05,
"loss": 2.1714,
"step": 370
},
{
"epoch": 14.815533980582524,
"grad_norm": 2.550966739654541,
"learning_rate": 4.964990092676263e-05,
"loss": 2.2281,
"step": 371
},
{
"epoch": 14.854368932038835,
"grad_norm": 2.6299831867218018,
"learning_rate": 4.964405557309328e-05,
"loss": 2.2925,
"step": 372
},
{
"epoch": 14.893203883495145,
"grad_norm": 2.8115315437316895,
"learning_rate": 4.963816217457657e-05,
"loss": 2.3404,
"step": 373
},
{
"epoch": 14.932038834951456,
"grad_norm": 2.646278142929077,
"learning_rate": 4.9632220742701965e-05,
"loss": 2.2326,
"step": 374
},
{
"epoch": 14.970873786407767,
"grad_norm": 2.667069435119629,
"learning_rate": 4.9626231289052596e-05,
"loss": 2.318,
"step": 375
},
{
"epoch": 14.970873786407767,
"eval_loss": 2.8448235988616943,
"eval_runtime": 0.9921,
"eval_samples_per_second": 22.176,
"eval_steps_per_second": 6.048,
"step": 375
}
],
"logging_steps": 1,
"max_steps": 2500,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3395309036544000.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}