qwen7b / trainer_state.json
Mia Fournier
Upload folder using huggingface_hub
188c360 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.999136690647482,
"eval_steps": 163,
"global_step": 651,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0015347721822541965,
"grad_norm": 4.09375,
"learning_rate": 3.846153846153847e-07,
"loss": 1.5132,
"step": 1
},
{
"epoch": 0.0015347721822541965,
"eval_loss": 1.6450660228729248,
"eval_runtime": 59.1102,
"eval_samples_per_second": 21.215,
"eval_steps_per_second": 21.215,
"step": 1
},
{
"epoch": 0.003069544364508393,
"grad_norm": 4.0,
"learning_rate": 7.692307692307694e-07,
"loss": 1.7745,
"step": 2
},
{
"epoch": 0.00460431654676259,
"grad_norm": 3.5625,
"learning_rate": 1.153846153846154e-06,
"loss": 1.4893,
"step": 3
},
{
"epoch": 0.006139088729016786,
"grad_norm": 4.21875,
"learning_rate": 1.5384615384615387e-06,
"loss": 1.593,
"step": 4
},
{
"epoch": 0.007673860911270983,
"grad_norm": 3.765625,
"learning_rate": 1.9230769230769234e-06,
"loss": 1.5801,
"step": 5
},
{
"epoch": 0.00920863309352518,
"grad_norm": 3.65625,
"learning_rate": 2.307692307692308e-06,
"loss": 1.6869,
"step": 6
},
{
"epoch": 0.010743405275779376,
"grad_norm": 3.625,
"learning_rate": 2.6923076923076923e-06,
"loss": 1.6167,
"step": 7
},
{
"epoch": 0.012278177458033572,
"grad_norm": 3.90625,
"learning_rate": 3.0769230769230774e-06,
"loss": 1.6349,
"step": 8
},
{
"epoch": 0.01381294964028777,
"grad_norm": 3.28125,
"learning_rate": 3.4615384615384617e-06,
"loss": 1.5281,
"step": 9
},
{
"epoch": 0.015347721822541967,
"grad_norm": 3.46875,
"learning_rate": 3.846153846153847e-06,
"loss": 1.7309,
"step": 10
},
{
"epoch": 0.01688249400479616,
"grad_norm": 3.625,
"learning_rate": 4.230769230769231e-06,
"loss": 1.6916,
"step": 11
},
{
"epoch": 0.01841726618705036,
"grad_norm": 3.46875,
"learning_rate": 4.615384615384616e-06,
"loss": 1.7503,
"step": 12
},
{
"epoch": 0.019952038369304557,
"grad_norm": 3.078125,
"learning_rate": 5e-06,
"loss": 1.6742,
"step": 13
},
{
"epoch": 0.021486810551558752,
"grad_norm": 2.8125,
"learning_rate": 5.384615384615385e-06,
"loss": 1.6483,
"step": 14
},
{
"epoch": 0.02302158273381295,
"grad_norm": 2.71875,
"learning_rate": 5.769230769230769e-06,
"loss": 1.6429,
"step": 15
},
{
"epoch": 0.024556354916067145,
"grad_norm": 2.390625,
"learning_rate": 6.153846153846155e-06,
"loss": 1.4412,
"step": 16
},
{
"epoch": 0.026091127098321343,
"grad_norm": 2.484375,
"learning_rate": 6.538461538461539e-06,
"loss": 1.4474,
"step": 17
},
{
"epoch": 0.02762589928057554,
"grad_norm": 2.546875,
"learning_rate": 6.923076923076923e-06,
"loss": 1.4794,
"step": 18
},
{
"epoch": 0.029160671462829735,
"grad_norm": 2.84375,
"learning_rate": 7.307692307692308e-06,
"loss": 1.5863,
"step": 19
},
{
"epoch": 0.030695443645083934,
"grad_norm": 2.5625,
"learning_rate": 7.692307692307694e-06,
"loss": 1.7385,
"step": 20
},
{
"epoch": 0.03223021582733813,
"grad_norm": 2.328125,
"learning_rate": 8.076923076923077e-06,
"loss": 1.5931,
"step": 21
},
{
"epoch": 0.03376498800959232,
"grad_norm": 2.1875,
"learning_rate": 8.461538461538462e-06,
"loss": 1.3675,
"step": 22
},
{
"epoch": 0.035299760191846524,
"grad_norm": 2.109375,
"learning_rate": 8.846153846153847e-06,
"loss": 1.5693,
"step": 23
},
{
"epoch": 0.03683453237410072,
"grad_norm": 2.125,
"learning_rate": 9.230769230769232e-06,
"loss": 1.572,
"step": 24
},
{
"epoch": 0.03836930455635491,
"grad_norm": 1.953125,
"learning_rate": 9.615384615384616e-06,
"loss": 1.5319,
"step": 25
},
{
"epoch": 0.039904076738609115,
"grad_norm": 2.0,
"learning_rate": 1e-05,
"loss": 1.4973,
"step": 26
},
{
"epoch": 0.04143884892086331,
"grad_norm": 1.921875,
"learning_rate": 1.0384615384615386e-05,
"loss": 1.6304,
"step": 27
},
{
"epoch": 0.042973621103117504,
"grad_norm": 2.03125,
"learning_rate": 1.076923076923077e-05,
"loss": 1.5787,
"step": 28
},
{
"epoch": 0.044508393285371706,
"grad_norm": 1.8515625,
"learning_rate": 1.1153846153846154e-05,
"loss": 1.4448,
"step": 29
},
{
"epoch": 0.0460431654676259,
"grad_norm": 1.8203125,
"learning_rate": 1.1538461538461538e-05,
"loss": 1.455,
"step": 30
},
{
"epoch": 0.047577937649880095,
"grad_norm": 1.8203125,
"learning_rate": 1.1923076923076925e-05,
"loss": 1.479,
"step": 31
},
{
"epoch": 0.04911270983213429,
"grad_norm": 1.7890625,
"learning_rate": 1.230769230769231e-05,
"loss": 1.4277,
"step": 32
},
{
"epoch": 0.05064748201438849,
"grad_norm": 1.7890625,
"learning_rate": 1.2692307692307693e-05,
"loss": 1.5078,
"step": 33
},
{
"epoch": 0.052182254196642686,
"grad_norm": 1.8125,
"learning_rate": 1.3076923076923078e-05,
"loss": 1.337,
"step": 34
},
{
"epoch": 0.05371702637889688,
"grad_norm": 1.9296875,
"learning_rate": 1.3461538461538463e-05,
"loss": 1.4207,
"step": 35
},
{
"epoch": 0.05525179856115108,
"grad_norm": 1.8671875,
"learning_rate": 1.3846153846153847e-05,
"loss": 1.5279,
"step": 36
},
{
"epoch": 0.056786570743405276,
"grad_norm": 1.8671875,
"learning_rate": 1.4230769230769232e-05,
"loss": 1.5279,
"step": 37
},
{
"epoch": 0.05832134292565947,
"grad_norm": 1.8671875,
"learning_rate": 1.4615384615384615e-05,
"loss": 1.3525,
"step": 38
},
{
"epoch": 0.05985611510791367,
"grad_norm": 1.8671875,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.5602,
"step": 39
},
{
"epoch": 0.06139088729016787,
"grad_norm": 1.8671875,
"learning_rate": 1.5384615384615387e-05,
"loss": 1.6014,
"step": 40
},
{
"epoch": 0.06292565947242207,
"grad_norm": 1.8515625,
"learning_rate": 1.576923076923077e-05,
"loss": 1.4031,
"step": 41
},
{
"epoch": 0.06446043165467626,
"grad_norm": 1.9140625,
"learning_rate": 1.6153846153846154e-05,
"loss": 1.6676,
"step": 42
},
{
"epoch": 0.06599520383693046,
"grad_norm": 1.71875,
"learning_rate": 1.653846153846154e-05,
"loss": 1.4937,
"step": 43
},
{
"epoch": 0.06752997601918465,
"grad_norm": 1.7265625,
"learning_rate": 1.6923076923076924e-05,
"loss": 1.5643,
"step": 44
},
{
"epoch": 0.06906474820143885,
"grad_norm": 1.7265625,
"learning_rate": 1.730769230769231e-05,
"loss": 1.4824,
"step": 45
},
{
"epoch": 0.07059952038369305,
"grad_norm": 1.7578125,
"learning_rate": 1.7692307692307694e-05,
"loss": 1.4603,
"step": 46
},
{
"epoch": 0.07213429256594724,
"grad_norm": 1.859375,
"learning_rate": 1.807692307692308e-05,
"loss": 1.5862,
"step": 47
},
{
"epoch": 0.07366906474820144,
"grad_norm": 1.8125,
"learning_rate": 1.8461538461538465e-05,
"loss": 1.4719,
"step": 48
},
{
"epoch": 0.07520383693045564,
"grad_norm": 1.90625,
"learning_rate": 1.8846153846153846e-05,
"loss": 1.4937,
"step": 49
},
{
"epoch": 0.07673860911270983,
"grad_norm": 1.9453125,
"learning_rate": 1.923076923076923e-05,
"loss": 1.5793,
"step": 50
},
{
"epoch": 0.07827338129496403,
"grad_norm": 1.84375,
"learning_rate": 1.9615384615384617e-05,
"loss": 1.3622,
"step": 51
},
{
"epoch": 0.07980815347721823,
"grad_norm": 1.71875,
"learning_rate": 2e-05,
"loss": 1.2495,
"step": 52
},
{
"epoch": 0.08134292565947242,
"grad_norm": 1.765625,
"learning_rate": 1.9999968417282542e-05,
"loss": 1.4069,
"step": 53
},
{
"epoch": 0.08287769784172662,
"grad_norm": 1.90625,
"learning_rate": 1.999987366932966e-05,
"loss": 1.49,
"step": 54
},
{
"epoch": 0.08441247002398082,
"grad_norm": 1.7265625,
"learning_rate": 1.9999715756739833e-05,
"loss": 1.4333,
"step": 55
},
{
"epoch": 0.08594724220623501,
"grad_norm": 1.9375,
"learning_rate": 1.999949468051052e-05,
"loss": 1.5375,
"step": 56
},
{
"epoch": 0.08748201438848921,
"grad_norm": 2.0,
"learning_rate": 1.9999210442038164e-05,
"loss": 1.5002,
"step": 57
},
{
"epoch": 0.08901678657074341,
"grad_norm": 1.8203125,
"learning_rate": 1.9998863043118163e-05,
"loss": 1.5414,
"step": 58
},
{
"epoch": 0.0905515587529976,
"grad_norm": 1.9375,
"learning_rate": 1.999845248594489e-05,
"loss": 1.4547,
"step": 59
},
{
"epoch": 0.0920863309352518,
"grad_norm": 1.8359375,
"learning_rate": 1.999797877311163e-05,
"loss": 1.308,
"step": 60
},
{
"epoch": 0.093621103117506,
"grad_norm": 1.6875,
"learning_rate": 1.9997441907610624e-05,
"loss": 1.4789,
"step": 61
},
{
"epoch": 0.09515587529976019,
"grad_norm": 1.7890625,
"learning_rate": 1.9996841892833e-05,
"loss": 1.2503,
"step": 62
},
{
"epoch": 0.09669064748201439,
"grad_norm": 1.828125,
"learning_rate": 1.9996178732568784e-05,
"loss": 1.4519,
"step": 63
},
{
"epoch": 0.09822541966426858,
"grad_norm": 1.7109375,
"learning_rate": 1.9995452431006844e-05,
"loss": 1.2409,
"step": 64
},
{
"epoch": 0.09976019184652278,
"grad_norm": 1.7890625,
"learning_rate": 1.999466299273491e-05,
"loss": 1.4345,
"step": 65
},
{
"epoch": 0.10129496402877698,
"grad_norm": 1.7421875,
"learning_rate": 1.9993810422739496e-05,
"loss": 1.4491,
"step": 66
},
{
"epoch": 0.10282973621103117,
"grad_norm": 1.7265625,
"learning_rate": 1.9992894726405894e-05,
"loss": 1.4475,
"step": 67
},
{
"epoch": 0.10436450839328537,
"grad_norm": 1.78125,
"learning_rate": 1.9991915909518146e-05,
"loss": 1.4997,
"step": 68
},
{
"epoch": 0.10589928057553957,
"grad_norm": 1.6640625,
"learning_rate": 1.999087397825899e-05,
"loss": 1.4607,
"step": 69
},
{
"epoch": 0.10743405275779376,
"grad_norm": 1.7421875,
"learning_rate": 1.9989768939209826e-05,
"loss": 1.4236,
"step": 70
},
{
"epoch": 0.10896882494004796,
"grad_norm": 1.8359375,
"learning_rate": 1.9988600799350685e-05,
"loss": 1.4913,
"step": 71
},
{
"epoch": 0.11050359712230216,
"grad_norm": 1.8046875,
"learning_rate": 1.998736956606018e-05,
"loss": 1.5473,
"step": 72
},
{
"epoch": 0.11203836930455635,
"grad_norm": 1.75,
"learning_rate": 1.998607524711543e-05,
"loss": 1.4922,
"step": 73
},
{
"epoch": 0.11357314148681055,
"grad_norm": 1.703125,
"learning_rate": 1.998471785069208e-05,
"loss": 1.3825,
"step": 74
},
{
"epoch": 0.11510791366906475,
"grad_norm": 1.7734375,
"learning_rate": 1.9983297385364166e-05,
"loss": 1.4647,
"step": 75
},
{
"epoch": 0.11664268585131894,
"grad_norm": 1.7734375,
"learning_rate": 1.998181386010413e-05,
"loss": 1.4448,
"step": 76
},
{
"epoch": 0.11817745803357314,
"grad_norm": 1.8515625,
"learning_rate": 1.9980267284282718e-05,
"loss": 1.5147,
"step": 77
},
{
"epoch": 0.11971223021582734,
"grad_norm": 1.703125,
"learning_rate": 1.9978657667668945e-05,
"loss": 1.4692,
"step": 78
},
{
"epoch": 0.12124700239808153,
"grad_norm": 1.9296875,
"learning_rate": 1.9976985020430022e-05,
"loss": 1.5878,
"step": 79
},
{
"epoch": 0.12278177458033573,
"grad_norm": 1.6796875,
"learning_rate": 1.9975249353131304e-05,
"loss": 1.3711,
"step": 80
},
{
"epoch": 0.12431654676258992,
"grad_norm": 1.8515625,
"learning_rate": 1.9973450676736205e-05,
"loss": 1.5019,
"step": 81
},
{
"epoch": 0.12585131894484414,
"grad_norm": 1.8671875,
"learning_rate": 1.997158900260614e-05,
"loss": 1.4792,
"step": 82
},
{
"epoch": 0.1273860911270983,
"grad_norm": 1.7265625,
"learning_rate": 1.996966434250046e-05,
"loss": 1.4263,
"step": 83
},
{
"epoch": 0.1289208633093525,
"grad_norm": 1.8125,
"learning_rate": 1.9967676708576362e-05,
"loss": 1.507,
"step": 84
},
{
"epoch": 0.13045563549160671,
"grad_norm": 1.8359375,
"learning_rate": 1.9965626113388823e-05,
"loss": 1.4734,
"step": 85
},
{
"epoch": 0.13199040767386092,
"grad_norm": 1.75,
"learning_rate": 1.9963512569890512e-05,
"loss": 1.4456,
"step": 86
},
{
"epoch": 0.13352517985611512,
"grad_norm": 1.7578125,
"learning_rate": 1.9961336091431728e-05,
"loss": 1.2601,
"step": 87
},
{
"epoch": 0.1350599520383693,
"grad_norm": 1.8671875,
"learning_rate": 1.9959096691760284e-05,
"loss": 1.5481,
"step": 88
},
{
"epoch": 0.1365947242206235,
"grad_norm": 1.75,
"learning_rate": 1.9956794385021444e-05,
"loss": 1.4177,
"step": 89
},
{
"epoch": 0.1381294964028777,
"grad_norm": 1.8046875,
"learning_rate": 1.9954429185757835e-05,
"loss": 1.6005,
"step": 90
},
{
"epoch": 0.1396642685851319,
"grad_norm": 1.7109375,
"learning_rate": 1.9952001108909336e-05,
"loss": 1.4551,
"step": 91
},
{
"epoch": 0.1411990407673861,
"grad_norm": 1.75,
"learning_rate": 1.9949510169813006e-05,
"loss": 1.4133,
"step": 92
},
{
"epoch": 0.1427338129496403,
"grad_norm": 1.828125,
"learning_rate": 1.994695638420296e-05,
"loss": 1.4024,
"step": 93
},
{
"epoch": 0.14426858513189447,
"grad_norm": 1.828125,
"learning_rate": 1.994433976821031e-05,
"loss": 1.4194,
"step": 94
},
{
"epoch": 0.14580335731414867,
"grad_norm": 1.703125,
"learning_rate": 1.9941660338363008e-05,
"loss": 1.5165,
"step": 95
},
{
"epoch": 0.14733812949640288,
"grad_norm": 1.875,
"learning_rate": 1.9938918111585805e-05,
"loss": 1.5456,
"step": 96
},
{
"epoch": 0.14887290167865708,
"grad_norm": 1.7734375,
"learning_rate": 1.9936113105200085e-05,
"loss": 1.5324,
"step": 97
},
{
"epoch": 0.15040767386091128,
"grad_norm": 1.828125,
"learning_rate": 1.9933245336923798e-05,
"loss": 1.3492,
"step": 98
},
{
"epoch": 0.15194244604316548,
"grad_norm": 1.7421875,
"learning_rate": 1.9930314824871326e-05,
"loss": 1.4851,
"step": 99
},
{
"epoch": 0.15347721822541965,
"grad_norm": 1.6953125,
"learning_rate": 1.9927321587553378e-05,
"loss": 1.4352,
"step": 100
},
{
"epoch": 0.15501199040767386,
"grad_norm": 1.75,
"learning_rate": 1.992426564387686e-05,
"loss": 1.4341,
"step": 101
},
{
"epoch": 0.15654676258992806,
"grad_norm": 1.7578125,
"learning_rate": 1.9921147013144782e-05,
"loss": 1.5551,
"step": 102
},
{
"epoch": 0.15808153477218226,
"grad_norm": 1.9140625,
"learning_rate": 1.9917965715056106e-05,
"loss": 1.4501,
"step": 103
},
{
"epoch": 0.15961630695443646,
"grad_norm": 1.7734375,
"learning_rate": 1.9914721769705637e-05,
"loss": 1.4443,
"step": 104
},
{
"epoch": 0.16115107913669063,
"grad_norm": 1.765625,
"learning_rate": 1.9911415197583904e-05,
"loss": 1.4481,
"step": 105
},
{
"epoch": 0.16268585131894484,
"grad_norm": 1.7109375,
"learning_rate": 1.990804601957701e-05,
"loss": 1.4376,
"step": 106
},
{
"epoch": 0.16422062350119904,
"grad_norm": 1.7890625,
"learning_rate": 1.9904614256966514e-05,
"loss": 1.5503,
"step": 107
},
{
"epoch": 0.16575539568345324,
"grad_norm": 1.7421875,
"learning_rate": 1.9901119931429294e-05,
"loss": 1.406,
"step": 108
},
{
"epoch": 0.16729016786570744,
"grad_norm": 1.7421875,
"learning_rate": 1.9897563065037412e-05,
"loss": 1.3818,
"step": 109
},
{
"epoch": 0.16882494004796164,
"grad_norm": 1.703125,
"learning_rate": 1.9893943680257964e-05,
"loss": 1.3271,
"step": 110
},
{
"epoch": 0.17035971223021582,
"grad_norm": 1.859375,
"learning_rate": 1.9890261799952954e-05,
"loss": 1.5972,
"step": 111
},
{
"epoch": 0.17189448441247002,
"grad_norm": 1.8671875,
"learning_rate": 1.988651744737914e-05,
"loss": 1.5008,
"step": 112
},
{
"epoch": 0.17342925659472422,
"grad_norm": 1.7578125,
"learning_rate": 1.988271064618789e-05,
"loss": 1.3596,
"step": 113
},
{
"epoch": 0.17496402877697842,
"grad_norm": 1.75,
"learning_rate": 1.9878841420425023e-05,
"loss": 1.3632,
"step": 114
},
{
"epoch": 0.17649880095923262,
"grad_norm": 1.765625,
"learning_rate": 1.9874909794530677e-05,
"loss": 1.6275,
"step": 115
},
{
"epoch": 0.17803357314148682,
"grad_norm": 1.703125,
"learning_rate": 1.9870915793339137e-05,
"loss": 1.4541,
"step": 116
},
{
"epoch": 0.179568345323741,
"grad_norm": 1.8671875,
"learning_rate": 1.986685944207868e-05,
"loss": 1.4712,
"step": 117
},
{
"epoch": 0.1811031175059952,
"grad_norm": 1.75,
"learning_rate": 1.9862740766371434e-05,
"loss": 1.5007,
"step": 118
},
{
"epoch": 0.1826378896882494,
"grad_norm": 1.8828125,
"learning_rate": 1.985855979223319e-05,
"loss": 1.4757,
"step": 119
},
{
"epoch": 0.1841726618705036,
"grad_norm": 1.8046875,
"learning_rate": 1.985431654607325e-05,
"loss": 1.3876,
"step": 120
},
{
"epoch": 0.1857074340527578,
"grad_norm": 1.7265625,
"learning_rate": 1.9850011054694264e-05,
"loss": 1.4466,
"step": 121
},
{
"epoch": 0.187242206235012,
"grad_norm": 1.734375,
"learning_rate": 1.9845643345292055e-05,
"loss": 1.3912,
"step": 122
},
{
"epoch": 0.18877697841726618,
"grad_norm": 1.875,
"learning_rate": 1.984121344545545e-05,
"loss": 1.4602,
"step": 123
},
{
"epoch": 0.19031175059952038,
"grad_norm": 1.90625,
"learning_rate": 1.983672138316611e-05,
"loss": 1.5255,
"step": 124
},
{
"epoch": 0.19184652278177458,
"grad_norm": 1.7734375,
"learning_rate": 1.9832167186798333e-05,
"loss": 1.3831,
"step": 125
},
{
"epoch": 0.19338129496402878,
"grad_norm": 1.7734375,
"learning_rate": 1.9827550885118902e-05,
"loss": 1.478,
"step": 126
},
{
"epoch": 0.19491606714628298,
"grad_norm": 1.7890625,
"learning_rate": 1.982287250728689e-05,
"loss": 1.5094,
"step": 127
},
{
"epoch": 0.19645083932853716,
"grad_norm": 1.8046875,
"learning_rate": 1.9818132082853466e-05,
"loss": 1.5228,
"step": 128
},
{
"epoch": 0.19798561151079136,
"grad_norm": 1.7421875,
"learning_rate": 1.9813329641761738e-05,
"loss": 1.4506,
"step": 129
},
{
"epoch": 0.19952038369304556,
"grad_norm": 1.8984375,
"learning_rate": 1.9808465214346525e-05,
"loss": 1.5035,
"step": 130
},
{
"epoch": 0.20105515587529976,
"grad_norm": 1.71875,
"learning_rate": 1.98035388313342e-05,
"loss": 1.4746,
"step": 131
},
{
"epoch": 0.20258992805755396,
"grad_norm": 1.6875,
"learning_rate": 1.979855052384247e-05,
"loss": 1.5071,
"step": 132
},
{
"epoch": 0.20412470023980817,
"grad_norm": 1.7265625,
"learning_rate": 1.97935003233802e-05,
"loss": 1.472,
"step": 133
},
{
"epoch": 0.20565947242206234,
"grad_norm": 1.6640625,
"learning_rate": 1.9788388261847204e-05,
"loss": 1.3509,
"step": 134
},
{
"epoch": 0.20719424460431654,
"grad_norm": 1.6640625,
"learning_rate": 1.9783214371534037e-05,
"loss": 1.4248,
"step": 135
},
{
"epoch": 0.20872901678657074,
"grad_norm": 1.734375,
"learning_rate": 1.97779786851218e-05,
"loss": 1.5036,
"step": 136
},
{
"epoch": 0.21026378896882494,
"grad_norm": 1.7265625,
"learning_rate": 1.9772681235681936e-05,
"loss": 1.4228,
"step": 137
},
{
"epoch": 0.21179856115107915,
"grad_norm": 1.6875,
"learning_rate": 1.9767322056676018e-05,
"loss": 1.4464,
"step": 138
},
{
"epoch": 0.21333333333333335,
"grad_norm": 1.84375,
"learning_rate": 1.976190118195553e-05,
"loss": 1.3816,
"step": 139
},
{
"epoch": 0.21486810551558752,
"grad_norm": 1.734375,
"learning_rate": 1.975641864576166e-05,
"loss": 1.5671,
"step": 140
},
{
"epoch": 0.21640287769784172,
"grad_norm": 1.7421875,
"learning_rate": 1.9750874482725093e-05,
"loss": 1.4326,
"step": 141
},
{
"epoch": 0.21793764988009592,
"grad_norm": 1.90625,
"learning_rate": 1.9745268727865774e-05,
"loss": 1.5385,
"step": 142
},
{
"epoch": 0.21947242206235013,
"grad_norm": 1.8984375,
"learning_rate": 1.9739601416592693e-05,
"loss": 1.2213,
"step": 143
},
{
"epoch": 0.22100719424460433,
"grad_norm": 1.7890625,
"learning_rate": 1.9733872584703673e-05,
"loss": 1.4447,
"step": 144
},
{
"epoch": 0.2225419664268585,
"grad_norm": 1.59375,
"learning_rate": 1.9728082268385126e-05,
"loss": 1.5023,
"step": 145
},
{
"epoch": 0.2240767386091127,
"grad_norm": 1.6875,
"learning_rate": 1.9722230504211843e-05,
"loss": 1.496,
"step": 146
},
{
"epoch": 0.2256115107913669,
"grad_norm": 1.59375,
"learning_rate": 1.971631732914674e-05,
"loss": 1.4072,
"step": 147
},
{
"epoch": 0.2271462829736211,
"grad_norm": 1.6875,
"learning_rate": 1.971034278054065e-05,
"loss": 1.3316,
"step": 148
},
{
"epoch": 0.2286810551558753,
"grad_norm": 1.796875,
"learning_rate": 1.9704306896132063e-05,
"loss": 1.3629,
"step": 149
},
{
"epoch": 0.2302158273381295,
"grad_norm": 1.671875,
"learning_rate": 1.969820971404691e-05,
"loss": 1.405,
"step": 150
},
{
"epoch": 0.23175059952038368,
"grad_norm": 1.6796875,
"learning_rate": 1.9692051272798304e-05,
"loss": 1.3486,
"step": 151
},
{
"epoch": 0.23328537170263788,
"grad_norm": 1.6796875,
"learning_rate": 1.9685831611286312e-05,
"loss": 1.3927,
"step": 152
},
{
"epoch": 0.23482014388489209,
"grad_norm": 1.734375,
"learning_rate": 1.967955076879769e-05,
"loss": 1.3901,
"step": 153
},
{
"epoch": 0.2363549160671463,
"grad_norm": 1.8125,
"learning_rate": 1.9673208785005658e-05,
"loss": 1.5,
"step": 154
},
{
"epoch": 0.2378896882494005,
"grad_norm": 1.734375,
"learning_rate": 1.966680569996963e-05,
"loss": 1.2666,
"step": 155
},
{
"epoch": 0.2394244604316547,
"grad_norm": 1.7109375,
"learning_rate": 1.9660341554134972e-05,
"loss": 1.3021,
"step": 156
},
{
"epoch": 0.24095923261390886,
"grad_norm": 1.7265625,
"learning_rate": 1.965381638833274e-05,
"loss": 1.4593,
"step": 157
},
{
"epoch": 0.24249400479616307,
"grad_norm": 1.859375,
"learning_rate": 1.9647230243779432e-05,
"loss": 1.5067,
"step": 158
},
{
"epoch": 0.24402877697841727,
"grad_norm": 1.875,
"learning_rate": 1.964058316207671e-05,
"loss": 1.5402,
"step": 159
},
{
"epoch": 0.24556354916067147,
"grad_norm": 1.6875,
"learning_rate": 1.963387518521116e-05,
"loss": 1.3881,
"step": 160
},
{
"epoch": 0.24709832134292567,
"grad_norm": 1.8671875,
"learning_rate": 1.962710635555401e-05,
"loss": 1.6041,
"step": 161
},
{
"epoch": 0.24863309352517984,
"grad_norm": 1.8515625,
"learning_rate": 1.962027671586086e-05,
"loss": 1.5419,
"step": 162
},
{
"epoch": 0.25016786570743405,
"grad_norm": 1.6953125,
"learning_rate": 1.9613386309271437e-05,
"loss": 1.3791,
"step": 163
},
{
"epoch": 0.25016786570743405,
"eval_loss": 1.441381812095642,
"eval_runtime": 58.8537,
"eval_samples_per_second": 21.307,
"eval_steps_per_second": 21.307,
"step": 163
},
{
"epoch": 0.2517026378896883,
"grad_norm": 1.8203125,
"learning_rate": 1.9606435179309284e-05,
"loss": 1.4643,
"step": 164
},
{
"epoch": 0.25323741007194245,
"grad_norm": 1.6328125,
"learning_rate": 1.959942336988152e-05,
"loss": 1.3267,
"step": 165
},
{
"epoch": 0.2547721822541966,
"grad_norm": 1.78125,
"learning_rate": 1.9592350925278546e-05,
"loss": 1.507,
"step": 166
},
{
"epoch": 0.25630695443645085,
"grad_norm": 1.6484375,
"learning_rate": 1.958521789017376e-05,
"loss": 1.3429,
"step": 167
},
{
"epoch": 0.257841726618705,
"grad_norm": 1.8125,
"learning_rate": 1.9578024309623296e-05,
"loss": 1.464,
"step": 168
},
{
"epoch": 0.25937649880095925,
"grad_norm": 1.8046875,
"learning_rate": 1.9570770229065716e-05,
"loss": 1.3992,
"step": 169
},
{
"epoch": 0.26091127098321343,
"grad_norm": 1.8125,
"learning_rate": 1.956345569432173e-05,
"loss": 1.6172,
"step": 170
},
{
"epoch": 0.2624460431654676,
"grad_norm": 1.75,
"learning_rate": 1.955608075159392e-05,
"loss": 1.3302,
"step": 171
},
{
"epoch": 0.26398081534772183,
"grad_norm": 1.6953125,
"learning_rate": 1.9548645447466433e-05,
"loss": 1.4756,
"step": 172
},
{
"epoch": 0.265515587529976,
"grad_norm": 1.75,
"learning_rate": 1.9541149828904686e-05,
"loss": 1.2405,
"step": 173
},
{
"epoch": 0.26705035971223023,
"grad_norm": 1.71875,
"learning_rate": 1.9533593943255087e-05,
"loss": 1.3672,
"step": 174
},
{
"epoch": 0.2685851318944844,
"grad_norm": 1.8515625,
"learning_rate": 1.952597783824471e-05,
"loss": 1.3942,
"step": 175
},
{
"epoch": 0.2701199040767386,
"grad_norm": 1.96875,
"learning_rate": 1.9518301561981016e-05,
"loss": 1.5685,
"step": 176
},
{
"epoch": 0.2716546762589928,
"grad_norm": 1.9140625,
"learning_rate": 1.9510565162951538e-05,
"loss": 1.5139,
"step": 177
},
{
"epoch": 0.273189448441247,
"grad_norm": 1.90625,
"learning_rate": 1.9502768690023574e-05,
"loss": 1.4495,
"step": 178
},
{
"epoch": 0.2747242206235012,
"grad_norm": 1.6328125,
"learning_rate": 1.949491219244389e-05,
"loss": 1.3186,
"step": 179
},
{
"epoch": 0.2762589928057554,
"grad_norm": 1.8359375,
"learning_rate": 1.9486995719838392e-05,
"loss": 1.372,
"step": 180
},
{
"epoch": 0.2777937649880096,
"grad_norm": 1.6796875,
"learning_rate": 1.9479019322211824e-05,
"loss": 1.3685,
"step": 181
},
{
"epoch": 0.2793285371702638,
"grad_norm": 1.7578125,
"learning_rate": 1.9470983049947446e-05,
"loss": 1.4346,
"step": 182
},
{
"epoch": 0.28086330935251796,
"grad_norm": 1.71875,
"learning_rate": 1.946288695380672e-05,
"loss": 1.3951,
"step": 183
},
{
"epoch": 0.2823980815347722,
"grad_norm": 1.8203125,
"learning_rate": 1.9454731084928995e-05,
"loss": 1.3547,
"step": 184
},
{
"epoch": 0.28393285371702637,
"grad_norm": 1.8671875,
"learning_rate": 1.9446515494831168e-05,
"loss": 1.5246,
"step": 185
},
{
"epoch": 0.2854676258992806,
"grad_norm": 1.8515625,
"learning_rate": 1.9438240235407375e-05,
"loss": 1.459,
"step": 186
},
{
"epoch": 0.28700239808153477,
"grad_norm": 1.703125,
"learning_rate": 1.9429905358928648e-05,
"loss": 1.3077,
"step": 187
},
{
"epoch": 0.28853717026378894,
"grad_norm": 1.7890625,
"learning_rate": 1.9421510918042593e-05,
"loss": 1.4291,
"step": 188
},
{
"epoch": 0.2900719424460432,
"grad_norm": 1.734375,
"learning_rate": 1.941305696577307e-05,
"loss": 1.4862,
"step": 189
},
{
"epoch": 0.29160671462829735,
"grad_norm": 1.6875,
"learning_rate": 1.940454355551983e-05,
"loss": 1.2541,
"step": 190
},
{
"epoch": 0.2931414868105516,
"grad_norm": 1.734375,
"learning_rate": 1.9395970741058202e-05,
"loss": 1.4298,
"step": 191
},
{
"epoch": 0.29467625899280575,
"grad_norm": 1.8984375,
"learning_rate": 1.9387338576538743e-05,
"loss": 1.5857,
"step": 192
},
{
"epoch": 0.2962110311750599,
"grad_norm": 1.8203125,
"learning_rate": 1.937864711648689e-05,
"loss": 1.5354,
"step": 193
},
{
"epoch": 0.29774580335731415,
"grad_norm": 1.7421875,
"learning_rate": 1.936989641580263e-05,
"loss": 1.5927,
"step": 194
},
{
"epoch": 0.2992805755395683,
"grad_norm": 1.71875,
"learning_rate": 1.936108652976015e-05,
"loss": 1.3927,
"step": 195
},
{
"epoch": 0.30081534772182256,
"grad_norm": 1.6328125,
"learning_rate": 1.935221751400747e-05,
"loss": 1.3322,
"step": 196
},
{
"epoch": 0.30235011990407673,
"grad_norm": 1.609375,
"learning_rate": 1.9343289424566122e-05,
"loss": 1.3863,
"step": 197
},
{
"epoch": 0.30388489208633096,
"grad_norm": 1.6953125,
"learning_rate": 1.9334302317830764e-05,
"loss": 1.3775,
"step": 198
},
{
"epoch": 0.30541966426858513,
"grad_norm": 1.6640625,
"learning_rate": 1.9325256250568852e-05,
"loss": 1.3753,
"step": 199
},
{
"epoch": 0.3069544364508393,
"grad_norm": 1.6953125,
"learning_rate": 1.931615127992026e-05,
"loss": 1.4834,
"step": 200
},
{
"epoch": 0.30848920863309354,
"grad_norm": 1.828125,
"learning_rate": 1.9306987463396934e-05,
"loss": 1.4202,
"step": 201
},
{
"epoch": 0.3100239808153477,
"grad_norm": 1.828125,
"learning_rate": 1.9297764858882516e-05,
"loss": 1.3512,
"step": 202
},
{
"epoch": 0.31155875299760194,
"grad_norm": 1.8046875,
"learning_rate": 1.928848352463199e-05,
"loss": 1.5923,
"step": 203
},
{
"epoch": 0.3130935251798561,
"grad_norm": 1.7578125,
"learning_rate": 1.927914351927131e-05,
"loss": 1.447,
"step": 204
},
{
"epoch": 0.3146282973621103,
"grad_norm": 1.796875,
"learning_rate": 1.9269744901797022e-05,
"loss": 1.4988,
"step": 205
},
{
"epoch": 0.3161630695443645,
"grad_norm": 1.7421875,
"learning_rate": 1.9260287731575902e-05,
"loss": 1.5078,
"step": 206
},
{
"epoch": 0.3176978417266187,
"grad_norm": 1.65625,
"learning_rate": 1.925077206834458e-05,
"loss": 1.4488,
"step": 207
},
{
"epoch": 0.3192326139088729,
"grad_norm": 1.5859375,
"learning_rate": 1.9241197972209157e-05,
"loss": 1.2465,
"step": 208
},
{
"epoch": 0.3207673860911271,
"grad_norm": 1.75,
"learning_rate": 1.9231565503644826e-05,
"loss": 1.4034,
"step": 209
},
{
"epoch": 0.32230215827338127,
"grad_norm": 1.7265625,
"learning_rate": 1.9221874723495494e-05,
"loss": 1.4779,
"step": 210
},
{
"epoch": 0.3238369304556355,
"grad_norm": 1.6875,
"learning_rate": 1.9212125692973396e-05,
"loss": 1.2789,
"step": 211
},
{
"epoch": 0.32537170263788967,
"grad_norm": 1.7890625,
"learning_rate": 1.9202318473658707e-05,
"loss": 1.4495,
"step": 212
},
{
"epoch": 0.3269064748201439,
"grad_norm": 1.859375,
"learning_rate": 1.919245312749915e-05,
"loss": 1.4263,
"step": 213
},
{
"epoch": 0.3284412470023981,
"grad_norm": 1.703125,
"learning_rate": 1.9182529716809618e-05,
"loss": 1.4215,
"step": 214
},
{
"epoch": 0.3299760191846523,
"grad_norm": 1.6484375,
"learning_rate": 1.9172548304271766e-05,
"loss": 1.2981,
"step": 215
},
{
"epoch": 0.3315107913669065,
"grad_norm": 1.640625,
"learning_rate": 1.916250895293362e-05,
"loss": 1.3433,
"step": 216
},
{
"epoch": 0.33304556354916065,
"grad_norm": 1.859375,
"learning_rate": 1.9152411726209176e-05,
"loss": 1.5089,
"step": 217
},
{
"epoch": 0.3345803357314149,
"grad_norm": 1.78125,
"learning_rate": 1.9142256687878012e-05,
"loss": 1.511,
"step": 218
},
{
"epoch": 0.33611510791366905,
"grad_norm": 1.6875,
"learning_rate": 1.9132043902084864e-05,
"loss": 1.4224,
"step": 219
},
{
"epoch": 0.3376498800959233,
"grad_norm": 1.7578125,
"learning_rate": 1.912177343333924e-05,
"loss": 1.3882,
"step": 220
},
{
"epoch": 0.33918465227817746,
"grad_norm": 1.7734375,
"learning_rate": 1.9111445346515003e-05,
"loss": 1.5708,
"step": 221
},
{
"epoch": 0.34071942446043163,
"grad_norm": 1.828125,
"learning_rate": 1.9101059706849957e-05,
"loss": 1.4676,
"step": 222
},
{
"epoch": 0.34225419664268586,
"grad_norm": 1.828125,
"learning_rate": 1.9090616579945455e-05,
"loss": 1.479,
"step": 223
},
{
"epoch": 0.34378896882494003,
"grad_norm": 1.8359375,
"learning_rate": 1.908011603176596e-05,
"loss": 1.5031,
"step": 224
},
{
"epoch": 0.34532374100719426,
"grad_norm": 1.6953125,
"learning_rate": 1.9069558128638636e-05,
"loss": 1.4443,
"step": 225
},
{
"epoch": 0.34685851318944844,
"grad_norm": 1.78125,
"learning_rate": 1.9058942937252943e-05,
"loss": 1.466,
"step": 226
},
{
"epoch": 0.34839328537170267,
"grad_norm": 1.7421875,
"learning_rate": 1.9048270524660197e-05,
"loss": 1.4357,
"step": 227
},
{
"epoch": 0.34992805755395684,
"grad_norm": 1.7265625,
"learning_rate": 1.903754095827316e-05,
"loss": 1.4573,
"step": 228
},
{
"epoch": 0.351462829736211,
"grad_norm": 1.7265625,
"learning_rate": 1.9026754305865593e-05,
"loss": 1.3422,
"step": 229
},
{
"epoch": 0.35299760191846524,
"grad_norm": 1.6171875,
"learning_rate": 1.901591063557187e-05,
"loss": 1.2501,
"step": 230
},
{
"epoch": 0.3545323741007194,
"grad_norm": 1.890625,
"learning_rate": 1.9005010015886495e-05,
"loss": 1.4802,
"step": 231
},
{
"epoch": 0.35606714628297365,
"grad_norm": 1.9453125,
"learning_rate": 1.899405251566371e-05,
"loss": 1.5655,
"step": 232
},
{
"epoch": 0.3576019184652278,
"grad_norm": 1.84375,
"learning_rate": 1.8983038204117046e-05,
"loss": 1.4752,
"step": 233
},
{
"epoch": 0.359136690647482,
"grad_norm": 1.7421875,
"learning_rate": 1.897196715081888e-05,
"loss": 1.4711,
"step": 234
},
{
"epoch": 0.3606714628297362,
"grad_norm": 1.7421875,
"learning_rate": 1.8960839425699992e-05,
"loss": 1.5812,
"step": 235
},
{
"epoch": 0.3622062350119904,
"grad_norm": 1.6484375,
"learning_rate": 1.894965509904915e-05,
"loss": 1.4105,
"step": 236
},
{
"epoch": 0.3637410071942446,
"grad_norm": 1.640625,
"learning_rate": 1.893841424151264e-05,
"loss": 1.3949,
"step": 237
},
{
"epoch": 0.3652757793764988,
"grad_norm": 2.046875,
"learning_rate": 1.8927116924093824e-05,
"loss": 1.3143,
"step": 238
},
{
"epoch": 0.366810551558753,
"grad_norm": 1.7734375,
"learning_rate": 1.8915763218152704e-05,
"loss": 1.466,
"step": 239
},
{
"epoch": 0.3683453237410072,
"grad_norm": 1.71875,
"learning_rate": 1.890435319540545e-05,
"loss": 1.3584,
"step": 240
},
{
"epoch": 0.3698800959232614,
"grad_norm": 1.84375,
"learning_rate": 1.8892886927923972e-05,
"loss": 1.4642,
"step": 241
},
{
"epoch": 0.3714148681055156,
"grad_norm": 1.765625,
"learning_rate": 1.8881364488135448e-05,
"loss": 1.3241,
"step": 242
},
{
"epoch": 0.3729496402877698,
"grad_norm": 1.6640625,
"learning_rate": 1.8869785948821865e-05,
"loss": 1.3893,
"step": 243
},
{
"epoch": 0.374484412470024,
"grad_norm": 1.7734375,
"learning_rate": 1.8858151383119576e-05,
"loss": 1.4159,
"step": 244
},
{
"epoch": 0.3760191846522782,
"grad_norm": 1.859375,
"learning_rate": 1.8846460864518818e-05,
"loss": 1.5998,
"step": 245
},
{
"epoch": 0.37755395683453236,
"grad_norm": 1.765625,
"learning_rate": 1.883471446686326e-05,
"loss": 1.3294,
"step": 246
},
{
"epoch": 0.3790887290167866,
"grad_norm": 1.859375,
"learning_rate": 1.8822912264349535e-05,
"loss": 1.43,
"step": 247
},
{
"epoch": 0.38062350119904076,
"grad_norm": 1.8125,
"learning_rate": 1.881105433152677e-05,
"loss": 1.516,
"step": 248
},
{
"epoch": 0.382158273381295,
"grad_norm": 1.96875,
"learning_rate": 1.8799140743296104e-05,
"loss": 1.4576,
"step": 249
},
{
"epoch": 0.38369304556354916,
"grad_norm": 1.8046875,
"learning_rate": 1.878717157491025e-05,
"loss": 1.4947,
"step": 250
},
{
"epoch": 0.38522781774580334,
"grad_norm": 1.6875,
"learning_rate": 1.877514690197297e-05,
"loss": 1.2411,
"step": 251
},
{
"epoch": 0.38676258992805757,
"grad_norm": 1.84375,
"learning_rate": 1.8763066800438638e-05,
"loss": 1.3762,
"step": 252
},
{
"epoch": 0.38829736211031174,
"grad_norm": 1.640625,
"learning_rate": 1.875093134661174e-05,
"loss": 1.2754,
"step": 253
},
{
"epoch": 0.38983213429256597,
"grad_norm": 1.6953125,
"learning_rate": 1.8738740617146396e-05,
"loss": 1.4927,
"step": 254
},
{
"epoch": 0.39136690647482014,
"grad_norm": 1.7265625,
"learning_rate": 1.8726494689045878e-05,
"loss": 1.4043,
"step": 255
},
{
"epoch": 0.3929016786570743,
"grad_norm": 1.8046875,
"learning_rate": 1.871419363966213e-05,
"loss": 1.5075,
"step": 256
},
{
"epoch": 0.39443645083932855,
"grad_norm": 1.875,
"learning_rate": 1.870183754669526e-05,
"loss": 1.4552,
"step": 257
},
{
"epoch": 0.3959712230215827,
"grad_norm": 1.8125,
"learning_rate": 1.8689426488193066e-05,
"loss": 1.494,
"step": 258
},
{
"epoch": 0.39750599520383695,
"grad_norm": 1.71875,
"learning_rate": 1.867696054255054e-05,
"loss": 1.5097,
"step": 259
},
{
"epoch": 0.3990407673860911,
"grad_norm": 1.6328125,
"learning_rate": 1.866443978850937e-05,
"loss": 1.2745,
"step": 260
},
{
"epoch": 0.40057553956834535,
"grad_norm": 1.71875,
"learning_rate": 1.865186430515745e-05,
"loss": 1.4509,
"step": 261
},
{
"epoch": 0.4021103117505995,
"grad_norm": 1.71875,
"learning_rate": 1.8639234171928355e-05,
"loss": 1.4734,
"step": 262
},
{
"epoch": 0.4036450839328537,
"grad_norm": 1.8046875,
"learning_rate": 1.862654946860088e-05,
"loss": 1.5411,
"step": 263
},
{
"epoch": 0.40517985611510793,
"grad_norm": 1.8359375,
"learning_rate": 1.86138102752985e-05,
"loss": 1.4693,
"step": 264
},
{
"epoch": 0.4067146282973621,
"grad_norm": 1.8203125,
"learning_rate": 1.8601016672488887e-05,
"loss": 1.4294,
"step": 265
},
{
"epoch": 0.40824940047961633,
"grad_norm": 1.78125,
"learning_rate": 1.858816874098339e-05,
"loss": 1.4605,
"step": 266
},
{
"epoch": 0.4097841726618705,
"grad_norm": 1.8828125,
"learning_rate": 1.8575266561936526e-05,
"loss": 1.4601,
"step": 267
},
{
"epoch": 0.4113189448441247,
"grad_norm": 1.75,
"learning_rate": 1.8562310216845463e-05,
"loss": 1.3688,
"step": 268
},
{
"epoch": 0.4128537170263789,
"grad_norm": 1.703125,
"learning_rate": 1.8549299787549536e-05,
"loss": 1.3078,
"step": 269
},
{
"epoch": 0.4143884892086331,
"grad_norm": 1.75,
"learning_rate": 1.8536235356229667e-05,
"loss": 1.5405,
"step": 270
},
{
"epoch": 0.4159232613908873,
"grad_norm": 1.8671875,
"learning_rate": 1.852311700540792e-05,
"loss": 1.4657,
"step": 271
},
{
"epoch": 0.4174580335731415,
"grad_norm": 1.734375,
"learning_rate": 1.850994481794692e-05,
"loss": 1.6243,
"step": 272
},
{
"epoch": 0.41899280575539566,
"grad_norm": 1.796875,
"learning_rate": 1.8496718877049367e-05,
"loss": 1.4455,
"step": 273
},
{
"epoch": 0.4205275779376499,
"grad_norm": 1.7265625,
"learning_rate": 1.8483439266257485e-05,
"loss": 1.4315,
"step": 274
},
{
"epoch": 0.42206235011990406,
"grad_norm": 1.734375,
"learning_rate": 1.8470106069452522e-05,
"loss": 1.568,
"step": 275
},
{
"epoch": 0.4235971223021583,
"grad_norm": 1.78125,
"learning_rate": 1.845671937085419e-05,
"loss": 1.5067,
"step": 276
},
{
"epoch": 0.42513189448441246,
"grad_norm": 1.828125,
"learning_rate": 1.8443279255020153e-05,
"loss": 1.3536,
"step": 277
},
{
"epoch": 0.4266666666666667,
"grad_norm": 1.8125,
"learning_rate": 1.842978580684549e-05,
"loss": 1.4866,
"step": 278
},
{
"epoch": 0.42820143884892087,
"grad_norm": 1.7890625,
"learning_rate": 1.841623911156215e-05,
"loss": 1.5661,
"step": 279
},
{
"epoch": 0.42973621103117504,
"grad_norm": 1.859375,
"learning_rate": 1.8402639254738422e-05,
"loss": 1.5654,
"step": 280
},
{
"epoch": 0.43127098321342927,
"grad_norm": 1.71875,
"learning_rate": 1.83889863222784e-05,
"loss": 1.5065,
"step": 281
},
{
"epoch": 0.43280575539568344,
"grad_norm": 1.6796875,
"learning_rate": 1.837528040042142e-05,
"loss": 1.3752,
"step": 282
},
{
"epoch": 0.4343405275779377,
"grad_norm": 1.640625,
"learning_rate": 1.8361521575741533e-05,
"loss": 1.26,
"step": 283
},
{
"epoch": 0.43587529976019185,
"grad_norm": 1.671875,
"learning_rate": 1.8347709935146958e-05,
"loss": 1.4494,
"step": 284
},
{
"epoch": 0.437410071942446,
"grad_norm": 1.7421875,
"learning_rate": 1.8333845565879517e-05,
"loss": 1.4768,
"step": 285
},
{
"epoch": 0.43894484412470025,
"grad_norm": 1.7109375,
"learning_rate": 1.8319928555514108e-05,
"loss": 1.5487,
"step": 286
},
{
"epoch": 0.4404796163069544,
"grad_norm": 1.6796875,
"learning_rate": 1.830595899195813e-05,
"loss": 1.4256,
"step": 287
},
{
"epoch": 0.44201438848920865,
"grad_norm": 1.7734375,
"learning_rate": 1.8291936963450933e-05,
"loss": 1.3908,
"step": 288
},
{
"epoch": 0.4435491606714628,
"grad_norm": 1.828125,
"learning_rate": 1.827786255856328e-05,
"loss": 1.4406,
"step": 289
},
{
"epoch": 0.445083932853717,
"grad_norm": 1.703125,
"learning_rate": 1.8263735866196758e-05,
"loss": 1.4099,
"step": 290
},
{
"epoch": 0.44661870503597123,
"grad_norm": 1.78125,
"learning_rate": 1.824955697558323e-05,
"loss": 1.4792,
"step": 291
},
{
"epoch": 0.4481534772182254,
"grad_norm": 1.71875,
"learning_rate": 1.8235325976284276e-05,
"loss": 1.4086,
"step": 292
},
{
"epoch": 0.44968824940047963,
"grad_norm": 1.90625,
"learning_rate": 1.8221042958190628e-05,
"loss": 1.5445,
"step": 293
},
{
"epoch": 0.4512230215827338,
"grad_norm": 1.8203125,
"learning_rate": 1.820670801152158e-05,
"loss": 1.4573,
"step": 294
},
{
"epoch": 0.45275779376498804,
"grad_norm": 1.671875,
"learning_rate": 1.8192321226824455e-05,
"loss": 1.257,
"step": 295
},
{
"epoch": 0.4542925659472422,
"grad_norm": 1.84375,
"learning_rate": 1.8177882694974008e-05,
"loss": 1.4822,
"step": 296
},
{
"epoch": 0.4558273381294964,
"grad_norm": 1.7734375,
"learning_rate": 1.816339250717184e-05,
"loss": 1.3933,
"step": 297
},
{
"epoch": 0.4573621103117506,
"grad_norm": 1.8359375,
"learning_rate": 1.8148850754945865e-05,
"loss": 1.4232,
"step": 298
},
{
"epoch": 0.4588968824940048,
"grad_norm": 1.6484375,
"learning_rate": 1.8134257530149684e-05,
"loss": 1.424,
"step": 299
},
{
"epoch": 0.460431654676259,
"grad_norm": 1.65625,
"learning_rate": 1.8119612924962043e-05,
"loss": 1.3909,
"step": 300
},
{
"epoch": 0.4619664268585132,
"grad_norm": 1.71875,
"learning_rate": 1.8104917031886223e-05,
"loss": 1.4939,
"step": 301
},
{
"epoch": 0.46350119904076736,
"grad_norm": 1.8359375,
"learning_rate": 1.8090169943749477e-05,
"loss": 1.4614,
"step": 302
},
{
"epoch": 0.4650359712230216,
"grad_norm": 1.765625,
"learning_rate": 1.8075371753702423e-05,
"loss": 1.3587,
"step": 303
},
{
"epoch": 0.46657074340527577,
"grad_norm": 1.6953125,
"learning_rate": 1.806052255521847e-05,
"loss": 1.4853,
"step": 304
},
{
"epoch": 0.46810551558753,
"grad_norm": 1.75,
"learning_rate": 1.8045622442093237e-05,
"loss": 1.4516,
"step": 305
},
{
"epoch": 0.46964028776978417,
"grad_norm": 1.6875,
"learning_rate": 1.8030671508443928e-05,
"loss": 1.3532,
"step": 306
},
{
"epoch": 0.47117505995203834,
"grad_norm": 1.890625,
"learning_rate": 1.8015669848708768e-05,
"loss": 1.5189,
"step": 307
},
{
"epoch": 0.4727098321342926,
"grad_norm": 1.6953125,
"learning_rate": 1.8000617557646392e-05,
"loss": 1.3214,
"step": 308
},
{
"epoch": 0.47424460431654675,
"grad_norm": 1.828125,
"learning_rate": 1.798551473033525e-05,
"loss": 1.6297,
"step": 309
},
{
"epoch": 0.475779376498801,
"grad_norm": 1.8125,
"learning_rate": 1.797036146217301e-05,
"loss": 1.3899,
"step": 310
},
{
"epoch": 0.47731414868105515,
"grad_norm": 1.703125,
"learning_rate": 1.795515784887595e-05,
"loss": 1.4549,
"step": 311
},
{
"epoch": 0.4788489208633094,
"grad_norm": 1.7734375,
"learning_rate": 1.7939903986478354e-05,
"loss": 1.3179,
"step": 312
},
{
"epoch": 0.48038369304556355,
"grad_norm": 1.7890625,
"learning_rate": 1.792459997133191e-05,
"loss": 1.5416,
"step": 313
},
{
"epoch": 0.4819184652278177,
"grad_norm": 1.671875,
"learning_rate": 1.7909245900105085e-05,
"loss": 1.4575,
"step": 314
},
{
"epoch": 0.48345323741007196,
"grad_norm": 1.6484375,
"learning_rate": 1.7893841869782548e-05,
"loss": 1.3526,
"step": 315
},
{
"epoch": 0.48498800959232613,
"grad_norm": 1.78125,
"learning_rate": 1.7878387977664522e-05,
"loss": 1.4582,
"step": 316
},
{
"epoch": 0.48652278177458036,
"grad_norm": 1.7265625,
"learning_rate": 1.786288432136619e-05,
"loss": 1.3788,
"step": 317
},
{
"epoch": 0.48805755395683453,
"grad_norm": 1.828125,
"learning_rate": 1.784733099881707e-05,
"loss": 1.4998,
"step": 318
},
{
"epoch": 0.4895923261390887,
"grad_norm": 1.84375,
"learning_rate": 1.7831728108260407e-05,
"loss": 1.4229,
"step": 319
},
{
"epoch": 0.49112709832134294,
"grad_norm": 1.8046875,
"learning_rate": 1.7816075748252526e-05,
"loss": 1.4159,
"step": 320
},
{
"epoch": 0.4926618705035971,
"grad_norm": 1.7734375,
"learning_rate": 1.780037401766225e-05,
"loss": 1.4787,
"step": 321
},
{
"epoch": 0.49419664268585134,
"grad_norm": 1.734375,
"learning_rate": 1.7784623015670237e-05,
"loss": 1.4399,
"step": 322
},
{
"epoch": 0.4957314148681055,
"grad_norm": 2.0,
"learning_rate": 1.776882284176838e-05,
"loss": 1.4658,
"step": 323
},
{
"epoch": 0.4972661870503597,
"grad_norm": 1.8671875,
"learning_rate": 1.775297359575916e-05,
"loss": 1.4715,
"step": 324
},
{
"epoch": 0.4988009592326139,
"grad_norm": 1.7265625,
"learning_rate": 1.7737075377755032e-05,
"loss": 1.4503,
"step": 325
},
{
"epoch": 0.5003357314148681,
"grad_norm": 1.625,
"learning_rate": 1.7721128288177782e-05,
"loss": 1.4218,
"step": 326
},
{
"epoch": 0.5003357314148681,
"eval_loss": 1.419244647026062,
"eval_runtime": 58.9368,
"eval_samples_per_second": 21.277,
"eval_steps_per_second": 21.277,
"step": 326
},
{
"epoch": 0.5018705035971223,
"grad_norm": 1.6953125,
"learning_rate": 1.7705132427757895e-05,
"loss": 1.4326,
"step": 327
},
{
"epoch": 0.5034052757793765,
"grad_norm": 1.78125,
"learning_rate": 1.7689087897533916e-05,
"loss": 1.4476,
"step": 328
},
{
"epoch": 0.5049400479616307,
"grad_norm": 1.71875,
"learning_rate": 1.767299479885182e-05,
"loss": 1.4214,
"step": 329
},
{
"epoch": 0.5064748201438849,
"grad_norm": 1.6875,
"learning_rate": 1.765685323336437e-05,
"loss": 1.4763,
"step": 330
},
{
"epoch": 0.5080095923261391,
"grad_norm": 1.8203125,
"learning_rate": 1.7640663303030452e-05,
"loss": 1.4451,
"step": 331
},
{
"epoch": 0.5095443645083932,
"grad_norm": 1.7734375,
"learning_rate": 1.762442511011448e-05,
"loss": 1.3918,
"step": 332
},
{
"epoch": 0.5110791366906475,
"grad_norm": 1.7421875,
"learning_rate": 1.76081387571857e-05,
"loss": 1.4037,
"step": 333
},
{
"epoch": 0.5126139088729017,
"grad_norm": 1.7578125,
"learning_rate": 1.759180434711757e-05,
"loss": 1.3785,
"step": 334
},
{
"epoch": 0.5141486810551559,
"grad_norm": 1.8828125,
"learning_rate": 1.7575421983087095e-05,
"loss": 1.3338,
"step": 335
},
{
"epoch": 0.51568345323741,
"grad_norm": 1.6640625,
"learning_rate": 1.7558991768574197e-05,
"loss": 1.2631,
"step": 336
},
{
"epoch": 0.5172182254196642,
"grad_norm": 1.6640625,
"learning_rate": 1.754251380736104e-05,
"loss": 1.4594,
"step": 337
},
{
"epoch": 0.5187529976019185,
"grad_norm": 1.859375,
"learning_rate": 1.752598820353138e-05,
"loss": 1.4979,
"step": 338
},
{
"epoch": 0.5202877697841727,
"grad_norm": 1.8203125,
"learning_rate": 1.7509415061469916e-05,
"loss": 1.5528,
"step": 339
},
{
"epoch": 0.5218225419664269,
"grad_norm": 1.8203125,
"learning_rate": 1.749279448586162e-05,
"loss": 1.4236,
"step": 340
},
{
"epoch": 0.523357314148681,
"grad_norm": 1.7734375,
"learning_rate": 1.7476126581691072e-05,
"loss": 1.3933,
"step": 341
},
{
"epoch": 0.5248920863309352,
"grad_norm": 1.96875,
"learning_rate": 1.7459411454241822e-05,
"loss": 1.336,
"step": 342
},
{
"epoch": 0.5264268585131895,
"grad_norm": 1.734375,
"learning_rate": 1.7442649209095703e-05,
"loss": 1.5121,
"step": 343
},
{
"epoch": 0.5279616306954437,
"grad_norm": 1.703125,
"learning_rate": 1.7425839952132157e-05,
"loss": 1.3951,
"step": 344
},
{
"epoch": 0.5294964028776978,
"grad_norm": 1.6640625,
"learning_rate": 1.7408983789527588e-05,
"loss": 1.4327,
"step": 345
},
{
"epoch": 0.531031175059952,
"grad_norm": 1.6953125,
"learning_rate": 1.739208082775469e-05,
"loss": 1.4532,
"step": 346
},
{
"epoch": 0.5325659472422062,
"grad_norm": 1.8046875,
"learning_rate": 1.737513117358174e-05,
"loss": 1.459,
"step": 347
},
{
"epoch": 0.5341007194244605,
"grad_norm": 1.6796875,
"learning_rate": 1.7358134934071978e-05,
"loss": 1.3645,
"step": 348
},
{
"epoch": 0.5356354916067146,
"grad_norm": 1.8359375,
"learning_rate": 1.7341092216582886e-05,
"loss": 1.5174,
"step": 349
},
{
"epoch": 0.5371702637889688,
"grad_norm": 1.7578125,
"learning_rate": 1.7324003128765536e-05,
"loss": 1.4671,
"step": 350
},
{
"epoch": 0.538705035971223,
"grad_norm": 1.6875,
"learning_rate": 1.730686777856388e-05,
"loss": 1.4129,
"step": 351
},
{
"epoch": 0.5402398081534772,
"grad_norm": 1.7890625,
"learning_rate": 1.7289686274214116e-05,
"loss": 1.3426,
"step": 352
},
{
"epoch": 0.5417745803357314,
"grad_norm": 1.6953125,
"learning_rate": 1.7272458724243957e-05,
"loss": 1.4186,
"step": 353
},
{
"epoch": 0.5433093525179856,
"grad_norm": 1.7734375,
"learning_rate": 1.7255185237471978e-05,
"loss": 1.4067,
"step": 354
},
{
"epoch": 0.5448441247002398,
"grad_norm": 1.6796875,
"learning_rate": 1.7237865923006904e-05,
"loss": 1.4288,
"step": 355
},
{
"epoch": 0.546378896882494,
"grad_norm": 1.8984375,
"learning_rate": 1.7220500890246944e-05,
"loss": 1.4377,
"step": 356
},
{
"epoch": 0.5479136690647483,
"grad_norm": 1.6875,
"learning_rate": 1.720309024887907e-05,
"loss": 1.3125,
"step": 357
},
{
"epoch": 0.5494484412470024,
"grad_norm": 1.71875,
"learning_rate": 1.7185634108878367e-05,
"loss": 1.4347,
"step": 358
},
{
"epoch": 0.5509832134292566,
"grad_norm": 1.765625,
"learning_rate": 1.7168132580507298e-05,
"loss": 1.4743,
"step": 359
},
{
"epoch": 0.5525179856115108,
"grad_norm": 1.7890625,
"learning_rate": 1.715058577431503e-05,
"loss": 1.473,
"step": 360
},
{
"epoch": 0.554052757793765,
"grad_norm": 1.78125,
"learning_rate": 1.713299380113673e-05,
"loss": 1.3648,
"step": 361
},
{
"epoch": 0.5555875299760192,
"grad_norm": 1.9375,
"learning_rate": 1.7115356772092858e-05,
"loss": 1.3193,
"step": 362
},
{
"epoch": 0.5571223021582734,
"grad_norm": 1.8125,
"learning_rate": 1.709767479858847e-05,
"loss": 1.4627,
"step": 363
},
{
"epoch": 0.5586570743405276,
"grad_norm": 1.734375,
"learning_rate": 1.707994799231253e-05,
"loss": 1.3789,
"step": 364
},
{
"epoch": 0.5601918465227818,
"grad_norm": 1.75,
"learning_rate": 1.7062176465237175e-05,
"loss": 1.4083,
"step": 365
},
{
"epoch": 0.5617266187050359,
"grad_norm": 1.6484375,
"learning_rate": 1.704436032961703e-05,
"loss": 1.4053,
"step": 366
},
{
"epoch": 0.5632613908872902,
"grad_norm": 1.7265625,
"learning_rate": 1.7026499697988496e-05,
"loss": 1.444,
"step": 367
},
{
"epoch": 0.5647961630695444,
"grad_norm": 1.734375,
"learning_rate": 1.7008594683169018e-05,
"loss": 1.3111,
"step": 368
},
{
"epoch": 0.5663309352517986,
"grad_norm": 1.765625,
"learning_rate": 1.6990645398256412e-05,
"loss": 1.432,
"step": 369
},
{
"epoch": 0.5678657074340527,
"grad_norm": 1.90625,
"learning_rate": 1.6972651956628108e-05,
"loss": 1.4439,
"step": 370
},
{
"epoch": 0.5694004796163069,
"grad_norm": 1.6796875,
"learning_rate": 1.695461447194047e-05,
"loss": 1.3627,
"step": 371
},
{
"epoch": 0.5709352517985612,
"grad_norm": 1.921875,
"learning_rate": 1.693653305812805e-05,
"loss": 1.5243,
"step": 372
},
{
"epoch": 0.5724700239808154,
"grad_norm": 1.96875,
"learning_rate": 1.6918407829402888e-05,
"loss": 1.5582,
"step": 373
},
{
"epoch": 0.5740047961630695,
"grad_norm": 1.78125,
"learning_rate": 1.6900238900253777e-05,
"loss": 1.3232,
"step": 374
},
{
"epoch": 0.5755395683453237,
"grad_norm": 1.7421875,
"learning_rate": 1.6882026385445548e-05,
"loss": 1.4903,
"step": 375
},
{
"epoch": 0.5770743405275779,
"grad_norm": 1.7734375,
"learning_rate": 1.6863770400018344e-05,
"loss": 1.4271,
"step": 376
},
{
"epoch": 0.5786091127098322,
"grad_norm": 1.75,
"learning_rate": 1.684547105928689e-05,
"loss": 1.406,
"step": 377
},
{
"epoch": 0.5801438848920863,
"grad_norm": 1.7578125,
"learning_rate": 1.6827128478839767e-05,
"loss": 1.4787,
"step": 378
},
{
"epoch": 0.5816786570743405,
"grad_norm": 1.703125,
"learning_rate": 1.6808742774538683e-05,
"loss": 1.4338,
"step": 379
},
{
"epoch": 0.5832134292565947,
"grad_norm": 1.6796875,
"learning_rate": 1.679031406251774e-05,
"loss": 1.3572,
"step": 380
},
{
"epoch": 0.5847482014388489,
"grad_norm": 1.6953125,
"learning_rate": 1.6771842459182703e-05,
"loss": 1.3706,
"step": 381
},
{
"epoch": 0.5862829736211032,
"grad_norm": 1.7578125,
"learning_rate": 1.6753328081210244e-05,
"loss": 1.3664,
"step": 382
},
{
"epoch": 0.5878177458033573,
"grad_norm": 1.6953125,
"learning_rate": 1.673477104554725e-05,
"loss": 1.3524,
"step": 383
},
{
"epoch": 0.5893525179856115,
"grad_norm": 1.8203125,
"learning_rate": 1.6716171469410042e-05,
"loss": 1.4479,
"step": 384
},
{
"epoch": 0.5908872901678657,
"grad_norm": 1.6328125,
"learning_rate": 1.6697529470283646e-05,
"loss": 1.3061,
"step": 385
},
{
"epoch": 0.5924220623501198,
"grad_norm": 1.6015625,
"learning_rate": 1.6678845165921066e-05,
"loss": 1.3439,
"step": 386
},
{
"epoch": 0.5939568345323741,
"grad_norm": 1.7421875,
"learning_rate": 1.666011867434252e-05,
"loss": 1.474,
"step": 387
},
{
"epoch": 0.5954916067146283,
"grad_norm": 1.8125,
"learning_rate": 1.6641350113834705e-05,
"loss": 1.5257,
"step": 388
},
{
"epoch": 0.5970263788968825,
"grad_norm": 1.578125,
"learning_rate": 1.662253960295005e-05,
"loss": 1.2708,
"step": 389
},
{
"epoch": 0.5985611510791367,
"grad_norm": 1.7265625,
"learning_rate": 1.660368726050597e-05,
"loss": 1.468,
"step": 390
},
{
"epoch": 0.6000959232613909,
"grad_norm": 1.6875,
"learning_rate": 1.65847932055841e-05,
"loss": 1.3418,
"step": 391
},
{
"epoch": 0.6016306954436451,
"grad_norm": 1.6875,
"learning_rate": 1.6565857557529567e-05,
"loss": 1.403,
"step": 392
},
{
"epoch": 0.6031654676258993,
"grad_norm": 1.796875,
"learning_rate": 1.6546880435950207e-05,
"loss": 1.456,
"step": 393
},
{
"epoch": 0.6047002398081535,
"grad_norm": 1.7109375,
"learning_rate": 1.652786196071584e-05,
"loss": 1.4762,
"step": 394
},
{
"epoch": 0.6062350119904076,
"grad_norm": 1.84375,
"learning_rate": 1.6508802251957488e-05,
"loss": 1.4517,
"step": 395
},
{
"epoch": 0.6077697841726619,
"grad_norm": 1.7578125,
"learning_rate": 1.6489701430066632e-05,
"loss": 1.3809,
"step": 396
},
{
"epoch": 0.6093045563549161,
"grad_norm": 1.828125,
"learning_rate": 1.6470559615694445e-05,
"loss": 1.4589,
"step": 397
},
{
"epoch": 0.6108393285371703,
"grad_norm": 1.671875,
"learning_rate": 1.6451376929751028e-05,
"loss": 1.396,
"step": 398
},
{
"epoch": 0.6123741007194244,
"grad_norm": 1.703125,
"learning_rate": 1.6432153493404654e-05,
"loss": 1.3241,
"step": 399
},
{
"epoch": 0.6139088729016786,
"grad_norm": 1.6484375,
"learning_rate": 1.6412889428080992e-05,
"loss": 1.4123,
"step": 400
},
{
"epoch": 0.6154436450839329,
"grad_norm": 1.78125,
"learning_rate": 1.639358485546235e-05,
"loss": 1.4098,
"step": 401
},
{
"epoch": 0.6169784172661871,
"grad_norm": 1.703125,
"learning_rate": 1.63742398974869e-05,
"loss": 1.3087,
"step": 402
},
{
"epoch": 0.6185131894484412,
"grad_norm": 1.7265625,
"learning_rate": 1.635485467634791e-05,
"loss": 1.2647,
"step": 403
},
{
"epoch": 0.6200479616306954,
"grad_norm": 1.8125,
"learning_rate": 1.633542931449297e-05,
"loss": 1.4002,
"step": 404
},
{
"epoch": 0.6215827338129496,
"grad_norm": 1.828125,
"learning_rate": 1.6315963934623228e-05,
"loss": 1.3966,
"step": 405
},
{
"epoch": 0.6231175059952039,
"grad_norm": 1.6953125,
"learning_rate": 1.62964586596926e-05,
"loss": 1.3257,
"step": 406
},
{
"epoch": 0.624652278177458,
"grad_norm": 1.75,
"learning_rate": 1.6276913612907005e-05,
"loss": 1.2805,
"step": 407
},
{
"epoch": 0.6261870503597122,
"grad_norm": 1.7890625,
"learning_rate": 1.625732891772358e-05,
"loss": 1.4642,
"step": 408
},
{
"epoch": 0.6277218225419664,
"grad_norm": 1.7421875,
"learning_rate": 1.6237704697849903e-05,
"loss": 1.4683,
"step": 409
},
{
"epoch": 0.6292565947242206,
"grad_norm": 1.703125,
"learning_rate": 1.6218041077243213e-05,
"loss": 1.448,
"step": 410
},
{
"epoch": 0.6307913669064749,
"grad_norm": 1.6640625,
"learning_rate": 1.6198338180109624e-05,
"loss": 1.4157,
"step": 411
},
{
"epoch": 0.632326139088729,
"grad_norm": 1.6875,
"learning_rate": 1.6178596130903345e-05,
"loss": 1.3459,
"step": 412
},
{
"epoch": 0.6338609112709832,
"grad_norm": 1.6171875,
"learning_rate": 1.6158815054325887e-05,
"loss": 1.2427,
"step": 413
},
{
"epoch": 0.6353956834532374,
"grad_norm": 1.828125,
"learning_rate": 1.6138995075325277e-05,
"loss": 1.3427,
"step": 414
},
{
"epoch": 0.6369304556354916,
"grad_norm": 1.765625,
"learning_rate": 1.611913631909528e-05,
"loss": 1.483,
"step": 415
},
{
"epoch": 0.6384652278177458,
"grad_norm": 1.7421875,
"learning_rate": 1.609923891107459e-05,
"loss": 1.429,
"step": 416
},
{
"epoch": 0.64,
"grad_norm": 1.8359375,
"learning_rate": 1.6079302976946055e-05,
"loss": 1.5569,
"step": 417
},
{
"epoch": 0.6415347721822542,
"grad_norm": 1.796875,
"learning_rate": 1.6059328642635864e-05,
"loss": 1.4083,
"step": 418
},
{
"epoch": 0.6430695443645084,
"grad_norm": 1.7265625,
"learning_rate": 1.6039316034312767e-05,
"loss": 1.4171,
"step": 419
},
{
"epoch": 0.6446043165467625,
"grad_norm": 1.75,
"learning_rate": 1.6019265278387287e-05,
"loss": 1.4593,
"step": 420
},
{
"epoch": 0.6461390887290168,
"grad_norm": 1.6875,
"learning_rate": 1.5999176501510883e-05,
"loss": 1.406,
"step": 421
},
{
"epoch": 0.647673860911271,
"grad_norm": 1.7109375,
"learning_rate": 1.597904983057519e-05,
"loss": 1.4094,
"step": 422
},
{
"epoch": 0.6492086330935252,
"grad_norm": 1.7578125,
"learning_rate": 1.5958885392711203e-05,
"loss": 1.6037,
"step": 423
},
{
"epoch": 0.6507434052757793,
"grad_norm": 1.7734375,
"learning_rate": 1.5938683315288472e-05,
"loss": 1.3668,
"step": 424
},
{
"epoch": 0.6522781774580336,
"grad_norm": 1.6875,
"learning_rate": 1.5918443725914298e-05,
"loss": 1.4602,
"step": 425
},
{
"epoch": 0.6538129496402878,
"grad_norm": 1.7421875,
"learning_rate": 1.589816675243292e-05,
"loss": 1.4616,
"step": 426
},
{
"epoch": 0.655347721822542,
"grad_norm": 1.6640625,
"learning_rate": 1.5877852522924733e-05,
"loss": 1.3704,
"step": 427
},
{
"epoch": 0.6568824940047961,
"grad_norm": 1.546875,
"learning_rate": 1.5857501165705443e-05,
"loss": 1.2654,
"step": 428
},
{
"epoch": 0.6584172661870503,
"grad_norm": 1.75,
"learning_rate": 1.583711280932529e-05,
"loss": 1.45,
"step": 429
},
{
"epoch": 0.6599520383693046,
"grad_norm": 1.78125,
"learning_rate": 1.581668758256821e-05,
"loss": 1.4815,
"step": 430
},
{
"epoch": 0.6614868105515588,
"grad_norm": 1.7578125,
"learning_rate": 1.5796225614451034e-05,
"loss": 1.4874,
"step": 431
},
{
"epoch": 0.663021582733813,
"grad_norm": 1.9296875,
"learning_rate": 1.5775727034222675e-05,
"loss": 1.4731,
"step": 432
},
{
"epoch": 0.6645563549160671,
"grad_norm": 1.6328125,
"learning_rate": 1.5755191971363313e-05,
"loss": 1.3568,
"step": 433
},
{
"epoch": 0.6660911270983213,
"grad_norm": 1.7265625,
"learning_rate": 1.5734620555583555e-05,
"loss": 1.2038,
"step": 434
},
{
"epoch": 0.6676258992805756,
"grad_norm": 1.75,
"learning_rate": 1.5714012916823653e-05,
"loss": 1.3923,
"step": 435
},
{
"epoch": 0.6691606714628298,
"grad_norm": 1.6328125,
"learning_rate": 1.5693369185252648e-05,
"loss": 1.4711,
"step": 436
},
{
"epoch": 0.6706954436450839,
"grad_norm": 1.75,
"learning_rate": 1.567268949126757e-05,
"loss": 1.4183,
"step": 437
},
{
"epoch": 0.6722302158273381,
"grad_norm": 1.7265625,
"learning_rate": 1.56519739654926e-05,
"loss": 1.4671,
"step": 438
},
{
"epoch": 0.6737649880095923,
"grad_norm": 1.6953125,
"learning_rate": 1.5631222738778268e-05,
"loss": 1.416,
"step": 439
},
{
"epoch": 0.6752997601918466,
"grad_norm": 1.7265625,
"learning_rate": 1.561043594220059e-05,
"loss": 1.4353,
"step": 440
},
{
"epoch": 0.6768345323741007,
"grad_norm": 1.7265625,
"learning_rate": 1.5589613707060278e-05,
"loss": 1.516,
"step": 441
},
{
"epoch": 0.6783693045563549,
"grad_norm": 1.7265625,
"learning_rate": 1.556875616488188e-05,
"loss": 1.2044,
"step": 442
},
{
"epoch": 0.6799040767386091,
"grad_norm": 1.8125,
"learning_rate": 1.5547863447412973e-05,
"loss": 1.3343,
"step": 443
},
{
"epoch": 0.6814388489208633,
"grad_norm": 1.828125,
"learning_rate": 1.5526935686623316e-05,
"loss": 1.389,
"step": 444
},
{
"epoch": 0.6829736211031175,
"grad_norm": 1.8984375,
"learning_rate": 1.5505973014704017e-05,
"loss": 1.4935,
"step": 445
},
{
"epoch": 0.6845083932853717,
"grad_norm": 1.828125,
"learning_rate": 1.5484975564066704e-05,
"loss": 1.4248,
"step": 446
},
{
"epoch": 0.6860431654676259,
"grad_norm": 1.6640625,
"learning_rate": 1.5463943467342694e-05,
"loss": 1.4196,
"step": 447
},
{
"epoch": 0.6875779376498801,
"grad_norm": 1.8515625,
"learning_rate": 1.544287685738213e-05,
"loss": 1.3694,
"step": 448
},
{
"epoch": 0.6891127098321342,
"grad_norm": 1.9140625,
"learning_rate": 1.542177586725318e-05,
"loss": 1.5132,
"step": 449
},
{
"epoch": 0.6906474820143885,
"grad_norm": 1.8203125,
"learning_rate": 1.540064063024116e-05,
"loss": 1.5271,
"step": 450
},
{
"epoch": 0.6921822541966427,
"grad_norm": 1.75,
"learning_rate": 1.5379471279847714e-05,
"loss": 1.4123,
"step": 451
},
{
"epoch": 0.6937170263788969,
"grad_norm": 1.65625,
"learning_rate": 1.5358267949789968e-05,
"loss": 1.4703,
"step": 452
},
{
"epoch": 0.695251798561151,
"grad_norm": 1.78125,
"learning_rate": 1.5337030773999674e-05,
"loss": 1.4658,
"step": 453
},
{
"epoch": 0.6967865707434053,
"grad_norm": 1.84375,
"learning_rate": 1.531575988662238e-05,
"loss": 1.3538,
"step": 454
},
{
"epoch": 0.6983213429256595,
"grad_norm": 1.734375,
"learning_rate": 1.5294455422016576e-05,
"loss": 1.4328,
"step": 455
},
{
"epoch": 0.6998561151079137,
"grad_norm": 1.6953125,
"learning_rate": 1.5273117514752826e-05,
"loss": 1.4393,
"step": 456
},
{
"epoch": 0.7013908872901679,
"grad_norm": 1.7421875,
"learning_rate": 1.5251746299612959e-05,
"loss": 1.5345,
"step": 457
},
{
"epoch": 0.702925659472422,
"grad_norm": 1.7890625,
"learning_rate": 1.5230341911589183e-05,
"loss": 1.3759,
"step": 458
},
{
"epoch": 0.7044604316546763,
"grad_norm": 1.609375,
"learning_rate": 1.5208904485883244e-05,
"loss": 1.3337,
"step": 459
},
{
"epoch": 0.7059952038369305,
"grad_norm": 1.71875,
"learning_rate": 1.5187434157905575e-05,
"loss": 1.5585,
"step": 460
},
{
"epoch": 0.7075299760191847,
"grad_norm": 1.8515625,
"learning_rate": 1.5165931063274442e-05,
"loss": 1.4298,
"step": 461
},
{
"epoch": 0.7090647482014388,
"grad_norm": 1.828125,
"learning_rate": 1.5144395337815066e-05,
"loss": 1.3482,
"step": 462
},
{
"epoch": 0.710599520383693,
"grad_norm": 2.0,
"learning_rate": 1.5122827117558802e-05,
"loss": 1.6308,
"step": 463
},
{
"epoch": 0.7121342925659473,
"grad_norm": 1.65625,
"learning_rate": 1.5101226538742248e-05,
"loss": 1.4727,
"step": 464
},
{
"epoch": 0.7136690647482015,
"grad_norm": 1.6953125,
"learning_rate": 1.50795937378064e-05,
"loss": 1.3891,
"step": 465
},
{
"epoch": 0.7152038369304556,
"grad_norm": 1.7578125,
"learning_rate": 1.505792885139579e-05,
"loss": 1.5394,
"step": 466
},
{
"epoch": 0.7167386091127098,
"grad_norm": 1.6875,
"learning_rate": 1.503623201635761e-05,
"loss": 1.3889,
"step": 467
},
{
"epoch": 0.718273381294964,
"grad_norm": 1.71875,
"learning_rate": 1.5014503369740866e-05,
"loss": 1.4508,
"step": 468
},
{
"epoch": 0.7198081534772183,
"grad_norm": 1.71875,
"learning_rate": 1.4992743048795493e-05,
"loss": 1.4323,
"step": 469
},
{
"epoch": 0.7213429256594724,
"grad_norm": 1.90625,
"learning_rate": 1.4970951190971512e-05,
"loss": 1.549,
"step": 470
},
{
"epoch": 0.7228776978417266,
"grad_norm": 1.8046875,
"learning_rate": 1.4949127933918136e-05,
"loss": 1.4607,
"step": 471
},
{
"epoch": 0.7244124700239808,
"grad_norm": 1.984375,
"learning_rate": 1.4927273415482916e-05,
"loss": 1.5684,
"step": 472
},
{
"epoch": 0.725947242206235,
"grad_norm": 2.0625,
"learning_rate": 1.4905387773710876e-05,
"loss": 1.3445,
"step": 473
},
{
"epoch": 0.7274820143884893,
"grad_norm": 1.7421875,
"learning_rate": 1.4883471146843617e-05,
"loss": 1.3575,
"step": 474
},
{
"epoch": 0.7290167865707434,
"grad_norm": 1.71875,
"learning_rate": 1.486152367331847e-05,
"loss": 1.4004,
"step": 475
},
{
"epoch": 0.7305515587529976,
"grad_norm": 1.78125,
"learning_rate": 1.4839545491767599e-05,
"loss": 1.5095,
"step": 476
},
{
"epoch": 0.7320863309352518,
"grad_norm": 1.859375,
"learning_rate": 1.4817536741017153e-05,
"loss": 1.4924,
"step": 477
},
{
"epoch": 0.733621103117506,
"grad_norm": 1.6796875,
"learning_rate": 1.4795497560086358e-05,
"loss": 1.451,
"step": 478
},
{
"epoch": 0.7351558752997602,
"grad_norm": 1.6796875,
"learning_rate": 1.4773428088186662e-05,
"loss": 1.4369,
"step": 479
},
{
"epoch": 0.7366906474820144,
"grad_norm": 1.75,
"learning_rate": 1.4751328464720842e-05,
"loss": 1.3582,
"step": 480
},
{
"epoch": 0.7382254196642686,
"grad_norm": 1.703125,
"learning_rate": 1.4729198829282127e-05,
"loss": 1.4842,
"step": 481
},
{
"epoch": 0.7397601918465228,
"grad_norm": 1.6796875,
"learning_rate": 1.470703932165333e-05,
"loss": 1.3498,
"step": 482
},
{
"epoch": 0.7412949640287769,
"grad_norm": 1.8125,
"learning_rate": 1.4684850081805934e-05,
"loss": 1.522,
"step": 483
},
{
"epoch": 0.7428297362110312,
"grad_norm": 1.703125,
"learning_rate": 1.4662631249899248e-05,
"loss": 1.4422,
"step": 484
},
{
"epoch": 0.7443645083932854,
"grad_norm": 1.7421875,
"learning_rate": 1.4640382966279484e-05,
"loss": 1.331,
"step": 485
},
{
"epoch": 0.7458992805755396,
"grad_norm": 1.8125,
"learning_rate": 1.4618105371478896e-05,
"loss": 1.3508,
"step": 486
},
{
"epoch": 0.7474340527577937,
"grad_norm": 1.640625,
"learning_rate": 1.4595798606214882e-05,
"loss": 1.3859,
"step": 487
},
{
"epoch": 0.748968824940048,
"grad_norm": 1.765625,
"learning_rate": 1.4573462811389087e-05,
"loss": 1.3691,
"step": 488
},
{
"epoch": 0.7505035971223022,
"grad_norm": 1.6953125,
"learning_rate": 1.4551098128086538e-05,
"loss": 1.3617,
"step": 489
},
{
"epoch": 0.7505035971223022,
"eval_loss": 1.4075348377227783,
"eval_runtime": 58.9728,
"eval_samples_per_second": 21.264,
"eval_steps_per_second": 21.264,
"step": 489
},
{
"epoch": 0.7520383693045564,
"grad_norm": 1.7265625,
"learning_rate": 1.4528704697574729e-05,
"loss": 1.3604,
"step": 490
},
{
"epoch": 0.7535731414868105,
"grad_norm": 1.6796875,
"learning_rate": 1.4506282661302735e-05,
"loss": 1.2984,
"step": 491
},
{
"epoch": 0.7551079136690647,
"grad_norm": 1.7421875,
"learning_rate": 1.4483832160900326e-05,
"loss": 1.4131,
"step": 492
},
{
"epoch": 0.756642685851319,
"grad_norm": 1.6328125,
"learning_rate": 1.446135333817706e-05,
"loss": 1.4902,
"step": 493
},
{
"epoch": 0.7581774580335732,
"grad_norm": 1.796875,
"learning_rate": 1.4438846335121402e-05,
"loss": 1.4437,
"step": 494
},
{
"epoch": 0.7597122302158273,
"grad_norm": 1.6953125,
"learning_rate": 1.4416311293899816e-05,
"loss": 1.4344,
"step": 495
},
{
"epoch": 0.7612470023980815,
"grad_norm": 1.703125,
"learning_rate": 1.4393748356855865e-05,
"loss": 1.2446,
"step": 496
},
{
"epoch": 0.7627817745803357,
"grad_norm": 1.7265625,
"learning_rate": 1.437115766650933e-05,
"loss": 1.4651,
"step": 497
},
{
"epoch": 0.76431654676259,
"grad_norm": 1.6796875,
"learning_rate": 1.4348539365555283e-05,
"loss": 1.3397,
"step": 498
},
{
"epoch": 0.7658513189448442,
"grad_norm": 1.6796875,
"learning_rate": 1.432589359686321e-05,
"loss": 1.3666,
"step": 499
},
{
"epoch": 0.7673860911270983,
"grad_norm": 1.7109375,
"learning_rate": 1.430322050347609e-05,
"loss": 1.3271,
"step": 500
},
{
"epoch": 0.7689208633093525,
"grad_norm": 1.71875,
"learning_rate": 1.4280520228609503e-05,
"loss": 1.3099,
"step": 501
},
{
"epoch": 0.7704556354916067,
"grad_norm": 1.640625,
"learning_rate": 1.4257792915650728e-05,
"loss": 1.3086,
"step": 502
},
{
"epoch": 0.771990407673861,
"grad_norm": 1.7109375,
"learning_rate": 1.423503870815782e-05,
"loss": 1.3125,
"step": 503
},
{
"epoch": 0.7735251798561151,
"grad_norm": 1.7109375,
"learning_rate": 1.4212257749858727e-05,
"loss": 1.3633,
"step": 504
},
{
"epoch": 0.7750599520383693,
"grad_norm": 1.625,
"learning_rate": 1.4189450184650354e-05,
"loss": 1.2312,
"step": 505
},
{
"epoch": 0.7765947242206235,
"grad_norm": 1.7578125,
"learning_rate": 1.416661615659768e-05,
"loss": 1.3498,
"step": 506
},
{
"epoch": 0.7781294964028777,
"grad_norm": 1.8984375,
"learning_rate": 1.4143755809932843e-05,
"loss": 1.4694,
"step": 507
},
{
"epoch": 0.7796642685851319,
"grad_norm": 1.6953125,
"learning_rate": 1.412086928905421e-05,
"loss": 1.1963,
"step": 508
},
{
"epoch": 0.7811990407673861,
"grad_norm": 1.6015625,
"learning_rate": 1.4097956738525493e-05,
"loss": 1.2387,
"step": 509
},
{
"epoch": 0.7827338129496403,
"grad_norm": 1.8125,
"learning_rate": 1.4075018303074808e-05,
"loss": 1.434,
"step": 510
},
{
"epoch": 0.7842685851318945,
"grad_norm": 1.75,
"learning_rate": 1.4052054127593782e-05,
"loss": 1.3472,
"step": 511
},
{
"epoch": 0.7858033573141486,
"grad_norm": 1.71875,
"learning_rate": 1.4029064357136628e-05,
"loss": 1.461,
"step": 512
},
{
"epoch": 0.7873381294964029,
"grad_norm": 1.65625,
"learning_rate": 1.4006049136919229e-05,
"loss": 1.4106,
"step": 513
},
{
"epoch": 0.7888729016786571,
"grad_norm": 1.6953125,
"learning_rate": 1.398300861231823e-05,
"loss": 1.3753,
"step": 514
},
{
"epoch": 0.7904076738609113,
"grad_norm": 1.8203125,
"learning_rate": 1.3959942928870101e-05,
"loss": 1.3298,
"step": 515
},
{
"epoch": 0.7919424460431654,
"grad_norm": 1.6328125,
"learning_rate": 1.3936852232270236e-05,
"loss": 1.3482,
"step": 516
},
{
"epoch": 0.7934772182254196,
"grad_norm": 1.8671875,
"learning_rate": 1.3913736668372027e-05,
"loss": 1.4847,
"step": 517
},
{
"epoch": 0.7950119904076739,
"grad_norm": 1.6875,
"learning_rate": 1.3890596383185934e-05,
"loss": 1.4272,
"step": 518
},
{
"epoch": 0.7965467625899281,
"grad_norm": 1.6640625,
"learning_rate": 1.386743152287858e-05,
"loss": 1.3732,
"step": 519
},
{
"epoch": 0.7980815347721822,
"grad_norm": 1.78125,
"learning_rate": 1.384424223377181e-05,
"loss": 1.4443,
"step": 520
},
{
"epoch": 0.7996163069544364,
"grad_norm": 1.796875,
"learning_rate": 1.3821028662341776e-05,
"loss": 1.4799,
"step": 521
},
{
"epoch": 0.8011510791366907,
"grad_norm": 1.6953125,
"learning_rate": 1.3797790955218014e-05,
"loss": 1.4129,
"step": 522
},
{
"epoch": 0.8026858513189449,
"grad_norm": 1.8515625,
"learning_rate": 1.3774529259182508e-05,
"loss": 1.3799,
"step": 523
},
{
"epoch": 0.804220623501199,
"grad_norm": 1.7578125,
"learning_rate": 1.3751243721168778e-05,
"loss": 1.4216,
"step": 524
},
{
"epoch": 0.8057553956834532,
"grad_norm": 1.921875,
"learning_rate": 1.3727934488260934e-05,
"loss": 1.4918,
"step": 525
},
{
"epoch": 0.8072901678657074,
"grad_norm": 1.7109375,
"learning_rate": 1.3704601707692762e-05,
"loss": 1.3585,
"step": 526
},
{
"epoch": 0.8088249400479617,
"grad_norm": 1.75,
"learning_rate": 1.3681245526846782e-05,
"loss": 1.3654,
"step": 527
},
{
"epoch": 0.8103597122302159,
"grad_norm": 1.71875,
"learning_rate": 1.3657866093253327e-05,
"loss": 1.3949,
"step": 528
},
{
"epoch": 0.81189448441247,
"grad_norm": 1.6875,
"learning_rate": 1.3634463554589608e-05,
"loss": 1.3878,
"step": 529
},
{
"epoch": 0.8134292565947242,
"grad_norm": 1.765625,
"learning_rate": 1.3611038058678776e-05,
"loss": 1.4569,
"step": 530
},
{
"epoch": 0.8149640287769784,
"grad_norm": 1.7265625,
"learning_rate": 1.3587589753488999e-05,
"loss": 1.4791,
"step": 531
},
{
"epoch": 0.8164988009592327,
"grad_norm": 1.8125,
"learning_rate": 1.3564118787132507e-05,
"loss": 1.5325,
"step": 532
},
{
"epoch": 0.8180335731414868,
"grad_norm": 1.71875,
"learning_rate": 1.3540625307864693e-05,
"loss": 1.3922,
"step": 533
},
{
"epoch": 0.819568345323741,
"grad_norm": 1.8359375,
"learning_rate": 1.3517109464083129e-05,
"loss": 1.5347,
"step": 534
},
{
"epoch": 0.8211031175059952,
"grad_norm": 1.7421875,
"learning_rate": 1.3493571404326671e-05,
"loss": 1.4345,
"step": 535
},
{
"epoch": 0.8226378896882494,
"grad_norm": 1.765625,
"learning_rate": 1.3470011277274497e-05,
"loss": 1.4761,
"step": 536
},
{
"epoch": 0.8241726618705036,
"grad_norm": 1.6953125,
"learning_rate": 1.344642923174517e-05,
"loss": 1.3524,
"step": 537
},
{
"epoch": 0.8257074340527578,
"grad_norm": 1.7421875,
"learning_rate": 1.3422825416695713e-05,
"loss": 1.3459,
"step": 538
},
{
"epoch": 0.827242206235012,
"grad_norm": 1.6796875,
"learning_rate": 1.3399199981220648e-05,
"loss": 1.3793,
"step": 539
},
{
"epoch": 0.8287769784172662,
"grad_norm": 1.6484375,
"learning_rate": 1.337555307455106e-05,
"loss": 1.2792,
"step": 540
},
{
"epoch": 0.8303117505995203,
"grad_norm": 1.6875,
"learning_rate": 1.3351884846053668e-05,
"loss": 1.4985,
"step": 541
},
{
"epoch": 0.8318465227817746,
"grad_norm": 1.7734375,
"learning_rate": 1.3328195445229869e-05,
"loss": 1.5698,
"step": 542
},
{
"epoch": 0.8333812949640288,
"grad_norm": 1.625,
"learning_rate": 1.330448502171479e-05,
"loss": 1.3706,
"step": 543
},
{
"epoch": 0.834916067146283,
"grad_norm": 1.8046875,
"learning_rate": 1.3280753725276352e-05,
"loss": 1.4925,
"step": 544
},
{
"epoch": 0.8364508393285371,
"grad_norm": 1.875,
"learning_rate": 1.3257001705814323e-05,
"loss": 1.5999,
"step": 545
},
{
"epoch": 0.8379856115107913,
"grad_norm": 1.84375,
"learning_rate": 1.3233229113359368e-05,
"loss": 1.495,
"step": 546
},
{
"epoch": 0.8395203836930456,
"grad_norm": 1.765625,
"learning_rate": 1.3209436098072095e-05,
"loss": 1.3432,
"step": 547
},
{
"epoch": 0.8410551558752998,
"grad_norm": 1.65625,
"learning_rate": 1.3185622810242129e-05,
"loss": 1.3679,
"step": 548
},
{
"epoch": 0.842589928057554,
"grad_norm": 1.8203125,
"learning_rate": 1.316178940028713e-05,
"loss": 1.424,
"step": 549
},
{
"epoch": 0.8441247002398081,
"grad_norm": 1.6953125,
"learning_rate": 1.3137936018751876e-05,
"loss": 1.4842,
"step": 550
},
{
"epoch": 0.8456594724220623,
"grad_norm": 1.6484375,
"learning_rate": 1.3114062816307284e-05,
"loss": 1.2388,
"step": 551
},
{
"epoch": 0.8471942446043166,
"grad_norm": 1.7109375,
"learning_rate": 1.3090169943749475e-05,
"loss": 1.4275,
"step": 552
},
{
"epoch": 0.8487290167865708,
"grad_norm": 1.765625,
"learning_rate": 1.3066257551998822e-05,
"loss": 1.508,
"step": 553
},
{
"epoch": 0.8502637889688249,
"grad_norm": 1.828125,
"learning_rate": 1.3042325792098982e-05,
"loss": 1.5241,
"step": 554
},
{
"epoch": 0.8517985611510791,
"grad_norm": 1.7578125,
"learning_rate": 1.3018374815215962e-05,
"loss": 1.3896,
"step": 555
},
{
"epoch": 0.8533333333333334,
"grad_norm": 1.6953125,
"learning_rate": 1.2994404772637145e-05,
"loss": 1.3765,
"step": 556
},
{
"epoch": 0.8548681055155876,
"grad_norm": 1.6796875,
"learning_rate": 1.297041581577035e-05,
"loss": 1.3199,
"step": 557
},
{
"epoch": 0.8564028776978417,
"grad_norm": 1.6875,
"learning_rate": 1.2946408096142866e-05,
"loss": 1.511,
"step": 558
},
{
"epoch": 0.8579376498800959,
"grad_norm": 1.7265625,
"learning_rate": 1.2922381765400501e-05,
"loss": 1.3674,
"step": 559
},
{
"epoch": 0.8594724220623501,
"grad_norm": 1.6484375,
"learning_rate": 1.289833697530661e-05,
"loss": 1.3332,
"step": 560
},
{
"epoch": 0.8610071942446044,
"grad_norm": 1.6484375,
"learning_rate": 1.2874273877741165e-05,
"loss": 1.3316,
"step": 561
},
{
"epoch": 0.8625419664268585,
"grad_norm": 1.9140625,
"learning_rate": 1.2850192624699762e-05,
"loss": 1.3782,
"step": 562
},
{
"epoch": 0.8640767386091127,
"grad_norm": 1.7109375,
"learning_rate": 1.2826093368292687e-05,
"loss": 1.4955,
"step": 563
},
{
"epoch": 0.8656115107913669,
"grad_norm": 1.8359375,
"learning_rate": 1.2801976260743937e-05,
"loss": 1.5445,
"step": 564
},
{
"epoch": 0.8671462829736211,
"grad_norm": 1.7265625,
"learning_rate": 1.2777841454390276e-05,
"loss": 1.3496,
"step": 565
},
{
"epoch": 0.8686810551558753,
"grad_norm": 1.7578125,
"learning_rate": 1.2753689101680252e-05,
"loss": 1.4302,
"step": 566
},
{
"epoch": 0.8702158273381295,
"grad_norm": 1.7578125,
"learning_rate": 1.2729519355173254e-05,
"loss": 1.3425,
"step": 567
},
{
"epoch": 0.8717505995203837,
"grad_norm": 1.703125,
"learning_rate": 1.2705332367538539e-05,
"loss": 1.5349,
"step": 568
},
{
"epoch": 0.8732853717026379,
"grad_norm": 1.765625,
"learning_rate": 1.2681128291554263e-05,
"loss": 1.4892,
"step": 569
},
{
"epoch": 0.874820143884892,
"grad_norm": 1.78125,
"learning_rate": 1.2656907280106528e-05,
"loss": 1.455,
"step": 570
},
{
"epoch": 0.8763549160671463,
"grad_norm": 1.75,
"learning_rate": 1.2632669486188403e-05,
"loss": 1.5562,
"step": 571
},
{
"epoch": 0.8778896882494005,
"grad_norm": 1.6328125,
"learning_rate": 1.2608415062898971e-05,
"loss": 1.3335,
"step": 572
},
{
"epoch": 0.8794244604316547,
"grad_norm": 1.765625,
"learning_rate": 1.2584144163442347e-05,
"loss": 1.4161,
"step": 573
},
{
"epoch": 0.8809592326139088,
"grad_norm": 1.71875,
"learning_rate": 1.255985694112673e-05,
"loss": 1.4954,
"step": 574
},
{
"epoch": 0.882494004796163,
"grad_norm": 1.796875,
"learning_rate": 1.2535553549363407e-05,
"loss": 1.415,
"step": 575
},
{
"epoch": 0.8840287769784173,
"grad_norm": 1.6796875,
"learning_rate": 1.2511234141665816e-05,
"loss": 1.4748,
"step": 576
},
{
"epoch": 0.8855635491606715,
"grad_norm": 1.7109375,
"learning_rate": 1.2486898871648552e-05,
"loss": 1.2765,
"step": 577
},
{
"epoch": 0.8870983213429257,
"grad_norm": 1.765625,
"learning_rate": 1.2462547893026403e-05,
"loss": 1.5225,
"step": 578
},
{
"epoch": 0.8886330935251798,
"grad_norm": 1.7734375,
"learning_rate": 1.2438181359613388e-05,
"loss": 1.4643,
"step": 579
},
{
"epoch": 0.890167865707434,
"grad_norm": 1.765625,
"learning_rate": 1.2413799425321774e-05,
"loss": 1.4519,
"step": 580
},
{
"epoch": 0.8917026378896883,
"grad_norm": 1.6953125,
"learning_rate": 1.2389402244161107e-05,
"loss": 1.4527,
"step": 581
},
{
"epoch": 0.8932374100719425,
"grad_norm": 1.859375,
"learning_rate": 1.236498997023725e-05,
"loss": 1.4682,
"step": 582
},
{
"epoch": 0.8947721822541966,
"grad_norm": 1.7578125,
"learning_rate": 1.2340562757751385e-05,
"loss": 1.4018,
"step": 583
},
{
"epoch": 0.8963069544364508,
"grad_norm": 1.734375,
"learning_rate": 1.2316120760999066e-05,
"loss": 1.4245,
"step": 584
},
{
"epoch": 0.897841726618705,
"grad_norm": 1.8515625,
"learning_rate": 1.2291664134369229e-05,
"loss": 1.4849,
"step": 585
},
{
"epoch": 0.8993764988009593,
"grad_norm": 1.6328125,
"learning_rate": 1.2267193032343219e-05,
"loss": 1.3096,
"step": 586
},
{
"epoch": 0.9009112709832134,
"grad_norm": 1.71875,
"learning_rate": 1.2242707609493814e-05,
"loss": 1.4568,
"step": 587
},
{
"epoch": 0.9024460431654676,
"grad_norm": 1.8203125,
"learning_rate": 1.2218208020484255e-05,
"loss": 1.4273,
"step": 588
},
{
"epoch": 0.9039808153477218,
"grad_norm": 1.8046875,
"learning_rate": 1.219369442006726e-05,
"loss": 1.3472,
"step": 589
},
{
"epoch": 0.9055155875299761,
"grad_norm": 1.71875,
"learning_rate": 1.2169166963084056e-05,
"loss": 1.2589,
"step": 590
},
{
"epoch": 0.9070503597122302,
"grad_norm": 1.78125,
"learning_rate": 1.2144625804463384e-05,
"loss": 1.5012,
"step": 591
},
{
"epoch": 0.9085851318944844,
"grad_norm": 1.7890625,
"learning_rate": 1.212007109922055e-05,
"loss": 1.513,
"step": 592
},
{
"epoch": 0.9101199040767386,
"grad_norm": 1.6875,
"learning_rate": 1.2095503002456405e-05,
"loss": 1.3524,
"step": 593
},
{
"epoch": 0.9116546762589928,
"grad_norm": 1.6953125,
"learning_rate": 1.2070921669356415e-05,
"loss": 1.291,
"step": 594
},
{
"epoch": 0.913189448441247,
"grad_norm": 1.8046875,
"learning_rate": 1.2046327255189627e-05,
"loss": 1.4709,
"step": 595
},
{
"epoch": 0.9147242206235012,
"grad_norm": 1.7265625,
"learning_rate": 1.2021719915307737e-05,
"loss": 1.4373,
"step": 596
},
{
"epoch": 0.9162589928057554,
"grad_norm": 1.7109375,
"learning_rate": 1.1997099805144071e-05,
"loss": 1.3847,
"step": 597
},
{
"epoch": 0.9177937649880096,
"grad_norm": 1.6796875,
"learning_rate": 1.197246708021263e-05,
"loss": 1.3515,
"step": 598
},
{
"epoch": 0.9193285371702637,
"grad_norm": 1.9609375,
"learning_rate": 1.194782189610709e-05,
"loss": 1.5657,
"step": 599
},
{
"epoch": 0.920863309352518,
"grad_norm": 1.6953125,
"learning_rate": 1.192316440849983e-05,
"loss": 1.3718,
"step": 600
},
{
"epoch": 0.9223980815347722,
"grad_norm": 1.71875,
"learning_rate": 1.1898494773140942e-05,
"loss": 1.4058,
"step": 601
},
{
"epoch": 0.9239328537170264,
"grad_norm": 1.75,
"learning_rate": 1.187381314585725e-05,
"loss": 1.4138,
"step": 602
},
{
"epoch": 0.9254676258992806,
"grad_norm": 1.6953125,
"learning_rate": 1.1849119682551323e-05,
"loss": 1.5707,
"step": 603
},
{
"epoch": 0.9270023980815347,
"grad_norm": 1.7890625,
"learning_rate": 1.1824414539200505e-05,
"loss": 1.5327,
"step": 604
},
{
"epoch": 0.928537170263789,
"grad_norm": 1.8125,
"learning_rate": 1.17996978718559e-05,
"loss": 1.4747,
"step": 605
},
{
"epoch": 0.9300719424460432,
"grad_norm": 1.7578125,
"learning_rate": 1.1774969836641417e-05,
"loss": 1.5315,
"step": 606
},
{
"epoch": 0.9316067146282974,
"grad_norm": 1.765625,
"learning_rate": 1.1750230589752763e-05,
"loss": 1.501,
"step": 607
},
{
"epoch": 0.9331414868105515,
"grad_norm": 1.6796875,
"learning_rate": 1.1725480287456467e-05,
"loss": 1.3595,
"step": 608
},
{
"epoch": 0.9346762589928057,
"grad_norm": 1.78125,
"learning_rate": 1.1700719086088891e-05,
"loss": 1.5799,
"step": 609
},
{
"epoch": 0.93621103117506,
"grad_norm": 1.6640625,
"learning_rate": 1.1675947142055241e-05,
"loss": 1.4093,
"step": 610
},
{
"epoch": 0.9377458033573142,
"grad_norm": 1.796875,
"learning_rate": 1.165116461182858e-05,
"loss": 1.3595,
"step": 611
},
{
"epoch": 0.9392805755395683,
"grad_norm": 1.7265625,
"learning_rate": 1.1626371651948839e-05,
"loss": 1.4811,
"step": 612
},
{
"epoch": 0.9408153477218225,
"grad_norm": 1.8203125,
"learning_rate": 1.160156841902182e-05,
"loss": 1.3337,
"step": 613
},
{
"epoch": 0.9423501199040767,
"grad_norm": 1.8515625,
"learning_rate": 1.1576755069718229e-05,
"loss": 1.3943,
"step": 614
},
{
"epoch": 0.943884892086331,
"grad_norm": 1.734375,
"learning_rate": 1.155193176077266e-05,
"loss": 1.4056,
"step": 615
},
{
"epoch": 0.9454196642685851,
"grad_norm": 1.6953125,
"learning_rate": 1.1527098648982634e-05,
"loss": 1.5342,
"step": 616
},
{
"epoch": 0.9469544364508393,
"grad_norm": 1.7109375,
"learning_rate": 1.1502255891207572e-05,
"loss": 1.3041,
"step": 617
},
{
"epoch": 0.9484892086330935,
"grad_norm": 1.6953125,
"learning_rate": 1.1477403644367839e-05,
"loss": 1.3494,
"step": 618
},
{
"epoch": 0.9500239808153477,
"grad_norm": 1.6875,
"learning_rate": 1.1452542065443728e-05,
"loss": 1.465,
"step": 619
},
{
"epoch": 0.951558752997602,
"grad_norm": 1.765625,
"learning_rate": 1.1427671311474489e-05,
"loss": 1.336,
"step": 620
},
{
"epoch": 0.9530935251798561,
"grad_norm": 1.71875,
"learning_rate": 1.140279153955732e-05,
"loss": 1.3387,
"step": 621
},
{
"epoch": 0.9546282973621103,
"grad_norm": 1.71875,
"learning_rate": 1.137790290684638e-05,
"loss": 1.398,
"step": 622
},
{
"epoch": 0.9561630695443645,
"grad_norm": 1.6875,
"learning_rate": 1.1353005570551803e-05,
"loss": 1.509,
"step": 623
},
{
"epoch": 0.9576978417266188,
"grad_norm": 1.765625,
"learning_rate": 1.1328099687938696e-05,
"loss": 1.4732,
"step": 624
},
{
"epoch": 0.9592326139088729,
"grad_norm": 1.71875,
"learning_rate": 1.1303185416326148e-05,
"loss": 1.3411,
"step": 625
},
{
"epoch": 0.9607673860911271,
"grad_norm": 1.7734375,
"learning_rate": 1.1278262913086238e-05,
"loss": 1.4715,
"step": 626
},
{
"epoch": 0.9623021582733813,
"grad_norm": 1.71875,
"learning_rate": 1.1253332335643043e-05,
"loss": 1.3279,
"step": 627
},
{
"epoch": 0.9638369304556355,
"grad_norm": 1.6796875,
"learning_rate": 1.1228393841471644e-05,
"loss": 1.3264,
"step": 628
},
{
"epoch": 0.9653717026378897,
"grad_norm": 1.703125,
"learning_rate": 1.1203447588097115e-05,
"loss": 1.2625,
"step": 629
},
{
"epoch": 0.9669064748201439,
"grad_norm": 1.75,
"learning_rate": 1.117849373309356e-05,
"loss": 1.4892,
"step": 630
},
{
"epoch": 0.9684412470023981,
"grad_norm": 1.8046875,
"learning_rate": 1.1153532434083083e-05,
"loss": 1.4547,
"step": 631
},
{
"epoch": 0.9699760191846523,
"grad_norm": 1.8046875,
"learning_rate": 1.1128563848734817e-05,
"loss": 1.5092,
"step": 632
},
{
"epoch": 0.9715107913669064,
"grad_norm": 1.6640625,
"learning_rate": 1.1103588134763918e-05,
"loss": 1.424,
"step": 633
},
{
"epoch": 0.9730455635491607,
"grad_norm": 1.7109375,
"learning_rate": 1.1078605449930569e-05,
"loss": 1.4477,
"step": 634
},
{
"epoch": 0.9745803357314149,
"grad_norm": 1.625,
"learning_rate": 1.1053615952038984e-05,
"loss": 1.2692,
"step": 635
},
{
"epoch": 0.9761151079136691,
"grad_norm": 1.71875,
"learning_rate": 1.1028619798936418e-05,
"loss": 1.4006,
"step": 636
},
{
"epoch": 0.9776498800959232,
"grad_norm": 1.6875,
"learning_rate": 1.1003617148512149e-05,
"loss": 1.4153,
"step": 637
},
{
"epoch": 0.9791846522781774,
"grad_norm": 1.6796875,
"learning_rate": 1.0978608158696517e-05,
"loss": 1.4156,
"step": 638
},
{
"epoch": 0.9807194244604317,
"grad_norm": 1.796875,
"learning_rate": 1.0953592987459886e-05,
"loss": 1.4369,
"step": 639
},
{
"epoch": 0.9822541966426859,
"grad_norm": 1.8828125,
"learning_rate": 1.092857179281168e-05,
"loss": 1.482,
"step": 640
},
{
"epoch": 0.98378896882494,
"grad_norm": 1.734375,
"learning_rate": 1.0903544732799357e-05,
"loss": 1.4631,
"step": 641
},
{
"epoch": 0.9853237410071942,
"grad_norm": 1.6640625,
"learning_rate": 1.0878511965507435e-05,
"loss": 1.4286,
"step": 642
},
{
"epoch": 0.9868585131894484,
"grad_norm": 1.859375,
"learning_rate": 1.0853473649056472e-05,
"loss": 1.5408,
"step": 643
},
{
"epoch": 0.9883932853717027,
"grad_norm": 1.78125,
"learning_rate": 1.0828429941602082e-05,
"loss": 1.4246,
"step": 644
},
{
"epoch": 0.9899280575539569,
"grad_norm": 1.703125,
"learning_rate": 1.0803381001333943e-05,
"loss": 1.401,
"step": 645
},
{
"epoch": 0.991462829736211,
"grad_norm": 1.71875,
"learning_rate": 1.0778326986474765e-05,
"loss": 1.3825,
"step": 646
},
{
"epoch": 0.9929976019184652,
"grad_norm": 1.75,
"learning_rate": 1.0753268055279328e-05,
"loss": 1.4568,
"step": 647
},
{
"epoch": 0.9945323741007194,
"grad_norm": 1.6875,
"learning_rate": 1.072820436603346e-05,
"loss": 1.4033,
"step": 648
},
{
"epoch": 0.9960671462829737,
"grad_norm": 1.734375,
"learning_rate": 1.070313607705304e-05,
"loss": 1.4913,
"step": 649
},
{
"epoch": 0.9976019184652278,
"grad_norm": 1.7421875,
"learning_rate": 1.067806334668301e-05,
"loss": 1.452,
"step": 650
},
{
"epoch": 0.999136690647482,
"grad_norm": 1.8046875,
"learning_rate": 1.0652986333296358e-05,
"loss": 1.5346,
"step": 651
}
],
"logging_steps": 1,
"max_steps": 1302,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 651,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.6771286351373926e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}