{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 61.53846153846154,
"eval_steps": 5,
"global_step": 800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.07692307692307693,
"grad_norm": 10603.1904296875,
"learning_rate": 0.0,
"loss": 22.6733,
"step": 1
},
{
"epoch": 0.15384615384615385,
"grad_norm": 18773.23046875,
"learning_rate": 4e-08,
"loss": 20.714,
"step": 2
},
{
"epoch": 0.23076923076923078,
"grad_norm": 4270.68408203125,
"learning_rate": 8e-08,
"loss": 22.6768,
"step": 3
},
{
"epoch": 0.3076923076923077,
"grad_norm": 8179.71484375,
"learning_rate": 1.2000000000000002e-07,
"loss": 25.0131,
"step": 4
},
{
"epoch": 0.38461538461538464,
"grad_norm": 16160.1201171875,
"learning_rate": 1.6e-07,
"loss": 22.2821,
"step": 5
},
{
"epoch": 0.46153846153846156,
"grad_norm": 8064.51904296875,
"learning_rate": 2.0000000000000002e-07,
"loss": 21.2213,
"step": 6
},
{
"epoch": 0.5384615384615384,
"grad_norm": 12106.345703125,
"learning_rate": 2.4000000000000003e-07,
"loss": 24.638,
"step": 7
},
{
"epoch": 0.6153846153846154,
"grad_norm": 8413.4814453125,
"learning_rate": 2.8e-07,
"loss": 20.6285,
"step": 8
},
{
"epoch": 0.6923076923076923,
"grad_norm": 15460.4384765625,
"learning_rate": 3.2e-07,
"loss": 21.5866,
"step": 9
},
{
"epoch": 0.7692307692307693,
"grad_norm": 4440.8818359375,
"learning_rate": 3.6e-07,
"loss": 22.0973,
"step": 10
},
{
"epoch": 0.8461538461538461,
"grad_norm": 4846.01171875,
"learning_rate": 4.0000000000000003e-07,
"loss": 26.5506,
"step": 11
},
{
"epoch": 0.9230769230769231,
"grad_norm": 6784.392578125,
"learning_rate": 4.4e-07,
"loss": 26.4308,
"step": 12
},
{
"epoch": 1.0,
"grad_norm": 10356.7490234375,
"learning_rate": 4.800000000000001e-07,
"loss": 21.2232,
"step": 13
},
{
"epoch": 1.0769230769230769,
"grad_norm": 12669.5966796875,
"learning_rate": 5.2e-07,
"loss": 19.8431,
"step": 14
},
{
"epoch": 1.1538461538461537,
"grad_norm": 7368.5390625,
"learning_rate": 5.6e-07,
"loss": 19.2139,
"step": 15
},
{
"epoch": 1.2307692307692308,
"grad_norm": 4959.1923828125,
"learning_rate": 6.000000000000001e-07,
"loss": 18.3705,
"step": 16
},
{
"epoch": 1.3076923076923077,
"grad_norm": 5191.330078125,
"learning_rate": 6.4e-07,
"loss": 22.0389,
"step": 17
},
{
"epoch": 1.3846153846153846,
"grad_norm": 10824.8740234375,
"learning_rate": 6.800000000000001e-07,
"loss": 20.2947,
"step": 18
},
{
"epoch": 1.4615384615384617,
"grad_norm": 5129.83056640625,
"learning_rate": 7.2e-07,
"loss": 21.5472,
"step": 19
},
{
"epoch": 1.5384615384615383,
"grad_norm": 7372.88818359375,
"learning_rate": 7.6e-07,
"loss": 19.5856,
"step": 20
},
{
"epoch": 1.6153846153846154,
"grad_norm": 4771.4990234375,
"learning_rate": 8.000000000000001e-07,
"loss": 18.078,
"step": 21
},
{
"epoch": 1.6923076923076923,
"grad_norm": 8714.8642578125,
"learning_rate": 8.400000000000001e-07,
"loss": 20.8261,
"step": 22
},
{
"epoch": 1.7692307692307692,
"grad_norm": 10882.0322265625,
"learning_rate": 8.8e-07,
"loss": 17.8416,
"step": 23
},
{
"epoch": 1.8461538461538463,
"grad_norm": 3724.619873046875,
"learning_rate": 9.200000000000001e-07,
"loss": 18.9121,
"step": 24
},
{
"epoch": 1.9230769230769231,
"grad_norm": 8529.5771484375,
"learning_rate": 9.600000000000001e-07,
"loss": 24.5815,
"step": 25
},
{
"epoch": 2.0,
"grad_norm": 8510.6318359375,
"learning_rate": 1.0000000000000002e-06,
"loss": 20.9304,
"step": 26
},
{
"epoch": 2.076923076923077,
"grad_norm": 9672.4150390625,
"learning_rate": 1.04e-06,
"loss": 18.5716,
"step": 27
},
{
"epoch": 2.1538461538461537,
"grad_norm": 7587.6533203125,
"learning_rate": 1.08e-06,
"loss": 18.0011,
"step": 28
},
{
"epoch": 2.230769230769231,
"grad_norm": 33927.44140625,
"learning_rate": 1.12e-06,
"loss": 17.8299,
"step": 29
},
{
"epoch": 2.3076923076923075,
"grad_norm": 5066.283203125,
"learning_rate": 1.1600000000000001e-06,
"loss": 15.1792,
"step": 30
},
{
"epoch": 2.3846153846153846,
"grad_norm": 11348.0380859375,
"learning_rate": 1.2000000000000002e-06,
"loss": 17.1275,
"step": 31
},
{
"epoch": 2.4615384615384617,
"grad_norm": 44569.71484375,
"learning_rate": 1.2400000000000002e-06,
"loss": 17.5373,
"step": 32
},
{
"epoch": 2.5384615384615383,
"grad_norm": 11042.56640625,
"learning_rate": 1.28e-06,
"loss": 14.4275,
"step": 33
},
{
"epoch": 2.6153846153846154,
"grad_norm": 14324.48046875,
"learning_rate": 1.32e-06,
"loss": 16.0846,
"step": 34
},
{
"epoch": 2.6923076923076925,
"grad_norm": 6262.25732421875,
"learning_rate": 1.3600000000000001e-06,
"loss": 12.7508,
"step": 35
},
{
"epoch": 2.769230769230769,
"grad_norm": 4430.26611328125,
"learning_rate": 1.4000000000000001e-06,
"loss": 13.6969,
"step": 36
},
{
"epoch": 2.8461538461538463,
"grad_norm": 6138.24267578125,
"learning_rate": 1.44e-06,
"loss": 12.5667,
"step": 37
},
{
"epoch": 2.9230769230769234,
"grad_norm": 3587.69482421875,
"learning_rate": 1.48e-06,
"loss": 13.052,
"step": 38
},
{
"epoch": 3.0,
"grad_norm": 11029.201171875,
"learning_rate": 1.52e-06,
"loss": 14.1599,
"step": 39
},
{
"epoch": 3.076923076923077,
"grad_norm": 4541.20166015625,
"learning_rate": 1.56e-06,
"loss": 14.2273,
"step": 40
},
{
"epoch": 3.1538461538461537,
"grad_norm": 6240.2138671875,
"learning_rate": 1.6000000000000001e-06,
"loss": 11.7308,
"step": 41
},
{
"epoch": 3.230769230769231,
"grad_norm": 11963.646484375,
"learning_rate": 1.6400000000000002e-06,
"loss": 12.2135,
"step": 42
},
{
"epoch": 3.3076923076923075,
"grad_norm": 7477.02392578125,
"learning_rate": 1.6800000000000002e-06,
"loss": 7.8914,
"step": 43
},
{
"epoch": 3.3846153846153846,
"grad_norm": 4601.59130859375,
"learning_rate": 1.72e-06,
"loss": 10.5209,
"step": 44
},
{
"epoch": 3.4615384615384617,
"grad_norm": 12468.453125,
"learning_rate": 1.76e-06,
"loss": 9.8911,
"step": 45
},
{
"epoch": 3.5384615384615383,
"grad_norm": 4691.3603515625,
"learning_rate": 1.8000000000000001e-06,
"loss": 8.5108,
"step": 46
},
{
"epoch": 3.6153846153846154,
"grad_norm": 14303.9404296875,
"learning_rate": 1.8400000000000002e-06,
"loss": 9.6945,
"step": 47
},
{
"epoch": 3.6923076923076925,
"grad_norm": 2076.6015625,
"learning_rate": 1.8800000000000002e-06,
"loss": 5.6841,
"step": 48
},
{
"epoch": 3.769230769230769,
"grad_norm": 2748.860107421875,
"learning_rate": 1.9200000000000003e-06,
"loss": 5.7316,
"step": 49
},
{
"epoch": 3.8461538461538463,
"grad_norm": 4779.833984375,
"learning_rate": 1.9600000000000003e-06,
"loss": 6.1169,
"step": 50
},
{
"epoch": 3.9230769230769234,
"grad_norm": 6074.677734375,
"learning_rate": 2.0000000000000003e-06,
"loss": 6.5592,
"step": 51
},
{
"epoch": 4.0,
"grad_norm": 7416.8369140625,
"learning_rate": 2.04e-06,
"loss": 7.5151,
"step": 52
},
{
"epoch": 4.076923076923077,
"grad_norm": 4175.63232421875,
"learning_rate": 2.08e-06,
"loss": 6.613,
"step": 53
},
{
"epoch": 4.153846153846154,
"grad_norm": 2247.82177734375,
"learning_rate": 2.12e-06,
"loss": 5.7516,
"step": 54
},
{
"epoch": 4.230769230769231,
"grad_norm": 6121.06298828125,
"learning_rate": 2.16e-06,
"loss": 6.1682,
"step": 55
},
{
"epoch": 4.3076923076923075,
"grad_norm": 4141.83349609375,
"learning_rate": 2.2e-06,
"loss": 6.0549,
"step": 56
},
{
"epoch": 4.384615384615385,
"grad_norm": 1865.611572265625,
"learning_rate": 2.24e-06,
"loss": 5.7047,
"step": 57
},
{
"epoch": 4.461538461538462,
"grad_norm": 3789.943115234375,
"learning_rate": 2.28e-06,
"loss": 6.7775,
"step": 58
},
{
"epoch": 4.538461538461538,
"grad_norm": 10279.765625,
"learning_rate": 2.3200000000000002e-06,
"loss": 6.5486,
"step": 59
},
{
"epoch": 4.615384615384615,
"grad_norm": 4512.77392578125,
"learning_rate": 2.3600000000000003e-06,
"loss": 5.9955,
"step": 60
},
{
"epoch": 4.6923076923076925,
"grad_norm": 9854.623046875,
"learning_rate": 2.4000000000000003e-06,
"loss": 5.008,
"step": 61
},
{
"epoch": 4.769230769230769,
"grad_norm": 1842.1689453125,
"learning_rate": 2.4400000000000004e-06,
"loss": 5.6766,
"step": 62
},
{
"epoch": 4.846153846153846,
"grad_norm": 8768.17578125,
"learning_rate": 2.4800000000000004e-06,
"loss": 4.1796,
"step": 63
},
{
"epoch": 4.923076923076923,
"grad_norm": 1296.5732421875,
"learning_rate": 2.52e-06,
"loss": 4.6935,
"step": 64
},
{
"epoch": 5.0,
"grad_norm": 3000.628662109375,
"learning_rate": 2.56e-06,
"loss": 3.4662,
"step": 65
},
{
"epoch": 5.076923076923077,
"grad_norm": 2375.695556640625,
"learning_rate": 2.6e-06,
"loss": 5.7503,
"step": 66
},
{
"epoch": 5.153846153846154,
"grad_norm": 1394.402587890625,
"learning_rate": 2.64e-06,
"loss": 5.1836,
"step": 67
},
{
"epoch": 5.230769230769231,
"grad_norm": 5750.8896484375,
"learning_rate": 2.68e-06,
"loss": 4.6198,
"step": 68
},
{
"epoch": 5.3076923076923075,
"grad_norm": 2214.36572265625,
"learning_rate": 2.7200000000000002e-06,
"loss": 4.5119,
"step": 69
},
{
"epoch": 5.384615384615385,
"grad_norm": 4186.42919921875,
"learning_rate": 2.7600000000000003e-06,
"loss": 4.4198,
"step": 70
},
{
"epoch": 5.461538461538462,
"grad_norm": 1318.0018310546875,
"learning_rate": 2.8000000000000003e-06,
"loss": 3.2784,
"step": 71
},
{
"epoch": 5.538461538461538,
"grad_norm": 1700.1236572265625,
"learning_rate": 2.84e-06,
"loss": 3.6871,
"step": 72
},
{
"epoch": 5.615384615384615,
"grad_norm": 3383.92626953125,
"learning_rate": 2.88e-06,
"loss": 4.5666,
"step": 73
},
{
"epoch": 5.6923076923076925,
"grad_norm": 1545.00439453125,
"learning_rate": 2.92e-06,
"loss": 3.5176,
"step": 74
},
{
"epoch": 5.769230769230769,
"grad_norm": 2377.838623046875,
"learning_rate": 2.96e-06,
"loss": 3.8324,
"step": 75
},
{
"epoch": 5.846153846153846,
"grad_norm": 884.7638549804688,
"learning_rate": 3e-06,
"loss": 2.3883,
"step": 76
},
{
"epoch": 5.923076923076923,
"grad_norm": 1984.365234375,
"learning_rate": 3.04e-06,
"loss": 3.7156,
"step": 77
},
{
"epoch": 6.0,
"grad_norm": 3334.85205078125,
"learning_rate": 3.08e-06,
"loss": 3.0619,
"step": 78
},
{
"epoch": 6.076923076923077,
"grad_norm": 2713.583740234375,
"learning_rate": 3.12e-06,
"loss": 4.0911,
"step": 79
},
{
"epoch": 6.153846153846154,
"grad_norm": 831.1427612304688,
"learning_rate": 3.1600000000000002e-06,
"loss": 4.4941,
"step": 80
},
{
"epoch": 6.230769230769231,
"grad_norm": 1827.432861328125,
"learning_rate": 3.2000000000000003e-06,
"loss": 3.5104,
"step": 81
},
{
"epoch": 6.3076923076923075,
"grad_norm": 1751.157958984375,
"learning_rate": 3.2400000000000003e-06,
"loss": 3.3955,
"step": 82
},
{
"epoch": 6.384615384615385,
"grad_norm": 1335.197265625,
"learning_rate": 3.2800000000000004e-06,
"loss": 3.0868,
"step": 83
},
{
"epoch": 6.461538461538462,
"grad_norm": 2164.307373046875,
"learning_rate": 3.3200000000000004e-06,
"loss": 3.5908,
"step": 84
},
{
"epoch": 6.538461538461538,
"grad_norm": 3304.489990234375,
"learning_rate": 3.3600000000000004e-06,
"loss": 3.2302,
"step": 85
},
{
"epoch": 6.615384615384615,
"grad_norm": 3139.857421875,
"learning_rate": 3.4000000000000005e-06,
"loss": 3.3594,
"step": 86
},
{
"epoch": 6.6923076923076925,
"grad_norm": 1009.633544921875,
"learning_rate": 3.44e-06,
"loss": 2.9373,
"step": 87
},
{
"epoch": 6.769230769230769,
"grad_norm": 1588.922119140625,
"learning_rate": 3.48e-06,
"loss": 3.0434,
"step": 88
},
{
"epoch": 6.846153846153846,
"grad_norm": 2253.773681640625,
"learning_rate": 3.52e-06,
"loss": 2.5412,
"step": 89
},
{
"epoch": 6.923076923076923,
"grad_norm": 1642.713134765625,
"learning_rate": 3.5600000000000002e-06,
"loss": 2.2512,
"step": 90
},
{
"epoch": 7.0,
"grad_norm": 1364.428955078125,
"learning_rate": 3.6000000000000003e-06,
"loss": 3.0326,
"step": 91
},
{
"epoch": 7.076923076923077,
"grad_norm": 11717.6826171875,
"learning_rate": 3.6400000000000003e-06,
"loss": 3.6506,
"step": 92
},
{
"epoch": 7.153846153846154,
"grad_norm": 1019.0966186523438,
"learning_rate": 3.6800000000000003e-06,
"loss": 2.8424,
"step": 93
},
{
"epoch": 7.230769230769231,
"grad_norm": 508.2272644042969,
"learning_rate": 3.7200000000000004e-06,
"loss": 2.4272,
"step": 94
},
{
"epoch": 7.3076923076923075,
"grad_norm": 6681.02880859375,
"learning_rate": 3.7600000000000004e-06,
"loss": 2.5006,
"step": 95
},
{
"epoch": 7.384615384615385,
"grad_norm": 327.763916015625,
"learning_rate": 3.8000000000000005e-06,
"loss": 1.7235,
"step": 96
},
{
"epoch": 7.461538461538462,
"grad_norm": 717.2875366210938,
"learning_rate": 3.8400000000000005e-06,
"loss": 1.3483,
"step": 97
},
{
"epoch": 7.538461538461538,
"grad_norm": 14631.060546875,
"learning_rate": 3.88e-06,
"loss": 1.7917,
"step": 98
},
{
"epoch": 7.615384615384615,
"grad_norm": 449.271484375,
"learning_rate": 3.920000000000001e-06,
"loss": 1.3824,
"step": 99
},
{
"epoch": 7.6923076923076925,
"grad_norm": 1226.365966796875,
"learning_rate": 3.96e-06,
"loss": 1.6257,
"step": 100
},
{
"epoch": 7.769230769230769,
"grad_norm": 254.60325622558594,
"learning_rate": 4.000000000000001e-06,
"loss": 1.9142,
"step": 101
},
{
"epoch": 7.846153846153846,
"grad_norm": 558.2205200195312,
"learning_rate": 4.04e-06,
"loss": 1.1077,
"step": 102
},
{
"epoch": 7.923076923076923,
"grad_norm": 188.0357666015625,
"learning_rate": 4.08e-06,
"loss": 0.9966,
"step": 103
},
{
"epoch": 8.0,
"grad_norm": 145.9339599609375,
"learning_rate": 4.12e-06,
"loss": 1.1691,
"step": 104
},
{
"epoch": 8.076923076923077,
"grad_norm": 122.64077758789062,
"learning_rate": 4.16e-06,
"loss": 1.0857,
"step": 105
},
{
"epoch": 8.153846153846153,
"grad_norm": 416.3562927246094,
"learning_rate": 4.2000000000000004e-06,
"loss": 1.3436,
"step": 106
},
{
"epoch": 8.23076923076923,
"grad_norm": 313.8348388671875,
"learning_rate": 4.24e-06,
"loss": 1.5924,
"step": 107
},
{
"epoch": 8.307692307692308,
"grad_norm": 174.41867065429688,
"learning_rate": 4.2800000000000005e-06,
"loss": 1.2701,
"step": 108
},
{
"epoch": 8.384615384615385,
"grad_norm": 135.88780212402344,
"learning_rate": 4.32e-06,
"loss": 0.9964,
"step": 109
},
{
"epoch": 8.461538461538462,
"grad_norm": 119.21726989746094,
"learning_rate": 4.360000000000001e-06,
"loss": 0.9765,
"step": 110
},
{
"epoch": 8.538461538461538,
"grad_norm": 124.00638580322266,
"learning_rate": 4.4e-06,
"loss": 1.0436,
"step": 111
},
{
"epoch": 8.615384615384615,
"grad_norm": 87.69738006591797,
"learning_rate": 4.440000000000001e-06,
"loss": 0.9144,
"step": 112
},
{
"epoch": 8.692307692307692,
"grad_norm": 98.52690124511719,
"learning_rate": 4.48e-06,
"loss": 0.8034,
"step": 113
},
{
"epoch": 8.76923076923077,
"grad_norm": 67.8703842163086,
"learning_rate": 4.520000000000001e-06,
"loss": 0.7949,
"step": 114
},
{
"epoch": 8.846153846153847,
"grad_norm": 129.94183349609375,
"learning_rate": 4.56e-06,
"loss": 1.1878,
"step": 115
},
{
"epoch": 8.923076923076923,
"grad_norm": 185.4768829345703,
"learning_rate": 4.600000000000001e-06,
"loss": 1.015,
"step": 116
},
{
"epoch": 9.0,
"grad_norm": 122.55597686767578,
"learning_rate": 4.6400000000000005e-06,
"loss": 0.9446,
"step": 117
},
{
"epoch": 9.076923076923077,
"grad_norm": 86.92323303222656,
"learning_rate": 4.680000000000001e-06,
"loss": 0.833,
"step": 118
},
{
"epoch": 9.153846153846153,
"grad_norm": 109.41946411132812,
"learning_rate": 4.7200000000000005e-06,
"loss": 0.9403,
"step": 119
},
{
"epoch": 9.23076923076923,
"grad_norm": 79.83373260498047,
"learning_rate": 4.76e-06,
"loss": 0.6093,
"step": 120
},
{
"epoch": 9.307692307692308,
"grad_norm": 102.91453552246094,
"learning_rate": 4.800000000000001e-06,
"loss": 0.9903,
"step": 121
},
{
"epoch": 9.384615384615385,
"grad_norm": 329.02606201171875,
"learning_rate": 4.84e-06,
"loss": 1.2028,
"step": 122
},
{
"epoch": 9.461538461538462,
"grad_norm": 160.85386657714844,
"learning_rate": 4.880000000000001e-06,
"loss": 1.4724,
"step": 123
},
{
"epoch": 9.538461538461538,
"grad_norm": 229.27630615234375,
"learning_rate": 4.92e-06,
"loss": 0.9034,
"step": 124
},
{
"epoch": 9.615384615384615,
"grad_norm": 91.00493621826172,
"learning_rate": 4.960000000000001e-06,
"loss": 1.0996,
"step": 125
},
{
"epoch": 9.692307692307692,
"grad_norm": 60.38620376586914,
"learning_rate": 5e-06,
"loss": 0.7422,
"step": 126
},
{
"epoch": 9.76923076923077,
"grad_norm": 52.122467041015625,
"learning_rate": 5.04e-06,
"loss": 0.8313,
"step": 127
},
{
"epoch": 9.846153846153847,
"grad_norm": 36.39122772216797,
"learning_rate": 5.0800000000000005e-06,
"loss": 0.8565,
"step": 128
},
{
"epoch": 9.923076923076923,
"grad_norm": 46.33543014526367,
"learning_rate": 5.12e-06,
"loss": 0.8425,
"step": 129
},
{
"epoch": 10.0,
"grad_norm": 85.27603149414062,
"learning_rate": 5.1600000000000006e-06,
"loss": 0.6042,
"step": 130
},
{
"epoch": 10.076923076923077,
"grad_norm": 108.0735855102539,
"learning_rate": 5.2e-06,
"loss": 0.8783,
"step": 131
},
{
"epoch": 10.153846153846153,
"grad_norm": 153.18704223632812,
"learning_rate": 5.240000000000001e-06,
"loss": 1.1126,
"step": 132
},
{
"epoch": 10.23076923076923,
"grad_norm": 93.4756851196289,
"learning_rate": 5.28e-06,
"loss": 0.8742,
"step": 133
},
{
"epoch": 10.307692307692308,
"grad_norm": 33.396385192871094,
"learning_rate": 5.320000000000001e-06,
"loss": 0.7842,
"step": 134
},
{
"epoch": 10.384615384615385,
"grad_norm": 64.87910461425781,
"learning_rate": 5.36e-06,
"loss": 0.783,
"step": 135
},
{
"epoch": 10.461538461538462,
"grad_norm": 92.65341186523438,
"learning_rate": 5.400000000000001e-06,
"loss": 0.7836,
"step": 136
},
{
"epoch": 10.538461538461538,
"grad_norm": 145.02798461914062,
"learning_rate": 5.4400000000000004e-06,
"loss": 0.9292,
"step": 137
},
{
"epoch": 10.615384615384615,
"grad_norm": 70.17644500732422,
"learning_rate": 5.480000000000001e-06,
"loss": 0.7544,
"step": 138
},
{
"epoch": 10.692307692307692,
"grad_norm": 41.04573059082031,
"learning_rate": 5.5200000000000005e-06,
"loss": 0.7911,
"step": 139
},
{
"epoch": 10.76923076923077,
"grad_norm": 109.60137176513672,
"learning_rate": 5.560000000000001e-06,
"loss": 0.9269,
"step": 140
},
{
"epoch": 10.846153846153847,
"grad_norm": 129.09300231933594,
"learning_rate": 5.600000000000001e-06,
"loss": 0.9927,
"step": 141
},
{
"epoch": 10.923076923076923,
"grad_norm": 140.2090301513672,
"learning_rate": 5.64e-06,
"loss": 0.9177,
"step": 142
},
{
"epoch": 11.0,
"grad_norm": 136.94422912597656,
"learning_rate": 5.68e-06,
"loss": 0.8187,
"step": 143
},
{
"epoch": 11.076923076923077,
"grad_norm": 85.431396484375,
"learning_rate": 5.72e-06,
"loss": 0.7158,
"step": 144
},
{
"epoch": 11.153846153846153,
"grad_norm": 38.3684196472168,
"learning_rate": 5.76e-06,
"loss": 0.7148,
"step": 145
},
{
"epoch": 11.23076923076923,
"grad_norm": 49.99292755126953,
"learning_rate": 5.8e-06,
"loss": 0.761,
"step": 146
},
{
"epoch": 11.307692307692308,
"grad_norm": 64.29403686523438,
"learning_rate": 5.84e-06,
"loss": 0.8409,
"step": 147
},
{
"epoch": 11.384615384615385,
"grad_norm": 105.57362365722656,
"learning_rate": 5.8800000000000005e-06,
"loss": 0.7556,
"step": 148
},
{
"epoch": 11.461538461538462,
"grad_norm": 93.06744384765625,
"learning_rate": 5.92e-06,
"loss": 0.8834,
"step": 149
},
{
"epoch": 11.538461538461538,
"grad_norm": 30.977516174316406,
"learning_rate": 5.9600000000000005e-06,
"loss": 0.7184,
"step": 150
},
{
"epoch": 11.615384615384615,
"grad_norm": 58.94570541381836,
"learning_rate": 6e-06,
"loss": 0.643,
"step": 151
},
{
"epoch": 11.692307692307692,
"grad_norm": 102.8298110961914,
"learning_rate": 6.040000000000001e-06,
"loss": 0.8481,
"step": 152
},
{
"epoch": 11.76923076923077,
"grad_norm": 105.51367950439453,
"learning_rate": 6.08e-06,
"loss": 0.8117,
"step": 153
},
{
"epoch": 11.846153846153847,
"grad_norm": 73.83160400390625,
"learning_rate": 6.120000000000001e-06,
"loss": 0.7865,
"step": 154
},
{
"epoch": 11.923076923076923,
"grad_norm": 83.67406463623047,
"learning_rate": 6.16e-06,
"loss": 0.6597,
"step": 155
},
{
"epoch": 12.0,
"grad_norm": 37.48268127441406,
"learning_rate": 6.200000000000001e-06,
"loss": 0.5755,
"step": 156
},
{
"epoch": 12.076923076923077,
"grad_norm": 42.852882385253906,
"learning_rate": 6.24e-06,
"loss": 0.5435,
"step": 157
},
{
"epoch": 12.153846153846153,
"grad_norm": 68.47445678710938,
"learning_rate": 6.280000000000001e-06,
"loss": 0.9999,
"step": 158
},
{
"epoch": 12.23076923076923,
"grad_norm": 90.29669189453125,
"learning_rate": 6.3200000000000005e-06,
"loss": 1.3117,
"step": 159
},
{
"epoch": 12.307692307692308,
"grad_norm": 88.94297790527344,
"learning_rate": 6.360000000000001e-06,
"loss": 1.1126,
"step": 160
},
{
"epoch": 12.384615384615385,
"grad_norm": 34.859493255615234,
"learning_rate": 6.4000000000000006e-06,
"loss": 0.6848,
"step": 161
},
{
"epoch": 12.461538461538462,
"grad_norm": 39.13771057128906,
"learning_rate": 6.440000000000001e-06,
"loss": 0.7297,
"step": 162
},
{
"epoch": 12.538461538461538,
"grad_norm": 65.69542694091797,
"learning_rate": 6.480000000000001e-06,
"loss": 0.8216,
"step": 163
},
{
"epoch": 12.615384615384615,
"grad_norm": 51.67008972167969,
"learning_rate": 6.520000000000001e-06,
"loss": 0.725,
"step": 164
},
{
"epoch": 12.692307692307692,
"grad_norm": 23.950634002685547,
"learning_rate": 6.560000000000001e-06,
"loss": 0.6542,
"step": 165
},
{
"epoch": 12.76923076923077,
"grad_norm": 62.85305404663086,
"learning_rate": 6.600000000000001e-06,
"loss": 1.0018,
"step": 166
},
{
"epoch": 12.846153846153847,
"grad_norm": 71.49420928955078,
"learning_rate": 6.640000000000001e-06,
"loss": 0.8524,
"step": 167
},
{
"epoch": 12.923076923076923,
"grad_norm": 65.00899505615234,
"learning_rate": 6.680000000000001e-06,
"loss": 0.8182,
"step": 168
},
{
"epoch": 13.0,
"grad_norm": 62.54741668701172,
"learning_rate": 6.720000000000001e-06,
"loss": 0.6744,
"step": 169
},
{
"epoch": 13.076923076923077,
"grad_norm": 53.9980354309082,
"learning_rate": 6.760000000000001e-06,
"loss": 0.7276,
"step": 170
},
{
"epoch": 13.153846153846153,
"grad_norm": 49.69089126586914,
"learning_rate": 6.800000000000001e-06,
"loss": 0.7577,
"step": 171
},
{
"epoch": 13.23076923076923,
"grad_norm": 56.17091751098633,
"learning_rate": 6.8400000000000014e-06,
"loss": 0.6323,
"step": 172
},
{
"epoch": 13.307692307692308,
"grad_norm": 131.43931579589844,
"learning_rate": 6.88e-06,
"loss": 1.235,
"step": 173
},
{
"epoch": 13.384615384615385,
"grad_norm": 75.32357788085938,
"learning_rate": 6.92e-06,
"loss": 0.6758,
"step": 174
},
{
"epoch": 13.461538461538462,
"grad_norm": 69.21751403808594,
"learning_rate": 6.96e-06,
"loss": 0.7003,
"step": 175
},
{
"epoch": 13.538461538461538,
"grad_norm": 129.3466339111328,
"learning_rate": 7e-06,
"loss": 0.7214,
"step": 176
},
{
"epoch": 13.615384615384615,
"grad_norm": 44.15930938720703,
"learning_rate": 7.04e-06,
"loss": 0.526,
"step": 177
},
{
"epoch": 13.692307692307692,
"grad_norm": 53.00956344604492,
"learning_rate": 7.08e-06,
"loss": 0.5758,
"step": 178
},
{
"epoch": 13.76923076923077,
"grad_norm": 68.80349731445312,
"learning_rate": 7.1200000000000004e-06,
"loss": 1.0676,
"step": 179
},
{
"epoch": 13.846153846153847,
"grad_norm": 178.73776245117188,
"learning_rate": 7.16e-06,
"loss": 0.7152,
"step": 180
},
{
"epoch": 13.923076923076923,
"grad_norm": 113.57772064208984,
"learning_rate": 7.2000000000000005e-06,
"loss": 0.9495,
"step": 181
},
{
"epoch": 14.0,
"grad_norm": 93.48717498779297,
"learning_rate": 7.24e-06,
"loss": 0.7061,
"step": 182
},
{
"epoch": 14.076923076923077,
"grad_norm": 82.60102081298828,
"learning_rate": 7.280000000000001e-06,
"loss": 1.2033,
"step": 183
},
{
"epoch": 14.153846153846153,
"grad_norm": 66.38690948486328,
"learning_rate": 7.32e-06,
"loss": 0.9306,
"step": 184
},
{
"epoch": 14.23076923076923,
"grad_norm": 62.00544357299805,
"learning_rate": 7.360000000000001e-06,
"loss": 0.6083,
"step": 185
},
{
"epoch": 14.307692307692308,
"grad_norm": 42.88933563232422,
"learning_rate": 7.4e-06,
"loss": 0.9682,
"step": 186
},
{
"epoch": 14.384615384615385,
"grad_norm": 35.30773162841797,
"learning_rate": 7.440000000000001e-06,
"loss": 0.9414,
"step": 187
},
{
"epoch": 14.461538461538462,
"grad_norm": 58.071876525878906,
"learning_rate": 7.48e-06,
"loss": 0.8669,
"step": 188
},
{
"epoch": 14.538461538461538,
"grad_norm": 94.3634033203125,
"learning_rate": 7.520000000000001e-06,
"loss": 0.9253,
"step": 189
},
{
"epoch": 14.615384615384615,
"grad_norm": 70.96503448486328,
"learning_rate": 7.5600000000000005e-06,
"loss": 0.9492,
"step": 190
},
{
"epoch": 14.692307692307692,
"grad_norm": 53.165340423583984,
"learning_rate": 7.600000000000001e-06,
"loss": 0.7418,
"step": 191
},
{
"epoch": 14.76923076923077,
"grad_norm": 29.172006607055664,
"learning_rate": 7.640000000000001e-06,
"loss": 0.6271,
"step": 192
},
{
"epoch": 14.846153846153847,
"grad_norm": 115.76858520507812,
"learning_rate": 7.680000000000001e-06,
"loss": 1.0605,
"step": 193
},
{
"epoch": 14.923076923076923,
"grad_norm": 61.08775329589844,
"learning_rate": 7.72e-06,
"loss": 0.8579,
"step": 194
},
{
"epoch": 15.0,
"grad_norm": 65.41018676757812,
"learning_rate": 7.76e-06,
"loss": 0.694,
"step": 195
},
{
"epoch": 15.076923076923077,
"grad_norm": 67.5228271484375,
"learning_rate": 7.800000000000002e-06,
"loss": 0.695,
"step": 196
},
{
"epoch": 15.153846153846153,
"grad_norm": 30.65340805053711,
"learning_rate": 7.840000000000001e-06,
"loss": 0.6205,
"step": 197
},
{
"epoch": 15.23076923076923,
"grad_norm": 83.59008026123047,
"learning_rate": 7.88e-06,
"loss": 0.6932,
"step": 198
},
{
"epoch": 15.307692307692308,
"grad_norm": 48.95726013183594,
"learning_rate": 7.92e-06,
"loss": 0.7861,
"step": 199
},
{
"epoch": 15.384615384615385,
"grad_norm": 65.74507904052734,
"learning_rate": 7.960000000000002e-06,
"loss": 0.7961,
"step": 200
},
{
"epoch": 15.461538461538462,
"grad_norm": 58.61296081542969,
"learning_rate": 8.000000000000001e-06,
"loss": 0.7204,
"step": 201
},
{
"epoch": 15.538461538461538,
"grad_norm": 79.05587005615234,
"learning_rate": 8.040000000000001e-06,
"loss": 0.8421,
"step": 202
},
{
"epoch": 15.615384615384615,
"grad_norm": 44.74805450439453,
"learning_rate": 8.08e-06,
"loss": 0.6835,
"step": 203
},
{
"epoch": 15.692307692307692,
"grad_norm": 86.16783142089844,
"learning_rate": 8.120000000000002e-06,
"loss": 0.8516,
"step": 204
},
{
"epoch": 15.76923076923077,
"grad_norm": 78.34517669677734,
"learning_rate": 8.16e-06,
"loss": 0.5696,
"step": 205
},
{
"epoch": 15.846153846153847,
"grad_norm": 60.46382141113281,
"learning_rate": 8.2e-06,
"loss": 0.7431,
"step": 206
},
{
"epoch": 15.923076923076923,
"grad_norm": 75.38282012939453,
"learning_rate": 8.24e-06,
"loss": 0.7579,
"step": 207
},
{
"epoch": 16.0,
"grad_norm": 61.66571807861328,
"learning_rate": 8.28e-06,
"loss": 0.5579,
"step": 208
},
{
"epoch": 16.076923076923077,
"grad_norm": 176.31370544433594,
"learning_rate": 8.32e-06,
"loss": 1.0035,
"step": 209
},
{
"epoch": 16.153846153846153,
"grad_norm": 217.41488647460938,
"learning_rate": 8.36e-06,
"loss": 1.1147,
"step": 210
},
{
"epoch": 16.23076923076923,
"grad_norm": 45.08114242553711,
"learning_rate": 8.400000000000001e-06,
"loss": 0.7391,
"step": 211
},
{
"epoch": 16.307692307692307,
"grad_norm": 56.17387771606445,
"learning_rate": 8.44e-06,
"loss": 0.7647,
"step": 212
},
{
"epoch": 16.384615384615383,
"grad_norm": 51.677215576171875,
"learning_rate": 8.48e-06,
"loss": 0.6528,
"step": 213
},
{
"epoch": 16.46153846153846,
"grad_norm": 49.8151969909668,
"learning_rate": 8.52e-06,
"loss": 0.8012,
"step": 214
},
{
"epoch": 16.53846153846154,
"grad_norm": 83.117431640625,
"learning_rate": 8.560000000000001e-06,
"loss": 0.7326,
"step": 215
},
{
"epoch": 16.615384615384617,
"grad_norm": 31.789459228515625,
"learning_rate": 8.6e-06,
"loss": 0.7549,
"step": 216
},
{
"epoch": 16.692307692307693,
"grad_norm": 80.32394409179688,
"learning_rate": 8.64e-06,
"loss": 0.6813,
"step": 217
},
{
"epoch": 16.76923076923077,
"grad_norm": 76.61673736572266,
"learning_rate": 8.68e-06,
"loss": 0.6526,
"step": 218
},
{
"epoch": 16.846153846153847,
"grad_norm": 46.9598503112793,
"learning_rate": 8.720000000000001e-06,
"loss": 0.501,
"step": 219
},
{
"epoch": 16.923076923076923,
"grad_norm": 73.53797912597656,
"learning_rate": 8.76e-06,
"loss": 0.9022,
"step": 220
},
{
"epoch": 17.0,
"grad_norm": 58.83550262451172,
"learning_rate": 8.8e-06,
"loss": 0.7628,
"step": 221
},
{
"epoch": 17.076923076923077,
"grad_norm": 90.0180435180664,
"learning_rate": 8.84e-06,
"loss": 1.0188,
"step": 222
},
{
"epoch": 17.153846153846153,
"grad_norm": 48.37046813964844,
"learning_rate": 8.880000000000001e-06,
"loss": 0.7707,
"step": 223
},
{
"epoch": 17.23076923076923,
"grad_norm": 52.4448356628418,
"learning_rate": 8.920000000000001e-06,
"loss": 0.7103,
"step": 224
},
{
"epoch": 17.307692307692307,
"grad_norm": 36.983585357666016,
"learning_rate": 8.96e-06,
"loss": 0.7305,
"step": 225
},
{
"epoch": 17.384615384615383,
"grad_norm": 16.054697036743164,
"learning_rate": 9e-06,
"loss": 0.6611,
"step": 226
},
{
"epoch": 17.46153846153846,
"grad_norm": 45.36906814575195,
"learning_rate": 9.040000000000002e-06,
"loss": 0.6746,
"step": 227
},
{
"epoch": 17.53846153846154,
"grad_norm": 40.45295715332031,
"learning_rate": 9.080000000000001e-06,
"loss": 0.7042,
"step": 228
},
{
"epoch": 17.615384615384617,
"grad_norm": 56.89924240112305,
"learning_rate": 9.12e-06,
"loss": 0.5384,
"step": 229
},
{
"epoch": 17.692307692307693,
"grad_norm": 56.17110061645508,
"learning_rate": 9.16e-06,
"loss": 0.9373,
"step": 230
},
{
"epoch": 17.76923076923077,
"grad_norm": 61.18904113769531,
"learning_rate": 9.200000000000002e-06,
"loss": 0.6669,
"step": 231
},
{
"epoch": 17.846153846153847,
"grad_norm": 42.18205261230469,
"learning_rate": 9.240000000000001e-06,
"loss": 0.6855,
"step": 232
},
{
"epoch": 17.923076923076923,
"grad_norm": 73.21139526367188,
"learning_rate": 9.280000000000001e-06,
"loss": 0.8783,
"step": 233
},
{
"epoch": 18.0,
"grad_norm": 60.66477584838867,
"learning_rate": 9.32e-06,
"loss": 0.751,
"step": 234
},
{
"epoch": 18.076923076923077,
"grad_norm": 42.04085159301758,
"learning_rate": 9.360000000000002e-06,
"loss": 0.6296,
"step": 235
},
{
"epoch": 18.153846153846153,
"grad_norm": 68.64347076416016,
"learning_rate": 9.4e-06,
"loss": 0.7494,
"step": 236
},
{
"epoch": 18.23076923076923,
"grad_norm": 72.14678192138672,
"learning_rate": 9.440000000000001e-06,
"loss": 0.6706,
"step": 237
},
{
"epoch": 18.307692307692307,
"grad_norm": 90.8731918334961,
"learning_rate": 9.48e-06,
"loss": 0.6978,
"step": 238
},
{
"epoch": 18.384615384615383,
"grad_norm": 85.11774444580078,
"learning_rate": 9.52e-06,
"loss": 0.7161,
"step": 239
},
{
"epoch": 18.46153846153846,
"grad_norm": 30.654109954833984,
"learning_rate": 9.56e-06,
"loss": 0.7263,
"step": 240
},
{
"epoch": 18.53846153846154,
"grad_norm": 29.126256942749023,
"learning_rate": 9.600000000000001e-06,
"loss": 0.4471,
"step": 241
},
{
"epoch": 18.615384615384617,
"grad_norm": 39.70819854736328,
"learning_rate": 9.640000000000001e-06,
"loss": 0.5729,
"step": 242
},
{
"epoch": 18.692307692307693,
"grad_norm": 43.10990524291992,
"learning_rate": 9.68e-06,
"loss": 0.6048,
"step": 243
},
{
"epoch": 18.76923076923077,
"grad_norm": 57.911685943603516,
"learning_rate": 9.72e-06,
"loss": 0.6993,
"step": 244
},
{
"epoch": 18.846153846153847,
"grad_norm": 23.198652267456055,
"learning_rate": 9.760000000000001e-06,
"loss": 0.5685,
"step": 245
},
{
"epoch": 18.923076923076923,
"grad_norm": 45.444454193115234,
"learning_rate": 9.800000000000001e-06,
"loss": 0.678,
"step": 246
},
{
"epoch": 19.0,
"grad_norm": 26.42611312866211,
"learning_rate": 9.84e-06,
"loss": 0.5872,
"step": 247
},
{
"epoch": 19.076923076923077,
"grad_norm": 50.92378234863281,
"learning_rate": 9.88e-06,
"loss": 0.4833,
"step": 248
},
{
"epoch": 19.153846153846153,
"grad_norm": 46.32240676879883,
"learning_rate": 9.920000000000002e-06,
"loss": 0.5696,
"step": 249
},
{
"epoch": 19.23076923076923,
"grad_norm": 30.97173500061035,
"learning_rate": 9.960000000000001e-06,
"loss": 0.3094,
"step": 250
},
{
"epoch": 19.307692307692307,
"grad_norm": 99.83480834960938,
"learning_rate": 1e-05,
"loss": 1.3693,
"step": 251
},
{
"epoch": 19.384615384615383,
"grad_norm": 41.79911422729492,
"learning_rate": 1.004e-05,
"loss": 0.7863,
"step": 252
},
{
"epoch": 19.46153846153846,
"grad_norm": 40.042179107666016,
"learning_rate": 1.008e-05,
"loss": 0.4821,
"step": 253
},
{
"epoch": 19.53846153846154,
"grad_norm": 36.340492248535156,
"learning_rate": 1.0120000000000001e-05,
"loss": 0.6628,
"step": 254
},
{
"epoch": 19.615384615384617,
"grad_norm": 37.4381217956543,
"learning_rate": 1.0160000000000001e-05,
"loss": 0.5221,
"step": 255
},
{
"epoch": 19.692307692307693,
"grad_norm": 78.24024963378906,
"learning_rate": 1.02e-05,
"loss": 0.6961,
"step": 256
},
{
"epoch": 19.76923076923077,
"grad_norm": 61.44567108154297,
"learning_rate": 1.024e-05,
"loss": 0.6614,
"step": 257
},
{
"epoch": 19.846153846153847,
"grad_norm": 69.41854095458984,
"learning_rate": 1.0280000000000002e-05,
"loss": 1.0344,
"step": 258
},
{
"epoch": 19.923076923076923,
"grad_norm": 54.26890182495117,
"learning_rate": 1.0320000000000001e-05,
"loss": 0.6941,
"step": 259
},
{
"epoch": 20.0,
"grad_norm": 119.05464935302734,
"learning_rate": 1.036e-05,
"loss": 0.5696,
"step": 260
},
{
"epoch": 20.076923076923077,
"grad_norm": 43.01278305053711,
"learning_rate": 1.04e-05,
"loss": 0.6823,
"step": 261
},
{
"epoch": 20.153846153846153,
"grad_norm": 46.86002731323242,
"learning_rate": 1.0440000000000002e-05,
"loss": 0.6659,
"step": 262
},
{
"epoch": 20.23076923076923,
"grad_norm": 38.46581268310547,
"learning_rate": 1.0480000000000001e-05,
"loss": 0.4717,
"step": 263
},
{
"epoch": 20.307692307692307,
"grad_norm": 185.26800537109375,
"learning_rate": 1.0520000000000001e-05,
"loss": 0.7102,
"step": 264
},
{
"epoch": 20.384615384615383,
"grad_norm": 74.64105224609375,
"learning_rate": 1.056e-05,
"loss": 0.3521,
"step": 265
},
{
"epoch": 20.46153846153846,
"grad_norm": 60.956748962402344,
"learning_rate": 1.0600000000000002e-05,
"loss": 0.6713,
"step": 266
},
{
"epoch": 20.53846153846154,
"grad_norm": 43.28743362426758,
"learning_rate": 1.0640000000000001e-05,
"loss": 0.7422,
"step": 267
},
{
"epoch": 20.615384615384617,
"grad_norm": 56.09255599975586,
"learning_rate": 1.0680000000000001e-05,
"loss": 0.5013,
"step": 268
},
{
"epoch": 20.692307692307693,
"grad_norm": 70.18143463134766,
"learning_rate": 1.072e-05,
"loss": 0.5307,
"step": 269
},
{
"epoch": 20.76923076923077,
"grad_norm": 41.185638427734375,
"learning_rate": 1.0760000000000002e-05,
"loss": 0.8098,
"step": 270
},
{
"epoch": 20.846153846153847,
"grad_norm": 19.666332244873047,
"learning_rate": 1.0800000000000002e-05,
"loss": 0.4061,
"step": 271
},
{
"epoch": 20.923076923076923,
"grad_norm": 52.35540771484375,
"learning_rate": 1.0840000000000001e-05,
"loss": 0.2785,
"step": 272
},
{
"epoch": 21.0,
"grad_norm": 64.35240936279297,
"learning_rate": 1.0880000000000001e-05,
"loss": 0.4654,
"step": 273
},
{
"epoch": 21.076923076923077,
"grad_norm": 39.35055923461914,
"learning_rate": 1.0920000000000002e-05,
"loss": 0.4549,
"step": 274
},
{
"epoch": 21.153846153846153,
"grad_norm": 46.019657135009766,
"learning_rate": 1.0960000000000002e-05,
"loss": 0.4013,
"step": 275
},
{
"epoch": 21.23076923076923,
"grad_norm": 34.99978256225586,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.5063,
"step": 276
},
{
"epoch": 21.307692307692307,
"grad_norm": 274.4259033203125,
"learning_rate": 1.1040000000000001e-05,
"loss": 0.8115,
"step": 277
},
{
"epoch": 21.384615384615383,
"grad_norm": 58.749969482421875,
"learning_rate": 1.1080000000000002e-05,
"loss": 1.0227,
"step": 278
},
{
"epoch": 21.46153846153846,
"grad_norm": 68.37541961669922,
"learning_rate": 1.1120000000000002e-05,
"loss": 0.462,
"step": 279
},
{
"epoch": 21.53846153846154,
"grad_norm": 30.333087921142578,
"learning_rate": 1.1160000000000002e-05,
"loss": 0.4942,
"step": 280
},
{
"epoch": 21.615384615384617,
"grad_norm": 25.71001434326172,
"learning_rate": 1.1200000000000001e-05,
"loss": 0.2491,
"step": 281
},
{
"epoch": 21.692307692307693,
"grad_norm": 76.39506530761719,
"learning_rate": 1.1240000000000002e-05,
"loss": 1.0506,
"step": 282
},
{
"epoch": 21.76923076923077,
"grad_norm": 55.327606201171875,
"learning_rate": 1.128e-05,
"loss": 0.6434,
"step": 283
},
{
"epoch": 21.846153846153847,
"grad_norm": 56.551788330078125,
"learning_rate": 1.132e-05,
"loss": 0.8624,
"step": 284
},
{
"epoch": 21.923076923076923,
"grad_norm": 49.042423248291016,
"learning_rate": 1.136e-05,
"loss": 0.5041,
"step": 285
},
{
"epoch": 22.0,
"grad_norm": 58.502357482910156,
"learning_rate": 1.14e-05,
"loss": 0.4436,
"step": 286
},
{
"epoch": 22.076923076923077,
"grad_norm": 62.351776123046875,
"learning_rate": 1.144e-05,
"loss": 0.7322,
"step": 287
},
{
"epoch": 22.153846153846153,
"grad_norm": 98.29476928710938,
"learning_rate": 1.148e-05,
"loss": 0.9344,
"step": 288
},
{
"epoch": 22.23076923076923,
"grad_norm": 51.842838287353516,
"learning_rate": 1.152e-05,
"loss": 0.6241,
"step": 289
},
{
"epoch": 22.307692307692307,
"grad_norm": 44.713233947753906,
"learning_rate": 1.156e-05,
"loss": 0.418,
"step": 290
},
{
"epoch": 22.384615384615383,
"grad_norm": 40.88868713378906,
"learning_rate": 1.16e-05,
"loss": 0.3467,
"step": 291
},
{
"epoch": 22.46153846153846,
"grad_norm": 71.91492462158203,
"learning_rate": 1.164e-05,
"loss": 0.6216,
"step": 292
},
{
"epoch": 22.53846153846154,
"grad_norm": 69.22135925292969,
"learning_rate": 1.168e-05,
"loss": 0.3065,
"step": 293
},
{
"epoch": 22.615384615384617,
"grad_norm": 50.13063430786133,
"learning_rate": 1.172e-05,
"loss": 0.2739,
"step": 294
},
{
"epoch": 22.692307692307693,
"grad_norm": 43.53316879272461,
"learning_rate": 1.1760000000000001e-05,
"loss": 0.676,
"step": 295
},
{
"epoch": 22.76923076923077,
"grad_norm": 72.19952392578125,
"learning_rate": 1.18e-05,
"loss": 0.5518,
"step": 296
},
{
"epoch": 22.846153846153847,
"grad_norm": 99.4913330078125,
"learning_rate": 1.184e-05,
"loss": 0.4786,
"step": 297
},
{
"epoch": 22.923076923076923,
"grad_norm": 130.802978515625,
"learning_rate": 1.188e-05,
"loss": 0.3785,
"step": 298
},
{
"epoch": 23.0,
"grad_norm": 62.112648010253906,
"learning_rate": 1.1920000000000001e-05,
"loss": 0.3357,
"step": 299
},
{
"epoch": 23.076923076923077,
"grad_norm": 27.632856369018555,
"learning_rate": 1.196e-05,
"loss": 0.498,
"step": 300
},
{
"epoch": 23.153846153846153,
"grad_norm": 73.1949691772461,
"learning_rate": 1.2e-05,
"loss": 0.4796,
"step": 301
},
{
"epoch": 23.23076923076923,
"grad_norm": 101.7704086303711,
"learning_rate": 1.204e-05,
"loss": 0.4953,
"step": 302
},
{
"epoch": 23.307692307692307,
"grad_norm": 101.37821197509766,
"learning_rate": 1.2080000000000001e-05,
"loss": 0.594,
"step": 303
},
{
"epoch": 23.384615384615383,
"grad_norm": 80.45525360107422,
"learning_rate": 1.2120000000000001e-05,
"loss": 0.827,
"step": 304
},
{
"epoch": 23.46153846153846,
"grad_norm": 34.36630630493164,
"learning_rate": 1.216e-05,
"loss": 0.5362,
"step": 305
},
{
"epoch": 23.53846153846154,
"grad_norm": 41.87324905395508,
"learning_rate": 1.22e-05,
"loss": 0.3961,
"step": 306
},
{
"epoch": 23.615384615384617,
"grad_norm": 97.23712158203125,
"learning_rate": 1.2240000000000001e-05,
"loss": 0.5439,
"step": 307
},
{
"epoch": 23.692307692307693,
"grad_norm": 125.3348388671875,
"learning_rate": 1.2280000000000001e-05,
"loss": 0.7085,
"step": 308
},
{
"epoch": 23.76923076923077,
"grad_norm": 66.71159362792969,
"learning_rate": 1.232e-05,
"loss": 0.3981,
"step": 309
},
{
"epoch": 23.846153846153847,
"grad_norm": 45.817039489746094,
"learning_rate": 1.236e-05,
"loss": 0.4767,
"step": 310
},
{
"epoch": 23.923076923076923,
"grad_norm": 60.648868560791016,
"learning_rate": 1.2400000000000002e-05,
"loss": 0.1732,
"step": 311
},
{
"epoch": 24.0,
"grad_norm": 93.42353820800781,
"learning_rate": 1.2440000000000001e-05,
"loss": 0.7817,
"step": 312
},
{
"epoch": 24.076923076923077,
"grad_norm": 98.92288970947266,
"learning_rate": 1.248e-05,
"loss": 0.7387,
"step": 313
},
{
"epoch": 24.153846153846153,
"grad_norm": 79.44998168945312,
"learning_rate": 1.252e-05,
"loss": 0.8928,
"step": 314
},
{
"epoch": 24.23076923076923,
"grad_norm": 39.60928726196289,
"learning_rate": 1.2560000000000002e-05,
"loss": 0.4137,
"step": 315
},
{
"epoch": 24.307692307692307,
"grad_norm": 60.29780960083008,
"learning_rate": 1.2600000000000001e-05,
"loss": 0.4034,
"step": 316
},
{
"epoch": 24.384615384615383,
"grad_norm": 49.01469039916992,
"learning_rate": 1.2640000000000001e-05,
"loss": 0.64,
"step": 317
},
{
"epoch": 24.46153846153846,
"grad_norm": 77.11458587646484,
"learning_rate": 1.268e-05,
"loss": 0.5456,
"step": 318
},
{
"epoch": 24.53846153846154,
"grad_norm": 76.5569839477539,
"learning_rate": 1.2720000000000002e-05,
"loss": 0.8046,
"step": 319
},
{
"epoch": 24.615384615384617,
"grad_norm": 36.19839096069336,
"learning_rate": 1.2760000000000001e-05,
"loss": 0.5598,
"step": 320
},
{
"epoch": 24.692307692307693,
"grad_norm": 48.85591506958008,
"learning_rate": 1.2800000000000001e-05,
"loss": 0.6344,
"step": 321
},
{
"epoch": 24.76923076923077,
"grad_norm": 41.91597366333008,
"learning_rate": 1.284e-05,
"loss": 0.5952,
"step": 322
},
{
"epoch": 24.846153846153847,
"grad_norm": 23.071367263793945,
"learning_rate": 1.2880000000000002e-05,
"loss": 0.5319,
"step": 323
},
{
"epoch": 24.923076923076923,
"grad_norm": 24.764896392822266,
"learning_rate": 1.2920000000000002e-05,
"loss": 0.3762,
"step": 324
},
{
"epoch": 25.0,
"grad_norm": 57.47997283935547,
"learning_rate": 1.2960000000000001e-05,
"loss": 1.0161,
"step": 325
},
{
"epoch": 25.076923076923077,
"grad_norm": 89.52471923828125,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.61,
"step": 326
},
{
"epoch": 25.153846153846153,
"grad_norm": 70.85286712646484,
"learning_rate": 1.3040000000000002e-05,
"loss": 0.6296,
"step": 327
},
{
"epoch": 25.23076923076923,
"grad_norm": 29.83013343811035,
"learning_rate": 1.3080000000000002e-05,
"loss": 0.4853,
"step": 328
},
{
"epoch": 25.307692307692307,
"grad_norm": 42.2051887512207,
"learning_rate": 1.3120000000000001e-05,
"loss": 0.3599,
"step": 329
},
{
"epoch": 25.384615384615383,
"grad_norm": 22.57636260986328,
"learning_rate": 1.3160000000000001e-05,
"loss": 0.1958,
"step": 330
},
{
"epoch": 25.46153846153846,
"grad_norm": 31.4184513092041,
"learning_rate": 1.3200000000000002e-05,
"loss": 0.4977,
"step": 331
},
{
"epoch": 25.53846153846154,
"grad_norm": 65.11288452148438,
"learning_rate": 1.3240000000000002e-05,
"loss": 0.8784,
"step": 332
},
{
"epoch": 25.615384615384617,
"grad_norm": 63.44681930541992,
"learning_rate": 1.3280000000000002e-05,
"loss": 0.8258,
"step": 333
},
{
"epoch": 25.692307692307693,
"grad_norm": 61.35824203491211,
"learning_rate": 1.3320000000000001e-05,
"loss": 0.6076,
"step": 334
},
{
"epoch": 25.76923076923077,
"grad_norm": 42.7951774597168,
"learning_rate": 1.3360000000000003e-05,
"loss": 0.64,
"step": 335
},
{
"epoch": 25.846153846153847,
"grad_norm": 53.61860656738281,
"learning_rate": 1.3400000000000002e-05,
"loss": 0.3146,
"step": 336
},
{
"epoch": 25.923076923076923,
"grad_norm": 120.34032440185547,
"learning_rate": 1.3440000000000002e-05,
"loss": 1.4029,
"step": 337
},
{
"epoch": 26.0,
"grad_norm": 54.58380126953125,
"learning_rate": 1.3480000000000001e-05,
"loss": 0.4976,
"step": 338
},
{
"epoch": 26.076923076923077,
"grad_norm": 35.85188293457031,
"learning_rate": 1.3520000000000003e-05,
"loss": 0.2843,
"step": 339
},
{
"epoch": 26.153846153846153,
"grad_norm": 43.697750091552734,
"learning_rate": 1.3560000000000002e-05,
"loss": 0.3621,
"step": 340
},
{
"epoch": 26.23076923076923,
"grad_norm": 76.05105590820312,
"learning_rate": 1.3600000000000002e-05,
"loss": 0.4223,
"step": 341
},
{
"epoch": 26.307692307692307,
"grad_norm": 118.16177368164062,
"learning_rate": 1.3640000000000002e-05,
"loss": 0.8052,
"step": 342
},
{
"epoch": 26.384615384615383,
"grad_norm": 113.29070281982422,
"learning_rate": 1.3680000000000003e-05,
"loss": 0.8527,
"step": 343
},
{
"epoch": 26.46153846153846,
"grad_norm": 65.74857330322266,
"learning_rate": 1.3720000000000002e-05,
"loss": 0.4324,
"step": 344
},
{
"epoch": 26.53846153846154,
"grad_norm": 76.03609466552734,
"learning_rate": 1.376e-05,
"loss": 0.6704,
"step": 345
},
{
"epoch": 26.615384615384617,
"grad_norm": 84.36861419677734,
"learning_rate": 1.38e-05,
"loss": 0.6508,
"step": 346
},
{
"epoch": 26.692307692307693,
"grad_norm": 73.54359436035156,
"learning_rate": 1.384e-05,
"loss": 0.9164,
"step": 347
},
{
"epoch": 26.76923076923077,
"grad_norm": 80.98385620117188,
"learning_rate": 1.3880000000000001e-05,
"loss": 0.6096,
"step": 348
},
{
"epoch": 26.846153846153847,
"grad_norm": 54.7966194152832,
"learning_rate": 1.392e-05,
"loss": 0.6865,
"step": 349
},
{
"epoch": 26.923076923076923,
"grad_norm": 36.034706115722656,
"learning_rate": 1.396e-05,
"loss": 0.5209,
"step": 350
},
{
"epoch": 27.0,
"grad_norm": 58.19102478027344,
"learning_rate": 1.4e-05,
"loss": 0.3163,
"step": 351
},
{
"epoch": 27.076923076923077,
"grad_norm": 22.627710342407227,
"learning_rate": 1.4040000000000001e-05,
"loss": 0.3443,
"step": 352
},
{
"epoch": 27.153846153846153,
"grad_norm": 37.32178497314453,
"learning_rate": 1.408e-05,
"loss": 1.0384,
"step": 353
},
{
"epoch": 27.23076923076923,
"grad_norm": 38.546485900878906,
"learning_rate": 1.412e-05,
"loss": 0.3171,
"step": 354
},
{
"epoch": 27.307692307692307,
"grad_norm": 22.176666259765625,
"learning_rate": 1.416e-05,
"loss": 0.5926,
"step": 355
},
{
"epoch": 27.384615384615383,
"grad_norm": 23.193613052368164,
"learning_rate": 1.4200000000000001e-05,
"loss": 0.5167,
"step": 356
},
{
"epoch": 27.46153846153846,
"grad_norm": 23.118364334106445,
"learning_rate": 1.4240000000000001e-05,
"loss": 0.2792,
"step": 357
},
{
"epoch": 27.53846153846154,
"grad_norm": 14.209169387817383,
"learning_rate": 1.428e-05,
"loss": 0.4094,
"step": 358
},
{
"epoch": 27.615384615384617,
"grad_norm": 43.388553619384766,
"learning_rate": 1.432e-05,
"loss": 0.1616,
"step": 359
},
{
"epoch": 27.692307692307693,
"grad_norm": 37.662174224853516,
"learning_rate": 1.4360000000000001e-05,
"loss": 0.5995,
"step": 360
},
{
"epoch": 27.76923076923077,
"grad_norm": 53.796566009521484,
"learning_rate": 1.4400000000000001e-05,
"loss": 0.7795,
"step": 361
},
{
"epoch": 27.846153846153847,
"grad_norm": 25.604795455932617,
"learning_rate": 1.444e-05,
"loss": 0.2185,
"step": 362
},
{
"epoch": 27.923076923076923,
"grad_norm": 27.013303756713867,
"learning_rate": 1.448e-05,
"loss": 0.1652,
"step": 363
},
{
"epoch": 28.0,
"grad_norm": 44.7171516418457,
"learning_rate": 1.4520000000000002e-05,
"loss": 0.987,
"step": 364
},
{
"epoch": 28.076923076923077,
"grad_norm": 41.05904006958008,
"learning_rate": 1.4560000000000001e-05,
"loss": 0.4601,
"step": 365
},
{
"epoch": 28.153846153846153,
"grad_norm": 45.71525955200195,
"learning_rate": 1.46e-05,
"loss": 0.3749,
"step": 366
},
{
"epoch": 28.23076923076923,
"grad_norm": 24.003860473632812,
"learning_rate": 1.464e-05,
"loss": 0.4918,
"step": 367
},
{
"epoch": 28.307692307692307,
"grad_norm": 55.62363815307617,
"learning_rate": 1.4680000000000002e-05,
"loss": 0.7905,
"step": 368
},
{
"epoch": 28.384615384615383,
"grad_norm": 40.681175231933594,
"learning_rate": 1.4720000000000001e-05,
"loss": 0.4372,
"step": 369
},
{
"epoch": 28.46153846153846,
"grad_norm": 23.041379928588867,
"learning_rate": 1.4760000000000001e-05,
"loss": 0.3825,
"step": 370
},
{
"epoch": 28.53846153846154,
"grad_norm": 48.101505279541016,
"learning_rate": 1.48e-05,
"loss": 0.2319,
"step": 371
},
{
"epoch": 28.615384615384617,
"grad_norm": 36.70085525512695,
"learning_rate": 1.4840000000000002e-05,
"loss": 0.5991,
"step": 372
},
{
"epoch": 28.692307692307693,
"grad_norm": 37.7666130065918,
"learning_rate": 1.4880000000000002e-05,
"loss": 0.2135,
"step": 373
},
{
"epoch": 28.76923076923077,
"grad_norm": 25.588153839111328,
"learning_rate": 1.4920000000000001e-05,
"loss": 0.1974,
"step": 374
},
{
"epoch": 28.846153846153847,
"grad_norm": 11.466187477111816,
"learning_rate": 1.496e-05,
"loss": 0.1845,
"step": 375
},
{
"epoch": 28.923076923076923,
"grad_norm": 41.00111770629883,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.3138,
"step": 376
},
{
"epoch": 29.0,
"grad_norm": 50.57583999633789,
"learning_rate": 1.5040000000000002e-05,
"loss": 0.4909,
"step": 377
},
{
"epoch": 29.076923076923077,
"grad_norm": 45.61771774291992,
"learning_rate": 1.5080000000000001e-05,
"loss": 0.8464,
"step": 378
},
{
"epoch": 29.153846153846153,
"grad_norm": 40.37639617919922,
"learning_rate": 1.5120000000000001e-05,
"loss": 0.3515,
"step": 379
},
{
"epoch": 29.23076923076923,
"grad_norm": 67.5796890258789,
"learning_rate": 1.516e-05,
"loss": 0.6139,
"step": 380
},
{
"epoch": 29.307692307692307,
"grad_norm": 51.77175521850586,
"learning_rate": 1.5200000000000002e-05,
"loss": 0.1876,
"step": 381
},
{
"epoch": 29.384615384615383,
"grad_norm": 48.26830291748047,
"learning_rate": 1.5240000000000001e-05,
"loss": 0.3167,
"step": 382
},
{
"epoch": 29.46153846153846,
"grad_norm": 47.83180618286133,
"learning_rate": 1.5280000000000003e-05,
"loss": 0.4492,
"step": 383
},
{
"epoch": 29.53846153846154,
"grad_norm": 73.85650634765625,
"learning_rate": 1.5320000000000002e-05,
"loss": 0.783,
"step": 384
},
{
"epoch": 29.615384615384617,
"grad_norm": 73.28657531738281,
"learning_rate": 1.5360000000000002e-05,
"loss": 0.9757,
"step": 385
},
{
"epoch": 29.692307692307693,
"grad_norm": 32.33823776245117,
"learning_rate": 1.54e-05,
"loss": 0.6493,
"step": 386
},
{
"epoch": 29.76923076923077,
"grad_norm": 40.99183654785156,
"learning_rate": 1.544e-05,
"loss": 0.407,
"step": 387
},
{
"epoch": 29.846153846153847,
"grad_norm": 19.310026168823242,
"learning_rate": 1.548e-05,
"loss": 0.1387,
"step": 388
},
{
"epoch": 29.923076923076923,
"grad_norm": 86.82865142822266,
"learning_rate": 1.552e-05,
"loss": 0.9859,
"step": 389
},
{
"epoch": 30.0,
"grad_norm": 66.46072387695312,
"learning_rate": 1.556e-05,
"loss": 0.5921,
"step": 390
},
{
"epoch": 30.076923076923077,
"grad_norm": 47.81773376464844,
"learning_rate": 1.5600000000000003e-05,
"loss": 0.3752,
"step": 391
},
{
"epoch": 30.153846153846153,
"grad_norm": 76.41130828857422,
"learning_rate": 1.5640000000000003e-05,
"loss": 0.4966,
"step": 392
},
{
"epoch": 30.23076923076923,
"grad_norm": 67.10013580322266,
"learning_rate": 1.5680000000000002e-05,
"loss": 0.3609,
"step": 393
},
{
"epoch": 30.307692307692307,
"grad_norm": 38.465450286865234,
"learning_rate": 1.5720000000000002e-05,
"loss": 0.6302,
"step": 394
},
{
"epoch": 30.384615384615383,
"grad_norm": 37.98847579956055,
"learning_rate": 1.576e-05,
"loss": 0.3228,
"step": 395
},
{
"epoch": 30.46153846153846,
"grad_norm": 83.65345001220703,
"learning_rate": 1.58e-05,
"loss": 0.6936,
"step": 396
},
{
"epoch": 30.53846153846154,
"grad_norm": 81.71379089355469,
"learning_rate": 1.584e-05,
"loss": 0.5248,
"step": 397
},
{
"epoch": 30.615384615384617,
"grad_norm": 46.50620651245117,
"learning_rate": 1.588e-05,
"loss": 0.6357,
"step": 398
},
{
"epoch": 30.692307692307693,
"grad_norm": 68.04540252685547,
"learning_rate": 1.5920000000000003e-05,
"loss": 0.429,
"step": 399
},
{
"epoch": 30.76923076923077,
"grad_norm": 55.85519790649414,
"learning_rate": 1.5960000000000003e-05,
"loss": 0.4192,
"step": 400
},
{
"epoch": 30.846153846153847,
"grad_norm": 71.962890625,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.4128,
"step": 401
},
{
"epoch": 30.923076923076923,
"grad_norm": 77.69264221191406,
"learning_rate": 1.6040000000000002e-05,
"loss": 1.0509,
"step": 402
},
{
"epoch": 31.0,
"grad_norm": 36.69390106201172,
"learning_rate": 1.6080000000000002e-05,
"loss": 0.6102,
"step": 403
},
{
"epoch": 31.076923076923077,
"grad_norm": 59.204612731933594,
"learning_rate": 1.612e-05,
"loss": 0.1778,
"step": 404
},
{
"epoch": 31.153846153846153,
"grad_norm": 65.2522964477539,
"learning_rate": 1.616e-05,
"loss": 0.6148,
"step": 405
},
{
"epoch": 31.23076923076923,
"grad_norm": 14.981352806091309,
"learning_rate": 1.62e-05,
"loss": 0.1487,
"step": 406
},
{
"epoch": 31.307692307692307,
"grad_norm": 47.70904541015625,
"learning_rate": 1.6240000000000004e-05,
"loss": 0.5439,
"step": 407
},
{
"epoch": 31.384615384615383,
"grad_norm": 77.24309539794922,
"learning_rate": 1.628e-05,
"loss": 0.8684,
"step": 408
},
{
"epoch": 31.46153846153846,
"grad_norm": 73.8941879272461,
"learning_rate": 1.632e-05,
"loss": 0.4744,
"step": 409
},
{
"epoch": 31.53846153846154,
"grad_norm": 59.932586669921875,
"learning_rate": 1.636e-05,
"loss": 0.8794,
"step": 410
},
{
"epoch": 31.615384615384617,
"grad_norm": 21.144742965698242,
"learning_rate": 1.64e-05,
"loss": 0.2288,
"step": 411
},
{
"epoch": 31.692307692307693,
"grad_norm": 45.16164016723633,
"learning_rate": 1.6440000000000002e-05,
"loss": 0.3846,
"step": 412
},
{
"epoch": 31.76923076923077,
"grad_norm": 63.43020248413086,
"learning_rate": 1.648e-05,
"loss": 0.607,
"step": 413
},
{
"epoch": 31.846153846153847,
"grad_norm": 74.06291961669922,
"learning_rate": 1.652e-05,
"loss": 0.6584,
"step": 414
},
{
"epoch": 31.923076923076923,
"grad_norm": 76.64544677734375,
"learning_rate": 1.656e-05,
"loss": 0.9753,
"step": 415
},
{
"epoch": 32.0,
"grad_norm": 66.81476593017578,
"learning_rate": 1.66e-05,
"loss": 0.3477,
"step": 416
},
{
"epoch": 32.07692307692308,
"grad_norm": 58.571075439453125,
"learning_rate": 1.664e-05,
"loss": 0.3969,
"step": 417
},
{
"epoch": 32.15384615384615,
"grad_norm": 50.05775451660156,
"learning_rate": 1.668e-05,
"loss": 0.5592,
"step": 418
},
{
"epoch": 32.23076923076923,
"grad_norm": 69.51533508300781,
"learning_rate": 1.672e-05,
"loss": 0.3837,
"step": 419
},
{
"epoch": 32.30769230769231,
"grad_norm": 64.64315032958984,
"learning_rate": 1.6760000000000002e-05,
"loss": 0.7959,
"step": 420
},
{
"epoch": 32.38461538461539,
"grad_norm": 51.027652740478516,
"learning_rate": 1.6800000000000002e-05,
"loss": 0.8768,
"step": 421
},
{
"epoch": 32.46153846153846,
"grad_norm": 35.286190032958984,
"learning_rate": 1.684e-05,
"loss": 0.6673,
"step": 422
},
{
"epoch": 32.53846153846154,
"grad_norm": 38.671775817871094,
"learning_rate": 1.688e-05,
"loss": 0.6072,
"step": 423
},
{
"epoch": 32.61538461538461,
"grad_norm": 63.433650970458984,
"learning_rate": 1.692e-05,
"loss": 1.3278,
"step": 424
},
{
"epoch": 32.69230769230769,
"grad_norm": 68.43083190917969,
"learning_rate": 1.696e-05,
"loss": 0.3996,
"step": 425
},
{
"epoch": 32.76923076923077,
"grad_norm": 34.67466354370117,
"learning_rate": 1.7e-05,
"loss": 0.2814,
"step": 426
},
{
"epoch": 32.84615384615385,
"grad_norm": 16.92761993408203,
"learning_rate": 1.704e-05,
"loss": 0.1775,
"step": 427
},
{
"epoch": 32.92307692307692,
"grad_norm": 44.911170959472656,
"learning_rate": 1.7080000000000002e-05,
"loss": 0.7652,
"step": 428
},
{
"epoch": 33.0,
"grad_norm": 33.2980842590332,
"learning_rate": 1.7120000000000002e-05,
"loss": 0.3709,
"step": 429
},
{
"epoch": 33.07692307692308,
"grad_norm": 43.332359313964844,
"learning_rate": 1.7160000000000002e-05,
"loss": 0.8261,
"step": 430
},
{
"epoch": 33.15384615384615,
"grad_norm": 47.77151107788086,
"learning_rate": 1.72e-05,
"loss": 0.4492,
"step": 431
},
{
"epoch": 33.23076923076923,
"grad_norm": 77.03539276123047,
"learning_rate": 1.724e-05,
"loss": 0.9605,
"step": 432
},
{
"epoch": 33.30769230769231,
"grad_norm": 36.878074645996094,
"learning_rate": 1.728e-05,
"loss": 0.5138,
"step": 433
},
{
"epoch": 33.38461538461539,
"grad_norm": 78.08937072753906,
"learning_rate": 1.732e-05,
"loss": 0.7583,
"step": 434
},
{
"epoch": 33.46153846153846,
"grad_norm": 18.130016326904297,
"learning_rate": 1.736e-05,
"loss": 0.5514,
"step": 435
},
{
"epoch": 33.53846153846154,
"grad_norm": 53.54596710205078,
"learning_rate": 1.7400000000000003e-05,
"loss": 0.588,
"step": 436
},
{
"epoch": 33.61538461538461,
"grad_norm": 46.70728302001953,
"learning_rate": 1.7440000000000002e-05,
"loss": 0.3447,
"step": 437
},
{
"epoch": 33.69230769230769,
"grad_norm": 20.540084838867188,
"learning_rate": 1.7480000000000002e-05,
"loss": 0.1428,
"step": 438
},
{
"epoch": 33.76923076923077,
"grad_norm": 48.03786849975586,
"learning_rate": 1.752e-05,
"loss": 0.413,
"step": 439
},
{
"epoch": 33.84615384615385,
"grad_norm": 75.21665954589844,
"learning_rate": 1.756e-05,
"loss": 0.2307,
"step": 440
},
{
"epoch": 33.92307692307692,
"grad_norm": 38.96320343017578,
"learning_rate": 1.76e-05,
"loss": 0.5157,
"step": 441
},
{
"epoch": 34.0,
"grad_norm": 62.66585922241211,
"learning_rate": 1.764e-05,
"loss": 0.7783,
"step": 442
},
{
"epoch": 34.07692307692308,
"grad_norm": 68.1346206665039,
"learning_rate": 1.768e-05,
"loss": 0.7453,
"step": 443
},
{
"epoch": 34.15384615384615,
"grad_norm": 64.73094177246094,
"learning_rate": 1.7720000000000003e-05,
"loss": 0.4063,
"step": 444
},
{
"epoch": 34.23076923076923,
"grad_norm": 45.85354995727539,
"learning_rate": 1.7760000000000003e-05,
"loss": 0.6128,
"step": 445
},
{
"epoch": 34.30769230769231,
"grad_norm": 60.594276428222656,
"learning_rate": 1.7800000000000002e-05,
"loss": 0.4022,
"step": 446
},
{
"epoch": 34.38461538461539,
"grad_norm": 65.88479614257812,
"learning_rate": 1.7840000000000002e-05,
"loss": 0.5737,
"step": 447
},
{
"epoch": 34.46153846153846,
"grad_norm": 47.70244216918945,
"learning_rate": 1.788e-05,
"loss": 0.3676,
"step": 448
},
{
"epoch": 34.53846153846154,
"grad_norm": 34.25014114379883,
"learning_rate": 1.792e-05,
"loss": 0.5729,
"step": 449
},
{
"epoch": 34.61538461538461,
"grad_norm": 59.29137420654297,
"learning_rate": 1.796e-05,
"loss": 0.3314,
"step": 450
},
{
"epoch": 34.69230769230769,
"grad_norm": 59.163780212402344,
"learning_rate": 1.8e-05,
"loss": 0.3097,
"step": 451
},
{
"epoch": 34.76923076923077,
"grad_norm": 46.27375793457031,
"learning_rate": 1.8040000000000003e-05,
"loss": 0.5513,
"step": 452
},
{
"epoch": 34.84615384615385,
"grad_norm": 39.06897735595703,
"learning_rate": 1.8080000000000003e-05,
"loss": 0.4424,
"step": 453
},
{
"epoch": 34.92307692307692,
"grad_norm": 48.06359100341797,
"learning_rate": 1.8120000000000003e-05,
"loss": 0.4674,
"step": 454
},
{
"epoch": 35.0,
"grad_norm": 72.48213195800781,
"learning_rate": 1.8160000000000002e-05,
"loss": 0.6007,
"step": 455
},
{
"epoch": 35.07692307692308,
"grad_norm": 44.720558166503906,
"learning_rate": 1.8200000000000002e-05,
"loss": 0.1797,
"step": 456
},
{
"epoch": 35.15384615384615,
"grad_norm": 14.797567367553711,
"learning_rate": 1.824e-05,
"loss": 0.1474,
"step": 457
},
{
"epoch": 35.23076923076923,
"grad_norm": 30.398681640625,
"learning_rate": 1.828e-05,
"loss": 0.4062,
"step": 458
},
{
"epoch": 35.30769230769231,
"grad_norm": 44.051231384277344,
"learning_rate": 1.832e-05,
"loss": 0.6224,
"step": 459
},
{
"epoch": 35.38461538461539,
"grad_norm": 24.96405601501465,
"learning_rate": 1.8360000000000004e-05,
"loss": 0.6572,
"step": 460
},
{
"epoch": 35.46153846153846,
"grad_norm": 51.63269805908203,
"learning_rate": 1.8400000000000003e-05,
"loss": 0.8461,
"step": 461
},
{
"epoch": 35.53846153846154,
"grad_norm": 49.89200973510742,
"learning_rate": 1.8440000000000003e-05,
"loss": 0.7514,
"step": 462
},
{
"epoch": 35.61538461538461,
"grad_norm": 41.851051330566406,
"learning_rate": 1.8480000000000003e-05,
"loss": 0.544,
"step": 463
},
{
"epoch": 35.69230769230769,
"grad_norm": 22.07673454284668,
"learning_rate": 1.8520000000000002e-05,
"loss": 0.6125,
"step": 464
},
{
"epoch": 35.76923076923077,
"grad_norm": 93.32474517822266,
"learning_rate": 1.8560000000000002e-05,
"loss": 0.5891,
"step": 465
},
{
"epoch": 35.84615384615385,
"grad_norm": 95.31482696533203,
"learning_rate": 1.86e-05,
"loss": 0.8448,
"step": 466
},
{
"epoch": 35.92307692307692,
"grad_norm": 121.12299346923828,
"learning_rate": 1.864e-05,
"loss": 1.0558,
"step": 467
},
{
"epoch": 36.0,
"grad_norm": 64.19267272949219,
"learning_rate": 1.8680000000000004e-05,
"loss": 0.4891,
"step": 468
},
{
"epoch": 36.07692307692308,
"grad_norm": 50.5765495300293,
"learning_rate": 1.8720000000000004e-05,
"loss": 0.5591,
"step": 469
},
{
"epoch": 36.15384615384615,
"grad_norm": 37.23386764526367,
"learning_rate": 1.876e-05,
"loss": 0.4727,
"step": 470
},
{
"epoch": 36.23076923076923,
"grad_norm": 44.87166213989258,
"learning_rate": 1.88e-05,
"loss": 0.4375,
"step": 471
},
{
"epoch": 36.30769230769231,
"grad_norm": 31.29073715209961,
"learning_rate": 1.884e-05,
"loss": 0.5385,
"step": 472
},
{
"epoch": 36.38461538461539,
"grad_norm": 32.528541564941406,
"learning_rate": 1.8880000000000002e-05,
"loss": 0.6831,
"step": 473
},
{
"epoch": 36.46153846153846,
"grad_norm": 43.69649887084961,
"learning_rate": 1.8920000000000002e-05,
"loss": 0.4986,
"step": 474
},
{
"epoch": 36.53846153846154,
"grad_norm": 22.515159606933594,
"learning_rate": 1.896e-05,
"loss": 0.298,
"step": 475
},
{
"epoch": 36.61538461538461,
"grad_norm": 59.822330474853516,
"learning_rate": 1.9e-05,
"loss": 0.2583,
"step": 476
},
{
"epoch": 36.69230769230769,
"grad_norm": 65.19972229003906,
"learning_rate": 1.904e-05,
"loss": 0.5001,
"step": 477
},
{
"epoch": 36.76923076923077,
"grad_norm": 50.5339469909668,
"learning_rate": 1.908e-05,
"loss": 0.5146,
"step": 478
},
{
"epoch": 36.84615384615385,
"grad_norm": 46.214683532714844,
"learning_rate": 1.912e-05,
"loss": 0.4223,
"step": 479
},
{
"epoch": 36.92307692307692,
"grad_norm": 38.803321838378906,
"learning_rate": 1.916e-05,
"loss": 0.7122,
"step": 480
},
{
"epoch": 37.0,
"grad_norm": 27.04830551147461,
"learning_rate": 1.9200000000000003e-05,
"loss": 0.1822,
"step": 481
},
{
"epoch": 37.07692307692308,
"grad_norm": 21.252769470214844,
"learning_rate": 1.9240000000000002e-05,
"loss": 0.2326,
"step": 482
},
{
"epoch": 37.15384615384615,
"grad_norm": 34.464412689208984,
"learning_rate": 1.9280000000000002e-05,
"loss": 0.2239,
"step": 483
},
{
"epoch": 37.23076923076923,
"grad_norm": 41.89882278442383,
"learning_rate": 1.932e-05,
"loss": 0.3468,
"step": 484
},
{
"epoch": 37.30769230769231,
"grad_norm": 25.541357040405273,
"learning_rate": 1.936e-05,
"loss": 0.405,
"step": 485
},
{
"epoch": 37.38461538461539,
"grad_norm": 48.3160400390625,
"learning_rate": 1.94e-05,
"loss": 0.2665,
"step": 486
},
{
"epoch": 37.46153846153846,
"grad_norm": 26.55426597595215,
"learning_rate": 1.944e-05,
"loss": 0.1677,
"step": 487
},
{
"epoch": 37.53846153846154,
"grad_norm": 71.87734985351562,
"learning_rate": 1.948e-05,
"loss": 1.3239,
"step": 488
},
{
"epoch": 37.61538461538461,
"grad_norm": 53.14076232910156,
"learning_rate": 1.9520000000000003e-05,
"loss": 0.5742,
"step": 489
},
{
"epoch": 37.69230769230769,
"grad_norm": 55.616424560546875,
"learning_rate": 1.9560000000000002e-05,
"loss": 0.5266,
"step": 490
},
{
"epoch": 37.76923076923077,
"grad_norm": 32.749019622802734,
"learning_rate": 1.9600000000000002e-05,
"loss": 0.2234,
"step": 491
},
{
"epoch": 37.84615384615385,
"grad_norm": 30.874309539794922,
"learning_rate": 1.9640000000000002e-05,
"loss": 0.2585,
"step": 492
},
{
"epoch": 37.92307692307692,
"grad_norm": 17.95946502685547,
"learning_rate": 1.968e-05,
"loss": 0.1004,
"step": 493
},
{
"epoch": 38.0,
"grad_norm": 76.27622985839844,
"learning_rate": 1.972e-05,
"loss": 0.4079,
"step": 494
},
{
"epoch": 38.07692307692308,
"grad_norm": 79.43333435058594,
"learning_rate": 1.976e-05,
"loss": 0.8453,
"step": 495
},
{
"epoch": 38.15384615384615,
"grad_norm": 60.751834869384766,
"learning_rate": 1.98e-05,
"loss": 0.9003,
"step": 496
},
{
"epoch": 38.23076923076923,
"grad_norm": 49.671142578125,
"learning_rate": 1.9840000000000003e-05,
"loss": 0.172,
"step": 497
},
{
"epoch": 38.30769230769231,
"grad_norm": 32.0305061340332,
"learning_rate": 1.9880000000000003e-05,
"loss": 0.3262,
"step": 498
},
{
"epoch": 38.38461538461539,
"grad_norm": 46.92594909667969,
"learning_rate": 1.9920000000000002e-05,
"loss": 0.6245,
"step": 499
},
{
"epoch": 38.46153846153846,
"grad_norm": 43.71823501586914,
"learning_rate": 1.9960000000000002e-05,
"loss": 0.4124,
"step": 500
},
{
"epoch": 38.53846153846154,
"grad_norm": 19.13474464416504,
"learning_rate": 2e-05,
"loss": 0.2149,
"step": 501
},
{
"epoch": 38.61538461538461,
"grad_norm": 38.319427490234375,
"learning_rate": 1.9999997563060744e-05,
"loss": 0.235,
"step": 502
},
{
"epoch": 38.69230769230769,
"grad_norm": 24.14805793762207,
"learning_rate": 1.9999990252244153e-05,
"loss": 0.2052,
"step": 503
},
{
"epoch": 38.76923076923077,
"grad_norm": 22.26850128173828,
"learning_rate": 1.9999978067553796e-05,
"loss": 0.1105,
"step": 504
},
{
"epoch": 38.84615384615385,
"grad_norm": 43.19673156738281,
"learning_rate": 1.9999961008995607e-05,
"loss": 0.7993,
"step": 505
},
{
"epoch": 38.92307692307692,
"grad_norm": 36.335838317871094,
"learning_rate": 1.9999939076577906e-05,
"loss": 0.1197,
"step": 506
},
{
"epoch": 39.0,
"grad_norm": 63.675071716308594,
"learning_rate": 1.9999912270311376e-05,
"loss": 0.767,
"step": 507
},
{
"epoch": 39.07692307692308,
"grad_norm": 24.863452911376953,
"learning_rate": 1.999988059020909e-05,
"loss": 0.3484,
"step": 508
},
{
"epoch": 39.15384615384615,
"grad_norm": 23.017812728881836,
"learning_rate": 1.9999844036286483e-05,
"loss": 0.4688,
"step": 509
},
{
"epoch": 39.23076923076923,
"grad_norm": 60.56824493408203,
"learning_rate": 1.999980260856137e-05,
"loss": 0.471,
"step": 510
},
{
"epoch": 39.30769230769231,
"grad_norm": 45.144256591796875,
"learning_rate": 1.9999756307053947e-05,
"loss": 0.5551,
"step": 511
},
{
"epoch": 39.38461538461539,
"grad_norm": 18.20176887512207,
"learning_rate": 1.999970513178678e-05,
"loss": 0.6448,
"step": 512
},
{
"epoch": 39.46153846153846,
"grad_norm": 25.116472244262695,
"learning_rate": 1.9999649082784807e-05,
"loss": 0.3543,
"step": 513
},
{
"epoch": 39.53846153846154,
"grad_norm": 56.92240524291992,
"learning_rate": 1.999958816007535e-05,
"loss": 0.5884,
"step": 514
},
{
"epoch": 39.61538461538461,
"grad_norm": 36.347930908203125,
"learning_rate": 1.99995223636881e-05,
"loss": 0.8939,
"step": 515
},
{
"epoch": 39.69230769230769,
"grad_norm": 38.58857345581055,
"learning_rate": 1.9999451693655125e-05,
"loss": 0.513,
"step": 516
},
{
"epoch": 39.76923076923077,
"grad_norm": 26.801912307739258,
"learning_rate": 1.9999376150010868e-05,
"loss": 0.1728,
"step": 517
},
{
"epoch": 39.84615384615385,
"grad_norm": 48.60188674926758,
"learning_rate": 1.9999295732792146e-05,
"loss": 0.524,
"step": 518
},
{
"epoch": 39.92307692307692,
"grad_norm": 27.04090690612793,
"learning_rate": 1.9999210442038164e-05,
"loss": 0.2746,
"step": 519
},
{
"epoch": 40.0,
"grad_norm": 67.3294906616211,
"learning_rate": 1.9999120277790477e-05,
"loss": 0.767,
"step": 520
},
{
"epoch": 40.07692307692308,
"grad_norm": 69.61468505859375,
"learning_rate": 1.9999025240093045e-05,
"loss": 0.5924,
"step": 521
},
{
"epoch": 40.15384615384615,
"grad_norm": 28.077123641967773,
"learning_rate": 1.9998925328992175e-05,
"loss": 0.254,
"step": 522
},
{
"epoch": 40.23076923076923,
"grad_norm": 108.91661071777344,
"learning_rate": 1.999882054453657e-05,
"loss": 0.6933,
"step": 523
},
{
"epoch": 40.30769230769231,
"grad_norm": 42.598304748535156,
"learning_rate": 1.9998710886777298e-05,
"loss": 0.4899,
"step": 524
},
{
"epoch": 40.38461538461539,
"grad_norm": 34.36406326293945,
"learning_rate": 1.9998596355767805e-05,
"loss": 0.5635,
"step": 525
},
{
"epoch": 40.46153846153846,
"grad_norm": 48.5196647644043,
"learning_rate": 1.9998476951563914e-05,
"loss": 0.3365,
"step": 526
},
{
"epoch": 40.53846153846154,
"grad_norm": 56.3973274230957,
"learning_rate": 1.9998352674223816e-05,
"loss": 0.8872,
"step": 527
},
{
"epoch": 40.61538461538461,
"grad_norm": 56.69743347167969,
"learning_rate": 1.9998223523808092e-05,
"loss": 0.5956,
"step": 528
},
{
"epoch": 40.69230769230769,
"grad_norm": 80.67139434814453,
"learning_rate": 1.999808950037968e-05,
"loss": 0.2614,
"step": 529
},
{
"epoch": 40.76923076923077,
"grad_norm": 52.51334762573242,
"learning_rate": 1.99979506040039e-05,
"loss": 0.5652,
"step": 530
},
{
"epoch": 40.84615384615385,
"grad_norm": 77.20426940917969,
"learning_rate": 1.9997806834748455e-05,
"loss": 0.3495,
"step": 531
},
{
"epoch": 40.92307692307692,
"grad_norm": 46.386573791503906,
"learning_rate": 1.9997658192683412e-05,
"loss": 0.4954,
"step": 532
},
{
"epoch": 41.0,
"grad_norm": 93.00538635253906,
"learning_rate": 1.9997504677881224e-05,
"loss": 0.3318,
"step": 533
},
{
"epoch": 41.07692307692308,
"grad_norm": 56.72392654418945,
"learning_rate": 1.9997346290416703e-05,
"loss": 0.2394,
"step": 534
},
{
"epoch": 41.15384615384615,
"grad_norm": 62.16911697387695,
"learning_rate": 1.999718303036705e-05,
"loss": 0.3713,
"step": 535
},
{
"epoch": 41.23076923076923,
"grad_norm": 65.15827178955078,
"learning_rate": 1.9997014897811834e-05,
"loss": 0.3201,
"step": 536
},
{
"epoch": 41.30769230769231,
"grad_norm": 100.19380187988281,
"learning_rate": 1.9996841892833e-05,
"loss": 0.8308,
"step": 537
},
{
"epoch": 41.38461538461539,
"grad_norm": 99.45419311523438,
"learning_rate": 1.999666401551487e-05,
"loss": 1.2223,
"step": 538
},
{
"epoch": 41.46153846153846,
"grad_norm": 54.75605392456055,
"learning_rate": 1.9996481265944146e-05,
"loss": 0.3295,
"step": 539
},
{
"epoch": 41.53846153846154,
"grad_norm": 72.30227661132812,
"learning_rate": 1.9996293644209886e-05,
"loss": 0.8631,
"step": 540
},
{
"epoch": 41.61538461538461,
"grad_norm": 90.02909851074219,
"learning_rate": 1.9996101150403543e-05,
"loss": 0.6333,
"step": 541
},
{
"epoch": 41.69230769230769,
"grad_norm": 91.50467681884766,
"learning_rate": 1.9995903784618936e-05,
"loss": 0.6422,
"step": 542
},
{
"epoch": 41.76923076923077,
"grad_norm": 53.51863479614258,
"learning_rate": 1.9995701546952252e-05,
"loss": 0.1777,
"step": 543
},
{
"epoch": 41.84615384615385,
"grad_norm": 46.842586517333984,
"learning_rate": 1.9995494437502064e-05,
"loss": 0.4172,
"step": 544
},
{
"epoch": 41.92307692307692,
"grad_norm": 76.82200622558594,
"learning_rate": 1.9995282456369313e-05,
"loss": 0.2708,
"step": 545
},
{
"epoch": 42.0,
"grad_norm": 50.79060363769531,
"learning_rate": 1.9995065603657317e-05,
"loss": 0.4857,
"step": 546
},
{
"epoch": 42.07692307692308,
"grad_norm": 64.46525573730469,
"learning_rate": 1.999484387947177e-05,
"loss": 0.5747,
"step": 547
},
{
"epoch": 42.15384615384615,
"grad_norm": 15.964370727539062,
"learning_rate": 1.999461728392073e-05,
"loss": 0.2815,
"step": 548
},
{
"epoch": 42.23076923076923,
"grad_norm": 28.7789306640625,
"learning_rate": 1.9994385817114644e-05,
"loss": 0.1986,
"step": 549
},
{
"epoch": 42.30769230769231,
"grad_norm": 43.08668518066406,
"learning_rate": 1.9994149479166324e-05,
"loss": 0.5432,
"step": 550
},
{
"epoch": 42.38461538461539,
"grad_norm": 14.126999855041504,
"learning_rate": 1.999390827019096e-05,
"loss": 0.1372,
"step": 551
},
{
"epoch": 42.46153846153846,
"grad_norm": 21.118301391601562,
"learning_rate": 1.999366219030611e-05,
"loss": 0.1582,
"step": 552
},
{
"epoch": 42.53846153846154,
"grad_norm": 39.95669174194336,
"learning_rate": 1.9993411239631713e-05,
"loss": 0.344,
"step": 553
},
{
"epoch": 42.61538461538461,
"grad_norm": 28.600141525268555,
"learning_rate": 1.999315541829008e-05,
"loss": 0.4129,
"step": 554
},
{
"epoch": 42.69230769230769,
"grad_norm": 47.09449005126953,
"learning_rate": 1.9992894726405894e-05,
"loss": 0.092,
"step": 555
},
{
"epoch": 42.76923076923077,
"grad_norm": 16.952911376953125,
"learning_rate": 1.999262916410621e-05,
"loss": 0.1394,
"step": 556
},
{
"epoch": 42.84615384615385,
"grad_norm": 32.12388610839844,
"learning_rate": 1.999235873152047e-05,
"loss": 0.219,
"step": 557
},
{
"epoch": 42.92307692307692,
"grad_norm": 44.476688385009766,
"learning_rate": 1.999208342878047e-05,
"loss": 0.2061,
"step": 558
},
{
"epoch": 43.0,
"grad_norm": 33.65949630737305,
"learning_rate": 1.9991803256020393e-05,
"loss": 0.0972,
"step": 559
},
{
"epoch": 43.07692307692308,
"grad_norm": 17.978668212890625,
"learning_rate": 1.9991518213376787e-05,
"loss": 0.1124,
"step": 560
},
{
"epoch": 43.15384615384615,
"grad_norm": 56.47621154785156,
"learning_rate": 1.9991228300988586e-05,
"loss": 0.604,
"step": 561
},
{
"epoch": 43.23076923076923,
"grad_norm": 45.38515853881836,
"learning_rate": 1.9990933518997086e-05,
"loss": 0.6914,
"step": 562
},
{
"epoch": 43.30769230769231,
"grad_norm": 40.5052604675293,
"learning_rate": 1.9990633867545956e-05,
"loss": 0.5163,
"step": 563
},
{
"epoch": 43.38461538461539,
"grad_norm": 30.98360252380371,
"learning_rate": 1.999032934678125e-05,
"loss": 0.4277,
"step": 564
},
{
"epoch": 43.46153846153846,
"grad_norm": 26.534019470214844,
"learning_rate": 1.9990019956851384e-05,
"loss": 0.218,
"step": 565
},
{
"epoch": 43.53846153846154,
"grad_norm": 27.41169548034668,
"learning_rate": 1.998970569790715e-05,
"loss": 0.1273,
"step": 566
},
{
"epoch": 43.61538461538461,
"grad_norm": 22.965587615966797,
"learning_rate": 1.9989386570101716e-05,
"loss": 0.1367,
"step": 567
},
{
"epoch": 43.69230769230769,
"grad_norm": 74.6824951171875,
"learning_rate": 1.9989062573590618e-05,
"loss": 1.1331,
"step": 568
},
{
"epoch": 43.76923076923077,
"grad_norm": 65.099609375,
"learning_rate": 1.9988733708531772e-05,
"loss": 1.4358,
"step": 569
},
{
"epoch": 43.84615384615385,
"grad_norm": 39.96792984008789,
"learning_rate": 1.998839997508546e-05,
"loss": 0.2261,
"step": 570
},
{
"epoch": 43.92307692307692,
"grad_norm": 23.558868408203125,
"learning_rate": 1.9988061373414342e-05,
"loss": 0.2481,
"step": 571
},
{
"epoch": 44.0,
"grad_norm": 95.01315307617188,
"learning_rate": 1.9987717903683447e-05,
"loss": 1.1462,
"step": 572
},
{
"epoch": 44.07692307692308,
"grad_norm": 27.77621841430664,
"learning_rate": 1.998736956606018e-05,
"loss": 0.5905,
"step": 573
},
{
"epoch": 44.15384615384615,
"grad_norm": 40.2691535949707,
"learning_rate": 1.9987016360714307e-05,
"loss": 0.4781,
"step": 574
},
{
"epoch": 44.23076923076923,
"grad_norm": 36.15031433105469,
"learning_rate": 1.998665828781799e-05,
"loss": 0.4033,
"step": 575
},
{
"epoch": 44.30769230769231,
"grad_norm": 46.30068588256836,
"learning_rate": 1.9986295347545738e-05,
"loss": 0.5782,
"step": 576
},
{
"epoch": 44.38461538461539,
"grad_norm": 22.156299591064453,
"learning_rate": 1.9985927540074453e-05,
"loss": 0.2182,
"step": 577
},
{
"epoch": 44.46153846153846,
"grad_norm": 51.11417007446289,
"learning_rate": 1.9985554865583394e-05,
"loss": 0.5061,
"step": 578
},
{
"epoch": 44.53846153846154,
"grad_norm": 32.809696197509766,
"learning_rate": 1.99851773242542e-05,
"loss": 0.265,
"step": 579
},
{
"epoch": 44.61538461538461,
"grad_norm": 46.30125045776367,
"learning_rate": 1.9984794916270876e-05,
"loss": 0.6807,
"step": 580
},
{
"epoch": 44.69230769230769,
"grad_norm": 12.768202781677246,
"learning_rate": 1.9984407641819812e-05,
"loss": 0.1261,
"step": 581
},
{
"epoch": 44.76923076923077,
"grad_norm": 48.871124267578125,
"learning_rate": 1.998401550108975e-05,
"loss": 0.7503,
"step": 582
},
{
"epoch": 44.84615384615385,
"grad_norm": 22.152219772338867,
"learning_rate": 1.9983618494271825e-05,
"loss": 0.2886,
"step": 583
},
{
"epoch": 44.92307692307692,
"grad_norm": 24.310588836669922,
"learning_rate": 1.9983216621559525e-05,
"loss": 0.0716,
"step": 584
},
{
"epoch": 45.0,
"grad_norm": 11.027695655822754,
"learning_rate": 1.998280988314872e-05,
"loss": 0.1276,
"step": 585
},
{
"epoch": 45.07692307692308,
"grad_norm": 31.255794525146484,
"learning_rate": 1.9982398279237657e-05,
"loss": 0.1271,
"step": 586
},
{
"epoch": 45.15384615384615,
"grad_norm": 31.5863037109375,
"learning_rate": 1.9981981810026932e-05,
"loss": 0.2312,
"step": 587
},
{
"epoch": 45.23076923076923,
"grad_norm": 47.123294830322266,
"learning_rate": 1.998156047571954e-05,
"loss": 0.5386,
"step": 588
},
{
"epoch": 45.30769230769231,
"grad_norm": 19.838153839111328,
"learning_rate": 1.9981134276520828e-05,
"loss": 0.1585,
"step": 589
},
{
"epoch": 45.38461538461539,
"grad_norm": 52.45973205566406,
"learning_rate": 1.9980703212638522e-05,
"loss": 0.963,
"step": 590
},
{
"epoch": 45.46153846153846,
"grad_norm": 93.08457946777344,
"learning_rate": 1.9980267284282718e-05,
"loss": 1.381,
"step": 591
},
{
"epoch": 45.53846153846154,
"grad_norm": 45.8686637878418,
"learning_rate": 1.997982649166588e-05,
"loss": 0.1468,
"step": 592
},
{
"epoch": 45.61538461538461,
"grad_norm": 45.240047454833984,
"learning_rate": 1.9979380835002846e-05,
"loss": 0.3763,
"step": 593
},
{
"epoch": 45.69230769230769,
"grad_norm": 36.02961730957031,
"learning_rate": 1.9978930314510826e-05,
"loss": 0.1194,
"step": 594
},
{
"epoch": 45.76923076923077,
"grad_norm": 37.365089416503906,
"learning_rate": 1.9978474930409396e-05,
"loss": 0.2036,
"step": 595
},
{
"epoch": 45.84615384615385,
"grad_norm": 38.51081085205078,
"learning_rate": 1.9978014682920503e-05,
"loss": 0.3827,
"step": 596
},
{
"epoch": 45.92307692307692,
"grad_norm": 24.498191833496094,
"learning_rate": 1.997754957226847e-05,
"loss": 0.118,
"step": 597
},
{
"epoch": 46.0,
"grad_norm": 55.18220901489258,
"learning_rate": 1.9977079598679978e-05,
"loss": 0.23,
"step": 598
},
{
"epoch": 46.07692307692308,
"grad_norm": 31.934246063232422,
"learning_rate": 1.99766047623841e-05,
"loss": 0.2494,
"step": 599
},
{
"epoch": 46.15384615384615,
"grad_norm": 38.73695373535156,
"learning_rate": 1.9976125063612254e-05,
"loss": 0.4175,
"step": 600
},
{
"epoch": 46.23076923076923,
"grad_norm": 40.32987976074219,
"learning_rate": 1.9975640502598243e-05,
"loss": 0.3902,
"step": 601
},
{
"epoch": 46.30769230769231,
"grad_norm": 50.24580001831055,
"learning_rate": 1.9975151079578238e-05,
"loss": 0.4656,
"step": 602
},
{
"epoch": 46.38461538461539,
"grad_norm": 66.17344665527344,
"learning_rate": 1.9974656794790777e-05,
"loss": 1.1274,
"step": 603
},
{
"epoch": 46.46153846153846,
"grad_norm": 60.75761032104492,
"learning_rate": 1.9974157648476768e-05,
"loss": 0.4145,
"step": 604
},
{
"epoch": 46.53846153846154,
"grad_norm": 44.54948043823242,
"learning_rate": 1.9973653640879486e-05,
"loss": 0.5102,
"step": 605
},
{
"epoch": 46.61538461538461,
"grad_norm": 58.077396392822266,
"learning_rate": 1.997314477224458e-05,
"loss": 0.8864,
"step": 606
},
{
"epoch": 46.69230769230769,
"grad_norm": 69.51765441894531,
"learning_rate": 1.997263104282007e-05,
"loss": 0.7171,
"step": 607
},
{
"epoch": 46.76923076923077,
"grad_norm": 38.186458587646484,
"learning_rate": 1.997211245285634e-05,
"loss": 0.4604,
"step": 608
},
{
"epoch": 46.84615384615385,
"grad_norm": 39.436607360839844,
"learning_rate": 1.997158900260614e-05,
"loss": 0.3123,
"step": 609
},
{
"epoch": 46.92307692307692,
"grad_norm": 15.021434783935547,
"learning_rate": 1.99710606923246e-05,
"loss": 0.1642,
"step": 610
},
{
"epoch": 47.0,
"grad_norm": 14.399913787841797,
"learning_rate": 1.9970527522269204e-05,
"loss": 0.105,
"step": 611
},
{
"epoch": 47.07692307692308,
"grad_norm": 44.68442153930664,
"learning_rate": 1.996998949269982e-05,
"loss": 0.3639,
"step": 612
},
{
"epoch": 47.15384615384615,
"grad_norm": 41.75777816772461,
"learning_rate": 1.9969446603878673e-05,
"loss": 0.3786,
"step": 613
},
{
"epoch": 47.23076923076923,
"grad_norm": 55.93494415283203,
"learning_rate": 1.996889885607036e-05,
"loss": 0.5464,
"step": 614
},
{
"epoch": 47.30769230769231,
"grad_norm": 27.718795776367188,
"learning_rate": 1.9968346249541848e-05,
"loss": 0.4085,
"step": 615
},
{
"epoch": 47.38461538461539,
"grad_norm": 26.89528465270996,
"learning_rate": 1.9967788784562474e-05,
"loss": 0.2946,
"step": 616
},
{
"epoch": 47.46153846153846,
"grad_norm": 19.02779197692871,
"learning_rate": 1.9967226461403934e-05,
"loss": 0.2457,
"step": 617
},
{
"epoch": 47.53846153846154,
"grad_norm": 28.85311508178711,
"learning_rate": 1.99666592803403e-05,
"loss": 0.5738,
"step": 618
},
{
"epoch": 47.61538461538461,
"grad_norm": 57.92436599731445,
"learning_rate": 1.996608724164801e-05,
"loss": 0.2921,
"step": 619
},
{
"epoch": 47.69230769230769,
"grad_norm": 56.59422302246094,
"learning_rate": 1.9965510345605866e-05,
"loss": 0.6956,
"step": 620
},
{
"epoch": 47.76923076923077,
"grad_norm": 20.687774658203125,
"learning_rate": 1.9964928592495046e-05,
"loss": 0.233,
"step": 621
},
{
"epoch": 47.84615384615385,
"grad_norm": 29.573144912719727,
"learning_rate": 1.996434198259908e-05,
"loss": 0.3683,
"step": 622
},
{
"epoch": 47.92307692307692,
"grad_norm": 21.21002769470215,
"learning_rate": 1.9963750516203887e-05,
"loss": 0.1318,
"step": 623
},
{
"epoch": 48.0,
"grad_norm": 39.380615234375,
"learning_rate": 1.9963154193597728e-05,
"loss": 0.6031,
"step": 624
},
{
"epoch": 48.07692307692308,
"grad_norm": 39.11266326904297,
"learning_rate": 1.996255301507125e-05,
"loss": 0.1495,
"step": 625
},
{
"epoch": 48.15384615384615,
"grad_norm": 46.29478454589844,
"learning_rate": 1.9961946980917457e-05,
"loss": 0.5444,
"step": 626
},
{
"epoch": 48.23076923076923,
"grad_norm": 31.465709686279297,
"learning_rate": 1.9961336091431728e-05,
"loss": 0.4781,
"step": 627
},
{
"epoch": 48.30769230769231,
"grad_norm": 37.89440155029297,
"learning_rate": 1.9960720346911798e-05,
"loss": 0.3573,
"step": 628
},
{
"epoch": 48.38461538461539,
"grad_norm": 39.69857406616211,
"learning_rate": 1.9960099747657774e-05,
"loss": 0.3921,
"step": 629
},
{
"epoch": 48.46153846153846,
"grad_norm": 55.70697021484375,
"learning_rate": 1.995947429397213e-05,
"loss": 0.5863,
"step": 630
},
{
"epoch": 48.53846153846154,
"grad_norm": 52.685943603515625,
"learning_rate": 1.9958843986159705e-05,
"loss": 0.5162,
"step": 631
},
{
"epoch": 48.61538461538461,
"grad_norm": 33.439208984375,
"learning_rate": 1.9958208824527702e-05,
"loss": 0.2662,
"step": 632
},
{
"epoch": 48.69230769230769,
"grad_norm": 34.79633331298828,
"learning_rate": 1.9957568809385693e-05,
"loss": 0.2713,
"step": 633
},
{
"epoch": 48.76923076923077,
"grad_norm": 62.909305572509766,
"learning_rate": 1.9956923941045613e-05,
"loss": 0.385,
"step": 634
},
{
"epoch": 48.84615384615385,
"grad_norm": 79.76982116699219,
"learning_rate": 1.995627421982176e-05,
"loss": 0.7246,
"step": 635
},
{
"epoch": 48.92307692307692,
"grad_norm": 51.8908805847168,
"learning_rate": 1.99556196460308e-05,
"loss": 0.4349,
"step": 636
},
{
"epoch": 49.0,
"grad_norm": 33.157596588134766,
"learning_rate": 1.995496021999177e-05,
"loss": 0.1781,
"step": 637
},
{
"epoch": 49.07692307692308,
"grad_norm": 23.671682357788086,
"learning_rate": 1.9954295942026065e-05,
"loss": 0.1832,
"step": 638
},
{
"epoch": 49.15384615384615,
"grad_norm": 37.103172302246094,
"learning_rate": 1.995362681245744e-05,
"loss": 0.1814,
"step": 639
},
{
"epoch": 49.23076923076923,
"grad_norm": 42.37955856323242,
"learning_rate": 1.9952952831612027e-05,
"loss": 0.7221,
"step": 640
},
{
"epoch": 49.30769230769231,
"grad_norm": 29.361839294433594,
"learning_rate": 1.9952273999818312e-05,
"loss": 0.3707,
"step": 641
},
{
"epoch": 49.38461538461539,
"grad_norm": 13.604857444763184,
"learning_rate": 1.9951590317407152e-05,
"loss": 0.2947,
"step": 642
},
{
"epoch": 49.46153846153846,
"grad_norm": 52.00338363647461,
"learning_rate": 1.9950901784711765e-05,
"loss": 0.4188,
"step": 643
},
{
"epoch": 49.53846153846154,
"grad_norm": 56.11465835571289,
"learning_rate": 1.9950208402067735e-05,
"loss": 0.3924,
"step": 644
},
{
"epoch": 49.61538461538461,
"grad_norm": 58.536102294921875,
"learning_rate": 1.9949510169813006e-05,
"loss": 0.4006,
"step": 645
},
{
"epoch": 49.69230769230769,
"grad_norm": 31.55925178527832,
"learning_rate": 1.9948807088287884e-05,
"loss": 0.0908,
"step": 646
},
{
"epoch": 49.76923076923077,
"grad_norm": 38.99277877807617,
"learning_rate": 1.994809915783505e-05,
"loss": 0.4612,
"step": 647
},
{
"epoch": 49.84615384615385,
"grad_norm": 62.03502655029297,
"learning_rate": 1.9947386378799534e-05,
"loss": 0.6084,
"step": 648
},
{
"epoch": 49.92307692307692,
"grad_norm": 70.17023468017578,
"learning_rate": 1.9946668751528745e-05,
"loss": 0.3815,
"step": 649
},
{
"epoch": 50.0,
"grad_norm": 70.30763244628906,
"learning_rate": 1.9945946276372435e-05,
"loss": 0.2997,
"step": 650
},
{
"epoch": 50.07692307692308,
"grad_norm": 35.853515625,
"learning_rate": 1.9945218953682736e-05,
"loss": 0.5588,
"step": 651
},
{
"epoch": 50.15384615384615,
"grad_norm": 33.02523422241211,
"learning_rate": 1.9944486783814135e-05,
"loss": 0.5543,
"step": 652
},
{
"epoch": 50.23076923076923,
"grad_norm": 48.58433532714844,
"learning_rate": 1.994374976712348e-05,
"loss": 0.5824,
"step": 653
},
{
"epoch": 50.30769230769231,
"grad_norm": 49.95491027832031,
"learning_rate": 1.994300790396999e-05,
"loss": 0.438,
"step": 654
},
{
"epoch": 50.38461538461539,
"grad_norm": 44.924007415771484,
"learning_rate": 1.9942261194715236e-05,
"loss": 0.4423,
"step": 655
},
{
"epoch": 50.46153846153846,
"grad_norm": 12.01087474822998,
"learning_rate": 1.9941509639723155e-05,
"loss": 0.2123,
"step": 656
},
{
"epoch": 50.53846153846154,
"grad_norm": 23.593204498291016,
"learning_rate": 1.9940753239360047e-05,
"loss": 0.2493,
"step": 657
},
{
"epoch": 50.61538461538461,
"grad_norm": 38.11962890625,
"learning_rate": 1.993999199399457e-05,
"loss": 0.3431,
"step": 658
},
{
"epoch": 50.69230769230769,
"grad_norm": 13.917471885681152,
"learning_rate": 1.9939225903997748e-05,
"loss": 0.0597,
"step": 659
},
{
"epoch": 50.76923076923077,
"grad_norm": 29.287017822265625,
"learning_rate": 1.993845496974297e-05,
"loss": 0.307,
"step": 660
},
{
"epoch": 50.84615384615385,
"grad_norm": 12.36963176727295,
"learning_rate": 1.9937679191605964e-05,
"loss": 0.2267,
"step": 661
},
{
"epoch": 50.92307692307692,
"grad_norm": 19.681686401367188,
"learning_rate": 1.993689856996485e-05,
"loss": 0.0911,
"step": 662
},
{
"epoch": 51.0,
"grad_norm": 17.56113052368164,
"learning_rate": 1.9936113105200085e-05,
"loss": 0.276,
"step": 663
},
{
"epoch": 51.07692307692308,
"grad_norm": 36.885414123535156,
"learning_rate": 1.99353227976945e-05,
"loss": 0.2167,
"step": 664
},
{
"epoch": 51.15384615384615,
"grad_norm": 36.57621383666992,
"learning_rate": 1.9934527647833276e-05,
"loss": 0.2526,
"step": 665
},
{
"epoch": 51.23076923076923,
"grad_norm": 31.334314346313477,
"learning_rate": 1.9933727656003964e-05,
"loss": 0.2672,
"step": 666
},
{
"epoch": 51.30769230769231,
"grad_norm": 22.630327224731445,
"learning_rate": 1.993292282259647e-05,
"loss": 0.091,
"step": 667
},
{
"epoch": 51.38461538461539,
"grad_norm": 48.03598403930664,
"learning_rate": 1.9932113148003057e-05,
"loss": 0.3044,
"step": 668
},
{
"epoch": 51.46153846153846,
"grad_norm": 16.784225463867188,
"learning_rate": 1.9931298632618355e-05,
"loss": 0.2004,
"step": 669
},
{
"epoch": 51.53846153846154,
"grad_norm": 32.46980285644531,
"learning_rate": 1.9930479276839347e-05,
"loss": 0.2895,
"step": 670
},
{
"epoch": 51.61538461538461,
"grad_norm": 40.98526382446289,
"learning_rate": 1.992965508106537e-05,
"loss": 0.1088,
"step": 671
},
{
"epoch": 51.69230769230769,
"grad_norm": 43.09739685058594,
"learning_rate": 1.9928826045698138e-05,
"loss": 0.5275,
"step": 672
},
{
"epoch": 51.76923076923077,
"grad_norm": 22.318811416625977,
"learning_rate": 1.9927992171141707e-05,
"loss": 0.2584,
"step": 673
},
{
"epoch": 51.84615384615385,
"grad_norm": 49.650753021240234,
"learning_rate": 1.99271534578025e-05,
"loss": 0.4818,
"step": 674
},
{
"epoch": 51.92307692307692,
"grad_norm": 53.377262115478516,
"learning_rate": 1.992630990608929e-05,
"loss": 0.7079,
"step": 675
},
{
"epoch": 52.0,
"grad_norm": 58.257423400878906,
"learning_rate": 1.9925461516413224e-05,
"loss": 0.1021,
"step": 676
},
{
"epoch": 52.07692307692308,
"grad_norm": 69.56619262695312,
"learning_rate": 1.9924608289187786e-05,
"loss": 0.5832,
"step": 677
},
{
"epoch": 52.15384615384615,
"grad_norm": 62.240211486816406,
"learning_rate": 1.9923750224828833e-05,
"loss": 0.7071,
"step": 678
},
{
"epoch": 52.23076923076923,
"grad_norm": 21.16132164001465,
"learning_rate": 1.992288732375458e-05,
"loss": 0.2678,
"step": 679
},
{
"epoch": 52.30769230769231,
"grad_norm": 24.814916610717773,
"learning_rate": 1.9922019586385587e-05,
"loss": 0.1287,
"step": 680
},
{
"epoch": 52.38461538461539,
"grad_norm": 35.99689865112305,
"learning_rate": 1.9921147013144782e-05,
"loss": 0.2424,
"step": 681
},
{
"epoch": 52.46153846153846,
"grad_norm": 40.905635833740234,
"learning_rate": 1.9920269604457444e-05,
"loss": 0.4172,
"step": 682
},
{
"epoch": 52.53846153846154,
"grad_norm": 17.19913101196289,
"learning_rate": 1.9919387360751216e-05,
"loss": 0.1629,
"step": 683
},
{
"epoch": 52.61538461538461,
"grad_norm": 20.01600456237793,
"learning_rate": 1.991850028245609e-05,
"loss": 0.1851,
"step": 684
},
{
"epoch": 52.69230769230769,
"grad_norm": 49.92776107788086,
"learning_rate": 1.9917608370004417e-05,
"loss": 0.4779,
"step": 685
},
{
"epoch": 52.76923076923077,
"grad_norm": 104.74503326416016,
"learning_rate": 1.9916711623830904e-05,
"loss": 1.3957,
"step": 686
},
{
"epoch": 52.84615384615385,
"grad_norm": 42.07792663574219,
"learning_rate": 1.9915810044372618e-05,
"loss": 0.6729,
"step": 687
},
{
"epoch": 52.92307692307692,
"grad_norm": 35.505592346191406,
"learning_rate": 1.9914903632068975e-05,
"loss": 0.3948,
"step": 688
},
{
"epoch": 53.0,
"grad_norm": 44.838714599609375,
"learning_rate": 1.9913992387361747e-05,
"loss": 1.169,
"step": 689
},
{
"epoch": 53.07692307692308,
"grad_norm": 34.66373062133789,
"learning_rate": 1.9913076310695068e-05,
"loss": 0.2086,
"step": 690
},
{
"epoch": 53.15384615384615,
"grad_norm": 20.64740562438965,
"learning_rate": 1.991215540251542e-05,
"loss": 0.1257,
"step": 691
},
{
"epoch": 53.23076923076923,
"grad_norm": 19.88491439819336,
"learning_rate": 1.991122966327164e-05,
"loss": 0.1137,
"step": 692
},
{
"epoch": 53.30769230769231,
"grad_norm": 29.428546905517578,
"learning_rate": 1.991029909341493e-05,
"loss": 0.3089,
"step": 693
},
{
"epoch": 53.38461538461539,
"grad_norm": 59.91632843017578,
"learning_rate": 1.9909363693398828e-05,
"loss": 0.5156,
"step": 694
},
{
"epoch": 53.46153846153846,
"grad_norm": 58.243263244628906,
"learning_rate": 1.9908423463679246e-05,
"loss": 0.1847,
"step": 695
},
{
"epoch": 53.53846153846154,
"grad_norm": 29.226238250732422,
"learning_rate": 1.9907478404714438e-05,
"loss": 0.2015,
"step": 696
},
{
"epoch": 53.61538461538461,
"grad_norm": 22.936357498168945,
"learning_rate": 1.990652851696501e-05,
"loss": 0.2968,
"step": 697
},
{
"epoch": 53.69230769230769,
"grad_norm": 22.54434585571289,
"learning_rate": 1.990557380089393e-05,
"loss": 0.1628,
"step": 698
},
{
"epoch": 53.76923076923077,
"grad_norm": 42.838775634765625,
"learning_rate": 1.9904614256966514e-05,
"loss": 0.4323,
"step": 699
},
{
"epoch": 53.84615384615385,
"grad_norm": 33.568359375,
"learning_rate": 1.990364988565043e-05,
"loss": 0.4084,
"step": 700
},
{
"epoch": 53.92307692307692,
"grad_norm": 13.65829849243164,
"learning_rate": 1.9902680687415704e-05,
"loss": 0.2112,
"step": 701
},
{
"epoch": 54.0,
"grad_norm": 36.00635528564453,
"learning_rate": 1.990170666273471e-05,
"loss": 0.2038,
"step": 702
},
{
"epoch": 54.07692307692308,
"grad_norm": 40.733489990234375,
"learning_rate": 1.9900727812082177e-05,
"loss": 0.1079,
"step": 703
},
{
"epoch": 54.15384615384615,
"grad_norm": 18.362619400024414,
"learning_rate": 1.989974413593518e-05,
"loss": 0.2279,
"step": 704
},
{
"epoch": 54.23076923076923,
"grad_norm": 18.40960121154785,
"learning_rate": 1.989875563477316e-05,
"loss": 0.1829,
"step": 705
},
{
"epoch": 54.30769230769231,
"grad_norm": 25.479307174682617,
"learning_rate": 1.989776230907789e-05,
"loss": 0.2494,
"step": 706
},
{
"epoch": 54.38461538461539,
"grad_norm": 25.522314071655273,
"learning_rate": 1.989676415933351e-05,
"loss": 0.5584,
"step": 707
},
{
"epoch": 54.46153846153846,
"grad_norm": 52.02535629272461,
"learning_rate": 1.989576118602651e-05,
"loss": 0.4463,
"step": 708
},
{
"epoch": 54.53846153846154,
"grad_norm": 8.517179489135742,
"learning_rate": 1.9894753389645723e-05,
"loss": 0.0818,
"step": 709
},
{
"epoch": 54.61538461538461,
"grad_norm": 21.887935638427734,
"learning_rate": 1.9893740770682334e-05,
"loss": 0.4517,
"step": 710
},
{
"epoch": 54.69230769230769,
"grad_norm": 26.428762435913086,
"learning_rate": 1.9892723329629885e-05,
"loss": 0.3262,
"step": 711
},
{
"epoch": 54.76923076923077,
"grad_norm": 28.858240127563477,
"learning_rate": 1.9891701066984264e-05,
"loss": 0.5892,
"step": 712
},
{
"epoch": 54.84615384615385,
"grad_norm": 27.605409622192383,
"learning_rate": 1.9890673983243708e-05,
"loss": 0.2364,
"step": 713
},
{
"epoch": 54.92307692307692,
"grad_norm": 15.756985664367676,
"learning_rate": 1.9889642078908805e-05,
"loss": 0.1361,
"step": 714
},
{
"epoch": 55.0,
"grad_norm": 44.47917175292969,
"learning_rate": 1.9888605354482494e-05,
"loss": 0.5785,
"step": 715
},
{
"epoch": 55.07692307692308,
"grad_norm": 18.722244262695312,
"learning_rate": 1.988756381047006e-05,
"loss": 0.1005,
"step": 716
},
{
"epoch": 55.15384615384615,
"grad_norm": 26.933124542236328,
"learning_rate": 1.988651744737914e-05,
"loss": 0.5561,
"step": 717
},
{
"epoch": 55.23076923076923,
"grad_norm": 32.02790451049805,
"learning_rate": 1.9885466265719723e-05,
"loss": 0.0889,
"step": 718
},
{
"epoch": 55.30769230769231,
"grad_norm": 27.264633178710938,
"learning_rate": 1.9884410266004134e-05,
"loss": 0.2253,
"step": 719
},
{
"epoch": 55.38461538461539,
"grad_norm": 30.937807083129883,
"learning_rate": 1.988334944874706e-05,
"loss": 0.3582,
"step": 720
},
{
"epoch": 55.46153846153846,
"grad_norm": 46.29901885986328,
"learning_rate": 1.988228381446553e-05,
"loss": 0.4241,
"step": 721
},
{
"epoch": 55.53846153846154,
"grad_norm": 42.519954681396484,
"learning_rate": 1.988121336367892e-05,
"loss": 0.8594,
"step": 722
},
{
"epoch": 55.61538461538461,
"grad_norm": 39.3632926940918,
"learning_rate": 1.9880138096908955e-05,
"loss": 0.1339,
"step": 723
},
{
"epoch": 55.69230769230769,
"grad_norm": 32.2740364074707,
"learning_rate": 1.9879058014679704e-05,
"loss": 0.1419,
"step": 724
},
{
"epoch": 55.76923076923077,
"grad_norm": 28.521841049194336,
"learning_rate": 1.987797311751759e-05,
"loss": 0.2118,
"step": 725
},
{
"epoch": 55.84615384615385,
"grad_norm": 28.847856521606445,
"learning_rate": 1.9876883405951378e-05,
"loss": 0.2045,
"step": 726
},
{
"epoch": 55.92307692307692,
"grad_norm": 26.72178077697754,
"learning_rate": 1.9875788880512183e-05,
"loss": 0.7234,
"step": 727
},
{
"epoch": 56.0,
"grad_norm": 24.55845069885254,
"learning_rate": 1.9874689541733455e-05,
"loss": 0.078,
"step": 728
},
{
"epoch": 56.07692307692308,
"grad_norm": 30.381683349609375,
"learning_rate": 1.9873585390151003e-05,
"loss": 0.2247,
"step": 729
},
{
"epoch": 56.15384615384615,
"grad_norm": 18.63511085510254,
"learning_rate": 1.9872476426302983e-05,
"loss": 0.2439,
"step": 730
},
{
"epoch": 56.23076923076923,
"grad_norm": 18.45151710510254,
"learning_rate": 1.987136265072988e-05,
"loss": 0.0866,
"step": 731
},
{
"epoch": 56.30769230769231,
"grad_norm": 29.958969116210938,
"learning_rate": 1.987024406397454e-05,
"loss": 0.3542,
"step": 732
},
{
"epoch": 56.38461538461539,
"grad_norm": 19.41910171508789,
"learning_rate": 1.9869120666582153e-05,
"loss": 0.1824,
"step": 733
},
{
"epoch": 56.46153846153846,
"grad_norm": 26.677873611450195,
"learning_rate": 1.986799245910024e-05,
"loss": 0.3551,
"step": 734
},
{
"epoch": 56.53846153846154,
"grad_norm": 28.94716453552246,
"learning_rate": 1.986685944207868e-05,
"loss": 0.4222,
"step": 735
},
{
"epoch": 56.61538461538461,
"grad_norm": 18.966806411743164,
"learning_rate": 1.9865721616069695e-05,
"loss": 0.0927,
"step": 736
},
{
"epoch": 56.69230769230769,
"grad_norm": 11.261028289794922,
"learning_rate": 1.9864578981627844e-05,
"loss": 0.0623,
"step": 737
},
{
"epoch": 56.76923076923077,
"grad_norm": 17.40890884399414,
"learning_rate": 1.9863431539310033e-05,
"loss": 0.2936,
"step": 738
},
{
"epoch": 56.84615384615385,
"grad_norm": 24.21697998046875,
"learning_rate": 1.986227928967551e-05,
"loss": 0.0637,
"step": 739
},
{
"epoch": 56.92307692307692,
"grad_norm": 31.43907356262207,
"learning_rate": 1.9861122233285873e-05,
"loss": 0.5369,
"step": 740
},
{
"epoch": 57.0,
"grad_norm": 50.822444915771484,
"learning_rate": 1.985996037070505e-05,
"loss": 0.3608,
"step": 741
},
{
"epoch": 57.07692307692308,
"grad_norm": 15.900673866271973,
"learning_rate": 1.9858793702499322e-05,
"loss": 0.1084,
"step": 742
},
{
"epoch": 57.15384615384615,
"grad_norm": 44.476287841796875,
"learning_rate": 1.9857622229237315e-05,
"loss": 0.4102,
"step": 743
},
{
"epoch": 57.23076923076923,
"grad_norm": 65.13040161132812,
"learning_rate": 1.9856445951489984e-05,
"loss": 0.4725,
"step": 744
},
{
"epoch": 57.30769230769231,
"grad_norm": 24.75728416442871,
"learning_rate": 1.985526486983063e-05,
"loss": 0.267,
"step": 745
},
{
"epoch": 57.38461538461539,
"grad_norm": 41.35836410522461,
"learning_rate": 1.9854078984834904e-05,
"loss": 0.6484,
"step": 746
},
{
"epoch": 57.46153846153846,
"grad_norm": 45.00423049926758,
"learning_rate": 1.985288829708079e-05,
"loss": 0.9051,
"step": 747
},
{
"epoch": 57.53846153846154,
"grad_norm": 28.75640869140625,
"learning_rate": 1.9851692807148612e-05,
"loss": 0.2823,
"step": 748
},
{
"epoch": 57.61538461538461,
"grad_norm": 25.26997184753418,
"learning_rate": 1.9850492515621038e-05,
"loss": 0.3037,
"step": 749
},
{
"epoch": 57.69230769230769,
"grad_norm": 16.20356559753418,
"learning_rate": 1.984928742308308e-05,
"loss": 0.1665,
"step": 750
},
{
"epoch": 57.76923076923077,
"grad_norm": 22.45698356628418,
"learning_rate": 1.9848077530122083e-05,
"loss": 0.4112,
"step": 751
},
{
"epoch": 57.84615384615385,
"grad_norm": 27.950660705566406,
"learning_rate": 1.9846862837327733e-05,
"loss": 0.294,
"step": 752
},
{
"epoch": 57.92307692307692,
"grad_norm": 19.849308013916016,
"learning_rate": 1.9845643345292055e-05,
"loss": 0.0727,
"step": 753
},
{
"epoch": 58.0,
"grad_norm": 50.8568229675293,
"learning_rate": 1.9844419054609418e-05,
"loss": 0.3167,
"step": 754
},
{
"epoch": 58.07692307692308,
"grad_norm": 49.81483459472656,
"learning_rate": 1.9843189965876525e-05,
"loss": 0.541,
"step": 755
},
{
"epoch": 58.15384615384615,
"grad_norm": 49.06199645996094,
"learning_rate": 1.984195607969242e-05,
"loss": 0.6278,
"step": 756
},
{
"epoch": 58.23076923076923,
"grad_norm": 29.350656509399414,
"learning_rate": 1.9840717396658483e-05,
"loss": 0.3204,
"step": 757
},
{
"epoch": 58.30769230769231,
"grad_norm": 20.372554779052734,
"learning_rate": 1.9839473917378432e-05,
"loss": 0.0781,
"step": 758
},
{
"epoch": 58.38461538461539,
"grad_norm": 22.690336227416992,
"learning_rate": 1.983822564245833e-05,
"loss": 0.1219,
"step": 759
},
{
"epoch": 58.46153846153846,
"grad_norm": 24.395050048828125,
"learning_rate": 1.9836972572506557e-05,
"loss": 0.3527,
"step": 760
},
{
"epoch": 58.53846153846154,
"grad_norm": 30.973020553588867,
"learning_rate": 1.983571470813386e-05,
"loss": 0.39,
"step": 761
},
{
"epoch": 58.61538461538461,
"grad_norm": 28.124399185180664,
"learning_rate": 1.98344520499533e-05,
"loss": 0.4424,
"step": 762
},
{
"epoch": 58.69230769230769,
"grad_norm": 31.887248992919922,
"learning_rate": 1.983318459858028e-05,
"loss": 0.0827,
"step": 763
},
{
"epoch": 58.76923076923077,
"grad_norm": 35.62253189086914,
"learning_rate": 1.9831912354632537e-05,
"loss": 0.6014,
"step": 764
},
{
"epoch": 58.84615384615385,
"grad_norm": 35.35845947265625,
"learning_rate": 1.9830635318730155e-05,
"loss": 0.1419,
"step": 765
},
{
"epoch": 58.92307692307692,
"grad_norm": 12.43506908416748,
"learning_rate": 1.9829353491495545e-05,
"loss": 0.1451,
"step": 766
},
{
"epoch": 59.0,
"grad_norm": 17.245344161987305,
"learning_rate": 1.982806687355345e-05,
"loss": 0.1339,
"step": 767
},
{
"epoch": 59.07692307692308,
"grad_norm": 31.799854278564453,
"learning_rate": 1.982677546553095e-05,
"loss": 0.2812,
"step": 768
},
{
"epoch": 59.15384615384615,
"grad_norm": 19.206119537353516,
"learning_rate": 1.982547926805747e-05,
"loss": 0.0983,
"step": 769
},
{
"epoch": 59.23076923076923,
"grad_norm": 33.53507614135742,
"learning_rate": 1.9824178281764753e-05,
"loss": 0.1039,
"step": 770
},
{
"epoch": 59.30769230769231,
"grad_norm": 13.916312217712402,
"learning_rate": 1.982287250728689e-05,
"loss": 0.095,
"step": 771
},
{
"epoch": 59.38461538461539,
"grad_norm": 24.899681091308594,
"learning_rate": 1.9821561945260292e-05,
"loss": 0.2406,
"step": 772
},
{
"epoch": 59.46153846153846,
"grad_norm": 17.658226013183594,
"learning_rate": 1.982024659632372e-05,
"loss": 0.1433,
"step": 773
},
{
"epoch": 59.53846153846154,
"grad_norm": 38.41850280761719,
"learning_rate": 1.9818926461118254e-05,
"loss": 0.1838,
"step": 774
},
{
"epoch": 59.61538461538461,
"grad_norm": 27.547054290771484,
"learning_rate": 1.981760154028731e-05,
"loss": 0.7646,
"step": 775
},
{
"epoch": 59.69230769230769,
"grad_norm": 48.75922775268555,
"learning_rate": 1.9816271834476642e-05,
"loss": 0.8896,
"step": 776
},
{
"epoch": 59.76923076923077,
"grad_norm": 25.430660247802734,
"learning_rate": 1.981493734433433e-05,
"loss": 0.4132,
"step": 777
},
{
"epoch": 59.84615384615385,
"grad_norm": 29.855377197265625,
"learning_rate": 1.981359807051079e-05,
"loss": 0.0553,
"step": 778
},
{
"epoch": 59.92307692307692,
"grad_norm": 29.918643951416016,
"learning_rate": 1.981225401365877e-05,
"loss": 0.2585,
"step": 779
},
{
"epoch": 60.0,
"grad_norm": 36.88877487182617,
"learning_rate": 1.981090517443334e-05,
"loss": 0.0971,
"step": 780
},
{
"epoch": 60.07692307692308,
"grad_norm": 48.528507232666016,
"learning_rate": 1.9809551553491918e-05,
"loss": 0.9207,
"step": 781
},
{
"epoch": 60.15384615384615,
"grad_norm": 37.711578369140625,
"learning_rate": 1.9808193151494233e-05,
"loss": 0.9668,
"step": 782
},
{
"epoch": 60.23076923076923,
"grad_norm": 22.573158264160156,
"learning_rate": 1.9806829969102356e-05,
"loss": 0.3043,
"step": 783
},
{
"epoch": 60.30769230769231,
"grad_norm": 24.48107147216797,
"learning_rate": 1.9805462006980688e-05,
"loss": 0.2461,
"step": 784
},
{
"epoch": 60.38461538461539,
"grad_norm": 17.666908264160156,
"learning_rate": 1.980408926579596e-05,
"loss": 0.0987,
"step": 785
},
{
"epoch": 60.46153846153846,
"grad_norm": 23.56198501586914,
"learning_rate": 1.9802711746217222e-05,
"loss": 0.1972,
"step": 786
},
{
"epoch": 60.53846153846154,
"grad_norm": 20.815799713134766,
"learning_rate": 1.9801329448915863e-05,
"loss": 0.3076,
"step": 787
},
{
"epoch": 60.61538461538461,
"grad_norm": 31.176063537597656,
"learning_rate": 1.9799942374565597e-05,
"loss": 0.3889,
"step": 788
},
{
"epoch": 60.69230769230769,
"grad_norm": 13.005535125732422,
"learning_rate": 1.979855052384247e-05,
"loss": 0.0651,
"step": 789
},
{
"epoch": 60.76923076923077,
"grad_norm": 18.74942970275879,
"learning_rate": 1.9797153897424854e-05,
"loss": 0.0623,
"step": 790
},
{
"epoch": 60.84615384615385,
"grad_norm": 20.31600570678711,
"learning_rate": 1.979575249599344e-05,
"loss": 0.1789,
"step": 791
},
{
"epoch": 60.92307692307692,
"grad_norm": 38.58935546875,
"learning_rate": 1.9794346320231265e-05,
"loss": 0.4844,
"step": 792
},
{
"epoch": 61.0,
"grad_norm": 30.701807022094727,
"learning_rate": 1.9792935370823676e-05,
"loss": 0.2106,
"step": 793
},
{
"epoch": 61.07692307692308,
"grad_norm": 16.613651275634766,
"learning_rate": 1.9791519648458352e-05,
"loss": 0.0731,
"step": 794
},
{
"epoch": 61.15384615384615,
"grad_norm": 31.29366111755371,
"learning_rate": 1.97900991538253e-05,
"loss": 0.3114,
"step": 795
},
{
"epoch": 61.23076923076923,
"grad_norm": 34.595489501953125,
"learning_rate": 1.9788673887616852e-05,
"loss": 0.4594,
"step": 796
},
{
"epoch": 61.30769230769231,
"grad_norm": 25.498994827270508,
"learning_rate": 1.9787243850527663e-05,
"loss": 0.2845,
"step": 797
},
{
"epoch": 61.38461538461539,
"grad_norm": 39.75912857055664,
"learning_rate": 1.978580904325472e-05,
"loss": 0.2296,
"step": 798
},
{
"epoch": 61.46153846153846,
"grad_norm": 46.827213287353516,
"learning_rate": 1.9784369466497333e-05,
"loss": 0.1907,
"step": 799
},
{
"epoch": 61.53846153846154,
"grad_norm": 35.235965728759766,
"learning_rate": 1.9782925120957123e-05,
"loss": 0.1975,
"step": 800
}
],
"logging_steps": 1,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 385,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": true,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}